-- Pronunciation/transcription module for Northern Thai. See {{nod-pron}}
-- for usage.
-- Testcases: Module:User:RichardW57/nod-translit/testcases.
-- Module table; returned to callers on the last line of the file.
local export = {}
-- Short aliases for the Unicode-aware string functions in mw.ustring.
local gsub = mw.ustring.gsub
local sub = mw.ustring.sub
local match = mw.ustring.match
local find = mw.ustring.find
-- Value returned by export.translit when Thai text is left over, and used as
-- the first two transcriptions of the last entry in `initial`.
-- NOTE(review): the literal is only "]"; it looks truncated (presumably a
-- category link) -- confirm against the live module.
local failed_cat = "]"
-- Namespace text of the current page; the code below compares it with "".
local namespace = mw.title.getCurrentTitle().nsText
-- Pattern fragment used by export.translit and export.show to find Thai text.
-- NOTE(review): it is the empty string here, so `match(text, thai_range)`
-- succeeds for any input; presumably a Thai character class was lost.
local thai_range = ""
-- Category wikitext collected by export.translit and export.show; export.show
-- appends it to its output when `namespace` is "".
local categories = {}
-- Output systems handled by the module, each mapped to a number from 1 to 6.
-- export.show iterates over the keys of this table and compares them with
-- "paiboon", "royin", "ipa", "file" and "charPhon".
-- NOTE(review): the keys are missing from every entry in this copy, so the
-- table does not parse as written -- restore them from the live module.
local systems = {
= 1,
= 2,
= 3,
= 4,
= 5,
= 6
}
-- Initial consonants. Every entry holds three transcriptions at positions 1-3
-- plus a tone `class` ("mid", "low" or "high"; "" for the last two entries).
-- Position 2 carries the "@" and "$ng" placeholders that the royin
-- post-processing in export.translit rewrites; position 3 uses IPA symbols.
-- Position 1 is presumably the Paiboon-style value -- TODO confirm.
-- NOTE(review): the consonant keys are missing from every entry in this copy.
local initial = {
= { "g", "k", "k", class = "mid" },
= { "j", "ch", "t͡ɕ", class = "mid" },
= { "d", "d", "d", class = "mid" },
= { "d", "d", "d", class = "mid" },
= { "dt", "t", "t", class = "mid" },
= { "dt", "t", "t", class = "mid" },
= { "b", "b", "b", class = "mid" },
= { "bp", "p", "p", class = "mid" },
= { "", "@", "ʔ", class = "mid" },
= { "ng", "$ng", "ŋ", class = "low" },
= { "n", "n", "n", class = "low" },
= { "n", "n", "n", class = "low" },
= { "m", "m", "m", class = "low" },
= { "ny", "ny", "ɲ", class = "low" },
= { "y", "y", "j", class = "low" },
= { "r", "r", "r", class = "low" },
= { "l", "l", "l", class = "low" },
= { "l", "l", "l", class = "low" },
= { "w", "w", "w", class = "low" },
= { "k", "kh", "kʰ", class = "low" },
= { "k", "kh", "kʰ", class = "low" },
= { "k", "kh", "kʰ", class = "low" },
= { "k", "kh", "kʰ", class = "high" },
= { "k", "kh", "kʰ", class = "high" },
= { "ch", "ch", "t͡ɕʰ", class = "low" },
= { "ch", "ch", "t͡ɕʰ", class = "low" },
= { "ch", "ch", "t͡ɕʰ", class = "high" },
= { "t", "th", "tʰ", class = "low" },
= { "t", "th", "tʰ", class = "low" },
= { "t", "th", "tʰ", class = "low" },
= { "t", "th", "tʰ", class = "low" },
= { "t", "th", "tʰ", class = "high" },
= { "t", "th", "tʰ", class = "high" },
= { "p", "ph", "pʰ", class = "low" },
= { "p", "ph", "pʰ", class = "low" },
= { "p", "ph", "pʰ", class = "high" },
= { "f", "f", "f", class = "low" },
= { "f", "f", "f", class = "high" },
= { "s", "s", "s", class = "low" },
= { "s", "s", "s", class = "high" },
= { "s", "s", "s", class = "high" },
= { "s", "s", "s", class = "high" },
= { "h", "h", "h", class = "low" },
= { "h", "h", "h", class = "high" },
= { "ng", "$ng", "ŋ", class = "high" },
= { "n", "n", "n", class = "high" },
= { "m", "m", "m", class = "high" },
= { "y", "y", "j", class = "high" },
= { "y", "y", "j", class = "high" },
= { "r", "r", "r", class = "high" },
= { "l", "l", "l", class = "high" },
= { "w", "w", "w", class = "high" },
= { "…", "…", "…", class = "" },
-- Entry whose first two transcriptions are the failure marker `failed_cat`.
= { failed_cat, failed_cat, "", class = "" },
}
-- Vowel transcriptions, split into two sub-tables. Every entry holds three
-- transcriptions in the same positions as the entries of `initial`.
-- NOTE(review): the keys of both sub-tables and of every entry are missing in
-- this copy. The syllable code computes an `openness` of "open" or "closed",
-- so the sub-tables are presumably keyed by those two strings -- TODO confirm.
local vowel = {
-- First sub-table. Several short vowels end in "ʔ" in position 3 here, which
-- the entries of the second sub-table never do.
= {
= { "a", "a", "a" },
= { "a", "a", "a" },
= { "i", "i", "i" },
= { "ʉ", "ue", "ɯ" },
= { "u", "u", "u" },
= { "e", "e", "eʔ" },
= { "ɛ", "ae", "ɛʔ" },
= { "o", "o", "oʔ" },
= { "ɔ", "o", "ɔʔ" },
= { "ɔ", "o", "ɔ" },
= { "ə", "oe", "ɤ" },
= { "ə", "oe", "ɤʔ" },
= { "aa", "a", "aː" },
= { "ii", "i", "iː" },
= { "uu", "u", "uː" },
= { "ʉʉ", "ue", "ɯː" },
= { "ee", "e", "eː" },
= { "ɛɛ", "ae", "ɛː" },
= { "oo", "o", "oː" },
= { "ɔɔ", "o", "ɔː" },
= { "ɔɔn", "on", "ɔːn" },
= { "əə", "oe", "ɤː" },
= { "ia", "ia", "ia̯ʔ" },
= { "ʉa", "uea", "ɯa̯ʔ" },
= { "ua", "ua", "ua̯ʔ" },
= { "iia", "ia", "ia̯" },
= { "ʉʉa", "uea", "ɯa̯" },
= { "uua", "ua", "ua̯" },
= { "iu", "io", "iw" },
= { "iiu", "io", "iːw" },
= { "eo", "eo", "ew" },
= { "ɛo", "aeo", "ɛw" },
= { "ao", "ao", "aw" },
= { "eeo", "eo", "eːw" },
= { "ɛɛo", "aeo", "ɛːw" },
= { "aao", "ao", "aːw" },
= { "əəo", "oeu", "ɤːw" }, --royin inferred
= { "oow", "ou", "oːw" }, --royin inferred
= { "iao", "iao", "ia̯w" },
= { "ai", "ai", "aj" },
= { "ai", "ai", "aj" },
= { "ai", "ai", "aj" },
= { "ai", "ai", "aj" },
= { "ʉi", "uei", "ɯj" },
= { "ɔi", "oi", "ɔj" },
= { "əi", "oei", "ɤj" },
= { "ui", "ui", "uj" },
= { "aai", "ai", "aːj" },
= { "ɔɔi", "oi", "ɔːj" },
= { "ooi", "oi", "oːj" },
= { "əəi", "oei", "ɤːj" },
= { "əəi", "oei", "ɤːj" }, -- Certainly occurs in rup pariwat.
= { "uui", "ui", "uːj" },
= { "uai", "uai", "ua̯j" },
= { "ʉai", "ueai", "ɯa̯j" },
= { "am", "am", "am" },
},
-- Second sub-table.
= {
= { "a", "a", "a" },
= { "a", "a", "a" },
= { "i", "i", "i" },
= { "ʉ", "ue", "ɯ" },
= { "u", "u", "u" },
= { "ee", "e", "eː" },
= { "e", "e", "e" },
= { "ɛ", "ae", "ɛ" },
= { "ɛɛ", "ae", "ɛː" },
= { "o", "o", "o" },
= { "ɔ", "o", "ɔ" },
= { "ə", "oe", "ɤ" },
= { "aa", "a", "aː" },
= { "ii", "i", "iː" },
= { "ʉʉ", "ue", "ɯː" },
= { "uu", "u", "uː" },
= { "oo", "o", "oː" },
= { "ɔɔ", "o", "ɔː" },
= { "əə", "oe", "ɤː" },
= { "əə", "oe", "ɤː" },
= { "iia", "ia", "ia̯" },
= { "ʉʉa", "uea", "ɯa̯" },
= { "uua", "ua", "ua̯" },
= { "ai", "ai", "aj" },
= { "ao", "ao", "aw" },
= { "ɔi", "oi", "ɔj" },
}
}
-- Set (values are all `true`) consulted by the syllable code when it decides
-- whether a vowel counts as "long".
-- NOTE(review): the keys are missing in this copy, and the use site tests the
-- bare table, which is always truthy as written -- presumably an index was
-- lost there too.
local unromLong = {
= true, = true, = true, = true,
= true, = true, = true,
= true,
}
-- Set (values are all `true`) consulted by the syllable code when it decides
-- whether a syllable is "live" or "dead".
-- NOTE(review): the keys are missing in this copy, and the use site tests the
-- bare table, which is always truthy as written -- presumably an index was
-- lost there too.
local liveExc = {
= true, = true, = true, = true,
= true, = true, = true, = true,
= true, = true, = true, = true,
= true,
}
-- Final consonants (called "c2" in the syllable code). Every entry holds three
-- transcriptions in the same positions as the entries of `initial`; position 2
-- uses the "ng$" placeholder that the royin post-processing rewrites.
-- NOTE(review): the consonant keys are missing from every entry in this copy.
-- ย,ว are not included.
-- ช,ซ,ส,ฟ,ล are changed for loanwords.
-- ห,อ,ฮ can never be c2s.
local coda = {
= { "k", "k", "k̚" },
= { "k", "k", "k̚" },
= { "k", "k", "k̚" },
= { "k", "k", "k̚" },
= { "k", "k", "k̚" },
= { "k", "k", "k̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "ch", "ch", "t͡ɕʰ" },
= { "s", "s", "s" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "t", "t", "t̚" },
= { "s", "s", "s" },
= { "p", "p", "p̚" },
= { "p", "p", "p̚" },
= { "p", "p", "p̚" },
= { "p", "p", "p̚" },
= { "p", "p", "p̚" },
= { "f", "f", "f" },
= { "p", "p", "p̚" },
= { "ng", "ng$", "ŋ" },
= { "n", "n", "n" },
= { "n", "n", "n" },
= { "n", "n", "n" },
= { "n", "n", "n" },
= { "l", "l", "l" },
= { "n", "n", "n" },
= { "m", "m", "m" },
}
-- Tone tables used by the syllable code in export.translit.
-- NOTE(review): every key in these four tables is missing in this copy, so
-- they do not parse as written; the comments describe only values and usage.

-- Tone names used when a tone mark was captured (`tMark` is truthy).
local tFromMark = {
-- common
= { = "low", = "low", = "falling" },
= { = "special", = "special", = "high" },
= { = "high", = "high", = "high" },
= { = "rising", = "rising", = "rising" },
-- forced mid tone
= { = "mid", = "mid", = "mid" },
}
-- Tone names used when no tone mark was captured.
local tNoMark = {
= { = "rising", = "rising", = "high" },
= { = "low", = "low", = "falling" },
= { = "rising", = "mid", = "mid" },
}
-- Combining diacritics inserted into the vowel when mode is "paiboon".
local tRomMarks = {
= "́", = "", = "̀",
= "̌", = "̂", = "᷆"
}
-- Tone letters appended after the coda when mode is "ipa".
local tLevels = {
= "˦˥", = "˧", = "˨˩",
= "˩˩˦", = "˦˩", = "˥˥˨", -- More work needed!
}
-- Replacement table handed to gsub near the end of the word loop's aftermath
-- in export.translit; the values are the digits 0-9.
-- NOTE(review): the keys are missing in this copy (presumably Thai digits).
local symbols = {
= 0, = 1, = 2, = 3, = 4,
= 5, = 6, = 7, = 8, = 9,
}
-- Consulted by c2_decomp in export.translit when source is "translit-module".
-- NOTE(review): the keys are missing in this copy, and the use site tests the
-- bare table rather than an entry of it.
local permitted_cluster = {
= 1, = 1, = 1, = 1, = 1,
}
-- Character-by-character transliteration used by export.getCharSeqTbl.
-- Modified ISO 11940 (to make the sound values more apparent); uses spacing
-- marks for signs.
-- NOTE(review): the keys are missing from every entry in this copy.
local char_table = {
= "k", = "kʰ", = "x", = "g", = "ɣ", = "gʰ", = "ŋ",
= "t͡ɕ", = "t͡ɕʰ", = "d͡ʑ", = "z", = "d͡ʑʰ", = "ɲ",
= "ᶑ", = "ʈ", = "ʈʰ", = "ɖ", = "ɖʰ", = "ɳ",
= "ɗ", = "t", = "tʰ", = "d", = "dʰ", = "n",
= "ɓ", = "p", = "pʰ", = "v", = "b", = "f", = "bʰ", = "m",
= "y", = "r", = "ṛ", = "l", = "ḷ", = "w",
= "ɕ", = "ʂ", = "s", = "h", = "ɭ", = "ɒ", = "ɦ",
= "a", = "ạ", = "ā", = "å", = "i", = "ī",
= "ụ", = "ụ̄", = "u", = "ū", = "ɨ", = "̥", = "฿",
= "e", = "æ", = "o", = "au", = "ai",
= "ˋ", = "ˆ", = "ˊ", = "ˇ",
= "ǂ", = "«", = "˘",
= "ʻ", = "˚", = "~", = "§",
= "ǁ", = "»",
= "0", = "1", = "2", = "3", = "4",
= "5", = "6", = "7", = "8", = "9",
= " ", = "–", = "…",
}
-- Syllable patterns used by export.translit.
-- NOTE(review): the character classes appear to be missing in this copy
-- (runs such as "(?)" and "(??)" are left), so none of the three patterns is
-- usable as written.
-- Three captures; applied to the remainder of a syllable whose initial is ห
-- plus one more character.
local mgvc_pattern = "^(?)(???ะ?)(?)$"
-- Five captures, received by syllable() as v1, c1, g, v2 and c2.
local full_pattern = "^(?)(หฺ)(ฺ??)(?็????ะ?)(??)$"
-- Same five-capture shape; applied to each word after full_pattern.
local partial_pattern = "^(?)()(ฺ??)(?็????ะ?)(??)$"
-- Transliterate the Thai-script words in `text` into the scheme named by `mode`.
--   text     : string to convert.
--   lang, sc : accepted but not referenced in this body.
--   mode     : compared with "paiboon", "ipa" and "royin" below.
--   source   : compared with "translit-module" and "pron-module" below.
-- Returns the NFC-normalised result. Returns nil when a word trips the guard
-- at the top of the loop, or when Thai text is left over and either `source`
-- is "translit-module" or the page namespace is not ""; in the remaining
-- leftover case it returns `failed_cat`.
-- Raises an error when `source` is "pron-module" and a cluster is not spelled
-- in the way the error message asks for.
-- NOTE(review): many patterns and table indexes in this copy are empty or
-- missing (for example `match(v2, "")`, `coda and coda`, `initial or initial`,
-- `tFromMark`). They appear to have lost their bracketed parts, so the code
-- does not behave as intended as written -- confirm against the live module.
function export.translit(text, lang, sc, mode, source)
-- NOTE(review): presumably meant to pick the entry of `systems` for `mode`;
-- as written `seq` is the whole table.
local seq = systems
for word in mw.ustring.gmatch(text, thai_range .. "+") do
local orig_word, class, tMark, tone, long, c2 = word, "", false, false, false, false
-- Guard that aborts the whole transliteration for this text.
-- NOTE(review): ".?" matches any string, so this always returns nil as written.
if match(word, ".?") then
return nil
end
-- Convert a multi-character coda one character at a time and join the pieces.
-- When called for the translit module and the cluster is not permitted, the
-- original characters are returned unchanged.
local function c2_decomp(c2_char, seq, source)
local converted_c2 = {}
for character in mw.text.gsplit(c2_char, "") do
table.insert(converted_c2, coda and coda or nil)
end
local cluster = table.concat(converted_c2)
if source == "translit-module" and not permitted_cluster then
return c2_char
else
return cluster
end
end
-- gsub replacement callback: receives the five captures of full_pattern or
-- partial_pattern and returns the transcribed syllable. Returning nil makes
-- gsub keep the original match.
-- NOTE(review): openness, orig_v, v, ini, length and life are assigned without
-- `local`, so they are globals.
local function syllable(v1, c1, g, v2, c2)
-- Capture the tone mark from v2, then remove it from v2.
tMark = match(v2, "")
v2 = gsub(v2, "", "")
-- Initial of ห plus one character: re-split the rest with mgvc_pattern.
if match(c1, "^ห.$") then
if match(sub(c1, 2, 2) .. g .. v2 .. c2, mgvc_pattern) then
c1, g, v2, c2 = "ห", match(sub(c1, 2, 2) .. g .. v2 .. c2, mgvc_pattern)
-- NOTE(review): `not v2 == "ย"` parses as `(not v2) == "ย"`, which is always
-- false, so this branch never runs; `v2 ~= "ย"` was probably intended.
if g ~= "" and not v2 == "ย" then c1, g = c1 .. g, "" end
end
end
-- A glide ล with no following vowel or coda becomes the coda.
if g == "ล" and v2 .. c2 == "" then
c2 = g
g = ""
end
openness = c2 ~= "" and "closed" or "open"
-- Respellings coming from the pronunciation module must mark clusters with
-- "ฺ"; otherwise raise an error that shows the expected spelling.
if source == "pron-module"
and (mw.ustring.len(c1) > 1 or match(g, ""))
and not match(c1 .. g, "ฺ") then
error("Please replace " .. c1 .. g .. " in the respelling with " ..
sub(c1, 1, 1) .. "ฺ" .. (sub(c1, 2, -1) or "") .. g .. ".")
end
-- Choose the vowel transcription; in the first branch the glide is consumed
-- as part of the vowel spelling.
if vowel then
orig_v = v1 .. g .. v2
v, g = vowel, ""
else
orig_v = v1 .. v2
v = vowel and vowel or (v1 .. v2)
g = (initial or initial)
end
-- Drop the cluster marker before the initial is looked up.
c1 = gsub(c1, "ฺ", "")
ini, class = "", ""
if initial then
ini, class = initial, initial.class
else
return nil
end
-- Vowel length and live/dead status are computed for the tone lookup below.
length = (match(v, "()%1") or match(v, "ː") or unromLong) and "long" or "short"
life = (match(c2, "") or (match(orig_v, "ย$") and match(v, "i$")) or
c2..length == "long" or liveExc) and "live" or "dead"
c2 = coda and coda or c2_decomp(c2, seq, source)
-- note: not add tone for royin
tone = tMark and tFromMark or (tNoMark or tNoMark)
if mode == "paiboon" then
v = gsub(v, "^(*)()", "%1%2" .. tRomMarks)
elseif mode == "ipa" then
c2 = c2 .. tLevels
end
return ini .. g .. v .. c2
end
-- Try the full pattern, then the partial one, and splice the converted word
-- back into the text (first occurrence only).
-- NOTE(review): `orig_word` is used as a pattern here, not as plain text.
word = gsub(word, full_pattern, syllable)
word = gsub(word, partial_pattern, syllable)
text = gsub(text, orig_word, word, 1)
end
-- Replace matches using the `symbols` table.
text = gsub(text, "", symbols)
-- postprocessing
-- Resolve the "@" and "$ng" / "ng$" placeholders emitted by the royin columns.
if mode == "royin" then
-- initial อ
text = gsub(text, "^@", "")
text = gsub(text, "()@", "%1")
text = gsub(text, "@", "-")
-- initial ง
text = gsub(text, "^%$ng", "ng")
text = gsub(text, "()%$ng", "%1ng")
text = gsub(text, "()%$ng", "%1-ng")
text = gsub(text, "%$ng", "ng")
-- final ง
text = gsub(text, "ng%$$", "ng")
text = gsub(text, "ng%$()", "ng%1") -- includes hyphen
text = gsub(text, "ng%$", "ng")
end
-- In IPA mode, turn separators into "." and count the syllables.
local count_syl = 0
if mode == "ipa" then
text, count_syl = gsub(text, "", ".") -- space, common hyphen, en dash
if not match(text, "%.$") then
count_syl = count_syl + 1
end
text = gsub(text, "()(+)$", "%1ʔ%2") -- add ʔ if last syllable ends with
end
-- Leftover Thai text means the conversion failed.
if match(text, thai_range) then
if source == "translit-module" or namespace ~= "" then
return nil
else
return failed_cat
end
else
-- Record a category string (only when count_syl > 0) and return the result.
table.insert(categories, count_syl > 0 and "]" or "")
return mw.ustring.toNFC(text)
end
end
--- Wrap text in a dotted-underlined <span> that shows a tooltip on hover.
-- @param c1_text    wikitext to display
-- @param annotation tooltip text; when nil or false, `c1_text` is returned as is
-- @return `c1_text` unchanged, or the serialised <span> as a string
-- NOTE(review): defined as a global function; kept global here because other
-- code may rely on it.
function annotate(c1_text, annotation)
	if annotation then
		local span = mw.html.create("span")
		span:css("border-bottom", "1px dotted #000")
		span:css("cursor", "help")
		span:attr("title", annotation)
		span:wikitext(c1_text)
		return tostring(span)
	end
	return c1_text
end
-- Pattern that export.getCharSeqTbl tests each character against; a match is
-- rendered as a "vowel sign appearing in front of the initial consonant".
-- NOTE(review): it is the empty string here, which matches every character;
-- presumably a character class was lost.
local front_v = ""
-- Tooltip text for individual characters, used by export.getCharSeqTbl.
-- NOTE(review): the key of the single entry is missing in this copy.
local char_annotation = {
= "CANCEL"
}
--- Build the per-character transliteration of `text`.
-- Each character is looked up in `char_table`. Characters matching `front_v`
-- are wrapped in a dotted, rounded <span> with a fixed tooltip; all others go
-- through `annotate` with the character's entry in `char_annotation`, if any.
-- @param text string to split into characters
-- @return array of wikitext/HTML snippets, in input order
function export.getCharSeqTbl(text)
	local result = {}
	for character in mw.text.gsplit(text, "") do
		-- Fix: index the tables by the current character. The unindexed
		-- `char_table` / `char_annotation` passed whole tables downstream.
		local charDetail = char_table[character]
		if find(character, front_v) then
			table.insert(result, tostring( mw.html.create( "span" )
				:css( "border", "1px dotted gray" )
				:css( "border-radius", "50%" )
				:css( "cursor", "help" )
				:attr( "title", "Vowel sign appearing in front of the initial consonant." )
				:wikitext( charDetail )))
		else
			-- annotate returns nil for a character that is in neither table;
			-- table.insert then adds nothing, so such characters are skipped.
			table.insert(result, annotate(charDetail, char_annotation[character]))
		end
	end
	return result
end
--- Render the per-character transliteration of `text` as one small line.
-- @param text string passed on to export.getCharSeqTbl
-- @return "<br><small>…</small>" with the snippets separated by single spaces
function getCharSeq(text)
	local cells = export.getCharSeqTbl(text)
	local joined = table.concat(cells, " ")
	return table.concat({ "<br><small>", joined, "</small>" })
end
-- Pre-rendered notes that export.show can attach to the pronunciation table.
-- Each value is a label with an explanatory tooltip, produced by annotate().
-- NOTE(review): the keys are missing from every entry in this copy.
local note_set = {
= annotate(
"-ɔɔ r-",
"In this word, the double consonant combinations กร, ทร, ธร, มร, and หร are pronounced 'gaaw ra', 'thaaw ra', 'maaw ra' and 'haaw ra', respectively."
),
= annotate(
"Reduplication",
"This word exhibits reduplication in pronunciation, i.e. one written consonant is used as the final consonant of a syllable as well as the initial consonant of the next syllable."
),
= annotate(
"Short",
"The vowel in this word is pronounced irregularly short."
),
= annotate(
"Unorthographical",
"This phonetic respelling violates Thai alphabet rules to indicate an irregular pronunciation."
),
}
-- Template entry point: builds the HTML pronunciation table for a page.
--   frame : Scribunto frame; the arguments are read from its parent frame.
-- Returns the table markup as a string, followed by the collected category
-- wikitext when the page namespace is "".
-- NOTE(review): many table indexes in this copy are missing (for example
-- `args`, `note_set`, `result`, `m_fileData`, `o and o`, `{ = true }`); they
-- appear to have lost their bracketed parts -- confirm against the live module.
-- NOTE(review): homEdit, formatThai, has_hom, has_file and notes are assigned
-- without `local`, so they are globals.
function export.show(frame)
local lang, sc = "th", "Thai"
local args = frame:getParent().args
-- The page name can be overridden by the `pagename` argument.
local pagename = args.pagename or mw.title.getCurrentTitle().text
-- p: respellings to display; note: annotation snippets shown under the header.
local p, note = {}, {}
-- Notes supplied as a comma-separated list.
if args then
for ind_note in mw.text.gsplit(args, ",") do
table.insert(note, note_set)
end
end
-- Collect the non-empty positional arguments; otherwise use the page name.
if args then
for index, item in ipairs(args) do
table.insert(p, (item ~= "") and item or nil)
end
else
table.insert(p, pagename)
end
-- Add notes automatically, depending on where "็" occurs in the respellings.
local p_concatenated = table.concat(p, "/")
if match(p_concatenated, "็" .. thai_range .. "*")
or match(p_concatenated, "" .. thai_range .. "*็")
or match(p_concatenated, "ิ็") then
table.insert(note, note_set)
if not match(args or "", "short") then
table.insert(note, note_set)
end
elseif match(p_concatenated, "็")
and not match(pagename, "็")
and not match(args or "", "short") then
table.insert(note, note_set)
end
-- Small right-floated "edit" link to the homophone data page.
homEdit = tostring( mw.html.create( "div" )
:css( "float", "right" )
:css( "clear", "right" )
:css( "font-size", "60%" )
:wikitext( tostring( mw.html.create( "span" )
:attr( "class", "plainlinks" )
:wikitext( "[" .. tostring( mw.uri.fullUrl(
"Module:nod-hom/data",
{ = "edit" }
)) .." edit]" ))))
-- Wrap Thai text in a lang="th" span; when `qualifiers` is non-empty it is
-- shown first, in brackets on a highlighted line. `pron == true` adds the
-- "th-reading" class.
function formatThai(text, pron, qualifiers)
local readclass = ''
if pron == true then
readclass = 'th-reading'
end
return ((qualifiers and table.concat(qualifiers) ~= "")
and '<span style="background-color:#ffffe6"><small>[' ..
table.concat(qualifiers, ", ") .. ']</small></span><br>'
or '') .. '<span lang="th" class="Thai ' .. readclass .. '">' .. text .. '</span>'
end
-- Cells collected per output system.
local result = {
= {},
= {},
= {},
= {},
= {},
= {}
}
table.insert(categories, "]")
-- Data and helpers loaded from the author's user-space modules.
local m_hom_data = require("Module:User:RichardW57/nod-hom/data")
local m_hom = require("Module:User:RichardW57/nod-hom").makeList
local m_fileData = require("Module:User:RichardW57/nod-pron/files")
local qualifiers = {}
-- NOTE(review): pairs() iteration order over `systems` is unspecified.
for system, _ in pairs(systems) do
-- Format one cell value for the given system: romanisations and IPA get a
-- span, "file" becomes a File link, and anything else is treated as the
-- homophone list (collapsible when the data has more than three items).
local function f(text, system)
if system == "paiboon" or system == "royin" then
return '<span class="tr">' .. text .. '</span>'
elseif system == "ipa" then
return '<span class="IPA">/' .. text .. '/</span>'
elseif system == "file" then
-- Player width grows with the text length and is capped at 200px below.
local length = ((mw.ustring.len(gsub(text, "", "")) + 1) * 25 + 50)
if m_fileData then
table.insert(categories, "]")
return "[[File:" ..
m_fileData ..
"|" .. (length > 200 and 200 or length) ..
"px|center]]"
else
return ""
end
else
return m_hom_data
and (#m_hom_data > 3
and tostring( mw.html.create( "div" )
:attr {
= "vsSwitcher",
= "homophones",
}
:wikitext(
tostring( mw.html.create( "span" )
:attr( "class", "vsToggleElement" )
:wikitext( " " )) ..
tostring( mw.html.create( "div" )
:attr( "class", "vsShow" )
:css( "display", "none" )) ..
tostring( mw.html.create( "div" )
:attr( "class", "vsHide" )
:wikitext(( gsub(m_hom(text), ", ", "<br>" ))))))
or gsub(m_hom(text), ", ", "<br>"))
or ""
end
end
-- One <td> per respelling; every cell but the last gets a right border.
for i, spelling in ipairs(p) do
if system == "charPhon" then
-- Text after ":" in a respelling is a qualifier; a trailing "-" marks a
-- bound form.
qualifiers = {}
if match(spelling, "%:") then
table.insert(qualifiers, match(spelling, "%:(.+)"))
spelling = match(spelling, "(+)")
end
if match(spelling, "-$") then
table.insert(qualifiers, "bound form")
end
else
spelling = match(spelling, "(+)")
end
table.insert(result, tostring( mw.html.create( "td" )
:css( "border-right", i < #p and "1px solid lightgray" or "0px" )
:wikitext(
(system == "charPhon"
and formatThai(spelling, false, qualifiers) .. getCharSeq(spelling)
or f(find(system, "^")
and export.translit(spelling, lang, sc, system, "pron-module")
or spelling, system)))))
end
end
-- The optional rows are shown only when their marker text is in the cells.
has_hom = match(table.concat(result), "Northern Thai") or false
has_file = match(table.concat(result), "File") or false
notes = (#note > 0 and "<br><small>{" .. table.concat(note, "; ") .. "}</small>" or "")
-- Build one table row: header cell `a`, content `b`, and an options table `o`
-- whose flags choose row-span/column-span, borders and whether the row closes.
local function row(a, b, o)
return ((o and o) and "" or "\n<tr>") ..
tostring( mw.html.create( "td" )
:attr( "bgcolor", "fafeff" )
:attr( (o and o) and "rowspan" or "colspan", (o and o) and 1 or 2 )
:css( "border-bottom", (o and o) and "0px" or "1px solid lightgray" )
:css( "border-right", "1px solid lightgray" )
:css( "font-weight", "bold" )
:wikitext(a)) ..
((o and o) and "" or
((o and o)
and tostring(mw.html.create( "td" )
:attr( "colspan", #p )
:css( "border-bottom", "1px solid lightgray" )
:wikitext(b))
or b) ..
"</tr>")
end
if find(pagename, "ทร") and find(table.concat(result), "ซ") then
table.insert(categories, "]")
end
-- Assemble the table: orthography rows (their layout depends on the namespace
-- and on whether the respelling equals the page name), then the transcription
-- rows, then the optional homophone and audio rows.
return
'<table cellpadding=10 style="border-spacing: 2px; border: 1px solid darkgray; text-align:center">' ..
((namespace ~= "" and not args.pagename)
and row(
"'']''" .. notes,
'<div class="th-reading">' .. table.concat(result)) .. '</div>'
or (pagename == table.concat(p)
and row(
"'']''",
formatThai(pagename,true) .. getCharSeq(pagename),
{ = true })
or row(
"'']''",
formatThai(pagename,true) .. getCharSeq(pagename),
{ = true }) ..
row(
"'']''" .. notes,
'<div class="th-reading">' .. table.concat(result)))) .. '</div>' ..
row("'']''", nil, { = true }) ..
row(
"'']''",
table.concat(result),
{ = true }) ..
row(
"'']''",
table.concat(result),
{ = true }) ..
row(
"('']'') " ..
"]" ..
"<sup>(])</sup>",
table.concat(result),
{ = not has_hom and not has_file }) ..
(has_hom
and row(
"''Homophones''" .. homEdit,
table.concat(result) or "",
{ = not has_file })
or "") ..
(has_file
and row(
"''Audio''",
table.concat(result) or "",
{ = true })
or "") ..
"\n</table>" .. (namespace == "" and table.concat(categories) or "")
end
return export