This module will transliterate Khmer language text per WT:KM TR.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:km-translit/testcases.
tr(text, lang, sc)
Transliterates the given text, written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.
-- Table of functions exported by this module; it is returned at the end of
-- the file. (A stray "." left over from the documentation text used to be
-- fused onto the front of this statement, which made it a syntax error.)
local export = {}

-- Unicode-aware string helpers from Scribunto's mw.ustring library, cached in
-- locals so the rest of the module can call them by their short names.
local toNFC = mw.ustring.toNFC
local gsub = mw.ustring.gsub
local len = mw.ustring.len
local match = mw.ustring.match
local sub = mw.ustring.sub
-- Consonant conversion table.
-- Each value is a two-element list: a Latin-letter string followed by a
-- second string that is "a" or "o" (one entry holds two empty strings).
-- export.tr reads this table when converting a syllable and when rendering
-- syllables that are wrapped in <small><del>.
-- NOTE(review): every entry below starts with a bare `=`; the table keys
-- (presumably the Khmer consonant characters) are missing from this copy and
-- must be restored before the table is valid Lua.
-- NOTE(review): the second element presumably selects which of the two
-- readings in vowel_conv applies after that consonant -- confirm.
local cons_conv = {
= { "k", "a" },
= { "kh", "a" },
= { "k", "o" },
= { "kh", "o" },
= { "ng", "o" },
= { "ch", "a" },
= { "chh", "a" },
= { "ch", "o" },
= { "chh", "o" },
= { "nh", "o" },
= { "d", "a" },
= { "th", "a" },
= { "d", "o" },
= { "th", "o" },
= { "n", "a" },
= { "t", "a" },
= { "th", "a" },
= { "t", "o" },
= { "th", "o" },
= { "n", "o" },
= { "b", "a" },
= { "ph", "a" },
= { "p", "o" },
= { "ph", "o" },
= { "m", "o" },
= { "y", "o" },
= { "r", "o" },
= { "l", "o" },
= { "v", "o" },
= { "sh", "a" },
= { "ss", "o" },
= { "s", "a" },
= { "h", "a" },
= { "l", "a" },
= { "ʼ", "a" },
-- Entry with an empty transliteration and an empty second element.
= { "", "" },
= { "p", "a" },
}
-- Digraph table: six entries whose values are the single Latin letters
-- g, n, m, l, f and z. export.tr consults it before falling back to
-- cons_conv when building a syllable.
-- NOTE(review): the keys are missing from this copy (each entry starts with a
-- bare `=`); presumably they are two-consonant Khmer sequences -- confirm
-- against the canonical module.
local digraph = {
= "g", = "n", = "m", = "l", = "f", = "z",
}
-- Independent-vowel table: maps a single character to its Latin rendering.
-- export.tr passes it as the replacement table of a gsub over every
-- character of the text, after the per-word syllable processing is done.
-- NOTE(review): the keys are missing from this copy (each entry starts with a
-- bare `=`); restore them before use.
local indep_vowel = {
= "ʼĕ", = "ʼei",
= "ʼŏ", = "ʼŏk", = "ʼŭ", = "ʼŏu",
= "rœ̆", = "rœ",
= "lœ̆", = "lœ",
= "ʼé", = "ʼai", = "ʼaô", = "ʼaô", = "ʼâu",
}
-- Vowel conversion table. Every entry is itself a small table holding two
-- Latin renderings of the same vowel.
-- NOTE(review): both the outer keys and the two inner keys of every entry are
-- missing from this copy (bare `=` signs). Presumably the outer key is the
-- Khmer vowel string and the inner keys are the "a" / "o" classes found in
-- cons_conv -- confirm against the canonical module.
local vowel_conv = {
= { = "â", = "ô" },
= { = "a", = "éa" },
= { = "ĕ", = "ĭ" },
= { = "ei", = "i" },
= { = "œ̆", = "œ̆" },
= { = "œ", = "œ" },
= { = "ŏ", = "ŭ" },
= { = "o", = "u" },
= { = "uŏ", = "uŏ" },
= { = "aeu", = "eu" },
= { = "eua", = "eua" },
= { = "iĕ", = "iĕ" },
= { = "é", = "é" },
= { = "ê", = "ê" },
= { = "ai", = "ey" },
= { = "aô", = "oŭ" },
= { = "au", = "ŏu" },
= { = "om", = "ŭm" },
= { = "âm", = "um" },
= { = "ăm", = "ŏâm" },
= { = "ăng", = "eăng" },
= { = "ăh", = "eăh" },
= { = "ŏh", = "uh" },
= { = "éh", = "éh" },
= { = "aŏh", = "uŏh" },
= { = "ĕh", = "ĭh" },
= { = "ĕh", = "ĭh" },
= { = "aʼ", = "éaʼ" },
-- This entry emits the vowel wrapped in an HTML span instead of plain text.
-- NOTE(review): `font-color` is not a standard CSS property (`color` is), so
-- the inline style probably has no visible effect -- confirm the intent.
= { = '<span style="font-color:#DCDCDC">â</span>', = '<span style="font-color:#DCDCDC">ô</span>' },
}
-- Character classification table: maps one character to a category name.
-- The categories that occur are "consonant", "indep_vowel", "vowel_sign",
-- "terminating_vowel", "consonant_shift", "terminating_sign", "sign",
-- "combining_sign", "punctuation" and "ZWS". The syllable-splitting state
-- machine in export.tr branches on these names.
-- NOTE(review): the keys are missing from this copy (each entry starts with a
-- bare `=`); restore them before use.
local char_type = {
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "consonant", = "consonant", = "consonant", = "consonant", = "consonant",
= "indep_vowel", = "indep_vowel", = "indep_vowel",
= "indep_vowel", = "indep_vowel", = "indep_vowel", = "indep_vowel", = "indep_vowel",
= "indep_vowel", = "indep_vowel", = "indep_vowel", = "indep_vowel", = "indep_vowel",
= "indep_vowel", = "indep_vowel",
= "vowel_sign", = "vowel_sign", = "vowel_sign", = "vowel_sign", = "vowel_sign",
= "vowel_sign", = "vowel_sign", = "vowel_sign", = "vowel_sign", = "vowel_sign",
= "vowel_sign", = "vowel_sign", = "vowel_sign",
= "terminating_vowel",
= "vowel_sign", = "vowel_sign",
= "terminating_vowel", = "terminating_vowel", = "terminating_vowel",
= "consonant_shift", = "consonant_shift",
= "terminating_sign",
= "sign", = "sign", = "sign", = "sign", = "sign", = "sign",
= "combining_sign",
= "sign",
= "punctuation", = "punctuation",
= "sign",
= "punctuation", = "punctuation", = "punctuation", = "punctuation", = "punctuation",
= "sign", = "sign",
= "ZWS",
}
-- Digit table: two runs of ten entries, each mapping to the ASCII digits
-- "0" through "9". export.tr passes it as the replacement table of its first
-- gsub call.
-- NOTE(review): the keys are missing from this copy (each entry starts with a
-- bare `=`); presumably they are two different sets of Khmer numeral
-- characters -- confirm against the canonical module.
local sp_symbols = {
= "0", = "1", = "2", = "3", = "4",
= "5", = "6", = "7", = "8", = "9",
= "0", = "1", = "2", = "3", = "4",
= "5", = "6", = "7", = "8", = "9",
}
--- Transliterates Khmer text into Latin script.
-- @param text  the string to transliterate
-- @param lang  language code; only used to pick a script when sc is absent
-- @param sc    script code; when absent the script is chosen from the text
-- @return the processed text after a final toNFC call
--
-- Outline: normalise the text, split each word into syllables with a small
-- state machine, convert each syllable through the lookup tables above, then
-- handle independent vowels and the repetition sign on the whole text.
--
-- NOTE(review): in this copy every square-bracketed span appears to have been
-- removed: the subscripts on c / chartype / syl / the lookup tables, and the
-- character classes inside the patterns. Several lines below are therefore
-- not valid Lua as written; restore them from the canonical module before
-- relying on this function.
function export.tr(text, lang, sc)
-- Resolve sc to a script object, either by detection or from the given code.
if not sc then
sc = require("Module:languages").getByCode(lang):findBestScript(text)
else
sc = require("Module:scripts").getByCode(sc)
end
-- Normalisation through methods of the script object; their behaviour is
-- defined outside this file.
text = sc:fixDiscouragedSequences(text)
text = sc:toFixedNFD(text)
-- Table-driven replacement using sp_symbols (the pattern is empty in this copy).
text = gsub(text, '', sp_symbols)
-- Pattern rewrites around the ្ and ៍ signs. The character classes are
-- missing here, so the exact reordering cannot be read from this copy.
text = gsub(text, '(.)្(.្.)', '%1%2')
text = gsub(text, '(្)()', '%1%2')
text = gsub(text, '()(្?)', '%1%2')
text = gsub(text, '(.៍)', '%1')
-- Process the text one matched word at a time.
for word in mw.ustring.gmatch(text, '+') do
local original_text = word
-- c: characters of the word; chartype: their categories;
-- syl: finished syllables; curr_syl: the syllable being assembled.
local c, chartype, syl, curr_syl = {}, {}, {}, {}
-- State of the syllable splitter: 'none', 'initial', 'initial_combining',
-- 'vowel', 'coda' or 'coda_combining'.
local progress = 'none'
-- Split the word into single characters and classify each via char_type.
for i = 1, len(word) do
c = sub(word, i, i)
chartype = char_type]
end
-- Walk one step past the last character so the final syllable is flushed.
for i = 1, #c + 1 do
local next_types = {}
-- End of word or a ZWS: close the current syllable.
if i == #c + 1 or chartype == 'ZWS' then
progress = 'none'
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {}
-- Not inside a syllable: a consonant opens one, anything else is emitted as is.
elseif progress == 'none' then
if chartype == 'consonant' then
table.insert(curr_syl, c)
progress = 'initial'
else
table.insert(syl, c)
end
-- Inside the initial consonant (cluster) of a syllable.
elseif progress == 'initial' then
if chartype == 'combining_sign' then
table.insert(curr_syl, c)
progress = 'initial_combining'
elseif chartype == 'sign' or chartype == 'consonant_shift' then
table.insert(curr_syl, c)
elseif chartype == 'vowel_sign' then
table.insert(curr_syl, c)
progress = 'vowel'
elseif chartype == 'terminating_vowel' then
-- Special case for the sequence 'ាំង': keep the syllable open instead of
-- closing it on the terminating vowel.
if c .. c .. (c or '') == 'ាំង' and (i == #c - 1 or (i > #c + 1 and chartype == 'consonant')) then
table.insert(curr_syl, c)
progress = 'vowel'
else
table.insert(curr_syl, c)
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {}
progress = 'none'
end
elseif chartype == 'consonant' then
-- Look ahead to decide whether this consonant is a coda of the current
-- syllable or the initial of the next one.
-- NOTE(review): vowel_found is assigned without `local`, so it is a global.
vowel_found = false
local j, skipped = i, 0
while not vowel_found do
-- Stop at the end of the word or at a character that cannot continue a
-- syllable; `skipped` records that no vowel was reached.
if not chartype or chartype == 'punctuation' or chartype == 'indep_vowel' or chartype == 'terminating_sign' or chartype == 'ZWS' then
skipped = 1
break
elseif chartype == 'consonant' or chartype == 'combining_sign' or (chartype == 'sign' and c ~= '័') then
table.insert(next_types, chartype)
else
vowel_found = true
end
j = j + 1
end
-- Treat as coda when no vowel follows, or when the collected types contain
-- two consonants that are adjacent or separated by a single "sign".
if skipped ~= 0 or match(table.concat(next_types, " "), 'consonant s?i?g?n? ?consonant') then
table.insert(curr_syl, c)
progress = 'coda'
else
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {c}
progress = 'initial'
end
else
table.insert(syl, c)
progress = 'none'
end
-- After a combining sign in the initial: only a consonant may follow.
elseif progress == 'initial_combining' then
if chartype == 'consonant' then
table.insert(curr_syl, c)
progress = 'initial'
else
table.insert(syl, c)
progress = 'none'
end
-- Inside the vowel part; the consonant and terminating-vowel handling
-- repeats the logic of the 'initial' state.
elseif progress == 'vowel' then
if chartype == 'vowel_sign' then
table.insert(curr_syl, c)
elseif chartype == 'terminating_vowel' then
if c .. c .. (c or '') == 'ាំង' and (i == #c - 1 or (i > #c + 1 and chartype == 'consonant')) then
table.insert(curr_syl, c)
progress = 'vowel'
else
table.insert(curr_syl, c)
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {}
progress = 'none'
end
elseif chartype == 'consonant' then
-- Same coda-or-initial lookahead as in the 'initial' state.
-- NOTE(review): vowel_found is a global here as well.
vowel_found = false
local j, skipped = i, 0
while not vowel_found do
if not chartype or chartype == 'punctuation' or chartype == 'indep_vowel' or chartype == 'terminating_sign' or chartype == 'ZWS' then
skipped = 1
break
elseif chartype == 'consonant' or chartype == 'combining_sign' or (chartype == 'sign' and c ~= '័') then
table.insert(next_types, chartype)
else
vowel_found = true
end
j = j + 1
end
if skipped ~= 0 or match(table.concat(next_types, " "), 'consonant s?i?g?n? ?consonant') then
table.insert(curr_syl, c)
progress = 'coda'
else
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {c}
progress = 'initial'
end
else
table.insert(syl, c)
progress = 'none'
end
-- Inside the coda: signs extend it, anything else closes the syllable.
elseif progress == 'coda' then
if chartype == 'combining_sign' then
table.insert(curr_syl, c)
progress = 'coda_combining'
elseif chartype == 'sign' or chartype == 'terminating_sign' then
table.insert(curr_syl, c)
else
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {}
if chartype == 'consonant' then
table.insert(curr_syl, c)
progress = 'initial'
else
table.insert(syl, c)
progress = 'none'
end
end
-- After a combining sign in the coda: a consonant continues the coda,
-- anything else closes the syllable.
elseif progress == 'coda_combining' then
if chartype == 'consonant' then
table.insert(curr_syl, c)
progress = 'coda'
else
table.insert(syl, table.concat(curr_syl, ""))
curr_syl = {}
progress = 'none'
end
end
end
-- Convert each collected syllable to Latin script.
for i = 1, #syl do
-- A syllable containing ៍ is rendered inside <small><del>, converting each
-- character through cons_conv; the `break` then leaves the syllable loop.
if match(syl, '៍') then
syl = '<small><del>' .. gsub(syl, '.', function(consonant)
if cons_conv then
return cons_conv
end end) .. '</del></small>'
break
end
-- Drop a trailing ់ before matching the syllable shape.
syl = gsub(syl, '់$', '')
-- Match the whole syllable and hand its eight captures to the callback.
-- When the callback returns nothing, gsub keeps the original match.
syl = gsub(syl, '^()្?(?)(?)(??)(?)(?៉?)្?(?)(៖?)$', function(initial_a, initial_b, cons_shifter_a, vowel, cons_shifter_b, coda_a, coda_b, optional_sign)
-- With no shifter, vowel or coda and no ្ in the syllable, the second
-- initial is reinterpreted as the coda.
if cons_shifter_a .. cons_shifter_b .. vowel .. coda_a .. coda_b == '' and initial_b ~= '' and not match(syl, '្') then
coda_a = initial_b
initial_b = ''
end
-- NOTE(review): base, cons_shifter and vowel_class below are assigned
-- without `local`, so they are globals.
base = initial_a
if initial_b ~= '' and not match(initial_b, '') then
base = initial_b
end
-- Fold the sequence 'ាំង' into the vowel slot so it is looked up as one unit.
if vowel .. coda_a .. coda_b == 'ាំង' then
vowel, coda_a, coda_b = 'ាំង', '', ''
end
optional_sign = gsub(optional_sign, '៖', 'ː')
cons_shifter = cons_shifter_a .. cons_shifter_b
-- Choose the vowel class: from cons_conv when there is no shifter,
-- otherwise forced to 'a' by ៉ or to 'o' by ៊.
if cons_shifter == '' and cons_conv then
vowel_class = cons_conv
elseif cons_shifter == '៉' then
vowel_class = 'a'
elseif cons_shifter == '៊' then
vowel_class = 'o'
else
-- Any other shifter combination: return the syllable unconverted.
return initial_a .. initial_b .. cons_shifter .. vowel .. coda_a .. coda_b .. optional_sign
end
-- Assemble the result, trying the digraph table first and cons_conv second.
if digraph and (digraph or (cons_conv and cons_conv)) and vowel_conv then
return digraph .. vowel_conv .. (digraph or cons_conv .. cons_conv) .. optional_sign
elseif cons_conv and cons_conv and vowel_conv and cons_conv and cons_conv then
return cons_conv .. cons_conv .. vowel_conv .. cons_conv .. cons_conv .. optional_sign
end end)
-- Repetition sign ៗ that is not the first syllable.
-- NOTE(review): as written this assignment is a no-op; presumably it copied
-- the previous syllable before the subscripts were lost -- confirm.
if syl == 'ៗ' and i > 1 then
syl = syl
end
end
-- Rejoin the syllables and substitute the result for the original word.
-- NOTE(review): original_text is passed to gsub as a pattern, not as plain
-- text -- confirm it can never contain pattern-magic characters.
word = table.concat(syl, "")
text = gsub(text, original_text, word)
end
-- Replace remaining characters that have an entry in indep_vowel.
text = gsub(text, '.', indep_vowel)
-- Expand a space-separated ៗ by repeating the preceding capture.
text = gsub(text, '(*) ៗ', '%1 %1')
return toNFC(text)
-- To do: other signs
end
-- Hand the module table (holding export.tr) to whoever require()s this module.
return export