-- sandhi checkers:
-- 西安/三天
-- 廣場/老闆/椅子/老虎
-- 熱熱兒/短短兒/胖胖兒
-- 鏡兒
-- Module table returned to callers at the end of the file.
local export = {}

-- Initial consonants: internal one-character code -> IPA.
-- Upper-case codes stand for digraphs (B=bv, P=pf, N=ng, Z=zh, C=ch, S=sh);
-- see the encoding below.
local initials = {
	b = "p", p = "pʰ", m = "m", f = "f", v = "v", B = "p͡f", P = "p͡fʰ",
	d = "t", t = "tʰ", n = "n", l = "l",
	g = "k", k = "kʰ", N = "ŋ", h = "x",
	j = "t͡ɕ", q = "t͡ɕʰ", x = "ɕ",
	Z = "ʈ͡ʂ", C = "ʈ͡ʂʰ", S = "ʂ", r = "ʐ",
	z = "t͡s", c = "t͡sʰ", s = "s",
	-- Zero initial: a syllable with no initial consonant contributes nothing.
	-- The key had been lost (the line read `= "",`), which is a syntax error.
	[""] = "",
}
-- Finals (rimes): internal code -> IPA.
-- see the encoding below (U=ü, N=ng)
-- Keys ending in "r" are the rhotacized (erhua) counterparts of the row above them.
local finals = {
	a = "a", ia = "ia", ua = "ua",
	ar = "ɐr", iar = "iɐr", uar = "uɐr",
	o = "o", uo = "uo", Uo = "yo",
	er = "ər", uor = "uər",
	e = "ɤ",
	ue = "ɯ", ie = "iɛ", Ue = "yɛ",
	ier = "iɛr", Uer = "yɛr",
	ii = "z̩", ih = "ʐ̩", i = "i", u = "u", U = "y",
	iir = "ər", ihr = "ər", ir = "iər", ur = "uər", Ur = "yər",
	ai = "æ", iai = "iæ", uai = "uæ",
	air = "ær", iair = "iær", uair = "uær",
	ei = "ei", ui = "uei",
	eir = "er", uir = "uer",
	ao = "au", iao = "iau",
	aor = "ɔr", iaor = "iɔr",
	ou = "ɤu", iu = "iɤu",
	our = "ər", iur = "iər",
	an = "ã", ian = "iã", uan = "uã", Uan = "yã",
	anr = "ɐ̃r", ianr = "iɐ̃r", uanr = "uɐ̃r", Uanr = "yɐ̃r",
	-- `in` is a reserved word in Lua, so this key must use bracket syntax.
	-- The key had been lost (the entry read `= "iẽ"`), which is a syntax error.
	en = "ẽ", ["in"] = "iẽ", un = "uẽ", Un = "yẽ",
	enr = "ə̃r", inr = "iə̃r", unr = "uə̃r", Unr = "yə̃r",
	aN = "aŋ", iaN = "iaŋ", uaN = "uaŋ",
	aNr = "ɐ̃r", iaNr = "iɐ̃r", uaNr = "uɐ̃r",
	eN = "əŋ", iN = "iŋ", oN = "uəŋ", ioN = "yoŋ",
	eNr = "ə̃r", iNr = "iə̃r", oNr = "uə̃r", ioNr = "yə̃r",
}
-- Tone number (as a one-character string) -> superscript tone letters.
-- The keys are strings because py_transf builds the tone with
-- tostring(byte), yielding "1".."5", and "5" is the value the rest of the
-- file treats as "no tone mark".
-- The keys had been lost (each entry read `= "..."`), which is a syntax error.
local tones = {
	["1"] = "²¹", -- yin ping / dark level (T1)
	["2"] = "²⁴", -- yang ping / light level (T2)
	["3"] = "⁵³", -- shang / rising (T3)
	["4"] = "⁵⁵", -- qu / departing (T4)
	["5"] = "", -- toneless (T0)
}
-- internal use, encode and decode digraphs
-- Maps multi-byte input sequences to one-byte internal codes so that later
-- byte-oriented string patterns can treat each unit as a single character.
local digraph_encode = {
	bv = "B", pf = "P", ng = "N", zh = "Z", ch = "C", sh = "S",
	-- Combining tone diacritics (UTF-8 bytes, as produced by NFD) -> tone bytes.
	-- The keys had been lost (each entry read `= "\n"`); they are restored as
	-- the inverse of digraph_decode, in the same order.
	["\204\140"] = "\1", -- U+030C combining caron
	["\204\129"] = "\2", -- U+0301 combining acute accent
	["\204\128"] = "\3", -- U+0300 combining grave accent
	["\204\132"] = "\4", -- U+0304 combining macron
}
-- Inverse of the internal encoding: one-byte internal codes -> display text.
local digraph_decode = {
	B = "bv", P = "pf", N = "ng", Z = "zh", C = "ch", S = "sh", U = "ü",
	-- Tone bytes -> combining diacritics (UTF-8 bytes).
	-- The keys had been lost (each entry read `= "..."`), which is a syntax error.
	["\1"] = "\204\140", -- U+030C combining caron
	["\2"] = "\204\129", -- U+0301 combining acute accent
	["\3"] = "\204\128", -- U+0300 combining grave accent
	["\4"] = "\204\132", -- U+0304 combining macron
	-- Bytes \5 and \6 are the markers py_process wraps around a syllable;
	-- they decode to a highlighted <span>.
	["\5"] = '<span style="background-color:#F5DEB3">',
	["\6"] = "</span>",
}
-- Converts text to the internal one-byte-per-unit encoding.
-- Steps visible here: decompose with mw.ustring.toNFD, rewrite "u" followed by
-- the bytes \204\136 (U+0308 combining diaeresis) as "U", then run a gsub whose
-- replacements are looked up in digraph_encode.
-- Returns the encoded string (only the first result of the gsub chain is kept).
-- NOTE(review): the pattern of the last gsub is the empty string, which only
-- matches empty positions, so no non-empty digraph_encode key can ever be
-- looked up. The pattern text appears to have been lost; confirm against the
-- upstream module.
local function encode(text)
text = mw.ustring.toNFD(text)
:gsub("u\204\136","U")
:gsub("",digraph_encode)
return text
end
-- Converts internally encoded text back to display form: runs a gsub whose
-- replacements are looked up in digraph_decode, then recomposes the result
-- with mw.ustring.toNFC. Used by the error messages in this file.
-- NOTE(review): the gsub pattern is the empty string, so no non-empty
-- digraph_decode key can be looked up; the pattern text appears to have been
-- lost. Confirm against the upstream module.
-- NOTE(review): gsub is the last argument, so both of its results (string and
-- match count) are forwarded to toNFC; presumably the extra argument is
-- ignored. Verify.
local function decode(text)
text = mw.ustring.toNFC(text:gsub("",digraph_decode))
return text
end
-- Joins apostrophe-separated syllables back into one encoded string.
-- Every apostrophe is removed (an immediately following \5 marker byte is
-- kept), and afterwards each literal "ng" is folded into the code "N".
-- Returns the joined string only (the gsub match counts are discarded).
-- NOTE(review): as written, apostrophes are dropped regardless of what follows
-- them; confirm that this is the intended pattern.
local function py_join_syllables(text)
	local without_apostrophes = text:gsub("'(\5?)", "%1")
	local joined = without_apostrophes:gsub("ng", "N")
	return joined
end
-- Inserts apostrophes into encoded text to mark syllable boundaries and
-- validates the apostrophes the user wrote.
-- The gsub chain rewrites "N" as "n'g" in some context, prefixes matches with
-- an apostrophe, collapses runs of apostrophes into one, and removes an
-- apostrophe at a frontier position. The divided text is then passed back
-- through py_join_syllables; if that does not reproduce the input exactly, an
-- error is raised naming the expected spelling.
-- Returns the divided string.
-- NOTE(review): the `%f` frontier items here are not followed by a `[set]`,
-- and the second gsub has an empty pattern. The bracketed sets appear to have
-- been lost, and Lua rejects `%f` without a set. Restore from upstream before use.
local function py_divide_syllables(text)
local res = text
:gsub("()N%f","%1n'g")
:gsub("","'%0")
:gsub("''+","'")
:gsub("%f'","")
-- Round-trip check: joining the divided text must give back the input.
local check = py_join_syllables(res)
if text ~= check then
error("Xi'an: error with apostrophes, "..decode(text).." should be "..decode(check)..".")
end
return res
end
-- Places a tone mark byte into a toneless encoded syllable.
-- `tone` is a tone number; for any value other than "5" the byte
-- string.char(tone) is appended directly after the first match of the pattern,
-- while "5" appends nothing. At most one substitution is made (gsub limit 1).
-- Returns the syllable with the tone byte inserted.
-- A string `tone` (as py_transf passes) relies on string.char's
-- string-to-number coercion.
-- NOTE(review): the pattern is just "?", which looks like the remains of a
-- longer pattern whose bracketed sets were lost; confirm against upstream.
local function py_put_tone(syllable, tone)
syllable = syllable:gsub("?", "%0" .. (tone~="5" and string.char(tone) or ""), 1)
return syllable
end
-- Moves the tone of one encoded syllable to the front as a digit.
-- The tone is the byte value of the first pattern match, or of "\5" when
-- nothing matches, converted to a string. The matches are then stripped from
-- the syllable, and more than one match is an error. The toneless syllable is
-- re-toned with py_put_tone and must equal the input, otherwise a
-- tone-placement error is raised.
-- Returns the tone string concatenated with the toneless syllable.
-- NOTE(review): both patterns below are empty strings; the bracketed
-- character sets (presumably the tone bytes) appear to have been lost. Confirm
-- against upstream.
local function py_transf(syllable)
local tone = tostring((syllable:match("") or "\5"):byte(1))
local syllable_detone, count = syllable:gsub("","")
if count > 1 then error("Xi'an: two tones in one syllable: " .. decode(syllable)) end
-- Re-apply the tone and compare, to verify the mark sat on the expected letter.
local check = py_put_tone(syllable_detone,tone)
if check ~= syllable then
error("Xi'an: error with tone placement, "..decode(syllable).." should be "..decode(check)..".")
end
return tone .. syllable_detone
end
-- canonize to adhere to pinyin rules, e.g. jü -> ju
-- Rewrites internal initial+final spellings into orthographic pinyin through a
-- fixed sequence of gsub steps. The replacement tables show the w- spellings
-- (u -> w, ui -> wei, un -> wen, oN -> weN) and the y- spellings (i -> y,
-- ih -> ri, iu -> you, U -> yu). The order of the steps matters, since later
-- patterns see the output of earlier ones.
-- Returns the canonized string. py_normalize uses this as its round-trip check.
-- NOTE(review): the `%f` items are not followed by a `[set]`, and several
-- captures are empty `()` / `(?r?)`; the bracketed sets appear to have been
-- lost. Lua rejects `%f` without a set. Restore from upstream before use.
local function py_canonize(text)
text = text
:gsub("()U","%1u")
:gsub("%fu?",{u="w",ui="wei",un="wen"})
:gsub("%foN","weN")
:gsub("w(r?)%f","wu%1")
:gsub("%fi?",{i="y",ih="ri",iu="you"})
:gsub("y(?r?)%f","yi%1")
:gsub("%fU","yu")
:gsub("i","i")
return text
end
-- normalize to initial+final, e.g. ju -> jü
-- Rewrites orthographic pinyin into the internal initial+final spelling used
-- as keys of the `initials` and `finals` tables. The replacement tables show
-- the undoing of the w-/y- spellings (wu -> u, wei -> ui, wen -> un,
-- weN -> oN, yi -> i, yu -> U, you -> iu), followed by rewrites of "i" to
-- "ii" / "ih" in some contexts.
-- The result is passed through py_canonize and must reproduce the input
-- exactly; otherwise an "invalid syllable" error is raised.
-- Returns the normalized string.
-- NOTE(review): several patterns contain empty `()` captures and a `%f` with
-- no `[set]`; the bracketed sets appear to have been lost. Restore from
-- upstream before use.
local function py_normalize(text)
local res = text
:gsub("()u","%1U")
:gsub("w?",{wu="u",wei="ui",wen="un",weN="oN"})
:gsub("w","u")
:gsub("yu?",{yi="i",yu="U",you="iu"})
:gsub("y","i")
:gsub("()i","%1ii")
:gsub("()i","%1ih")
:gsub("rih%f","ih")
-- Round-trip check: canonizing the normalized form must give back the input.
local check = py_canonize(res)
if text ~= check then
error("Xi'an: invalid syllable: "..decode(text).." should be "..decode(check))
end
return res
end
-- Converts normalized, tone-prefixed syllables to an IPA string wrapped in
-- slashes. Each matched syllable is split into four captures (a, b, c, d) and
-- a syllable that does not fit raises "Invalid syllable". Judging by the
-- error messages, b is the initial and c the final; a and d are presumably
-- the tone and an optional second (sandhi) tone shown after "⁻". Verify.
-- NOTE(review): the character sets in both patterns are missing, and the
-- tables are used without an index (`initials or error(...)`, `.. tones`),
-- which would try to concatenate a table. Presumably these were
-- `initials[b]`, `finals[c]`, `tones[a]` and `tones[d]`; confirm against
-- upstream.
local function py_to_ipa(text)
text = text:gsub("+",function(syllable)
local a,b,c,d = syllable:match("^()(?)(*)(?)$")
if not a then error("Xi'an: Invalid syllable: " .. decode(syllable)) end
return (initials or error("Xi'an: Invalid initial: " .. decode(b)))
.. (finals or error ("Xi'an: Invalid final: " .. decode(c)))
.. tones
.. (d~="" and "⁻"..tones or "")
end)
return "/" .. text .. "/"
end
-- returns (display_text, phonetic_text, ipa)
-- Entry point: processes one or more readings separated by "/".
-- For each reading it derives a display form, then lower-cases, encodes and
-- syllable-divides the reading. If the divided reading matches a further
-- pattern, a "phonetic" variant is built in which syllables are wrapped in the
-- \5 ... \6 marker bytes. Finally the reading is turned into tone-prefixed,
-- normalized syllables and converted to IPA.
-- The three results are joined with " / ", " / " and ", " respectively.
-- Errors raised by the helpers (apostrophes, tone placement, invalid
-- syllables) propagate to the caller.
-- NOTE(review): conv_display, conv_hidden and conv_ipa are created as tables
-- but then assigned strings directly, which would make the final table.concat
-- calls fail; presumably the assignments were indexed with `[i]`. Several
-- patterns are also empty or lack their character sets, and `phonetic` is
-- computed but not used in the visible assignment to conv_hidden. The
-- bracketed text appears to have been lost; confirm against upstream.
function export.py_process(text)
local conv_display = {}
local conv_hidden = {}
local conv_ipa = {}
local i = 0
-- Plain (non-pattern) split on "/" — one iteration per alternative reading.
for reading in mw.text.gsplit(text,"/",true) do
i = i + 1
conv_display = reading:gsub("","")
-- no check is done for things like "xUān", any capitalisation is valid
reading = mw.ustring.lower(reading)
reading = encode(reading)
reading = py_divide_syllables(reading)
if reading:match("") then
local phonetic = reading
:gsub("(??)?(*)()", function(a,b,c)
return "\5" .. a .. (c~="5" and string.char(c) or "") .. b .. "\6"
end)
phonetic = py_join_syllables(phonetic)
conv_hidden = conv_display .. " "
else
conv_hidden = conv_display
end
-- Apostrophes become spaces so each syllable can be transformed on its own.
reading = reading:gsub("'"," "):gsub("+",py_transf)
reading = py_normalize(reading)
conv_ipa = py_to_ipa(reading)
end
return table.concat(conv_display, " / "),
table.concat(conv_hidden, " / "),
table.concat(conv_ipa, ", ")
end
return export