local concat = table.concat
local error = error
local explode = require("Module:string utilities").explode_utf8
local insert = table.insert
local rawget = rawget
local rawset = rawset
local select = select
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local type = type
local _lower = string.lower
local _upper = string.upper
local ulower = string.ulower
local uupper = string.uupper
local data = mw.loadData("Module:User:Theknightwho/cmn-pron/data")
local num_to_segment = data.num_to_segment
local segment_to_num = data.segment_to_num
local export = {}
--- Lowercase `str`, picking the conversion routine by byte length.
-- Strings of at most one byte use `_lower`; longer strings use `ulower`
-- when it is available, otherwise `_lower` as well.
-- @param str string to convert (may be nil/false)
-- @return the converted string, or nil when `str` or the result is falsy
local function lower(str)
	if not str then
		return nil
	end
	local fold = _lower
	if #str > 1 and ulower then
		fold = ulower
	end
	return fold(str) or nil
end
--- Uppercase `str`, picking the conversion routine by byte length.
-- Strings of at most one byte use `_upper`; longer strings use `uupper`
-- when it is available, otherwise `_upper` as well.
-- @param str string to convert (may be nil/false)
-- @return the converted string, or nil when `str` or the result is falsy
local function upper(str)
	if not str then
		return nil
	end
	local fold = _upper
	if #str > 1 and uupper then
		fold = uupper
	end
	return fold(str) or nil
end
--- Nil-safe wrapper around string.match.
-- @param str value to test; anything that is not a string yields nil
-- @param pat Lua pattern
-- @return the first result of `str:match(pat)`, or nil if there is no match
local function match(str, pat)
	if type(str) ~= "string" then
		return nil
	end
	-- Parentheses keep only the first result, as the and/or form did.
	return (str:match(pat)) or nil
end
local m_parser = require("Module:parser")
local Parser = m_parser.Parser
-- Modified read method to add keep_capital parameter.
--- Read the token at `delta` positions from the parser head.
-- Previously `delta` was ignored and the whole `self.text` value was returned,
-- so lookahead calls such as `self:read(1)` could never equal a single token.
-- NOTE(review): assumes `self.head` is the current index into `self.text`
-- (it is reported as the position in main_handler's error) — confirm against
-- Module:parser.
-- @param delta offset from the head (defaults to 0)
-- @param keep_capital when truthy, return the token without lowercasing it
-- @return the token, or "" when there is none at that position
function Parser:read(delta, keep_capital)
	local text = self.text
	local this = text and text[self.head + (delta or 0)] or ""
	return keep_capital and this or lower(this)
end
local Node = m_parser.Node
local Wikitext = m_parser.Wikitext
--- Build an iterator over the string and syllable nodes of this tree.
-- Walks `self:__pairs("next")` and hands every plain string or node whose
-- `type` is "syllable" to a fresh proxy; other nodes are skipped.
-- @return proxy.iter, the proxy, and 0 (a generic-for triplet)
function Wikitext:iterate()
	local collector = self:new_proxy()
	for item, owner, slot in self:__pairs("next") do
		local wanted = type(item) == "string"
		if not wanted then
			wanted = item.type == "syllable"
		end
		if wanted then
			collector:build(item, owner, slot)
		end
	end
	return collector.iter, collector, 0
end
-- Node subclass created under the class name "syllable".
local Syllable = Node:new_class("syllable")
-- Expose Node.next_node under the name `next` on syllables.
Syllable.next = Node.next_node
--- Field lookup for syllables.
-- Class members are looked up first; otherwise a segment name is translated
-- to its numeric slot through `segment_to_num` and read raw from the instance.
-- Previously the unindexed `Syllable` table (always truthy) was returned for
-- every key.
-- @param k requested key
-- @return the stored value, with the empty-string placeholder reported as nil
function Syllable:__index(k)
	local ret = Syllable[k] or rawget(self, segment_to_num[k] or k)
	return ret ~= "" and ret or nil
end
--- Field assignment for syllables.
-- Segment names are stored in their numeric slot. Clearing a segment stores
-- "" rather than nil so the numeric slots stay contiguous for concat.
-- Other keys are stored raw under their own name.
-- Previously the whole `segment_to_num` table was used as the key, so every
-- write landed in a single slot.
-- @param k key being assigned
-- @param v new value (nil/false clears the field)
function Syllable:__newindex(k, v)
	local slot = segment_to_num[k]
	if slot then
		rawset(self, slot, v or "")
	else
		rawset(self, k, v or nil)
	end
end
--- String form of a syllable: numeric slots 1 through 6 joined with the
-- default separator.
function Syllable:__tostring()
	local first_slot, last_slot = 1, 6
	return concat(self, nil, first_slot, last_slot)
end
--- Sort the raw parsed pieces of a syllable into named segments.
-- Each element of `raw_syl` is classified as initial, glide1, nucleus,
-- glide2, nasal or erhua and stored on `self`; the tone defaults to 5.
-- Previously the loop classified `raw_syl` itself instead of its i-th element,
-- so every piece was filed as "initial".
-- NOTE(review): the three character classes below were restored from the
-- inline comments (iuüwy / aeê) and the surviving trailing bytes; the nasal
-- class has no comment and is a best guess — confirm against the upstream
-- module.
-- @param raw_syl parsed node; reads `raw_syl.len`, `raw_syl[i]`, `raw_syl.tone`
-- @return self
function Syllable:normalize_keys(raw_syl)
	local len = raw_syl.len
	local i = 0
	while i < len do
		i = i + 1
		local v = raw_syl[i]
		local key
		if v == "o" then
			-- "o" after a nucleus is an off-glide (as in "ao").
			key = self.nucleus and "glide2" or "nucleus"
		elseif match(v, "[iuwy\195]\188?$") then -- iuüwy (final char)
			key = (self.glide1 or self.nucleus) and "glide2" or "glide1"
		elseif match(v, "^[ae\195]\170?$") then -- aeê
			key = "nucleus"
		elseif match(v, "^[mn]") and (
			i > 1 or
			len == 1 or
			(len == 2 and match(raw_syl[2], "r"))
		) then
			-- A nasal is a coda unless it opens a longer syllable.
			key = "nasal"
		elseif i > 1 and match(v, "r") then
			key = "erhua"
		else
			key = "initial"
		end
		self[key] = v
	end
	self.tone = raw_syl.tone or 5
	return self
end
--- Reject segment combinations that are not valid pinyin.
-- Raises an error naming the offending pieces; otherwise returns self.
-- The first message used to concatenate `self.nucleus` even when it was nil,
-- which raised a concatenation error instead of the intended message; it now
-- tolerates missing pieces and includes the glide.
-- NOTE(review): the "" patterns and the pattern starting at "\170" look like
-- they lost their character classes. As written, "" matches any string, so
-- every initial without a glide is rejected — restore from upstream.
-- @return self
function Syllable:check_invalid()
	if (
		self.initial == "gn" and self.glide1 == "u" or
		not self.glide1 and (
			self.initial == "gn" or
			match(self.initial, "")
		)
	) then
		error(
			"'" .. self.initial .. (self.glide1 or "") .. (self.nucleus or "") ..
			"' is not valid in pinyin."
		)
	elseif (
		self.nucleus == "o" and self.glide2 == "i" or
		match(self.nucleus, "^\170?$") and match(self.glide2, "")
	) then
		error("'" .. self.nucleus .. self.glide2 .. "' is not valid in pinyin.")
	end
	return self
end
-- Canonicalise the first glide to one of "i", "u" or "ü".
-- With no glide present, "u" is inserted when the nucleus is "o", there is no
-- glide2 or nasal, and the initial matches the pattern on the line below.
-- Raises an error for glide spellings not covered by the branches.
-- Returns self.
-- NOTE(review): both "" patterns applied to self.initial match any string, and
-- "^y??\188?$" contains a doubled "?"; character classes appear to have been
-- lost from all three — restore from the upstream module.
function Syllable:normalize_glide1()
local glide1 = self.glide1
if not glide1 then
if (
self.nucleus == "o" and
not (self.glide2 or self.nasal) and
match(self.initial, "")
) then
self.glide1 = "u"
end
elseif match(glide1, "^y?i?$") then
self.glide1 = "i"
elseif match(glide1, "^w?u?$") then
-- Written "u" becomes "ü" when the initial matches the pattern.
self.glide1 = match(self.initial, "") and "ü" or "u"
elseif match(glide1, "^y??\188?$") then
self.glide1 = "ü"
else -- "wi" and "wü" are too weird to try to correct
error("'" .. glide1 .. "' is not valid in pinyin.")
end
return self
end
--- Rewrite an "e" nucleus as "ê" when an off-glide follows it, or when it
-- follows an on-glide without a nasal coda.
function Syllable:normalize_e_nucleus()
	local use_circumflex = self.glide2
	if not use_circumflex then
		use_circumflex = self.glide1 and not self.nasal
	end
	if use_circumflex then
		self.nucleus = "ê"
	end
end
-- Re-analyse an "o" nucleus that is followed by a nasal and has no glide2.
-- When glide1 matches the pattern the syllable becomes "ü" + "e"; otherwise,
-- when there is an initial or a glide1, it becomes "u" + "e". A bare nasal
-- syllable with neither is left untouched.
-- NOTE(review): "^\188?$" starts at a bare continuation byte, so its leading
-- character class looks lost — restore from the upstream module.
function Syllable:normalize_o_nucleus()
if self.glide2 or not self.nasal then
return
elseif match(self.glide1, "^\188?$") then
self.glide1 = "ü"
self.nucleus = "e"
elseif self.initial or self.glide1 then -- not "on(g)"
self.glide1 = "u"
self.nucleus = "e"
end
end
-- Supply the nucleus for a syllable that has a glide1 but no written nucleus
-- (Syllable:new only calls this in that situation).
-- glide2 + nasal: becomes "ü" + "e" and glide2 is dropped.
-- glide2 after "i": nucleus "o"; any other glide2: nucleus "ê".
-- nasal only: nucleus "e".
-- bare "i" after an initial matching the pattern: glide removed, nucleus "ɨ".
-- NOTE(review): the "" pattern matches any string initial; its character
-- class looks lost — restore from the upstream module.
function Syllable:normalize_implicit_nucleus()
if self.glide2 then
if self.nasal then
self.glide1 = "ü"
self.nucleus = "e"
self.glide2 = nil
elseif self.glide1 == "i" then
self.nucleus = "o"
else
self.nucleus = "ê"
end
elseif self.nasal then
self.nucleus = "e"
elseif self.glide1 == "i" and match(self.initial, "") then
self.glide1 = nil
self.nucleus = "ɨ"
end
end
--- Render this syllable with a converter.
-- Calls the converter's handler for each of the seven numeric slots in order,
-- passing the raw slot value, then joins the collected pieces.
-- Previously the converter table itself was called and `gsub` was invoked on
-- the output table, both of which raise errors.
-- NOTE(review): the capitalisation pattern was unreadable in the source; this
-- uppercases the first UTF-8 character of the first piece, skipping a leading
-- "'" piece — confirm against the upstream module.
-- @param funcs converter indexable by slot number 1..7
-- @return the converted syllable as a string
function Syllable:convert(funcs)
	local output = {}
	for i = 1, 7 do
		funcs[i](self, self[i], output)
	end
	if self.capitalize then
		local idx = output[1] == "'" and 2 or 1
		local piece = output[idx]
		if piece then
			-- Assignment keeps only gsub's first result.
			output[idx] = piece:gsub("^[%z\1-\127\194-\244][\128-\191]*", upper, 1)
		end
	end
	return concat(output)
end
--- Construct a normalised Syllable from a raw parsed node.
-- Runs key normalisation, validity checking and glide normalisation, then at
-- most one nucleus fix-up, and finally copies the capitalisation flag.
-- @param raw_syl raw node produced by the syllable parser
-- @return the new Syllable
function Syllable:new(raw_syl)
	local syl = setmetatable({"", "", "", "", "", ""}, Syllable)
	syl = syl:normalize_keys(raw_syl)
	syl = syl:check_invalid()
	syl = syl:normalize_glide1()

	local nucleus = syl.nucleus
	if syl.glide2 == "o" then
		syl.glide2 = "u"
	elseif nucleus == "e" then
		syl:normalize_e_nucleus()
	elseif nucleus == "o" then
		syl:normalize_o_nucleus()
	elseif syl.glide1 and not nucleus then
		syl:normalize_implicit_nucleus()
	end

	if raw_syl.capitalize then
		syl.capitalize = true
	end
	return syl
end
do
local tones = data.raw_tones
local handle_initial
local handle_glide1
local handle_nucleus
local handle_glide2
local handle_nasal
local handle_erhua
local handle_number
-- Parser handler for the optional initial consonant of a syllable.
-- Installs handle_glide1 as the next handler, then either defers via
-- self:consume() or emits the initial, first merging a following "h"/
-- circumflex, "n"+"g" or "g"+"n", and rewriting "ŋ" as "ng". A tone found in
-- the lookahead ends the syllable with self:pop().
-- NOTE(review): `tones` (data.raw_tones) is used bare in conditions and is
-- assigned to self.n.tone. If it is a table, those tests are always truthy and
-- the tone becomes the table itself; an index on a lookahead token appears to
-- be missing — confirm against the upstream module.
-- NOTE(review): the "" patterns match any string and the "^\139?$" /
-- "^\130?$" patterns start at bare continuation bytes, so their character
-- classes look lost. The first and second tests use the same pattern text.
function handle_initial(self, this)
self.n.handler = handle_glide1
if not match(this, "^\139?$") then -- bcdfghjklmnŋpqrstvxz
return self:consume()
end
local nxt = self:read(1)
if (
match(this, "^\139?$") and
(tones or match(nxt, ""))
) then
-- Nasal directly followed by a tone: hand over to the nasal handler.
self.n.handler = handle_nasal
return self:consume()
elseif match(this, "") and match(nxt, "^\130?$") then -- h + circumflex
self:advance()
this = this .. "h"
nxt = self:read(1)
elseif (
this == "n" and nxt == "g" or
(
this == "g" and nxt == "n" and
match(self:read(2), "^\139?$") -- aeimnŋou
) -- not *gng etc.
) then
self:advance()
this = this .. nxt
nxt = self:read(1)
elseif this == "ŋ" then
this = "ng"
end
self:emit(this)
if tones or match(nxt, "") then
self:advance()
-- A written "0" is stored as tone 5.
self.n.tone = tones or tonumber(nxt == "0" and 5 or nxt)
return self:pop()
end
end
-- Parser handler for the on-glide.
-- Installs handle_nucleus as the next handler. A token that fails the first
-- pattern is deferred via self:consume(). Otherwise the glide is emitted,
-- joining a following letter where the patterns allow and turning "u" plus a
-- combining diaeresis into "ü".
-- NOTE(review): all three "" patterns match any string, so as written the
-- first branch is always taken for string tokens and the "i"/"u" branches are
-- unreachable; character classes look lost.
-- NOTE(review): `tones` is used bare (see the final `if`); an index on `nxt`
-- appears to be missing — confirm against the upstream module.
function handle_glide1(self, this)
self.n.handler = handle_nucleus
if not match(this, "") then
return self:consume()
end
local nxt = self:read(1)
if match(this, "") then
if match(nxt, "") then
self:advance()
if nxt == "u" and self:read(1) == "\204\136" then -- diaeresis
this = this .. "ü"
self:advance()
else
this = this .. nxt
end
nxt = self:read(1)
end
self:emit(this)
elseif this == "i" then
self:emit(this)
elseif this == "u" then
if nxt == "\204\136" then -- diaeresis
this = "ü"
self:advance()
nxt = self:read(1)
end
self:emit(this)
end
if tones then
self:advance()
self.n.tone = tones
end
end
-- Parser handler for the nucleus vowel.
-- Installs handle_glide2 as the next handler. If a tone is already recorded
-- and another tone follows, it steps back one token and ends the syllable.
-- A token failing the pattern is deferred via self:consume(); otherwise the
-- vowel is emitted, with "e" plus a combining circumflex becoming "ê".
-- NOTE(review): `tones` is used bare throughout; indices on lookahead tokens
-- appear to be missing. Which token each one indexed cannot be told from
-- here — confirm against the upstream module.
-- NOTE(review): both "" patterns match any string; character classes look
-- lost.
function handle_nucleus(self, this)
self.n.handler = handle_glide2
local nxt = self:read(1)
if self.n.tone and (
tones or
nxt == "\204\130" and tones or -- circumflex
match(nxt, "")
) then
self:advance(-1)
return self:pop()
elseif not match(this, "") then
return self:consume()
elseif this == "e" and self:read(1) == "\204\130" then -- circumflex
this = "ê"
self:advance()
end
self:emit(this)
-- Re-read the lookahead, since the head may have moved past a circumflex.
nxt = self:read(1)
if tones then
self:advance()
self.n.tone = tones
end
end
-- Parser handler for the off-glide.
-- Installs handle_nasal as the next handler. The syllable is ended (after
-- stepping back one token) when a second tone follows, or when "i"/"u" would
-- repeat what self:emitted() already ends with. A token matching the pattern
-- is emitted; anything else is deferred via self:consume().
-- NOTE(review): `tones` is used bare; an index appears to be missing.
-- NOTE(review): the "" patterns match any string and "\188?$" matches the
-- empty string at the end of any string; character classes look lost —
-- confirm against the upstream module.
function handle_glide2(self, this)
self.n.handler = handle_nasal
local nxt = self:read(1)
if (
self.n.tone and (tones or match(nxt, "")) or
this == "i" and match(self:emitted(), "i$") or
this == "u" and match(self:emitted(), "\188?$") -- uü
) then
self:advance(-1)
return self:pop()
elseif match(this, "") then
self:emit(this)
else
return self:consume()
end
if tones then
self:advance()
self.n.tone = tones
end
end
-- Parser handler for the nasal coda.
-- Installs handle_erhua as the next handler. A token failing the first
-- pattern is deferred via self:consume(). When the checks on the emitted text
-- and the lookahead show the nasal belongs to the next syllable, it steps
-- back one token and ends this one. Otherwise the nasal is emitted, joining
-- "n"+"g" and rewriting "ŋ" as "ng".
-- NOTE(review): `tones` is used bare in several tests and in the tone
-- assignment; an index appears to be missing.
-- NOTE(review): the patterns "^", "", "^?$" and "^\139?$" look like they lost
-- their character classes. "^" and "" match any string, so as written the
-- first test is always true for string input — confirm against the upstream
-- module.
function handle_nasal(self, this)
self.n.handler = handle_erhua
if not match(this, "^\139?$") then
return self:consume()
end
local emitted = self:emitted()
local nxt = self:read(1)
if (
match(emitted, "^") or
match(nxt, "") or
(tones) and match(emitted, "^?$") or -- aeêiouü
self.n.tone and (tones or match(nxt, ""))
) then
self:advance(-1)
return self:pop()
elseif tones then
self:advance()
self.n.tone = tones
nxt = self:read(1)
end
if (
this == "n" and nxt == "g" and
not match(self:read(2), "^?$") -- aeiou + tones
) then
this = "ng"
self:advance()
elseif this == "ŋ" then
this = "ng"
end
self:emit(this)
end
-- Parser handler for the erhua suffix, written "r" or "'r".
-- Installs handle_number as the next handler. Tokens other than "r" or an
-- apostrophe followed by "r" are deferred via self:consume(). When the "r"
-- looks like the start of the next syllable, it steps back one token and ends
-- this one. Otherwise "r" or "'r" is emitted.
-- NOTE(review): `self.n ~= "e"` compares the node itself with a string and is
-- therefore always true, so every plain "r" is rewritten to "'r"; an index
-- (presumably the first emitted element) appears to be missing.
-- NOTE(review): `tones` is used bare, and the "" and "^?$" patterns look like
-- they lost their character classes — confirm against the upstream module.
function handle_erhua(self, this)
self.n.handler = handle_number
local nxt = self:read(1)
if this ~= "r" and (this ~= "'" or nxt ~= "r") then
return self:consume()
elseif (
self.n.tone and (tones or match(nxt, "")) or
match(
this == "'" and self:read(2) or nxt,
"^?$" -- aeiou + tones
)
) then
self:advance(-1)
return self:pop()
elseif this == "r" and (#self.n ~= 1 or self.n ~= "e") then
this = "'r"
elseif this == "'" and nxt == "r" then
this = "'r"
self:advance()
end
self:emit(this)
end
-- Parser handler for a trailing tone number; always ends the syllable.
-- A token failing the pattern is handed back (head moved back by one).
-- Otherwise it is stored as the tone, with a written "0" stored as 5.
-- NOTE(review): "^$" only matches the empty string, so as written no digit
-- can reach the tone assignment; a character class looks lost — confirm
-- against the upstream module.
function handle_number(self, this)
if not match(this, "^$") then
self:advance(-1)
return self:pop()
end
self.n.tone = tonumber(this == "0" and 5 or this)
return self:pop()
end
--- Route entry point for parsing one syllable.
-- Records whether the syllable is capitalised (true or nil) on the current
-- node and installs handle_initial as its handler, bypassing metamethods.
-- @param capitalize truthy when the syllable began with a capital letter
function Parser:do_syllable(capitalize)
	local node = self.n
	if capitalize then
		node.capitalize = true
	else
		node.capitalize = nil
	end
	rawset(node, "handler", handle_initial)
end
end
do
--- Override handler active right after a syllable separator.
-- Further separators (space, hyphen, apostrophe) are swallowed; any other
-- token clears the override and is passed on through self:consume().
local function handle_syllable_break(self, this)
	if this == " " or this == "-" or this == "'" then
		return
	end
	self.n.override = nil
	return self:consume()
end
--- Top-level parser handler: dispatches on the current token.
-- A token matching the first pattern starts a syllable sub-parse via
-- `self:get("do_syllable", ...)`, and the resulting Syllable is emitted.
-- Space and hyphen are emitted and re-enable capitalisation; space, hyphen
-- and apostrophe install handle_syllable_break as an override. The empty
-- token ends parsing; anything else raises an error giving `self.head` as the
-- position.
-- The stray global assignment `a = true` (debug residue) has been removed.
-- NOTE(review): "^\139?$" and "^\138?$" start at bare continuation bytes, so
-- their leading character classes look lost — confirm against the upstream
-- module.
local function main_handler(self, this)
	if match(this, "^\139?$") then
		self:emit(Syllable:new(self:get(
			"do_syllable",
			-- Re-read the current token without lowercasing to detect a capital.
			self.n.allow_capital and match(self:read(0, true), "^\138?$")
		)))
		self.n.allow_capital = nil
	elseif this == " " or this == "-" then
		self:emit(this)
		self.n.allow_capital = true
		self.n.override = handle_syllable_break
	elseif this == "'" then
		self.n.override = handle_syllable_break
	elseif this == "" then
		return self:pop()
	else
		error("Invalid character (" .. this .. ") at position " .. self.head .. ".")
	end
end
--- Route entry point for parsing a whole input.
-- Allows the first syllable to be capitalised and installs main_handler on
-- the current node, bypassing metamethods.
function Parser:do_parse()
	local node = self.n
	node.allow_capital = true
	rawset(node, "handler", main_handler)
end
--- Parse pinyin text into a tree of syllables and separators.
-- The text is decomposed (NFD), exploded into UTF-8 characters and fed to the
-- parser along the "do_parse" route.
-- @param text input string
-- @return the second value returned by Parser:parse
function export.normalize(text)
	local _, tree = Parser:parse{
		text = explode(toNFD(text)),
		node = {Wikitext, true},
		route = {"do_parse"}
	}
	return tree
end
end
--- Placeholder for whole-text conversion.
-- Resets `self.output` and walks the iterator without acting on its values;
-- `funcs` is not used yet and nothing is returned.
function Wikitext:convert(funcs)
	self.output = {}
	for _ in self:iterate() do
		-- TODO: iteration(syl, prev)
	end
	-- TODO: return output
end
-- Base class for output converters. A converter is indexed by slot number;
-- the number is translated to a segment name through `num_to_segment` and the
-- handler stored under that name is returned.
local Converter = {}
do
	--- Default handler: copy a non-empty segment to the output unchanged.
	local function no_op(self, this, output)
		if this ~= "" then
			insert(output, this)
		end
	end
	--- Resolve a missing key to the handler for the corresponding segment.
	-- Previously the whole `num_to_segment` table was used as the lookup key,
	-- so every numeric lookup fell through to `no_op`.
	-- @param k slot number (or any other key)
	-- @return the handler, or `no_op` when none is defined
	function Converter:__index(k)
		local segment = num_to_segment[k]
		return segment ~= nil and rawget(self, segment) or no_op
	end
end
--- Create an empty converter whose lookups are handled by Converter.
function Converter:new()
	local instance = {}
	return setmetatable(instance, Converter)
end
do
local pinyin = Converter:new()
local tones = data.pinyin_tones
local tone_priority = data.pinyin_tone_priority
-- Pinyin spelling of the on-glide.
-- Before "e" + "ng" with no glide2: "u" after an initial is not written and
-- "ü" is written "i". With no initial the glide is spelled "w" or "y",
-- followed by "u" for "ü", or by the glide itself when nothing else follows.
-- After an initial, "u" before a bare "o" is dropped and "ü" is written "u"
-- when the initial matches the respective patterns.
-- NOTE(review): both "" patterns applied to self.initial match any string;
-- their character classes look lost — confirm against the upstream module.
function pinyin.glide1(self, this, output)
if this == "" then
return
elseif self.nucleus == "e" and not self.glide2 and self.nasal == "ng" then
if this == "u" and self.initial then
return
elseif this == "ü" then
this = "i"
end
end
if not self.initial then
insert(output, this == "u" and "w" or "y")
if this == "ü" then
insert(output, "u")
elseif not (self.nucleus or self.glide2 or self.nasal) then
-- The glide is the whole rime, so it is written out after y/w.
insert(output, this)
end
return
elseif (
this == "u" and
self.nucleus == "o" and
not (self.glide2 or self.nasal) and
match(self.initial, "")
) then
return
elseif this == "ü" and match(self.initial, "") then
this = "u"
end
insert(output, this)
end
--- Pinyin spelling of the nucleus.
-- Depending on the surrounding segments the vowel is written as is,
-- respelled, or left out of the output entirely.
-- @param self the Syllable being converted
-- @param this nucleus value ("" when absent)
-- @param output list of output pieces, appended to in place
function pinyin.nucleus(self, this, output)
	if this == "" then
		return
	end
	local initial, glide1 = self.initial, self.glide1
	local glide2, nasal = self.glide2, self.nasal
	local written = this

	if this == "e" and glide1 and not glide2 and nasal then
		-- "e" between an on-glide and a nasal coda.
		if glide1 == "u" and not initial then
			written = "e"
		elseif nasal == "ng" and (glide1 == "u" or glide1 == "ü") then
			written = "o"
		elseif glide1 == "i" and not initial and not glide2 and nasal then
			written = "i"
		else
			return
		end
	elseif this == "ê" then
		local rounded_glide = glide1 == "u" or glide1 == "ü"
		if (
			(initial or glide1 == "ü") and
			glide2 == "i" and
			not nasal and
			rounded_glide
		) then
			return
		elseif glide2 or (glide1 and not nasal) then
			written = "e"
		end
	elseif this == "o" then
		if initial and glide1 == "i" and glide2 == "u" and not nasal then
			return
		end
	elseif this == "ɨ" then
		written = "i"
	end

	insert(output, written)
end
--- Pinyin spelling of the off-glide: "u" after an "a" nucleus is written "o";
-- everything else is copied as is. An empty glide produces no output.
function pinyin.glide2(self, this, output)
	if this ~= "" then
		local written = this
		if this == "u" and self.nucleus == "a" then
			written = "o"
		end
		insert(output, written)
	end
end
--- Pinyin spelling of the erhua suffix.
-- "'r" keeps its apostrophe only when the rime is a bare "e" (no glides and
-- no nasal); in every other case it is written "r".
function pinyin.erhua(self, this, output)
	if this == "" then
		return
	end
	local written = this
	if this == "'r" then
		local bare_e = (
			not self.glide1 and
			self.nucleus == "e" and
			not self.glide2 and
			not self.nasal
		)
		if not bare_e then
			written = "r"
		end
	end
	insert(output, written)
end
--- Attach the tone mark and decide whether a leading apostrophe is needed.
-- The piece with the highest `tone_priority` score carries the mark (the last
-- such piece on ties); the mark is placed after that piece's first UTF-8
-- character. An apostrophe piece is prepended when the marked piece is the
-- first one, or when the syllable begins with "ng" or "gn".
-- Fixes: the priority lookup was a syntax error (`tone_priority] or 0`),
-- `gsub` was called on the output table, and `tones` was used unindexed.
-- NOTE(review): the first-character pattern and the `output[1]` / `tones[this]`
-- indices are reconstructions — confirm against the upstream module.
-- @param self the Syllable being converted
-- @param this tone value for the syllable
-- @param output list of output pieces, modified in place
function pinyin.tone(self, this, output)
	local best, pos = 0
	for i = 1, #output do
		local score = tone_priority[output[i]] or 0
		if score >= best then
			best = score
			pos = i
		end
	end
	if not pos then
		-- Nothing was emitted, so there is nothing to mark.
		return
	end
	-- Assignment keeps only gsub's first result.
	output[pos] = output[pos]:gsub(
		"^[%z\1-\127\194-\244][\128-\191]*",
		"%0" .. (tones[this] or "")
	)
	if pos == 1 or output[1] == "ng" or output[1] == "gn" then
		insert(output, 1, "'")
	end
end
--- Append one item of the parsed text to the pinyin output.
-- Strings are copied through. Syllables are converted, and a leading
-- apostrophe is removed when the syllable starts the text or follows a space
-- or hyphen.
-- @param syl string separator or Syllable
-- @param output list of output pieces, appended to in place
-- @param prev the previous item, or nil at the start
local function iteration(syl, output, prev)
	local text = syl
	if type(syl) ~= "string" then
		text = syl:convert(pinyin)
		local at_word_start = not prev or prev == " " or prev == "-"
		if at_word_start and text:sub(1, 1) == "'" then
			text = text:sub(2)
		end
	end
	insert(output, text)
end
--- Convert a parsed text to pinyin.
-- NOTE(review): Wikitext:convert iterates with `for i, syl, proxy in ...`,
-- which suggests the iterator's first value may be an index rather than the
-- item — confirm that the single loop variable here receives the item.
-- @param text parsed tree providing `:iterate()`
-- @return the NFC-normalised pinyin string
function export.pinyin(text)
	local output = {}
	local prev
	for syl in text:iterate() do
		iteration(syl, output, prev)
		prev = syl
	end
	local joined = concat(output)
	return toNFC(joined)
end
end
do
local zhuyin = Converter:new()
local letters = data.zhuyin_letters
local compounds = data.zhuyin_compounds
local tones = data.zhuyin_tones
--- Zhuyin letter for an initial (also used for the nucleus).
-- Previously the whole `letters` table was inserted instead of the entry for
-- `this`, which makes the final concat fail.
-- @param self the Syllable being converted
-- @param this segment value ("" when absent)
-- @param output list of output pieces, appended to in place
function zhuyin.initial(self, this, output)
	if this == "" then
		return
	end
	local letter = letters[this]
	if letter then
		insert(output, letter)
	end
end
--- Zhuyin letter for the on-glide.
-- A "u" before a bare "o" nucleus is not written when the initial matches the
-- pattern.
-- Previously the whole `letters` table was inserted instead of the entry for
-- `this`.
-- NOTE(review): the "" pattern matches any string initial; its character
-- class looks lost — confirm against the upstream module.
function zhuyin.glide1(self, this, output)
	if this == "" or (
		this == "u" and
		self.nucleus == "o" and
		not (self.glide2 or self.nasal) and
		match(self.initial, "")
	) then
		return
	end
	local letter = letters[this]
	if letter then
		insert(output, letter)
	end
end
zhuyin.nucleus = zhuyin.initial
-- Zhuyin output for the off-glide (also used for the nasal).
-- The visible intent is to look up the letter, try to merge it with the
-- previously emitted piece through `compounds`, and otherwise append it.
-- NOTE(review): index expressions appear to be missing throughout. `this` is
-- set to the whole `letters` table, `prev` to the whole `output` table and
-- `compound` to the whole `compounds` table, and `output = compound` only
-- rebinds the parameter. Which keys were used (for example the last output
-- piece, or a prev/this pair) cannot be told from here — restore from the
-- upstream module.
function zhuyin.glide2(self, this, output)
if this == "" then
return
end
this = letters
local prev = output
if prev then
local compound = compounds
if compound and compound ~= this then
output = compound
return
end
end
insert(output, this)
end
zhuyin.nasal = zhuyin.glide2
--- Zhuyin output for the erhua suffix.
-- A plain "r" turns the vowel already emitted into "ㄦ"; "'r" appends "ㄦ".
-- Previously the plain "r" branch assigned to the `output` parameter, which
-- has no effect on the caller's table.
-- NOTE(review): replacing the last emitted piece is a reconstruction of the
-- lost index — confirm against the upstream module.
function zhuyin.erhua(self, this, output)
	if this == "r" then
		local last = #output
		output[last > 0 and last or 1] = "ㄦ"
	elseif this == "'r" then
		insert(output, "ㄦ")
	end
end
--- Zhuyin tone mark.
-- Tone 5 is written as a leading "˙". Other tones are looked up in `tones`
-- and placed before the final piece when the syllable ends in an appended
-- erhua ("'r"), or at the very end otherwise.
-- Previously the whole `tones` table was inserted. A tone without a mark is
-- now skipped, so a positional insert cannot leave a hole in `output`.
-- @param self the Syllable being converted
-- @param this tone value for the syllable
-- @param output list of output pieces, modified in place
function zhuyin.tone(self, this, output)
	if this == 5 then
		insert(output, 1, "˙")
		return
	end
	local mark = tones[this]
	if not mark then
		return
	end
	if self.erhua == "'r" then
		insert(output, #output, mark)
	else
		insert(output, mark)
	end
end
--- Append one item of the parsed text to the zhuyin output.
-- Strings are copied through. A syllable that converts to a bare "ㄦ" gets an
-- explicit "ˉ" when the previous syllable has tone 1 and no erhua.
-- `prev` is nil for the first item; it is now checked before being indexed,
-- which previously raised an error when the text began with such a syllable.
-- @param syl string separator or Syllable
-- @param output list of output pieces, appended to in place
-- @param prev the previous item, or nil at the start
local function iteration(syl, output, prev)
	if type(syl) == "string" then
		insert(output, syl)
		return
	end
	syl = syl:convert(zhuyin)
	if (
		syl == "ㄦ" and
		type(prev) == "table" and
		prev.tone == 1 and
		not prev.erhua
	) then
		syl = "ㄦˉ"
	end
	insert(output, syl)
end
--- Convert a parsed text to zhuyin.
-- NOTE(review): Wikitext:convert iterates with `for i, syl, proxy in ...`,
-- which suggests the iterator's first value may be an index rather than the
-- item — confirm that the single loop variable here receives the item.
-- @param text parsed tree providing `:iterate()`
-- @return the NFC-normalised zhuyin string
function export.zhuyin(text)
	local output = {}
	local prev
	for syl in text:iterate() do
		iteration(syl, output, prev)
		prev = syl
	end
	local joined = concat(output)
	return toNFC(joined)
end
end
return export