local export = {}
local data = mw.loadData("Module:nan-pron/data")
local m_str_utils = require("Module:string utilities")
local find = m_str_utils.find
local gsplit = m_str_utils.gsplit
local gsub = m_str_utils.gsub
local sub = m_str_utils.sub
local match = m_str_utils.match
local len = m_str_utils.len
local lower = m_str_utils.lower
local split = m_str_utils.split
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- We use this table to encode digraphs and other multi-byte sequences.
-- Capitalization is encoded as a preceding "^", e.g. Tân → ^ta5n
-- Special treatments for digraphs: the diacritic that goes in between is
-- moved to the end, e.g. ńg → G2, ó͘ → O2, ó̤ → o_2, ṳ́ → u_2
--
-- The values are the single-character internal codes: upper-case letters for
-- consonant digraphs, digits (and "&") for tone diacritics, "_" and "O" for
-- the remaining marks.
-- NOTE(review): every entry after the first row has lost its key (the
-- bracketed `["…"]` part is missing), so the table is not valid Lua as
-- shown. The trailing comments indicate which diacritic each code stood for;
-- restore the keys from the upstream module.
local encoding = {
ph = "P", th = "T", kh = "K", ng = "G", ch = "c", chh = "C", sh = "S",
= "E", = "N",
= "2", -- á
= "3", -- à
= "5", -- â
= "6", -- ã (POJ T6)
= "&", -- ǎ (TL T6)
= "7", -- ā
= "8", -- a̍
= "9", -- ă (POJ T9)
= "0", -- a̋ (TL T9)
= "_",
= "O", -- o͘
}
-- Inverse of `encoding`: maps every internal code back to the text it stands
-- for. It is used as the replacement table by do_decode() and decode().
-- The loop must index the table by the code; assigning to `decoding` itself
-- would replace the table with a string on the first iteration.
local decoding = {}
for original, code in pairs(encoding) do
	decoding[code] = original
end
-- Encode a romanised string into the internal one-code-per-unit form.
-- e.g. Tân → ^ta5n
-- The text is first decomposed (NFD) so that diacritics become separate
-- combining characters, then run through a chain of byte-level string.gsub
-- substitutions that use `encoding` as the replacement table.
-- "\205\152" is the UTF-8 byte sequence of U+0358 (the dot of o͘).
-- NOTE(review): several patterns below are empty or contain a bare "(?)";
-- their character classes appear to have been lost. Judging from the example
-- above, the first substitution is meant to turn a capital letter into "^" plus
-- its lower-case form — confirm against the upstream module.
function export.do_encode(text)
text = toNFD(text)
:gsub("",function(c) return "^"..c:lower() end)
:gsub("chh",encoding)
:gsub("ⁿ",encoding)
:gsub("",encoding)
:gsub("n(?)g","G%1")
:gsub("o(?)\205\152","O%1")
return text
end
-- Inverse of do_encode: turn an internal string back into composed text.
-- e.g. ^ta5n → Tân
-- Steps: re-expand "G" to "n…g" around its captured mark, replace codes via
-- the `decoding` table, upper-case the character following each "^", and
-- finally recompose with NFC.
-- NOTE(review): the "(?)" capture and the empty pattern have lost their
-- character classes — restore before use.
function export.do_decode(text)
text = text:gsub("G(?)","n%1g")
:gsub("",decoding)
:gsub("%^(.)",string.upper)
return toNFC(text)
end
-- simpler version that ignores numbers or uppercase
-- Only applies the `decoding` table and NFC; it is used when building the
-- error messages in internal_to_poj / internal_to_tl.
-- NOTE(review): the gsub pattern is empty here (character class lost).
local function decode(text)
return toNFC(text:gsub("",decoding))
end
-- convert to an internal representation that uses the encoding above
-- and also place the tone at the end
-- and splits 4 and 8 into 4A 4B 8A 8B
-- e.g. Khóng-chú → ^KoG2-cu2
--
-- Parameters:
--   text             POJ input; must not already contain "^", which is
--                    reserved as the capitalisation marker.
--   check_diminutive when truthy, a syllable that encodes to "a2" raises an
--                    error asking for 仔 instead.
-- Returns the encoded text with each syllable rewritten as
-- <syllable without tone mark><tone>.
-- Raises on a literal "^" in the input, on a disallowed diminutive, and when a
-- "^" is found anywhere after the first character of a syllable.
-- NOTE(review): the syllable pattern, the tone-matching patterns, the lookup
-- into `checked_category` and the inline {…="6T", …="9T"} table have lost
-- their bracketed parts. Presumably `category` is looked up from the
-- syllable's final consonant (p/t/k → "A", h → "B") — confirm upstream.
function export.poj_to_internal(text, check_diminutive)
if text:find("^",1,true) then
error("Hokkien: Invalid character found.")
end
text = export.do_encode(text)
local checked_category = {p="A",t="A",k="A",h="B"}
text = text:gsub("+",function(syl)
if check_diminutive and syl == "a2" then
error("The diminutive should be specified with 仔.")
end
-- the diminutive placeholder is passed through untouched
if syl == "仔" then
return syl
end
local tone = syl:match("")
local detone = syl:gsub("","")
local category = checked_category
if not tone then
-- no tone mark: tone 4 (with its A/B category) when there is one, else tone 1
tone = category and ("4"..category) or "1"
elseif tone == "8" then
tone = "8" .. category
else
tone = ({="6T",="9T"}) or tone
end
-- plain search for "^" starting at position 2
if detone:find("^",2,true) then
error("Only the first letter in a syllable may be capitalised.")
end
return detone..tone
end)
return text
end
-- Priority list (in internal codes) of the letter that receives the tone mark:
-- internal_to_poj scans this string left to right and uses the first
-- character that occurs in the final.
local tone_placement_order = "aOoEeuiymG"
-- Finals whose tone position does not follow the priority list; the value is
-- the 1-based position in the final after which the tone code is inserted.
local tone_placement_exceptions = {
ioa=2, ioaN=2,
oa=1, oaN=1, oeh=2, oehN=2
}
-- convert the internal representation as described above back to POJ
-- this function determines the canonical tone placement
--
-- Each syllable is split into initial `i`, final `f` and tone `t`. Tones "1"
-- and "4" carry no diacritic, so the syllable is returned as i..f. Otherwise
-- the tone code is inserted into the final right after position `idx`, and
-- the whole string is passed through do_decode at the end.
-- Raises "Syllable decomposition failed" when a syllable other than 仔 does
-- not match, and "Tone placement failed" when no position can be found.
-- NOTE(review): the syllable patterns, the {…="&", …="0"} remap and the
-- lookup into tone_placement_exceptions have lost their bracketed parts;
-- presumably the exceptions table is indexed by the final `f` — confirm.
function export.internal_to_poj(text)
text = text:gsub("+",function(syl)
local i,f,t = syl:match("^(?)(?*h??)(%dT?)?$")
if not i then
if syl == "仔" then
return syl
end
error("Syllable decomposition failed: " .. decode(syl))
end
if t == "1" or t == "4" then
return i..f
else
t = ({="&", ="0"}) or t
end
-- tone placement rule: a>O>o>E>e>u>i>y>m>G, exceptions specified above
local idx = tone_placement_exceptions
if not idx then
for j=1,#tone_placement_order do
idx = f:find(tone_placement_order:sub(j,j),1,true)
if idx then break end
end
end
if not idx then
error("Tone placement failed: " .. decode(syl))
end
return i..f:sub(1,idx)..t..f:sub(idx+1)
end)
return export.do_decode(text)
end
-- Validate a "/"-separated list of POJ readings and convert each one to the
-- internal representation.
--
-- Each reading may be prefixed with "<locations>:"; the prefix is split off
-- and stored separately. A reading is accepted only if converting it to the
-- internal form and back (poj_to_internal → internal_to_poj) reproduces it
-- exactly, i.e. it is already in canonical POJ.
--
-- Parameters:
--   text  string of readings, or nil.
-- Returns nil when `text` is nil; otherwise three parallel tables indexed by
-- reading number:
--   reading   the readings with any location prefix removed
--   internal  their internal representations
--   loc       the location prefix of each reading (nil where none was given)
-- Raises an error naming the normalised spelling when a reading is not
-- canonical, plus whatever poj_to_internal / internal_to_poj raise.
function export.poj_check_invalid(text)
	if not text then
		return nil
	end
	local title = mw.title.getCurrentTitle().text
	-- pages whose title contains 子 or 仔 (except 明仔早) must spell the
	-- diminutive with 仔
	local check_diminutive = (title:find("子") or title:find("仔")) and title ~= "明仔早"
	local reading = mw.text.split(text, "/", true)
	local internal, loc = {}, {}
	for i = 1, #reading do
		-- every access is per reading; the tables themselves have no string
		-- methods and must not be overwritten
		local colon = reading[i]:find(":", 1, true)
		if colon then
			loc[i], reading[i] = reading[i]:sub(1, colon - 1), reading[i]:sub(colon + 1)
		end
		internal[i] = export.poj_to_internal(reading[i], check_diminutive)
		local normalized = export.internal_to_poj(internal[i])
		if reading[i] ~= normalized then
			error("Invalid POJ input \"" .. reading[i] .. "\": please change it to \"" .. normalized .. "\"")
		end
	end
	return reading, internal, loc
end
-- Return true when every syllable of the internal-form `text` is canonical,
-- false as soon as one syllable fails the check. A syllable fails when
-- data.poj.canonical_final rejects it or when its tone is "6" or "9".
-- NOTE(review): the gmatch/match patterns and the index into
-- data.poj.canonical_final (presumably the final `f`) have been lost —
-- confirm against the upstream module.
local function check_canonical_POJ(text)
for syl in text:gmatch("+") do
local i,f,t = syl:match("^(?)(?*h??)(%d)?$")
if not data.poj.canonical_final or t=="6" or t=="9" then
return false
end
end
return true
end
-- Produce the display form of a single reading: 仔 is shown as "á" and the
-- "#" boundary markers are removed. When the internal form passes
-- check_canonical_POJ the reading is replaced by a different string.
-- NOTE(review): that replacement string is truncated to "]" here; it looks
-- like the remains of wiki-link markup around `reading` — restore upstream.
function export.poj_display_one(reading, internal)
reading = reading:gsub("仔", "á"):gsub("#", "")
if check_canonical_POJ(internal) then
reading = "]"
end
return reading
end
-- Format a list of readings for display, joined with " / ".
--
-- Parameters:
--   readings   sequence of POJ readings
--   internals  sequence of their internal representations, parallel to
--              `readings`
-- Returns the display strings from poj_display_one joined by " / ".
-- Each element is converted individually (poj_display_one works on one
-- string), and the results are collected in a fresh table so the caller's
-- `readings` is left unmodified.
function export.poj_display(readings, internals)
	local displayed = {}
	for i = 1, #readings do
		displayed[i] = export.poj_display_one(readings[i], internals[i])
	end
	return table.concat(displayed, " / ")
end
-- Internal-code → Tâi-lô replacements used by internal_to_tl.
-- "\204\140" is the UTF-8 encoding of U+030C (combining caron) and
-- "\204\139" that of U+030B (combining double acute).
-- NOTE(review): the keys of the last two entries are missing; per their
-- comments they presumably are the POJ tone-6 and tone-9 codes — confirm.
local tl_conv = {
O="oo", u_="ir", o_="er", E="ee", hN="nnh",
c="ts", C="tsh",
="\204\140", -- ã → ǎ
="\204\139", -- ă → a̋
}
-- Tone positions that override the general TL placement rule; the value is
-- used as `idx` (insert the tone after that position) in internal_to_tl.
local tl_tone_placement_exceptions = {
ere=3
}
-- Convert the internal representation to Tâi-lô (TL).
-- The diminutive 仔 becomes "a2", "#" markers are dropped, then a series of
-- substitutions respells vowels and nasalisation, after which each syllable
-- gets its tone code inserted (tones "1" and "4" carry none) and the result
-- is decoded with do_decode.
-- Raises "Syllable decomposition failed" / "Tone placement failed" with the
-- decoded syllable when a syllable cannot be processed.
-- NOTE(review): most patterns here contain empty "()" captures or are empty
-- strings, and the lookups into tl_conv / tl_tone_placement_exceptions have
-- no index; the character classes and index expressions were lost. The
-- trailing comments on each line record the intended rewrite.
function export.internal_to_tl(text)
text = text:gsub("仔","a2")
:gsub("#","")
:gsub("e()","i%1") -- eng/ek → ing/ik
:gsub("o()","u%1") -- oa/oe/oɛ → ua/ue/uɛ
:gsub("(h?)N", "nn%1") -- (h)ⁿ → nn(h)
:gsub("_?", tl_conv) -- ɛ/o͘/ṳ/o̤ → ee/oo/ir/er
-- place tones
:gsub("+",function(syl)
local i,f,t = syl:match("^(?)(*)(%d)?$")
if not i then
error("Syllable decomposition failed: " .. decode(syl))
end
if t == "1" or t == "4" then -- no tone diacritic needed
return i..f
end
t = tl_conv or t
-- tone placement rule: if i or u occurs before other vowel, put on the other vowel
local idx = tl_tone_placement_exceptions
or f:match("^?u?()")
or f:match("^()")
if not idx then
error("Tone placement failed: " .. decode(syl))
end
return i..f:sub(1,idx)..t..f:sub(idx+1)
end)
:gsub("", tl_conv) -- ch → ts
return export.do_decode(text)
end
-- Check an initial/final pair against the lists of valid POJ initials and
-- finals for location `loc`. Returns a marker string when the pair is not
-- recognised, and nil otherwise (the result is collected into `attention`
-- by generate_IPA).
-- NOTE(review): validInitials, moreValidInitials, validFinals and
-- moreValidFinals are not defined anywhere in the visible file, their index
-- expressions are missing, and the returned string literal is truncated
-- (unbalanced quotes) — this block must be restored from upstream.
function export.poj_check_syllable(initial, final, loc)
if not ((validInitials or moreValidInitials) and (validFinals or moreValidFinals)) then
--error("The syllable " .. initial .. "+" .. final .. " does not appear to be a valid " .. loc .. " POJ syllable.")
return " .. "]]"
end
return nil
end
-- Placeholder for converting the internal representation to PSDB.
-- Currently returns its argument unchanged.
function export.internal_to_psdb(text)
-- TODO
return text
end
-- Placeholder for converting the internal representation to IPA for the
-- location `loc`. Currently returns a debugging string built from the input
-- text and a tone value read from the data module.
-- NOTE(review): `loc` is unused as written; the expression data.IPA.tone
-- has probably lost one or more index expressions — confirm upstream.
function export.internal_to_IPA(text,loc)
-- TODO
return 'IPA '..text.." "..data.IPA.tone
end
-- Locations returned by parse_locations when no location string is given:
-- the first list is for the header, the second for the IPA lines.
local default_location_list = { "Xiamen", "Quanzhou", "Zhangzhou", "Taiwan" }
local default_IPA_location_list = { "Xiamen", "Quanzhou", "Zhangzhou", "Taipei", "Kaohsiung" }
-- Header locations that expand to a different set of IPA locations.
-- NOTE(review): the keys of this table are missing (bracketed part lost).
local loc_overrides = {
= { 'Taipei', 'Kaohsiung' },
= { 'Xiamen-d' },
= { 'Xiamen', 'Quanzhou', 'Zhangzhou' }
}
-- returns a list of locations for the header, and a list of locations for the IPA
-- these generally coïncide except for the overrides defined above, or
-- when it is tagged with -d (dated) etc.
--
-- Parameters:
--   loc  comma-separated location codes, or nil for the defaults.
-- Returns two sequences: header locations and IPA locations. For a location
-- with an entry in loc_overrides, every override entry is added to the IPA
-- list; otherwise the location itself is added with a trailing "-d" removed.
-- Raises on a code that has an "invalid code" hint in the data module and on
-- a code that has no location name.
-- NOTE(review): the lookups into data.loc.invalid_code_hint, data.loc.code
-- and loc_overrides carry no index, the replacement table of the "Taiwan"
-- gsub has no keys and its pattern has a bare "(?)", and the second error
-- message ends in a truncated link — all bracketed parts were lost.
function export.parse_locations(loc)
if not loc then
return default_location_list, default_IPA_location_list
end
local locations, IPA_locations = {},{}
for location_abbrev in mw.text.gsplit(loc, ",", true) do
if data.loc.invalid_code_hint then
error("Invalid Hokkien location code: " .. location_abbrev .. ", maybe you meant: " .. data.loc.invalid_code_hint)
end
local loc_name = data.loc.code
if not loc_name then
error("The region label '" .. location_abbrev .. "' cannot be found. Please see ].")
end
table.insert(locations, loc_name)
loc_name = gsub(loc_name, '^Taiwan%-?(?)$', { = 'Taipei', = 'Kaohsiung', = 'Taiwan' })
if loc_overrides then
for _,IPA_loc in ipairs(loc_overrides) do
table.insert(IPA_locations, IPA_loc)
end
else
loc_name = gsub(loc_name, '%-d$', '')
table.insert(IPA_locations, loc_name)
end
end
return locations, IPA_locations
end
-- used by ] and ]
--
-- Build the wikitext for all romanisations of a Hokkien entry.
-- `text` is either the reading string itself or a table with an `args`
-- field, from which the reading string and a second value (`nan_pronunc`)
-- are taken. The readings are validated with poj_check_invalid; for each one
-- a POJ line and a TL line are appended through the local `fmt` helper, and
-- the concatenated lines are returned.
-- When the text contains no "--", PSDB and per-location IPA values are also
-- computed, but in this version they are only written to the debug log via
-- mw.logObject, not to the output.
-- Everything after the `return` is legacy code kept inside a long comment.
-- NOTE(review): index expressions are missing throughout (text.args,
-- data.boilerplate, loc/poj/internal inside the loop, the `ipa` grouping
-- table, data.loc.IPA_available); `backwards_compatibility`, `display` and
-- `tl` are assigned or declared but never used in the visible code.
function export.generate_all(text)
local nan_pronunc
if type(text) == "table" then
text, nan_pronunc = text.args, text.args
end
local output_text = {}
-- line prefix: a bare newline when nan_pronunc is a non-empty value,
-- otherwise a newline followed by a list bullet
local prefix = (nan_pronunc and nan_pronunc ~= "") and "\n" or "\n*"
local function fmt(item, content) --formatting
table.insert(output_text,
prefix .. data.boilerplate.leading
.. (content or "") .. data.boilerplate.trailing)
end
-- e.g. poj={"koe","ke"}, internal={"koe1","ke1"}, loc={"xm,qz","zz"}
local poj,internal,loc = export.poj_check_invalid(text)
local locations, IPA_locations, display, tl, psdb, ipa
local backwards_compatibility = not text:find(":",1,true)
for i=1,#poj do
locations, IPA_locations = export.parse_locations(loc)
fmt("POJ", export.poj_display_one(poj,internal))
fmt("TL", export.internal_to_tl(internal))
if not find(text, "%-%-") then
psdb = export.internal_to_psdb(internal)
ipa = {} -- store the generated ipa AND which lects have which ipa
for _,IPA_location in ipairs(IPA_locations) do
if data.loc.IPA_available then
local generated_IPA = export.internal_to_IPA(internal,IPA_location)
if not ipa then
table.insert(ipa, generated_IPA)
ipa = {}
end
IPA_location = IPA_location:gsub("%-d$","")
table.insert(ipa, IPA_location)
end
end
end
mw.logObject({poj=poj,int=internal,loc=loc,loca=locations,ipaloc=IPA_locations,dis=display,tl=tl,psdb=psdb,ipa=ipa})
end
return table.concat(output_text)
--[==[
if not find(text, ":") then
table.insert(output_text, fmt("LV2")
.. fmt("POJ", export.poj_display(poj,internal))
.. fmt("TL", export.poj_to_tl_conv(text)))
if not find(text, "%-%-") then
local psdb_hash = export.poj_to_psdb_conv(text)
if not find(psdb_hash, "error") then
table.insert(output_text, formatting.PSDB.leading .. psdb_hash .. formatting.PSDB.trailing)
end
for _, IPA_location in ipairs(IPA_available_list) do
IPA_location = IPA_location == "Taiwan" and { "Taipei", "Kaohsiung" } or { IPA_location }
for _, location in ipairs(IPA_location) do
table.insert(output_text, formatting.IPA.leading .. location_link .. formatting.IPA.trailing)
local reading_IPA_hash = {}
for poj_reading in gsplit(text, "/", true) do
table.insert(reading_IPA_hash, export.generate_IPA(poj_reading, location))
end
table.insert(output_text, table.concat(reading_IPA_hash, ", "))
if #reading_IPA_hash > 1 then
table.insert(output_text, string.format("]", #reading_IPA_hash))
end
end
end
end
else
for i, poj_reading in ipairs(all_readings) do
table.insert(output_text, formatting.LV_two.leading)
local location_hash = {}
for _, location_name in ipairs(locations) do
table.insert(location_hash, location_link)
end
table.insert(output_text, ": " .. table.concat(location_hash, ", ") .. formatting.LV_two.trailing)
table.insert(output_text, formatting.POJ.leading .. export.poj_display(poj_reading) .. formatting.POJ.trailing ..
formatting.TL.leading .. export.poj_to_tl_conv(poj_reading) .. formatting.TL.trailing)
if not find(poj_reading, "%-%-") then
local psdb_hash = export.poj_to_psdb_conv(poj_reading)
if not find(psdb_hash, "error") then
table.insert(output_text, formatting.PSDB.leading .. psdb_hash .. formatting.PSDB.trailing)
end
local IPA_readings = {}
for j, location_name in ipairs(locations) do
location_name = gsub(location_name, '^Taiwan%-?(?)$', { = 'Taipei', = 'Kaohsiung', = 'Taiwan' })
loc = {
= { 'Taipei', 'Kaohsiung' },
= { 'Xiamen-d' },
= { 'Xiamen', 'Quanzhou', 'Zhangzhou' }
}
location_name = loc or { gsub(location_name, '%-d$', '') }
for k, location in ipairs(location_name) do
local loc = gsub(location, '%-d$', '')
if IPA_available then
local poj_to_ipa = export.generate_IPA(poj_reading, location)
if IPA_readings then
table.insert(IPA_readings, location_link)
else
IPA_readings = { j + (k/10), { location_link } }
end
end
end
end
for reading, reading_info in pairs(IPA_readings) do
table.insert(output_text, formatting.IPA.leading .. table.concat(reading_info, ", ") ..
formatting.IPA.trailing .. reading)
end
end
end
end
]==]
end
function export.generate_IPA(text, location)
-- (Wyang) I can't seem to find an example where 'triple' is used.. The code is below:
--if match(p, "%(") then
-- p = gsub(p, "", "")
-- triple = true
--end
--if triple then
-- if tone == "一" then
-- ipa = (initial .. final .. "一至七 " .. initial .. final .. "一至七 " .. initial .. final .. (i == #tone and "一" or "一至七"))
-- elseif tone == "二" then
-- ipa = (initial .. final .. "二至一 " .. initial .. final .. "二至一 " .. initial .. final .. (i == #tone and "二" or "二至一"))
-- elseif tone == "三" then
-- ipa = (initial .. final .. "三至二 " .. initial .. final .. "三至二 " .. initial .. final .. (i == #tone and "三" or "三至二"))
-- elseif tone == "四A" then
-- ipa = (initial .. final .. "四至八 " .. initial .. final .. "四至八 " .. initial .. final .. (i == #tone and "四" or "四至八"))
-- elseif tone == "四B" then
-- final = gsub(final, "ʔ", "(ʔ)")
-- ipa = (initial .. final .. "四至二 " .. initial .. final .. "四至二 " .. initial .. final .. (i == #tone and "四" or "四至二"))
-- elseif tone == "五" then
-- if loc == "Quanzhou" or loc == "Taipei" then
-- ipa = (initial .. final .. "五 " .. initial .. final .. "五至三 " .. initial .. final .. (i == #tone and "五" or "五至三"))
-- else
-- ipa = (initial .. final .. "五 " .. initial .. final .. "五至七 " .. initial .. final .. (i == #tone and "五" or "五至七"))
-- end
-- elseif tone == "七" then
-- ipa = (initial .. final .. "七至一 " .. initial .. final .. "七至三 " .. initial .. final .. (i == #tone and "七" or "七至三"))
-- elseif tone == "八A" then
-- ipa = (initial .. final .. "八至四 " .. initial .. final .. "八至四 " .. initial .. final .. (i == #tone and "八" or "八至四"))
-- elseif tone == "八B" then
-- final = gsub(final, "ʔ", "(ʔ)")
-- ipa = (initial .. final .. "八至五 " .. initial .. final .. "八至三 " .. initial .. final .. (i == #tone and "八" or "八至三"))
-- end
--end
if type(text) == "table" then text, location = text.args, text.args end
local tone_from_mark = {
= "1",
= "2",
= "3",
= "4A", = "4A", = "4A",
= "4B",
= "5",
= "6",
= "7",
= "8A", = "8A", = "8A",
= "8B",
= "9",
= "9",
}
local initial_ipa = {
= "p", = "pʰ", = "m", = "b", = "f",
= "t", = "tʰ", = "n", = "l", = "d",
= "t͡s", = "t͡sʰ", = "d͡z", = "s", = "ʃ",
= "k", = "kʰ", = "ŋ", = "ɡ",
= "h", = "ɹ", = "w", = "j", = "",
= "z",
}
local final_ipa = {
= "a", = "aʔ", = "ãʔ",
= "ai", = "aiʔ", = "ãi", = "ãiʔ",
= "ak̚", = "am", = "an", = "ã",
= "aŋ", = "ap̚", = "at̚",
= "au", = "auʔ", = "ãuʔ", = "ãu",
= "e", = "ɛ", = "ɛʔ",
= "ɛk̚", = "ɛŋ",
= "eʔ", = "ẽʔ", = "ei", = "iɪk̚",
= "ɛm", = "ɛn", = "ẽ",
= "iɪŋ", = "ɵy", = "ə",
= "əʔ", = "əm", = "ən",
= "ət̚", = "ək̚", = "ɛt̚", = "ep̚",
= "eu", = "ẽu",
= "i", = "ia", = "iaʔ",
= "iãʔ", = "iak̚",
= "iam", = "iɛn", = "iã",
= "iaŋ", = "iap̚", = "iɛt̚",
= "iau", = "iauʔ", = "iãuʔ", = "iãu",
= "ie", = "iɛ", = "iɛ̃",
= "iʔ", = "ĩʔ",
= "im", = "in", = "ĩ", = "iŋ",
= "io", = "iua", = "iuã", = "ioʔ", = "iɔʔ", = "iɔ",
= "iop", = "iɔk̚", = "iɔ̃", = "iom", = "iɔŋ",
= "ip̚", = "ɯ", = "ɯʔ", = "ən", = "it̚",
= "iu", = "iua", = "iuʔ", = "iũ", = "iuã", = "iũʔ",
= "ie", = "iɛ", = "iɛ̃",
= "m̩", = "m̩ʔ",
= "ŋ̍", = "ŋ̍ʔ",
= "o", = "ɔ", = "ua", = "uaʔ", = "uãʔ", = "uai",
= "uaiʔ", = "uãiʔ", = "uãi", = "uan", = "uã",
= "uaŋ", = "uat̚", = "uak̚",
= "ue", = "ueʔ", = "uẽʔ", = "uẽ", = "uɛ",
= "oʔ", = "ɔʔ", = "ɔ̃ʔ", = "ɔi", = "ɔ̃i",
= "ɔk̚", = "ɔm", = "ɔ̃", = "ɔŋ", = "ɔp̚",
= "ɔt̚", = "ou",
= "u", = "ũ", = "uʔ", = "ũʔ",
= "ui", = "uĩ", = "uiʔ", = "uĩʔ",
= "ok̚", = "om",
= "un", = "oŋ", = "ut̚",
= "y", = "yn",
= "iai",
= "iei",
= "ɛ̃", = "ɛ̃", = "ɛ̃",
= "ɛ̃ʔ", = "ɛ̃ʔ",
= "ɤ", = "iɤ",
= "ɤʔ", = "iɤʔ",
= "ɤ", = "iɤ",
= "ɤʔ", = "iɤʔ",
= "uɛ",
= "uɛ̃",
= "uɛʔ",
= "ɛŋ", = "ɛk̚",
= "eŋ", = "ek̚",
= "eŋ", = "ek̚", = "ik̚",
= "ɔu", = "ɔ̃u",
= "eŋ", = "ek̚",
= "ɔ",
= "iɔ",
= "ɔʔ",
= "iɔʔ",
= "eu", = "ẽu",
= "eŋ", = "ek̚",
= "en", = "et̚",
= "em",
= "oŋ", = "ok̚",
= "ioŋ", = "iok̚",
= "õ", = "iõ",
= "uɛ̃",
= "z̩",
= "ɨ",
= "eŋ", = "ek̚",
}
local tone_sandhi = { }
-- (Wyang) I'm not sure about the 'Xd' ones, when tone X is followed by the diminutive 仔.
tone_sandhi = {
= "7", = "1", = "2", = "8A", = "2",
= "7", = "3", = "4A", = "3",
}
tone_sandhi = tone_sandhi
tone_sandhi = { -- 2 and 4 are special cases
= "7", = "10",
= "9", = "9", = "11", = "11",
}
tone_sandhi = {
= "1", = "5", = "2", = "8A", = "4B",
= "6", = "6", = "6", = "S", = "S",
}
tone_sandhi = {
= "1", = "5", = "2", = "8A", = "4B",
= "S1", = "S1", = "S1", = "S2", = "S2",
}
tone_sandhi = {
= "1", = "5", = "2", = "4A", = "4B",
= "6", = "6", = "6", = "S", = "S",
}
tone_sandhi = {
= "1", = "5", = "2", = "5", = "4B",
= "6", = "6", = "6", = "S", = "S",
}
tone_sandhi = {
= "7", = "1", = "S1", = "8B", = "8B",
= "7", = "3", = "S2", = "S2",
}
tone_sandhi = {
= "7", = "1", = "2", = "S", = "2",
= "7", = "3", = "3", = "3",
= "1", = "7",
}
tone_sandhi = {
= "7", = "1", = "2", = "8A", = "2",
= "7", = "3", = "4A", = "3",
}
tone_sandhi = {
= "7", = "1", = "2", = "4A", = "4B",
= "7", = "3", = "7", = "7",
}
tone_sandhi = {
= "1",
= "5", = "5", = "2", = "2",
}
tone_sandhi = {
= "7", = "1", = "2", = "8A", = "2",
= "3", = "3", = "4A", = "3", = "9",
= "1", = "1", = "7", = "7", = "7",
}
tone_sandhi = {
= "7", = "1", = "2", = "8A", = "2",
= "7", = "3", = "4A", = "3", = "9",
= "1", = "1", = "7", = "7", = "7",
}
tone_sandhi = {
= "1", = "1", = "2", = "8A", = "2",
= "3", = "3", = "3", = "S", = "S",
}
tone_sandhi = {
= "1", = "1", = "2", = "8A", = "2",
= "3", = "3", = "3", = "S", = "S",
}
tone_sandhi = {
= "7", = "S1", = "2", = "8A", = "2",
= "7", = "S2", = "S3", = "S3", = "9",
}
tone_sandhi = {
= "1", = "8A", = "S1", = "4A", = "S1",
= "S2", = "S2", = "S2", = "S3", = "S3", = "9",
}
tone_sandhi = {
= "7", = "S", = "2", = "8A", = "2",
= "7", = "3", = "4A", = "4B", = "9",
}
tone_sandhi = { -- 3 and 4B are special cases
= "7", = "5", = "8A",
= "3", = "3", = "4A", = "3"
}
tone_sandhi = { --Xiamen/Zhangzhou-like
= "7", = "5", = "2", = "8As", = "2",
= "3", = "3", = "3", = "3"
}
tone_sandhi = {
= "7", = "1", = "1", = "8A", = "8B",
= "7", = "6", = "3", = "4A", = "4B", = "9"
}
tone_sandhi = {
= "1", = "S2", = "S3", = "S3", = "S3",
= "S1", = "S1", = "8B", = "8B",
}
tone_sandhi = tone_sandhi
local tone_value = { }
tone_value = {
= "44", = "53", = "21", = "32", = "32",
= "24", = "22", = "4", = "4",
}
tone_value = tone_value
tone_value = {
= "44", = "31", = "112", = "32", = "32",
= "24", = "22", = "53", = "53",
= "11", = "42", = "1", = "4" --sandhi-only tones
}
tone_value = {
= "33", = "554", = "41", = "5", = "5",
= "24", = "22", = "41", = "24", = "24",
= "2", --sandhi-only
}
tone_value = {
= "33", = "554", = "41", = "5", = "5",
= "24", = "33", = "41", = "24", = "24",
= "22", = "2", --sandhi-only
}
tone_value = {
= "33", = "554", = "31", = "5", = "5",
= "24", = "22", = "31", = "23", = "23",
= "2", --sandhi-only
}
tone_value = {
= "33", = "54", = "21", = "4", = "4",
= "24", = "22", = "21", = "23", = "23",
= "2", --sandhi-only
}
tone_value = {
= "44", = "53", = "21", = "32", = "32",
= "24", = "22", = "24", = "4",
= "53", = "21", --sandhi-only
}
tone_value = {
= "44", = "53", = "21", = "32", = "32",
= "13", = "22", = "121", = "121",
= "5", --sandhi-only
}
tone_value = {
= "44", = "53", = "21", = "32", = "32",
= "24", = "22", = "3", = "3",
}
tone_value = {
= "55", = "53", = "11", = "32", = "32",
= "213", = "33", = "14", = "14",
}
tone_value = {
= "334", = "21", = "213", = "5", = "5",
= "11", = "53", = "55", = "32", = "32",
= "34", --sandhi-only
}
tone_value = {
= "44", = "53", = "11", = "32", = "32",
= "24", = "33", = "4", = "4", = "35"
}
tone_value = {
= "44", = "41", = "21", = "32", = "32",
= "23", = "33", = "4", = "4", = "35"
}
tone_value = {
= "33", = "51", = "11", = "31", = "31",
= "13", = "31", = "11", = "5", = "5",
= "1", --sandhi-only
}
tone_value = {
= "33", = "51", = "11", = "31", = "31",
= "13", = "31", = "11", = "5", = "5",
= "1", --sandhi-only
}
tone_value = {
= "44", = "53", = "21", = "32", = "32",
= "24", = "33", = "4", = "4", = "35",
= "55", = "11", = "1", --sandhi-only
}
tone_value = {
= "33", = "55", = "31", = "5", = "5",
= "24", = "33", = "31", = "35", = "35", = "35",
= "53", = "22", = "2", --sandhi-only
}
tone_value = {
= "44", = "53", = "21", = "2", = "2",
= "24", = "33", = "5", = "5", = "35",
= "55", --sandhi-only
}
tone_value = {
= "44", = "53", = "12", = "32", = "32",
= "24", = "22", = "54", = "54"
}
tone_value = { --Xiamen/Zhangzhou-like
= "44", = "42", = "21", = "32", = "32",
= "24", = "22", = "43", = "43", = "4"
}
tone_value = {
= "33", = "445", = "21", = "3", = "3",
= "23", = "55", = "21", = "4", = "4", = "5"
}
tone_value = {
= "33", = "53", = "31", = "53", = "53",
= "24", = "31", = "3", = "3",
= "22", = "34", = "54", --sandhi-only
}
tone_value = tone_value
-- Context-dependent tone sandhi: for the locations "Tong'an", "Kinmen" and
-- "Longyan", return the sandhi tone of `current` given the tone `post` of
-- the following syllable. Returns nil (falls off the end) when `post` is
-- nil, when the location is not one of the three, or when `current` is not
-- one of the tones handled for that location; the caller then falls back to
-- the tone_sandhi tables.
-- NOTE(review): patterns such as "^$", "^8$" and "^4$" have probably lost a
-- character class (e.g. a set of tones, or the A/B suffix of tones 4 and 8),
-- so the conditions below are not trustworthy as shown.
local function get_sandhi_from_post(location, current, post)
if post then
if location == "Tong'an" then
if current == "2" then
if find(post, "^$") or find(post, "^8$") then
return "7"
else
return "5"
end
elseif find(current, "^4$") then
if post == "2" then
return "10"
else
return "12"
end
end
elseif location == "Kinmen" then
if current == "3" or current == "4B" then
if find(post, "^$") or find(post, "^4$") then
return "1"
else
return "2"
end
end
elseif location == "Longyan" then
-- every Longyan rule distinguishes a following tone 2 or 5 from the rest
if current == "2" then
if post == "2" or post == "5" then
return "3"
else
return "2"
end
elseif current == "3" then
if post == "2" or post == "5" then
return "3"
else
return "2"
end
elseif find(current, "^4$") then
if post == "2" or post == "5" then
return current
else
return "S"
end
elseif current == "7" then
if post == "2" or post == "5" then
return "7"
else
return "1"
end
end
end
end
end
-- Derive the tone of a syllable remainder by substituting the matched part of
-- `text` with a value taken from tone_from_mark; the callback receives the
-- captured tone mark and coda.
-- NOTE(review): the pattern's character classes and the index into
-- tone_from_mark (presumably built from tone_symbol and coda) are missing.
local function get_tone(text)
local tone = gsub(text, "^+(?)*(?)ⁿ?", function(tone_symbol, coda)
return tone_from_mark end)
return tone
end
-- Add the nasalisation mark "ⁿ" to a final that follows a nasal initial.
-- Syllabic m/ng finals (with optional -h) are returned unchanged. A final
-- ending in o͘(h) loses its dot, one ending in ee(h) is respelt with a single
-- e, and a final ending in plain o(h) is rejected with an error.
local function nasalize(final)
	local syllabic_nasal = find(final, "^mh?$") or find(final, "^ngh?$")
	if syllabic_nasal then
		return final
	end

	local respelt = final
	if find(final, "o͘h?$") then
		respelt = gsub(final, "͘", "")
	elseif find(final, "oh?$") then
		error("Invalid POJ: nasal initial cannot go with -" .. final)
	elseif find(final, "eeh?$") then
		respelt = gsub(final, "ee", "e")
	end
	return respelt .. "ⁿ"
end
-- Main part of generate_IPA: normalise the input, split it into words and
-- syllables, derive initial / final / tone for each syllable, apply tone
-- sandhi, and wrap the result in an IPA <span>.
-- NOTE(review): throughout this section the per-syllable index expressions
-- (initial/final/tone/diminutive/sandhi indexed by syllable number, the
-- lookups into initial_ipa, final_ipa, tone_value and tone_sandhi) and many
-- pattern character classes have been lost; several lines are not valid Lua
-- as shown and must be restored from the upstream module.
local formatting = {
leading = "<span class=\"IPA\">/",
trailing = "/</span>"
}
-- replacement table for the final gsub, turning tone digits into superscripts
local tone_superscript = { = "¹", = "²", = "³", = "⁴", = "⁵", = "⁻" }
local word_result = {}
local attention = {}
-- strip the "-d" (dated) suffix from every location except 'Xiamen-d'
if location ~= 'Xiamen-d' then
location = gsub(location, '%-d$', '')
end
-- normalise separators: spaces join syllables, commas and "..." become word
-- breaks
text = gsub(text, " ", "-")
text = gsub(text, ",", "#")
text = gsub(text, "%-?%.%.%.%-?", "#")
text = gsub(text, "#$", "")
text = gsub(text, "#%-?", " ")
text = toNFD(lower(text))
for word in gsplit(text, " ", true) do
local initial, final, tone, diminutive, sandhi, result = {}, {}, {}, {}, {}, {}
local syllables = split(word, "-", true)
syllables.length = #syllables
for index, syllable in ipairs(syllables) do
-- the diminutive 仔 is read as "a" with a combining acute accent
if syllable == "仔" then
syllable = "a".."́"
diminutive = true
end
local original_syllable = syllable
syllable = gsub(syllable, "", "")
if not find(syllable, "") then
final = match(syllable, "^?h?h?(ngh?)$") or match(syllable, "^h?(mh?)$")
initial = syllable ~= final and sub(syllable, 1, len(syllable) - len(final)) or "" --original code: "ʔ"
else
initial = match(syllable, "^??h?")
final = sub(syllable, len(initial) + 1, -1)
end
tone = get_tone(sub(original_syllable, len(initial) + 1, -1))
local nasal_initial = match(initial, "^g?$")
if nasal_initial then
if find(final, "ⁿ") then
error("Too much nasality in POJ. " .. original_syllable .. " should be " .. gsub(original_syllable, "ⁿ", ""))
end
if location ~= "Penang" and location ~= "Philippines" and location ~= "Singapore" then --exception for Penang, Philippines and Singapore
final = nasalize(final)
end
end
if location == "Longyan" and find(final, "h$") then
final = gsub(final, "h", "")
end
local nasal_final = match(final, "^") or match(final, "ⁿ")
local not_nasal_initial = match(initial, "^$")
if ((nasal_initial and not nasal_final) or (not_nasal_initial and nasal_final)) and (location ~= "Penang" and location ~= "Philippines" and location ~= "Singapore") then --exception for Penang, Philippines and Singapore
error("POJ error: nasality of initial and final not synchronized.")
end
table.insert(attention, export.poj_check_syllable(initial, final, location))
initial = initial_ipa] or initial_ipa]
final = final_ipa] or final_ipa]
or error("Cannot recognise " .. final .. ".")
-- in non-final syllables the glottal stop is shown as optional
if index < syllables.length then
final = gsub(final, "ʔ", "(ʔ)")
end
end
for index = 1, syllables.length do
sandhi = tone_value]
-- positional sandhi first, then the diminutive-specific entry, then the
-- plain sandhi table
local sandhi_hash = get_sandhi_from_post(location, tone, tone)
or tone_sandhi..(diminutive and "d" or "")]
or tone_sandhi]
if index < syllables.length and tone_value ~= tone_value] then
sandhi = sandhi .. "-" .. tone_value
end
table.insert(result, initial .. final .. sandhi)
end
table.insert(word_result, table.concat(result, " "))
end
-- the extra parentheses drop gsub's second return value (the match count)
return (gsub(formatting.leading .. table.concat(word_result, " ") ..
formatting.trailing, "", tone_superscript)) .. table.concat(attention)
end
-- POJ initial → PSDB initial, used by poj_to_psdb_conv. Some values begin
-- with an apostrophe, which poj_to_psdb_conv removes or collapses in a
-- clean-up pass on the finished reading.
-- NOTE(review): all keys of this table are missing (bracketed part lost).
local psdb_initial = {
= "'p", = "ph", = "'b",
= "'d", = "'t",
= "'k", = "'q", = "'g",
= "c", = "z",
= "ch", = "zh",
= "s", = "s",
= "j",
= "l", = "'h",
= "m", = "n", = "ng",
= "'"
}
-- Convert a POJ final with a tone name appended (as built by
-- poj_to_psdb_conv) to its PSDB spelling.
-- The tone name is first turned into a digit; the final is then handled by
-- one of five cases: nasalised finals (ending in ⁿ, prefixed with "v"),
-- finals with a consonant ending, tone-4 stopped finals, tone-8 stopped
-- finals, and everything else (direct lookup in basic_psdb).
-- Returns nil when no branch yields a value; the caller substitutes "error"
-- in most places.
-- NOTE(review): the keys of basic_psdb, the keys of the tone-name table, the
-- index expressions `basic_psdb[...]` and most pattern character classes
-- have been lost, so the branch conditions cannot be verified from here.
local function psdb_final(text)
local basic_psdb = {
--single vowel tone 12357
= "af", = "ar", = "ax", = "aa", = "a",
= "y", = "ie", = "ix", = "ii", = "i",
= "w", = "uo", = "ux", = "uu", = "u",
= "ef", = "ea", = "ex", = "ee", = "e",
= "of", = "or", = "ox", = "oo", = "o",
= "oy", = "oir", = "oix", = "ooi", = "oi",
= "'ngf", = "'ngr", = "'ngx", = "'ngg", = "'ng",
= "'mf", = "'mr", = "'mx", = "'mm", = "'m",
--double vowel tone 12357
= "ay", = "ae", = "aix", = "aai", = "ai",
= "aw", = "ao", = "aux", = "aau", = "au",
= "iaf", = "iar", = "iax", = "iaa", = "ia",
= "iaw", = "iao", = "iaux", = "iaau", = "iau",
= "ioy", = "ioir", = "ioix", = "iooi", = "ioi",
= "iw", = "iuo", = "iux", = "iuu", = "iu",
= "oaf", = "oar", = "oax", = "oaa", = "oa",
= "oay", = "oae", = "oaix", = "oaai", = "oai",
= "oef", = "oea", = "oex", = "oee", = "oe",
= "uy", = "uie", = "uix", = "uii", = "ui",
--nasal vowel tone 12357
--nasal ending tone 12357
= "iefn", = "iern", = "iexn", = "ieen", = "ien",
= "iofng", = "iorng", = "ioxng", = "ioong", = "iong",
--stopped single vowel tone 48
= "ob", = "op",
= "od", = "ot",
= "og", = "ok",
--stopped double vowel tone 48
= "iob", = "iop",
= "iod", = "iot",
= "iog", = "iok",
}
-- tone name → tone digit
text = gsub(text, "", { = "1", = "2", = "3", = "4", = "5", = "6", = "7", = "8"})
if find(text, "ⁿ$") then
-- nasalised final: look up the plain final and prefix "v"
local basic = gsub(text, "ⁿ", "")
basic = gsub(basic, "^o()$", "oo%1")
if basic_psdb then
return "v" .. basic_psdb
end
elseif find(text, ".g?$") and not find(text, "^ian$") and not find(text, "^iong$") then
-- consonant ending: convert the vowel part and re-append the ending
local basic = gsub(text, "g?()$", "%1")
local ending = match(text, "(g?)$")
basic = gsub(basic, "^o()$", "oo%1")
if basic_psdb then
return basic_psdb .. ending
end
elseif find(text, "ⁿ?4$") and not find(text, "^i?o4$") then
-- tone 4: vowel part is looked up as tone 7, stop ending is voiced
-- (p/t/k/h → b/d/g/q)
local basic = gsub(text, "(ⁿ?)4$", "%1") .. "7"
local ending = match(text, "()ⁿ?4$")
ending = gsub(ending, "",{p = "b", t = "d", k = "g", h = "q"})
if find(basic, "ⁿ") then
basic = gsub(basic, "ⁿ", "")
basic = gsub(basic, "^o()$", "oo%1")
if basic_psdb then
return "v" .. basic_psdb .. ending
end
else
if basic_psdb then
return basic_psdb .. ending
end
end
elseif find(text, "ⁿ?8$") and not find(text, "^i?o8$") then
-- tone 8: as tone 4 but the stop ending is kept as is
local basic = gsub(text, "(ⁿ?)8$", "%1") .. "7"
local ending = match(text, "()ⁿ?8$")
if find(basic, "ⁿ") then
basic = gsub(basic, "ⁿ", "")
basic = gsub(basic, "^o()$", "oo%1")
if basic_psdb then
return "v" .. basic_psdb .. ending
end
else
if basic_psdb then
return basic_psdb .. ending
end
end
else
return basic_psdb
end
end
-- Convert POJ text to PSDB.
-- `text` is a string or a table with an `args` field. The lower-cased text
-- is split on "/" into readings, each reading on spaces into parts, and each
-- part on "-" into syllables. The first inner loop analyses each syllable
-- (diminutive 仔, parenthesised "triple" forms, neutral tone after "--",
-- tone, initial, final); the second builds the PSDB spelling using the tone
-- sandhi rules and psdb_initial / psdb_final, inserting "error" where a
-- lookup fails. Apostrophes are then cleaned up, and the readings are joined
-- with ", ".
-- NOTE(review): the per-syllable index expressions (readings, parts, p, ar,
-- triple, neutral, tone, tonesandhi, initial, final, psdb) and most pattern
-- character classes and table keys are missing, so many assignments here
-- overwrite whole tables as shown. The inner loops also reuse the name `i`,
-- shadowing the outer reading index. Restore from upstream before use.
function export.poj_to_psdb_conv(text)
if type(text) == "table" then text = text.args end
local readings = split(lower(text), "/", true)
for i = 1, #readings do
-- will ignore # boundary marker
local parts = split(gsub(readings, "#", ""), " ", true)
for j = 1, #parts do
local initial = {}
local final = {}
local psdb = {}
local tone = {}
local tonesandhi = {}
local neutral = {}
-- "--" introduces a neutral-tone syllable; mark it with a leading 0
parts = gsub(parts, "%-%-", "-0")
local p = split(parts, "-", true)
local ar = {}
local triple = {}
for i, item in ipairs(p) do
if find(item, "仔") then
item = gsub(item, "仔", "á")
ar = true
end
if find(item, "%(") then
item = gsub(item, "", "")
triple = true
end
if find(item, "^0") then
item = gsub(item, "0", "")
neutral = true
end
-- spell o͘ variants out as "oo" carrying the same tone mark
item = gsub(item, "ớ", "óo")
item = gsub(item, "ờ", "òo")
item = gsub(item, "ơ̂", "ôo")
item = gsub(item, "ơ̄", "ōo")
item = gsub(item, "ơ̍", "o̍o")
item = gsub(item, "ơ", "oo")
item = gsub(item, "͘", "o")
-- temporarily replace tone diacritics by placeholder characters
item = gsub(item, "",{ = "捌", = "伍", = "柒", = "叁"})
if find(item, "?") or find(item, "?g?") then
if find(item, "捌") then
tone = "八"
else
tone = "四"
end
elseif find(item, "") then
tone = "二"
elseif find(item, "") then
tone = "三"
elseif find(item, "") then
tone = "五"
elseif find(item, "") then
tone = "七"
else
tone = "一"
end
item = gsub(item, "",{ = "a", = "i", = "u", = "e", = "o", = "m", = "n", = "", = "a", = "i", = "u", = "e", = "o", = "n", = "", = "a", = "i", = "u", = "e", = "o", = "", = "a", = "i", = "u", = "e", = "o", = "", = ""})
-- split the toneless syllable into initial and final; bare m / ng
-- (optionally followed by h) count as finals with an empty initial
if sub(item,1,3) == "chh" then
initial = "chh"
final = sub(item,4,-1)
elseif sub(item,1,1) == "m" then
if sub(item,2,2) == "h" then
initial = ""
final = "mh"
elseif sub(item,2,2) == "" then
initial = ""
final = "m"
else
initial = "m"
final = sub(item,2,-1)
end
elseif sub(item,1,2) == "ng" then
if sub(item,3,3) == "h" then
initial = ""
final = "ngh"
elseif sub(item,3,3) == "" then
initial = ""
final = "ng"
else
initial = "ng"
final = sub(item,3,-1)
end
elseif find(item, "^h") then
initial = sub(item,1,2)
final = sub(item,3,-1)
elseif find(item, "^") then
initial = sub(item,1,1)
final = sub(item,2,-1)
else
initial = ""
final = item
end
-- ch/chh/s before i get an "i" appended to the initial
if find(initial, "^chh?$") or initial == "s" then
if find(final, "^i") then
initial = initial .. "i"
end
end
p = item
end
for i = 1, #p do
-- sandhi tone of each citation tone; `ar` (diminutive) alters tones 3 and 7
if tone == "一" then
tonesandhi = "七"
elseif tone == "二" then
tonesandhi = "一"
elseif tone == "三" then
tonesandhi = ar and "一" or "二"
elseif tone == "四" then
tonesandhi = "八"
elseif tone == "五" then
tonesandhi = "七"
elseif tone == "七" then
tonesandhi = ar and "七" or "三"
elseif tone == "八" then
tonesandhi = "四"
end
if triple then
-- triplicated syllable: three copies, the first with its own sandhi
-- for tones 5 and 7
local tonesandhi1 = nil
if tone == "五" then
tonesandhi1 = "五"
elseif tone == "七" then
tonesandhi1 = "一"
end
psdb = (psdb_initial] or "error")
..(psdb_final(final..(tonesandhi1 or tonesandhi)) or "error")
..psdb_initial]
..psdb_final(final..tonesandhi)
..psdb_initial]
..psdb_final(final..(i == #tone and tone or tonesandhi))
else
psdb = (psdb_initial] or "error")
..(psdb_final(final..(i == #tone and tone or tonesandhi)) or "error")
end
if neutral then
psdb = "~" .. (psdb_initial] or "error")
..(psdb_final(final.."七") or "error")
end --psdb = p
end
parts = table.concat(psdb, "")
end
readings = table.concat(parts, " ")
-- collapse and remove the apostrophes introduced by psdb_initial/psdb_final
readings = gsub(readings, "'+", "'")
readings = gsub(readings, "^'", "")
readings = gsub(readings, "()'", "%1")
readings = gsub(readings, "()'()", "%1%2")
readings = gsub(readings, "()'()", "%1%2")
readings = gsub(readings, "()'g", "%1g")
readings = gsub(readings, "()'h", "%1h")
end
return (gsub(table.concat(readings, ", "),'/()',' / %1'))
end
-- Check a Peng'im initial/final pair against tables of valid initials and
-- finals (a general table plus additional per-group tables). Returns a marker
-- string when the pair is not recognised, nil otherwise.
-- NOTE(review): every key in the tables below, the index expressions in the
-- final condition (presumably by `initial`, `final` and `loc`) and the
-- content of the returned string have been lost; only the table shapes
-- survive. Restore from the upstream module.
function export.pengim_check_syllable(initial, final, loc)
local validInitials = {
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1, = 1,
}
local validFinals = {
= 1, = 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1, = 1, = 1, = 1,
= 1, = 1,
}
local moreValidFinals = {
= {
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1,
},
= {
= 1, = 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1,
},
= {
= 1, = 1, = 1, = 1,
= 1, = 1,
= 1, = 1,
= 1, = 1,
},
= {
= 1, = 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1, = 1,
= 1, = 1,
},
= {
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1,
},
= {
= 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1, = 1, = 1, = 1,
= 1,
},
}
if not (validInitials and (validFinals or moreValidFinals)) then
return "]"
end
return nil
end
-- Lookup tables for the Peng'im → IPA conversion.
-- NOTE(review): the keys of every table in this section, and the index
-- expressions on the pengim_tone_value / pengim_tone_sandhi assignments, have
-- been lost. As shown, each `pengim_tone_value = …` and
-- `pengim_tone_sandhi = …` statement replaces the previous value instead of
-- filling a per-variety slot. Restore from upstream.

-- Multi-letter Peng'im sequences; pengim_to_ipa_conv applies these first.
local pengim_to_ipa_two_letters_above = {
= "ɡ", = "β", = "ŋ",
= "au",
}
-- Single-character replacements applied afterwards, grouped into initials,
-- vowels and tones.
local pengim_to_ipa_one_letter = {
--initials
= "m", = "n",
= "p", = "t", = "k",
= "pʰ", = "tʰ", = "kʰ",
= "s", = "h",
= "d͡z",
= "t͡s",
= "t͡sʰ",
= "l",
--vowels
= "a",
= "e",
= "ɯ",
= "i",
= "o",
= "u",
--tones
= "³³⁻²³",
= "⁵²⁻³⁵",
= "⁵²⁻²¹",
= "²¹³⁻⁵⁵",
= "²⁻⁴",
= "⁵⁵⁻¹¹",
= "³⁵⁻¹¹",
= "¹¹",
= "⁴⁻²",
}
-- Tone values; not referenced by the functions visible in this file.
local pengim_tone_value = {}
pengim_tone_value = {
= "33", = "53", = "213", = "2",
= "55", = "35", = "11", = "5"
}
pengim_tone_value = pengim_tone_value
pengim_tone_value = pengim_tone_value
pengim_tone_value = pengim_tone_value
pengim_tone_value = {
= "33", = "53", = "31", = "2",
= "55", = "313", = "11", = "5"
}
-- Tone sandhi values; likewise not referenced in the visible code.
local pengim_tone_sandhi = {}
pengim_tone_sandhi = {
= "23",
= "23", = "35",
= "31", = "53",
= "3", = "5",
= "11", = "21", = "", = "2"
}
pengim_tone_sandhi = {
= "23",
= "35", = "35",
= "55", = "55",
= "5", = "5",
= "11", = "21", = "", = "2"
}
pengim_tone_sandhi = pengim_tone_value
pengim_tone_sandhi = pengim_tone_value
pengim_tone_sandhi = {
= "23",
= "31", = "31",
= "55", = "55",
= "5", = "5",
= "11", = "33", = "33", = "2"
}
-- Pattern → replacement fix-ups applied by pengim_to_ipa_conv after the
-- letter-by-letter conversion; the replacements give unreleased stops and a
-- glottal stop followed by the captured text.
local pengim_to_ipa_fix = {
= "b",
+)"] = "p̚%1",
+)"] = "k̚%1",
+)"] = "ʔ%1",
}
-- Vowel → nasalised vowel, used for syllables matched by the nasal pattern in
-- pengim_to_ipa_conv.
local pengim_to_ipa_nasal = {
= "ã",
= "ẽ", -- ê
= "ɯ̃", -- e
= "ĩ",
= "õ",
= "ũ",
= "",
}
-- Convert Peng'im text to an IPA string delimited by "/".
-- Order of operations: multi-letter replacements, character-by-character
-- replacement (unknown characters are kept), pattern fix-ups, nasalisation
-- of matching stretches, removal of sandhi tone suffixes before "/" and ","
-- unless the suffix is "⁻²¹", and finally re-delimiting alternatives as
-- "/…/, /…/".
-- The pairs() loops visit their tables in unspecified order.
-- NOTE(review): several patterns have lost their character classes, and
-- `verbose_function` returns the whole table where it presumably indexed it
-- by `char` — confirm upstream.
function export.pengim_to_ipa_conv(text)
local result
for key, val in pairs(pengim_to_ipa_two_letters_above) do
text = gsub(text, key, val)
end
text = gsub(text, "() (+)2$", "%1 %22")
local function verbose_function(char) return pengim_to_ipa_one_letter or char end
-- This should work, but it doesn't convert the tone number in "diên1":
-- result = gsub(text, ".", pengim_to_ipa_one_letter)
result = gsub(text, ".", verbose_function)
-- temporary trailing "/" so the end of the string is handled like a
-- reading boundary; removed again below
result = result .. "/"
for key, val in pairs(pengim_to_ipa_fix) do
result = gsub(result, key, val)
end
result = gsub(result, "+nʔ?+", function (a)
return gsub(a, ".", pengim_to_ipa_nasal)
end)
result = gsub(result, "(⁻+)/", function(a) return (a ~= "⁻²¹" and "/" or a .. "/") end)
result = gsub(result, ",", "#")
result = gsub(result, "(⁻+)#", function(a) return (a ~= "⁻²¹" and "" or a) end)
result = gsub(result, "#", "")
result = gsub(result, "/$", "")
result = gsub(result, "/", "/, /")
return "/" .. result .. "/"
end
-- Format Peng'im text for display: put " / " between alternatives, wrap the
-- matched stretches in <sup> tags, and remove "#" markers. The parentheses
-- around the final gsub drop its second return value.
-- NOTE(review): the first two patterns have lost their character classes;
-- presumably the second matches tone digits — confirm upstream.
function export.pengim_display(text)
text = gsub(text, "()/", "%1 / ")
text = gsub(text, "+", "<sup>%0</sup>") -- note: originally + but it seems like websites have the final tone within parentheses, if at all
return (gsub(text, "#", ""))
end
-- Convert Peng'im text to a POJ-like romanisation.
-- "#" markers are dropped, the text is split on "/" into words and on spaces
-- into syllables. For each syllable the initial consonant is respelt, final
-- -b/-g become -p/-k, nasalisation is written as ⁿ, and the placeholder
-- character 符 is inserted after the letter that should carry the tone; the
-- placeholder and the trailing tone digit are then replaced by a combining
-- tone mark. The result is joined with " / " and normalised to NFC.
-- NOTE(review): the keys of tone_marks and cons_correspondences, the index
-- in get_tone_mark (presumably `num`), the write-backs into `syllables` and
-- `words`, and most pattern character classes are missing. The inner loop
-- also reuses the name `i`, shadowing the outer index. Restore from upstream.
function export.pengim_to_pojlike_conv(text)
-- kind of based on MTR (http://www.ispeakmin.com/bbs/viewthread.php?tid=2784)
text = gsub(text, "#", "")
local words = split(text, "/", true)
local tone_marks = {
= '',
= '́',
= '̀',
= '',
= '̂',
= '̆', -- this is a breve; MTR: breve; current hokkien dialect convention: hacek; missionary: tilde or breve??
= '̄',
= '̍'
}
local function get_tone_mark(a, num) return tone_marks .. a end
local function convert_final(x,c,t) -- convert final -g and -b (but not -ng)
if c=='b' then c='p'
elseif c=='g' then c='k' end
return x..c..t
end
local cons_correspondences = { ='b', ='g', ='p', ='t',
='k', ='ph', ='th', ='kh', ='ts', ='tsh',
='j' }
local function nasalization(n,h,t) return h..'ⁿ'..t end
for i, word in ipairs(words) do
local syllables = split(word, " ", true)
for i, syllable in ipairs(syllables) do
syllable = gsub(syllable, '^h?', cons_correspondences)
syllable = gsub(syllable, '()()(%d)', convert_final)
syllable = gsub(syllable, '', { ='e', ='ṳ' } )
syllable = gsub(syllable, 'ao', 'au' )
syllable = gsub(syllable, '(n)(h?)(%d)', nasalization)
-- choose where the tone mark goes by inserting the 符 placeholder
if find(syllable, 'uai') then
syllable = gsub(syllable, 'uai', 'ua符i')
elseif find(syllable, '') then
syllable = gsub(syllable, '()i', '%1符i') -- ?i
syllable = gsub(syllable, 'i()', 'i%1符') -- i?
syllable = gsub(syllable, '()()', '%1符%2') -- ?u
syllable = gsub(syllable, '()()', '%1%2符') -- u?
elseif find(syllable, '') or find(syllable, '^') then
syllable = gsub(syllable, '()', '%1符')
elseif find(syllable, 'ngh?%d') then
syllable = gsub(syllable, 'ng(h?)(%d)', 'n符g%1%2')
elseif find(syllable, 'h?%d') then
syllable = gsub(syllable, '()(h?)(%d)', '%1符%2%3')
end
-- replace the placeholder and the tone digit by the combining mark
syllable = gsub(syllable, '符(.*)(%d)', get_tone_mark)
syllables = syllable
end
words = table.concat(syllables, ' ')
end
return toNFC(table.concat(words, ' / '))
end
return export