local export = {}
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local baybayin_encode_module = "Module:tl-bay_sc"
local lang = require("Module:languages").getByCode("tl")
local sc_Tglg = require("Module:scripts").getByCode("Tglg")
local rfind = m_str_utils.find
local rmatch = m_str_utils.match
local rsubn = m_str_utils.gsub
local rsplit = m_str_utils.split
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local trim = mw.text.trim
local u = m_str_utils.char
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local uupper = m_str_utils.upper
-- Combining diacritics, as they appear in text after NFD decomposition.
local AC = u(0x0301) -- U+0301 combining acute accent
local GR = u(0x0300) -- U+0300 combining grave accent
local CFLEX = u(0x0302) -- U+0302 combining circumflex accent
local TILDE = u(0x0303) -- U+0303 combining tilde
local DIA = u(0x0308) -- U+0308 combining diaeresis
local MACRON = u(0x0304) -- U+0304 combining macron
local DOTOVER = u(0x0307) -- U+0307 combining dot above
local vowel = "aeëəiou" -- vowel
-- The names below are interpolated into Lua patterns throughout this module, so each must be a bracketed
-- character class. As empty strings they turned e.g. "(" .. V .. ")" into the position capture "()", which
-- matches everywhere.
-- NOTE(review): classes reconstructed from the otherwise-unused `vowel`, `accent`, `ipa_stress` and
-- `separator` strings; confirm against the canonical copy of this module.
local V = "[" .. vowel .. "]" -- any single vowel
local NV = "[^" .. vowel .. "]" -- any single non-vowel
local accent = AC .. GR .. CFLEX .. MACRON
local accent_c = "[" .. accent .. "]" -- any single accent mark
local ipa_stress = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress .. "]" -- any single IPA stress mark
local separator = accent .. ipa_stress .. "# .-"
-- The trailing "-" of `separator` lands at the end of the class, where Lua treats it as a literal hyphen.
local C = "[^" .. vowel .. separator .. "]" -- consonant
-- Wrapper around rsubn() that yields only the substituted string; the parentheses drop the
-- replacement count that rsubn() returns as its second value.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
-- Keep applying rsub(term, foo, bar) to its own output until a pass leaves the string unchanged,
-- then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- Decompose `text` to NFD, then put back the letters that must stay precomposed.
--
-- ñ/Ñ and ü/Ü are always recomposed. If `recompose_e_dia` is truthy, ë/Ë are recomposed as well.
-- All other base+diacritic sequences stay decomposed.
--
-- NOTE(review): the patterns and table keys were missing from this copy (the constructor did not parse,
-- and a bare "." pattern could never match a two-character key). They are reconstructed here from the
-- replacement values and the TILDE/DIA constants; confirm against the canonical copy of this module.
local function decompose(text, recompose_e_dia)
	-- decompose everything but ñ and ü
	text = toNFD(text)
	-- With a table as replacement, a match with no entry in the table is left unchanged, so only the
	-- four listed sequences are rewritten.
	text = rsub(text, ".[" .. TILDE .. DIA .. "]", {
		["n" .. TILDE] = "ñ",
		["N" .. TILDE] = "Ñ",
		["u" .. DIA] = "ü",
		["U" .. DIA] = "Ü",
	})
	if recompose_e_dia then
		text = rsub(text, ".[" .. DIA .. "]", {
			["e" .. DIA] = "ë",
			["E" .. DIA] = "Ë",
		})
	end
	return text
end
-- Restore the capitalization of `caps_map` onto `input`, tolerating syllable-break characters that
-- exist in `input` but not in `caps_map`.
--
-- `input` is lowercased, then walked byte by byte alongside `caps_map`. A syllable-break character
-- ("." or "7") in `input` with no counterpart at the same place in `caps_map` is skipped by advancing
-- the `syllbreak` offset. Otherwise, when the uppercase form of the current character equals the
-- `caps_map` character, it is uppercased in place.
--
-- NOTE(review): both rfind() patterns were empty strings in this copy. An empty pattern always
-- matches, so the skip branch was unreachable and `syllbreak_chars` was unused. The class is rebuilt
-- from `syllbreak_chars`; confirm against the canonical copy of this module.
-- NOTE(review): indexing is by byte (`#text`, `:sub`) while uupper() is Unicode-aware, so multi-byte
-- characters are never recapitalized by this loop -- verify that is intended.
local function fix_capitalization(input, caps_map)
	local syllbreak = 0
	local text = ulower(input)
	local syllbreak_chars = ".7"
	local syllbreak_class = "[" .. syllbreak_chars .. "]"
	for i = 1, #text do
		local text_current = text:sub(i, i)
		local caps_current = caps_map:sub(i - syllbreak, i - syllbreak)
		if rfind(text_current, syllbreak_class) and not rfind(caps_current, syllbreak_class) then
			-- Extra break in `text`: shift the read position in `caps_map` back by one.
			syllbreak = syllbreak + 1
		elseif uupper(text_current) == caps_current then
			text = table.concat({text:sub(1, i - 1), uupper(text_current), text:sub(i + 1)})
		end
	end
	return text
end
-- Strip accent marks from `str`: decompose it (keeping ñ, ü and ë precomposed), then replace every
-- "character followed by one `accent_c` match" with the character alone.
function export.remove_accents(str)
	local decomposed = decompose(str, "recompose e-dia")
	return rsub(decomposed, "(.)" .. accent_c, "%1")
end
-- Cleanup Baybayin inputs: each run matched by the outer pattern is transliterated with
-- lang:transliterate(..., sc_Tglg) and then post-processed, and the rewritten text is returned.
--
-- NOTE(review): the patterns "+", "()()" and "()" look like they lost bracketed character classes
-- in this copy. They are kept byte-for-byte here because the original classes cannot be recovered
-- from this file; restore them from the canonical copy of this module.
function export.decode_baybayin(text)
	return rsub(text, "+", function(baybayin)
		-- `result` must be local: it was previously assigned as a global, leaking state out of the
		-- callback.
		local result = lang:transliterate(baybayin, sc_Tglg)
		result = rsub(result, "()()", "%1-%2")
		-- Hyphens become "7", the glottal-stop marker used by the rest of the module.
		result = rsub(result, "%-", "7")
		result = rsub(result, "()", "%1" .. MACRON) -- No way to know stress in Baybayin. Disable for now.
		return result
	end)
end
-- "Align" syllabified respelling `syllab` to original spelling `spelling` by matching character-by-character, allowing
-- for extra syllable and accent markers in the syllabification and certain mismatches in the consonants. The goal is to
-- produce the appropriately syllabified version of the original spelling (the pagename) by matching characters in the
-- syllabified respelling to the original spelling, putting the syllable boundaries in the appropriate places in the
-- original spelling. As an example, given syllabified respelling 'a.ma.7ín' and original spelling 'amain', we would
-- like to produce 'a.ma.in'.
--
-- If we encounter an extra syllable marker (.), we allow and keep it. If we encounter an extra accent marker in the
-- syllabification, we drop it. We allow for mismatches in capitalization and for certain other mismatches, e.g. extra
-- glottal stops (written 7), h in respelling vs. g or j in the original, etc. If we can't match, we return nil
-- indicating the alignment failed.
-- Walks the syllabified respelling `syllab` (index i) and the original `spelling` (index j) in parallel,
-- copying spelling characters and "." syllable markers into `result`.
-- Returns the NFC-normalised aligned string, or nil (after an mw.log() message) when a character cannot
-- be matched or one side has characters left over.
--
-- NOTE(review): this copy appears to have lost every bracketed expression in the function: the keys of
-- `matching_chars` (e.g. `= {"v"},`) and the subscripts in `matches`, `syll_at`, `spell_at` and `uspell_at`
-- (presumably `matching_chars[uci]`, `syll_chars[pos]`, `spelling_chars[pos]`). As written the table
-- constructor does not parse. Restore these from the canonical copy of the module before relying on it.
function export.align_syllabification_to_spelling(syllab, spelling)
local result = {}
local function concat_result()
-- Postprocess to remove dots (syllable boundaries) next to hyphens.
-- The outer parentheses discard gsub's replacement count so only the string is returned.
return (toNFC(table.concat(result)):gsub("%.%-", "-"):gsub("%-%.", "-"))
end
-- Remove glottal stop (7) from respelling to simplify the code below, because it's never found in the original
-- spelling. (FIXME: We should do the same for diacritics, but they're currently removed earlier, in
-- syllabify_from_spelling(). We should probably get rid of the removal there and put it here.)
-- The length mark "ː" is removed as well before decomposing.
syllab = decompose(syllab:gsub("ː", ""), "recompose e-dia"):gsub("7", "")
spelling = decompose(spelling, "recompose e-dia")
-- The respelling is lowercased before splitting; the spelling keeps its case so it can be copied verbatim.
local syll_chars = rsplit(ulower(syllab), "")
local spelling_chars = rsplit(spelling, "")
-- i indexes syll_chars, j indexes spelling_chars.
local i = 1
local j = 1
local function matches(uci, ucj)
-- Return true if a syllabified respelling character (uci) matches the corresponding spelling char (ucj).
-- Both uci and ucj should be lowercase.
-- Sound is at the key, values are the letters sound can match
-- NOTE(review): the keys (the sounds) are missing from this copy; only the value lists survive.
local matching_chars = {
= {"v"},
= {"i"},
= {"a", "e", "o", "u"},
= {"g", "j", "x"},
= {"e", "y"},
= {"g"},
= {"c", "j"},
= {"u"},
= {"f"},
= {"j", "c", "x", "z"},
= {"o"},
= {"u", "o"},
= {"i"}
}
-- NOTE(review): presumably this looked up the list for `uci`; the subscript is missing here.
return uci == ucj or (matching_chars and m_table.contains(matching_chars, ucj) and true) or false
end
-- Spelling characters that may appear with no counterpart in the respelling.
local function silent_spelling_letter(ucj)
return ucj == "h" or ucj == "'" or ucj == "-"
end
-- Accessors intended to return "" instead of nil when `pos` is out of range, so the comparisons in the
-- main loop never see nil.
local function syll_at(pos)
return syll_chars or ""
end
local function spell_at(pos)
return spelling_chars or ""
end
-- Lowercased variant of spell_at().
local function uspell_at(pos)
local c = spelling_chars
return c and ulower(c) or ""
end
-- Each branch below consumes at least one character from one side or returns, so the loop terminates.
while i <= #syll_chars or j <= #spelling_chars do
local uci = syll_at(i)
local cj = spell_at(j)
local ucj = uspell_at(j)
if uci == "g" and syll_at(i - 1) == "n" and syll_at(i + 1) == "." and matches(syll_at(i + 2), ucj) and
not matches(syll_at(i + 2), uspell_at(j + 1)) then
-- As a special case, before checking whether the corresponding characters match, we have to skip an extra
-- g in an -ng- sequence in the syllabified respelling if the corresponding spelling character matches the
-- next respelling character (taking into account the syllable boundary). This is so that e.g.
-- syll='ba.rang.gay' matches spelling='barangay'. Otherwise we will match the first respelling g against
-- the spelling g and the second respelling g won't match. A similar case occurs with
-- syll='E.vang.he.lis.ta' and spelling='Evangelista'. But we need an extra condition to not do this hack
-- when syll='ba.rang.gay' matches spelling='baranggay'.
i = i + 1
elseif uci == "g" and ucj == "g" and uspell_at(j + 1) == TILDE then
-- Spelling has g + combining tilde where the respelling has plain g: copy both spelling characters.
table.insert(result, cj)
table.insert(result, uspell_at(j + 1))
i = i + 1
j = j + 2
elseif uci == "f" and ucj == "p" and uspell_at(j + 1) == "h" then
-- Respelling f against spelling "ph": copy both spelling characters.
table.insert(result, cj)
table.insert(result, uspell_at(j + 1))
i = i + 1
j = j + 2
elseif matches(uci, ucj) then
-- Ordinary match: copy the spelling character (original case) and advance both sides.
table.insert(result, cj)
i = i + 1
j = j + 1
elseif ucj == uspell_at(j - 1) and uci == "." and ucj ~= syll_at(i + 1) then
-- See below. We want to allow for a doubled letter in spelling that is pronounced single, and preserve the
-- doubled letter. But it's tricky in the presence of syllable boundaries on both sides of the doubled
-- letter as well as doubled letters pronounced double. Specifically, there are three possibilities,
-- exemplified by:
-- (1) syll='Mal.lig', spelling='Mallig' -> 'Mal.lig';
-- (2) syll='Ma.lig', spelling='Mallig' -> 'Ma.llig';
-- (3) syll='Wil.iam', spelling='William' -> 'Will.iam'.
-- If we copy the dot first, we get (1) and (2) right but not (3).
-- If we copy the double letter first, we get (2) and (3) right but not (1).
-- We choose to copy the dot first except in the situation exemplified by (3), where we copy the doubled
-- letter first. The condition above handles (3) (the doubled letter matches against a dot) while not
-- interfering with (1) (where the doubled letter also matches against a dot but the next letter in the
-- syllabification is the same as the doubled letter, because the doubled letter is pronounced double).
table.insert(result, cj)
j = j + 1
elseif silent_spelling_letter(ucj) and uci == "." and ucj ~= syll_at(i + 1) and
not rfind(uspell_at(j + 1), V) then
-- See below for silent h or apostrophe in spelling. This condition is parallel to the one directly above
-- for silent doubled letters in spelling and handles the case of syllab='Abduramán', spelling='Abdurahman',
-- which should be syllabified 'Ab.du.rah.man'. But we need a check to see that the next spelling character
-- isn't a vowel, because in that case we want the silent letter to go after the period, e.g.
-- syllab='Jumu7á', spelling='Jumu'ah' -> 'Ju.mu.'ah' (the 7 is removed above).
table.insert(result, cj)
j = j + 1
elseif uci == "." then
-- Extra syllable marker in the respelling: keep it.
table.insert(result, uci)
i = i + 1
elseif ucj == uspell_at(j - 1) then
-- A doubled letter in spelling that is pronounced single. Examples:
-- * syllab='Ma.líg', spelling='Mallig' -> 'Ma.llig' (with l)
-- * syllab='Lu.il.yér', spelling='Lhuillier' -> 'Lhu.ill.ier' (with l; a more complex example)
-- * syllab='a.sa.la.mu a.lai.kum', spelling='assalamu alaikum' -> 'as.sa.la.mu a.lai.kum' (with s)
-- * syllab='Jé.fer.son', spelling='Jefferson' -> 'Je.ffer.son' (with f)
-- * syllab='Je.ma', spelling='Gemma' -> 'Ge.mma' (with m)
-- * syllab='Ha.na', spelling='Hannah' -> 'Ha.nnah' (with n)
-- * syllab='A.by', spelling='Abby' -> 'A.bby' (with b)
-- * syllab='Ka.ba', spelling='Kaaba' -> 'Kaa.ba' (with a)
-- * syllab='Fu.ji', spelling='Fujii' -> 'Fu.jii' (with i)
table.insert(result, cj)
j = j + 1
elseif silent_spelling_letter(ucj) and not (ucj == "h" and rfind(uspell_at(j - 1), V) and rfind(uspell_at(j + 1), V)) then
-- A silent h, apostrophe or hyphen in spelling. Examples:
-- * syllab='adán', spelling='adhan' -> 'a.dhan'
-- * syllab='Atanasya', spelling='Athanasia' -> 'A.tha.nas.ia'
-- * syllab='Cýntiya', spelling='Cynthia' -> 'Cyn.thi.a'
-- * syllab='Ermóhenes', spelling='Hermogenes' -> 'Her.mo.ge.nes'
-- * syllab='Abduramán', spelling='Abdurahman' -> 'Ab.du.rah.man'
-- * syllab='Jumu7á', spelling='Jumu'ah' -> 'Ju.mu.'ah'
-- * syllab='pag7ibig', spelling='pag-ibig' -> 'pag-i.big'
-- An h with a vowel on both sides of it in the spelling is excluded from this branch.
table.insert(result, cj)
j = j + 1
elseif uci == AC or uci == GR or uci == CFLEX or uci == DIA or uci == TILDE or uci == MACRON or
uci == "y" or uci == "w" then
-- skip character: an accent mark, or a y/w in the respelling with no spelling counterpart
i = i + 1
else
-- non-matching character
mw.log(("Syllabification alignment mismatch for pagename '%s' (position %s, character %s), syllabified respelling '%s' (position %s, character %s), aligned result so far '%s'"
):format(spelling, j, ucj, syllab, i, uci, concat_result()))
return nil
end
end
-- NOTE(review): the loop above only exits normally when both indices are past the end, so this
-- condition looks unreachable -- confirm before removing.
if i <= #syll_chars or j <= #spelling_chars then
-- left-over characters on one side or the other
mw.log(("Syllabification alignment mismatch for pagename '%s' (%s), syllabified respelling '%s' (%s), aligned result so far '%s'"
):format(
spelling, j > #spelling_chars and "end of string" or ("position %s, character %s"):format(j, uspell_at(j)),
syllab, i > #syll_chars and "end of string" or ("position %s, character %s"):format(i, syll_at(i)),
concat_result()))
return nil
end
return concat_result()
end
-- Returns the result of matching `text` against a pattern; callers presumably use it as a truthy
-- "contains Baybayin" test.
-- NOTE(review): the pattern below is the empty string, which matches every input, so this always
-- returns "" (truthy). Presumably a bracketed character class covering the Baybayin characters was lost
-- from this copy; restore it from the canonical copy of the module.
function export.has_baybayin(text)
return text:match("")
end
-- Normalise whitespace: every run of whitespace becomes one space, then a single leading and a single
-- trailing space (all that can remain after collapsing) are dropped.
local function canon_spaces(text)
	local collapsed = rsub(text, "%s+", " ")
	local without_leading = rsub(collapsed, "^ ", "")
	return rsub(without_leading, " $", "")
end
-- Syllabify `text` (a spelling or respelling) by applying an ordered series of pattern rewrites and
-- return it with "." at syllable boundaries.
--
-- Outline, as far as the visible code shows:
--   1. Trim and canonicalise spaces, wrap decoded Baybayin runs in "<᜶ ... ᜶>", and decompose.
--   2. Lowercase, mark word boundaries with "#", and fold digraphs into single placeholder letters
--      (ng -> ŋ, ch -> ĉ, sh -> ʃ, ...), so that each consonant is a single character.
--   3. Insert "." between syllables with the rule cascade (the order of the rules matters).
--   4. Expand the placeholders again, turn boundaries into "|", strip "#", restore capitalization from
--      the pre-lowercase text, and re-insert hyphens found in `pagename`.
--   5. Re-encode Baybayin runs and convert "|" back to ".".
--
-- `pagename` is only used for the hyphen fix-up near the end.
--
-- Changes in this revision:
--   * `syllbreak` is now declared `local`; it was previously assigned as an accidental global.
--   * The local V/NV/C classes are rebuilt from the extended `vowel` string. They were empty strings,
--     which made e.g. "(" .. V .. ")" a position capture that matches everywhere.
--     NOTE(review): reconstruction -- confirm against the canonical copy of this module.
--   * The Baybayin re-encode step passed `baybayin:gsub("|", "/")` to transcribe() after the same
--     substitution had already been applied; the redundant second call is dropped.
--
-- NOTE(review): many literal patterns below still contain empty groups or bare quantifiers ("()", "(?)",
-- "(*)", "+") where a bracketed character class was evidently lost from this copy. They are kept
-- byte-for-byte because the classes cannot be recovered from this file; the function will not behave as
-- intended until they are restored.
function export.syllabify_from_spelling(text, pagename)
	-- Auto syllabifications start --
	-- ẃ and ý are temporary stand-ins for w/y (introduced and removed below) and count as vowels here.
	local vowel = vowel .. "ẃý" -- vowel
	local V = "[" .. vowel .. "]"
	local NV = "[^" .. vowel .. "]"
	local C = "[^" .. vowel .. separator .. "]" -- consonant
	text = trim(text)
	text = canon_spaces(text)
	text = rsub(text, "+", function(baybayin)
		return "<᜶" .. export.decode_baybayin(baybayin) .. "᜶>"
	end)
	text = decompose(text, "recompose e-dia")
	-- Keep the cased text so capitalization can be restored after the rewrites.
	local origtext = text
	-- NOTE(review): string.lower() is byte-based and leaves non-ASCII capitals (e.g. Ñ) untouched, unlike
	-- ulower() used elsewhere -- verify that is intended.
	text = string.lower(text)
	-- "․" (U+2024) marks word ends; it is turned into "." near the end of the function.
	text = rsub(text, " ", "․ ")
	text = rsub(text, "$", "․")
	-- put # at word beginning and end and double ## at text/foot boundary beginning/end
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"
	text = rsub_repeatedly(text, "(?)#(?)", "#")
	-- special word "mga"
	text = rsub(text, "#mga#", "#m.ga#")
	-- Fold digraphs into single placeholder characters; undone after the syllabification rules.
	text = rsub(text, "ng̃", "ŋ")
	text = rsub(text, "ng", "ŋ")
	text = rsub(text, "g̃", "ġ")
	text = rsub(text, "ch", "ĉ")
	text = rsub(text, "t_s", "ć")
	text = rsub(text, "sh", "ʃ")
	text = rsub(text, "gu()", "ǵ%1")
	text = rsub(text, "qu()", "ḱ%1")
	-- Single r becomes ɾ and double rr becomes r, so each is one character during syllabification.
	text = rsub(text, "r", "ɾ")
	text = rsub(text, "ɾɾ", "r")
	text = rsub(text, "ʔ", "7")
	-- double dot improvements
	text = rsub(text, "()%.y", "%1..y")
	text = rsub(text, "n%.k", "n..k")
	text = rsub_repeatedly(text, "#(" .. C .. "+)i()","#%1i.%2")
	text = rsub_repeatedly(text, "#(" .. C .. "+)u()","#%1u.%2")
	text = rsub_repeatedly(text, "(" .. C .. ")(%1)i()","%1%2.i%3")
	text = rsub_repeatedly(text, "(" .. C .. ")(%1)u()","%1%2.u%3")
	text = rsub_repeatedly(text, "(" .. C .. ")(" .. C .. ")i()","%1%2i.%3")
	text = rsub_repeatedly(text, "(" .. C .. ")(" .. C .. ")u()","%1%2u.%3")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)(" .. C .. ")u()","%1%2.u%3")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)(" .. C .. ")i()","%1%2.i%3")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)u()","%1.u%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)o()","%1.ó%2")
	text = rsub(text, "a(" .. accent_c .. "*)o()","a%1ó%2")
	-- eu rules
	text = rsub_repeatedly(text, "()()(" .. accent_c .. "?)()(" .. accent_c .. "?)","%1%2%3.%4%5")
	-- y/w are temporarily rewritten to ý/ẃ and selectively turned back by the rules that follow.
	text = rsub(text, "y(*)()","ý%1%2")
	text = rsub(text, "ý(*)()()","y%1%2%3")
	text = rsub(text, "ý(" .. V .. ")", "y%1")
	text = rsub(text, "w(?)()","ẃ%1%2")
	text = rsub(text, "ẃ(*)()()","w%1%2%3")
	text = rsub(text, "ẃ(" .. V .. ")","w%1")
	-- NOTE(review): the next two pairs of statements are textually identical in this copy; presumably
	-- they differed in the lost character classes.
	text = rsub(text, "(" .. V .. ")(" .. accent_c .. "?)ẃ()()" ,"%1%2w%3%4")
	text = rsub(text, "(" .. V .. ")(" .. accent_c .. "?)ẃ()()" ,"%1%2w%3%4")
	text = rsub(text, "(" .. V .. ")(" .. accent_c .. "?)ý()()" ,"%1%2y%3%4")
	text = rsub(text, "(" .. V .. ")(" .. accent_c .. "?)ý()()" ,"%1%2y%3%4")
	-- Core rule: a vowel followed by consonant+vowel splits before the consonant (V.CV).
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)(" .. C .. V .. ")", "%1.%2")
	-- "mb", "mp", "nd", "nk", "nt" combinations
	text = rsub_repeatedly(text, "(m)()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "(n)()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "(n)()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "(n)()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "(ŋ)()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "()()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "()()()(" .. V .. ")", "%1%2.%3%4")
	text = rsub_repeatedly(text, "()()(?)()(" .. V .. ")", "%1%2%3.%4%5")
	text = rsub_repeatedly(text, "(s)()()(" .. V .. ")", "%1%2.%3%4")
	-- VC.CV, then VC+.CCV for longer clusters.
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*" .. C .. ")(" .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*" .. C .. "+)(" .. C .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. C .. ")%.s()", "%1s.%2")
	-- Any aeëo, or stressed iu, should be syllabically divided from a following aeëo or stressed iu.
	text = rsub_repeatedly(text, "(" .. accent_c .. "*)()", "%1.%2")
	text = rsub_repeatedly(text, "(" .. accent_c .. "*)(" .. V .. accent_c .. ")", "%1.%2")
	text = rsub(text, "(" .. accent_c .. ")()", "%1.%2")
	text = rsub_repeatedly(text, "(" .. accent_c .. ")(" .. V .. accent_c .. ")", "%1.%2")
	text = rsub_repeatedly(text, "i(" .. accent_c .. "*)i", "i%1.i")
	text = rsub_repeatedly(text, "u(" .. accent_c .. "*)u", "u%1.u")
	-- Expand the placeholder letters back into their digraphs.
	text = rsub(text, "ĉ", "ch")
	text = rsub(text, "ć", "ts")
	text = rsub(text, "ŋ", "ng")
	text = rsub(text, "ʃ", "sh")
	text = rsub(text, "ǵ.()", "g.u%1")
	text = rsub(text, "ǵ", "gu")
	text = rsub(text, "ġ", "g̃")
	text = rsub(text, "ḱ.()", "q.u%1")
	text = rsub(text, "ḱ", "qu")
	-- Order matters: r (from rr) must be expanded before ɾ is turned back into r.
	text = rsub(text, "r", "rr")
	text = rsub(text, "ɾ", "r")
	text = rsub_repeatedly(text, "(+)", ".")
	text = rsub(text, "?-?", "-")
	text = rsub(text, "()", "|%1")
	text = rsub(text, "()", "|%1")
	text = rsub(text, "()+", "%1")
	-- remove # symbols at word and text boundaries
	text = rsub_repeatedly(text, "(?)#(?)", "")
	text = rsub(text, "․", ".")
	text = rsub(text, "ẃ", "w")
	text = rsub(text, "ý", "y")
	-- Fix Capitalization --
	text = fix_capitalization(text, origtext)
	-- Fix hyphens --
	-- FIXME!!! Why are we relying on looking at the pagename here? This should not be happening.
	origtext = pagename
	-- Only attempt the fix-up when text and pagename are identical once "|" and "-" are removed.
	if (table.concat(rsplit(origtext, "-")) == table.concat(rsplit(table.concat(rsplit(text, "|")), "-"))) then
		-- Number of "|" seen so far that have no counterpart in the pagename.
		local syllbreak = 0
		for i=1, #text do
			if text:sub(i,i) == "|" then
				if origtext:sub(i-syllbreak, i-syllbreak) == "-" then
					text = table.concat({text:sub(1, i-1), "-", text:sub(i+1)})
				else
					syllbreak = syllbreak + 1
				end
			end
		end
	end
	-- Reencode Baybayin
	text = rsub(text, "(+)", function(baybayin)
		-- Single-target assignment keeps only the string gsub returns, not the replacement count.
		baybayin = baybayin:gsub("|", "/"):gsub("7", "")
		local result = require(baybayin_encode_module).transcribe(baybayin, false, false, false)
		result = rsub(result, " ᜵ ", "|")
		return result
	end)
	-- FIXME! Hack -- up above we changed periods to vertical bars. The rest of the code expects periods so change
	-- them back. We should clean up the code above to leave the periods alone.
	return (text:gsub("|", "%."))
end
-- Syllabify `respelling` and align the result to `pagename`; when `pagename` is nil the respelling
-- itself is used as the spelling to align against. Returns whatever
-- export.align_syllabification_to_spelling() returns.
function export.syllabify_and_align(respelling, pagename)
	local spelling = pagename
	if spelling == nil then
		spelling = respelling
	end
	return export.align_syllabification_to_spelling(
		export.syllabify_from_spelling(respelling, spelling),
		spelling
	)
end
-- Assimilates nasal endings in prefixes.
-- Concatenates `text1` (the prefix) and `text2`, then rewrites the junction according to `assimilation`.
--   assimilation: nil, "none", "partial", or "total"; any other value raises an error.
--                 Only "partial" and "total" trigger rewrites.
-- While the rules run, "ng" is folded to "ŋ" and the prefix's m/n are rewritten to the placeholders
-- "ṃ"/"ṇ"; all three are converted back to plain letters before the result is returned.
-- NOTE(review): most patterns below contain empty groups or bare quantifiers ("(*)", "()", "(?)", a
-- leading "*") where a bracketed character class was evidently lost from this copy. The classes cannot be
-- recovered from this file; restore them from the canonical copy of the module.
local function nasal_adjust(text1, text2, assimilation)
local t1 = text1
local t2 = text2
if assimilation ~= nil and not m_table.contains({"none", "partial", "total"}, assimilation) then
error('Assimilation options can only be "none", "partial", or "total".')
end
-- Fold "ng" into the single character "ŋ" at the start of the root and in the prefix.
t2 = rsub(t2, "^ng", "ŋ")
t1 = rsub(t1, "ng(*)$", "ŋ%1")
-- Mark the prefix's m and n with placeholder letters so later rules can tell them apart.
t1 = rsub(t1, "m(*)$", "ṃ%1")
t1 = rsub(t1, "n(*)$", "ṇ%1")
local result = t1 .. t2
if assimilation == "partial" then
-- Three rewrites producing m, n and ŋ respectively (their match conditions are lost in this copy).
result = rsub(result, "(*)()", "m%1%2")
result = rsub(result, "(*)()", "n%1%2")
result = rsub(result, "(*)()", "ŋ%1%2")
elseif assimilation == "total" then
-- Rules using back-references (%1, %2, %3) to captured text, producing m, n or ŋ in the output.
result = rsub(result, "*()(" .. V .. ")%1(?)%2(" .. NV .. "+)(" .. V .. ")", "m%2m%3%2%4%5")
result = rsub(result, "*()(?)(" .. V .. ")%1%2%3(" .. NV .. "+)(" .. V .. ")", "m%2%3m%2%3%4%5")
result = rsub(result, "*()(" .. V .. ")%1(?)%2(" .. NV .. "+)(" .. V .. ")", "n%2n%3%2%4%5")
result = rsub(result, "*()(?)(" .. V .. ")%1%2%3(" .. NV .. "+)(" .. V .. ")", "n%2%3n%2%3%4%5")
result = rsub(result, "*()(" .. V .. ")r%2(" .. NV .. "+)(" .. V .. ")", "n%2n%2%3%4")
result = rsub(result, "*()(" .. V .. ")%1(?)%2(" .. NV .. "+)(" .. V .. ")", "ŋ%2ŋ%3%2%4%5")
result = rsub(result, "*()(?)(" .. V .. ")%1%2%3(" .. NV .. "+)(" .. V .. ")", "ŋ%2%3ŋ%2%3%4%5")
-- Remaining rules, replacing the match with a bare m, n or ŋ.
result = rsub(result, "*()", "m")
result = rsub(result, "*()", "n")
result = rsub(result, "*()", "ŋ")
result = rsub(result, "(*)()", "n%1%2")
end
-- Undo the placeholder letters.
result = rsub(result, "ŋ", "ng")
result = rsub(result, "ṃ", "m")
result = rsub(result, "ṇ", "n")
return result
end
-- Adjusts a root-initial "d" to "r" after an affix.
--
-- text1: the text placed before the root (only its final character is inspected).
-- text2: the root; returned unchanged unless it starts with "d".
-- d:     controls the change:
--   nil (default): the "d" becomes "r" when it stands between vowels (text1 ends in a vowel and the
--                  "d" is followed by a vowel), unless the next consonant of the root is "d", "l"
--                  or "r".
--   "between":     as the default, but regardless of the next consonant.
--   "d":           keep "d".
--   "r":           always change "d" to "r".
-- Returns the (possibly adjusted) root.
--
-- The next-consonant test previously used an empty pattern, which always matches, so the default mode
-- never changed anything. The class [dlr] is restored from the rule described above.
local function d_r_adjust_root(text1, text2, d)
	local t1 = text1
	local t2 = text2
	if not rmatch(t2, "^d") or d == "d" then
		return t2
	end
	-- Drop the vowels and take the second byte: the consonant after the initial "d" ("" if none).
	local next_consonant = string.sub(rsub(t2, V, ""), 2, 2)
	local d_valid_consonant_check = not rmatch(next_consonant, "[dlr]")
	if d_valid_consonant_check or d == "between" or d == "r" then
		local t1_end_vowel = rmatch(t1, V .. "$")
		local t2_d_before_vowel = rmatch(t2, "^d" .. V)
		if (d == "r") or (t1_end_vowel and t2_d_before_vowel) then
			t2 = rsub(t2, "^d", "r")
		end
	end
	return t2
end
-- Attaches prefix `affix` to `root` and returns the combined string.
-- The root is lowercased for processing; if it contained capitals, its original form is put back at the
-- end of the result. A hyphen is inserted between prefix and root when the first condition below holds.
-- NOTE(review): `options` is used bare in three places with three different roles (hyphen condition,
-- d/r mode, assimilation mode). Presumably each was a distinct `options[...]` lookup whose subscript was
-- lost from this copy. The patterns "^" and "+" also look like they lost a bracketed character class.
-- Restore from the canonical copy of the module.
local function add_prefix(root, affix, options)
local hyphen = ''
local result = ulower(root)
local root_vowel_start = rfind(result, "^(" .. V .. ")")
local affix_consonant_end = rmatch(affix, C .. "$")
if root_vowel_start then
-- Mark the vowel-initial root with a temporary glottal stop "ʔ" (removed again before returning).
result = "ʔ" .. result
-- If the root begins with the same character twice, a "ʔ" is put between the two as well.
result = rsub(result, "^ʔ(.)%1", "ʔ%1ʔ%1")
end
-- Hyphenate when `options` is set, when a consonant-final prefix meets a vowel-initial root, when the
-- root has capitals, or when the (garbled) start-of-root pattern does not match.
if options or (root_vowel_start and affix_consonant_end) or (ulower(root) ~= root) or (not rmatch(root, "^")) then
hyphen = '-'
end
affix = affix .. hyphen
result = d_r_adjust_root(affix, result, options)
result = nasal_adjust(affix, result, options)
if(ulower(root) ~= root) then
-- Put the original-case root back in place of its lowercased form at the end of the result.
-- NOTE(review): the lowercased root is used as a pattern without escaping magic characters.
result = rsub(result, ulower(root) .. "$", root)
end
result = rsub(result, "+", "-")
result = rsub(result, "ʔ", "")
return result
end
-- Inserts infix `affix` into `root` and returns the result.
-- The infix goes before the first vowel when `options` is set or the root starts with a vowel, "ng", "ts",
-- "ch" or "sh"; otherwise it goes after the first byte of the root.
-- NOTE(review): `options` is tested bare; presumably these were `options[...]` lookups whose subscripts
-- were lost from this copy. Restore from the canonical copy of the module.
-- NOTE(review): if `root` contains no vowel, `first_vowel_idx` is nil and the arithmetic below would
-- raise an error -- confirm whether callers guarantee a vowel.
local function add_infix(root, affix, options)
local result = root
local new_affix = affix
local first_vowel_idx = rfind(result, V)
-- Root beginnings after which the infix is placed before the first vowel rather than after one byte.
local special_start_conditions = {V, "ng", "ts", "ch", "sh"}
local has_special_start = false
for idx, starting in ipairs(special_start_conditions) do
has_special_start = has_special_start or rfind(result, "^" .. starting)
if has_special_start then break end
end
if options then
-- Hyphenate the infix: always after it, and also before it unless the root starts with the vowel.
if first_vowel_idx ~= 1 then
new_affix = "-" .. new_affix
end
new_affix = new_affix .. "-"
end
if options or has_special_start then
result = string.sub(result, 1, first_vowel_idx-1) .. new_affix .. string.sub(result, first_vowel_idx)
else
result = string.sub(result, 1, 1) .. new_affix .. string.sub(result, 2)
end
result = rsub(result, "ŋ", "ng")
return result
end
-- Attaches suffix `affix` to `root` and returns the result.
-- Special handling visible below:
--   * suffix "ng": a root-final "n" after a vowel is dropped before the suffix is added;
--   * suffixes "an", "in", "i": "h" is inserted after a vowel-final root unless `glottal` is truthy,
--     root-final "d" after a vowel may become "r", and a final e/o may be raised to i/u.
-- NOTE(review): `options` is used bare with several incompatible meanings (glottal flag, "d", "i", "u",
-- hyphen flag). Presumably each was a distinct `options[...]` lookup whose subscript was lost from this
-- copy. The patterns "$" and the empty "()" groups also look like they lost a character class. Restore
-- from the canonical copy of the module.
local function add_suffix(root, affix, options)
local result = root
local new_affix = affix
local tl_native_verb_suffixes = {
"an", "in", "i"
}
local glottal = options
-- Allow diacritics on input
result = decompose(result, true)
if rmatch(result, V .. "$") then
-- For a vowel-final root, anything other than an explicit false counts as true.
glottal = not (glottal == false)
end
result = rsub(result, "$", "")
result = toNFC(result)
-- Fold a final "ng" into "ŋ" so it is treated as one consonant; expanded again before returning.
result = rsub(result, "ng$", "ŋ")
if new_affix == "ng" then
result = rsub(result, "(" .. V .. ")n$", "%1")
elseif (m_table.contains(tl_native_verb_suffixes, new_affix)) then
if rmatch(result, V .. "$") and not glottal then
new_affix = "h" .. new_affix
end
if rmatch(result, V .. "d$") and
rmatch(new_affix, "^" .. V) and
options ~= "d" and
not options then
result = rsub(result, "(" .. V .. ")" .. "d$", "%1r")
end
-- "#" temporarily marks the start of the root for the vowel-raising patterns; removed below.
result = "#" .. result
if options == nil then
result = rsub(result, "()e(" .. C .. "?)$", "%1i%2")
elseif options == "i" then
result = rsub(result, "()ee(" .. C .. "?)$", "%1ii%2")
result = rsub(result, "()e(" .. C .. "?)$", "%1i%2")
end
if options == nil then
result = rsub(result, "()o(" .. C .. "?)$", "%1u%2")
elseif options == "u" then
result = rsub(result, "()oo(" .. C .. "?)$", "%1uu%2")
result = rsub(result, "()o(" .. C .. "?)$", "%1u%2")
end
result = rsub(result, "#", "")
end
if options then
new_affix = "-" .. new_affix
end
result = rsub(result, "ŋ", "ng")
result = result .. new_affix
return result
end
-- TODO
-- Prefix -- DONE
-- Consonant cluster cases
-- Infix -- DONE
-- Suffix + changing spellings -- DONE
-- Circumfix -- DONE
-- By word affixation
-- Hyphen addition
-- Nasal assimilation - DONE
-- Syllabify
-- Reduplication
-- Capitalization, hyphen on proper noun
-- Pronunciation doesn't match spelling of root, provide phonetic spellings
-- D/R change,
-- SY/DY/TS/CH/SH cases
-- /ng/ Cases
-- double o or uo
-- Metathesis (nl, w, y), iC-in-V pattern ipinasok -> inipasok, ihinanda -> inihanda, iinuwi -> iniuwi (vowels)
-- Diacritics (optional)
-- Syncope
-- Baybayin?
-- Analyze word what affix
-- options = {
-- = See nasal_adjust()
-- = Word to be affixed
-- = Force hyphen boolean
-- = Hyphenated words should be considered as one unit if false
-- = Keep consonant cluster infix
-- = TODO: Metathesis if syncope
-- = When adding suffix, add to prevent adding "h" at end of root ending with vowel
-- = Return syllabification data
-- = See d_r_adjust_root()
-- = When adding suffix, add to force ending "e" to become "i"
-- = When adding suffix, add to force ending "o" to become "u"
-- }
-- Applies one or more affixes to a word of `root` and returns the resulting text with accents removed.
--   root:    string; may contain several space-separated words, each of which may be hyphenated.
--   affix:   string; one or more space-separated affixes. The hyphens decide the type: "-x-" is an
--            infix, "-x" a suffix, "x-" a prefix. The hyphens are stripped before the affix is applied.
--   options: table or nil (nil becomes {}); any other type raises an error.
-- NOTE(review): this copy has lost its bracketed expressions: the keys of `affix_actions`, the
-- `options[...]` subscripts, and the element subscripts on `words`, `hyph_words` and `affixes` (presumably
-- indexed by the loop variables). As written the `affix_actions` constructor does not parse and `options`
-- is overwritten with a number. Restore from the canonical copy of the module.
function export.add_affix(root, affix, options)
assert(type(root) == "string", "Expected string for root")
assert(type(affix) == "string", "Expected string for affix")
if options == nil then
options = {}
elseif type(options) ~= "table" then
error("Options parameter must be a table or nil.")
end
-- Dispatch table from affix type to handler (keys missing in this copy).
local affix_actions = {
= add_prefix,
= add_infix,
= add_suffix
}
-- Running 1-based count of hyphen-separated parts seen so far, compared with the requested position.
local word_idx = 1
-- Default the target position to 1 unless a number >= 1 was supplied.
if tonumber(options) ~= nil and tonumber(options) >= 1 then
options = tonumber(options)
else
options = 1
end
local words = rsplit(canon_spaces(root), " ")
for i=1, #words do
local hyph_words = rsplit(words, "-")
if options == false then
-- Treat the whole hyphenated word as a single unit.
hyph_words = {words}
end
for j=1, #hyph_words do
if (word_idx == options) then
local affixes = rsplit(canon_spaces(affix), " ")
-- NOTE: this inner `i` shadows the outer loop variable for the duration of the loop.
for i=1, #affixes do
local affix_type = ""
local new_affix = affixes
local has_beginning_hyphen = rfind(new_affix, "^-")
local has_ending_hyphen = rfind(new_affix, "-$")
if has_beginning_hyphen and has_ending_hyphen then
affix_type = "infix"
elseif has_beginning_hyphen then
affix_type = "suffix"
elseif has_ending_hyphen then
affix_type = "prefix"
end
new_affix = rsub(new_affix, "^-", "")
new_affix = rsub(new_affix, "-$", "")
hyph_words = affix_actions(hyph_words, new_affix, options)
end
end
-- Remove any remaining glottal-stop markers "ʔ".
hyph_words = rsub(hyph_words, "ʔ", "")
word_idx = word_idx + 1
end
words = table.concat(hyph_words, '-')
end
words = table.concat(words, " ")
words = export.remove_accents(words)
if options then
-- Return the syllabified and aligned form instead of the plain text.
return export.syllabify_and_align(words)
end
return words
end
-- Reduplicates part of a word in `root` and returns the resulting text with accents removed.
--   root:      string of one or more space-separated words.
--   syllcount: "all" to repeat the whole word joined by a hyphen, or a number of leading syllables to
--              copy (nil means 1). Any other value raises an error.
--   options:   table or nil (nil becomes {}); any other type raises an error.
-- NOTE(review): this copy has lost its bracketed expressions: the `options[...]` subscripts and the
-- element subscripts on `words` and `rdp_syllabification` (presumably indexed by the loop variables), plus
-- a character class in the "()" pattern. As written `options` is overwritten with a number. Restore from
-- the canonical copy of the module.
function export.reduplicate(root, syllcount, options)
local syllable_count = syllcount
if options == nil then
options = {}
elseif type(options) ~= "table" then
error("Options parameter must be a table or nil.")
end
-- Running 1-based count of words seen so far, compared with the requested position.
local word_idx = 1
-- Default the target position to 1 unless a number >= 1 was supplied.
if tonumber(options) ~= nil and tonumber(options) >= 1 then
options = tonumber(options)
else
options = 1
end
if syllable_count == nil then
syllable_count = 1
end
local words = rsplit(canon_spaces(root), " ")
for i=1, #words do
if (word_idx == options) then
if syllcount == "all" then
-- Full reduplication: word-word.
words = words .. "-" .. words
elseif tonumber(syllable_count) ~= nil then
syllable_count = tonumber(syllable_count)
local rdp_word = words
-- Put a "." between adjacent vowels before syllabifying, then split the result into syllables.
local rdp_syllabification = export.syllabify_and_align(rsub(rdp_word, "(" .. V .. ")" .. "(" .. V .. ")", "%1.%2"))
rdp_syllabification = rsub(rdp_syllabification, "()", "-.%1")
rdp_syllabification = rsplit(rdp_syllabification, "%.")
-- Accumulates the copied syllables that will be placed in front of the word.
local rdp_add = ""
-- NOTE(review): debug logging of the syllable list; looks like a leftover.
mw.logObject(rdp_syllabification)
for j=1, #rdp_syllabification do
local rdp_syll = rdp_syllabification
if j > syllable_count then
break
elseif j == syllable_count then
-- The last copied syllable is trimmed, unless it is also the word's last syllable.
if syllable_count ~= #rdp_syllabification then
if rdp_syll:sub(-1) ~= "-" or syllable_count == 1 then
-- Cut everything after the first vowel.
rdp_syll = rsub(rdp_syll, "(".. V .. ").*", "%1")
end
rdp_syll = rsub(rdp_syll, "-$", "")
if options ~= true then
-- Reduce an initial consonant cluster to its first byte, except for "ng", "ts", "ch" and
-- clusters containing "y"; "ch" is rewritten as "ts".
rdp_syll = rsub(rdp_syll, "(" .. C .. "+)(".. V .. ")",
function(consonants, vowel)
if not m_table.contains({"ng", "ts", "ch"}, consonants)
and not rmatch(consonants, "y") then
consonants = consonants:sub(1,1)
elseif consonants == "ch" then
consonants = "ts"
end
return consonants .. vowel
end
)
end
end
end
rdp_add = rdp_add .. rdp_syll
end
-- Join with a hyphen when requested, when more than one syllable is copied, or when the word has
-- capitals.
if options or syllable_count > 1 or rdp_word ~= ulower(rdp_word) then
rdp_add = rdp_add .. "-"
end
if syllable_count == 1 then
rdp_add = ulower(rdp_add)
end
rdp_word = d_r_adjust_root(rdp_add, rdp_word, options)
words = rdp_add .. rdp_word
else
error('Syllable count must be numerical or "all".')
end
end
word_idx = word_idx + 1
end
words = table.concat(words, " ")
words = export.remove_accents(words)
return words
end
-- Pang-angkop/Linker: appends the linker to `text`, either as the suffix "-ng" (via export.add_affix)
-- or as the separate word " na", and returns the result.
--   text:          string; anything else raises an error.
--   consider_case: when truthy and the last character of `text` is uppercase, the linker is uppercased
--                  too and the suffix form is still allowed.
-- NOTE(review): the pattern "$" below matches every string, so the " na" branch is only reached when the
-- last letter is uppercase and `consider_case` is falsy. Presumably a bracketed character class for the
-- endings that take "-ng" was lost from this copy; restore it from the canonical copy of the module.
-- NOTE(review): for an empty `text`, `last_letter` is nil and is then passed to ulower() -- presumably an
-- error; verify.
function export.add_linker(text, consider_case)
if text == nil or type(text) ~= "string" then
error("There should be an input parameter.")
end
local orig_text = export.remove_accents(text)
local input_text = ulower(orig_text)
-- Taken from the raw `text`, not from the accent-stripped copy.
local last_letter = rmatch(text, "(.)$")
local is_last_caps = ulower(last_letter) ~= last_letter
if(rfind(input_text, "$")) and (not is_last_caps or consider_case) then
input_text = export.add_affix(input_text, "-ng")
else
input_text = input_text .. " na"
end
-- Fix capitalization with what already exists
-- The linker is whatever follows the first #orig_text bytes of the lowercased result.
local linker = input_text:sub(#orig_text + 1)
input_text = orig_text .. ((is_last_caps and consider_case) and uupper(linker) or linker)
return input_text
end
-- Ad-hoc smoke test: logs the results of several export.reduplicate() calls with mw.logObject().
-- Returns nothing.
-- NOTE(review): the options table passed for "drama" has lost its key (`= true`), so this function does
-- not parse in this copy. Restore the key from the canonical copy of the module.
function export.testing()
mw.logObject({
export.reduplicate("ikot", 1),
export.reduplicate("ngiti", 1),
export.reduplicate("bundok", 1),
export.reduplicate("drama", 1, {
= true
}),
export.reduplicate("dating", 1),
export.reduplicate("kain", 1),
export.reduplicate("Tagalog", 1),
export.reduplicate("sip-unin", 1),
export.reduplicate("ilaw-trapiko", 3),
})
end
return export