User:Arafsymudwr/Sandbox

Module:cy-IPA

local export = {}

local lang = require("Module:languages").getByCode("cy")

local m_IPA = require("Module:IPA") local m_a = require("Module:accent qualifier") local m_table = require("Module:table")

local parse_utilities_module = "Module:parse utilities" local patut_module = "Module:pattern utilities"

local listToSet = require("Module:table").listToSet

--[=[
FIXME:

1. Some words in ng have /ŋ/ and others have /ŋg/; Wiktionary already sorts these separately.
2. Consonant clusters assimilate by losing voice, not by regressive or progressive assimilation.
3. Some common words in y and u are /i/ or /ɪ/ in North Wales.
4. North Wales colloquial: unstressed /ɛ/ as /a/.
5. South Wales colloquial: lots of monophthongisation.
]=]

-- Short local aliases for the Scribunto string/text library functions used throughout this module.
local usub = mw.ustring.sub
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local rsubn = mw.ustring.gsub
local ulower = mw.ustring.lower
local u = mw.ustring.char
local ugcodepoint = mw.ustring.gcodepoint

-- Dialect codes handled by this module, and the display name for each code.
export.dialects = {"NW", "SW"}
export.dialects_to_names = {
	NW = "North Wales",
	SW = "South Wales",
}

-- Letter inventories (suffix `_l`) used to build Lua pattern character classes.
local written_unstressed_vowel_l = "aeiouwyAEIOUWY"
local written_long_vowel_l = "àáâäèéêëìíîïòóôöùúûẁẃŵẅüỳýŷÿÀÁÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜẀẂŴẄỲÝŶŸ"
local written_stressed_not_long_vowel_l = "àèìòùẁỳÀÈÌÒÙẀỲ"
local written_stressed_vowel_l = written_long_vowel_l .. written_stressed_not_long_vowel_l
local ipa_vowel_l = "ɪɨ̞ʊɛəɔ"
local written_vowel_l = written_unstressed_vowel_l .. written_stressed_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Character class matching any single vowel (written or IPA). The bracketed expression had been stripped from the
-- page text, leaving an empty string.
local V = "[" .. vowel_l .. "]"
-- Maps each accented vowel letter to its unaccented base letter.
-- NOTE(review): the keys were stripped from the page text; they are reconstructed here as grave, acute, circumflex
-- and diaeresis for each base letter (four entries per letter, as in the damaged original) -- confirm.
local written_stressed_to_plain_vowel = {
	["à"] = "a", ["á"] = "a", ["â"] = "a", ["ä"] = "a",
	["è"] = "e", ["é"] = "e", ["ê"] = "e", ["ë"] = "e",
	["ì"] = "i", ["í"] = "i", ["î"] = "i", ["ï"] = "i",
	["ò"] = "o", ["ó"] = "o", ["ô"] = "o", ["ö"] = "o",
	["ù"] = "u", ["ú"] = "u", ["û"] = "u", ["ü"] = "u",
	["ẁ"] = "w", ["ẃ"] = "w", ["ŵ"] = "w", ["ẅ"] = "w",
	["ỳ"] = "y", ["ý"] = "y", ["ŷ"] = "y", ["ÿ"] = "y",
	["À"] = "A", ["Á"] = "A", ["Â"] = "A", ["Ä"] = "A",
	["È"] = "E", ["É"] = "E", ["Ê"] = "E", ["Ë"] = "E",
	["Ì"] = "I", ["Í"] = "I", ["Î"] = "I", ["Ï"] = "I",
	["Ò"] = "O", ["Ó"] = "O", ["Ô"] = "O", ["Ö"] = "O",
	["Ù"] = "U", ["Ú"] = "U", ["Û"] = "U", ["Ü"] = "U",
	["Ẁ"] = "W", ["Ẃ"] = "W", ["Ŵ"] = "W", ["Ẅ"] = "W",
	["Ỳ"] = "Y", ["Ý"] = "Y", ["Ŷ"] = "Y", ["Ÿ"] = "Y",
}

-- Table of IPA outputs, partly grouped into nested sub-tables; some values are lists of several IPA segments.
-- NOTE(review): every key of this table (outer and nested) is missing -- each entry starts with a bare `=` -- so the
-- constructor is not valid Lua as it stands. The keys look like they were bracketed expressions stripped when the
-- page text was extracted; they cannot be recovered from this file. Restore from the page's wikitext source.
-- NOTE(review): the paragraph below is collapsed onto one line, so everything after the first `--` is read as a
-- comment, including the closing brace.
local sequences = { = { = "a"; = "a"; = "a"; = "a"; = "a"; = "ɑːɨ̯"; = "ai̯"; = "aɨ̯"; = "ɑːu̯"; }; = { = "k"; = "χ"; }; = { = "d"; = "ð"; }; = { = "ɛ"; = "ɛ"; = "eː"; = "eː"; = "e"; = "ɛi̯"; = "əɨ̯"; = "eːu̯"; = "aɨ̯"; }; = { = "v"; = "f"; }; = "ɡ"; = "h"; = { = "ɪ"; = "ɪ"; = "iː"; = "i"; = "iː"; = "ɛi̯"; }; = "d͡ʒ"; = { = "k"; = "k"; }; = { = "l"; = "ɬ"; }; = { = "m"; = "m̥"; }; = { = "n"; = "n̥"; = "ŋ"; = "n"; }; = { = "oː"; = { "ɔ", "s" }; = "ɔ"; }; = { -- XXX: sometimes /øː/ = "œ"; = { "œ", "s" }; }; = { = "f"; = "p"; = "p"; }; = { = { "k", "f" }; = "k"; -- XXX }; = { -- XXX: /ʀ/? /r/?; sometimes /ɐ/ ("Uhr"); also /ər/ ("oder") = "r"; = "r"; }; = { = "s"; = "ʃ"; = { "ʃ", "p" }; = "s"; = { "ʃ", "t" }; }; = { = "t"; = "t͡ʃ"; = "t"; = { "t͡s", "i̯", "o", "n" }; }; = { = "ʊ"; = { "ʊ", "x" }; }; = { = "yː"; = "yː"; }; = "f"; = "ʋ"; = { "k", "s" }; -- XXX = "i"; = "z"; -- already converted from s = "s"; = "ˈ"; -- FIXME = {}; }

-- Combining diacritics, built from their Unicode code points.
local AC = u(0x0301)        -- combining acute accent
local GR = u(0x0300)        -- combining grave accent
local CFLEX = u(0x0302)     -- combining circumflex
local DOTOVER = u(0x0307)   -- combining dot above
local DIA = u(0x0308)       -- combining diaeresis
local LINEUNDER = u(0x0331) -- combining line below

-- Separator inventories (suffix `_l`) and the Lua pattern character classes built from them (suffix `_c`).
-- The `_c` definitions and `C` had been reduced to empty strings because their bracketed contents were stripped from
-- the page text; each class is rebuilt here from the `_l` string of the same name.
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
-- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_l = "%-."
local sylsep_c = "[" .. sylsep_l .. "]"
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- Everything that is NOT a consonant: vowels plus all separators.
local neg_guts_of_cons = vowel_l .. separator_l
-- Consonant class including h: the negation of `neg_guts_of_cons`.
local C = "[^" .. neg_guts_of_cons .. "]"

-- Accented letters exported as mid-vowel hints, and the pattern character class matching any one of them.
-- The class had been reduced to an empty string because its bracketed contents were stripped from the page text.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"

-- Single-character placeholders taken from the U+FFF1..U+FFF3 range.
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not
-- currently used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents

-- Set of consonant sequences accepted as a syllable onset.
local valid_onsets = listToSet({
	"b", "bl", "br",
	"c", "cl", "cr", "ç",
	"d", "dj", "dr",
	"f", "fl", "fr",
	"g", "gl", "gr", "gu", "gü",
	"h",
	"i",
	"j",
	"k", "kl", "kr",
	"l", "ll",
	"m",
	"n", "ny", "ñ",
	"p", "pl", "pr",
	"qu", "qü",
	"r", "rr",
	"s", "ss",
	"t", "tg", "tj", "tr", "tx", "tz",
	"u",
	"v", "vl", "vr",
	"w",
	"x",
	"ʃ", -- e.g. 'χruʃóf' respelling of Khrusxov
	"χ", -- in case of respelling
	"y",
	"z",
})

-- Maps each precomposed dot-above letter to its base letter followed by the combining DOTOVER.
-- The keys had been stripped from the page text; they are restored as the precomposed letter matching each value.
local decompose_dotover = {
	-- No composed i, u or U with DOTOVER.
	["ȧ"] = "a" .. DOTOVER,
	["ė"] = "e" .. DOTOVER,
	["ȯ"] = "o" .. DOTOVER,
	["ẏ"] = "y" .. DOTOVER,
	["Ȧ"] = "A" .. DOTOVER,
	["Ė"] = "E" .. DOTOVER,
	["İ"] = "I" .. DOTOVER,
	["Ȯ"] = "O" .. DOTOVER,
	["Ẏ"] = "Y" .. DOTOVER,
}

-- Set of words treated as unstressed (clitics, possessives, prepositions, articles, conjunctions).
local unstressed_words = listToSet({
	-- proclitic object pronouns
	"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
	-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. tant me fa
	"me", "te", "se", "lo", "los", "nos", "vos", "ne",
	-- contracted object pronouns and articles attached with apostrophe so no need to include
	-- unstressed possessives
	"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
	-- prepositions
	"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
	-- prepositional contractions
	"al", "als", "del", "dels", "pel", "pels",
	-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
	-- personal articles
	"na", -- 'en' already included above
	-- indefinite articles
	"un", "uns",
	-- salat articles
	"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
	-- conjunctions
	"i", "o", "si", "ni", "que",
})

-- Wrapper around rsubn() that returns only the substituted string and drops the substitution count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the multiple return values to the first one.
	return (rsubn(term, foo, bar))
end

-- Wrapper around rsubn() whose second return value is a boolean: true if at least one substitution was made.
local function rsubb(term, foo, bar)
	local result, count = rsubn(term, foo, bar)
	local changed = count > 0
	return result, changed
end

-- Keep applying rsub() with the same pattern and replacement until the text stops changing; return the result.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- Return a list with one entry per code point of `text`, each entry being that code point as a string.
local function split_into_chars(text)
	local result = {}
	for cp in ugcodepoint(text) do
		result[#result + 1] = u(cp)
	end
	return result
end

-- Split `term` on commas. A plain split is used unless the text contains a comma followed by whitespace or a
-- backslash, in which case the splitting is delegated to split_on_comma() in the parse utilities module.
local function split_on_comma(term)
	local needs_parser = term:find(",%s") or term:find("\\")
	if not needs_parser then
		return rsplit(term, ",")
	end
	return require(parse_utilities_module).split_on_comma(term)
end

-- Concatenate all keys of `tab` into one string. The order follows pairs() and is therefore unspecified.
local function concat_keys(tab)
	local keys = {}
	for key in pairs(tab) do
		keys[#keys + 1] = key
	end
	return table.concat(keys)
end

-- Mark unstressed words in a list of word objects and apply the special handling of "amb" and "per".
--
-- `words` is a list of objects with fields `term` and `pos`; it is deep-copied, so the caller's list is not
-- modified. Returns the new list, in which:
--   * every term is lowercased;
--   * every term found in `unstressed_words` has DOTOVER inserted after its last vowel;
--   * "amb" directly before a word beginning with a vowel (optionally preceded by h) is joined to that word with a
--     tie character;
--   * any remaining "amb" and "per" are respelled.
--
-- Repairs relative to the damaged text: the `[i]` / `[i + 1]` index expressions, the `unstressed_words[...]` set
-- lookup and the keys of the replacement table had been stripped, and `saw_amb_to_join` started out as `true`, which
-- made the flag meaningless.
local function handle_unstressed_words(words)
	words = m_table.deepcopy(words)

	-- Lowercase all words for ease in further processing.
	for _, wordobj in ipairs(words) do
		wordobj.term = ulower(wordobj.term)
	end

	-- Check if the word at index `i` in `wordlist` is "amb" (already marked with DOTOVER) and the following word
	-- begins with a vowel, possibly after h-.
	local function is_amb_to_join(wordlist, i)
		return i < #wordlist and wordlist[i].term == "a" .. DOTOVER .. "mb" and
			rfind(wordlist[i + 1].term, "^h?" .. V)
	end
	local saw_amb_to_join = false

	-- Mark all unstressed words with DOTOVER, so that split_syllables() doesn't assign stress. We need to do this
	-- before special handling for amb, because amb may join to another unstressed word like el, in the process
	-- losing the identity of the two words. In the process, see if amb occurs before a following vowel-initial word
	-- (which may begin with h-).
	for i, wordobj in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of que). It doesn't actually matter where we put it,
		-- because split_syllables() just looks for DOTOVER anywhere in the word.
		if unstressed_words[wordobj.term] then
			wordobj.term = rsub(wordobj.term, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end

	-- Join amb before vowel-initial word with following word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				table.insert(new_words, {term = words[i].term .. "‿" .. words[i + 1].term, pos = words[i].pos})
				i = i + 2
			else
				table.insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end

	-- Finally, rewrite some unstressed words to get the right pronunciation: any remaining amb (not before a
	-- vowel-initial word) is respelled without its final b, and per is respelled with rr so that its <r> is always
	-- pronounced.
	local unstressed_word_replacement = {
		["a" .. DOTOVER .. "mb"] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}

	for _, wordobj in ipairs(words) do
		wordobj.term = unstressed_word_replacement[wordobj.term] or wordobj.term
	end

	return words
end

-- Respell certain prefixed words: s becomes z after initial enfon-/endin- and inside tr…ns…, and initial inex-
-- becomes in.ex (an explicit syllable break after in-). Returns the respelled word.
-- NOTE(review): the empty captures `()` in the patterns below look like character classes whose bracketed contents
-- were stripped from the page text; as written they are position captures, so "%1"/"%2" insert numbers. The original
-- classes cannot be determined from this file -- restore from the page's wikitext source.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
local function fix_prefixes(word) -- Voiced s in prefix roots -fons-, -dins-, -trans- word = rsub(word, "^enfons()", "enfonz%1") word = rsub(word, "^endins()", "endinz%1") word = rsub(word, "tr()ns()", "tr%1nz%2")

-- in + ex > ineks/inegz word = rsub(word, "^inex", "in.ex")

return word end

-- Re-insert an accent or diaeresis on i/u in a fixed list of word endings (-um, -isme, -ist, -ir, -int and the
-- future/conditional endings in -ir-), so that hiatus can be recognised later. Returns the respelled word.
-- NOTE(review): the empty captures `()` in the patterns below look like character classes whose bracketed contents
-- were stripped from the page text; the original classes cannot be determined from this file -- restore from the
-- page's wikitext source.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
local function restore_diaereses(word) -- Some structural forms do not have diaeresis per diacritic savings, let's restore it to identify hiatus

word = rsub(word, "()um(s?)$", "%1üm%2") -- Latinisms (-ius is ambiguous but rare)

word = rsub(word, "()isme(s?)$", "%1ísme%2") -- suffix -isme word = rsub(word, "()ist(s?)$", "%1íst%2") -- suffix -ista

word = rsub(word, "()ir$", "%1ír") -- verbs -ir word = rsub(word, "()int$", "%1ínt") -- present participle word = rsub(word, "()ir()$", "%1ïr%2") -- future word = rsub(word, "(u)ir()$", "%1ïr%2") word = rsub(word, "()iràs$", "%1ïràs") word = rsub(word, "(u)iràs$", "%1ïràs") word = rsub(word, "()ir(e)$", "%1ïr%2") word = rsub(word, "(u)ir(e)$", "%1ïr%2") word = rsub(word, "()iran$", "%1ïran") word = rsub(word, "(u)iran$", "%1ïran") word = rsub(word, "()iria$", "%1ïria") -- conditional word = rsub(word, "(u)iria$", "%1ïria") word = rsub(word, "()ir(ie)$", "%1ïr%2") word = rsub(word, "(u)ir(ie)$", "%1ïr%2")

return word end

-- Normalise written y: the digraph ny becomes ñ, and y is rewritten as i in the two contexts given by the patterns
-- below. Returns the respelled word.
-- NOTE(review): the empty captures `()` in the two y patterns look like character classes whose bracketed contents
-- were stripped from the page text; the intended contexts cannot be determined from this file -- restore from the
-- page's wikitext source.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
local function fix_y(word) -- y > vowel i else consonant /j/, except ny

word = rsub(word, "ny", "ñ")

word = rsub(word, "y()", "i%1") -- vowel if not next to another vowel word = rsub(word, "()y", "%1i") -- excluding also syllables separators

return word end

-- Add an accent to a final-syllable e/o (or change è to ê) in a fixed set of word endings, and record a tracking
-- entry under "cy-IPA/..." through Module:debug/track for every rule that fired. Returns the respelled word.
local function mid_vowel_fixes(word)
	local function track_mid_vowel(vowel, cont)
		require("Module:debug/track"){"cy-IPA/" .. vowel, "cy-IPA/" .. vowel .. "/" .. cont}
		return true
	end

	-- Each rule: pattern, replacement, tracked vowel, tracked context. Rules are applied in this order.
	-- final -el (not -ell) usually è but not too many cases
	local rules = {
		{"e(nts?)$", "é%1", "e", "nt-nts"},
		{"e(rs?)$", "é%1", "e", "r-rs"},
		{"o(rs?)$", "ó%1", "o", "r-rs"},
		{"è(s?)$", "ê%1", "è", "s-blank"},
		{"e(ss)$", "ê%1", "e", "sos-sa-ses"},
		{"e(sa)$", "ê%1", "e", "sos-sa-ses"},
	}

	for _, rule in ipairs(rules) do
		local changed
		word, changed = rsubb(word, rule[1], rule[2])
		if changed then
			track_mid_vowel(rule[3], rule[4])
		end
	end

	return word
end

-- Apply word-level respelling fixes in a fixed order: replace "(rr)" and "(r)" with the TEMP_PAREN_RR/TEMP_PAREN_R
-- placeholders, adjust the start of hyphenated elements, run fix_prefixes(), restore_diaereses(), fix_y() and
-- mid_vowel_fixes(), respell ch + vowel as tx, and delete h after a consonant.
-- `dialect`: when it is "val", "-x" is additionally respelled as "-tx".
-- Returns the respelled word.
-- NOTE(review): the patterns "%-(?)" and "^n" look like they have lost a bracketed character class that was stripped
-- from the page text (the neighbouring comment speaks of silent p/m in pn-/mn-, which "^n" -> "n" cannot express).
-- Restore from the page's wikitext source.
-- NOTE(review): the paragraph is collapsed onto one line, so the code after the first `--` is currently read as a
-- comment.
local function word_fixes(word, dialect) word = rsub(word, "%(rr%)", TEMP_PAREN_RR) word = rsub(word, "%(r%)", TEMP_PAREN_R) word = rsub(word, "%-(?)", "-%1%1") if dialect == "val" then word = rsub(word, "%-x", "-tx") end word = rsub(word, "rç$", "rrs") -- silent r only in plurals -rs word = fix_prefixes(word) -- internal pause after a prefix word = restore_diaereses(word) -- no diaeresis saving word = fix_y(word) -- ny > ñ else y > i vowel or consonant word = mid_vowel_fixes(word) -- all words in pn- (e.g. pneumotòrax and mn- (e.g. mnemònic) have silent p/m in both Central and Valencian word = rsub(word, "^n", "n") -- Respell ch + vowel as tx, before we remove other h's after consonants. word = rsub(word, "ch(" .. V ..")", "tx%1") -- Delete h after a consonant. This must happen here, before split_syllables(). We don't delete h after a vowel -- yet because it indicates a hiatus. word = rsub(word, "(" .. C .. ")h", "%1")

return word end

-- Split a run of adjacent vowels into a list of syllable objects.
--
-- `vowels` is a non-empty string of vowel characters; `saw_dotover` and `saw_lineunder` are stored on the first
-- syllable as `has_dotover` / `has_lineunder`. Each returned syllable has fields `onset`, `vowel` and `coda`; the
-- first one also has `separator`.
--
-- The first vowel always starts a syllable. Every later syllable takes an optional leading i/u as its onset and the
-- next character as its vowel. If the last syllable's vowel ends up being i or u, it is folded into the coda of the
-- preceding syllable instead.
--
-- Repairs relative to the damaged text: the `[count]` / `[count - 1]` index expressions had been stripped (so the
-- function cleared its own result and returned nil), as had the character class in the match pattern.
-- NOTE(review): the class is restored as `[iu]`, chosen to match the i/u test on the final syllable -- confirm
-- against the page's wikitext source.
local function split_vowels(vowels, saw_dotover, saw_lineunder)
	local syllables = {{
		onset = "",
		vowel = usub(vowels, 1, 1),
		coda = "",
		separator = "",
		has_dotover = saw_dotover,
		has_lineunder = saw_lineunder,
	}}
	vowels = usub(vowels, 2)

	while vowels ~= "" do
		local syll = {onset = "", vowel = "", coda = ""}
		syll.onset, syll.vowel, vowels = rmatch(vowels, "^([iu]?)(.)(.-)$")
		table.insert(syllables, syll)
	end

	local count = #syllables

	-- A trailing i/u forms a falling diphthong with the previous vowel rather than its own syllable.
	if count >= 2 and (syllables[count].vowel == "i" or syllables[count].vowel == "u") then
		syllables[count - 1].coda = syllables[count].vowel
		syllables[count] = nil
	end

	return syllables
end

-- NOTE(review): this function is not valid Lua as it stands. Index expressions and pattern character classes look
-- like they were stripped from the page text: `vowel_list` is assigned but never used while the two match patterns
-- are "^(*)(.-)$"; `syllables.coda`, `syllables.vowel`, `local current = syllables`, `local previous = syllables`
-- and `local final = syllables` lack their indices; several patterns are empty or truncated ("$", "^$", "^*$", "");
-- and the `while not (... valid_onsets?$", ""), "_", "")]) do` condition is cut in the middle of an expression.
-- None of this can be recovered from this file -- restore from the page's wikitext source.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`, -- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and -- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the -- index of the syllable with primary stress) and `is_prefix` (true if the word is a prefix, i.e. it ends in '-'). -- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be -- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms. local function split_syllables(word, stress_prefixes, may_be_uppercase) local syllables = {} local saw_dotover = false

local remainder = word local is_prefix = false if remainder:find("%-$") then -- prefix is_prefix = true remainder = remainder:gsub("%-$", "") end local is_suffix = false if remainder:find("^%-") then -- suffix is_suffix = true remainder = remainder:gsub("^%-", "") end

while remainder ~= "" do local consonants, vowels

-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY. -- Not sure why. local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or "aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER consonants, remainder = rmatch(remainder, "^(*)(.-)$") vowels, remainder = rmatch(remainder, "^(*)(.-)$") local this_saw_dotover = not not rfind(vowels, DOTOVER) if this_saw_dotover then saw_dotover = true vowels = vowels:gsub(DOTOVER, "") end local this_saw_lineunder = not not rfind(vowels, LINEUNDER) if this_saw_lineunder then vowels = vowels:gsub(LINEUNDER, "") end

if vowels == "" then if #syllables > 0 then syllables.coda = syllables.coda .. consonants else -- word without vowels, e.g. foot boundary | table.insert(syllables, {onset = consonants, vowel = "", coda = "", separator = ""}) end else local onset = consonants local first_vowel = usub(vowels, 1, 1)

if (rfind(onset, "$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u"))) or ((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and (first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I")) then onset = onset .. usub(vowels, 1, 1) vowels = usub(vowels, 2) end

local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder) vsyllables.onset = onset .. vsyllables.onset

for _, s in ipairs(vsyllables) do table.insert(syllables, s) end end end

-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid for i = 2, #syllables do local current = syllables local previous = syllables

while not (current.onset == "" or valid_onsets?$", ""), "_", "")]) do local letter = usub(current.onset, 1, 1) current.onset = usub(current.onset, 2) if rfind(letter, "") then -- syllable separators current.separator = letter break else previous.coda = previous.coda .. letter if rfind(letter, tie_c) then break end end end end

-- Detect stress for i, syll in ipairs(syllables) do if rfind(syll.vowel, "^$") then syll.stressed = true -- primary stress: the last one stressed without LINEUNDER if not syll.has_lineunder then syllables.stress = i end end end

-- Assign default stress if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then local count = #syllables

if count == 1 then if syllables.vowel ~= "" then -- vowel-less words don't get stress syllables.stress = 1 end else local final = syllables

-- Take account of tie symbols (apostrophes and ‿). if rfind(final.coda, "^*$") or (rfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and ( final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then syllables.stress = count - 1 else syllables.stress = count end end if syllables.stress then syllables.stressed = true end end

syllables.is_prefix = is_prefix syllables.is_suffix = is_suffix return syllables end

-- Rebuild the word string from a syllable list: for each syllable its separator, onset, vowel, DOTOVER and/or
-- LINEUNDER (if flagged on the syllable) and coda, in that order. A leading "-" is added when the list is flagged
-- `is_suffix` and a trailing "-" when it is flagged `is_prefix`.
local function reconstitute_word_from_syllables(syllables)
	local pieces = {}

	if syllables.is_suffix then
		pieces[#pieces + 1] = "-"
	end

	for _, syl in ipairs(syllables) do
		-- Assigning nil is a no-op, so a syllable without a separator simply contributes nothing here.
		pieces[#pieces + 1] = syl.separator
		pieces[#pieces + 1] = syl.onset
		pieces[#pieces + 1] = syl.vowel
		if syl.has_dotover then
			pieces[#pieces + 1] = DOTOVER
		end
		if syl.has_lineunder then
			pieces[#pieces + 1] = LINEUNDER
		end
		pieces[#pieces + 1] = syl.coda
	end

	if syllables.is_prefix then
		pieces[#pieces + 1] = "-"
	end

	return table.concat(pieces)
end

-- Replace every precomposed dot-above letter listed in `decompose_dotover` with its base letter + DOTOVER.
-- The match pattern had been reduced to an empty string (its bracketed contents were stripped from the page text),
-- leaving `dotover_keys` computed but unused; the pattern is rebuilt here as a character class over those keys.
local function decompose_respelling(text)
	local dotover_keys = concat_keys(decompose_dotover)
	return rsub(text, "[" .. dotover_keys .. "]", decompose_dotover)
end

-- Canonicalise a respelling string: collapse and trim whitespace, remove some punctuation, and turn commas, dashes
-- and "..." into the foot-boundary marker " | " (see the inline comments for the order of the steps).
-- Returns the canonicalised text.
-- NOTE(review): most patterns below look like they have lost a bracketed character class that was stripped from the
-- page text (e.g. rsub(text, "", ""), "$", "+", " * ", "() * *()"), so the punctuation each step targets cannot be
-- determined from this file -- restore from the page's wikitext source.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
local function canon_respelling(text) local function canon_spaces(text) text = rsub(text, "%s+", " ") text = rsub(text, "^ ", "") text = rsub(text, " $", "") return text end

text = canon_spaces(text) -- eliminate upside down punctuation text = rsub(text, "", "") -- eliminate utterance-final punctuation text = rsub(text, "$", "") -- eliminate double and triple quotes text = rsub(text, "+", "") -- Convert commas and em/en dashes to IPA foot boundaries; require a space after commas and en dashes (for the -- latter, in particular, to avoid treating the en dash in 'Bose–Einstein condensate' as a foot boundary. text = rsub(text, " * ", " | ") text = rsub(text, " * *", " | ") -- ... in phrases like com es diu...en català and necessito ... become foot boundaries text = rsub(text, " *%.%.%. *", " | ") -- remaining commas and en dashes become spaces text = rsub(text, "", " ") -- may need to eliminate extraneous spaces again, e.g. if there was a space before or after an eliminated -- punctuation mark text = canon_spaces(text) -- question mark or exclamation point in the middle of a sentence -> IPA foot boundary text = rsub(text, "() * *()", "%1 | %2") return text end

-- Per-dialect vowel tables (three entries each) and a general vowel table (twelve entries); the values are IPA
-- vowel symbols.
-- NOTE(review): every key is missing -- each entry starts with a bare `=` -- so these constructors are not valid
-- Lua as they stand. The keys look like bracketed expressions stripped from the page text and cannot be recovered
-- from this file; restore from the page's wikitext source.
-- NOTE(review): the table names (central / balearic / valencian) are unchanged here although export.dialects lists
-- only "NW" and "SW" -- confirm whether these tables are still meant to be used.
local IPA_vowels_central = { = "ɛ", = "ɛ", = "ɔ", } local IPA_vowels_balearic = { = "ə", = "ɛ", = "ɔ", } local IPA_vowels_valencian = { = "e", = "e", = "o", }

local IPA_vowels = { = "a", = "ɛ", = "ɛ", = "ɛ", = "e", = "i", = "i", = "ɔ", = "ɔ", = "o", = "u", = "u", }

-- Apply context-free replacements to a consonant string, in the order written: ŀ -> l, r -> ɾ then ɾɾ -> r,
-- ss -> s, ll -> ʎ, ñ -> ɲ, the affricate replacements, ç -> s, deletion of h, j -> ʒ, i and y -> j, and
-- apostrophe -> tie character. Returns the converted string.
-- NOTE(review): rsub(cons, "", "k") and rsub(cons, "", "w") have empty patterns; these look like character classes
-- whose bracketed contents were stripped from the page text and cannot be recovered from this file.
-- NOTE(review): "j" is replaced by "ʤ" before the later rsub(cons, "j", "ʒ"), so the later rule can never match as
-- written -- confirm whether the first pattern was meant to be a longer sequence.
-- NOTE(review): each paragraph is collapsed onto one line, so the code after the first `--` on a line is currently
-- read as a comment.
local function replace_context_free(cons) cons = rsub(cons, "ŀ", "l")

cons = rsub(cons, "r", "ɾ") cons = rsub(cons, "ɾɾ", "r") cons = rsub(cons, "ss", "s") cons = rsub(cons, "ll", "ʎ") cons = rsub(cons, "ñ", "ɲ") -- hint ny > ñ

-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert them -- to tied multi-character affricates at the end of join_syllables(). cons = rsub(cons, "j", "ʤ") cons = rsub(cons, "tx", "ʧ") cons = rsub(cons, "z", "ʣ")

cons = rsub(cons, "ç", "s") cons = rsub(cons, "", "k") cons = rsub(cons, "h", "") cons = rsub(cons, "j", "ʒ") -- Don't replace x -> ʃ yet so we can distinguish x from manually specified ʃ.

cons = rsub(cons, "i", "j") -- must be after j > ʒ cons = rsub(cons, "y", "j") -- must be after j > ʒ and fix_y cons = rsub(cons, "", "w") cons = rsub(cons, "'", "‿")

return cons end

-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code -- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the -- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move -- more of the work done in preprocess_word(), e.g. most of replace_context_free(), here. local function postprocess_general(text, dialect) local function verify(cond, msg) if not cond then error(("Internal error: %s; processed respelling at this point is '%s'"):format(msg, text)) end return true end

local voiced = listToSet {"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"} local voiced_keys = concat_keys(voiced) local voiceless = listToSet {"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"} local voiceless_keys = concat_keys(voiceless) local voicing = { = "b", = "d", = "g", = "v", = "z", = "ʒ", = "ʤ", = "ʤ"} local voicing_keys = concat_keys(voicing) local devoicing = {} for k, v in pairs(voicing) do devoicing = k end local devoicing_keys = concat_keys(devoicing)

------------------ Handle <x>

-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles -- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV. text = rsub_repeatedly(text, "(" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V .. ")", function(e, pre, post) -- Preserve other character separators (especially the tie character ‿). pre = pre:gsub("%.", "") post = post:gsub("%.", "") return e .. pre .. "g.z" .. post end) -- -x- at the beginning of a coda becomes , e.g. annex, apèndix, extracció; but not elsewhere in -- the coda, e.g. in romanx, ponx; words with in -nx such as esfinx, linx, manx need -- respelling with ; words ending in vowel + x like ídix need respelling with text = rsub(text, "(" .. V .. stress_c .. "*)x", "%1ks") if dialect == "val" then -- Word-initial <x> as well as <x> after a consonant other than /j/ (including in the coda, e.g. ponx) -- becomes . text = rsub(text, "#x", "#ʧ") text = rsub(text, "(" .. charsep_c .. "*)x", "%1ʧ") end -- Other x becomes text = rsub(text, "x", "ʃ")

-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries? text = rsub(text, "s(" .. charsep_c .. "*)s", "%1s")

------------------ Coda consonant losses

-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when -- absolutely word-finally before a vowel or end of utterance (e.g. blanc has /k/ in Balearic and -- Valencian but not blancs). Must precede consonant assimilations. local boundary = dialect == "cen" and "(.)" or "()" text = rsub(text, "m" .. boundary, "m%1") text = rsub(text, "()" .. boundary, "%1%2") text = rsub(text, "" .. boundary, "ŋ%1") if dialect == "val" or dialect == "bal" then local before_cons = "(" .. separator_c .. "*" .. C .. ")" text = rsub(text, "m" .. before_cons, "m%1") text = rsub(text, "()" .. before_cons, "%1%2") text = rsub(text, "" .. before_cons, "ŋ%1") end

-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and -- t + lateral/nasal assimilation. text = rsub(text, "st(" .. sylsep_c .. "*)", "s%1")

------------------ Consonant assimilations

if dialect == "cen" then -- v > b in onsets (not in codas, e.g. ovni and hafni ). This needs to precede -- assimilation of nb -> mb. text = rsub(text, "v(" .. C .. "*" .. V ..")", "b%1") end

-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation -- because there are too many exceptions, e.g. aritmètic, atmosfèric, ètnia. Instead, we require that -- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either, -- e.g. atlàntic has in GDLC but in DNV. -- -- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words, -- move stuff below that special-cases tll in Valencian here. text = rsub(text, "t(" .. sylsep_c .. ")()", "%2%1%2")

-- n + labial > labialized assimilation text = rsub(text, "n(" .. separator_c .. "*)", "m%1") text = rsub(text, "n(" .. separator_c .. "*)", "ɱ%1")

-- n + velar > velarized assimilation text = rsub(text, "n(" .. separator_c .. "*)", "ŋ%1")

-- l/n + palatal > palatalized assimilation text = rsub(text, "()(" .. separator_c .. "*)", function(ln, palatal) ln = ({ = "ʎ", = "ɲ"}) return ln .. palatal end)

-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in menys or derived terms like menyspreu -- NOTE: Per , it does apply in these scenarios but the result is -- somewhere between and , which is why it isn't shown in GDLC. -- text = rsub(text, "ɲs", "%1ʃ")

------------------ Handle <r>

-- In replace_context_free(), we converted single r to ɾ and double rr to r. if dialect == "cen" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "r") elseif dialect == "bal" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "") else verify(dialect == "val", ("Unrecognized dialect '%s'"):format(dialect)) text = rsub(text, TEMP_PAREN_R, "ɾ") text = rsub(text, TEMP_PAREN_RR, "ɾ") end if dialect ~= "val" then -- Coda /ɾ/ -> /r/ -- FIXME: This is inherited from the older code. Correct? text = rsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r") end

-- ɾ -> r word-initially or after ; needs to precede voicing assimilation as ~~will be voiced to before -- /ɾ/. text = rsub(text, "(" .. sylsep_c .. "*)ɾ", "%1r")~~

------------------ Voicing assimilation

-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in -- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we -- do it character by character. local chars = split_into_chars(text) -- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because -- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which -- are always "##".) for i = #chars - 2, 3, -1 do -- We are looking for two consonants next to each other, possibly separated by a syllable or word divider. -- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally. -- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda. local first = chars -- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which -- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that -- consonant (empty string forces devoicing). local second -- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning -- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of word-internally -- separately, in preprocess_word() [FIXME: maybe move much of the processing in preprocess_word() into this -- function]. 
local word_boundary_before_vowel if not rfind(first, C) then -- leave `second` at nil; no assimilation elseif chars == "#" then -- word boundary if chars == " " then -- chars should always be "#" verify(chars == "#", "Word boundary followed by space but not #") if rfind(chars, C) then second = chars else second = "" word_boundary_before_vowel = true end else second = "" end elseif rfind(chars, sylsep_c) then -- syllable boundary if rfind(chars, C) then second = chars else second = "" end elseif rfind(chars, C) then second = chars else -- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation end if second then -- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary. local in_coda = false local j = i - 1 while true do verify(j > 0, "Missing word boundary at beginning of overall respelling") if rfind(chars, "") then break elseif rfind(chars, V) then in_coda = true break end j = j - 1 end if in_coda then if word_boundary_before_vowel and rfind(first, "") then -- leave alone elseif voiced and voicing or word_boundary_before_vowel and rfind(first, "") then chars = voicing elseif (voiceless or second == "") and devoicing then chars = devoicing end end end end text = table.concat(chars)

-- gn -> ŋn e.g. regnar (including word-initial gn- e.g. gnòmic, gneis) -- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words -- (here I'm guessing yes). if dialect ~= "cen" then text = rsub(text, "#gn", "#n") end text = rsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")

-- gʒ > d͡ʒ -- FIXME: We need to investigate if it operates across words text = rsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ") -- sʃ -> ʃ (desxifrar), zʒ -> ʒ (disjuntor) if dialect ~= "val" then text = rsub(text, "s(" .. separator_c .. "*ʃ)", "%1") text = rsub(text, "z(" .. separator_c .. "*ʒ)", "%1") end

------------------ Gemination of <bl>, <gl>

if dialect ~= "val" then -- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use <b_l> or <g_l>. -- This must follow v > b above. To force a hard ungeminated or , use <_b> or <_g>. text = rsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")()l", "%1%3%2%3l") else -- Valencian; undo manually written 'bbl', 'ggl' in words like poblar, reglament text = rsub(text, "()(" .. sylsep_c .. ")%1l", "%2%1l") end

------------------ Lenition of voiced stops

-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a -- vowel and (except for <d>) after <l> and <ll> (cf. GDLC cabellblanc ). This also happens across -- word boundaries but doesn't happen after stops, nor in Central Catalan after , or (and hence probably -- not after either, although I can't find any examples in GDLC). -- -- In Valencian, doesn't lenite (at least formally?), but <d> and <g> do lenite after , or . -- -- Balearic is like Valencian in not leniting , and probably like Central Catalan otherwise. local lenite_bdg = { = "β", = "ð", = "ɣ"} if dialect == "cen" then text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()", function(before, bdg) return before .. lenite_bdg end) elseif dialect == "val" then text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()", function(before, dg) return before .. lenite_bdg end) else verify(dialect == "bal", ("Unrecognized dialect '%s'"):format(dialect)) text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()", function(before, dg) return before .. lenite_bdg end) end

------------------ Vowel reduction

-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan). if dialect ~= "val" then -- The following rules seem to apply, based on the old code: -- (1) Stressed a and e are never reduced. -- (2) Unstressed e directly following ə is not reduced. -- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced. -- (4) Written <ee> when both vowels precede the primary stress is reduced to . (This rule preempts #2.) -- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all. -- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was -- implemented. -- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances. local words = rsplit(text, " ") for j, word in ipairs(words) do local chars = split_into_chars(word) -- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we -- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we -- shoudn't run off the end (and have assertions to check this). local seen_primary_stress = false for i = 2, #chars - 1 do local this = chars if chars == AC then seen_primary_stress = true end if (this ~= "a" and this ~= "e") or rfind(chars, stress_c) then -- Not a/e, or a stressed vowel; continue else local reduction = true local prev, prev_stress, nxt, nxt_stress if not rfind(chars, sylsep_c) then prev = "" else prev = chars -- this should be non-nil as chars is a syllable separator (not #) verify(prev, "Missing # at word boundary") prev_stress = "" if rfind(prev, stress_c) then prev_stress = prev prev = chars -- As above; chars is a stress indicator (not #). verify(prev, "Missing # at word boundary") end end if not rfind(chars, sylsep_c) then nxt = "" -- leave nxt at nil else nxt = chars nxt_stress = chars -- chars is a syllable separator, so chars should not be a word boundary, so -- chars should exist. 
verify(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary") end if this == "e" and rfind(prev, "ə") then reduction = false elseif this == "e" and rfind(nxt, "") then reduction = false elseif this == "e" and nxt == "e" and not rfind(nxt_stress, AC) then -- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary. if not seen_primary_stress then chars = "ə" else reduction = false end end if reduction then chars = "ə" end end end words = table.concat(chars) end text = table.concat(words, " ") end

if dialect == "cen" then -- Reduction of unstressed o (not before w) text = rsub(text, "o()", "u%1") elseif dialect == "bal" then -- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/; -- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks -- or word separators). text = rsub(text, "o(*" .. stress_c .. ")", "u%1") end

~~-- Final losses. text = rsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/ text = rsub(text, "()s#", "%1#") -- homophone plurals -xs, -igs, -çs~~

if dialect ~= "val" then -- Remove j before palatal obstruents text = rsub(text, "j(" .. sylsep_c .. "*)", "%1") else -- Valencian -- Fortition of palatal fricatives text = rsub(text, "ʒ", "ʤ") text = rsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z") end

if dialect ~= "cen" then -- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic. -- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling -- instead. text = rsub(text, "(a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(m" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1") end

~~---------- Convert pseudo-symbols to real ones.~~

~~-- Convert g to IPA ɡ. text = rsub(text, "g", "ɡ")~~

~~-- Convert pseudo-afficate symbols to full affricates. local full_affricates = { = "t͡s", = "d͡z", = "t͡ʃ", = "d͡ʒ" } text = rsub(text, "()", full_affricates)~~

~~---------- Generate IPA stress marks.~~

-- Convert acute and grave to IPA stress marks. text = rsub(text, AC, "ˈ") text = rsub(text, GR, "ˌ") -- Move IPA stress marks to the beginning of the syllable. text = rsub_repeatedly(text, "()(*)(" .. ipa_stress_c .. ")", "%1%3%2") -- Suppress syllable divider before IPA stress indicator. text = rsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1") -- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above. -- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in Module:pt-pronunc. -- text = rsub_repeatedly(text, "ˈ(+)ˈ", "ˌ%1ˈ") -- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.) -- text = rsub_repeatedly(text, "ˈ(*#" .. PREFIX_MARKER .. ")", "ˌ%1")

-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant -- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose. text = rsub(text, "", "") text = mw.ustring.toNFC(text)

~~return text end~~

-- preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word): builds a new syllable list from `syllables`
-- (copying the onset/vowel/coda/stressed fields), rewrites onsets, codas and vowels towards IPA, appends
-- `suffix_syllables` and returns the new list. Raises an error quoting `orig_word` when the stressed vowel or a
-- final -r/-rs is considered ambiguous. `pos` is not referenced anywhere in the visible body.
-- NOTE(review): this text looks extraction-damaged. Statements are run together on single physical lines (so
-- everything after a `--` on the same line is swallowed by that comment), stray `~~` markers wrap some statements,
-- and bracketed expressions (table indexes, table keys, pattern character classes) appear to be missing, e.g.
-- `rfind(stressed_vowel, "")`, `local marks = { = {...}, = {...}}` and `local final = syllables`. Restore the
-- function from the upstream module before relying on it.
-- NOTE(review): the dialect codes ("cen", "bal") and the ending rules discussed below describe Catalan, whereas the
-- module head declares Welsh dialects NW/SW -- confirm whether this logic is meant to apply here.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word) -- Stressed vowel is ambiguous if syllables.stress then local stressed_vowel = syllables.vowel if rfind(stressed_vowel, "") then local marks = { = {AC, GR, CFLEX, DIA}, = {AC, GR, CFLEX}} local marked_vowels = {} for _, mark in ipairs(marks) do table.insert(marked_vowels, stressed_vowel .. mark) end

-- The error below lists the candidate marked vowels collected in `marked_vowels`.
error(("In respelling '%s', the stressed vowel '%s' is ambiguous. Please mark it with an acute, " .. "grave, or combined accent: %s."):format(orig_word, stressed_vowel, m_table.serialCommaJoin(marked_vowels, {dontTag = true, conj = "or"}))) end end

-- Final -r is ambiguous in many cases. local final = syllables -- Stressed final r after a or i in non-monosyllables is treated as (r), i.e. verbal infinitives are assumed (NOTE: -- not always the case, e.g. there are many adjectives and nouns in -ar that should be marked as '(rr)', and -- several loanword nouns in -ir that should be marked as 'rr'). Likewise for stressed final r or rs after é in -- non-monosyllables (which are usually adjectives or nouns with the -er ending, but may be verbal infinitives, -- which should be marked as 'ê(r)'). That is, it disappears other than in Valencian. All other final r and final -- rs are considered ambiguous and need to be rewritten using rr, (rr) or (r). if #syllables > 1 and final.stressed then if final.coda == "r" and rfind(final.vowel, "") or final.coda == "rs" and final.vowel == "é" or final.vowel == "ó" and rfind(final.coda, "^rs?$") and rfind(final.onset, "") then final.coda = TEMP_PAREN_R end end if rfind(final.coda, "^rs?$") or rfind(final.coda, "rs?$") then error(("In respelling '%s', final -r by itself or in -rs is ambiguous except in the verbal endings -ar or " .. "-ir, in the nominal or adjectival endings -er(s) and -or(s). In all other cases it needs to be " .. "rewritten using one of 'rr' (pronounced everywhere), '(rr)' (pronounced everywhere but Balearic) or " .. "'(r)' (pronounced only in Valencian). Note that adjectives in -ar usually need rewriting using '(rr)'; " .. "nouns in -ar referring to places should be rewritten using '(r)'; and loanword nouns in -ir usually " .. "need rewriting using 'rr'."):format(orig_word)) end

-- Start the output list, carrying over the word-level stress/is_prefix/is_suffix fields.
~~local syllables_IPA = {stress = syllables.stress, is_prefix = syllables.is_prefix, is_suffix = syllables.is_suffix}~~

-- NOTE(review): as written this reassigns `syllables_IPA` on every iteration; presumably an index on the left-hand
-- side was lost -- confirm against upstream.
~~for key, val in ipairs(syllables) do syllables_IPA = {onset = val.onset, vowel = val.vowel, coda = val.coda, stressed = val.stressed} end~~

~~-- Replace letters with IPA equivalents for i, syll in ipairs(syllables_IPA) do -- Voicing of s if syll.onset == "s" and i > 1 and rfind(syllables.coda, "^?$") then syll.onset = "z" end~~

-- Onset rewrites that depend on the following vowel; the vowel class and the replacement-table keys are missing.
if rfind(syll.vowel, "^$") then syll.onset = rsub(syll.onset, "tg$", "ʤ") syll.onset = rsub(syll.onset, "$", { = "s", = "ʒ"}) syll.onset = rsub(syll.onset, "u$", { = "k", = "g"}) end

~~syll.coda = rsub(syll.coda, "igs?$", "iʤ")~~

-- Onset and coda both go through the same context-free replacement helper.
~~syll.onset = replace_context_free(syll.onset) syll.coda = replace_context_free(syll.coda)~~

-- Each vowel character is mapped through a dialect-specific table, then through the shared IPA_vowels table.
syll.vowel = rsub(syll.vowel, ".", dialect == "cen" and IPA_vowels_central or dialect == "bal" and IPA_vowels_balearic or IPA_vowels_valencian ) syll.vowel = rsub(syll.vowel, ".", IPA_vowels) end

-- Append the caller-supplied suffix syllables after the converted ones.
~~for _, suffix_syl in ipairs(suffix_syllables) do table.insert(syllables_IPA, suffix_syl) end~~

~~return syllables_IPA end~~

-- convert_single_substitution_to_original(to, pagename, whole_word): turns the replacement text `to` into a Lua
-- pattern, matches it against `pagename` and returns the matched substring. Raises an error if nothing matches or
-- if the match is identical to `to`.
-- NOTE(review): this text looks extraction-damaged. The function header sits on the same physical line as the
-- leading comment, and several patterns and replacements have lost their bracketed character classes, e.g.
-- `gsub("", "")`, `gsub("ʃ", "")`, `gsub("‿", "?")` and the unterminated `return " .. "]"`. The two "%f" frontier
-- patterns have presumably lost their sets as well. Restore from the upstream module before relying on it.
-- Given a single substitution spec, `to`, figure out the corresponding value of `from` used in a complete -- substitution spec. `pagename` is the name of the page, either the actual one or taken from the `pagename` param. -- `whole_word`, if set, indicates that the match must be to a whole word (it was preceded by ~). local function convert_single_substitution_to_original(to, pagename, whole_word) -- Replace specially-handled characters with a class matching the character and possible replacements. local escaped_from = to -- Handling of '(rr)', '(r)', '.' and '-' needs to be done before calling pattern_escape(); otherwise they will be -- escaped. escaped_from = escaped_from:gsub("%(rr%)", "r") escaped_from = escaped_from:gsub("%(r%)", "r") escaped_from = escaped_from:gsub("ks", "x"):gsub("Ks", "X"):gsub("gz", "x"):gsub("()%1l", "%1l"):gsub("", "") escaped_from = require(patut_module).pattern_escape(escaped_from) escaped_from = escaped_from:gsub("rr", "rr?") escaped_from = escaped_from:gsub("ss", "ss?") escaped_from = escaped_from:gsub("ʃ", "") -- This is tricky, because we already passed `escaped_from` through pattern_escape() causing a hyphen to get a -- % sign before it, and have to double up the percent signs to match and replace a literal %. escaped_from = escaped_from:gsub("%%%-", "%%-?") -- Tie sign (‿) should match against space, hyphen or nothing in the original. escaped_from = escaped_from:gsub("‿", "?") escaped_from = rsub(escaped_from, "", function(v) return " .. "]" end) escaped_from = escaped_from:gsub(DOTOVER, DOTOVER .. "?"):gsub(LINEUNDER, LINEUNDER .. "?") escaped_from = "(" .. escaped_from .. ")" if whole_word then escaped_from = "%f" .. escaped_from .. 
"%f" end local match = rmatch(pagename, escaped_from) if match then if match == to then error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"): format(to, pagename)) end return match end error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename)) end

-- apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err): interprets `respelling` as a
-- comma-separated substitution spec, applies it to `pagename` and returns the resulting respelling. Each item is
-- either a mid vowel hint (at most one, and only when `allow_mid_vowel_hints` is truthy) or a regular substitution
-- written 'FROM:TO' or just 'TO' (FROM is then derived via convert_single_substitution_to_original()); a leading ~
-- requests a whole-word match. Every regular substitution must match exactly once; otherwise `parse_err` is called.
-- NOTE(review): this text looks extraction-damaged. Statements are collapsed onto single lines, the return
-- statement is wrapped in `~~`, and bracketed expressions are missing: the spec-extracting pattern "^%$", the suffix
-- pattern "^(.*)(mnt)$", several empty "" vowel classes, the "%f" frontier sets, and presumably the index in
-- `syllables.vowel`. Restore from the upstream module before relying on it.
local function apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err) local subs = split_on_comma(rmatch(respelling, "^%$")) respelling = pagename local mid_vowel_hint local regular_subs = {} for _, sub in ipairs(subs) do if rfind(sub, "^" .. export.mid_vowel_hint_c .. "$") then if mid_vowel_hint then parse_err(("Specified mid vowel hint twice, '%s' and '%s'"):format( mid_vowel_hint, sub)) end mid_vowel_hint = sub else table.insert(regular_subs, sub) end end if mid_vowel_hint then if not allow_mid_vowel_hints then parse_err(("Mid vowel hint '%s' not allowed when apply one substitution spec to multiple words"):format( mid_vowel_hint)) end local suffix = "" -- FIXME: This duplicates logic in to_IPA(). if not pos or pos == "adverb" then local part_before_ment, ment = rmatch(respelling, "^(.*)(mnt)$") if part_before_ment and (pos == "adverb" or not rfind(part_before_ment, "$") and rfind(part_before_ment, V .. ".*" .. V)) then suffix = ment respelling = part_before_ment end end local syllables = split_syllables(respelling, "stress prefixes", "may be uppercase") local stressed_vowel = syllables.vowel if stressed_vowel == mid_vowel_hint then -- do nothing elseif rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") or rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") then syllables.vowel = mid_vowel_hint else parse_err(("Stressed vowel '%s' not compatible with mid vowel hint '%s'"):format( stressed_vowel, mid_vowel_hint)) end respelling = reconstitute_word_from_syllables(syllables) .. suffix end

-- Apply the regular substitutions in order; the 0-match and multi-match cases are both reported via parse_err.
for _, sub in ipairs(regular_subs) do local from, escaped_from, to, escaped_to, whole_word if rfind(sub, "^~") then -- whole-word match sub = rmatch(sub, "^~(.*)$") whole_word = true end if sub:find(":") then from, to = rmatch(sub, "^(.-):(.*)$") else to = sub from = convert_single_substitution_to_original(to, pagename, whole_word) end if from then local patut = require(patut_module) escaped_from = patut.pattern_escape(from) if whole_word then escaped_from = "%f" .. escaped_from .. "%f" end escaped_to = patut.replacement_escape(to) local subbed_respelling, nsubs = rsubn(respelling, escaped_from, escaped_to) if nsubs == 0 then parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format( from, to, respelling)) elseif nsubs > 1 then parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " .. "more context"):format(from, to, respelling)) else respelling = subbed_respelling end end end

~~return respelling end~~

-- Maps every accepted part-of-speech spelling (abbreviation or full name) to its canonical full name.
local canonicalize_pos = {
	-- nouns
	n = "noun",
	noun = "noun",
	-- verbs
	v = "verb",
	vb = "verb",
	verb = "verb",
	-- adjectives
	a = "adjective",
	adj = "adjective",
	adjective = "adjective",
	-- adverbs
	av = "adverb",
	adv = "adverb",
	adverb = "adverb",
	-- anything else
	o = "other",
	other = "other",
}

-- Split an optional part-of-speech prefix of the form 'POS/' off the front of `respelling`.
--
-- Returns two values: the canonical part of speech (looked up in `canonicalize_pos`), or nil if no prefix is present,
-- and the respelling with the prefix removed. A prefix followed by nothing ('n/') is treated as 'n/+', i.e. the
-- remaining respelling becomes "+". If the prefix is not a key of `canonicalize_pos`, `parse_err` is called with a
-- message listing the valid values in sorted order.
--
-- The previous text tested and assigned the `canonicalize_pos` table itself rather than its entry for `pos`, so the
-- validity check could never fire and the whole table was returned as the part of speech.
-- NOTE(review): the letter class in the match pattern was missing from the previous text; lowercase ASCII letters
-- are assumed because every key of `canonicalize_pos` has that shape -- confirm against upstream.
local function parse_off_pos(respelling, parse_err)
	local pos, rest = respelling:match("^([a-z]+)/(.*)$")
	if not pos then
		-- No 'POS/' prefix: hand the respelling back untouched.
		return nil, respelling
	end

	local canonical_pos = canonicalize_pos[pos]
	if not canonical_pos then
		local valid_pos = {}
		for vp, _ in pairs(canonicalize_pos) do
			table.insert(valid_pos, vp)
		end
		-- pairs() order is unspecified; sort so the message is stable.
		table.sort(valid_pos)
		parse_err(("Unrecognized part of speech '%s', should be one of %s"):format(pos,
			table.concat(valid_pos, ", ")))
	end

	if rest == "" then
		rest = "+"
	end
	return canonical_pos, rest
end

-- parse_respelling(respelling, pagename, parse_err): returns {unknown = true} for "?", {omitted = true} for "-",
-- {raw_phonemic = ..., raw_phonetic = ...} for raw input (optionally prefixed with "raw:"), and otherwise
-- {words = {...}} where each word is {term = ..., pos = ...}. Errors in the input are reported through `parse_err`.
-- NOTE(review): this text looks extraction-damaged. The function header sits on the same physical line as the
-- leading comment, one statement is wrapped in `~~`, and bracketed expressions are missing: the examples in the
-- comment, the raw-phonetic patterns ("^/(.*)/ %$", "^%$"), the substitution-spec tests ("^%$", "]"), the
-- part-of-speech patterns ("^(+)/$"), and presumably the indexes in
-- `substitute_respelling_word(respelling_words, pagename_words)`. Restore from upstream before relying on it.
-- Parse a respelling given by the user, allowing for '+' for pagename, mid vowel hints in place of a respelling and -- substitution specs like '' or . In general, return an object {words = {WORD, WORD, ...}} where -- WORD is of the form {term = PARSED_RESPELLING, pos = POS}. Other fields are set in special cases: If a raw respelling -- was seen, the fields `raw_phonemic` and/or `raw_phonetic` are set; if '?' is seen, the field `unknown` is set; and if -- '-' is seen, the field `omitted` is set. local function parse_respelling(respelling, pagename, parse_err) if respelling == "?" then return { unknown = true } end if respelling == "-" then return { omitted = true } end local saw_raw local remaining_respelling = respelling:match("^raw:(.*)$") if remaining_respelling then saw_raw = true respelling = remaining_respelling end local raw_phonemic, raw_phonetic = respelling:match("^/(.*)/ %$") if not raw_phonemic then raw_phonemic = respelling:match("^/(.*)/$") end if not raw_phonemic and saw_raw then raw_phonetic = respelling:match("^%$") end if raw_phonemic or raw_phonetic then return { raw_phonemic = raw_phonemic, raw_phonetic = raw_phonetic, } end

-- Both strings go through the same decomposition helper before any comparison or substitution.
~~pagename = decompose_respelling(pagename) respelling = decompose_respelling(respelling)~~

-- Helper: canonicalize, split on spaces, and optionally peel a part-of-speech prefix off each word.
local function split_respelling_into_words(respelling, parse_pos) respelling = canon_respelling(respelling) local word_objs = {} local respelling_words = rsplit(respelling, " ") for _, word in ipairs(respelling_words) do local pos if parse_pos then pos, word = parse_off_pos(word, parse_err) end table.insert(word_objs, {term = word, pos = pos}) end return {words = word_objs} end

-- Helper: resolve one respelling word against the corresponding pagename word ('+', mid vowel hint or spec).
local function substitute_respelling_word(respelling_word, pagename_word) local pos pos, respelling_word = parse_off_pos(respelling_word, parse_err) if respelling_word == "+" then respelling_word = pagename_word else if rfind(respelling_word, "^" .. export.mid_vowel_hint_c .. "$") then respelling_word = "" end if rfind(respelling_word, "^%$") then respelling_word = apply_substitution_spec(respelling_word, pagename_word, pos, "allow mid vowel hint", parse_err) end end return {term = respelling_word, pos = pos} end

-- At this point, if there are multiple words in the pagename, there are three syntaxes allowed: all-at-once, -- replacement or word-by-word. All-at-once syntax involves either a + representing the entire pagename, or a -- substitution spec that applies to all words in the pagename. This syntax cannot have a prefixed part of speech -- because it wouldn't be clear which word to apply the part of speech to. Replacement syntax simply spells out the -- respelling without any substitution specs or +'s (but possibly with parts of speech prefixed to individual -- words), and can have a different number of words than the pagename (essentially, the pagename is disregarded). -- Word-by-word syntax involves a combination of respelled words, per-word substitution specs and/or a + -- representing an individual word, and must have the same number of words as the pagename so that substitution -- specs and +'s can be lined up with words in the pagename. In all cases, the return value is in the same format; -- see comment at top of function. if pagename:find(" ") or respelling:find(" ") then if respelling == "+" then return split_respelling_into_words(pagename) elseif rfind(respelling, "^%$") then -- all-at-once syntax with substitution spec return split_respelling_into_words(apply_substitution_spec(respelling, pagename, nil, false, parse_err)) elseif rfind(respelling, "^(+)/$") or rfind(respelling, "^(+)/%]*%]$") then -- attempt to include a part of speech in all-at-once syntax parse_err(("Part of speech not allowed when pagename is multiword and all-at-once syntax is used in " .. "the respelling, but saw '%s'"):format(respelling)) elseif rfind(respelling, "^" .. export.mid_vowel_hint_c .. "$") then -- attempt to use a mid-vowel hint in all-at-once syntax parse_err(("Single mid-vowel hint not allowed when pagename is multiword because it's not clear which " .. "word to apply it to, but saw '%s'"):format(respelling)) elseif rfind(respelling, "]") or rfind(respelling, "^" .. 
export.mid_vowel_hint_c .. " ") or rfind(respelling, " " .. export.mid_vowel_hint_c .. " ") or rfind(respelling, " " .. export.mid_vowel_hint_c .. "$") then -- word-by-word syntax local sub_with_space = rmatch(respelling, "%]* ]*%]") if sub_with_space then parse_err(("When using word-by-word syntax with a multiword pagename, saw substitution spec '%s' " .. "with spaces, which is not allowed because it must match a single word"):format(sub_with_space)) end pagename = canon_respelling(pagename) respelling = canon_respelling(respelling) local pagename_words = rsplit(pagename, " ") local respelling_words = rsplit(respelling, " ") if #pagename_words ~= #respelling_words then parse_err(("When using word-by-word syntax with a multiword pagename, saw %s words in pagename but " .. "%s word%s in respelling; they need to match"):format(#pagename_words, #respelling_words, #respelling_words > 1 and "s" or "")) end local word_objs = {} for i = 1, #pagename_words do table.insert(word_objs, substitute_respelling_word(respelling_words, pagename_words)) end return {words = word_objs} else -- replacement syntax; pagename ignored return split_respelling_into_words(respelling, "parse pos") end else local word_obj = substitute_respelling_word(respelling, pagename) word_obj.term = canon_respelling(word_obj.term) return {words = {word_obj}} end end

-- Parse a list of comma-split runs containing one or more respellings, i.e. the output of a balanced-segment parse
-- followed by a split on commas (see Module:parse utilities).
--
-- Parameters:
--   comma_separated_groups: list of groups; each group is a list of strings alternating between plain text (odd
--     positions) and delimited segments (even positions). The groups are modified in place.
--   pagename: the pagename, used when a respelling refers to it (e.g. '+'); if `true`, each respelling is used as
--     its own pagename.
--   original_input, input_param: the raw input and the name of the parameter holding it, passed through to the
--     inline-modifier parser.
--
-- Returns an object with a single field `terms`, a list with one entry per group as produced by
-- parse_inline_modifiers_from_segments(); the term object itself comes from parse_respelling(), and the inline
-- modifiers `ref`, `q`, `qq`, `a` and `aa` are accepted.
--
-- The previous text had lost the indexes in the rejoin loop (`group:find(...)`, `group = group .. group .. group`),
-- which calls a method on a table and concatenates tables; they are restored here as `group[j]` and its neighbours.
function export.parse_comma_separated_groups(comma_separated_groups, pagename, original_input, input_param)
	local function generate_obj(respelling, parse_err)
		return parse_respelling(respelling, pagename == true and respelling or pagename, parse_err)
	end
	local put = require(parse_utilities_module)

	-- Loop-invariant, so built once rather than per group.
	local param_mods = {
		-- pre = { overall = true },
		-- post = { overall = true },
		ref = {
			store = "insert",
			convert = function(arg, parse_err)
				return require("Module:references").parse_references(arg)
			end,
		},
		q = { store = "insert" },
		qq = { store = "insert" },
		a = { store = "insert" },
		aa = { store = "insert" },
	}

	local outer_container = {terms = {}}
	for _, group in ipairs(comma_separated_groups) do
		-- Rejoin runs that don't involve <...>: a delimited segment that is not an angle-bracket modifier is glued
		-- back together with the plain text on either side of it, so that only <...> segments remain at even
		-- positions.
		local j = 2
		while j <= #group do
			if not group[j]:find("^<.*>$") then
				group[j - 1] = group[j - 1] .. group[j] .. group[j + 1]
				-- Remove the two entries just merged into group[j - 1]; the next candidate slides into slot j.
				table.remove(group, j)
				table.remove(group, j)
			else
				j = j + 2
			end
		end

		table.insert(outer_container.terms, put.parse_inline_modifiers_from_segments {
			group = group,
			arg = original_input,
			props = {
				paramname = input_param,
				param_mods = param_mods,
				generate_obj = generate_obj,
				splitchar = ",",
				outer_container = outer_container,
			},
		})
	end

	return outer_container
end

-- to_IPA(words, dialect): converts each word object's `term` to a dot-joined syllable string, joins the words with
-- spaces, wraps the result in '#' boundary markers and returns whatever postprocess_general() makes of it.
-- NOTE(review): this text looks extraction-damaged. The function header sits on the same physical line as the
-- leading comment, some statements are wrapped in `~~`, and bracketed expressions are missing: the accent check
-- `rfind(wordobj.term, "")`, the suffix patterns "^(.*)(mnt)$" and "$", and presumably the index in
-- `syllables.stressed`. `suffix_syllables = Template:onset = ...` looks like a mangled nested table literal.
-- Restore from the upstream module before relying on it.
-- NOTE(review): the dialect codes in the comment below ("cen", "bal", "val") do not match the NW/SW dialects
-- declared at the top of this module -- confirm.
-- Generate the pronunciation of `words` (a list of word objects representing respellings, each of which is an object -- of the form {term = RESPELLING, pos = PART_OF_SPEECH} in `dialect` ("cen", "bal" or "val"). local function to_IPA(words, dialect) local pronuns = {}

-- Reject respellings containing characters matched by the (now missing) accent class.
for _, wordobj in ipairs(words) do if rfind(wordobj.term, "") then error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions" ):format(wordobj.term)) end end

~~words = handle_unstressed_words(words)~~

~~for _, wordobj in ipairs(words) do local word = wordobj.term local pos = wordobj.pos local suffix_syllables = {} local orig_word = word~~

-- Split off a trailing suffix as a separate stressed syllable when `pos` is unset or "adverb".
word = ulower(word) if not pos or pos == "adverb" then local word_before_ment, ment = rmatch(word, "^(.*)(mnt)$") if word_before_ment and (pos == "adverb" or not rfind(word_before_ment, "$") and rfind(word_before_ment, V .. ".*" .. V)) then suffix_syllables = Template:onset = "m", vowel = "e", coda = "nt", stressed = true pos = "adjective" word = word_before_ment end end

-- Each syllable is rendered as onset .. vowel .. stress mark .. coda; syllables are then joined with ".".
word = word_fixes(word, dialect) local syllables = split_syllables(word) syllables = preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word) -- Combine syllables. local combined = {} local has_ment = #suffix_syllables > 0 for i, syll in ipairs(syllables) do local ac = (i == syllables.stress and not syllables.is_prefix and not has_ment or has_ment and i == #syllables) and AC or -- primary stress syllables.stressed and GR or -- secondary stress "" table.insert(combined, syll.onset .. syll.vowel .. ac .. syll.coda) end table.insert(pronuns, table.concat(combined, ".")) end

-- Put double ## at utterance boundaries (beginning/end of string) and at foot boundaries (marked with |). -- Note that if the string without pound signs is 'foo bar baz | bat quux', the final string will be -- '##foo# #bar# #baz## #|# ##bat# #quux##'. local text = "##" .. table.concat(pronuns, " ") .. "##" text = rsub(text, " | ", "# | #") text = rsub(text, " ", "# #") return postprocess_general(text, dialect) end

-- Fill in the `phonemic` and `phonetic` fields of every term in `parsed_respellings`.
--
-- `parsed_respellings` maps dialect identifiers to objects of the format returned by parse_comma_separated_groups().
-- The table is modified in place; nothing is returned. For each term:
--   * terms flagged `unknown` or `omitted` are left untouched;
--   * raw terms have `raw_phonemic`/`raw_phonetic` moved into `phonemic`/`phonetic`;
--   * all other terms get `phonetic` from to_IPA() applied to their `words`, and `words` is cleared.
-- Only a phonetic pronunciation is generated for non-raw terms.
function export.generate_phonemic_phonetic(parsed_respellings)
	for dialect, spec in pairs(parsed_respellings) do
		for _, term in ipairs(spec.terms) do
			local is_placeholder = term.unknown or term.omitted
			if not is_placeholder then
				local raw_phonemic, raw_phonetic = term.raw_phonemic, term.raw_phonetic
				if raw_phonemic or raw_phonetic then
					-- Input fields are cleared so that by-value comparisons of terms see only the resulting
					-- phonemic/phonetic values and the qualifiers.
					term.raw_phonemic, term.raw_phonetic = nil, nil
					term.phonemic, term.phonetic = raw_phonemic, raw_phonetic
				else
					term.phonetic = to_IPA(term.words, dialect)
					-- Cleared for the same by-value comparison reason as above.
					term.words = nil
				end
			end
		end
	end
end

-- Group pronunciations by dialect, i.e. put together the dialects whose pronunciations are identical in every way
-- (both the pronunciation(s) and any qualifiers and other inline modifiers).
--
-- `parsed_respellings` maps dialect identifiers to the objects produced by generate_phonemic_phonetic(). Returns a
-- list of groups, each with fields `dialects` (list of dialect identifiers sharing the pronunciations) and `pronuns`
-- (the shared list of term objects). A dialect with any term flagged `omitted` is left out altogether.
--
-- Dialects are visited in the order of `export.dialects`, followed by any other keys in sorted order. The previous
-- version iterated with pairs(), whose order is unspecified, so the order of the returned groups (and of the
-- dialects inside each group) could vary.
function export.group_pronuns_by_dialect(parsed_respellings)
	-- Work out a deterministic visiting order.
	local ordered_dialects = {}
	local seen = {}
	for _, dialect in ipairs(export.dialects or {}) do
		if parsed_respellings[dialect] ~= nil and not seen[dialect] then
			seen[dialect] = true
			table.insert(ordered_dialects, dialect)
		end
	end
	local extra_dialects = {}
	for dialect, _ in pairs(parsed_respellings) do
		if not seen[dialect] then
			table.insert(extra_dialects, dialect)
		end
	end
	table.sort(extra_dialects, function(a, b)
		return tostring(a) < tostring(b)
	end)
	for _, dialect in ipairs(extra_dialects) do
		table.insert(ordered_dialects, dialect)
	end

	local grouped_pronuns = {}
	for _, dialect in ipairs(ordered_dialects) do
		local pronun_spec = parsed_respellings[dialect]

		local saw_omitted = false
		for _, termobj in ipairs(pronun_spec.terms) do
			if termobj.omitted then
				saw_omitted = true
				break
			end
		end

		if not saw_omitted then
			-- Join an existing group with structurally equal pronunciations, or start a new one.
			local saw_existing = false
			for _, group in ipairs(grouped_pronuns) do
				if m_table.deepEquals(group.pronuns, pronun_spec.terms) then
					table.insert(group.dialects, dialect)
					saw_existing = true
					break
				end
			end
			if not saw_existing then
				table.insert(grouped_pronuns, {dialects = {dialect}, pronuns = pronun_spec.terms})
			end
		end
	end
	return grouped_pronuns
end

-- Format pronunciations grouped by dialect.
--
-- `grouped_pronuns` is the output of group_pronuns_by_dialect(). It is modified in place: each group gets a field
-- `formatted` holding the result of m_IPA.format_IPA_full() for that group's pronunciations. Nothing is returned.
-- Raises an internal error if a term that is not `unknown` has neither `phonemic` nor `phonetic`.
--
-- The previous text had lost its bracketed expressions: qualifiers and separators were assigned to the
-- `pronunciations` list itself instead of to its first/last entry for the current term, the bracketed phonetic
-- string was empty, and the dialect-name lookup was missing its index. They are restored here.
function export.format_grouped_pronunciations(grouped_pronuns)
	for _, grouped_pronun_spec in pairs(grouped_pronuns) do
		local pronunciations = {}

		-- Loop through each pronunciation. For each one, add the phonemic and phonetic versions to
		-- `pronunciations`, for formatting by Module:IPA.
		for j, pronun in ipairs(grouped_pronun_spec.pronuns) do
			-- Add dialect tags to the left accent qualifiers of the first pronunciation only. The list is copied
			-- first so the term's own `a` list is not modified.
			local as = pronun.a
			if j == 1 then
				if as then
					as = m_table.deepcopy(as)
				else
					as = {}
				end
				for _, dialect in ipairs(grouped_pronun_spec.dialects) do
					table.insert(as, export.dialects_to_names[dialect])
				end
			end

			-- Entries first_pronun..last_pronun of `pronunciations` belong to this term.
			local first_pronun = #pronunciations + 1

			if pronun.unknown then
				-- FIXME: This is a massive hack but it works for now.
				table.insert(pronunciations, {
					pron = "",
					pretext = "unknown"
				})
			else
				if not pronun.phonemic and not pronun.phonetic then
					error("Internal error: Saw neither phonemic nor phonetic pronunciation")
				end

				if pronun.phonemic then -- missing if 'raw:' given
					table.insert(pronunciations, {
						pron = "/" .. pronun.phonemic .. "/",
					})
				end

				if pronun.phonetic then -- missing if '/.../' given
					table.insert(pronunciations, {
						pron = "[" .. pronun.phonetic .. "]",
					})
				end
			end

			local last_pronun = #pronunciations

			-- Left-hand modifiers go on the first entry of the term, right-hand ones on the last.
			if pronun.q then
				pronunciations[first_pronun].q = pronun.q
			end
			if as then
				pronunciations[first_pronun].a = as
			end
			if j > 1 then
				pronunciations[first_pronun].separator = ", "
			end
			if pronun.qq then
				pronunciations[last_pronun].qq = pronun.qq
			end
			if pronun.aa then
				pronunciations[last_pronun].aa = pronun.aa
			end
			-- NOTE(review): the inline modifier is declared as `ref` in parse_comma_separated_groups(), while this
			-- reads `refs` -- confirm which field name the parser actually stores.
			if pronun.refs then
				pronunciations[last_pronun].refs = pronun.refs
			end
			-- Phonemic and phonetic forms of the same term are separated by a space rather than a comma.
			if first_pronun ~= last_pronun then
				pronunciations[last_pronun].separator = " "
			end
		end

		grouped_pronun_spec.formatted = m_IPA.format_IPA_full(lang, pronunciations, nil, "")
	end
end

-- Template entry point. Reads the parent frame's arguments, works out one respelling input per dialect, parses and
-- converts them, groups identical results across dialects and returns the formatted pronunciation lines as a string.
--
-- Accepted parameters: 1= (applies to all dialects), one parameter per dialect group and per dialect, indent= (list
-- marker to put in front of each line; defaults to "*" for continuation lines) and pagename= (for testing or
-- documentation pages). With no respelling parameters at all, every dialect uses "+" (the pagename).
--
-- The previous text had lost its bracketed expressions (`[1]`, `params[...]`, `args[...]`, `inputs[...]`,
-- `parsed_respellings[dialect]`, the bracket character class and the "[" "]" delimiter pair); they are restored
-- here. A missing `export.dialect_groups` is now treated as "no groups" instead of raising inside pairs().
-- NOTE(review): the pattern "[<%[]" and the delimiter pair {"[", "]"} are reconstructed from the surrounding
-- comments about square and angle brackets -- confirm against upstream.
function export.show(frame)
	local dialect_groups = export.dialect_groups or {}

	local params = {
		[1] = {},
		indent = {},
		pagename = {}, -- for testing or documentation pages
	}
	for _, dialect in ipairs(export.dialects) do
		params[dialect] = {}
	end
	for dialect_group, _ in pairs(dialect_groups) do
		params[dialect_group] = {}
	end

	local args = require("Module:parameters").process(frame:getParent().args, params)
	local pagename = args.pagename or mw.title.getCurrentTitle().subpageText

	-- Set inputs. Later, more specific settings overwrite earlier, more general ones.
	local inputs = {}
	-- If 1= specified, do all dialects.
	if args[1] then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = args[1], param = 1}
		end
	end
	-- Then do dialect groups.
	for dialect_group, group_dialects in pairs(dialect_groups) do
		if args[dialect_group] then
			for _, dialect in ipairs(group_dialects) do
				inputs[dialect] = {input = args[dialect_group], param = dialect_group}
			end
		end
	end
	-- Then do individual dialect settings.
	for _, dialect in ipairs(export.dialects) do
		if args[dialect] then
			inputs[dialect] = {input = args[dialect], param = dialect}
		end
	end
	-- If no inputs given, set all dialects based on current pagename.
	if not next(inputs) then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = "+", param = "(pagename)"}
		end
	end

	-- Parse the arguments.
	local parsed_respellings = {}
	for dialect, inputspec in pairs(inputs) do
		local function generate_obj(respelling, parse_err)
			return parse_respelling(respelling, pagename, parse_err)
		end

		if inputspec.input:find("[<%[]") then
			local put = require(parse_utilities_module)
			-- Parse balanced segment runs involving either square brackets (substitution notation) or <...>
			-- (inline modifiers), so that commas inside square or angle brackets do not count as respelling
			-- delimiters. After splitting the alternating runs on comma, the square-bracketed segments still
			-- need to be rejoined with the text on either side of them; parse_comma_separated_groups() does that.
			local segments = put.parse_multi_delimiter_balanced_segment_run(inputspec.input,
				{{"<", ">"}, {"[", "]"}})
			local comma_separated_groups = put.split_alternating_runs_on_comma(segments)

			-- Process each value.
			parsed_respellings[dialect] = export.parse_comma_separated_groups(comma_separated_groups, pagename,
				inputspec.input, inputspec.param)
		else
			-- No brackets at all: a plain comma split is enough.
			local termobjs = {}
			local function parse_err(msg)
				error(msg .. ": " .. inputspec.param .. "=" .. inputspec.input)
			end
			for _, term in ipairs(split_on_comma(inputspec.input)) do
				table.insert(termobjs, generate_obj(term, parse_err))
			end
			parsed_respellings[dialect] = {
				terms = termobjs,
			}
		end
	end

	-- Convert each canonicalized respelling to phonemic/phonetic IPA.
	export.generate_phonemic_phonetic(parsed_respellings)

	-- Group the results.
	local grouped_pronuns = export.group_pronuns_by_dialect(parsed_respellings)

	-- Format the results.
	export.format_grouped_pronunciations(grouped_pronuns)

	-- Concatenate formatted results.
	local formatted = {}
	for _, grouped_pronun_spec in ipairs(grouped_pronuns) do
		table.insert(formatted, grouped_pronun_spec.formatted)
	end
	local indent = (args.indent or "*") .. " "
	local out = table.concat(formatted, "\n" .. indent)
	-- The first line only gets the marker when indent= was given explicitly.
	if args.indent then
		out = indent .. out
	end

	return out
end

-- Test hook used by Module:cy-IPA/testcases: parse `respelling` against `pagename` and return the pronunciation
-- that to_IPA() generates for `dialect`. Parse errors are raised as plain Lua errors.
function export.test(pagename, respelling, dialect)
	local parsed = parse_respelling(respelling, pagename, function(msg)
		error(msg)
	end)
	return to_IPA(parsed.words, dialect)
end

~~return export~~

User:Arafsymudwr/Sandbox

Module:cy-IPA

Wikious

Boobota

Sagapedia