local export = {}
local lang = require("Module:languages").getByCode("cy")
local m_IPA = require("Module:IPA") local m_a = require("Module:accent qualifier") local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities" local patut_module = "Module:pattern utilities"
local listToSet = require("Module:table").listToSet
--[=[ FIXME:
1. Some words in ng have /ŋ/ and others have /ŋɡ/. Wiktionary already sorts these separately.
2. Consonant clusters assimilate by losing voice, not by regressive or progressive assimilation.
3. Some common words in y and u are /i/ or /ɪ/ in North Wales.
4. North Wales colloquial: unstressed /ɛ/ as /a/.
5. South Wales colloquial: lots of monophthongisation.
]=]
-- Short local aliases for the mw.ustring / mw.text functions used in this module.
local usub = mw.ustring.sub
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
-- rsubn keeps both return values of gsub (result string and substitution count).
local rsubn = mw.ustring.gsub
local ulower = mw.ustring.lower
local u = mw.ustring.char
local ugcodepoint = mw.ustring.gcodepoint
-- Dialect codes exported by this module, in order, and the display name for each code.
export.dialects = { "NW", "SW" }
export.dialects_to_names = {
	["NW"] = "North Wales",
	["SW"] = "South Wales",
}
-- Vowel inventories. Names ending in `_l` are plain character lists meant to be spliced into a `[...]` pattern
-- class; they are not usable as patterns on their own.
local written_unstressed_vowel_l = "aeiouwyAEIOUWY"
-- The uppercase half now includes Â and Ä so that it mirrors the lowercase half (every other base vowel already
-- had all four accented forms).
local written_long_vowel_l = "àáâäèéêëìíîïòóôöùúûẁẃŵẅüỳýŷÿÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜẀẂŴẄỲÝŶŸ"
local written_stressed_not_long_vowel_l = "àèìòùẁỳÀÈÌÒÙẀỲ"
local written_stressed_vowel_l = written_long_vowel_l .. written_stressed_not_long_vowel_l
local ipa_vowel_l = "ɪɨ̞ʊɛəɔ"
local written_vowel_l = written_unstressed_vowel_l .. written_stressed_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Pattern class matching any single vowel character. It must be a bracketed class: with an empty string here,
-- callers that build "(" .. V .. ")" would end up with "()", which Lua treats as a position capture.
local V = "[" .. vowel_l .. "]"
-- Maps each accented written vowel (grave, acute, circumflex, diaeresis) to its unaccented base letter, preserving
-- case. The keys are the same accented letters listed in the long-vowel inventory: four per base vowel.
local written_stressed_to_plain_vowel = {
	["à"] = "a", ["á"] = "a", ["â"] = "a", ["ä"] = "a",
	["è"] = "e", ["é"] = "e", ["ê"] = "e", ["ë"] = "e",
	["ì"] = "i", ["í"] = "i", ["î"] = "i", ["ï"] = "i",
	["ò"] = "o", ["ó"] = "o", ["ô"] = "o", ["ö"] = "o",
	["ù"] = "u", ["ú"] = "u", ["û"] = "u", ["ü"] = "u",
	["ẁ"] = "w", ["ẃ"] = "w", ["ŵ"] = "w", ["ẅ"] = "w",
	["ỳ"] = "y", ["ý"] = "y", ["ŷ"] = "y", ["ÿ"] = "y",
	["À"] = "A", ["Á"] = "A", ["Â"] = "A", ["Ä"] = "A",
	["È"] = "E", ["É"] = "E", ["Ê"] = "E", ["Ë"] = "E",
	["Ì"] = "I", ["Í"] = "I", ["Î"] = "I", ["Ï"] = "I",
	["Ò"] = "O", ["Ó"] = "O", ["Ô"] = "O", ["Ö"] = "O",
	["Ù"] = "U", ["Ú"] = "U", ["Û"] = "U", ["Ü"] = "U",
	["Ẁ"] = "W", ["Ẃ"] = "W", ["Ŵ"] = "W", ["Ẅ"] = "W",
	["Ỳ"] = "Y", ["Ý"] = "Y", ["Ŷ"] = "Y", ["Ÿ"] = "Y",
}
-- Table of IPA outputs; the nested tables look like per-letter sub-mappings, with list values apparently giving
-- multi-segment outputs.
-- NOTE(review): every key in this table is missing (each entry begins with a bare `=`), so the mapping cannot be
-- read or compiled as written, and the inline `--` comments swallow the rest of the line. Restore from the
-- upstream revision before relying on it.
local sequences = { = { = "a"; = "a"; = "a"; = "a"; = "a"; = "ɑːɨ̯"; = "ai̯"; = "aɨ̯"; = "ɑːu̯"; }; = { = "k"; = "χ"; }; = { = "d"; = "ð"; }; = { = "ɛ"; = "ɛ"; = "eː"; = "eː"; = "e"; = "ɛi̯"; = "əɨ̯"; = "eːu̯"; = "aɨ̯"; }; = { = "v"; = "f"; }; = "ɡ"; = "h"; = { = "ɪ"; = "ɪ"; = "iː"; = "i"; = "iː"; = "ɛi̯"; }; = "d͡ʒ"; = { = "k"; = "k"; }; = { = "l"; = "ɬ"; }; = { = "m"; = "m̥"; }; = { = "n"; = "n̥"; = "ŋ"; = "n"; }; = { = "oː"; = { "ɔ", "s" }; = "ɔ"; }; = { -- XXX: sometimes /øː/ = "œ"; = { "œ", "s" }; }; = { = "f"; = "p"; = "p"; }; = { = { "k", "f" }; = "k"; -- XXX }; = { -- XXX: /ʀ/? /r/?; sometimes /ɐ/ ("Uhr"); also /ər/ ("oder") = "r"; = "r"; }; = { = "s"; = "ʃ"; = { "ʃ", "p" }; = "s"; = { "ʃ", "t" }; }; = { = "t"; = "t͡ʃ"; = "t"; = { "t͡s", "i̯", "o", "n" }; }; = { = "ʊ"; = { "ʊ", "x" }; }; = { = "yː"; = "yː"; }; = "f"; = "ʋ"; = { "k", "s" }; -- XXX = "i"; = "z"; -- already converted from s = "s"; = "ˈ"; -- FIXME = {}; }
-- Combining diacritics used in respellings. Each definition sits on its own line so that the trailing comment of
-- one does not swallow the next definition.
local AC = u(0x0301) -- combining acute
local GR = u(0x0300) -- combining grave
local CFLEX = u(0x0302) -- combining circumflex
local DOTOVER = u(0x0307) -- combining dot above
local DIA = u(0x0308) -- combining diaeresis
local LINEUNDER = u(0x0331) -- combining macron below ("lineunder")
-- Separator inventories. `*_l` names are raw character lists; the matching `*_c` name is the same list wrapped
-- in brackets so that it can be used directly as a one-character pattern class.
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
-- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_l = "%-."
local sylsep_c = "[" .. sylsep_l .. "]"
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- A consonant is anything that is neither a vowel nor a separator, hence the negated class.
local neg_guts_of_cons = vowel_l .. separator_l
local C = "[^" .. neg_guts_of_cons .. "]" -- consonant class including h
-- Accented vowels exported as "mid vowel hints", as a raw list and as a one-character pattern class built from
-- that list.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"
-- Placeholder characters substituted for "(r)" and "(rr)" in a respelling during processing.
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not currently
-- used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents
-- Set of consonant (or glide) strings accepted as a syllable onset. split_syllables() moves leading letters of an
-- onset into the previous coda until what remains is a member of this set.
local valid_onsets = listToSet({
	-- stops, alone and with a following liquid
	"b", "bl", "br",
	"c", "cl", "cr",
	"ç",
	"d", "dj", "dr",
	"f", "fl", "fr",
	"g", "gl", "gr", "gu", "gü",
	"h",
	"i",
	"j",
	"k", "kl", "kr",
	-- sonorants
	"l", "ll",
	"m",
	"n", "ny", "ñ",
	"p", "pl", "pr",
	"qu", "qü",
	"r", "rr",
	"s", "ss",
	"t", "tg", "tj", "tr", "tx", "tz",
	"u",
	"v", "vl", "vr",
	"w",
	"x",
	"ʃ", -- e.g. 'χruʃóf' respelling of Khrusxov
	"χ", -- in case of respelling
	"y",
	"z",
})
-- Maps each precomposed letter carrying a dot above to its base letter followed by the combining DOTOVER.
-- No composed i, u or U with DOTOVER.
local decompose_dotover = {
	["ȧ"] = "a" .. DOTOVER,
	["ė"] = "e" .. DOTOVER,
	["ȯ"] = "o" .. DOTOVER,
	["ẏ"] = "y" .. DOTOVER,
	["Ȧ"] = "A" .. DOTOVER,
	["Ė"] = "E" .. DOTOVER,
	["İ"] = "I" .. DOTOVER,
	["Ȯ"] = "O" .. DOTOVER,
	["Ẏ"] = "Y" .. DOTOVER,
}
-- Set of words treated as unstressed; handle_unstressed_words() consults it when deciding which words get a
-- DOTOVER. Comments sit on their own lines so that they cannot swallow list entries.
local unstressed_words = listToSet {
	-- proclitic object pronouns
	"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
	-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. tant me fa
	"me", "te", "se", "lo", "los", "nos", "vos", "ne",
	-- contracted object pronouns and articles attached with apostrophe so no need to include
	-- unstressed possessives
	"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
	-- prepositions
	"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
	-- prepositional contractions
	"al", "als", "del", "dels", "pel", "pels",
	-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
	-- personal articles
	"na", -- 'en' already included above
	-- indefinite articles
	"un", "uns",
	-- salat articles
	"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
	-- conjunctions
	"i", "o", "si", "ni", "que",
}
-- Version of rsubn() that discards all but the first return value.
-- @param term string to operate on
-- @param foo pattern to search for
-- @param bar replacement (string, table or function, as accepted by rsubn)
-- @return the substituted string only (the substitution count is dropped)
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first result.
	return (rsubn(term, foo, bar))
end
-- Version of rsubn() that returns a 2nd argument boolean indicating whether a substitution was made.
-- @param term string to operate on
-- @param foo pattern to search for
-- @param bar replacement (string, table or function, as accepted by rsubn)
-- @return the substituted string, and true if at least one substitution happened
local function rsubb(term, foo, bar)
	local retval, nsubs = rsubn(term, foo, bar)
	return retval, nsubs > 0
end
-- Apply rsub() repeatedly until no change.
-- @param term string to operate on
-- @param foo pattern to search for
-- @param bar replacement passed through to rsub()
-- @return the string once a pass produces no further change
local function rsub_repeatedly(term, foo, bar)
	while true do
		local new_term = rsub(term, foo, bar)
		if new_term == term then
			return term
		end
		term = new_term
	end
end
-- Break `text` into a list with one entry per Unicode codepoint, each entry being that codepoint re-encoded as a
-- string.
local function split_into_chars(text)
	local result = {}
	for cp in ugcodepoint(text) do
		result[#result + 1] = u(cp)
	end
	return result
end
-- Split `term` on commas. The plain splitter is used unless the term contains a comma followed by whitespace or a
-- backslash, in which case the work is delegated to split_on_comma() in the parse utilities module (loaded lazily).
local function split_on_comma(term)
	local needs_parser = term:find(",%s") or term:find("\\")
	if not needs_parser then
		return rsplit(term, ",")
	end
	return require(parse_utilities_module).split_on_comma(term)
end
-- Concatenate all keys of `tab` into a single string. Keys are visited with pairs(), so their order in the result
-- is unspecified.
local function concat_keys(tab)
	local keys = {}
	for key in pairs(tab) do
		keys[#keys + 1] = key
	end
	return table.concat(keys)
end
-- Mark unstressed function words and join "amb" to a following vowel-initial word.
-- @param words list of word objects with at least a `term` field (and `pos`, which is copied when words are joined)
-- @return a new list of word objects; the input list is deep-copied first and is not modified
local function handle_unstressed_words(words)
	words = m_table.deepcopy(words)

	-- Lowercase all words for ease in further processing.
	for _, wordobj in ipairs(words) do
		wordobj.term = ulower(wordobj.term)
	end

	-- Check if the word at index `i` in `words` is "amb" (already marked with DOTOVER) and the following word
	-- begins with a vowel, optionally preceded by h.
	local function is_amb_to_join(words, i)
		return i < #words and words[i].term == "a" .. DOTOVER .. "mb" and rfind(words[i + 1].term, "^h?" .. V)
	end

	-- Starts out false and is only set once a joinable amb is actually seen, so the join pass below can be
	-- skipped for the common case.
	local saw_amb_to_join = false

	-- Mark all unstressed words with DOTOVER, so that split_syllables() doesn't assign stress. We need to do this
	-- before special handling for amb, because amb may join to another unstressed word like el, in the
	-- process losing the identity of the two words. In the process, see if amb occurs before a following
	-- vowel-initial word (which may begin with h-).
	for i, wordobj in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of que). It doesn't actually matter where we put
		-- it, because split_syllables() just looks for DOTOVER anywhere in the word.
		if unstressed_words[wordobj.term] then
			wordobj.term = rsub(wordobj.term, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end

	-- Join amb before vowel-initial word with following word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				table.insert(new_words, {term = words[i].term .. "‿" .. words[i + 1].term, pos = words[i].pos})
				i = i + 2
			else
				table.insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end

	-- Finally, rewrite some unstressed words to get the right pronunciation: any remaining amb (not joined above)
	-- loses its final <b>, and per is respelled with <rr>. Keys are the DOTOVER-marked forms produced above.
	local unstressed_word_replacement = {
		["a" .. DOTOVER .. "mb"] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}

	for _, wordobj in ipairs(words) do
		wordobj.term = unstressed_word_replacement[wordobj.term] or wordobj.term
	end

	return words
end
-- Respell certain prefixed words: <s> after initial "enfon"/"endin" and inside "tr…ns" is rewritten as <z>, and
-- initial "inex" is meant to get an explicit syllable break ("in.ex"). Takes and returns the word string.
-- NOTE(review): the capture groups below are empty `()`. In a Lua pattern that is a position capture, so %1/%2
-- would insert a number. The bracketed character classes that presumably stood inside them look lost; restore
-- from upstream.
-- NOTE(review): the "^inex" substitution shares its line with a `--` comment and is therefore inert as written.
local function fix_prefixes(word)
-- Voiced s in prefix roots -fons-, -dins-, -trans-
word = rsub(word, "^enfons()", "enfonz%1")
word = rsub(word, "^endins()", "endinz%1")
word = rsub(word, "tr()ns()", "tr%1nz%2")
-- in + ex > ineks/inegz word = rsub(word, "^inex", "in.ex")
return word end
-- Add a diaeresis or acute to <i>/<u> in a fixed set of word endings (-um, -isme, -ist, -ir, -int and several
-- future/conditional endings) so that later steps see the hiatus. Takes and returns the word string.
-- NOTE(review): most leading captures are empty `()` (a position capture in Lua patterns); the character classes
-- that presumably stood there look lost. Several substitutions also follow a `--` comment on the same line and
-- are therefore inert as written. Restore from upstream before relying on this function.
local function restore_diaereses(word) -- Some structural forms do not have diaeresis per diacritic savings, let's restore it to identify hiatus
word = rsub(word, "()um(s?)$", "%1üm%2") -- Latinisms (-ius is ambiguous but rare)
word = rsub(word, "()isme(s?)$", "%1ísme%2") -- suffix -isme word = rsub(word, "()ist(s?)$", "%1íst%2") -- suffix -ista
word = rsub(word, "()ir$", "%1ír") -- verbs -ir word = rsub(word, "()int$", "%1ínt") -- present participle word = rsub(word, "()ir()$", "%1ïr%2") -- future word = rsub(word, "(u)ir()$", "%1ïr%2") word = rsub(word, "()iràs$", "%1ïràs") word = rsub(word, "(u)iràs$", "%1ïràs") word = rsub(word, "()ir(e)$", "%1ïr%2") word = rsub(word, "(u)ir(e)$", "%1ïr%2") word = rsub(word, "()iran$", "%1ïran") word = rsub(word, "(u)iran$", "%1ïran") word = rsub(word, "()iria$", "%1ïria") -- conditional word = rsub(word, "(u)iria$", "%1ïria") word = rsub(word, "()ir(ie)$", "%1ïr%2") word = rsub(word, "(u)ir(ie)$", "%1ïr%2")
return word end
-- Normalise written <y>: "ny" becomes "ñ", and remaining <y> is rewritten as <i> in the contexts selected by the
-- patterns below. Takes and returns the word string.
-- NOTE(review): the context captures are empty `()` (position captures), so the intended neighbouring-character
-- classes look lost; the second substitution also sits behind a `--` comment on the same line and is inert.
local function fix_y(word) -- y > vowel i else consonant /j/, except ny
word = rsub(word, "ny", "ñ")
word = rsub(word, "y()", "i%1") -- vowel if not next to another vowel word = rsub(word, "()y", "%1i") -- excluding also syllables separators
return word end
-- Apply default mid-vowel qualities to certain word endings by adding an accent to the final <e>/<o>, recording a
-- tracking entry via Module:debug/track each time a rule fires. Takes and returns the word string.
-- NOTE(review): the whole body is on one line, so everything after the first `--` is a comment and the function
-- is never closed as written. The "e(ss)$" pattern is labelled "sos-sa-ses", which suggests a character class
-- between the two s's was lost — confirm against upstream.
local function mid_vowel_fixes(word) local function track_mid_vowel(vowel, cont) require("Module:debug/track"){"cy-IPA/" .. vowel, "cy-IPA/" .. vowel .. "/" .. cont} return true end local changed -- final -el (not -ell) usually è but not too many cases word, changed = rsubb(word, "e(nts?)$", "é%1") if changed then track_mid_vowel("e", "nt-nts") end word, changed = rsubb(word, "e(rs?)$", "é%1") if changed then track_mid_vowel("e", "r-rs") end word, changed = rsubb(word, "o(rs?)$", "ó%1") if changed then track_mid_vowel("o", "r-rs") end word, changed = rsubb(word, "è(s?)$", "ê%1") if changed then track_mid_vowel("è", "s-blank") end word, changed = rsubb(word, "e(ss)$", "ê%1") if changed then track_mid_vowel("e", "sos-sa-ses") end word, changed = rsubb(word, "e(sa)$", "ê%1") if changed then track_mid_vowel("e", "sos-sa-ses") end return word end
-- Per-word orthographic preprocessing: replaces "(rr)"/"(r)" with placeholder characters, then runs
-- fix_prefixes(), restore_diaereses(), fix_y() and mid_vowel_fixes(), respells ch + vowel as tx, and deletes h
-- after a consonant. `dialect` is only consulted for the "val" special case. Returns the rewritten word.
-- NOTE(review): several patterns look damaged: "%-(?)" has an empty group, and "^n" -> "n" is a no-op although
-- the comment describes dropping a silent initial p/m. The inline `--` comments also swallow the statements that
-- follow them on the same line. Restore from upstream before relying on this function.
local function word_fixes(word, dialect) word = rsub(word, "%(rr%)", TEMP_PAREN_RR) word = rsub(word, "%(r%)", TEMP_PAREN_R) word = rsub(word, "%-(?)", "-%1%1") if dialect == "val" then word = rsub(word, "%-x", "-tx") end word = rsub(word, "rç$", "rrs") -- silent r only in plurals -rs word = fix_prefixes(word) -- internal pause after a prefix word = restore_diaereses(word) -- no diaeresis saving word = fix_y(word) -- ny > ñ else y > i vowel or consonant word = mid_vowel_fixes(word) -- all words in pn- (e.g. pneumotòrax and mn- (e.g. mnemònic) have silent p/m in both Central and Valencian word = rsub(word, "^n", "n") -- Respell ch + vowel as tx, before we remove other h's after consonants. word = rsub(word, "ch(" .. V ..")", "tx%1") -- Delete h after a consonant. This must happen here, before split_syllables(). We don't delete h after a vowel -- yet because it indicates a hiatus. word = rsub(word, "(" .. C .. ")h", "%1")
return word end
-- Turn a run of adjacent vowel letters into a list of syllable objects (fields `onset`, `vowel`, `coda`; the first
-- one also carries `separator`, `has_dotover` and `has_lineunder`). A trailing "i"/"u" is apparently meant to be
-- folded into the coda of the syllable before it.
-- NOTE(review): index expressions look lost: `syllables.vowel`, `syllables.coda` and `syllables = nil` were
-- presumably indexed by `count` / `count - 1`. The first group of the rmatch pattern, "(?)", is also missing
-- its character class. As written the final `if` reads fields off the list itself — confirm against upstream.
local function split_vowels(vowels, saw_dotover, saw_lineunder) local syllables = {{onset = "", vowel = usub(vowels, 1, 1), coda = "", separator = "", has_dotover = saw_dotover, has_lineunder = saw_lineunder}} vowels = usub(vowels, 2)
while vowels ~= "" do local syll = {onset = "", vowel = "", coda = ""} syll.onset, syll.vowel, vowels = rmatch(vowels, "^(?)(.)(.-)$") table.insert(syllables, syll) end
local count = #syllables
if count >= 2 and (syllables.vowel == "i" or syllables.vowel == "u") then syllables.coda = syllables.vowel syllables = nil end
return syllables end
-- NOTE(review): this definition is badly damaged in this copy. The `local function split_syllables(...)` header
-- sits behind the leading `--` of its own doc comment; character classes inside patterns are missing (e.g.
-- "^(*)(.-)$", rfind(onset, "$"), rfind(syll.vowel, "^$")); list index expressions are missing (e.g.
-- `syllables.coda`, `local current = syllables`, `local final = syllables`); and the `while not (...)` condition
-- that consults valid_onsets is truncated. The code below is kept verbatim; restore it from the upstream revision
-- rather than repairing it by hand.
-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`, -- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and -- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the -- index of the syllable with primary stress) and `is_prefix` (true if the word is a prefix, i.e. it ends in '-'). -- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be -- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms. local function split_syllables(word, stress_prefixes, may_be_uppercase) local syllables = {} local saw_dotover = false
local remainder = word local is_prefix = false if remainder:find("%-$") then -- prefix is_prefix = true remainder = remainder:gsub("%-$", "") end local is_suffix = false if remainder:find("^%-") then -- suffix is_suffix = true remainder = remainder:gsub("^%-", "") end
while remainder ~= "" do local consonants, vowels
-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY. -- Not sure why. local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or "aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER consonants, remainder = rmatch(remainder, "^(*)(.-)$") vowels, remainder = rmatch(remainder, "^(*)(.-)$") local this_saw_dotover = not not rfind(vowels, DOTOVER) if this_saw_dotover then saw_dotover = true vowels = vowels:gsub(DOTOVER, "") end local this_saw_lineunder = not not rfind(vowels, LINEUNDER) if this_saw_lineunder then vowels = vowels:gsub(LINEUNDER, "") end
if vowels == "" then if #syllables > 0 then syllables.coda = syllables.coda .. consonants else -- word without vowels, e.g. foot boundary | table.insert(syllables, {onset = consonants, vowel = "", coda = "", separator = ""}) end else local onset = consonants local first_vowel = usub(vowels, 1, 1)
if (rfind(onset, "$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u"))) or ((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and (first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I")) then onset = onset .. usub(vowels, 1, 1) vowels = usub(vowels, 2) end
local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder) vsyllables.onset = onset .. vsyllables.onset
for _, s in ipairs(vsyllables) do table.insert(syllables, s) end end end
-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid for i = 2, #syllables do local current = syllables local previous = syllables
while not (current.onset == "" or valid_onsets?$", ""), "_", "")]) do local letter = usub(current.onset, 1, 1) current.onset = usub(current.onset, 2) if rfind(letter, "") then -- syllable separators current.separator = letter break else previous.coda = previous.coda .. letter if rfind(letter, tie_c) then break end end end end
-- Detect stress for i, syll in ipairs(syllables) do if rfind(syll.vowel, "^$") then syll.stressed = true -- primary stress: the last one stressed without LINEUNDER if not syll.has_lineunder then syllables.stress = i end end end
-- Assign default stress if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then local count = #syllables
if count == 1 then if syllables.vowel ~= "" then -- vowel-less words don't get stress syllables.stress = 1 end else local final = syllables
-- Take account of tie symbols (apostrophes and ‿). if rfind(final.coda, "^*$") or (rfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and ( final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then syllables.stress = count - 1 else syllables.stress = count end end if syllables.stress then syllables.stressed = true end end
syllables.is_prefix = is_prefix syllables.is_suffix = is_suffix return syllables end
-- Rebuild a word string from a syllable list. For each syllable the pieces are emitted in the order separator,
-- onset, vowel, DOTOVER (if `has_dotover`), LINEUNDER (if `has_lineunder`), coda. A leading "-" is added when the
-- list is flagged `is_suffix` and a trailing "-" when it is flagged `is_prefix`.
local function reconstitute_word_from_syllables(syllables)
	local pieces = {}

	if syllables.is_suffix then
		pieces[#pieces + 1] = "-"
	end

	for _, syllable in ipairs(syllables) do
		-- A missing (nil) field appends nothing, so syllables without a `separator` are fine.
		pieces[#pieces + 1] = syllable.separator
		pieces[#pieces + 1] = syllable.onset
		pieces[#pieces + 1] = syllable.vowel
		if syllable.has_dotover then
			pieces[#pieces + 1] = DOTOVER
		end
		if syllable.has_lineunder then
			pieces[#pieces + 1] = LINEUNDER
		end
		pieces[#pieces + 1] = syllable.coda
	end

	if syllables.is_prefix then
		pieces[#pieces + 1] = "-"
	end

	return table.concat(pieces)
end
-- Replace every precomposed dot-above letter in `text` with its base letter plus combining DOTOVER.
-- @param text respelling string
-- @return the string with all keys of decompose_dotover expanded to their decomposed values
local function decompose_respelling(text)
	-- Build a one-character class from the table's keys and let the table itself act as the replacement map.
	-- With an empty pattern here the key list would go unused and nothing would be decomposed.
	local dotover_keys = concat_keys(decompose_dotover)
	return rsub(text, "[" .. dotover_keys .. "]", decompose_dotover)
end
-- Canonicalise punctuation and spacing in a respelling: collapse and trim whitespace, drop some punctuation, and
-- turn pause-like punctuation (per the inline comments: commas, dashes, "...", sentence-internal ?/!) into the
-- foot boundary " | ". Takes and returns the respelling string.
-- NOTE(review): the punctuation character classes in most patterns are missing (e.g. rsub(text, "", ""),
-- rsub(text, "$", ""), " * ", "() * *()"). As written these match the empty string or bare spaces, not
-- punctuation. The inline `--` comments also swallow the statements after them on the same line. Restore from
-- upstream before relying on this function.
local function canon_respelling(text) local function canon_spaces(text) text = rsub(text, "%s+", " ") text = rsub(text, "^ ", "") text = rsub(text, " $", "") return text end
text = canon_spaces(text) -- eliminate upside down punctuation text = rsub(text, "", "") -- eliminate utterance-final punctuation text = rsub(text, "$", "") -- eliminate double and triple quotes text = rsub(text, "+", "") -- Convert commas and em/en dashes to IPA foot boundaries; require a space after commas and en dashes (for the -- latter, in particular, to avoid treating the en dash in 'Bose–Einstein condensate' as a foot boundary. text = rsub(text, " * ", " | ") text = rsub(text, " * *", " | ") -- ... in phrases like com es diu...en català and necessito ... become foot boundaries text = rsub(text, " *%.%.%. *", " | ") -- remaining commas and en dashes become spaces text = rsub(text, "", " ") -- may need to eliminate extraneous spaces again, e.g. if there was a space before or after an eliminated -- punctuation mark text = canon_spaces(text) -- question mark or exclamation point in the middle of a sentence -> IPA foot boundary text = rsub(text, "() * *()", "%1 | %2") return text end
-- Vowel-to-IPA tables: three small tables with one value set each for the "central", "balearic" and "valencian"
-- names, plus a general IPA_vowels table.
-- NOTE(review): every key is missing (entries begin with a bare `=`), so none of these tables compile as
-- written. The keys were presumably accented written vowels — restore them from the upstream revision.
local IPA_vowels_central = {
= "ɛ", = "ɛ", = "ɔ",
}
local IPA_vowels_balearic = {
= "ə", = "ɛ", = "ɔ",
}
local IPA_vowels_valencian = {
= "e", = "e", = "o",
}
local IPA_vowels = { = "a", = "ɛ", = "ɛ", = "ɛ", = "e", = "i", = "i", = "ɔ", = "ɔ", = "o", = "u", = "u", }
-- Context-free letter-to-phoneme substitutions on a consonant string (e.g. ll -> ʎ, ñ -> ɲ, ç -> s, ' -> ‿).
-- Affricates are written with single-character symbols at this stage. Takes and returns the string.
-- NOTE(review): several patterns look truncated or empty: "j" is replaced by ʤ and later again by ʒ, "z" by ʣ,
-- and two rsub calls have "" as their pattern. These presumably had bracketed classes before the visible letter
-- — confirm against upstream. Statements that follow a `--` comment on the same line are inert as written.
local function replace_context_free(cons) cons = rsub(cons, "ŀ", "l")
cons = rsub(cons, "r", "ɾ") cons = rsub(cons, "ɾɾ", "r") cons = rsub(cons, "ss", "s") cons = rsub(cons, "ll", "ʎ") cons = rsub(cons, "ñ", "ɲ") -- hint ny > ñ
-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert them -- to tied multi-character affricates at the end of join_syllables(). cons = rsub(cons, "j", "ʤ") cons = rsub(cons, "tx", "ʧ") cons = rsub(cons, "z", "ʣ")
cons = rsub(cons, "ç", "s") cons = rsub(cons, "", "k") cons = rsub(cons, "h", "") cons = rsub(cons, "j", "ʒ") -- Don't replace x -> ʃ yet so we can distinguish x from manually specified ʃ.
cons = rsub(cons, "i", "j") -- must be after j > ʒ cons = rsub(cons, "y", "j") -- must be after j > ʒ and fix_y cons = rsub(cons, "", "w") cons = rsub(cons, "'", "‿")
return cons end
-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code
-- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the
-- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move
-- more of the work done in preprocess_word(), e.g. most of replace_context_free(), here.
local function postprocess_general(text, dialect)
local function verify(cond, msg)
if not cond then
error(("Internal error: %s; processed respelling at this point is '%s'"):format(msg, text))
end
return true
end
local voiced = listToSet {"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"} local voiced_keys = concat_keys(voiced) local voiceless = listToSet {"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"} local voiceless_keys = concat_keys(voiceless) local voicing = { = "b", = "d", = "g", = "v", = "z", = "ʒ", = "ʤ", = "ʤ"} local voicing_keys = concat_keys(voicing) local devoicing = {} for k, v in pairs(voicing) do devoicing = k end local devoicing_keys = concat_keys(devoicing)
------------------ Handle <x>
-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles -- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV. text = rsub_repeatedly(text, "(" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V .. ")", function(e, pre, post) -- Preserve other character separators (especially the tie character ‿). pre = pre:gsub("%.", "") post = post:gsub("%.", "") return e .. pre .. "g.z" .. post end) -- -x- at the beginning of a coda becomes , e.g. annex, apèndix, extracció; but not elsewhere in -- the coda, e.g. in romanx, ponx; words with in -nx such as esfinx, linx, manx need -- respelling with ; words ending in vowel + x like ídix need respelling with text = rsub(text, "(" .. V .. stress_c .. "*)x", "%1ks") if dialect == "val" then -- Word-initial <x> as well as <x> after a consonant other than /j/ (including in the coda, e.g. ponx) -- becomes . text = rsub(text, "#x", "#ʧ") text = rsub(text, "(" .. charsep_c .. "*)x", "%1ʧ") end -- Other x becomes text = rsub(text, "x", "ʃ")
-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries? text = rsub(text, "s(" .. charsep_c .. "*)s", "%1s")
------------------ Coda consonant losses
-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when -- absolutely word-finally before a vowel or end of utterance (e.g. blanc has /k/ in Balearic and -- Valencian but not blancs). Must precede consonant assimilations. local boundary = dialect == "cen" and "(.)" or "()" text = rsub(text, "m" .. boundary, "m%1") text = rsub(text, "()" .. boundary, "%1%2") text = rsub(text, "" .. boundary, "ŋ%1") if dialect == "val" or dialect == "bal" then local before_cons = "(" .. separator_c .. "*" .. C .. ")" text = rsub(text, "m" .. before_cons, "m%1") text = rsub(text, "()" .. before_cons, "%1%2") text = rsub(text, "" .. before_cons, "ŋ%1") end
-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and -- t + lateral/nasal assimilation. text = rsub(text, "st(" .. sylsep_c .. "*)", "s%1")
------------------ Consonant assimilations
if dialect == "cen" then -- v > b in onsets (not in codas, e.g. ovni and hafni ). This needs to precede -- assimilation of nb -> mb. text = rsub(text, "v(" .. C .. "*" .. V ..")", "b%1") end
-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation -- because there are too many exceptions, e.g. aritmètic, atmosfèric, ètnia. Instead, we require that -- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either, -- e.g. atlàntic has in GDLC but in DNV. -- -- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words, -- move stuff below that special-cases tll in Valencian here. text = rsub(text, "t(" .. sylsep_c .. ")()", "%2%1%2")
-- n + labial > labialized assimilation text = rsub(text, "n(" .. separator_c .. "*)", "m%1") text = rsub(text, "n(" .. separator_c .. "*)", "ɱ%1")
-- n + velar > velarized assimilation text = rsub(text, "n(" .. separator_c .. "*)", "ŋ%1")
-- l/n + palatal > palatalized assimilation text = rsub(text, "()(" .. separator_c .. "*)", function(ln, palatal) ln = ({ = "ʎ", = "ɲ"}) return ln .. palatal end)
-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in menys or derived terms like menyspreu -- NOTE: Per , it does apply in these scenarios but the result is -- somewhere between and , which is why it isn't shown in GDLC. -- text = rsub(text, "ɲs", "%1ʃ")
------------------ Handle <r>
-- In replace_context_free(), we converted single r to ɾ and double rr to r. if dialect == "cen" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "r") elseif dialect == "bal" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "") else verify(dialect == "val", ("Unrecognized dialect '%s'"):format(dialect)) text = rsub(text, TEMP_PAREN_R, "ɾ") text = rsub(text, TEMP_PAREN_RR, "ɾ") end if dialect ~= "val" then -- Coda /ɾ/ -> /r/ -- FIXME: This is inherited from the older code. Correct? text = rsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r") end
-- ɾ -> r word-initially or after ; needs to precede voicing assimilation as will be voiced to before
-- /ɾ/.
text = rsub(text, "(" .. sylsep_c .. "*)ɾ", "%1r")
------------------ Voicing assimilation
-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in
-- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we
-- do it character by character.
local chars = split_into_chars(text)
-- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because
-- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which
-- are always "##".)
for i = #chars - 2, 3, -1 do
-- We are looking for two consonants next to each other, possibly separated by a syllable or word divider.
-- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally.
-- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda.
local first = chars
-- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which
-- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that
-- consonant (empty string forces devoicing).
local second
-- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning
-- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of word-internally
-- separately, in preprocess_word() [FIXME: maybe move much of the processing in preprocess_word() into this
-- function].
local word_boundary_before_vowel
if not rfind(first, C) then
-- leave `second` at nil; no assimilation
elseif chars == "#" then -- word boundary
if chars == " " then
-- chars should always be "#"
verify(chars == "#", "Word boundary followed by space but not #")
if rfind(chars, C) then
second = chars
else
second = ""
word_boundary_before_vowel = true
end
else
second = ""
end
elseif rfind(chars, sylsep_c) then -- syllable boundary
if rfind(chars, C) then
second = chars
else
second = ""
end
elseif rfind(chars, C) then
second = chars
else
-- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation
end
if second then
-- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary.
local in_coda = false
local j = i - 1
while true do
verify(j > 0, "Missing word boundary at beginning of overall respelling")
if rfind(chars, "") then
break
elseif rfind(chars, V) then
in_coda = true
break
end
j = j - 1
end
if in_coda then
if word_boundary_before_vowel and rfind(first, "") then
-- leave alone
elseif voiced and voicing or word_boundary_before_vowel and rfind(first, "") then
chars = voicing
elseif (voiceless or second == "") and devoicing then
chars = devoicing
end
end
end
end
text = table.concat(chars)
-- gn -> ŋn e.g. regnar (including word-initial gn- e.g. gnòmic, gneis)
-- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words
-- (here I'm guessing yes).
if dialect ~= "cen" then
text = rsub(text, "#gn", "#n")
end
text = rsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")
-- gʒ > d͡ʒ
-- FIXME: We need to investigate if it operates across words
text = rsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ")
-- sʃ -> ʃ (desxifrar), zʒ -> ʒ (disjuntor)
if dialect ~= "val" then
text = rsub(text, "s(" .. separator_c .. "*ʃ)", "%1")
text = rsub(text, "z(" .. separator_c .. "*ʒ)", "%1")
end
------------------ Gemination of <bl>, <gl>
if dialect ~= "val" then
-- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use <b_l> or <g_l>.
-- This must follow v > b above. To force a hard ungeminated or , use <_b> or <_g>.
text = rsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")()l", "%1%3%2%3l")
else -- Valencian; undo manually written 'bbl', 'ggl' in words like poblar, reglament
text = rsub(text, "()(" .. sylsep_c .. ")%1l", "%2%1l")
end
------------------ Lenition of voiced stops
-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a
-- vowel and (except for <d>) after <l> and <ll> (cf. GDLC cabellblanc ). This also happens across
-- word boundaries but doesn't happen after stops, nor in Central Catalan after , or (and hence probably
-- not after either, although I can't find any examples in GDLC).
--
-- In Valencian, doesn't lenite (at least formally?), but <d> and <g> do lenite after , or .
--
-- Balearic is like Valencian in not leniting , and probably like Central Catalan otherwise.
local lenite_bdg = { = "β", = "ð", = "ɣ"}
if dialect == "cen" then
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, bdg) return before .. lenite_bdg end)
elseif dialect == "val" then
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, dg) return before .. lenite_bdg end)
else
verify(dialect == "bal", ("Unrecognized dialect '%s'"):format(dialect))
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, dg) return before .. lenite_bdg end)
end
------------------ Vowel reduction
-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan).
if dialect ~= "val" then
-- The following rules seem to apply, based on the old code:
-- (1) Stressed a and e are never reduced.
-- (2) Unstressed e directly following ə is not reduced.
-- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced.
-- (4) Written <ee> when both vowels precede the primary stress is reduced to . (This rule preempts #2.)
-- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all.
-- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was
-- implemented.
-- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances.
local words = rsplit(text, " ")
for j, word in ipairs(words) do
local chars = split_into_chars(word)
-- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we
-- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we
-- shoudn't run off the end (and have assertions to check this).
local seen_primary_stress = false
for i = 2, #chars - 1 do
local this = chars
if chars == AC then
seen_primary_stress = true
end
if (this ~= "a" and this ~= "e") or rfind(chars, stress_c) then
-- Not a/e, or a stressed vowel; continue
else
local reduction = true
local prev, prev_stress, nxt, nxt_stress
if not rfind(chars, sylsep_c) then
prev = ""
else
prev = chars -- this should be non-nil as chars is a syllable separator (not #)
verify(prev, "Missing # at word boundary")
prev_stress = ""
if rfind(prev, stress_c) then
prev_stress = prev
prev = chars
-- As above; chars is a stress indicator (not #).
verify(prev, "Missing # at word boundary")
end
end
if not rfind(chars, sylsep_c) then
nxt = ""
-- leave nxt at nil
else
nxt = chars
nxt_stress = chars
-- chars is a syllable separator, so chars should not be a word boundary, so
-- chars should exist.
verify(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary")
end
if this == "e" and rfind(prev, "ə") then
reduction = false
elseif this == "e" and rfind(nxt, "") then
reduction = false
elseif this == "e" and nxt == "e" and not rfind(nxt_stress, AC) then
-- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary.
if not seen_primary_stress then
chars = "ə"
else
reduction = false
end
end
if reduction then
chars = "ə"
end
end
end
words = table.concat(chars)
end
text = table.concat(words, " ")
end
if dialect == "cen" then
-- Reduction of unstressed o (not before w)
text = rsub(text, "o()", "u%1")
elseif dialect == "bal" then
-- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/;
-- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks
-- or word separators).
text = rsub(text, "o(*" .. stress_c .. ")", "u%1")
end
-- Final losses.
text = rsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/
text = rsub(text, "()s#", "%1#") -- homophone plurals -xs, -igs, -çs
if dialect ~= "val" then
-- Remove j before palatal obstruents
text = rsub(text, "j(" .. sylsep_c .. "*)", "%1")
else -- Valencian
-- Fortition of palatal fricatives
text = rsub(text, "ʒ", "ʤ")
text = rsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z")
end
if dialect ~= "cen" then
-- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic.
-- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling
-- instead.
text = rsub(text, "(a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(m" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1")
end
---------- Convert pseudo-symbols to real ones.
-- Convert g to IPA ɡ.
text = rsub(text, "g", "ɡ")
-- Convert pseudo-affricate symbols to full affricates.
local full_affricates = { = "t͡s", = "d͡z", = "t͡ʃ", = "d͡ʒ" }
text = rsub(text, "()", full_affricates)
---------- Generate IPA stress marks.
-- Convert acute and grave to IPA stress marks.
text = rsub(text, AC, "ˈ")
text = rsub(text, GR, "ˌ")
-- Move IPA stress marks to the beginning of the syllable.
text = rsub_repeatedly(text, "()(*)(" .. ipa_stress_c .. ")", "%1%3%2")
-- Suppress syllable divider before IPA stress indicator.
text = rsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1")
-- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above.
-- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in Module:pt-pronunc.
-- text = rsub_repeatedly(text, "ˈ(+)ˈ", "ˌ%1ˈ")
-- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.)
-- text = rsub_repeatedly(text, "ˈ(*#" .. PREFIX_MARKER .. ")", "ˌ%1")
-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant
-- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose.
text = rsub(text, "", "")
text = mw.ustring.toNFC(text)
return text
end
-- Build the list of syllable objects used for IPA generation of a single word.
--
-- `syllables` is the parsed syllable list of the word: each entry is read for its `onset`, `vowel`, `coda` and
-- `stressed` fields, and the list itself is read for its `stress`, `is_prefix` and `is_suffix` fields.
-- `suffix_syllables` is a list of syllable objects appended unchanged to the end of the result. `dialect` selects the
-- first vowel mapping ("cen" and "bal" have their own tables; anything else uses the Valencian table). `orig_word` is
-- used only in error messages. `pos` is accepted but not referenced in the body.
--
-- Returns the new syllable list. Throws an error if the stressed vowel is ambiguous or if a final -r/-rs is
-- ambiguous.
--
-- NOTE(review): several index expressions, table keys and pattern character classes in this function appear to have
-- been lost (see the empty patterns passed to rfind() and the key-less table constructors below); confirm against the
-- upstream module before relying on the behavior of this code.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
-- Stressed vowel is ambiguous
-- NOTE(review): `syllables.vowel` presumably should index the stressed syllable first, and `marks` presumably was
-- keyed by vowel and indexed by `stressed_vowel` in the loop; TODO confirm.
if syllables.stress then
local stressed_vowel = syllables.vowel
if rfind(stressed_vowel, "") then
local marks = { = {AC, GR, CFLEX, DIA}, = {AC, GR, CFLEX}}
local marked_vowels = {}
for _, mark in ipairs(marks) do
table.insert(marked_vowels, stressed_vowel .. mark)
end
error(("In respelling '%s', the stressed vowel '%s' is ambiguous. Please mark it with an acute, " ..
"grave, or combined accent: %s."):format(orig_word, stressed_vowel,
m_table.serialCommaJoin(marked_vowels, {dontTag = true, conj = "or"})))
end
end
-- Final -r is ambiguous in many cases.
-- NOTE(review): `final` is presumably meant to be the last syllable (its `stressed`, `coda`, `vowel` and `onset`
-- fields are read below), but as written it is the whole list; TODO confirm.
local final = syllables
-- Stressed final r after a or i in non-monosyllables is treated as (r), i.e. verbal infinitives are assumed (NOTE:
-- not always the case, e.g. there are many adjectives and nouns in -ar that should be marked as '(rr)', and
-- several loanword nouns in -ir that should be marked as 'rr'). Likewise for stressed final r or rs after é in
-- non-monosyllables (which are usually adjectives or nouns with the -er ending, but may be verbal infinitives,
-- which should be marked as 'ê(r)'). That is, it disappears other than in Valencian. All other final r and final
-- rs are considered ambiguous and need to be rewritten using rr, (rr) or (r).
if #syllables > 1 and final.stressed then
if final.coda == "r" and rfind(final.vowel, "") or final.coda == "rs" and final.vowel == "é" or
final.vowel == "ó" and rfind(final.coda, "^rs?$") and rfind(final.onset, "") then
-- Replace the recognized ending with the placeholder so the ambiguity check below does not fire.
final.coda = TEMP_PAREN_R
end
end
-- Any final coda still matching r/rs at this point was not covered by the cases above and is rejected.
if rfind(final.coda, "^rs?$") or rfind(final.coda, "rs?$") then
error(("In respelling '%s', final -r by itself or in -rs is ambiguous except in the verbal endings -ar or " ..
"-ir, in the nominal or adjectival endings -er(s) and -or(s). In all other cases it needs to be " ..
"rewritten using one of 'rr' (pronounced everywhere), '(rr)' (pronounced everywhere but Balearic) or " ..
"'(r)' (pronounced only in Valencian). Note that adjectives in -ar usually need rewriting using '(rr)'; " ..
"nouns in -ar referring to places should be rewritten using '(r)'; and loanword nouns in -ir usually " ..
"need rewriting using 'rr'."):format(orig_word))
end
-- Start the output list with the word-level properties of the input list.
local syllables_IPA = {stress = syllables.stress, is_prefix = syllables.is_prefix, is_suffix = syllables.is_suffix}
-- NOTE(review): as written, this loop reassigns `syllables_IPA` itself on every iteration rather than storing one
-- copy per syllable; an index (presumably the loop key) appears to be missing on the left-hand side.
for key, val in ipairs(syllables) do
syllables_IPA = {onset = val.onset, vowel = val.vowel, coda = val.coda, stressed = val.stressed}
end
-- Replace letters with IPA equivalents
for i, syll in ipairs(syllables_IPA) do
-- Voicing of s
-- NOTE(review): `syllables.coda` presumably should refer to the preceding syllable's coda (hence the `i > 1`
-- guard); the pattern's character class also appears to be missing.
if syll.onset == "s" and i > 1 and rfind(syllables.coda, "^?$") then
syll.onset = "z"
end
-- Onset adjustments conditioned on the vowel of the syllable (the vowel class is missing from the pattern, as are
-- the keys of the two replacement tables).
if rfind(syll.vowel, "^$") then
syll.onset = rsub(syll.onset, "tg$", "ʤ")
syll.onset = rsub(syll.onset, "$", { = "s", = "ʒ"})
syll.onset = rsub(syll.onset, "u$", { = "k", = "g"})
end
syll.coda = rsub(syll.coda, "igs?$", "iʤ")
syll.onset = replace_context_free(syll.onset)
syll.coda = replace_context_free(syll.coda)
-- Map each vowel character through the dialect-specific table first, then through the shared table.
syll.vowel = rsub(syll.vowel, ".",
dialect == "cen" and IPA_vowels_central or
dialect == "bal" and IPA_vowels_balearic or
IPA_vowels_valencian
)
syll.vowel = rsub(syll.vowel, ".", IPA_vowels)
end
-- Suffix syllables are appended as given, without any of the processing above.
for _, suffix_syl in ipairs(suffix_syllables) do
table.insert(syllables_IPA, suffix_syl)
end
return syllables_IPA
end
-- Given a single substitution spec, `to`, figure out the corresponding value of `from` used in a complete
-- substitution spec. `pagename` is the name of the page, either the actual one or taken from the `pagename` param.
-- `whole_word`, if set, indicates that the match must be to a whole word (it was preceded by ~).
--
-- The function turns `to` into a pattern that tolerates the usual differences between a respelling and the written
-- form, and matches that pattern against `pagename`. Returns the matched substring of `pagename`. Throws an error if
-- the pattern does not match, or if the matched text is identical to `to` (so the substitution would do nothing).
--
-- NOTE(review): several patterns and replacement strings below appear to have lost their square-bracketed character
-- classes (e.g. the empty pattern in the last gsub() of the "ks" chain, the replacements for "ʃ" and "‿", the
-- frontier patterns after "%f", and the vowel-handling rsub() call, whose function body is no longer a valid
-- string expression). Restore them from the upstream module before use.
local function convert_single_substitution_to_original(to, pagename, whole_word)
-- Replace specially-handled characters with a class matching the character and possible replacements.
local escaped_from = to
-- Handling of '(rr)', '(r)', '.' and '-' needs to be done before calling pattern_escape(); otherwise they will be
-- escaped.
escaped_from = escaped_from:gsub("%(rr%)", "r")
escaped_from = escaped_from:gsub("%(r%)", "r")
-- Undo respelling conventions: "ks"/"gz" correspond to written "x" and "Ks" to "X".
escaped_from = escaped_from:gsub("ks", "x"):gsub("Ks", "X"):gsub("gz", "x"):gsub("()%1l", "%1l"):gsub("", "")
escaped_from = require(patut_module).pattern_escape(escaped_from)
-- A doubled r or s in the respelling may correspond to a single letter in the pagename.
escaped_from = escaped_from:gsub("rr", "rr?")
escaped_from = escaped_from:gsub("ss", "ss?")
escaped_from = escaped_from:gsub("ʃ", "")
-- This is tricky, because we already passed `escaped_from` through pattern_escape() causing a hyphen to get a
-- % sign before it, and have to double up the percent signs to match and replace a literal %.
escaped_from = escaped_from:gsub("%%%-", "%%-?")
-- Tie sign (‿) should match against space, hyphen or nothing in the original.
escaped_from = escaped_from:gsub("‿", "?")
escaped_from = rsub(escaped_from, "",
function(v) return " .. "]" end)
escaped_from = escaped_from:gsub(DOTOVER, DOTOVER .. "?"):gsub(LINEUNDER, LINEUNDER .. "?")
escaped_from = "(" .. escaped_from .. ")"
if whole_word then
escaped_from = "%f" .. escaped_from .. "%f"
end
local match = rmatch(pagename, escaped_from)
if match then
if match == to then
error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
format(to, pagename))
end
return match
end
error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename))
end
-- Apply the substitution spec given in `respelling` to `pagename` and return the resulting respelling.
--
-- The spec is split on commas. Each item is either a mid vowel hint (an item matching `export.mid_vowel_hint_c`; at
-- most one may be given) or a regular substitution. A regular substitution is written either FROM:TO or just TO; in
-- the latter case FROM is inferred from the pagename by convert_single_substitution_to_original(). A leading ~ on a
-- substitution requests a whole-word match. Every regular substitution must match exactly one substring of the
-- respelling built so far; otherwise an error is reported.
--
-- `pos` is the part of speech if known (it affects the handling of a trailing adverbial suffix when a mid vowel hint
-- is applied). `allow_mid_vowel_hints`, if falsy, makes a mid vowel hint an error. `parse_err` is the function
-- called with a message to report errors.
--
-- NOTE(review): the patterns that extract the spec contents, recognize the adverbial suffix and compare vowel
-- qualities appear to have lost their square-bracketed parts, and `syllables.vowel` presumably should index the
-- stressed syllable; TODO restore from the upstream module.
local function apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err)
local subs = split_on_comma(rmatch(respelling, "^%$"))
-- Substitutions are applied to the pagename, not to the spec text.
respelling = pagename
local mid_vowel_hint
local regular_subs = {}
-- Separate the (single) mid vowel hint from the regular substitutions.
for _, sub in ipairs(subs) do
if rfind(sub, "^" .. export.mid_vowel_hint_c .. "$") then
if mid_vowel_hint then
parse_err(("Specified mid vowel hint twice, '%s' and '%s'"):format(
mid_vowel_hint, sub))
end
mid_vowel_hint = sub
else
table.insert(regular_subs, sub)
end
end
-- Apply the mid vowel hint first, by replacing the stressed vowel of the syllabified respelling.
if mid_vowel_hint then
if not allow_mid_vowel_hints then
parse_err(("Mid vowel hint '%s' not allowed when apply one substitution spec to multiple words"):format(
mid_vowel_hint))
end
local suffix = ""
-- FIXME: This duplicates logic in to_IPA().
-- Temporarily detach a trailing adverbial suffix so the hint applies to the stem; it is re-added below.
if not pos or pos == "adverb" then
local part_before_ment, ment = rmatch(respelling, "^(.*)(mnt)$")
if part_before_ment and (pos == "adverb" or not rfind(part_before_ment, "$") and
rfind(part_before_ment, V .. ".*" .. V)) then
suffix = ment
respelling = part_before_ment
end
end
local syllables = split_syllables(respelling, "stress prefixes", "may be uppercase")
local stressed_vowel = syllables.vowel
if stressed_vowel == mid_vowel_hint then
-- do nothing
elseif rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") or
rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") then
syllables.vowel = mid_vowel_hint
else
parse_err(("Stressed vowel '%s' not compatible with mid vowel hint '%s'"):format(
stressed_vowel, mid_vowel_hint))
end
respelling = reconstitute_word_from_syllables(syllables) .. suffix
end
-- Now apply the regular substitutions in order, each to the result of the previous one.
for _, sub in ipairs(regular_subs) do
local from, escaped_from, to, escaped_to, whole_word
if rfind(sub, "^~") then
-- whole-word match
sub = rmatch(sub, "^~(.*)$")
whole_word = true
end
if sub:find(":") then
-- Explicit FROM:TO form; FROM is the shortest text before the first colon.
from, to = rmatch(sub, "^(.-):(.*)$")
else
to = sub
from = convert_single_substitution_to_original(to, pagename, whole_word)
end
if from then
local patut = require(patut_module)
escaped_from = patut.pattern_escape(from)
if whole_word then
escaped_from = "%f" .. escaped_from .. "%f"
end
escaped_to = patut.replacement_escape(to)
local subbed_respelling, nsubs = rsubn(respelling, escaped_from, escaped_to)
-- Require exactly one match so that the substitution is unambiguous.
if nsubs == 0 then
parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format(
from, to, respelling))
elseif nsubs > 1 then
parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " ..
"more context"):format(from, to, respelling))
else
respelling = subbed_respelling
end
end
end
return respelling
end
-- Map from every accepted part-of-speech spelling (abbreviations as well as the full names themselves) to the
-- canonical part-of-speech name. Built from a list of aliases per canonical name.
local canonicalize_pos = {}
for canonical_name, spellings in pairs {
	noun = {"n", "noun"},
	verb = {"v", "vb", "verb"},
	adjective = {"a", "adj", "adjective"},
	adverb = {"av", "adv", "adverb"},
	other = {"o", "other"},
} do
	for _, spelling in ipairs(spellings) do
		canonicalize_pos[spelling] = canonical_name
	end
end
-- Split an optional part-of-speech prefix off of `respelling`.
--
-- A prefix has the form POS/REST, where POS consists of lowercase ASCII letters and must be one of the keys of
-- `canonicalize_pos`. If there is no such prefix, returns nil and `respelling` unchanged. Otherwise returns the
-- canonical part of speech and REST; an empty REST is returned as "+" (meaning "use the pagename"). If POS is not
-- recognized, `parse_err` is called with a message listing the valid values (sorted alphabetically).
local function parse_off_pos(respelling, parse_err)
	-- The capture must be a run of lowercase letters; a bare "(+)" would only ever match a literal plus sign.
	local pos, rest = respelling:match("^([a-z]+)/(.*)$")
	if not pos then
		return nil, respelling
	end
	-- Look the prefix up in the table; testing the table itself would always succeed and never report bad input.
	local canonical = canonicalize_pos[pos]
	if not canonical then
		local valid_pos = {}
		for vp, _ in pairs(canonicalize_pos) do
			table.insert(valid_pos, vp)
		end
		table.sort(valid_pos)
		parse_err(("Unrecognized part of speech '%s', should be one of %s"):format(pos,
			table.concat(valid_pos, ", ")))
	end
	if rest == "" then
		rest = "+"
	end
	return canonical, rest
end
-- Parse a respelling given by the user, allowing for '+' for pagename, mid vowel hints in place of a respelling and
-- substitution specs. In general, return an object {words = {WORD, WORD, ...}} where
-- WORD is of the form {term = PARSED_RESPELLING, pos = POS}. Other fields are set in special cases: If a raw respelling
-- was seen, the fields `raw_phonemic` and/or `raw_phonetic` are set; if '?' is seen, the field `unknown` is set; and if
-- '-' is seen, the field `omitted` is set.
--
-- `pagename` is the page name that '+', mid vowel hints and substitution specs are resolved against, and `parse_err`
-- is the function called with a message to report errors.
--
-- NOTE(review): the examples of substitution specs that used to appear in this comment, and several patterns in the
-- body (the raw phonetic form, the substitution-spec detection, the part-of-speech prefix), appear to have lost their
-- square-bracketed parts; likewise the per-word indexes in the word-by-word loop. TODO restore from the upstream
-- module.
local function parse_respelling(respelling, pagename, parse_err)
if respelling == "?" then
return {
unknown = true
}
end
if respelling == "-" then
return {
omitted = true
}
end
-- An optional "raw:" prefix allows a phonetic-only raw pronunciation (see below).
local saw_raw
local remaining_respelling = respelling:match("^raw:(.*)$")
if remaining_respelling then
saw_raw = true
respelling = remaining_respelling
end
-- Raw pronunciations: phonemic followed by phonetic, phonemic alone, or (only after "raw:") phonetic alone.
local raw_phonemic, raw_phonetic = respelling:match("^/(.*)/ %$")
if not raw_phonemic then
raw_phonemic = respelling:match("^/(.*)/$")
end
if not raw_phonemic and saw_raw then
raw_phonetic = respelling:match("^%$")
end
if raw_phonemic or raw_phonetic then
return {
raw_phonemic = raw_phonemic,
raw_phonetic = raw_phonetic,
}
end
pagename = decompose_respelling(pagename)
respelling = decompose_respelling(respelling)
-- Canonicalize `respelling`, split it on spaces and return {words = ...} with one word object per word. If
-- `parse_pos` is set, a part-of-speech prefix is parsed off of each word.
local function split_respelling_into_words(respelling, parse_pos)
respelling = canon_respelling(respelling)
local word_objs = {}
local respelling_words = rsplit(respelling, " ")
for _, word in ipairs(respelling_words) do
local pos
if parse_pos then
pos, word = parse_off_pos(word, parse_err)
end
table.insert(word_objs, {term = word, pos = pos})
end
return {words = word_objs}
end
-- Resolve the respelling of a single word against the corresponding pagename word and return its word object. A
-- '+' stands for the pagename word; a substitution spec is applied to the pagename word.
local function substitute_respelling_word(respelling_word, pagename_word)
local pos
pos, respelling_word = parse_off_pos(respelling_word, parse_err)
if respelling_word == "+" then
respelling_word = pagename_word
else
-- NOTE(review): a bare mid vowel hint is presumably meant to be wrapped into a substitution spec here, but as
-- written it is replaced by the empty string; TODO confirm.
if rfind(respelling_word, "^" .. export.mid_vowel_hint_c .. "$") then
respelling_word = ""
end
if rfind(respelling_word, "^%$") then
respelling_word = apply_substitution_spec(respelling_word, pagename_word, pos,
"allow mid vowel hint", parse_err)
end
end
return {term = respelling_word, pos = pos}
end
-- At this point, if there are multiple words in the pagename, there are three syntaxes allowed: all-at-once,
-- replacement or word-by-word. All-at-once syntax involves either a + representing the entire pagename, or a
-- substitution spec that applies to all words in the pagename. This syntax cannot have a prefixed part of speech
-- because it wouldn't be clear which word to apply the part of speech to. Replacement syntax simply spells out the
-- respelling without any substitution specs or +'s (but possibly with parts of speech prefixed to individual
-- words), and can have a different number of words than the pagename (essentially, the pagename is disregarded).
-- Word-by-word syntax involves a combination of respelled words, per-word substitution specs and/or a +
-- representing an individual word, and must have the same number of words as the pagename so that substitution
-- specs and +'s can be lined up with words in the pagename. In all cases, the return value is in the same format;
-- see comment at top of function.
if pagename:find(" ") or respelling:find(" ") then
if respelling == "+" then
-- all-at-once syntax with the whole pagename
return split_respelling_into_words(pagename)
elseif rfind(respelling, "^%$") then
-- all-at-once syntax with substitution spec
return split_respelling_into_words(apply_substitution_spec(respelling, pagename, nil, false, parse_err))
elseif rfind(respelling, "^(+)/$") or rfind(respelling, "^(+)/%]*%]$") then
-- attempt to include a part of speech in all-at-once syntax
parse_err(("Part of speech not allowed when pagename is multiword and all-at-once syntax is used in " ..
"the respelling, but saw '%s'"):format(respelling))
elseif rfind(respelling, "^" .. export.mid_vowel_hint_c .. "$") then
-- attempt to use a mid-vowel hint in all-at-once syntax
parse_err(("Single mid-vowel hint not allowed when pagename is multiword because it's not clear which " ..
"word to apply it to, but saw '%s'"):format(respelling))
elseif rfind(respelling, "]") or rfind(respelling, "^" .. export.mid_vowel_hint_c .. " ") or
rfind(respelling, " " .. export.mid_vowel_hint_c .. " ") or
rfind(respelling, " " .. export.mid_vowel_hint_c .. "$") then
-- word-by-word syntax
local sub_with_space = rmatch(respelling, "%]* ]*%]")
if sub_with_space then
parse_err(("When using word-by-word syntax with a multiword pagename, saw substitution spec '%s' " ..
"with spaces, which is not allowed because it must match a single word"):format(sub_with_space))
end
pagename = canon_respelling(pagename)
respelling = canon_respelling(respelling)
local pagename_words = rsplit(pagename, " ")
local respelling_words = rsplit(respelling, " ")
if #pagename_words ~= #respelling_words then
parse_err(("When using word-by-word syntax with a multiword pagename, saw %s words in pagename but " ..
"%s word%s in respelling; they need to match"):format(#pagename_words, #respelling_words,
#respelling_words > 1 and "s" or ""))
end
local word_objs = {}
-- NOTE(review): the two arguments below are presumably meant to be the i-th respelling word and the i-th pagename
-- word; as written the whole lists are passed on every iteration.
for i = 1, #pagename_words do
table.insert(word_objs, substitute_respelling_word(respelling_words, pagename_words))
end
return {words = word_objs}
else
-- replacement syntax; pagename ignored
return split_respelling_into_words(respelling, "parse pos")
end
else
-- Single word in both pagename and respelling.
local word_obj = substitute_respelling_word(respelling, pagename)
word_obj.term = canon_respelling(word_obj.term)
return {words = {word_obj}}
end
end
-- Parse a list of comma-split runs containing one or more respellings, i.e. after calling parse_balanced_segment_run()
-- or the like followed by split_alternating_runs() or the like (see Module:parse utilities). `pagename` is the
-- pagename, for use when a respelling is just '+', a mid-vowel hint or a substitution spec in square brackets; if
-- `pagename` is `true`, each respelling is used as its own pagename. `original_input` is the raw input and
-- `input_param` the name of the param containing the raw input; both are used only in error messages. Return an
-- object specifying the respellings, currently with a single field 'terms' (this format is used in case other outer
-- properties exist in the future), where 'terms' is a list of term objects. Each term object contains either a field
-- `term` with the respelling and an optional part of speech `pos`, or fields `raw_phonemic` and/or `raw_phonetic` (if
-- the user specified raw IPA using "/.../" or "/.../ [...]" or "raw:[...]"), `unknown` (if the user specified "?"),
-- or `omitted` (if the user specified "-"). In addition, there may be fields `q`, `qq`, `a`, `aa`, and/or `ref`
-- corresponding to inline modifiers. Each such field is a list; all are lists of strings except for `ref`, which is
-- a list of objects as returned by parse_references() in Module:references.
--
-- Note that each group in `comma_separated_groups` is modified in place when runs are rejoined.
function export.parse_comma_separated_groups(comma_separated_groups, pagename, original_input, input_param)
	local function generate_obj(respelling, parse_err)
		return parse_respelling(respelling, pagename == true and respelling or pagename, parse_err)
	end
	local put = require(parse_utilities_module)
	local outer_container = {terms = {}}
	for _, group in ipairs(comma_separated_groups) do
		-- Rejoin runs that don't involve <...>. Each group alternates between plain text (odd positions) and delimited
		-- segments (even positions). A delimited segment that is not an inline modifier belongs to the respelling
		-- itself, so it is merged with the plain text on either side of it.
		local j = 2
		while j <= #group do
			if not group[j]:find("^<.*>$") then
				group[j - 1] = group[j - 1] .. group[j] .. group[j + 1]
				-- Drop the two elements just merged into position j - 1; the element now at position j is the next
				-- delimited segment, so `j` is deliberately left unchanged.
				table.remove(group, j)
				table.remove(group, j)
			else
				j = j + 2
			end
		end
		local param_mods = {
			-- pre = { overall = true },
			-- post = { overall = true },
			ref = { store = "insert", convert = function(arg, parse_err)
				return require("Module:references").parse_references(arg)
			end },
			q = { store = "insert" },
			qq = { store = "insert" },
			a = { store = "insert" },
			aa = { store = "insert" },
		}
		table.insert(outer_container.terms, put.parse_inline_modifiers_from_segments {
			group = group,
			arg = original_input,
			props = {
				paramname = input_param,
				param_mods = param_mods,
				generate_obj = generate_obj,
				splitchar = ",",
				outer_container = outer_container,
			},
		})
	end
	return outer_container
end
-- Generate the pronunciation of `words` (a list of word objects representing respellings, each of which is an object
-- of the form {term = RESPELLING, pos = PART_OF_SPEECH}) in `dialect` ("cen", "bal" or "val").
--
-- Each word is lowercased, optionally has a trailing adverbial suffix split off, is passed through word_fixes(),
-- split into syllables and converted by preprocess_word(); the syllables are then joined with "." and marked with
-- AC (primary stress) or GR (secondary stress). The words are joined with boundary markers and the whole string is
-- passed to postprocess_general(), whose result is returned. Throws an error if a respelling contains one of the
-- rejected accented characters.
--
-- NOTE(review): the character classes in several patterns below (the invalid-accent check and the adverbial suffix
-- detection), the table constructor assigned to `suffix_syllables` and the index in `syllables.stressed` appear to
-- have been lost or mangled; TODO restore from the upstream module.
local function to_IPA(words, dialect)
local pronuns = {}
-- First pass: validate all respellings before doing any work.
for _, wordobj in ipairs(words) do
if rfind(wordobj.term, "") then
error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions"
):format(wordobj.term))
end
end
words = handle_unstressed_words(words)
for _, wordobj in ipairs(words) do
local word = wordobj.term
local pos = wordobj.pos
local suffix_syllables = {}
-- Keep the original casing for error messages.
local orig_word = word
word = ulower(word)
-- Split off a trailing adverbial suffix when the word is an adverb or has no part of speech and its stem
-- satisfies the checks below; the stem is then processed with `pos` set to "adjective".
if not pos or pos == "adverb" then
local word_before_ment, ment = rmatch(word, "^(.*)(mnt)$")
if word_before_ment and (pos == "adverb" or not rfind(word_before_ment, "$") and
rfind(word_before_ment, V .. ".*" .. V)) then
suffix_syllables = Template:onset = "m", vowel = "e", coda = "nt", stressed = true
pos = "adjective"
word = word_before_ment
end
end
word = word_fixes(word, dialect)
local syllables = split_syllables(word)
syllables = preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
-- Combine syllables.
local combined = {}
local has_ment = #suffix_syllables > 0
for i, syll in ipairs(syllables) do
-- When a suffix was split off, primary stress goes on the last syllable (the suffix); otherwise it goes on the
-- stressed syllable of a non-prefix word.
local ac = (i == syllables.stress and not syllables.is_prefix and not has_ment or
has_ment and i == #syllables) and AC or -- primary stress
syllables.stressed and GR or -- secondary stress
""
table.insert(combined, syll.onset .. syll.vowel .. ac .. syll.coda)
end
table.insert(pronuns, table.concat(combined, "."))
end
-- Put double ## at utterance boundaries (beginning/end of string) and at foot boundaries (marked with |).
-- Note that if the string without pound signs is 'foo bar baz | bat quux', the final string will be
-- '##foo# #bar# #baz## #|# ##bat# #quux##'.
local text = "##" .. table.concat(pronuns, " ") .. "##"
text = rsub(text, " | ", "# | #")
text = rsub(text, " ", "# #")
return postprocess_general(text, dialect)
end
-- Fill in the pronunciation fields of every term in `parsed_respellings`, in place. `parsed_respellings` is a table
-- keyed by dialect identifier whose values have the format returned by parse_comma_separated_groups() (an object with
-- a `terms` list). For each term:
--   * terms flagged `unknown` or `omitted` are left untouched (they are dealt with later);
--   * terms with raw pronunciations get them moved from `raw_phonemic`/`raw_phonetic` to `phonemic`/`phonetic`;
--   * all other terms get `phonetic` generated from `words` by to_IPA() for the dialect in question.
-- The input fields are cleared afterwards so that by-value comparisons of terms depend only on the resulting
-- pronunciations and qualifiers. (FIXME: Currently only phonetic pronunciation is generated from respellings.)
function export.generate_phonemic_phonetic(parsed_respellings)
	for dialect_code, spec in pairs(parsed_respellings) do
		for _, term in ipairs(spec.terms) do
			if not (term.unknown or term.omitted) then
				if term.raw_phonemic or term.raw_phonetic then
					-- Raw IPA supplied by the user: move it over verbatim.
					term.phonemic, term.phonetic = term.raw_phonemic, term.raw_phonetic
					term.raw_phonemic, term.raw_phonetic = nil, nil
				else
					-- Ordinary respelling: convert the parsed words to IPA.
					term.phonetic = to_IPA(term.words, dialect_code)
					term.words = nil
				end
			end
		end
	end
end
-- Collect dialects whose pronunciations are identical in every respect (the pronunciations themselves as well as
-- qualifiers and other inline modifiers) into groups. `parsed_respellings` is the table processed by
-- generate_phonemic_phonetic(). The return value is a list of groups; each group has a field `dialects` (the list of
-- dialects sharing the pronunciations) and a field `pronuns` (the shared list of pronunciation objects, each with
-- `phonemic` and/or `phonetic` plus any inline modifier fields `q`, `qq`, `a`, `aa` and/or `ref`). A dialect is left
-- out entirely if any of its terms is flagged `omitted`.
function export.group_pronuns_by_dialect(parsed_respellings)
	-- True if any term in the list was explicitly omitted by the user.
	local function has_omitted_term(terms)
		for _, term in ipairs(terms) do
			if term.omitted then
				return true
			end
		end
		return false
	end

	-- Return the first existing group whose pronunciations deep-equal `terms`, or nil if there is none.
	local function find_matching_group(groups, terms)
		for _, candidate in ipairs(groups) do
			if m_table.deepEquals(candidate.pronuns, terms) then
				return candidate
			end
		end
		return nil
	end

	local groups = {}
	for dialect_code, spec in pairs(parsed_respellings) do
		if not has_omitted_term(spec.terms) then
			local matching = find_matching_group(groups, spec.terms)
			if matching then
				table.insert(matching.dialects, dialect_code)
			else
				table.insert(groups, {dialects = {dialect_code}, pronuns = spec.terms})
			end
		end
	end
	return groups
end
-- Format pronunciations grouped by dialect. `grouped_pronuns` contains the output of group_pronuns_by_dialect().
-- This destructively modifies `grouped_pronuns`, adding a field 'formatted' to the first-level values of
-- `grouped_pronuns` containing the formatted pronunciation(s) for a given set of dialects.
--
-- For each pronunciation in a group, up to two items are added to the list passed to Module:IPA: the phonemic form
-- wrapped in slashes and the phonetic form wrapped in square brackets. Left-hand modifiers (`q`, `a`) and the ", "
-- separator between successive pronunciations are attached to the first item added for that pronunciation;
-- right-hand modifiers (`qq`, `aa`, `refs`) are attached to the last one. The names of the group's dialects are
-- added as accent qualifiers to the first pronunciation only. Throws an internal error if a pronunciation that is not
-- `unknown` has neither a phonemic nor a phonetic form.
function export.format_grouped_pronunciations(grouped_pronuns)
	for _, grouped_pronun_spec in pairs(grouped_pronuns) do
		local pronunciations = {}
		-- Loop through each pronunciation. For each one, add the phonemic and phonetic versions to `pronunciations`,
		-- for formatting by Module:IPA.
		for j, pronun in ipairs(grouped_pronun_spec.pronuns) do
			-- Add dialect tags to left accent qualifiers if first one
			local as = pronun.a
			if j == 1 then
				if as then
					-- Copy so the user-specified qualifier list is not modified.
					as = m_table.deepcopy(as)
				else
					as = {}
				end
				for _, dialect in ipairs(grouped_pronun_spec.dialects) do
					-- Insert the display name of this dialect, not the whole name table.
					table.insert(as, export.dialects_to_names[dialect])
				end
			end
			-- Remember which items belong to this pronunciation so modifiers go on the right ones.
			local first_pronun = #pronunciations + 1
			if pronun.unknown then
				-- FIXME: This is a massive hack but it works for now.
				table.insert(pronunciations, { pron = "", pretext = "unknown" })
			else
				if not pronun.phonemic and not pronun.phonetic then
					error("Internal error: Saw neither phonemic nor phonetic pronunciation")
				end
				if pronun.phonemic then -- missing if 'raw:' given
					local slash_pron = "/" .. pronun.phonemic .. "/"
					table.insert(pronunciations, {
						pron = slash_pron,
					})
				end
				if pronun.phonetic then -- missing if '/.../' given
					local bracket_pron = "[" .. pronun.phonetic .. "]"
					table.insert(pronunciations, {
						pron = bracket_pron,
					})
				end
			end
			local last_pronun = #pronunciations
			if pronun.q then
				pronunciations[first_pronun].q = pronun.q
			end
			if as then
				pronunciations[first_pronun].a = as
			end
			if j > 1 then
				pronunciations[first_pronun].separator = ", "
			end
			if pronun.qq then
				pronunciations[last_pronun].qq = pronun.qq
			end
			if pronun.aa then
				pronunciations[last_pronun].aa = pronun.aa
			end
			-- NOTE(review): the inline modifier is declared as `ref` in parse_comma_separated_groups(), whereas this
			-- reads `refs`; confirm which field name the term objects actually carry.
			if pronun.refs then
				pronunciations[last_pronun].refs = pronun.refs
			end
			if first_pronun ~= last_pronun then
				-- Phonemic and phonetic forms of the same pronunciation are separated by a space only.
				pronunciations[last_pronun].separator = " "
			end
		end
		grouped_pronun_spec.formatted = m_IPA.format_IPA_full(lang, pronunciations, nil, "")
	end
end
-- Template entry point. Reads the arguments of the parent frame, works out the respelling input for each dialect,
-- parses the inputs, generates the pronunciations, groups dialects with identical pronunciations and returns the
-- formatted result as a string (one line per group).
--
-- Recognized arguments: 1= (input for all dialects), one argument per dialect group in `export.dialect_groups`
-- (input for the dialects of that group), one argument per dialect in `export.dialects` (input for that dialect
-- only), `indent` (list marker to use, default "*") and `pagename` (overrides the page name, for testing or
-- documentation pages). More specific arguments override more general ones. If no input is given at all, every
-- dialect uses "+", i.e. the pronunciation is derived from the pagename.
function export.show(frame)
	local params = {
		[1] = {},
		indent = {},
		pagename = {} -- for testing or documentation pages
	}
	for _, dialect in ipairs(export.dialects) do
		params[dialect] = {}
	end
	for dialect_group, _ in pairs(export.dialect_groups) do
		params[dialect_group] = {}
	end
	local args = require("Module:parameters").process(frame:getParent().args, params)
	local pagename = args.pagename or mw.title.getCurrentTitle().subpageText

	-- Set inputs
	local inputs = {}
	-- If 1= specified, do all dialects.
	if args[1] then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = args[1], param = 1}
		end
	end
	-- Then do dialect groups.
	for dialect_group, group_dialects in pairs(export.dialect_groups) do
		if args[dialect_group] then
			for _, dialect in ipairs(group_dialects) do
				inputs[dialect] = {input = args[dialect_group], param = dialect_group}
			end
		end
	end
	-- Then do individual dialect settings.
	for _, dialect in ipairs(export.dialects) do
		if args[dialect] then
			inputs[dialect] = {input = args[dialect], param = dialect}
		end
	end
	-- If no inputs given, set all dialects based on current pagename.
	if not next(inputs) then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = "+", param = "(pagename)"}
		end
	end

	-- Parse the arguments.
	local parsed_respellings = {}
	for dialect, inputspec in pairs(inputs) do
		local function generate_obj(respelling, parse_err)
			return parse_respelling(respelling, pagename, parse_err)
		end
		if inputspec.input:find("[<%[]") then
			local put = require(parse_utilities_module)
			-- Parse balanced segment runs involving either [...] (substitution notation) or <...> (inline
			-- modifiers). We do this because we don't want commas inside of square or angle brackets to count as
			-- respelling delimiters. The result alternates between plain text and bracketed segments; after
			-- splitting the alternating runs on comma, square-bracketed segments end up separated from the text on
			-- either side of them, and parse_comma_separated_groups() rejoins them so that only <...> segments
			-- remain as separate runs.
			local segments = put.parse_multi_delimiter_balanced_segment_run(inputspec.input,
				{{"<", ">"}, {"[", "]"}})
			local comma_separated_groups = put.split_alternating_runs_on_comma(segments)
			-- Process each value.
			local outer_container = export.parse_comma_separated_groups(comma_separated_groups, pagename,
				inputspec.input, inputspec.param)
			parsed_respellings[dialect] = outer_container
		else
			-- Simple case: no brackets anywhere, so a plain split on commas is enough.
			local termobjs = {}
			local function parse_err(msg)
				error(msg .. ": " .. inputspec.param .. "=" .. inputspec.input)
			end
			for _, term in ipairs(split_on_comma(inputspec.input)) do
				table.insert(termobjs, generate_obj(term, parse_err))
			end
			parsed_respellings[dialect] = {
				terms = termobjs,
			}
		end
	end

	-- Convert each canonicalized respelling to phonemic/phonetic IPA.
	export.generate_phonemic_phonetic(parsed_respellings)
	-- Group the results.
	local grouped_pronuns = export.group_pronuns_by_dialect(parsed_respellings)
	-- Format the results.
	export.format_grouped_pronunciations(grouped_pronuns)
	-- Concatenate formatted results.
	local formatted = {}
	for _, grouped_pronun_spec in ipairs(grouped_pronuns) do
		table.insert(formatted, grouped_pronun_spec.formatted)
	end
	-- The first line gets the list marker only if `indent` was given explicitly; following lines always get one.
	local indent = (args.indent or "*") .. " "
	local out = table.concat(formatted, "\n" .. indent)
	if args.indent then
		out = indent .. out
	end
	return out
end
-- Testing hook, used by Module:cy-IPA/testcases. Parses `respelling` against `pagename` and returns the pronunciation
-- generated for `dialect`. Parse errors are raised as ordinary Lua errors with the parser's message.
function export.test(pagename, respelling, dialect)
	local function raise_parse_error(message)
		error(message)
	end
	local words = parse_respelling(respelling, pagename, raise_parse_error).words
	return to_IPA(words, dialect)
end
return export