-- This module is used by {{ca-IPA}}.
local export = {}
local lang = require("Module:languages").getByCode("ca")
local m_IPA = require("Module:IPA")
local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities"
local strutil_module = "Module:string utilities"
local listToSet = require("Module:table").listToSet
--[==[
TODO list.
NOTE(review): many items below originally carried bracketed examples (wikilinks and IPA transcriptions) that are
missing from this copy, leaving gaps such as "should reduce to in"; restore them from the module history before
acting on an item.

1. should reduce to in Central and Balearic (], ]). Similar for
2. There needs to be a way of forcing . (Maybe just ʃ?)
3. Make sure manual dot for syllable break works, cf. ] respelled `bèst.sèlerr'.
4. Explicit accents on a/i/u should be removed in split_syllables().
5. Compress double schwa in Central/Balearic in e.g. ], ], ],
]; seems not to operate in Valencian.
6. Compress unstressed <ie> and <oe> followed by coda consonant -> in Central/Balearic in e.g. ],
], ], but not ], ]; seems not to operate in Valencian.
NOTE: It does operate in an open syllable in ], ], ]; not sure why.
7. Compress unstressed <oo> followed by a coda consonant -> in Central/Balearic in e.g. ]. Seems
not to operate in Valencian.
8. bm -> e.g. ]; seems not to operate in Valencian.
9. ë (and presumably ê) doesn't work in secondary stress, always becomes /ɛ/ (e.g. in ] respelled
'ëxtrajudiciàl'; this seems to be because the handling of ë goes through mid_vowel_hint, which doesn't work for
secondary stress.
10. Respect ʃ at beginning of word in Valencian.
11. in single substitution specs should match against written x.
12. Prefixes e.g. ] should not have stress by default, and written primary stresses should be converted to
13. Convert apostrophe near beginning to tie (‿) and make sure we take account of it later, so that words like
] and phrases like ] work correctly.
14. Correctly handle -bl and -gl in respelling, generating and .
15. Correctly handle in respelling forcing fricatives; should not be fortitioned.
16. in single substitution specs should match against b/d/g.
17. in single substitution specs should match against ss?; used to force a pronounced .
18. in single substitution specs should match against m.
19. Correctly handle written -dg- after : fricatives in Valencian, stops in Central (and Balearic?).
20. Correctly handle lenition of written -bdg-: (1) -b- not lenited in Valencian or Balearic, lenited to in
Central Catalan after vowels and consonants except nasals and ; (2) -g- not lenited after nasals, also not
after in Central Catalan (and maybe Balearic?), otherwise yes except utterance initial; (3) -d- not lenited
after nasals or laterals, also not after in Central Catalan (and maybe Balearic?), otherwise yes except
utterance initial. Verify against ca-IPA equivalent on cawikt and also based on {{w|Catalan phonology}} and the IEC
grammar that Vriullop linked.
21. Finish rewriting do_dialect_specific() to operate on whole word using Lua patterns.
22. Implement multiword handling.
23. Make sure suffix handling works correctly.
24. Add many more test cases and redo test harness ala the German test harness.
25. Redo handling of mid-vowel hints so it gets done early and in one place.
26. Think about how to solve the issue of mid-vowel hints along with secondary stress marks in substitution specs.
Maybe a single mid-vowel spec should be rewritten to be a single substitution spec and the insertion of the
mid-vowel spec should happen during resolution of substitution specs.
27. <tm> should default to not .
28. Fix handling of mid vowel default in -è/-ès/-esa so it doesn't affect ] etc.
29. x- after hyphen should probably become tx- in Valencian, cf. ].
30. Implement DOTOVER to indicate lack of stress in a word, e.g. in a suffix.
31. Handle words without vowels.
32. Finish reviewing places where we may need to check for tie symbols.
33. Handle tie indicating liaison in e.g. ].
34. Handle pronunciation of ] correctly.
35. Handle tie indicating liaison before h- correctly, e.g. ].
36. Lenition should happen in Valencian in ] whether respelled 'réggla', 'régla' or 'rég_la'.
37. Syllabification should happen correctly when underscore is used in 'bíb_lia' to block doubling of <bl>.
38. <cn> should show up as .
39. Delete after before anything but (]) or (]).
40. Delete <t/d> after <n> before consonant even in Valencian; likewise for <p/b> after <m>, <c/g> after <n>.
41. DOTOVER in single substitution specs should work.
42. Underline in single substitution specs should work.
43. LINEUNDER should work to indicate secondary stress after the primary stress, including in single substitution
specs.
]==]
local usub = mw.ustring.sub
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local rsubn = mw.ustring.gsub
local ulower = mw.ustring.lower
local u = require("Module:string/char")
local ugcodepoint = mw.ustring.gcodepoint
-- Dialect codes handled by this module.
export.dialects = {"bal", "cen", "val"}

-- Human-readable name for each dialect code.
export.dialects_to_names = {
	bal = "Balearic",
	cen = "Central Catalan",
	val = "Valencian",
}

-- Named groups of dialect codes; `east` covers the two dialects that the rest of the module treats together as
-- Eastern Catalan (Balearic and Central).
export.dialect_groups = {
	east = {"bal", "cen"},
}
-- Vowel inventories. Names ending in `_l` are bare character lists meant to be embedded in a character class;
-- `V` is the ready-made class matching any single vowel.
local written_unaccented_vowel_l = "aeiouyAEIOUY"
local written_stressed_vowel_l = "àèéêëíòóôúýÀÈÉÊËÍÒÓÔÚÝ"
local written_accented_not_stressed_vowel_l = "ïüÏÜ"
local written_accented_vowel_l = written_stressed_vowel_l .. written_accented_not_stressed_vowel_l
-- IPA vowel symbols that may appear directly in a respelling.
local ipa_vowel_l = "ɔɛə"
local written_vowel_l = written_unaccented_vowel_l .. written_accented_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Must be a bracketed class: an empty string here makes every pattern built from V (e.g. "^h?" .. V) match
-- regardless of whether a vowel is present.
local V = "[" .. vowel_l .. "]"
-- Maps each accented written vowel (stressed or diaeresis-bearing, lower and upper case) to its unaccented base
-- letter. The keys are exactly the characters of the accented-vowel lists defined above.
local written_accented_to_plain_vowel = {
	["à"] = "a",
	["è"] = "e",
	["é"] = "e",
	["ê"] = "e",
	["ë"] = "e",
	["í"] = "i",
	["ï"] = "i",
	["ò"] = "o",
	["ó"] = "o",
	["ô"] = "o",
	["ú"] = "u",
	["ü"] = "u",
	["ý"] = "y",
	["À"] = "A",
	["È"] = "E",
	["É"] = "E",
	["Ê"] = "E",
	["Ë"] = "E",
	["Í"] = "I",
	["Ï"] = "I",
	["Ò"] = "O",
	["Ó"] = "O",
	["Ô"] = "O",
	["Ú"] = "U",
	["Ü"] = "U",
	["Ý"] = "Y",
}
-- Combining diacritics used internally after decomposition.
local AC = u(0x0301) -- acute =  ́
local GR = u(0x0300) -- grave =  ̀
local CFLEX = u(0x0302) -- circumflex =  ̂
local DOTOVER = u(0x0307) -- dot over =  ̇
local DIA = u(0x0308) -- diaeresis =  ̈
local LINEUNDER = u(0x0331) -- lineunder =  ̱

-- Character lists (`_l`) and the corresponding bracketed character classes (`_c`). The classes must be bracketed:
-- left as empty strings they silently turn every pattern that embeds them into one that ignores the separator,
-- stress mark or consonant it was meant to require.
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
local sylsep_l = "%-." -- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_c = "[" .. sylsep_l .. "]"
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- Everything that is NOT a consonant: vowels plus all separators.
local neg_guts_of_cons = vowel_l .. separator_l
local C = "[^" .. neg_guts_of_cons .. "]" -- consonant class including h

-- Accented vowels that act as mid-vowel quality hints, and the class matching any one of them.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"

-- Private-use placeholders standing in for the optional-r notations "(r)" and "(rr)" during processing.
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not currently
-- used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents
-- Set of consonant sequences (as written, before conversion to IPA) that may begin a syllable. split_syllables()
-- moves leading consonants of an onset into the preceding coda until what remains is in this set, so every
-- single consonant that can start a Catalan syllable has to be listed, not only the clusters.
local valid_onsets = listToSet {
	"b", "bl", "br",
	"c", "cl", "cr",
	"ç",
	"d", "dj", "dr",
	"f", "fl", "fr",
	"g", "gl", "gr", "gu", "gü",
	"h",
	"j",
	"k", "kl", "kr",
	"l", "ll",
	"m",
	"n", "ny", "ñ",
	"p", "pl", "pr",
	"qu", "qü",
	"r", "rr",
	"s", "ss",
	"t", "tg", "tj", "tr", "tx", "tz",
	"v", "vl", "vr",
	"x",
	"z",
	"ʃ", -- e.g. 'χruʃóf' respelling of ]
	"χ", -- in case of respelling
}
-- Maps each precomposed vowel-with-dot-above to the base vowel followed by the combining DOTOVER, so that later
-- code only has to look for the combining mark. Keys are built from code points so they survive any Unicode
-- normalization applied to this source file.
local decompose_dotover = {
	-- No composed i, u or U with DOTOVER.
	[u(0x0227)] = "a" .. DOTOVER, -- ȧ
	[u(0x0117)] = "e" .. DOTOVER, -- ė
	[u(0x022F)] = "o" .. DOTOVER, -- ȯ
	[u(0x1E8F)] = "y" .. DOTOVER, -- ẏ
	[u(0x0226)] = "A" .. DOTOVER, -- Ȧ
	[u(0x0116)] = "E" .. DOTOVER, -- Ė
	[u(0x0130)] = "I" .. DOTOVER, -- İ
	[u(0x022E)] = "O" .. DOTOVER, -- Ȯ
	[u(0x1E8E)] = "Y" .. DOTOVER, -- Ẏ
}
-- Set of function words that are pronounced without stress. handle_unstressed_words() marks any word found here
-- with DOTOVER so that no default stress is assigned to it.
local unstressed_words = listToSet {
	-- proclitic object pronouns
	"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
	-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. ]
	"me", "te", "se", "lo", "los", "nos", "vos", "ne",
	-- contracted object pronouns and articles attached with apostrophe so no need to include
	-- unstressed possessives
	"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
	-- prepositions
	"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
	-- prepositional contractions
	"al", "als", "del", "dels", "pel", "pels",
	-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
	-- personal articles
	"na", -- 'en' already included above
	-- indefinite articles
	"un", "uns",
	-- salat articles
	"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
	-- conjunctions
	"i", "o", "si", "ni", "que",
}
-- Version of rsubn() that discards all but the first return value.
-- `term` is the subject string, `foo` the pattern and `bar` the replacement (string, table or function).
-- Returns only the substituted string, so the call can be used safely as a function argument or table element.
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end
-- Version of rsubn() that returns a 2nd argument boolean indicating whether a substitution was made.
-- `term` is the subject string, `foo` the pattern and `bar` the replacement.
-- Returns the substituted string and `true` if at least one replacement happened, else `false`.
local function rsubb(term, foo, bar)
	local retval, nsubs = rsubn(term, foo, bar)
	return retval, nsubs > 0
end
-- Apply rsub() repeatedly until no change.
-- Needed when one substitution can create a new match that a single left-to-right pass would miss.
-- Returns the string at its fixed point. Note that a replacement which always alters the string never terminates.
local function rsub_repeatedly(term, foo, bar)
	while true do
		local new_term = rsub(term, foo, bar)
		if new_term == term then
			return term
		end
		term = new_term
	end
end
-- Split `text` into a list of single-character strings, one per Unicode code point (so combining marks become
-- separate list entries). Returns an empty list for an empty string.
local function split_into_chars(text)
	local chars = {}
	for codepoint in ugcodepoint(text) do
		table.insert(chars, u(codepoint))
	end
	return chars
end
-- Split `term` on commas and return the list of pieces.
-- When the text contains a comma followed by whitespace or a backslash, the work is delegated to the
-- split_on_comma() function of the parse utilities module (loaded lazily, only when needed); otherwise a plain
-- split on "," is enough.
local function split_on_comma(term)
	if term:find(",%s") or term:find("\\") then
		return require(parse_utilities_module).split_on_comma(term)
	else
		return rsplit(term, ",")
	end
end
-- Concatenate all keys of `tab` into one string, e.g. to build the contents of a character class from a set or
-- mapping keyed by single characters. The order of the keys in the result is unspecified (it follows pairs()).
-- Returns "" for an empty table.
local function concat_keys(tab)
	local res = {}
	for k, _ in pairs(tab) do
		table.insert(res, k)
	end
	return table.concat(res)
end
-- Prepare a list of word objects (tables with at least a `term` field; `pos` is carried over when words are
-- merged) for pronunciation: lowercase every term, mark known unstressed function words with DOTOVER, join "amb"
-- to a following vowel-initial word with a tie, and respell the remaining unstressed "amb" and "per".
-- Works on a deep copy, so the caller's list is not modified. Returns the new list.
local function handle_unstressed_words(words)
	words = m_table.deepCopy(words)
	-- Lowercase all words for ease in further processing.
	for i, wordobj in ipairs(words) do
		wordobj.term = ulower(wordobj.term)
	end

	-- Check if the word at index `i` in `words` is "amb" and the following word begins with a vowel.
	-- "amb" is compared in its DOTOVER-marked form, so this only succeeds once word `i` has been marked below.
	local function is_amb_to_join(words, i)
		return i < #words and words[i].term == "a" .. DOTOVER .. "mb" and rfind(words[i + 1].term, "^h?" .. V)
	end

	-- Starts out false and is only raised when a joinable "amb" is actually seen, so the rebuilding pass below
	-- is skipped for the common case.
	local saw_amb_to_join = false
	-- Mark all unstressed words with DOTOVER, so that split_syllables() doesn't assign stress. We need to do this
	-- before special handling for ], because ] may join to another unstressed word like ], in the
	-- process losing the identity of the two words. In the process, see if ] occurs before a following
	-- vowel-initial word (which may begin with h-).
	for i, wordobj in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of ]). It doesn't actually matter where we put
		-- it, because split_syllables() just looks for DOTOVER anywhere in the word.
		if unstressed_words[wordobj.term] then
			wordobj.term = rsub(wordobj.term, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end

	-- Join ] before vowel-initial word with following word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				table.insert(new_words, {term = words[i].term .. "‿" .. words[i + 1].term, pos = words[i].pos})
				i = i + 2
			else
				table.insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end

	-- Finally, rewrite some unstressed words to get the right pronunciation. Any remaining ] not before a
	-- vowel-initial word is pronounced even in Valencian (where / would be expected), and ] always
	-- has a pronounced <r>.
	local unstressed_word_replacement = {
		["a" .. DOTOVER .. "mb"] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}
	for i, wordobj in ipairs(words) do
		wordobj.term = unstressed_word_replacement[wordobj.term] or wordobj.term
	end

	return words
end
-- Respell certain prefixed words: the <s> of enfons-, endins- and tr_ns- is rewritten as <z>, and a syllable
-- break is inserted in word-initial inex-. Takes the word as a string and returns the respelled string.
-- NOTE(review): the captures in the first three patterns are empty `()`; the character classes they presumably
-- held (what must follow the <s>, and the vowel inside tr_ns) are not visible here. As written, `()` is a
-- position capture, so %1/%2 would insert a number -- confirm against the module history.
-- NOTE(review): the closing `end` of this function is not visible in this copy.
local function fix_prefixes(word)
-- Voiced s in prefix roots -fons-, -dins-, -trans-
word = rsub(word, "^enfons()", "enfonz%1")
word = rsub(word, "^endins()", "endinz%1")
word = rsub(word, "tr()ns()", "tr%1nz%2")
-- in + ex > ineks/inegz
word = rsub(word, "^inex", "in.ex")
return word
-- Add the diaeresis/accent that standard spelling omits in certain word endings (Latinisms in -um, the suffixes
-- -isme/-ist, and -ir verb forms), so that later steps can recognise the hiatus. Takes and returns the word as a
-- string; each rule is anchored at the end of the word.
-- NOTE(review): the leading capture of most patterns is an empty `()`; the class of preceding vowels it
-- presumably held is not visible here, nor are the classes in the second capture of the "future" rules --
-- confirm against the module history before relying on these rules.
-- NOTE(review): the closing `end` of this function is not visible in this copy.
local function restore_diaereses(word)
-- Some structural forms do not have diaeresis per diacritic savings, let's restore it to identify hiatus
word = rsub(word, "()um(s?)$", "%1üm%2") -- Latinisms (-ius is ambiguous but rare)
word = rsub(word, "()isme(s?)$", "%1ísme%2") -- suffix -isme
word = rsub(word, "()ist(s?)$", "%1íst%2") -- suffix -ista
word = rsub(word, "()ir$", "%1ír") -- verbs -ir
word = rsub(word, "()int$", "%1ínt") -- present participle
word = rsub(word, "()ir()$", "%1ïr%2") -- future
word = rsub(word, "(u)ir()$", "%1ïr%2")
word = rsub(word, "()iràs$", "%1ïràs")
word = rsub(word, "(u)iràs$", "%1ïràs")
word = rsub(word, "()ir(e)$", "%1ïr%2")
word = rsub(word, "(u)ir(e)$", "%1ïr%2")
word = rsub(word, "()iran$", "%1ïran")
word = rsub(word, "(u)iran$", "%1ïran")
word = rsub(word, "()iria$", "%1ïria") -- conditional
word = rsub(word, "(u)iria$", "%1ïria")
word = rsub(word, "()ir(ie)$", "%1ïr%2")
word = rsub(word, "(u)ir(ie)$", "%1ïr%2")
return word
-- Normalise written <y>: the digraph <ny> becomes <ñ> first, then remaining <y> is rewritten as <i> according to
-- its neighbours. Takes and returns the word as a string.
-- NOTE(review): both <y> patterns have an empty `()` capture; per the trailing comments they presumably held a
-- "not a vowel" class (the second also excluding syllable separators), but the classes are not visible here.
-- NOTE(review): the closing `end` of this function is not visible in this copy.
local function fix_y(word)
-- y > vowel i else consonant /j/, except ny
word = rsub(word, "ny", "ñ")
word = rsub(word, "y()", "i%1") -- vowel if not next to another vowel
word = rsub(word, "()y", "%1i") -- excluding also syllables separators
return word
-- Apply default mid-vowel qualities to certain word endings by adding an accent to the final <e>/<o>:
-- -ent(s), -er(s), -or(s) get the acute; -è(s), -esos/-eses and -esa get the circumflex hint. Every rule that
-- fires also records a tracking page under "ca-IPA/...". Takes and returns the word as a string.
local function mid_vowel_fixes(word)
	-- Record which default rule fired; `vowel` and `cont` only form the tracking page names.
	local function track_mid_vowel(vowel, cont)
		require("Module:debug/track"){"ca-IPA/" .. vowel, "ca-IPA/" .. vowel .. "/" .. cont}
		return true
	end

	local changed
	-- final -el (not -ell) usually è but not too many cases
	word, changed = rsubb(word, "e(nts?)$", "é%1")
	if changed then
		track_mid_vowel("e", "nt-nts")
	end
	word, changed = rsubb(word, "e(rs?)$", "é%1")
	if changed then
		track_mid_vowel("e", "r-rs")
	end
	word, changed = rsubb(word, "o(rs?)$", "ó%1")
	if changed then
		track_mid_vowel("o", "r-rs")
	end
	word, changed = rsubb(word, "è(s?)$", "ê%1")
	if changed then
		track_mid_vowel("è", "s-blank")
	end
	-- Plural endings -esos/-eses, matching the "sos-sa-ses" tracking label shared with -esa below; the literal
	-- "e(ss)$" would only ever match words ending in -ess.
	word, changed = rsubb(word, "e(s[oe]s)$", "ê%1")
	if changed then
		track_mid_vowel("e", "sos-sa-ses")
	end
	word, changed = rsubb(word, "e(sa)$", "ê%1")
	if changed then
		track_mid_vowel("e", "sos-sa-ses")
	end
	return word
end
-- Spelling-level normalisations applied to a single word before it is split into syllables.
-- `word` is the (lowercased) respelling of one word; `dialect` is one of the codes in export.dialects.
-- Returns the normalised word.
local function word_fixes(word, dialect)
	-- Hide the optional-r notations behind single placeholder characters so the parentheses don't interfere.
	word = rsub(word, "%(rr%)", TEMP_PAREN_RR)
	word = rsub(word, "%(r%)", TEMP_PAREN_R)
	-- Double an <r> or <s> that directly follows a hyphen, so it is read like a word-initial r/s.
	-- NOTE(review): the class [rs] is reconstructed; the pattern had lost its bracket expression.
	word = rsub(word, "%-([rs]?)", "-%1%1")
	if dialect == "val" then
		word = rsub(word, "%-x", "-tx")
	end
	word = rsub(word, "rç$", "rrs") -- silent r only in plurals -rs

	word = fix_prefixes(word) -- internal pause after a prefix
	word = restore_diaereses(word) -- no diaeresis saving
	word = fix_y(word) -- ny > ñ else y > i vowel or consonant
	word = mid_vowel_fixes(word)

	-- all words in pn- (e.g. ] and mn- (e.g. ]) have silent p/m in both Central and Valencian
	word = rsub(word, "^[pm]n", "n")

	-- Respell ch + vowel as tx, before we remove other h's after consonants.
	word = rsub(word, "ch(" .. V .. ")", "tx%1")

	-- Delete h after a consonant. This must happen here, before split_syllables(). We don't delete h after a vowel
	-- yet because it indicates a hiatus.
	word = rsub(word, "(" .. C .. ")h", "%1")

	return word
end
-- Split a run of adjacent vowel letters into syllable objects.
-- `vowels` is a non-empty string of vowels; `saw_dotover` / `saw_lineunder` are recorded on the FIRST syllable
-- as `has_dotover` / `has_lineunder`. Returns a list of tables with fields `onset`, `vowel`, `coda` (and
-- `separator` on the first one).
-- The first vowel is always a nucleus. Each later syllable may take a leading i/u as its onset (a glide). If
-- the run ends in a syllable whose nucleus is plain i or u, that vowel is folded into the previous syllable's
-- coda instead (falling diphthong).
local function split_vowels(vowels, saw_dotover, saw_lineunder)
	local syllables = {{onset = "", vowel = usub(vowels, 1, 1), coda = "", separator = "", has_dotover = saw_dotover,
		has_lineunder = saw_lineunder}}
	vowels = usub(vowels, 2)

	while vowels ~= "" do
		local syll = {onset = "", vowel = "", coda = ""}
		-- NOTE(review): the glide class [iu] is reconstructed; the pattern had lost its bracket expression.
		syll.onset, syll.vowel, vowels = rmatch(vowels, "^([iu]?)(.)(.-)$")
		table.insert(syllables, syll)
	end

	local count = #syllables
	if count >= 2 and (syllables[count].vowel == "i" or syllables[count].vowel == "u") then
		syllables[count - 1].coda = syllables[count].vowel
		syllables[count] = nil
	end

	return syllables
end
-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`,
-- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and
-- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the
-- index of the syllable with primary stress) and `is_prefix` (true if the word is a prefix, i.e. it ends in '-').
-- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be
-- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms.
-- The list also gets `is_suffix` (true if the word began with '-'). `may_be_uppercase` widens the vowel list used
-- for scanning to include capital vowels.
-- NOTE(review): in this copy many bracketed expressions (pattern character classes and table subscripts such as
-- the index into `syllables`) and the closing `end`/`else` lines are not visible; the notes below mark the places
-- where that matters. Do not treat the code as runnable until they are restored from the module history.
local function split_syllables(word, stress_prefixes, may_be_uppercase)
local syllables = {}
local saw_dotover = false
local remainder = word
local is_prefix = false
-- A trailing hyphen marks a prefix; strip it before scanning.
if remainder:find("%-$") then -- prefix
is_prefix = true
remainder = remainder:gsub("%-$", "")
local is_suffix = false
-- A leading hyphen marks a suffix; strip it before scanning.
if remainder:find("^%-") then -- suffix
is_suffix = true
remainder = remainder:gsub("^%-", "")
-- Consume the word left to right as alternating runs of consonants and vowels.
while remainder ~= "" do
local consonants, vowels
-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY.
-- Not sure why.
local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or
"aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER
-- NOTE(review): both patterns below presumably embedded `vowel_list` in a (negated / plain) character class
-- before the `*`; that part is missing here.
consonants, remainder = rmatch(remainder, "^(*)(.-)$")
vowels, remainder = rmatch(remainder, "^(*)(.-)$")
-- DOTOVER / LINEUNDER are recorded as flags and then removed from the vowel run.
local this_saw_dotover = not not rfind(vowels, DOTOVER)
if this_saw_dotover then
saw_dotover = true
vowels = vowels:gsub(DOTOVER, "")
local this_saw_lineunder = not not rfind(vowels, LINEUNDER)
if this_saw_lineunder then
vowels = vowels:gsub(LINEUNDER, "")
if vowels == "" then
if #syllables > 0 then
-- NOTE(review): presumably appends the trailing consonants to the coda of the LAST syllable; the subscript
-- on `syllables` is missing here.
syllables.coda = syllables.coda .. consonants
-- word without vowels, e.g. foot boundary |
table.insert(syllables, {onset = consonants, vowel = "", coda = ""​, separator = ""})
local onset = consonants
local first_vowel = usub(vowels, 1, 1)
-- The first vowel letter is moved into the onset in two cases: ü, or u followed by more vowels, after an
-- onset matching the first pattern (class missing here -- TODO confirm which consonants); or a word-initial
-- i/I (optionally after h/H) that is followed by more vowels.
if (rfind(onset, "$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u")))
or ((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and
(first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I"))
onset = onset .. usub(vowels, 1, 1)
vowels = usub(vowels, 2)
local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder)
-- NOTE(review): presumably prefixes the onset of the FIRST syllable returned; subscript missing here.
vsyllables.onset = onset .. vsyllables.onset
for _, s in ipairs(vsyllables) do
table.insert(syllables, s)
-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid
for i = 2, #syllables do
-- NOTE(review): `current`/`previous` presumably refer to syllables i and i - 1; subscripts missing here.
local current = syllables
local previous = syllables
-- NOTE(review): the loop condition is truncated in this copy; it appears to look the onset up in
-- valid_onsets after stripping something from its end and removing underscores.
while not (current.onset == "" or valid_onsets?$", ""), "_", "")]) do
local letter = usub(current.onset, 1, 1)
current.onset = usub(current.onset, 2)
-- NOTE(review): the separator class is missing from the pattern below.
if rfind(letter, "") then -- syllable separators
current.separator = letter
previous.coda = previous.coda .. letter
if rfind(letter, tie_c) then
-- Detect stress
for i, syll in ipairs(syllables) do
-- NOTE(review): the class of accented (stress-bearing) vowels is missing from the pattern below.
if rfind(syll.vowel, "^$") then
syll.stressed = true
-- primary stress: the last one stressed without LINEUNDER
if not syll.has_lineunder then
syllables.stress = i
-- Assign default stress
-- Only when no stress was marked, no DOTOVER was seen, and the word is not a prefix (unless stress_prefixes).
if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then
local count = #syllables
if count == 1 then
if syllables.vowel ~= "" then -- vowel-less words don't get stress
syllables.stress = 1
local final = syllables
-- Take account of tie symbols (apostrophes and ‿).
-- NOTE(review): the class in the first coda pattern is missing; the visible second alternative sends stress
-- to the second-to-last syllable when the final coda is <n> (ties allowed) after e, i or ï.
if rfind(final.coda, "^*$") or (rfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and (
final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then
syllables.stress = count - 1
syllables.stress = count
if syllables.stress then
syllables.stressed = true
syllables.is_prefix = is_prefix
syllables.is_suffix = is_suffix
return syllables
-- Rebuild a word string from a syllable list: pieces are collected in `parts` through the local helper ins() and
-- concatenated at the end. The visible conditions consult `syllables.is_suffix`, `syllables.is_prefix` and each
-- syllable's `has_dotover` / `has_lineunder` flags.
-- NOTE(review): the statements inside each `if` (what is inserted for a suffix/prefix, for the DOTOVER and
-- LINEUNDER flags, and for the syllable's own onset/vowel/coda) and all closing `end` lines are not visible in
-- this copy -- restore them from the module history.
local function reconstitute_word_from_syllables(syllables)
local parts = {}
-- Append one piece of text to the output buffer.
local function ins(txt)
table.insert(parts, txt)
if syllables.is_suffix then
for _, syl in ipairs(syllables) do
if syl.has_dotover then
if syl.has_lineunder then
if syllables.is_prefix then
return table.concat(parts)
-- Replace every precomposed vowel-with-dot-above in `text` by its base vowel followed by the combining DOTOVER,
-- using the decompose_dotover mapping. Characters not in the mapping are left untouched. Returns the new string.
local function decompose_respelling(text)
	-- Build a character class from the mapping's keys so only those characters are looked up; with an empty
	-- pattern nothing would ever be decomposed and `dotover_keys` would go unused.
	local dotover_keys = concat_keys(decompose_dotover)
	return rsub(text, "[" .. dotover_keys .. "]", decompose_dotover)
end
-- Canonicalise a respelling string before word-level processing: collapse and trim whitespace, strip or convert
-- punctuation, and turn pause-like punctuation into the IPA foot boundary " | ". Returns the cleaned string.
-- NOTE(review): several patterns below are empty strings or are missing the punctuation class described by the
-- neighbouring comment (the bracketed classes are not visible in this copy); as written, an empty pattern
-- matches at every position. Restore the classes from the module history. The closing `end` lines of the inner
-- and outer functions are also not visible.
local function canon_respelling(text)
-- Collapse runs of whitespace to one space and trim both ends.
local function canon_spaces(text)
text = rsub(text, "%s+", " ")
text = rsub(text, "^ ", "")
text = rsub(text, " $", "")
return text
text = canon_spaces(text)
-- eliminate upside down punctuation
text = rsub(text, "", "")
-- eliminate utterance-final punctuation
text = rsub(text, "$", "")
-- eliminate double and triple quotes
text = rsub(text, "''+", "")
-- Convert commas and em/en dashes to IPA foot boundaries; require a space after commas and en dashes (for the
-- latter, in particular, to avoid treating the en dash in 'Bose–Einstein condensate' as a foot boundary.
text = rsub(text, " * ", " | ")
text = rsub(text, " * *", " | ")
-- ... in phrases like ] and ] become foot boundaries
text = rsub(text, " *%.%.%. *", " | ")
-- remaining commas and en dashes become spaces
text = rsub(text, "", " ")
-- may need to eliminate extraneous spaces again, e.g. if there was a space before or after an eliminated
-- punctuation mark
text = canon_spaces(text)
-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
text = rsub(text, "() * *()", "%1 | %2")
return text
-- Vowel-to-IPA lookup tables. The three dialect tables each hold three entries whose values differ by dialect;
-- IPA_vowels holds the remaining accented-vowel entries.
-- NOTE(review): every key (`["x"] =`) and the closing `}` of each table is missing in this copy, so the tables do
-- not parse. The dialect tables presumably key on the hint vowels (e.g. ê, ë, ô) and IPA_vowels on the other
-- accented vowels, but the exact key for each value cannot be determined from here -- restore from the module
-- history.
local IPA_vowels_central = {
= "ɛ", = "ɛ", = "ɔ",
local IPA_vowels_balearic = {
= "ə", = "ɛ", = "ɔ",
local IPA_vowels_valencian = {
= "e", = "e", = "o",
local IPA_vowels = {
= "a",
= "ɛ", = "ɛ", = "ɛ", = "e",
= "i", = "i",
= "ɔ", = "ɔ", = "o",
= "u", = "u",
-- Context-free rewriting of written consonants (and consonantal i/y/u) into working IPA symbols.
-- `cons` is a string of consonant letters (an onset or coda); returns the rewritten string.
-- Order matters: digraphs are handled before their component letters, and <j> must become ʒ before i/y are
-- turned into the glide j.
-- NOTE(review): the bracketed classes in the dj/tj, dz/tz, c/q and u/ü rules are reconstructed; the patterns had
-- lost their bracket expressions (leaving an unconditional j -> ʤ, z -> ʣ and two empty patterns).
local function replace_context_free(cons)
	cons = rsub(cons, "ŀ", "l")

	-- Single r is a tap; a doubled tap is then collapsed into the trill.
	cons = rsub(cons, "r", "ɾ")
	cons = rsub(cons, "ɾɾ", "r")
	cons = rsub(cons, "ss", "s")
	cons = rsub(cons, "ll", "ʎ")
	cons = rsub(cons, "ñ", "ɲ") -- hint ny > ñ

	-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert them
	-- to tied multi-character affricates at the end of join_syllables().
	cons = rsub(cons, "[dt]j", "ʤ")
	cons = rsub(cons, "tx", "ʧ")
	cons = rsub(cons, "[dt]z", "ʣ")

	cons = rsub(cons, "ç", "s")
	cons = rsub(cons, "[cq]", "k")
	cons = rsub(cons, "h", "")
	cons = rsub(cons, "j", "ʒ")
	-- Don't replace x -> ʃ yet so we can distinguish x from manually specified ʃ.
	cons = rsub(cons, "i", "j") -- must be after j > ʒ
	cons = rsub(cons, "y", "j") -- must be after j > ʒ and fix_y
	cons = rsub(cons, "[uü]", "w")
	cons = rsub(cons, "'", "‿")

	return cons
end
-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code
-- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the
-- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move
-- more of the work done in preprocess_word(), e.g. most of replace_context_free(), here.
local function postprocess_general(text, dialect)
local function verify(cond, msg)
if not cond then
error(("Internal error: %s; processed respelling at this point is '%s'"):format(msg, text))
return true
local voiced = listToSet {"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"}
local voiced_keys = concat_keys(voiced)
local voiceless = listToSet {"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"}
local voiceless_keys = concat_keys(voiceless)
local voicing = { = "b", = "d", = "g", = "v", = "z", = "ʒ", = "ʤ",
= "ʤ"}
local voicing_keys = concat_keys(voicing)
local devoicing = {}
for k, v in pairs(voicing) do
devoicing = k
local devoicing_keys = concat_keys(devoicing)
------------------ Handle <x>
-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles
-- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV.
text = rsub_repeatedly(text, "(" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V ..
")", function(e, pre, post)
-- Preserve other character separators (especially the tie character ‿).
pre = pre:gsub("%.", "")
post = post:gsub("%.", "")
return e .. pre .. "g.z" .. post
-- -x- at the beginning of a coda becomes , e.g. ], ], ]; but not elsewhere in
-- the coda, e.g. in ], ]; words with in -nx such as ], ], ] need
-- respelling with ; words ending in vowel + x like ] need respelling with
text = rsub(text, "(" .. V .. stress_c .. "*)x", "%1ks")
if dialect == "val" then
-- Word-initial <x> as well as <x> after a consonant other than /j/ (including in the coda, e.g. ])
-- becomes .
text = rsub(text, "#x", "#ʧ")
text = rsub(text, "(" .. charsep_c .. "*)x", "%1ʧ")
-- Other x becomes
text = rsub(text, "x", "ʃ")
-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries?
text = rsub(text, "s(" .. charsep_c .. "*)s", "%1s")
------------------ Coda consonant losses
-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when
-- absolutely word-finally before a vowel or end of utterance (e.g. ] has /k/ in Balearic and
-- Valencian but not ]). Must precede consonant assimilations.
local boundary = dialect == "cen" and "(.)" or "()"
text = rsub(text, "m" .. boundary, "m%1")
text = rsub(text, "()" .. boundary, "%1%2")
text = rsub(text, "" .. boundary, "ŋ%1")
if dialect == "val" or dialect == "bal" then
local before_cons = "(" .. separator_c .. "*" .. C .. ")"
text = rsub(text, "m" .. before_cons, "m%1")
text = rsub(text, "()" .. before_cons, "%1%2")
text = rsub(text, "" .. before_cons, "ŋ%1")
-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and
-- t + lateral/nasal assimilation.
text = rsub(text, "st(" .. sylsep_c .. "*)", "s%1")
------------------ Consonant assimilations
if dialect == "cen" then
-- v > b in onsets (not in codas, e.g. ] and ] ). This needs to precede
-- assimilation of nb -> mb.
text = rsub(text, "v(" .. C .. "*" .. V ..")", "b%1")
-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation
-- because there are too many exceptions, e.g. ], ], ]. Instead, we require that
-- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either,
-- e.g. ] has in GDLC but in DNV.
-- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words,
-- move stuff below that special-cases tll in Valencian here.
text = rsub(text, "t(" .. sylsep_c .. ")()", "%2%1%2")
-- n + labial > labialized assimilation
text = rsub(text, "n(" .. separator_c .. "*)", "m%1")
text = rsub(text, "n(" .. separator_c .. "*)", "ɱ%1")
-- n + velar > velarized assimilation
text = rsub(text, "n(" .. separator_c .. "*)", "ŋ%1")
-- l/n + palatal > palatalized assimilation
text = rsub(text, "()(" .. separator_c .. "*)", function(ln, palatal)
ln = ({ = "ʎ", = "ɲ"})
return ln .. palatal
-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in ] or derived terms like ]
-- NOTE: Per , it does apply in these scenarios but the result is
-- somewhere between and , which is why it isn't shown in GDLC.
-- text = rsub(text, "ɲs", "%1ʃ")
------------------ Handle <r>
-- In replace_context_free(), we converted single r to ɾ and double rr to r.
if dialect == "cen" then
text = rsub(text, TEMP_PAREN_R, "")
text = rsub(text, TEMP_PAREN_RR, "r")
elseif dialect == "bal" then
text = rsub(text, TEMP_PAREN_R, "")
text = rsub(text, TEMP_PAREN_RR, "")
verify(dialect == "val", ("Unrecognized dialect '%s'"):format(dialect))
text = rsub(text, TEMP_PAREN_R, "ɾ")
text = rsub(text, TEMP_PAREN_RR, "ɾ")
if dialect ~= "val" then
-- Coda /ɾ/ -> /r/
-- FIXME: This is inherited from the older code. Correct?
text = rsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r")
-- ɾ -> r word-initially or after ; needs to precede voicing assimilation as <s> will be voiced to before
-- /ɾ/.
text = rsub(text, "(" .. sylsep_c .. "*)ɾ", "%1r")
------------------ Voicing assimilation
-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in
-- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we
-- do it character by character.
local chars = split_into_chars(text)
-- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because
-- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which
-- are always "##".)
for i = #chars - 2, 3, -1 do
-- We are looking for two consonants next to each other, possibly separated by a syllable or word divider.
-- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally.
-- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda.
local first = chars
-- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which
-- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that
-- consonant (empty string forces devoicing).
local second
-- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning
-- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of <s> word-internally
-- separately, in preprocess_word() [FIXME: maybe move much of the processing in preprocess_word() into this
-- function].
local word_boundary_before_vowel
if not rfind(first, C) then
-- leave `second` at nil; no assimilation
elseif chars == "#" then -- word boundary
if chars == " " then
-- chars should always be "#"
verify(chars == "#", "Word boundary followed by space but not #")
if rfind(chars, C) then
second = chars
second = ""
word_boundary_before_vowel = true
second = ""
elseif rfind(chars, sylsep_c) then -- syllable boundary
if rfind(chars, C) then
second = chars
second = ""
elseif rfind(chars, C) then
second = chars
-- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation
if second then
-- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary.
local in_coda = false
local j = i - 1
while true do
verify(j > 0, "Missing word boundary at beginning of overall respelling")
if rfind(chars, "") then
elseif rfind(chars, V) then
in_coda = true
j = j - 1
if in_coda then
if word_boundary_before_vowel and rfind(first, "") then
-- leave alone
elseif voiced and voicing or word_boundary_before_vowel and rfind(first, "") then
chars = voicing
elseif (voiceless or second == "") and devoicing then
chars = devoicing
text = table.concat(chars)
-- gn -> ŋn e.g. ] (including word-initial gn- e.g. ], ])
-- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words
-- (here I'm guessing yes).
if dialect ~= "cen" then
text = rsub(text, "#gn", "#n")
text = rsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")
-- gʒ > d͡ʒ
-- FIXME: We need to investigate if it operates across words
text = rsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ")
-- sʃ -> ʃ (]), zʒ -> ʒ (])
if dialect ~= "val" then
text = rsub(text, "s(" .. separator_c .. "*ʃ)", "%1")
text = rsub(text, "z(" .. separator_c .. "*ʒ)", "%1")
------------------ Gemination of <bl>, <gl>
if dialect ~= "val" then
-- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use <b_l> or <g_l>.
-- This must follow v > b above. To force a hard ungeminated or , use <_b> or <_g>.
text = rsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")()l", "%1%3%2%3l")
else -- Valencian; undo manually written 'bbl', 'ggl' in words like ], ]
text = rsub(text, "()(" .. sylsep_c .. ")%1l", "%2%1l")
------------------ Lenition of voiced stops
-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a
-- vowel and (except for <d>) after <l> and <ll> (cf. GDLC ] ). This also happens across
-- word boundaries but doesn't happen after stops, nor in Central Catalan after , or (and hence probably
-- not after either, although I can't find any examples in GDLC).
-- In Valencian, <b> doesn't lenite (at least formally?), but <d> and <g> do lenite after , or .
-- Balearic is like Valencian in not leniting <b>, and probably like Central Catalan otherwise.
local lenite_bdg = { = "β", = "ð", = "ɣ"}
if dialect == "cen" then
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, bdg) return before .. lenite_bdg end)
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, bg) return before .. lenite_bdg end)
elseif dialect == "val" then
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, dg) return before .. lenite_bdg end)
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)(g)",
function(before, g) return before .. lenite_bdg end)
verify(dialect == "bal", ("Unrecognized dialect '%s'"):format(dialect))
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)()",
function(before, dg) return before .. lenite_bdg end)
text = rsub(text, "(" .. separator_c .. "*" .. separator_c .. "*)(g)",
function(before, g) return before .. lenite_bdg end)
------------------ Vowel reduction
-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan).
if dialect ~= "val" then
-- The following rules seem to apply, based on the old code:
-- (1) Stressed a and e are never reduced.
-- (2) Unstressed e directly following ə is not reduced.
-- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced.
-- (4) Written <ee> when both vowels precede the primary stress is reduced to . (This rule preempts #2.)
-- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all.
-- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was
-- implemented.
-- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances.
local words = rsplit(text, " ")
for j, word in ipairs(words) do
local chars = split_into_chars(word)
-- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we
-- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we
-- shouldn't run off the end (and have assertions to check this).
local seen_primary_stress = false
for i = 2, #chars - 1 do
local this = chars
if chars == AC then
seen_primary_stress = true
if (this ~= "a" and this ~= "e") or rfind(chars, stress_c) then
-- Not a/e, or a stressed vowel; continue
local reduction = true
local prev, prev_stress, nxt, nxt_stress
if not rfind(chars, sylsep_c) then
prev = ""
prev = chars -- this should be non-nil as chars is a syllable separator (not #)
verify(prev, "Missing # at word boundary")
prev_stress = ""
if rfind(prev, stress_c) then
prev_stress = prev
prev = chars
-- As above; chars is a stress indicator (not #).
verify(prev, "Missing # at word boundary")
if not rfind(chars, sylsep_c) then
nxt = ""
-- leave nxt at nil
nxt = chars
nxt_stress = chars
-- chars is a syllable separator, so chars should not be a word boundary, so
-- chars should exist.
verify(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary")
if this == "e" and rfind(prev, "ə") then
reduction = false
elseif this == "e" and rfind(nxt, "") then
reduction = false
elseif this == "e" and nxt == "e" and not rfind(nxt_stress, AC) then
-- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary.
if not seen_primary_stress then
chars = "ə"
reduction = false
if reduction then
chars = "ə"
words = table.concat(chars)
text = table.concat(words, " ")
if dialect == "cen" then
-- Reduction of unstressed o (not before w)
text = rsub(text, "o()", "u%1")
elseif dialect == "bal" then
-- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/;
-- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks
-- or word separators).
text = rsub(text, "o(*" .. stress_c .. ")", "u%1")
-- Final losses.
text = rsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/
text = rsub(text, "()s#", "%1#") -- homophone plurals -xs, -igs, -çs
if dialect ~= "val" then
-- Remove j before palatal obstruents
text = rsub(text, "j(" .. sylsep_c .. "*)", "%1")
else -- Valencian
-- Fortition of palatal fricatives
text = rsub(text, "ʒ", "ʤ")
text = rsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z")
if dialect ~= "cen" then
-- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic.
-- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling
-- instead.
text = rsub(text, "(a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(m" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1")
---------- Convert pseudo-symbols to real ones.
-- Convert g to IPA ɡ.
text = rsub(text, "g", "ɡ")
-- Convert pseudo-affricate symbols to full affricates.
local full_affricates = { = "t͡s", = "d͡z", = "t͡ʃ", = "d͡ʒ" }
text = rsub(text, "()", full_affricates)
---------- Generate IPA stress marks.
-- Convert acute and grave to IPA stress marks.
text = rsub(text, AC, "ˈ")
text = rsub(text, GR, "ˌ")
-- Move IPA stress marks to the beginning of the syllable.
text = rsub_repeatedly(text, "()(*)(" .. ipa_stress_c .. ")", "%1%3%2")
-- Suppress syllable divider before IPA stress indicator.
text = rsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1")
-- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above.
-- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in ].
-- text = rsub_repeatedly(text, "ˈ(+)ˈ", "ˌ%1ˈ")
-- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.)
-- text = rsub_repeatedly(text, "ˈ(*#" .. PREFIX_MARKER .. ")", "ˌ%1")
-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant
-- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose.
text = rsub(text, "", "")
text = mw.ustring.toNFC(text)
return text
-- Validate the parsed syllables of a single word and convert them into a fresh list of syllable records
-- (`syllables_IPA`) whose onset/vowel/coda fields have been passed through the letter-to-IPA replacements below.
--
-- Parameters, as used by the visible code:
--   `syllables`: list of syllable records read via the fields `onset`, `vowel`, `coda` and `stressed`; the list
--       itself also carries the fields `stress`, `is_prefix` and `is_suffix`, which are copied to the result.
--   `suffix_syllables`: list of syllable records appended unchanged to the end of the result.
--   `dialect`: compared against "cen" and "bal" when choosing the vowel conversion table.
--   `pos`: not referenced in the lines visible here.
--   `orig_word`: used only when formatting error messages.
-- Returns the new `syllables_IPA` list. Raises an error if the stressed vowel or a final -r/-rs is judged ambiguous.
--
-- NOTE(review): several patterns and index expressions in this function appear to have lost their bracketed
-- content (e.g. empty pattern strings, `syllables.vowel` read directly off the list), and block terminators are
-- missing; confirm against the canonical source before relying on the exact conditions.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
-- Stressed vowel is ambiguous
if syllables.stress then
local stressed_vowel = syllables.vowel
if rfind(stressed_vowel, "") then
local marks = { = {AC, GR, CFLEX, DIA}, = {AC, GR, CFLEX}}
local marked_vowels = {}
-- Build the list of suggested accented spellings shown in the error message.
for _, mark in ipairs(marks) do
table.insert(marked_vowels, stressed_vowel .. mark)
error(("In respelling '%s', the stressed vowel '%s' is ambiguous. Please mark it with an acute, " ..
"grave, or combined accent: %s."):format(orig_word, stressed_vowel,
mw.text.listToText(marked_vowels, nil, " or ")))
-- Final -r is ambiguous in many cases.
local final = syllables
-- Stressed final r after a or i in non-monosyllables is treated as (r), i.e. verbal infinitives are assumed (NOTE:
-- not always the case, e.g. there are many adjectives and nouns in -ar that should be marked as '(rr)', and
-- several loanword nouns in -ir that should be marked as 'rr'). Likewise for stressed final r or rs after é in
-- non-monosyllables (which are usually adjectives or nouns with the -er ending, but may be verbal infinitives,
-- which should be marked as 'ê(r)'). That is, it disappears other than in Valencian. All other final r and final
-- rs are considered ambiguous and need to be rewritten using rr, (rr) or (r).
if #syllables > 1 and final.stressed then
if final.coda == "r" and rfind(final.vowel, "") or final.coda == "rs" and final.vowel == "é" or
final.vowel == "ó" and rfind(final.coda, "^rs?$") and rfind(final.onset, "") then
final.coda = TEMP_PAREN_R
if rfind(final.coda, "^rs?$") or rfind(final.coda, "rs?$") then
error(("In respelling '%s', final -r by itself or in -rs is ambiguous except in the verbal endings -ar or " ..
"-ir, in the nominal or adjectival endings -er(s) and -or(s). In all other cases it needs to be " ..
"rewritten using one of 'rr' (pronounced everywhere), '(rr)' (pronounced everywhere but Balearic) or " ..
"'(r)' (pronounced only in Valencian). Note that adjectives in -ar usually need rewriting using '(rr)'; " ..
"nouns in -ar referring to places should be rewritten using '(r)'; and loanword nouns in -ir usually " ..
"need rewriting using 'rr'."):format(orig_word))
-- Start the result with the list-level properties; the per-syllable records are copied in the loop below so that
-- the conversions that follow operate on new tables.
local syllables_IPA = {stress = syllables.stress, is_prefix = syllables.is_prefix, is_suffix = syllables.is_suffix}
for key, val in ipairs(syllables) do
syllables_IPA = {onset = val.onset, vowel = val.vowel, coda = val.coda, stressed = val.stressed}
-- Replace letters with IPA equivalents
for i, syll in ipairs(syllables_IPA) do
-- Voicing of s
if syll.onset == "s" and i > 1 and rfind(syllables.coda, "^?$") then
syll.onset = "z"
-- Onset/coda rewrites that depend on the vowel of the syllable.
if rfind(syll.vowel, "^$") then
syll.onset = rsub(syll.onset, "tg$", "ʤ")
syll.onset = rsub(syll.onset, "$", { = "s", = "ʒ"})
syll.onset = rsub(syll.onset, "u$", { = "k", = "g"})
syll.coda = rsub(syll.coda, "igs?$", "iʤ")
-- Remaining consonant conversions are delegated to replace_context_free().
syll.onset = replace_context_free(syll.onset)
syll.coda = replace_context_free(syll.coda)
-- Vowel conversion uses a dialect-specific table for "cen" and "bal".
syll.vowel = rsub(syll.vowel, ".",
dialect == "cen" and IPA_vowels_central or
dialect == "bal" and IPA_vowels_balearic or
syll.vowel = rsub(syll.vowel, ".", IPA_vowels)
-- Append any suffix syllables supplied by the caller after the word's own syllables.
for _, suffix_syl in ipairs(suffix_syllables) do
table.insert(syllables_IPA, suffix_syl)
return syllables_IPA
-- Given a single substitution spec, `to`, figure out the corresponding value of `from` used in a complete
-- substitution spec. `pagename` is the name of the page, either the actual one or taken from the `pagename` param.
-- `whole_word`, if set, indicates that the match must be to a whole word (it was preceded by ~).
--
-- The function turns `to` into a Lua pattern (`escaped_from`) by a sequence of textual rewrites, wraps it in a
-- capture, and matches it against `pagename` with rmatch(). It returns the matched substring of `pagename`. It
-- raises an error if the match is identical to `to` (the replacement would have no effect) or if nothing matches.
--
-- NOTE(review): several pattern strings below are empty or truncated (bracketed character classes appear to have
-- been lost), and block terminators are missing; confirm the exact patterns against the canonical source.
local function convert_single_substitution_to_original(to, pagename, whole_word)
-- Replace specially-handled characters with a class matching the character and possible replacements.
local escaped_from = to
-- Handling of '(rr)', '(r)', '.' and '-' needs to be done before calling pattern_escape(); otherwise they will be
-- escaped.
escaped_from = escaped_from:gsub("%(rr%)", "r")
escaped_from = escaped_from:gsub("%(r%)", "r")
escaped_from = escaped_from:gsub("ks", "x"):gsub("Ks", "X"):gsub("gz", "x"):gsub("()%1l", "%1l"):gsub("", "")
-- From here on the string is a pattern: literal text has been escaped, and the rewrites below add optionality.
escaped_from = require(strutil_module).pattern_escape(escaped_from)
escaped_from = escaped_from:gsub("rr", "rr?")
escaped_from = escaped_from:gsub("ss", "ss?")
escaped_from = escaped_from:gsub("ʃ", "")
-- This is tricky, because we already passed `escaped_from` through pattern_escape() causing a hyphen to get a
-- % sign before it, and have to double up the percent signs to match and replace a literal %.
escaped_from = escaped_from:gsub("%%%-", "%%-?")
-- Tie sign (‿) should match against space, hyphen or nothing in the original.
escaped_from = escaped_from:gsub("‿", "?")
escaped_from = rsub(escaped_from, "",
function(v) return " .. "]" end)
-- Make the dot-over and line-under diacritics optional in the match.
escaped_from = escaped_from:gsub(DOTOVER, DOTOVER .. "?"):gsub(LINEUNDER, LINEUNDER .. "?")
-- Capture the whole pattern so that rmatch() returns the matched text.
escaped_from = "(" .. escaped_from .. ")"
if whole_word then
escaped_from = "%f" .. escaped_from .. "%f"
local match = rmatch(pagename, escaped_from)
if match then
if match == to then
error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
format(to, pagename))
return match
error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename))
-- Apply a substitution spec (`respelling`) to `pagename` and return the resulting respelling string.
--
-- Visible flow: the comma-separated substitutions are extracted from `respelling`; the working string is then
-- reset to `pagename`. Each substitution that matches `export.mid_vowel_hint_c` in full is treated as a mid vowel
-- hint (at most one is allowed); all others are collected in `regular_subs`. A mid vowel hint, if present, is
-- applied to the stressed vowel of the syllabified word (after splitting off a trailing suffix in some cases); the
-- regular substitutions are then applied one at a time, each of which must match exactly once.
--
-- `pos` is only compared against "adverb" (or tested for absence) when deciding whether to split off the suffix.
-- `allow_mid_vowel_hints`, when false, makes a mid vowel hint an error. `parse_err` is called with a message for
-- every user-facing error reported here.
--
-- NOTE(review): several patterns and index expressions below are empty or truncated (bracketed content appears to
-- have been lost, e.g. the pattern passed to rmatch() on the first line and `syllables.vowel`), the parse_err()
-- call about disallowed mid vowel hints is cut off, and block terminators are missing. Confirm against the
-- canonical source.
local function apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err)
local subs = split_on_comma(rmatch(respelling, "^%$"))
-- Substitutions are applied to the pagename, not to the spec itself.
respelling = pagename
local mid_vowel_hint
local regular_subs = {}
for _, sub in ipairs(subs) do
if rfind(sub, "^" .. export.mid_vowel_hint_c .. "$") then
if mid_vowel_hint then
parse_err(("Specified mid vowel hint twice, '%s' and '%s'"):format(
mid_vowel_hint, sub))
mid_vowel_hint = sub
table.insert(regular_subs, sub)
if mid_vowel_hint then
if not allow_mid_vowel_hints then
parse_err(("Mid vowel hint '%s' not allowed when apply one substitution spec to multiple words"):format(
local suffix = ""
-- FIXME: This duplicates logic in to_IPA().
if not pos or pos == "adverb" then
local part_before_ment, ment = rmatch(respelling, "^(.*)(mnt)$")
if part_before_ment and (pos == "adverb" or not rfind(part_before_ment, "$") and
rfind(part_before_ment, V .. ".*" .. V)) then
suffix = ment
respelling = part_before_ment
local syllables = split_syllables(respelling, "stress prefixes", "may be uppercase")
local stressed_vowel = syllables.vowel
if stressed_vowel == mid_vowel_hint then
-- do nothing
elseif rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") or
rfind(mid_vowel_hint, "") and rfind(stressed_vowel, "") then
syllables.vowel = mid_vowel_hint
parse_err(("Stressed vowel '%s' not compatible with mid vowel hint '%s'"):format(
stressed_vowel, mid_vowel_hint))
-- Rebuild the word from its syllables and re-attach any suffix split off above.
respelling = reconstitute_word_from_syllables(syllables) .. suffix
for _, sub in ipairs(regular_subs) do
local from, escaped_from, to, escaped_to, whole_word
if rfind(sub, "^~") then
-- whole-word match
sub = rmatch(sub, "^~(.*)$")
whole_word = true
-- 'FROM:TO' gives both sides explicitly; otherwise the spec is the replacement and the original is inferred.
if sub:find(":") then
from, to = rmatch(sub, "^(.-):(.*)$")
to = sub
from = convert_single_substitution_to_original(to, pagename, whole_word)
if from then
local strutil = require(strutil_module)
escaped_from = strutil.pattern_escape(from)
if whole_word then
escaped_from = "%f" .. escaped_from .. "%f"
escaped_to = strutil.replacement_escape(to)
-- Require exactly one match so that the substitution is unambiguous.
local subbed_respelling, nsubs = rsubn(respelling, escaped_from, escaped_to)
if nsubs == 0 then
parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format(
from, to, respelling))
elseif nsubs > 1 then
parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " ..
"more context"):format(from, to, respelling))
respelling = subbed_respelling
return respelling
-- Map from each accepted part-of-speech spelling (abbreviation or full name) to its canonical full name. A key
-- that is absent from this table is not a recognized part of speech.
local canonicalize_pos = {
	n = "noun",
	noun = "noun",
	v = "verb",
	vb = "verb",
	verb = "verb",
	a = "adjective",
	adj = "adjective",
	adjective = "adjective",
	av = "adverb",
	adv = "adverb",
	adverb = "adverb",
	o = "other",
	other = "other",
}
-- Split an optional 'POS/' prefix off `respelling`. Returns two values: the part of speech (nil when no prefix was
-- matched) and the remaining respelling. An empty remainder is replaced with "+". When a prefix is found but is
-- not accepted, `parse_err` is called with a message listing the keys of `canonicalize_pos`.
--
-- NOTE(review): the first capture of the match pattern and the `canonicalize_pos` lookups appear to have lost
-- their bracketed content (the table is tested and assigned without an index), and block terminators are missing.
-- Confirm against the canonical source.
local function parse_off_pos(respelling, parse_err)
local pos, rest = respelling:match("^(+)/(.*)$")
if pos then
if not canonicalize_pos then
-- Collect the accepted spellings for the error message; pairs() order is unspecified.
local valid_pos = {}
for vp, _ in pairs(canonicalize_pos) do
table.insert(valid_pos, vp)
parse_err(("Unrecognized part of speech '%s', should be one of %s"):format(pos,
table.concat(valid_pos, ", ")))
pos = canonicalize_pos
respelling = rest
-- A bare 'POS/' with nothing after it is treated as 'POS/+'.
if respelling == "" then
respelling = "+"
return pos, respelling
-- Parse a respelling given by the user, allowing for '+' for pagename, mid vowel hints in place of a respelling and
-- substitution specs like '' or . In general, return an object {words = {WORD, WORD, ...}} where
-- WORD is of the form {term = PARSED_RESPELLING, pos = POS}. Other fields are set in special cases: If a raw respelling
-- was seen, the fields `raw_phonemic` and/or `raw_phonetic` are set; if '?' is seen, the field `unknown` is set; and if
-- '-' is seen, the field `omitted` is set.
--
-- `parse_err` is a callback invoked with a message string whenever a user-facing error is detected.
--
-- NOTE(review): many patterns in this function are empty or truncated (bracketed content, including the examples in
-- the comment above, appears to have been lost), index expressions such as `respelling_words` / `pagename_words`
-- inside the word-by-word loop have no subscript, and block terminators are missing. Confirm against the canonical
-- source before relying on the exact conditions.
local function parse_respelling(respelling, pagename, parse_err)
-- Special single-character respellings: '?' (unknown) and '-' (omitted).
if respelling == "?" then
return {
unknown = true
if respelling == "-" then
return {
omitted = true
-- Raw IPA: an optional 'raw:' prefix followed by phonemic and/or phonetic notation.
local saw_raw
local remaining_respelling = respelling:match("^raw:(.*)$")
if remaining_respelling then
saw_raw = true
respelling = remaining_respelling
local raw_phonemic, raw_phonetic = respelling:match("^/(.*)/ %$")
if not raw_phonemic then
raw_phonemic = respelling:match("^/(.*)/$")
-- Phonetic-only raw notation is recognized only after an explicit 'raw:' prefix.
if not raw_phonemic and saw_raw then
raw_phonetic = respelling:match("^%$")
if raw_phonemic or raw_phonetic then
return {
raw_phonemic = raw_phonemic,
raw_phonetic = raw_phonetic,
pagename = decompose_respelling(pagename)
respelling = decompose_respelling(respelling)
-- Helper: canonicalize `respelling`, split it on spaces and return {words = ...} with one object per word. If
-- `parse_pos` is set, a 'POS/' prefix is parsed off each word.
local function split_respelling_into_words(respelling, parse_pos)
respelling = canon_respelling(respelling)
local word_objs = {}
local respelling_words = rsplit(respelling, " ")
for _, word in ipairs(respelling_words) do
local pos
if parse_pos then
pos, word = parse_off_pos(word, parse_err)
table.insert(word_objs, {term = word, pos = pos})
return {words = word_objs}
-- Helper: resolve one respelling word against the corresponding pagename word, handling '+', a bare mid vowel
-- hint and substitution specs, and return a word object {term = ..., pos = ...}.
local function substitute_respelling_word(respelling_word, pagename_word)
local pos
pos, respelling_word = parse_off_pos(respelling_word, parse_err)
if respelling_word == "+" then
respelling_word = pagename_word
if rfind(respelling_word, "^" .. export.mid_vowel_hint_c .. "$") then
respelling_word = ""
if rfind(respelling_word, "^%$") then
respelling_word = apply_substitution_spec(respelling_word, pagename_word, pos,
"allow mid vowel hint", parse_err)
return {term = respelling_word, pos = pos}
-- At this point, if there are multiple words in the pagename, there are three syntaxes allowed: all-at-once,
-- replacement or word-by-word. All-at-once syntax involves either a + representing the entire pagename, or a
-- substitution spec that applies to all words in the pagename. This syntax cannot have a prefixed part of speech
-- because it wouldn't be clear which word to apply the part of speech to. Replacement syntax simply spells out the
-- respelling without any substitution specs or +'s (but possibly with parts of speech prefixed to individual
-- words), and can have a different number of words than the pagename (essentially, the pagename is disregarded).
-- Word-by-word syntax involves a combination of respelled words, per-word substitution specs and/or a +
-- representing an individual word, and must have the same number of words as the pagename so that substitution
-- specs and +'s can be lined up with words in the pagename. In all cases, the return value is in the same format;
-- see comment at top of function.
if pagename:find(" ") or respelling:find(" ") then
if respelling == "+" then
return split_respelling_into_words(pagename)
elseif rfind(respelling, "^%$") then
-- all-at-once syntax with substitution spec
return split_respelling_into_words(apply_substitution_spec(respelling, pagename, nil, false, parse_err))
elseif rfind(respelling, "^(+)/$") or rfind(respelling, "^(+)/%]*%]$") then
-- attempt to include a part of speech in all-at-once syntax
parse_err(("Part of speech not allowed when pagename is multiword and all-at-once syntax is used in " ..
"the respelling, but saw '%s'"):format(respelling))
elseif rfind(respelling, "^" .. export.mid_vowel_hint_c .. "$") then
-- attempt to use a mid-vowel hint in all-at-once syntax
parse_err(("Single mid-vowel hint not allowed when pagename is multiword because it's not clear which " ..
"word to apply it to, but saw '%s'"):format(respelling))
elseif rfind(respelling, "]") or rfind(respelling, "^" .. export.mid_vowel_hint_c .. " ") or
rfind(respelling, " " .. export.mid_vowel_hint_c .. " ") or
rfind(respelling, " " .. export.mid_vowel_hint_c .. "$") then
-- word-by-word syntax
local sub_with_space = rmatch(respelling, "%]* ]*%]")
if sub_with_space then
parse_err(("When using word-by-word syntax with a multiword pagename, saw substitution spec '%s' " ..
"with spaces, which is not allowed because it must match a single word"):format(sub_with_space))
pagename = canon_respelling(pagename)
respelling = canon_respelling(respelling)
local pagename_words = rsplit(pagename, " ")
local respelling_words = rsplit(respelling, " ")
-- Words are paired positionally below, so the counts must agree.
if #pagename_words ~= #respelling_words then
parse_err(("When using word-by-word syntax with a multiword pagename, saw %s words in pagename but " ..
"%s word%s in respelling; they need to match"):format(#pagename_words, #respelling_words,
#respelling_words > 1 and "s" or ""))
local word_objs = {}
for i = 1, #pagename_words do
table.insert(word_objs, substitute_respelling_word(respelling_words, pagename_words))
return {words = word_objs}
-- replacement syntax; pagename ignored
return split_respelling_into_words(respelling, "parse pos")
-- Single-word case: resolve the respelling against the pagename and canonicalize the result.
local word_obj = substitute_respelling_word(respelling, pagename)
word_obj.term = canon_respelling(word_obj.term)
return {words = {word_obj}}
-- Parse a list of comma-split runs containing one or more respellings, i.e. after calling parse_balanced_segment_run()
-- or the like followed by split_alternating_runs() or the like (see ]). `pagename` is the
-- pagename, for use when a respelling is just '+', a mid-vowel hint like 'ê' or a substitution spec like ''.
-- `original_input` is the raw input and `input_param` the name of the param containing the raw input; both are used
-- only in error messages. Return an object specifying the respellings, currently with a single field 'terms' (this
-- format is used in case other outer properties exist in the future), where 'terms' is a list of term objects. Each
-- term object contains either a field `term` with the respelling and an optional part of speech `pos`, or fields
-- `raw_phonemic` and/or `raw_phonetic` (if the user specified raw IPA using "/.../" or "/.../ " or "raw:"),
-- `unknown` (if the user specified "?"), or `omitted` (if the user specified "-"). In addition, there may be fields
-- `q`, `qq`, `a`, `aa`, and/or `ref` corresponding to inline modifiers. Each such field is a list; all are lists of
-- strings except for `ref`, which is a list of objects as returned by parse_references() in ].
--
-- If `pagename` is the boolean `true`, each respelling is parsed with itself as the pagename (see generate_obj()).
--
-- NOTE(review): link targets and examples in the comment above, and the subscripts on `group` in the rejoining
-- loop, appear to have lost their bracketed content; block terminators are also missing. Confirm against the
-- canonical source.
function export.parse_comma_separated_groups(comma_separated_groups, pagename, original_input, input_param)
-- Callback handed to the inline-modifier parser; it converts one respelling string into a term object.
local function generate_obj(respelling, parse_err)
return parse_respelling(respelling, pagename == true and respelling or pagename, parse_err)
local put = require(parse_utilities_module)
local outer_container = {terms = {}}
for _, group in ipairs(comma_separated_groups) do
-- Rejoin runs that don't involve <...>.
local j = 2
while j <= #group do
if not group:find("^<.*>$") then
group = group .. group .. group
table.remove(group, j)
table.remove(group, j)
j = j + 2
-- Inline modifiers accepted after a respelling; each is stored as a list ("insert").
local param_mods = {
-- pre = { overall = true },
-- post = { overall = true },
ref = { store = "insert", convert = function(arg, parse_err)
return require("Module:references").parse_references(arg)
end },
q = { store = "insert" },
qq = { store = "insert" },
a = { store = "insert" },
aa = { store = "insert" },
table.insert(outer_container.terms, put.parse_inline_modifiers_from_segments {
group = group,
arg = original_input,
props = {
paramname = input_param,
param_mods = param_mods,
generate_obj = generate_obj,
splitchar = ",",
outer_container = outer_container,
return outer_container
-- Generate the pronunciation of `words` (a list of word objects representing respellings, each of which is an object
-- of the form {term = RESPELLING, pos = PART_OF_SPEECH} in `dialect` ("cen", "bal" or "val").
--
-- Visible flow: every term is first checked for invalid accented characters; the list is passed through
-- handle_unstressed_words(); each word is lowercased, optionally has a trailing suffix split off into
-- `suffix_syllables`, and is run through word_fixes(), split_syllables() and preprocess_word(). The syllables are
-- joined with "." after inserting stress marks, the words are joined with boundary markers, and the final string is
-- returned from postprocess_general().
--
-- NOTE(review): several patterns and index expressions below are empty or truncated (bracketed content appears to
-- have been lost), the error() call in the validation loop is cut off, the final alternative of the `ac`
-- expression is missing, and block terminators are missing. Confirm against the canonical source.
local function to_IPA(words, dialect)
local pronuns = {}
for _, wordobj in ipairs(words) do
if rfind(wordobj.term, "") then
error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions"
words = handle_unstressed_words(words)
for _, wordobj in ipairs(words) do
local word = wordobj.term
local pos = wordobj.pos
local suffix_syllables = {}
-- Keep the original spelling for error messages issued by preprocess_word().
local orig_word = word
word = ulower(word)
if not pos or pos == "adverb" then
local word_before_ment, ment = rmatch(word, "^(.*)(mnt)$")
if word_before_ment and (pos == "adverb" or not rfind(word_before_ment, "$") and
rfind(word_before_ment, V .. ".*" .. V)) then
-- The suffix is handled as its own stressed syllable; the remaining base is then processed as an adjective.
suffix_syllables = {{onset = "m", vowel = "e", coda = "nt", stressed = true}}
pos = "adjective"
word = word_before_ment
word = word_fixes(word, dialect)
local syllables = split_syllables(word)
syllables = preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
-- Combine syllables.
local combined = {}
local has_ment = #suffix_syllables > 0
for i, syll in ipairs(syllables) do
local ac = (i == syllables.stress and not syllables.is_prefix and not has_ment or
has_ment and i == #syllables) and AC or -- primary stress
syllables.stressed and GR or -- secondary stress
table.insert(combined, syll.onset .. syll.vowel .. ac .. syll.coda)
table.insert(pronuns, table.concat(combined, "."))
-- Put double ## at utterance boundaries (beginning/end of string) and at foot boundaries (marked with |).
-- Note that if the string without pound signs is 'foo bar baz | bat quux', the final string will be
-- '##foo# #bar# #baz## #|# ##bat# #quux##'.
local text = "##" .. table.concat(pronuns, " ") .. "##"
text = rsub(text, " | ", "# | #")
text = rsub(text, " ", "# #")
return postprocess_general(text, dialect)
-- Generate the phonemic and phonetic pronunciations of the respellings in `parsed_respellings`, which is a table whose
-- keys are dialect identifiers (e.g. "cen" for Central Catalan, "val" for Valencian) and whose values are objects of
-- the format returned by parse_comma_separated_groups() (see comment above that function). This destructively modifies
-- `parsed_respellings`, adding fields `phonemic` and `phonetic` containing the generated pronunciations and removing
-- the input fields used to generate those output fields. (FIXME: Currently only phonetic pronunciation is generated.)
--
-- Term objects flagged `unknown` or `omitted` are left untouched. Raw pronunciations are copied verbatim into
-- `phonemic`/`phonetic`. All other terms are converted with to_IPA(). Nothing is returned.
function export.generate_phonemic_phonetic(parsed_respellings)
	-- Convert each canonicalized respelling to phonemic/phonetic IPA.
	for dialect, respelling_spec in pairs(parsed_respellings) do
		for _, termobj in ipairs(respelling_spec.terms) do
			if termobj.unknown or termobj.omitted then
				-- leave alone, will handle later
			elseif termobj.raw_phonemic or termobj.raw_phonetic then
				termobj.phonemic = termobj.raw_phonemic
				termobj.phonetic = termobj.raw_phonetic
				-- set to nil so by-value comparisons respect only the resulting phonemic/phonetic and qualifiers
				termobj.raw_phonemic = nil
				termobj.raw_phonetic = nil
			else
				termobj.phonetic = to_IPA(termobj.words, dialect)
				-- set to nil so by-value comparisons respect only the resulting phonemic/phonetic and qualifiers
				termobj.words = nil
			end
		end
	end
end
-- Group pronunciations by dialect, i.e. grouping pronunciations that are identical in every way (including both the
-- pronunciation(s) and any qualifiers and other inline modifiers). `parsed_respellings` contains the output from
-- generate_phonemic_phonetic(), and the return value is a list of grouped pronunciations, where each object in the list
-- contains fields `dialects` (a list of dialects containing the pronunciations) and `pronuns` (a list of
-- pronunciations, where each pronunciation is specified by an object containing fields `phonemic` and `phonetic`, as
-- generated by generate_phonemic_phonetic(), along with any inline modifier fields `q`, `qq`, `a`, `aa` and/or `ref`).
--
-- A dialect with any term flagged `omitted` is left out of the result entirely. Because the dialects are visited
-- with pairs(), the order of the returned groups (and of the dialects within a group) is unspecified.
function export.group_pronuns_by_dialect(parsed_respellings)
	local grouped_pronuns = {}
	for dialect, pronun_spec in pairs(parsed_respellings) do
		local saw_omitted = false
		for _, termobj in ipairs(pronun_spec.terms) do
			if termobj.omitted then
				saw_omitted = true
				-- One omitted term is enough to drop the dialect; no need to look further.
				break
			end
		end
		if not saw_omitted then
			local saw_existing = false
			for _, group in ipairs(grouped_pronuns) do
				if m_table.deepEquals(group.pronuns, pronun_spec.terms) then
					table.insert(group.dialects, dialect)
					saw_existing = true
					-- Existing groups are pairwise distinct, so at most one can match.
					break
				end
			end
			if not saw_existing then
				table.insert(grouped_pronuns, {dialects = {dialect}, pronuns = pronun_spec.terms})
			end
		end
	end
	return grouped_pronuns
end
-- Format pronunciations grouped by dialect. `grouped_pronuns` contains the output of group_pronuns_by_dialect().
-- This destructively modifies `grouped_pronuns`, adding a field 'formatted' to the first-level values of
-- `grouped_pronuns` containing the formatted pronunciation(s) for a given set of dialects.
--
-- For each group, a flat list `pronunciations` is built with one entry per phonemic and/or phonetic form (or a
-- placeholder entry for an unknown pronunciation); qualifiers and separators are then attached, and the list is
-- passed to m_IPA.format_IPA_full() with `lang`.
--
-- NOTE(review): several index expressions appear to have lost their bracketed subscripts (e.g.
-- `export.dialects_to_names`, and the `pronunciations.q` / `.a` / `.separator` / `.qq` / `.aa` / `.refs`
-- assignments, which sit next to the otherwise unused `first_pronun` / `last_pronun`), the bracketed phonetic
-- string is empty, table constructors are unterminated and block terminators are missing. Also note that the code
-- reads `pronun.refs` while the inline modifier is declared as `ref` elsewhere in this file. Confirm against the
-- canonical source.
function export.format_grouped_pronunciations(grouped_pronuns)
for _, grouped_pronun_spec in pairs(grouped_pronuns) do
local pronunciations = {}
-- Loop through each pronunciation. For each one, add the phonemic and phonetic versions to `pronunciations`,
-- for formatting by ] or raw (for use in ]).
for j, pronun in ipairs(grouped_pronun_spec.pronuns) do
-- Add dialect tags to left accent qualifiers if first one
local as = pronun.a
if j == 1 then
if as then
-- Copy before appending so that the caller's qualifier list is not modified.
as = m_table.deepCopy(as)
as = {}
for _, dialect in ipairs(grouped_pronun_spec.dialects) do
table.insert(as, export.dialects_to_names)
-- Remember where this pronunciation's entries start so modifiers can be attached to the first/last entry.
local first_pronun = #pronunciations + 1
if pronun.unknown then
-- FIXME: This is a massive hack but it works for now.
table.insert(pronunciations, { pron = "", pretext = "''unknown''" })
if not pronun.phonemic and not pronun.phonetic then
error("Internal error: Saw neither phonemic nor phonetic pronunciation")
if pronun.phonemic then -- missing if 'raw:' given
local slash_pron = "/" .. pronun.phonemic .. "/"
table.insert(pronunciations, {
pron = slash_pron,
if pronun.phonetic then -- missing if '/.../' given
local bracket_pron = ""
table.insert(pronunciations, {
pron = bracket_pron,
local last_pronun = #pronunciations
if pronun.q then
pronunciations.q = pronun.q
if as then
pronunciations.a = as
if j > 1 then
pronunciations.separator = ", "
if pronun.qq then
pronunciations.qq = pronun.qq
if pronun.aa then
pronunciations.aa = pronun.aa
if pronun.refs then
pronunciations.refs = pronun.refs
-- When both a phonemic and a phonetic entry were added, a separator is set between them.
if first_pronun ~= last_pronun then
pronunciations.separator = " "
grouped_pronun_spec.formatted = m_IPA.format_IPA_full {
lang = lang,
items = pronunciations,
separator = "",
-- Body of the template entry point: process the template arguments, decide which input applies to each dialect,
-- parse the inputs into respellings, group the results and return the formatted text.
--
-- NOTE(review): the enclosing function header is not visible here although `frame` is used below; the subscripts
-- on `params`, `args`, `inputs` and `parsed_respellings` appear to have lost their bracketed content; the pattern
-- passed to inputspec.input:find() and one delimiter pair are empty; and no statements follow the "Convert each
-- canonicalized respelling" and "Format the results" comments. Block terminators are also missing. Confirm against
-- the canonical source.
local params = {
= {},
indent = {},
pagename = {} -- for testing or documentation pages
-- Declare one parameter per dialect and per dialect group.
for _, dialect in ipairs(export.dialects) do
params = {}
for dialect_group, _ in pairs(export.dialect_groups) do
params = {}
local args = require("Module:parameters").process(frame:getParent().args, params)
-- An explicit pagename= argument overrides the current page's subpage text.
local pagename = args.pagename or mw.title.getCurrentTitle().subpageText
-- Set inputs
local inputs = {}
-- If 1= specified, do all dialects.
if args then
for _, dialect in ipairs(export.dialects) do
inputs = {input = args, param = 1}
-- Then do dialect groups.
for dialect_group, group_dialects in pairs(export.dialect_groups) do
if args then
for _, dialect in ipairs(group_dialects) do
inputs = {input = args, param = dialect_group}
-- Then do individual dialect settings.
for _, dialect in ipairs(export.dialects) do
if args then
inputs = {input = args, param = dialect}
-- If no inputs given, set all dialects based on current pagename.
if not next(inputs) then
for _, dialect in ipairs(export.dialects) do
inputs = {input = "+", param = "(pagename)"}
-- Parse the arguments.
local parsed_respellings = {}
for dialect, inputspec in pairs(inputs) do
local function generate_obj(respelling, parse_err)
return parse_respelling(respelling, pagename, parse_err)
if inputspec.input:find("") then
local put = require(parse_utilities_module)
-- Parse balanced segment runs involving either (substitution notation) or <...> (inline modifiers).
-- We do this because we don't want commas inside of square or angle brackets to count as respelling
-- delimiters. However, we need to rejoin square-bracketed segments with nearby ones after splitting
-- alternating runs on comma. For example, if we are given
-- "aa<q:learned>,<q:nonstandard>", after calling
-- parse_multi_delimiter_balanced_segment_run() we get the following output:
-- {"a", "", "a", "<q:learned>", ",", "", "", "<q:nonstandard>", ""}
-- After calling split_alternating_runs(), we get the following:
-- {{"a", "", "a", "<q:learned>", ""}, {"", "", "", "<q:nonstandard>", ""}}
-- We need to rejoin stuff on either side of the square-bracketed portions.
local segments = put.parse_multi_delimiter_balanced_segment_run(inputspec.input, {{"<", ">"}, {""}})
local comma_separated_groups = put.split_alternating_runs_on_comma(segments)
-- Process each value.
local outer_container = export.parse_comma_separated_groups(comma_separated_groups, pagename,
inputspec.input, inputspec.param)
parsed_respellings = outer_container
-- Simple case: split the input on commas and parse each piece directly.
local termobjs = {}
-- Errors are reported together with the parameter name and raw input that caused them.
local function parse_err(msg)
error(msg .. ": " .. inputspec.param .. "=" .. inputspec.input)
for _, term in ipairs(split_on_comma(inputspec.input)) do
table.insert(termobjs, generate_obj(term, parse_err))
parsed_respellings = {
terms = termobjs,
-- Convert each canonicalized respelling to phonemic/phonetic IPA.
-- Group the results.
local grouped_pronuns = export.group_pronuns_by_dialect(parsed_respellings)
-- Format the results.
-- Concatenate formatted results.
local formatted = {}
for _, grouped_pronun_spec in ipairs(grouped_pronuns) do
table.insert(formatted, grouped_pronun_spec.formatted)
-- Each group goes on its own line; the indent prefix defaults to "*" when indent= is not given.
local indent = (args.indent or "*") .. " "
local out = table.concat(formatted, "\n" .. indent)
-- The first line is prefixed only when indent= was given explicitly.
if args.indent then
out = indent .. out
return out
-- Test helper: parse `respelling` against `pagename` and return the pronunciation generated by to_IPA() for
-- `dialect`. Parse errors are raised as ordinary Lua errors carrying the parser's message.
function export.test(pagename, respelling, dialect)
	-- parse_respelling() reports problems through this callback; raise them so the caller sees the message.
	local function parse_err(msg)
		error(msg)
	end
	local parsed = parse_respelling(respelling, pagename, parse_err)
	return to_IPA(parsed.words, dialect)
end
return export