Module:wen-pron

The following documentation is located at Module:wen-pron/documentation.
Useful links: subpage list • links • transclusions • testcases • sandbox
This module implements {{dsb-pr}} and {{hsb-pr}}.
local export = {}
-- Based on another pronunciation module (the link naming it is missing from this copy of the comment).
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local audio_module = "Module:audio"
local links_module = "Module:links"
local parse_utilities_module = "Module:parse utilities"

local u = m_str_utils.char
local rfind = m_str_utils.find
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local rsubn = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local find = mw.ustring.find

-- Module-level vowel inventory: orthographic vowels followed by IPA vowel symbols.
-- NOTE(review): not referenced in the visible part of this file (single_word() builds its own per-language list);
-- confirm whether it is still needed.
local vowels = "aeiouěóyɔɛɪʊɨ"

-- FIXME: Implement optional assimilation across word boundaries.
-- Placeholder switch for that unimplemented feature; nothing in the visible code reads it.
local assimilate_across_word_boundaries = false

-- Record a tracking entry for this module: the key passed to Module:debug/track is "wen-pron/" followed by `page`.
-- Always returns true.
local function track(page)
	local record = require("Module:debug/track")
	record("wen-pron/" .. page)
	return true
end

-- Single-result wrapper around rsubn(): performs the substitution and hands back only the resulting string,
-- dropping the substitution count.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub(term, foo, bar) until a pass leaves the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- Apply `fun` to every element of the sequence `items` and gather everything it returns into one flat sequence.
-- `fun` must return a sequence. Results are added with m_table.insertIfNot(), so (judging by that helper's name)
-- a value that is already present is not added a second time.
local function flatmap(items, fun)
	local collected = {}
	for _, element in ipairs(items) do
		for _, produced in ipairs(fun(element)) do
			m_table.insertIfNot(collected, produced)
		end
	end
	return collected
end

-- Merge two qualifier lists, each of which may be nil. When only one is given it is returned as-is (not copied);
-- when both are given, the result is a deep copy of `qual1` with the entries of `qual2` added through
-- m_table.insertIfNot(). Returns nil or a list of qualifiers.
local function combine_qualifiers(qual1, qual2)
	if qual1 and qual2 then
		local merged = m_table.deepcopy(qual1)
		for _, extra in ipairs(qual2) do
			m_table.insertIfNot(merged, extra)
		end
		return merged
	end
	return qual1 or qual2
end

-- Split `term` on commas. A nil/false `term` yields nil. If the text contains a comma followed by whitespace, or a
-- backslash, the splitting is delegated to split_on_comma() in the parse utilities module; otherwise a plain
-- rsplit() on "," is used.
local function split_on_comma(term)
	if not term then
		return nil
	end
	local needs_full_parser = term:find(",%s") or term:find("\\")
	if not needs_full_parser then
		return rsplit(term, ",")
	end
	return require(parse_utilities_module).split_on_comma(term)
end

-- Reduce formatted text to what is actually displayed: strip anything that looks like an HTML tag (<...>) and, if
-- the text contains "[[", pass it through remove_links() from the links module.
local function convert_to_raw_text(text)
	local stripped = rsub(text, "<.->", "")
	if not stripped:find("%[%[") then
		return stripped
	end
	return (require(links_module).remove_links(stripped))
end

-- Approximate displayed length of `text` in characters: the ulen() of the text after convert_to_raw_text().
local function textual_len(text)
	local raw = convert_to_raw_text(text)
	return ulen(raw)
end

-- Parse the comma-separated respellings in `respelling` (the raw value of parameter `paramname`) into a list of
-- objects of the form {respelling = ...}. Each respelling may carry the inline modifiers <q:...>, <qq:...>,
-- <a:...>, <aa:...> and <ref:...>, whose values are stored on the object by the parse utilities module. Commas
-- inside square brackets (substitution notation) or angle brackets (inline modifiers) do not separate respellings.
local function parse_respellings_with_modifiers(respelling, paramname)
	local function generate_obj(respelling, parse_err)
		return {respelling = respelling}
	end

	-- Only take the slow path when a bracket of either kind is present.
	if respelling:find("[<%[]") then
		local put = require(parse_utilities_module)
		-- Parse balanced segment runs involving either [...] (substitution notation) or <...> (inline modifiers).
		-- We do this because we don't want commas inside of square or angle brackets to count as respelling
		-- delimiters. However, we need to rejoin square-bracketed segments with nearby ones after splitting
		-- alternating runs on comma. For example, if we are given
		-- "a[x]a<q:learned>,[y:z]<q:nonstandard>", after calling
		-- parse_multi_delimiter_balanced_segment_run() we get the following output:
		--
		-- {"a", "[x]", "a", "<q:learned>", ",", "[y:z]", "", "<q:nonstandard>", ""}
		--
		-- After calling split_alternating_runs_on_comma(), we get the following:
		--
		-- {{"a", "[x]", "a", "<q:learned>", ""}, {"", "[y:z]", "", "<q:nonstandard>", ""}}
		--
		-- We need to rejoin stuff on either side of the square-bracketed portions.
		local segments = put.parse_multi_delimiter_balanced_segment_run(respelling, {{"<", ">"}, {"[", "]"}})

		local comma_separated_groups = put.split_alternating_runs_on_comma(segments)

		-- Process each value.
		local retval = {}
		for i, group in ipairs(comma_separated_groups) do
			-- Rejoin runs that don't involve <...>. Even positions hold the delimited segments; a segment that is
			-- not an angle-bracketed modifier is merged with its two neighbours, after which the same position is
			-- examined again because the list has shrunk by two.
			local j = 2
			while j <= #group do
				if not group[j]:find("^<.*>$") then
					group[j - 1] = group[j - 1] .. group[j] .. group[j + 1]
					table.remove(group, j)
					table.remove(group, j)
				else
					j = j + 2
				end
			end

			local param_mods = {
				q = {type = "qualifier"},
				qq = {type = "qualifier"},
				a = {type = "labels"},
				aa = {type = "labels"},
				ref = {item_dest = "refs", type = "references"},
			}

			table.insert(retval, put.parse_inline_modifiers_from_segments {
				group = group,
				arg = respelling,
				props = {
					paramname = paramname,
					param_mods = param_mods,
					generate_obj = generate_obj,
				},
			})
		end
		return retval
	else
		-- No brackets at all: a plain comma split is enough.
		local retval = {}
		for _, item in ipairs(split_on_comma(respelling)) do
			table.insert(retval, generate_obj(item))
		end
		return retval
	end
end

-- Parse `arg`, the value of a pronunciation property (rhyme, hyphenation, homophones, audio, ...) given in
-- parameter `paramname`, into a list of property objects. `arg` is a list of items separated by `splitchar`
-- (default ","; it may be a Lua pattern), and each item may carry its own inline modifiers, e.g.
-- <audio:Foo.ogg<a:Masovia>>. `generate_obj` builds the object for an item from its non-modifier text.
-- `param_mods` describes the accepted modifiers in the format expected by parse_inline_modifiers(); the entries
-- q, qq, a, aa and ref are always accepted and are written into `param_mods` itself when `arg` contains "<".
-- With the default separator, commas are split by split_on_comma(); any other separator uses rsplit().
local function parse_pron_modifier(arg, paramname, generate_obj, param_mods, splitchar)
	splitchar = splitchar or ","

	if not arg:find("<") then
		-- No inline modifiers anywhere: split and wrap each piece.
		local pieces = splitchar == "," and split_on_comma(arg) or rsplit(arg, splitchar)
		local objs = {}
		for _, piece in ipairs(pieces) do
			table.insert(objs, generate_obj(piece))
		end
		return objs
	end

	local always_accepted = {
		q = {type = "qualifier"},
		qq = {type = "qualifier"},
		a = {type = "labels"},
		aa = {type = "labels"},
		ref = {item_dest = "refs", type = "references"},
	}
	for key, spec in pairs(always_accepted) do
		param_mods[key] = spec
	end

	local props = {
		param_mods = param_mods,
		generate_obj = generate_obj,
		paramname = paramname,
		splitchar = splitchar,
	}
	return require(parse_utilities_module).parse_inline_modifiers(arg, props)
end


-- Parse the audio specification `arg` (from parameter `paramname`) into a list of audio objects for language
-- `lang`. Files are separated by semicolons and may carry the inline modifiers listed in `param_mods` below.
-- Each returned object has `file`, `lang`, `caption` and a `text` object built by construct_audio_textobj() in
-- the audio module; `pagename` is used when expanding special characters in file names, text and captions.
local function parse_audio(lang, arg, pagename, paramname)
	local param_mods = {
		IPA = {
			sublist = true,
		},
		text = {},
		t = {
			item_dest = "gloss",
		},
		-- No tr=, ts=, or sc=; doesn't make sense for Polish.
		-- (The mention of Polish is presumably inherited from the module this one was based on.)
		gloss = {},
		pos = {},
		-- No alt=; text= already goes in alt=.
		lit = {},
		-- No id=; text= already goes in alt= and isn't normally linked.
		g = {
			item_dest = "genders",
			sublist = true,
		},
		bad = {},
		cap = {
			item_dest = "caption",
		},
	}

	-- Expand special characters in `val` to the page name; nil/false is passed through unchanged.
	-- NOTE(review): both the gsub pattern and the key of the replacement table are empty in this copy and look
	-- like lost bracketed text; presumably the special character is "#" (which normalise_input() treats as the
	-- page name) -- confirm against the canonical source.
	local function process_special_chars(val)
		if not val then
			return val
		end
		return (val:gsub("", {
			 = pagename,
		}))
	end

	local function generate_audio_obj(arg)
		return {file = process_special_chars(arg)}
	end

	-- Split on semicolon instead of comma because some filenames have embedded commas not followed by a space
	-- (typically followed by an underscore).
	local retvals = parse_pron_modifier(arg, paramname, generate_audio_obj, param_mods, "%s*;%s*")
	for i, retval in ipairs(retvals) do
		retval.lang = lang
		retval.text = process_special_chars(retval.text)
		retval.caption = process_special_chars(retval.caption)
		-- Replace the plain text with the text object built by the audio module from this item.
		local textobj = require(audio_module).construct_audio_textobj(retval)
		retval.text = textobj
		-- Set to nil the fields that were moved into `retval.text`.
		retval.gloss = nil
		retval.pos = nil
		retval.lit = nil
		retval.genders = nil
	end
	return retvals
end

-- Parse the homophone specification `arg` (from parameter `paramname`) into a list of objects of the form
-- {term = ...}, each optionally carrying the inline modifiers declared below (plus those parse_pron_modifier()
-- always accepts). Items are comma-separated.
local function parse_homophones(arg, paramname)
	local homophone_mods = {
		-- <t:...> is an alias that stores its value under "gloss".
		t = {item_dest = "gloss"},
		gloss = {},
		-- No tr=, ts=, or sc=; they make no sense here.
		pos = {},
		alt = {},
		lit = {},
		id = {},
		-- <g:...> is a sublist stored under "genders".
		g = {item_dest = "genders", sublist = true},
	}

	local function make_homophone(term)
		return {term = term}
	end

	return parse_pron_modifier(arg, paramname, make_homophone, homophone_mods)
end


--[=[
Given a single word in `data.txt`, compute its "lightly phonetic" IPA representation. `data.lang` is the language
code ("hsb" or "dsb" are the values tested below) and `data.is_prep` marks the word as a preposition, which leaves it
unstressed, skips final devoicing and appends a "$" marker for the caller. If there are multiple possible outputs,
we signal this through special symbols such as capital letters (on input, capital letters have been lowercased
so we can use capital letters as special symbols). The actual generation of multiple outputs happens in multiword()
after the full term's IPA has been generated. Return two values: the IPA representation and the hyphenation.

NOTE(review): in this copy of the file many Lua patterns are empty strings ("") or contain empty groups, and many
table constructors have entries with a missing key (" = value"). These look like bracketed text ([...] character
classes and ["key"] table keys) that was stripped from the source; the function cannot run as shown. Restore it from
the canonical source before relying on any behaviour described here.
]=]
local function single_word(data)
	local txt, lang = data.txt, data.lang
	-- A trailing hyphen marks a prefix, a leading hyphen a suffix; both (like prepositions) are left unstressed.
	local is_prefix = not not txt:find("%-$")
	local is_suffix = not not txt:find("^%-")
	local unstressed = is_prefix or is_suffix or data.is_prep

	-- Helpers operating on the `txt` upvalue.
	-- NOTE(review): tsub, tsub_repeatedly, lg and tfind are declared without `local`, so they are (re)assigned
	-- as globals on every call; they should probably be local functions.
	-- tsub: substitute in `txt` in place; returns true if anything was replaced.
	function tsub(s, r)
		local c
		txt, c = rsubn(txt, s, r)
		return c > 0
	end
	-- tsub_repeatedly: substitute in `txt` in place until nothing changes.
	function tsub_repeatedly(s, r)
		txt = rsub_repeatedly(txt, s, r)
	end
	-- NOTE(review): as written lg() returns its argument unchanged. It is called below with tables keyed by
	-- language code whose result is then used as a string or list, so a per-language lookup has presumably been
	-- lost from this line -- confirm.
	function lg(s) return s or s end
	function tfind(s) return rfind(txt, s) end

	-- Save indices of uppercase characters before setting everything lowercase.
	local uppercase_indices
	uppercase_indices = {}
	local capitals = (""):format(lg {
		hsb = "ČĆĚŁŃÓŘŠŽŹ",
		dsb = "ČĆĚŁŃÓŔŠŚŽŹ",
	})
	if tfind(capitals) then
		local i = 1
		local str = rsub(txt, "", "")
		while rfind(str, capitals, i) do
			local r, _ = rfind(str, capitals, i)
			table.insert(uppercase_indices, r)
			i = r + 1
		end
	end
	-- An empty list is normalised to nil so the restoration step further down can be skipped with a simple test.
	if #uppercase_indices == 0 then
		uppercase_indices = nil
	end

	txt = ulower(txt)

	-- Replace digraphs with single capital letters to simplify the code below."tř", "tč", "tš", "dš", "dč"
	tsub("", {
		ch = "H",
		dz = "Z",
		 = "Ź",
		 = "Ž",
		 = "Ć",
		ts = "C",
		 = "Č",
		 = "Š",
		 = "Ř",
		 = "D",
		 = "T",
		 = "K",
		 = "Q",
		 = "S",
		 = "P",
		 = "R",
	})

	-- Inverse of the digraph replacement above: maps the single capital letters back to their digraphs.
	local function undo_digraph_replacement(txt)
		return rsub(txt, "", {
			H = "ch",
			Z = "dz",
			 = "dź",
			 = "dž",
			 = "tś",
			C = "ts",
			 = "tč",
			 = "tš",
			 = "tř",
			D = "dš",
			T = "dč",
			K = "kš",
			Q = "kś",
			P = "pš",
			R = "pś",
			S = "př"
		})
	end

	-- Vowels, both spelled and IPA vowels. To make the distinction easier, we separate the IPA-only vowels and
	-- concatenate them at the end.
	local V = lg {
		hsb = "aeiouěóy" .. "ɔɛɪʊ",
		dsb = "aeiouěóy" .. "ɔɛɪʊɨ",
	}
	-- NOTE(review): `C` is used below as a consonant pattern but is built from an empty format string here.
	local C = (""):format(V)

	if txt:find("^%*") then
		-- The symbol <*> before a word indicates it is unstressed.
		unstressed = true
		txt = txt:sub(2)
	end

	-- Syllabify common prefixes as separate; note that since we replaced digraphs above with single
	-- chars, we need to use the replacement chars here. Longer prefixes must precede shorter subprefixes,
	-- e.g. pśede- must precede pśed- for the former to be recognized.
	local prefixes = lg {
		hsb = {
			"wote", "wot", "roze", "roz", "Sez", "Sed", "Se", "pode", "pod",
		},
		dsb = {
			"wóte", "wót", "roze", "roz", "Rez", "Rede", "Red", "Re", "póde", "pód", "nade", "nad"
		},
	}
	for _, prefix in ipairs(prefixes) do
		if tfind("^" .. prefix) then
			-- Make sure the suffix is followed by zero or more consonants (including - but not
			-- including a syllable divider) followed by a vowel. We do this so that we don't put
			-- a syllable divider when the user specified the divider in a different place.
			tsub(("^(%s)(%s*)"):format(prefix, C, V), "%1.%2")
			break
		end
	end

	-- syllabify common suffixes as separate
	local suffixes = lg {
		hsb = {
			"dla"
		},
		dsb = {
			"dla"
		},
	}

	for _, v in ipairs(suffixes) do
		-- Make sure there's no syllable divider elsewhere in the consonant cluster
		-- preceding the suffix. Same logic as above for prefixes.
		if tsub(("(%s*)(%s)$"):format(V, C, v), "%1.%2") then break end
	end

	-- Syllabify <-Ctka> as /Ct.ka/. Same for any case form.
	local C_before_t = "łrnfskp"
	if tfind(("+t"):format(V, C_before_t)) then
		local endings = lg {
			-- As with prefixes above, must use digraph replacement codes.
			dsb = { "k", "ce", "koma", "kowu", "kow", "kami", "ka", }, 
			hsb = { "k", "ce", "komaj", "kow", "kami", "ka", }, 
		}
		for _, ending in ipairs(endings) do
			-- Make sure there's no syllable divider elsewhere in the consonant cluster
			-- preceding the suffix. Same logic as above for prefixes.
			if tsub(("(+t)(%s)$"):format(V, C_before_t, ending), "%1.%2") then
				break
			end
		end
	end

	-- Syllabify by adding a period (.) between syllables. There may already be user-supplied syllable divisions
	-- (period or single quote), which we need to respect. This works by replacing each sequence of VC*V with
	-- V.V, V.CV, V.TRV (where T is an obstruent and R is a liquid) or otherwise VTR.C+V or VC.C+V, i.e. if there are
	-- multiple consonants, place the syllable boundary after the first TR sequence or otherwise the first consonant.
	-- We need to repeat since each VC*V sequence overlaps the next one. Digraphs have already been replaced by single
	-- capital letters.

	-- List of obstruents and liquids, including capital letters representing digraphs.
	local obstruent = "bdgkCptxfszśźšžđZŹŽĆCČŠŘDTKQPR"
	local liquid_no_w = "jlr"
	local liquid = liquid_no_w .. "łw"

	tsub_repeatedly(("()(%s*)()"):format(V, C, V), function(v1, cons, v2)
		local cons_no_hyphen = cons:gsub("%-", "")
		-- If there's only one consonant, or just an obstruent-liquid sequence, or any single consonant,
		-- put the syllable break before the consonant(s).
		if (ulen(cons_no_hyphen) < 2 or rfind(cons_no_hyphen, ("^j?$"):format(obstruent, liquid)) or
			rfind(cons_no_hyphen, ("^%sj$"):format(C))) then
			cons = "." .. cons
		else
			local nsubs
			-- Don't syllabify ] as niósł.by or ] as jabłczan.
			-- FIXME: Not sure if this is quite right.
			cons, nsubs = rsubn(cons, ("^(%%-?%%-?)"):format(obstruent, liquid_no_w), "%1.")
			if nsubs == 0 then
				-- Fallback: break after the first consonant (keeping a leading hyphen with it).
				cons = rsub(cons, "^(%-?.)", "%1.")
			end
		end
		return ("%s%s%s"):format(v1, cons, v2)
	end)

	-- Ignore certain symbols and diacritics for the hyphenation.
	local hyph = txt:gsub("", "."):gsub("^%.", ""):gsub("-", ""):gsub(",", "")

	hyph = undo_digraph_replacement(hyph)
	hyph = hyph:lower()
	-- Restore uppercase characters.
	if uppercase_indices then
		-- str_i loops through all the characters of the string
		-- list_i loops as above but doesn't count dots
		-- array_i loops through the indices at which the capital letters are
		local str_i, list_i, array_i = 1, 1, 1
		-- NOTE(review): h_sub is also declared without `local` and so becomes a global.
		function h_sub(x, y) return usub(hyph, x, y) end
		while array_i <= #uppercase_indices do
			if h_sub(str_i, str_i) ~= "." then
				-- NOTE(review): this compares a number with the whole `uppercase_indices` table, which can never
				-- be equal; presumably an index by `array_i` was lost here -- confirm.
				if list_i == uppercase_indices then
					hyph = ("%s%s%s"):format(h_sub(1,str_i-1), uupper(h_sub(str_i,str_i)), h_sub(str_i+1))
					array_i = array_i + 1
				end
				list_i = list_i + 1
			end
			str_i = str_i + 1
		end
	end

	-- Convert the user-typed stress marks (' and ") to the IPA primary/secondary stress symbols.
	tsub("'", "ˈ")
	tsub('"', "ˌ")

	txt = undo_digraph_replacement(txt)

	-- handle <x>; must precede ch -> ; use - to prevent palatalization by following i
	tsub("x", "ks-")
	-- move syllable boundary between  if preceded by a vowel
	tsub(("()()ks"):format(V), "%1k%2s")

	-- handle digraphs and related stuff
	tsub("ch", "x")
	tsub("tś", "ć")
	tsub("ts", "c")
	-- (marginal phonemes in Lower Sorbian)
	tsub("dz", "đ")
	tsub("dž", "џ")
	
	if lang == "hsb" then
		tsub("ć", "č") -- In Upper Sorbian <ć> and <č> represent the same phone. Internally, we will use ć for t͡sʲ (Upper) and t͡ɕ (Lower)
		tsub("t", "ć")
		tsub("d", "ć")
		tsub("ř", "š")
		tsub("dź", "џ")
	else
		tsub("dź", "ђ")
		tsub("tš", "č")
		tsub("r(%.џ)", "%1")
		--word initial/final <ł> is not pronounced if followed/preceded by an obstruent
		tsub("^ł()", "%1")
		tsub("()ł$", "%1")
	end
	
	--Voicing/devoicing
	-- Maps each voiced obstruent symbol to its voiceless counterpart (the keys are missing in this copy).
	local devoice = {
		 = "p",
		 = "t",
		 = "k", --non IPA <g>
		 = "s",
		 = "ś",
		 = "ć",
		 = "c",
		 = "č",
		 = "f",
		 = "š",
	}
	-- NOTE(review): presumably meant to build the inverse map (voiceless -> voiced); as written it overwrites the
	-- `voice` variable itself on every iteration, so an index by `v` has likely been lost -- confirm.
	local voice = {}
	for k, v in pairs(devoice) do
		voice = k
	end

	-- final devoicing
	if not data.is_prep then
		for v, d in pairs(devoice) do
			tsub(v .. "$", d)
		end
	end
	
	--Regressive assimilation
	local vobs = ""
	local dobs = ""

	-- Each loop repeats its substitutions until `txt` stops changing.
	local prev_txt
	while txt ~= prev_txt do
		prev_txt = txt
		for v, d in pairs(devoice) do
			tsub(v .. "(*" .. dobs .. ")", d .. "%1")
		end
	end
	prev_txt = ""
	while txt ~= prev_txt do
		prev_txt = txt
		for d, v in pairs(voice) do
			tsub(d .. "(*" .. vobs .. ")", v .. "%1")
		end
	end

	-- not using lg() here for speed
	if lang == "hsb" then
		tsub("^xc", "c")
		tsub("hw", "f")
		tsub("h$", "")
		tsub("h" .. "(?" .. C .. ")", "%1")
	else
		-- <#pt/#pc> -> <#c>
		tsub("^p()", "%1")
		--<#rdž> -> <#dž>
		tsub("^rџ", "џ")
		--<šć> -> <ść> and <ždź> -> <źdź>
		tsub("š(*)ć", "ś%1ć")
		tsub("ž(*)ђ", "ź%1ђ")
		--mute <t>
		tsub("t" .. "(*)", "%1")
		--mute <k>
		tsub("^gdy", "dy")
		--values of <w>
		tsub("^woł", "Woł")
		tsub("^w()", "ʍ%1")
		tsub("%-w(*)", "ʍ%1") --we use the symbol ʍ to keep track of a mute <w>
	end
	--changes common to Upper and Lower
	-- The capital W shields <w> from the deletion rule on the next-but-one line and is lowered again afterwards.
	tsub("^wj", "Wj")
	tsub("^w(" .. C .. ")", "%1")
	tsub("W", "w")

	-- palatalization
	local palat_intern = {{"p", "ṕ"}, {"b", "ḇ"}, {"m", "ḿ"}, {"w", "ẃ"}, {"f", "ḟ"}, {"t", "ť"}, {"d", "ď"}, {"n", "ñ"}, {"r", "ŕ"}, {"k", "ḱ"}, {"g", "ǵ"}}
	-- NOTE(review): `pair` is a two-element table, so concatenating it directly would raise an error; the indices
	-- selecting the plain and the palatalized symbol have presumably been lost -- confirm.
	for _, pair in ipairs(palat_intern) do
		tsub(pair .. "()", pair .. "%1")
	end
	
	-- glide before soft consonants
	if lang == "hsb" then
		tsub("()(?)()", "%1j%2%3")
	end
	
	--other conversions
	if lang == "hsb" then
		tsub("ń", "jn")
	else
		tsub("ń", "ñ")
		tsub("ó(?)j", "e%1j")
	end

	-- Hyphen separator, e.g. to prevent palatalization of <kwazi->.
	tsub("-", "")
	
	--Final substitutions
	-- Per-character map from the internal working symbols to IPA (the keys are missing in this copy).
	local final_replacements = {
		 = "pʲ",  = "bʲ",  = "mʲ",  = "wʲ",  = "fʲ", 
		 = "tʲ",  = "dʲ",  = "ɲ",  = "rʲ",
		 = "kʲ",  = "ɡʲ",
		 = "w",
		 = "ɛ",  = "ɔ",  = "ɪ",  = "ɨ",  = "ʊ",
		 = "ɡ",
		 = "t͡s",  = "t͡ʃ",  = "d͡z",  = "d͡ʒ",  = "ʒ",  = "ʃ",
		--lower Sorbian
		 = "ɕ",  = "ʑ",  = "d͡ʑ",
	}
	tsub(".", final_replacements)
	if lang == "hsb" then
		tsub("()ʲ", "%1") --Do we want this?
	end
	tsub("()j", "%1") -- we don't want <ʲj> in the transcriptions
	
	--stress on the first syllable, unless manually stressed or unstressed
	local should_stress = not (unstressed or txt:find("ˈ"))
	if should_stress then
		-- In Upper Sorbian a word beginning with "naj" gets the stress mark after that prefix instead.
		if rfind(txt, "^naj") and lang == "hsb" then
			txt = rsub(txt, "^naj", "najˈ")
			txt = rsub(txt, "ˈ%.", "ˈ")
		else
			txt = "ˈ" .. txt
		end
	end
	
	-- Mark prepositions with a trailing "$"; multiword() uses and then removes this marker.
	if data.is_prep then
		txt = txt .. "$"
	end
	
	if lang == "hsb" then
		tsub("ć", "t͡sʲ")
		tsub("r", "ʀ")
		--Unstressed <ó> and <e>.
		-- The capitals O and E are placeholders that multiword() later expands into variant pronunciations.
		tsub("%.(*)ʊ", "%1O")
		tsub("%.(*)ɪ", "%1E")
	else
		tsub("ć", "t͡ɕ")
		-- silent <j>
		tsub("()ji", "%1i")
		tsub("ij", "i")
		tsub("h", "H") --stand-in for optional /h/
		tsub("HH", "h")
		tsub("wʲ", "vʲ")
	end
	
	-- Drop the placeholder that stood for a mute <w>.
	tsub("ʍ", "")

	return txt, hyph
end

-- Returns rhyme from a transcription.
-- `pron` is the IPA string; `lang` is accepted but not used in the visible body. The result is nil for multiword
-- terms, otherwise a table with the fields `rhyme` and `num_syl`.
-- NOTE(review): the patterns in this function are empty or truncated in this copy (they look like stripped
-- character classes), so the exact matching behaviour cannot be read off the code -- confirm against the
-- canonical source.
local function do_rhyme(pron, lang)
	-- No rhymes for multiword terms.
	-- NOTE(review): with an empty pattern this test presumably succeeds for every input, making the function
	-- always return nil as written.
	if find(pron, "") then
		return nil
	end
	local V = "aɛiɔɪuʊɨ"
	-- Start from everything after the last primary-stress mark, then apply two further clean-up substitutions.
	local rhyme = rsub(rsub(pron:gsub("^.*ˈ", ""), ("^-()"):format(V, V), "%1"), "", "")
	-- `num_syl` is a one-element list holding the number of matches counted by rsubn() over `pron`.
	local num_syl = { select(2, rsubn(pron, (""):format(V), "")) }
	return {
		rhyme = rhyme,
		num_syl = num_syl
	}
end

--[[
Handles a single input, returning a table of transcriptions. Returns also a string of hyphenation and a table of rhymes
if it is a single-word term.

As the code stands, the first return value is a list of items of the form {pron = ..., a = ..., norhyme = ...}
(or a single {phonetic = ...} item for "raw:" input, in which case no hyphenation is returned) and the second is the
hyphenation; no rhyme table is returned from here.

NOTE(review): several patterns and index expressions in this function are empty or truncated in this copy (they
look like stripped bracketed text); restore them from the canonical source before relying on the details.
--]]
local function multiword(term, lang)
	-- Raw input is passed through untouched (minus the "raw:" prefix) as a phonetic transcription.
	if term:find("^raw:%$") then
		return {{ phonetic = term:gsub("^raw:", "") }}
	end
	local ipa, hyph
	-- A comma becomes a standalone "|" word, which is treated as a prosodic boundary below.
	term = rsub(term, "%s*,%s*", " | ")
	if term:find(" ") then
		-- TODO: repeated
		-- NOTE(review): declared without `local` (becomes a global) and, as written, returns its argument
		-- unchanged; presumably a per-language lookup was lost, as in single_word().
		function lg(s)
			return s or s
		end

		-- Lua patterns (anchored with ^...$ when matched below) for words that count as prepositions.
		local prepositions = lg {
			hsb = {
				"bjeze?", "dla", "do", "ke?", "mjeze?", "na", "nade?", "pod?", "pode", "podłu", "pola", "porno", "pře?", "přee",
				"přećiwo", "při", "spod", "w?", "wo?", "wote", "z?", "zeza",
				"blisko", "dale", "nimo", "niže", "njedaloko", "spody", "srjedź", "wyše", "zady", "zboka", "zespody",
			}, 
			dsb = {
				"blisko", "bźez", "dalej", "dla", "do", "hob?", "hu", "ku?", "mimo", "mjazy", "nad?", "pa", "pod?", "pód?", "pódla", "pśed", "pśez", "pśi", "sa", "srjejź", "we?", "wob?", "wokoło", "wote?",
				"wót?", "wu", "za?", "ze"
			},
		}

		local ipaparts, hyphparts = {}, {}
		local contains_preps = false

		local words = rsplit(term, " +")
		for i, word in ipairs(words) do
			if word == "|" then
				-- prosodic boundary, from a comma
				table.insert(ipaparts, word)
				-- NOTE(review): `hyphparts` is a table, so the concatenation and the string assignment below
				-- cannot be what was intended; presumably the comma is meant to be appended to the last
				-- hyphenation part and the index expressions were lost -- confirm.
				if hyphparts then
					hyphparts = hyphparts .. ","
				else
					hyphparts = ","
				end
			else
				local is_prep = false
				for _, prep in ipairs(prepositions) do
					if (rfind(word, ("^%s$"):format(prep))) then
						is_prep = true
						break
					end
				end
				contains_preps = contains_preps or is_prep
				local wordipa, wordhyph = single_word {
					txt = word,
					lang = lang,
					is_prep = is_prep,
				}
				table.insert(ipaparts, wordipa)
				table.insert(hyphparts, wordhyph)
				if i < #words then
					-- A preposition is joined to the following word with a tie bar instead of a space.
					local separator = is_prep and "‿" or " "
					table.insert(ipaparts, separator)
					table.insert(hyphparts, separator)
				end
			end
		end

		ipa = table.concat(ipaparts)
		hyph = table.concat(hyphparts)

		-- Devoice the final consonant of a preposition (marked with "$" by single_word()) before a following
		-- voiceless consonant, then remove the "$" markers.
		local function assimilate_preps(str)
			local function assim(from, to, before)
				str = rsub(str, ("%s(%%$‿?)"):format(from, before), to .. "%1")
			end
			local T = "fptsɕkx"
			assim("d", "t", T)
			assim("v", "f", T)
			assim("z", "s", T)
			return rsub(str, "%$", "")
		end

		if contains_preps then
			ipa = assimilate_preps(ipa)
			-- Move stress before clitics (only non-syllabic ones in dsb) attached with tie bar. FIXME: We should be using # or similar
			-- at string boundaries to avoid the need to substitute twice.
			if lang == "dsb" then
				local C = ""
				ipa = rsub(ipa, ("()(%s+)‿()"):format(C), "%1%3%2‿")
				ipa = rsub(ipa, ("^(%s+)‿()"):format(C), "%2%1‿")
			else
				ipa = rsub(ipa, ("()(.+)‿()"), "%1%3%2‿")
				ipa = rsub(ipa, ("^(.+)‿()"), "%2%1‿")
			end
		end
	else
		ipa, hyph = single_word {
			txt = term,
			lang = lang,
			is_prep = false,
		}
	end

	-- Start with a single candidate; the calls below may fan it out into several variants.
	local result = {{
		pron = ipa,
		norhyme = false,
	}}

	-- Map over each element in `result`. If `from` is found in the element, replace the element with up to three
	-- elements, respectively replacing `from` with `to1` (with accent qualifiers `a1`), `to2` (with accent qualifiers
	-- `a2`) and `to3` (with accent qualifiers `a3`). If `to2` or `to3` are nil, no replacement is done for them.
	-- If `nr1` is true, this variant should not have rhymes generated; likewise for `nr2` and `nr3`.
	-- Returns true if `from` was found in at least one element.
	local function flatmap_and_sub_post(from, to1, a1, nr1, to2, a2, nr2, to3, a3, nr3)
		local any_change = false
		result = flatmap(result, function(item)
			if rfind(item.pron, from) then
				any_change = true
				local retval = {
					{
						pron = rsub(item.pron, from, to1),
						a = combine_qualifiers(item.a, a1),
						norhyme = item.norhyme or nr1,
					}
				}
				if to2 then
					table.insert(retval,
						{
							pron = rsub(item.pron, from, to2),
							a = combine_qualifiers(item.a, a2),
							norhyme = item.norhyme or nr2,
						}
					)
				end
				if to3 then
					table.insert(retval,
						{
							pron = rsub(item.pron, from, to3),
							a = combine_qualifiers(item.a, a3),
							norhyme = item.norhyme or nr3,
						}
					)
				end
				return retval
			else
				return {item}
			end
		end)
		
		return any_change
	end

	--In Lower Sorbian, add variant pronunciations of <ó>, <h> (unless we're using the respelling <hh>)
	--In Upper Sorbian, add variant pronunciations of unstressed <ě> and <ó>
	-- H, E and O are the placeholder symbols left in the transcription by single_word().
	if lang == "dsb" then
		flatmap_and_sub_post("ʊ", "ɛ", {}, false, "ɨ", {}, false, "ʊ", {"dated"}, false)
		flatmap_and_sub_post("H", "", {}, false, "h", {"less common"}, false)
	else
		flatmap_and_sub_post("E", "ɪ", {}, false, "ɛ", {"less common"}, false)
		flatmap_and_sub_post("O", "ʊ", {}, false, "ɔ", {"less common"}, false)
	end

	return result, hyph
end

-- Given a single substitution spec, `to`, figure out the corresponding value of `from` used in a complete
-- substitution spec. `pagename` is the name of the page, either the actual one or taken from the `pagename` param.
-- `anchor_begin`, if set, indicates that the match must be to the beginning of a word (it was preceded by ~).
-- `anchor_end`, if set, indicates that the match must be to the end of a word (it was followed by ~). If there is a
-- second return value, it indicates that the `from` is a Lua pattern and doesn't need to be pattern-escaped.
-- Raises an error if the derived pattern does not match `pagename`, or if the matched text is identical to `to`
-- (so that the substitution would have no effect).
-- NOTE(review): most patterns, and the keys of the `chars` table, are empty or truncated in this copy (they look
-- like stripped character classes / bracketed keys); restore them from the canonical source before relying on
-- the details.
local function convert_single_substitution_to_original(to, pagename, anchor_begin, anchor_end)
	-- Replace specially-handled characters with a class matching the character and possible replacements.
	local escaped_from = to
	-- If the entire single substitution spec is one of the special stress-controlling symbols (*),
	-- place it at the very beginning of the pagename. We do this by returning a Lua pattern that matches the
	-- beginning of the string, using the %f notation. (%f means a transition from %z, or the NUL character, to
	-- not-%z; strings are treated for this purpose as if they begin and end with a NUL character.)
	if escaped_from:find("^+$") then
		return "%f", true
	end
	escaped_from = escaped_from:gsub("", "")
	-- Call the equivalent of pattern_escape() in ] but leave out +, *, ^ and -, which we match below.
	local chars = {
		 = "%z",  = "%$",  = "%%",  = "%(",  = "%)",
		 = "%.",  = "%?",  = "%"] = "%]",
	}
	escaped_from = escaped_from:gsub("]", chars)
	-- A special stress-controlling symbol (&, &&, +, etc.) that occurs in a single substitution expression but
	-- is not the entire expression matches the beginning of a word. (Contrast the case above when the symbol stands
	-- alone and matches the beginning of the entire term.) This makes it possible e.g. with an expression like
	-- ] to write the substitution spec  and have it work; it matches
	-- R at the beginning of a word and converts it to &R, which makes ] have optional antepenultimate
	-- stress. This must precede other replacements esp. k -> +, otherwise the + will wrongly get matched.
	escaped_from = escaped_from:gsub("+", "%%f")
	-- A hyphen can match against a hyphen or nothing in the original. We have to take into account escaping the hyphen
	-- in the from side, adding a % sign in the replacement to escape the hyphen in the later rmatch, and escaping the
	-- percent sign in the replacement.
	escaped_from = escaped_from:gsub("%-", "%%-?")
	-- A space can match against space or hyphen in the original, so e.g. ] can use  to respell it as
	-- <agar agar>.
	escaped_from = escaped_from:gsub(" ", "")
	-- Wrap in a capture so that rmatch() below returns the matched stretch of the pagename.
	escaped_from = "(" .. escaped_from .. ")"
	if anchor_begin then
		escaped_from = "%f" .. escaped_from
	end
	if anchor_end then
		escaped_from = escaped_from .. "%f"
	end
	local match = rmatch(pagename, escaped_from)
	if match then
		if match == to then
			error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
				format(to, pagename))
		end
		return match
	end
	error(("Single substitution spec '%s' couldn't be matched to pagename '%s' (escaped_from: %s"):format(
		to, pagename, mw.dumpObject(escaped_from)))
end


-- Apply a square-bracketed substitution spec to the page name and return the resulting respelling.
-- `respelling` is the whole spec including its enclosing square brackets; inside, substitutions are separated by
-- commas. Each substitution is either `from:to` or a bare `to`, in which case the text to replace is derived from
-- `pagename` by convert_single_substitution_to_original(). A leading ~ anchors the match at the beginning and a
-- trailing ~ (on the `from` part, or on a bare `to`) anchors it at the end. Substitutions are applied in order,
-- each to the result of the previous one. `parse_err` is called with a message when a substitution matches
-- nothing or matches more than once; in that case the substitution is not applied.
local function apply_substitution_spec(respelling, pagename, parse_err)
	-- Strip the enclosing square brackets and split the contents into the individual substitutions.
	local subs = split_on_comma(rmatch(respelling, "^%[(.*)%]$"))
	respelling = pagename
	for _, sub in ipairs(subs) do
		if sub:find("%^") or sub:find("%$") then
			-- Changed to use ~ at beginning or end to avoid clashing with other use of ^; catch old uses
			track("old-anchor-symbols")
		end
		local from, escaped_from, to, escaped_to, anchor_begin, anchor_end
		if sub:find("^~") then
			-- anchor at beginning
			sub = rmatch(sub, "^~(.*)$")
			anchor_begin = true
		end
		local already_escaped
		if sub:find(":") then
			-- Explicit from:to form; split at the first colon.
			from, to = rmatch(sub, "^(.-):(.*)$")
			if from:find("~$") then
				-- anchor at end
				from = rmatch(from, "^(.*)~$")
				anchor_end = true
			end
		else
			if sub:find("~$") then
				-- anchor at end
				sub = rmatch(sub, "^(.*)~$")
				anchor_end = true
			end
			to = sub
			from, already_escaped = convert_single_substitution_to_original(to, pagename, anchor_begin, anchor_end)
		end
		if from then
			-- `from` is literal text unless the helper reported that it already is a Lua pattern.
			escaped_from = already_escaped and from or m_str_utils.pattern_escape(from)
			if anchor_begin then
				escaped_from = "%f" .. escaped_from
			end
			if anchor_end then
				escaped_from = escaped_from .. "%f"
			end
			escaped_to = m_str_utils.replacement_escape(to)
			local subbed_respelling, nsubs = rsubn(respelling, escaped_from, escaped_to)
			if nsubs == 0 then
				parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format(
					from, to, respelling))
			elseif nsubs > 1 then
				parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " ..
					"more context"):format(from, to, respelling))
			else
				respelling = subbed_respelling
			end
		end
	end

	return respelling
end


-- This handles all the magic characters <*>, <^>, <&>, <+>, <.>, <#>, as well as square-bracketed substitution
-- specs. `term` is the respelling as typed by the user, `pagename` the page name it may refer to, and `paramname`
-- the parameter it came from (used only for error messages from the substitution parser). Returns exactly one
-- value, the expanded respelling; input that uses none of the shorthand notations is returned unchanged. Raises an
-- error when a prefix/suffix given with the dot notation does not occur at the start/end of `pagename`.
local function normalise_input(term, pagename, paramname)
	-- Check that `str` matches the pattern `reg` instantiated with the affix `af`, then insert the syllable
	-- divider described by `repl`. `err_msg` is the verb ("start"/"end") used in the error message.
	local function check_af(str, af, reg, repl, err_msg)
		reg = reg:format(af)
		if not rfind(str, reg) then
			error(("The word %s does not %s with %s"):format(str, err_msg, af))
		end
		-- Parenthesized so that only the string, not gsub's substitution count, is returned.
		return (str:gsub(reg, repl))
	end

	local function check_pref(str, pref) return check_af(str, pref, "^(%s)", "%1.", "start") end
	local function check_suf(str, suf) return check_af(str, suf, "(%s)$", ".%1", "end") end

	-- A term wholly enclosed in square brackets is a substitution spec applied to the pagename.
	if term:find("^%[.*%]$") then
		local function parse_err(msg)
			-- Don't call make_parse_err() until we actually need to throw an error, to avoid unnecessarily loading
			-- the parse utilities module.
			require(parse_utilities_module).make_parse_err(paramname)(msg)
		end
		return apply_substitution_spec(term, pagename, parse_err)
	end
	if term == "#" then
		-- The pound sign stands simply for {{PAGENAME}}.
		return pagename
	elseif (term == "+") or term:find("^%^+$") or term:find("^&+$") or (term == "*") then
		-- Inputs that are just '+', '*', '^', '^^', '&', '&&', etc. are treated as
		-- if they contained the pagename with those symbols preceding it.
		return term .. pagename
	-- Handle syntax like <po.>, <.ka> and <po..ka>. This allows to not respell
	-- the entire word when all is needed is to specify syllabification of a prefix
	-- and/or a suffix.
	elseif term:find(".+%.$") then
		return check_pref(pagename, term:sub(1, -2))
	elseif term:find("^%..+") then
		return check_suf(pagename, term:sub(2))
	elseif term:find(".+%.%..+") then
		-- Everything before ".." is the prefix, everything after it the suffix.
		local pref = term:gsub("%.%..+", "")
		local suf = term:gsub(".+%.%.", "")
		return check_suf(check_pref(pagename, pref), suf)
	end

	return term

end

-- Turn the parsed respellings in `terms` into the tables handed over to the IPA module. `pagename` is the page
-- name used to expand shorthand respellings, `paramname` the parameter the respellings came from, and `lang` the
-- language code passed on to multiword() and do_rhyme(). Returns a table with three lists: `pron_list`
-- (pronunciation objects with qualifiers, labels and references), `hyph_list` (distinct hyphenations that
-- reproduce the pagename) and `rhyme_list`.
function export.get_lect_pron_info(terms, pagename, paramname, lang)
	local pron_list, rhyme_list, hyph_list = {}, {}, {}

	-- Phonemic transcriptions are wrapped in slashes.
	local brackets = "/%s/"

	for _, term in ipairs(terms) do
		-- Expand magic symbols, then obtain the transcriptions and the hyphenation for this respelling.
		local respelling = normalise_input(term.respelling, pagename, paramname)
		local prons, hyph = multiword(respelling, lang)

		for i, pron in ipairs(prons) do
			-- References are attached to the first variant only.
			local refs = i == 1 and term.refs or nil
			if not pron.phonetic then
				local bracketed_pron = brackets:format(pron.pron)
				local labels = combine_qualifiers(pron.a, term.a)
				table.insert(pron_list, {
					pron = bracketed_pron,
					pron_with_syldivs = bracketed_pron,
					q = term.q,
					qq = term.qq,
					a = labels,
					aa = term.aa,
					refs = refs,
				})
				if not pron.norhyme then
					table.insert(rhyme_list, do_rhyme(pron.pron, lang))
				end
			else
				-- Raw phonetic input is used as given, without slashes and without a rhyme.
				table.insert(pron_list, {
					pron = pron.phonetic,
					pron_with_syldivs = pron.phonetic,
					q = term.q,
					qq = term.qq,
					a = term.a,
					aa = term.aa,
					refs = refs,
				})
			end
		end

		-- Keep the hyphenation only if, with the syllable dots removed and tie bars turned back into spaces, it
		-- spells the pagename. FIXME: This should be smarter in the presence of hyphens in the lemma.
		if hyph then
			local respelled = hyph:gsub("%.", ""):gsub("‿", " ")
			if respelled == pagename then
				m_table.insertIfNot(hyph_list, hyph)
			end
		end
	end

	return {
		pron_list = pron_list,
		hyph_list = hyph_list,
		rhyme_list = rhyme_list,
	}
end

-- Template entry point. Reads the language code from the invoking frame's `lang` argument and the user-supplied
-- parameters from the parent frame, then assembles the pronunciation section as a single string: an IPA line,
-- followed by optional audio, rhyme, syllabification and homophone lines, each prefixed with `indent`.
--
-- NOTE(review): several bracketed pieces of this function (table keys, indices, character classes inside
-- patterns, and category link text) appear to be missing from this copy of the file; the affected spots are
-- flagged below and should be restored from the canonical module before this code is relied on.
function export.show(frame)
	local ilang = frame.args.lang

	-- Parameter specification handed to Module:parameters.
	-- NOTE(review): the keys of the entries below are missing in this copy; only the alias targets ("hyphs",
	-- "rhymes", "audios", "homophones") are still visible.
	local process_args = {
		 = {},
		 = {},  = { alias_of = "hyphs" },
		 = {},  = { alias_of = "rhymes" },
		 = {},  = { alias_of = "audios" },
		 = {},  = { alias_of = "homophones" },
		 = {}, -- for debugging or demonstration only
		 = {},
	}

	local args = require("Module:parameters").process(frame:getParent().args, process_args)
	local lang = require("Module:languages").getByCode(ilang, true, "allow etym")

	-- NOTE(review): as written, `args` is the whole processed-argument table; presumably an index was lost here and
	-- "#" (which normalise_input() treats as the pagename) was meant as the default respelling -- confirm.
	local termspec = args or "#"
	local terms = parse_respellings_with_modifiers(termspec, 1)
	local pagename = args.pagename or mw.loadData("Module:headword/data").pagename
	local indent = args.indent or "*"

	local pronobj = export.get_lect_pron_info(terms, pagename, 1, ilang)
	local hyph_list, rhyme_list = pronobj.hyph_list, pronobj.rhyme_list
	-- `hyph_automatic` records whether the syllabifications were generated rather than given by the user;
	-- `do_hyph` controls whether a syllabification line is emitted at all.
	local hyph_automatic = true
	local do_hyph

	if args.hyphs then
		hyph_automatic = false
		if args.hyphs == "-" then
			-- An explicit "-" suppresses the syllabification line.
			do_hyph = false
		else
			-- Manually specified syllabifications replace the automatically generated ones.
			hyph_list = split_on_comma(args.hyphs)
			do_hyph = true
		end
	else
		do_hyph = true
	end

	if args.rhymes then
		if args.rhymes == "-" then
			-- An explicit "-" suppresses rhymes.
			rhyme_list = {}
		elseif args.rhymes ~= "+" then
			-- Anything other than "+" (which keeps the automatic rhymes) is a comma-separated list of manual
			-- rhymes, each of which must have the form RHYME/NUM_SYL.
			rhyme_list = {}
			for _, rhyme in ipairs(split_on_comma(args.rhymes)) do
				if rfind(rhyme, ".+/.+") then
					table.insert(rhyme_list, {
						rhyme = rsub(rhyme, "/.+", ""),
						num_syl = { tonumber(rsub(rhyme, ".+/", "")) },
					})
				else
					error(("The manual rhyme %s did not specify syllable number as RHYME/NUM_SYL"):format(rhyme))
				end
			end
		end
	end

	-- This deals with duplicate values in rhymes.
	-- NOTE(review): the index expressions in this section (on `indices`, `temp_rhyme_list` and
	-- `rhymeobj.num_syl`) appear to be missing in this copy; presumably entries were keyed by rhyme and their
	-- syllable counts merged -- confirm against the canonical module.
	if #rhyme_list > 0 then
		local temp_rhyme_list = {}
		local indices = {}
		for _, rhymeobj in ipairs(rhyme_list) do
			local index = indices
			if index == nil then
				table.insert(temp_rhyme_list, rhymeobj)
				indices = #temp_rhyme_list
			else
				local different_num_syl = true
				for _, ns in ipairs(temp_rhyme_list.num_syl) do
					if ns == rhymeobj.num_syl then
						different_num_syl = false
						break
					end
				end
				if different_num_syl then
					table.insert(temp_rhyme_list.num_syl, rhymeobj.num_syl)
				end
			end
		end
		rhyme_list = temp_rhyme_list
	end

	local m_IPA_format = require("Module:IPA").format_IPA_full
	-- Output fragments are accumulated in `parts` and concatenated at the end.
	local parts = {}
	local function ins(text)
		table.insert(parts, text)
	end

	-- NOTE(review): `do_collapse` is not read anywhere in this function as shown.
	local do_collapse = false

	-- NOTE(review): get_lect_pron_info() always returns a table for `pron_list`, so this check (and the identical
	-- ones further down) is always true as written.
	if pronobj.pron_list then
		ins(indent .. m_IPA_format { lang = lang, items = pronobj.pron_list })
	end

	-- NOTE(review): `em_length` is never assigned in this function as shown, so the "{width}" substitution at the
	-- end does not run.
	local em_length

	if args.audios then
		local format_audio = require("Module:audio").format_audio
		local audio_objs = parse_audio(lang, args.audios, pagename, "audios")
		local num_audios = #audio_objs
		for i, audio_obj in ipairs(audio_objs) do
			-- With more than one audio file, give uncaptioned ones a numbered default caption.
			if num_audios > 1 and not audio_obj.caption then
				audio_obj.caption = "Audio " .. i
			end
			ins("\n" .. indent .. " " .. format_audio(audio_obj))
		end
	end

	if #rhyme_list > 0 then
		ins("\n" .. indent .. " " .. require("Module:rhymes").format_rhymes { lang = lang, rhymes = rhyme_list })
	end

	if do_hyph then
		ins("\n" .. indent .. " ")
		if #hyph_list > 0 then
			local hyphs = {}
			local seen_num_pron_syls
			if pronobj.pron_list then
				-- Compute the number of syllables for each pronunciation, for use below.
				seen_num_pron_syls = {}
				-- Note that the loop variable below shadows the outer `pronobj` inside the loop body.
				for _, pronobj in ipairs(pronobj.pron_list) do
					local pron = pronobj.pron_with_syldivs
					-- Convert prosodic boundaries to regular spaces; ignore stress markers at the beginning
					-- of a word; and then count occurrences of spaces, syllable dividers and stress marks.
					-- NOTE(review): the patterns below look truncated in this copy (character classes missing).
					local num_pron_syls = 1 + ulen(rsub(rsub(rsub(pron:gsub(" | ", " "), " ", " "), "^?", ""),
						"", ""))
					m_table.insertIfNot(seen_num_pron_syls, {pron = pron, nsyl = num_pron_syls})
				end
			end
			for i, hyph in ipairs(hyph_list) do
				-- NOTE(review): as written this reassigns `hyphs` on every iteration; presumably an index on
				-- `hyphs` (here and in the insert below) was lost, as was the gmatch() pattern -- confirm.
				hyphs = { hyph = {} }
				for syl in hyph:gmatch("+") do
					table.insert(hyphs.hyph, syl)
				end
				if pronobj.pron_list then
					-- Match each syllabification against the syllable counts of all pronunciations, since we don't have
					-- any alignment between the two. Only complain if the syllable count of the syllabification doesn't
					-- match the syllable count of any pronunciation.
					local num_hyph_syls = 1 + #(hyph:gsub("", ""))
					local matched = false
					for _, num_pron_syls in ipairs(seen_num_pron_syls) do
						if num_pron_syls.nsyl == num_hyph_syls then
							matched = true
							break
						end
					end
					if not matched then
						-- Log the mismatch, listing every pronunciation together with its syllable count.
						local prons_and_sylcounts = {}
						for _, num_pron_syls in ipairs(seen_num_pron_syls) do
							table.insert(prons_and_sylcounts, ("%s:%s"):format(num_pron_syls.pron, num_pron_syls.nsyl))
						end
						mw.log(("Syllable count %s for syllabification '%s' doesn't match pronunciation syllable count(s) %s for '%s'"):format(
							num_hyph_syls, hyph, table.concat(prons_and_sylcounts, ","), pagename))
						-- Only pages whose namespace text is empty get the extra output below.
						-- NOTE(review): the format string looks truncated in this copy; presumably it was a
						-- tracking-category link -- confirm.
						if mw.title.getCurrentTitle().nsText == "" then
							ins(("]"):format(
								lang:getFullName(), ilang, hyph_automatic and "automatic" or "manual"))
						end
					end
				end		
			end
			ins(require("Module:hyphenation").format_hyphenations {
				lang = lang, hyphs = hyphs, caption = "Syllabification"
			})
		else
			-- No syllabification is available: emit a placeholder line.
			-- NOTE(review): the contents of the <small> tag and the format string below look truncated in this
			-- copy.
			ins("Syllabification: <small></small>")
			if mw.title.getCurrentTitle().nsText == "" then
				ins(("]"):format(
					lang:getFullName(), ilang))
			end
		end
	end

	if args.homophones then
		local homophone_list = parse_homophones(args.homophones, "homophones")
		ins("\n" .. indent .. " " .. require("Module:homophones").format_homophones {
			lang = lang,
			homophones = homophone_list,
		})
	end

	local ret = table.concat(parts)
	if em_length then
		ret = m_str_utils.gsub(ret, "{width}", em_length)
	end

	return ret
end

-- Return the module table holding the exported functions.
return export
Module:wen-pron

Wikious

Boobota

Sagapedia