--[=====[
TODO:
* Function for pre-consonantal and final obstruent devoicing of d, g, b, s
* Function for syllable-final uvularisation of r (ɐ̯)
* Function to reduce geminates
* List of environments which trigger the palatalisation of /x/ (liquids + non-low front vowels)
* Function to determine if H is word initial (> /h/) or non-initial (> 0)
* Function to put stress in general, function to check for prefixes and realign stress accordingly
* Function to convert ⟨e⟩ in unstressed syllables to ə > Function to reduce -ər to -r + "devoicing"
* Function to convert ⟨c⟩ before front vowels to /t͡s/
* Function to convert final ⟨-ehe⟩ as /eː/ (verbs only)
* Function to mark whether the word is Germanic or Romanic - makes a lot of exceptions
predictable/automatable, e.g. /ɪ, ɔ, ʊ/ > /i, o, u/ for short vowels in closed syllables,
penultimate or final stress
* Inseparable prefixes do not take stress > Stress on the 2nd syllable
** A complete list could be compiled and the process automated, instead of making the user enter the stress by hand
* Rules to determine when to make vowels short vs. long. These are usually predictable,
but there are some exceptions; use a macron (e.g. ā ē ī) to force a long vowel,
and a breve (e.g. ă ĕ ĭ) to force a short vowel. Below are the general rules:
- vowels are long in an open syllable (no final consonant, e.g. bēten, hōlen)
- vowels are also long before a single consonant (e.g. kām), as well as before
a silent ⟨h⟩ (e.g. gēhen, zēhn)
- vowels are short before a double (geminate) consonant (e.g. Wăsser, Mŭtter)
- however, vowels before two unique consonants are not predictable (they can either be
long, e.g. Mōnd, or short, e.g. Mŭnd)
- note that a long ⟨i⟩ is usually written as ⟨ie⟩, except word-initially (e.g. Īgel)
and the exception of short ⟨ie⟩ in vier and its derivatives (e.g. vierzehn)
- vowels are usually long in a stressed final syllable before a single
consonant (but with possible exceptions, e.g. '-eg')
- unstressed syllables do not have long vowels
* Stress is usually on the first syllable, but there are some exceptions:
- syllables with secondary stress are treated as if stressed
- syllables directly following a known prefix (aus-, zu-, über-, ge-, etc.)
should be treated as if stressed, whether they are actually stressed or not
- when there's an explicit slash to separate compounds, all parts should be
treated as if they were separate words for vowel-length purposes (e.g.
'-tag' in 'Reichs/tag' should be long)
- what about other unstressed syllables?
--]=====]
local export = {}
local u = require("Module:string/char")
local strutl = require("Module:string utilities")
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ucomp = mw.ustring.toNFC
local udecomp = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len
-- Single-result wrapper around rsubn(): performs the same substitution but
-- hands back only the resulting string, dropping the replacement count.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, foo, bar))
end
-- Keep applying the same substitution to its own output until a pass leaves
-- the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		-- parentheses keep only the substituted string, not the match count
		term = (rsubn(previous, foo, bar))
	until term == previous
	return term
end
-- "If not empty": map the empty string to nil and pass every other value
-- through unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end
-- Combining diacritics, built from their Unicode code points via u().
local AC = u(0x0301) -- acute accent
local GR = u(0x0300) -- grave accent
local MA = u(0x0304) -- macron
local BR = u(0x0306) -- breve
local DI = u(0x0308) -- diaeresis
-- Concatenated groups of the accents above.
local stress_accent = AC .. GR
local length_accent = MA .. BR
local all_accents = stress_accent .. length_accent
-- Plain strings listing vowel letters; `vowel` also includes all accents.
local front_vowel = "eiyäöü"
local back_vowel = "aou"
local vowel = front_vowel .. back_vowel .. all_accents
-- NOTE(review): the next three values are empty strings in this copy, yet they
-- are concatenated into patterns further down; presumably they held bracketed
-- character classes that were lost -- restore from the original module.
local vowel_stressed = ""
local cons_c = ""
local cons_or_boundary_c = "" -- includes ⁀ -- I have added /l/ & /r/ as a stopgap against Brücke -> /ˈprʏkə/, but this may need a new name.
local stress_marks = "ˈˌ"
-- NOTE(review): both constructors below have values but no keys (`= "p"`),
-- which is not valid Lua; presumably bracketed keys were stripped -- TODO restore.
local devoice_conv = { = "p", = "t", = "k" }
local umlaut_conv = { = "ä", = "ö", = "ü" }
-- Lookup table consulted by parse_table() to turn respelled letters into IPA.
-- A value is either a single IPA string, a list of IPA strings that are
-- emitted one after another (e.g. { "k", "s" }), or a nested table grouping
-- several such values.
-- NOTE(review): every key in this constructor is missing in this copy (each
-- entry reads `= value`), so the table does not parse as written; presumably
-- bracketed keys were stripped -- restore them from the original module.
local sequences = {
= {
= "aɪ̯";
= "aʊ̯";
= "aɪ̯";
= "a";
= "aː";
= "ɐ";
};
= {
= "ɔʏ̯";
= "ɛ";
= "ɛː";
= "ɛ";
};
= "b";
= {
= { "k", "s" }; -- FIXME: should we have this
= "ç"; -- front allophone (ich-laut)
= "k";
= "t͡s";
};
= {
= "d͡ʒ";
= "t";
= "d";
};
= {
= "aɪ̯";
= "ɔʏ̯";
= "aɪ̯";
= "ɛ";
= "eː";
= "ə";
};
= "f";
= "ɡ";
= "h";
= {
= "iː";
= "iː";
= "ɪ";
= "iː";
= "ɪ";
};
= "j";
= {
= { "k", "s" };
= "χ"; -- back allophone of /ç/ (ach-laut)
= "k";
};
= "l";
= "m";
= {
= { "ŋ", "k" };
= "ŋ";
= "n";
};
= {
= "ɔ";
= "oː";
= "ɔ";
};
= {
= "œ";
= "œː"; -- sometimes /øː/
= "œ";
};
= {
= "p͡f";
= "f";
= "p";
};
= {
= { "k", "ʋ" }; -- only before another vowel
= "k";
};
= "r"; -- phonetically syllable-initially; /ɐ/ syllable-finally
= {
= "ʃ";
= "ʃ";
= { "ʃ", "p" };
= { "ʃ", "t" };
= "s";
};
= {
= "t͡ʃ";
= "t͡s";
= "t";
};
= {
= "ʊ";
= "uː";
= "ʊ";
};
= {
= "ʏ";
= "yː";
= "ʏ";
};
= "f";
= "ʋ";
= { "k", "s" }; -- XXX
= {
= "ʏ";
= "yː";
= "ʏ";
};
= "z"; -- respelled from s
= "ˈ"; -- FIXME
= "ˌ";
-- = "ˈ";
-- = "ˌ";
}
-- Normalise the input text before respelling. In order, this:
--   1. applies any "from:to" substitution specs (comma-separated),
--   2. lowercases the text,
--   3. inserts ⁀ word-boundary markers around words, commas and compound
--      boundaries, and at both ends of the whole entry,
--   4. decomposes accented characters and reorders/recomposes the combining
--      marks (umlauts, stress accents, length accents).
-- Returns the normalised string.
-- Raises an error if a substitution spec is not of the form "from:to", or if a
-- spec does not change the text.
-- NOTE(review): several pattern strings below ("^%$", "%f", "+", "(*)()",
-- "()", "()()") look truncated in this copy -- presumably bracketed character
-- classes were stripped. They are kept as found; restore them from the
-- original module before relying on this function.
local function normalise(text)
	--[[
	if not text or text == "+" then
		text = pagename
	end
	--]]
	-- handle the string substitution syntax
	if rfind(text, "^%$") then
		local subs = rsplit(rmatch(text, "^%$"), ",")
		for _, sub in ipairs(subs) do
			local fromto = rsplit(sub, ":")
			if #fromto ~= 2 then
				error("Bad substitution spec " .. sub .. " in {{de-IPA}}")
			end
			-- take the two halves of the spec; assigning the list itself to
			-- both names made every later string operation on them fail
			local from, to = fromto[1], fromto[2]
			if rfind(from, "^~") then
				-- formerly, ~ was required to match within a word
				from = rmatch(from, "^~(.*)$")
			end
			local newtext
			if rfind(from, "^%^") then
				-- whole-word match
				from = rmatch(from, "^%^(.*)$")
				newtext = rsub(text, "%f" .. strutl.pattern_escape(from) .. "%f", to)
			else
				newtext = rsub(text, strutl.pattern_escape(from), to)
			end
			-- a spec that changes nothing is treated as a user error
			if newtext == text then
				error("Substitution spec " .. sub .. " didn't match respelling '" .. text .. "'")
			end
			text = newtext
		end
	end
	-- make text lowercase
	text = ulower(text)
	-- simplify checking for word boundary markers by adding ⁀ at
	-- the beginning and end of all words then removing them at the end
	text = rsub(text, "%s*,%s*", "⁀⁀ | ⁀⁀") -- mark between commas and treat it as a pause
	text = rsub(text, "%s+", "⁀ ⁀") -- mark between spaces
	text = rsub(text, "+", "⁀-⁀") -- mark between compound word boundaries including hyphens
	text = "⁀⁀" .. text .. "⁀⁀" -- mark at the start and end of the whole entry
	-- handle combining accents
	text = udecomp(text) -- decompose accented characters into their base and combining parts
	text = rsub(text, "(*)()", "%2%1") -- avoid confusion of wrongly-ordered umlauts/e's and other accents
	text = rsub(text, "()", umlaut_conv) -- recompose umlauted vowels
	text = rsub(text, "()()", "%2%1") -- put length accents after stress accents
	return text
end
-- Add a default stress accent when the text carries none.
-- If `text` already contains an acute accent it is returned unchanged;
-- otherwise an acute accent is inserted after each ⁀ marker plus whatever the
-- `cons_c` pattern matches behind it.
-- `orig` and `pos` are accepted for signature parity with the other pipeline
-- steps and are not used here.
-- Always returns exactly one value (the string). Previously the substitution
-- branch returned rsubn()'s replacement count as a second value.
-- NOTE(review): `cons_c` is an empty string in this copy, so the pattern below
-- is effectively "⁀(*)"; presumably a consonant class is meant -- TODO confirm.
local function handle_stress(text, orig, pos)
	if rfind(text, AC) then -- FIXME later
		return text
	end
	return rsub(text, "⁀(" .. cons_c .. "*)", "⁀%1" .. AC)
end
-- respell the text more phonetically to allow easier conversion to IPA.
-- Applies a fixed sequence of substitutions to `text` (q/c/s/z handling,
-- obstruent devoicing, predictable vowel-length marks, h handling) and finally
-- moves captured stress accents in front of the preceding run of %w characters.
-- `orig` and `pos` are accepted but not used in this function.
-- NOTE(review): most patterns below contain empty groups such as "()" or
-- "(?", and `cons_c` / `vowel_stressed` / `cons_or_boundary_c` are empty
-- strings in this copy; presumably bracketed character classes were stripped.
-- The per-line comments describe the intent, not what the truncated patterns
-- currently match -- restore the patterns from the original module.
local function respell(text, orig, pos)
-- handle ⟨q⟩
text = rsub(text, "q(?" .. cons_c .. ")", "k%1") -- convert ⟨q⟩ before a single or no vowel
-- handle ⟨c⟩/⟨s⟩/⟨z⟩
text = rsub(text, "c()", "k%1") -- convert ⟨c⟩ (single letter) before non-front vowels to /k/
text = rsub(text, "()ch", "%1kh") -- convert ⟨ch⟩ after back vowels to /χ/
text = rsub(text, "z", "c") -- convert ⟨z⟩ to /t͡s/
text = rsub(text, "s()", "z%1") -- ⟨s⟩ is voiced as z before vowels
-- handle consonant devoicing
-- NOTE(review): the callback concatenates the `devoice_conv` table itself with
-- c2; presumably an index by c1 was lost here -- TODO confirm.
text = rsub(text, "()(" .. cons_or_boundary_c .. ")", -- devoice syllable-final obstruents
function(c1, c2)
return devoice_conv .. c2
end)
-- handle predictable stressed vowel lengths; other cases must explicitly
-- be marked by the user or else the module will return an error
text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. ")", "%1" .. MA .. "%2") -- long vowel before consonant + vowel
text = rsub(text, "(" .. vowel_stressed .. ")⁀", "%1" .. MA .. "⁀") -- long vowel before a word boundary
text = rsub(text, "(" .. vowel_stressed .. ")(" .. cons_c .. ")%2", "%1" .. BR .. "%2%2") -- short vowel before a double consonant
-- handle pronounced ⟨h⟩ (FIXME)
-- NOTE(review): as written the replacement drops the h and inserts a macron
-- after the first capture, which does not match the trailing comment.
text = rsub(text, "()h()", "%1" .. MA .. "%2") -- ⟨h⟩ is pronounced /h/ in between vowels
-- shift stress accents before letter
return rsub(text, "(%w*)()", "%2%1")
end
-- convert letters to phonemes using the sequences table,
-- then return the phonemes as a concatenated string.
-- Walks `text` one character at a time (character, not byte, positions via
-- usub/ulen), appends the phone(s) chosen for each position to `phones`, and
-- advances by the number of characters consumed.
-- NOTE(review): several expressions below look truncated in this copy
-- (`sequences` with no index, `rmatch(cid, "")`, bare `value`); presumably
-- bracketed index expressions and character classes were stripped. The
-- comments describe the visible control flow only -- restore the missing
-- pieces from the original module.
local function parse_table(text)
-- phones: output list; i: current 1-based character index; n: length in characters
local phones, i, n = {}, 1, ulen(text)
while i <= n do
-- NOTE(review): declared inside the loop, so the flag is reset on every
-- iteration and is always false when tested further down; the "turn off
-- stress until end or next stressed vowel" comment suggests it was meant to
-- persist across iterations -- TODO confirm and hoist if so.
local is_stressed = false
local cid = usub(text, i, i) -- the single character at position i
-- NOTE(review): as written this is the whole table, never nil; presumably
-- a lookup keyed by cid.
local value = sequences
local phone, cidl
if value == nil then -- skip over invalid values
i = i + 1
elseif rmatch(cid, "") then -- check for stressed vowel
is_stressed = true
table.insert(phones, value)
i = i + 1
else -- process letters
local cid_next = usub(text, i + 1, i + 1) -- lookahead of one character
if rmatch(cid, "") then
cidl = 1 -- default character id length value
if cid_next == "h" or cid_next == cid or cid_next == MA then -- long vowel if following an 'h' or a double letter
phone = value
cidl = cidl + 1 -- also consume the h / doubled letter / macron
elseif cid_next == BR then
phone = value
cidl = cidl + 1 -- also consume the breve
else
-- look for a string key of `value` that matches the text at position i;
-- pairs() order is unspecified, so if several keys match, which one is
-- taken is not defined by this loop
local found = false
for seq, seq_phone in pairs(value) do
if type(seq) == "string" and usub(text, i, i + ulen(seq) - 1) == seq then
phone = seq_phone
cidl = cidl + ulen(seq) - 1
found = true
break
end
end
if not found then
if is_stressed then -- return error if vowel is stressed
error("Vowel length is ambiguous for the stressed vowel. Please specify vowel length.")
else
phone = value
end
end
end
is_stressed = false -- turn off stress until end or next stressed vowel
else
cidl = 1 -- default character id length value
-- NOTE(review): `or value` is truthy whenever value is a table, so as
-- written this first branch is always taken and the two branches after
-- it are never reached; presumably an index expression was lost.
if type(value) ~= "table" or value then
phone = value
elseif cid_next == cid then -- double consonants are treated as singular
phone = value
else -- otherwise go over table
local found = false
for seq, seq_phone in pairs(value) do
if type(seq) == "string" and usub(text, i, i + ulen(seq) - 1) == seq then
phone = seq_phone
cidl = ulen(seq)
found = true
break
end
end
if not found then
phone = value
end
end
end
-- a phone is either one string or a list of strings emitted in order
if type(phone) == "string" then
table.insert(phones, phone)
elseif type(phone) == "table" then
for _, p in ipairs(phone) do
table.insert(phones, p)
end
end
i = i + cidl -- skip every character consumed by this phone
end
end
-- concatenate the phonemes into a string
return table.concat(phones)
end
-- final phonemic substitutions: run one rsub() pass over the phoneme string
-- that rewrites "n" plus the captured context as "ŋ" plus that capture.
-- `orig` and `pos` are accepted but unused.
-- NOTE(review): the capture group in the pattern is empty in this copy;
-- presumably a character class for the following consonant was stripped.
local function phonemic(text, orig, pos)
	return rsub(text, "n()", "ŋ%1")
end
-- Entry point: convert a German respelling to an IPA string.
-- Accepts either direct arguments (text, orig, pos) or a single table with an
-- `args` field (as passed by a template invocation), in which case the text is
-- taken from the first positional argument and `orig` / `pos` from the named
-- arguments; empty strings are treated as absent.
-- When no text is supplied, the current page title is used.
-- The text then goes through normalise -> respell -> parse_table -> phonemic,
-- and hyphens and ⁀ word-boundary markers are stripped from the result.
function export.toIPA(text, orig, pos)
	if type(text) == "table" then
		local args = text.args
		-- read the first positional argument; passing the args table itself
		-- on as the text made normalise() fail on a non-string
		text, orig, pos = ine(args[1]), ine(args.orig), ine(args.pos)
	end
	text = text or mw.title.getCurrentTitle().text
	text = normalise(text)
	-- text = handle_stress(text, orig, pos)
	text = respell(text, orig, pos)
	text = parse_table(text)
	text = phonemic(text, orig, pos)
	-- remove hyphens and word-boundary markers (the pattern was empty before,
	-- which removed nothing)
	return rsub(text, "[⁀%-]", "")
end
return export