--[[
Author: AmazingJus
This module automatically generates the IPA transcription of Afrikaans terms
based on their grapheme structure, syllabification, affixation and etymology.
It is based on the phonological rules of Afrikaans and the grapheme-phoneme
correspondence of the language. The module processes the input text in several
steps:
1. It canonicalises the input text, decomposing accents and removing extraneous spaces.
2. It syllabifies the words, marking syllable boundaries and handling digraphs and trigraphs.
3. It applies affixes, marking prefixes and suffixes with special characters.
4. It assigns stress to syllables based on predefined rules.
5. It generates the IPA transcription by substituting graphemes with their phonetic equivalents.
Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]
local export = {}
local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str_util = require("Module:string utilities")
local tbl = require("Module:table")
local ipa = require("Module:IPA") -- IPA display module
-- tag a piece of text using this module's language and script objects;
-- `face` is passed through unchanged to Module:script utilities
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end
-- build a full link to `term` using this module's language and script objects;
-- `face` is passed through unchanged to Module:links
function export.link(term, face)
	local link_data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(link_data, face)
end
local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local sub = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch
-- single-result wrapper around rsubn(): the parentheses truncate the call to
-- its first return value (the substituted string) and drop the match count
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
-- keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that fixed point
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- list of accents, as combining code points
-- (canonicalise() decomposes the input, so accents occur as separate characters)
local grave = u(0x0300) -- combining grave
local acute = u(0x0301) -- combining acute
local circ = u(0x0302) -- combining circumflex
local dia = u(0x0308) -- combining diaeresis
local syll = "‧" -- syllable dot
-- code points reserved for automatically generated stress
local auto_grave = u(0xFFF0) -- automatic grave
local auto_acute = u(0xFFF1) -- automatic acute
-- raw character lists from which the pattern classes are built
local accent = grave .. acute .. auto_grave .. auto_acute .. circ .. dia
local stress_accent = grave .. acute .. auto_grave .. auto_acute
local vowel = "aeiouyAEIOUY"
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
-- boundary characters; "+", "-" and "." are %-escaped so that they stay
-- literal when this list is used as part of a pattern
local syll_bound = syll .. "#<>%+%-%."
-- put them into classes
-- Each class wraps one of the raw character lists above in a bracket
-- expression so it can be spliced into a pattern as a single item
-- (e.g. V .. "+" for a run of vowels). Left as empty strings, every pattern
-- built from them degenerates: "(" .. V .. ")" becomes the position capture
-- "()" and non_V .. "*" becomes a bare "*".
local A = "[" .. accent .. "]" -- all accents
local AS = "[" .. stress_accent .. "]" -- all stress accents
local V = "[" .. vowel .. "]" -- all vowels
local non_V = "[^" .. vowel .. "]" -- all non-vowels
local C = "[" .. cons .. "]" -- all consonants
local non_C = "[^" .. cons .. "]" -- all non-consonants
local CV = "[" .. cons .. vowel .. "]" -- all consonants and vowels
local S = "[" .. syll_bound .. "]" -- all syllable boundaries
local non_S = "[^" .. syll_bound .. "]" -- all non-syllable boundaries
-- list of valid trigraphs and digraphs, including diphthongs and long vowels,
-- each mapped to its IPA value
-- NOTE(review): every entry below starts with a bare "=", i.e. the grapheme
-- keys are missing from this copy of the file and the table constructor does
-- not parse as written. The keys cannot be derived from the values alone and
-- must be restored from the module's history.
-- The entries marked "temporary value" are placeholders; the commented-out
-- stage in export.toIPA describes replacing į, ů and ü with their actual
-- phonetic values afterwards.
local graphemes = {
= "ɑːɪ̯",
= "iʊ̯",
= "iʊ̯",
= "uɪ̯",
= "oːɪ̯",
= "ɑː",
= "ɑː",
= "aɪ̯",
= "œʊ̯",
= "ɪə̯",
= "əɪ̯",
= "iʊ̯",
= "į", -- temporary value
= "ů", -- temporary value
= "ɔɪ̯",
= "ʊə̯",
= "œʊ̯",
= "uɪ̯",
= "ü" -- temporary value
}
-- collect the grapheme keys and order them longest first, so that trigraphs
-- are tried before the digraphs they contain
local graphemes_sorted = {}
for grapheme in pairs(graphemes) do
	table.insert(graphemes_sorted, grapheme)
end
table.sort(graphemes_sorted, function(a, b)
	return len(a) > len(b)
end)
-- list of various grapheme sets
-- NOTE(review): the keys of both sub-tables and of the vowel entries are
-- missing from this copy of the file (each starts with a bare "="), so the
-- constructor does not parse as written. The commented-out stage in
-- export.toIPA reads sets.vowel_length, so the first sub-table is presumably
-- keyed "vowel_length" with one entry per vowel letter — TODO confirm against
-- the module's history.
local sets = {
= { -- long-short vowels
= {"a", "ɑː"},
= {"ɛ", "ɪə̯"},
= {"ə", "i"},
= {"ɔ", "ʊə̯"},
= {"œ", "y"}
},
= { -- voiced/voiceless consonants, as {voiced, voiceless} pairs
{"b", "p"},
{"d", "t"},
{"ʤ", "ʧ"},
{"ɡ", "k"},
{"v", "f"},
{"z", "s"},
{"ʒ", "ʃ"},
}
}
-- list of defined affixes
-- The sub-tables are keyed "pre" and "suf" because apply_affixes() iterates
-- affixes.pre and affixes.suf. Each entry is a table:
--   [1]          the affix text
--   restriction  optional pattern the remainder of the word must match
--                (only consulted for prefixes)
--   pos          optional pattern searched for in the part-of-speech string
-- NOTE(review): restriction = "^" matches every string, so it currently
-- restricts nothing; it looks like a truncated pattern — confirm the
-- intended character class for "be" and "ge".
local affixes = {
	-- prefixes
	pre = {
		{"aan"},
		{"agter"},
		{"be", restriction = "^"},
		{"deur"},
		{"er"},
		{"ge", restriction = "^"},
		{"her"},
		{"om"},
		{"ont"},
		{"onder"},
		{"uit"},
		{"van", pos = "d"},
		{"ver"},
		{"voor"}
	},
	-- suffixes
	suf = {
		{"agtig"},
		{"baar"},
		{"dom"},
		{"end"},
		{"heid"},
		{"lik"},
		{"loos"},
		{"nis"},
		{"sel"},
		{"skap"}
	}
}
-- list of unstressed words
-- NOTE(review): nothing in the visible part of this module reads this table
-- yet; assign_stress() does not consult it.
local unstressed = {
"die",
"dit",
"is",
"nie",
"'n"
}
-- list of stressed endings (mostly in loanwords)
-- Each entry is either a pattern string or a table whose extra fields qualify
-- the ending:
--   orig    etymology the ending is limited to (the check in assign_stress()
--           is currently commented out, so this field has no effect)
--   stress  "pre" puts the stress on the syllable before the ending
-- assign_stress() tries the entries in order, anchored at the end of the
-- word, and stops at the first match, so ORDER MATTERS.
-- NOTE(review): "ie" is listed twice. Also "ee?" already matches any word
-- ending in a single "e", so the later entries that end in "e" ("e", "ie",
-- "ine", "sie") can never be reached. Some of these strings may be truncated
-- patterns — confirm against the module's history.
local stressed_endings = {
"aa",
"aans?",
"aard?",
"ant",
"a",
"ee?",
"ein",
{"el", orig = "loan"}, -- only in loanwords
"ent",
"eu",
"e",
"ieel",
"ie",
"ine",
"ie",
{"o", orig = "fr"}, -- only in french loanwords
"oen",
"on",
"oo",
{"sie", stress = "pre"},
"teek",
"teit",
"uu",
"u",
"y?",
}
-- list of respelling substitutions
-- Each rule is {pattern, replacement, etymology}. An etymology of "-" marks
-- the default rule; any other code ("fr", "en", "af") limits the rule to
-- terms with that etymology. generate_subs() walks the list in order, so a
-- specific rule must come before the default rule for the same pattern.
-- The table is built when the module loads, so the class variables spliced
-- into the patterns (non_V, CV, S, C) must already be defined above.
-- NOTE(review): several patterns do not match their own comments and look
-- truncated: "ch(?)", the bare "s" under SH/SJ, "jie" for "-djie"/"-tjie",
-- "c()", and "n(‧?)". They are presumably missing a character class —
-- confirm against the module's history before enabling the substitution
-- stage in export.toIPA.
local subs = {
-- 'N
{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise
-- CH
{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
{"ch(?)", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
{"ch", "k", "-"}, -- otherwise /k/
-- NG
{"ng", "ŋ", "-"}, -- pronounced /ŋ/
-- SH/SJ
{"s", "ʃ", "-"}, -- pronounced /ʃ/
-- DJ/TJ
{"jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/
-- C
{"c()", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
{"c", "k", "-"}, -- otherwise /k/
-- GH
{"gh", "ɡ", "-"}, -- pronounced /ɡ/
-- G
{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
{"g", "χ", "-"}, -- otherwise /χ/
{"n(‧?)", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/
-- V
{"v", "f", "af"}, -- pronounced /f/ in native words
-- W
{"w", "w", "en"}, -- pronounced /w/ in english loans
{"w", "v", "-"}, -- otherwise /v/
-- EAU
{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans
-- OI
{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans
-- IJ
{"ij(" .. non_V .. ")", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names
-- X
{"#x", "#s", "-"}, -- pronounced /s/ word-initially
{"x", "ks", "-"}, -- otherwise /ks/
-- H
{"(" .. CV .. ")h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
{"h", "ɦ", "-"}, -- otherwise /ɦ/
-- O
{"o(" .. S .. ")", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position
-- U
{"u(" .. C .. ")", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
{"u", "jů", "en"}, -- otherwise /ju/ in english loans
-- Y
{"y", "j", "en"}, -- pronounced /j/ in english loans
{"y", "EI", "-"}, -- otherwise /əɪ̯/
-- circumflex accent
{circ, "ː", "-"} -- lengthens a vowel with its short quality
}
-- canonicalisation function
-- Normalises the raw input and returns it as an array of words.
local function canonicalise(text)
	-- work on the decomposed form so that accents are separate characters
	local cleaned = decomp(text)
	-- remove extraneous spaces: collapse whitespace runs, then trim both ends
	cleaned = rsub(cleaned, "%s+", " ")
	cleaned = rsub(rsub(cleaned, "^%s+", ""), "%s+$", "")
	-- a comma counts as a pause (its own "|" token), but only when it is
	-- followed by whitespace
	cleaned = rsub_repeatedly(cleaned, "%s?,%s", " | ")
	-- one array element per space-separated word
	return split(cleaned, " ")
end
-- syllabification function
-- Takes one word (decomposed, possibly carrying affix markers) and returns it
-- with syllable dots inserted. The rewrites below are order-dependent: the
-- multi-letter graphemes are wrapped in braces first so the later steps can
-- recognise them, and the braces are meant to be removed again at the end.
-- NOTE(review): several patterns in this function look truncated in this
-- copy of the file: "{*}" (presumably "a brace group"), the empty captures in
-- the four "()‧x" cluster rules (presumably a consonant class each), and the
-- empty pattern in the final clean-up (presumably the two brace characters).
-- As written they do not do what the adjacent comments describe — confirm
-- against the module's history.
local function syllabify(word)
-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenated form)
word = rsub(word, "(" .. V .. ")" .. dia, syll .. "%1")
-- mark trigraphs and digraphs with curly braces (longest graphemes first)
for _, graph in ipairs(graphemes_sorted) do
word = rsub(word, graph, "{" .. graph .. "}")
end
-- add dot before consonant + vowel
word = rsub(word, "(" .. C .. "?{?" .. V .. A .. "?)", syll .. "%1")
-- remove any dots inside brackets
word = rsub(word, "{*}", function(a) return rsub(a, syll, "") end)
-- shift dot before certain consonant clusters and digraphs
word = rsub(word, "()‧l", syll .. "%1l") -- clusters with l
word = rsub(word, "()‧r", syll .. "%1r") -- clusters with r
word = rsub(word, "()‧j", syll .. "%1j") -- digraphs with j
word = rsub(word, "()‧h", syll .. "%1h") -- digraphs with h
word = rsub(word, "n‧g", "ng‧") -- ng is syllable-final
-- remove leading dots and brackets
-- word = rsub(word, "(" .. S .. ")(" .. non_V .. "+)" .. syll, "%1" .. syll .. "%2")
-- word = rsub(word, "^(" .. non_V .. "*)" .. syll, syll .. "%1")
-- word = rsub(word, "%.", syll)
word = rsub(word, "", "") -- comment out to debug
return rsub(word, syll .. "+", syll) -- remove multiple syllable dots
end
-- hyphen depth check function
-- Returns the pattern fragment for a literal hyphen at depth 1 (where affixes
-- are written with an explicit hyphen) and an empty fragment at any other
-- depth.
local function is_hyphen_depth(depth)
	if depth == 1 then
		return "%-"
	end
	return ""
end
-- onset validation function
-- True when the (already syllabified) string begins with a syllable dot, or
-- with "s" followed by a syllable dot; false otherwise.
-- NOTE(review): the trailing `.. ""` on the second pattern suggests a
-- fragment (presumably a character class after the dot) is missing — confirm.
local function is_valid_onset(string)
	local onset_at = find(string, "^" .. syll) or find(string, "^s" .. syll .. "")
	return onset_at ~= nil
end
-- rest of string function
-- Returns what is left of `string` once an affix of the same length as
-- `affix` is cut off: from the front for affix_type "pre", from the back for
-- anything else. Only the length of `affix` is used; the function does not
-- check that the string actually carries the affix.
local function get_rest_string(string, affix, affix_type)
	local affix_length = len(affix)
	if affix_type ~= "pre" then
		return sub(string, 1, -(affix_length + 1))
	end
	return sub(string, affix_length + 1)
end
-- affix validation function
-- Decides whether `affix` may be split off `string`.
--   string      the word being analysed
--   affix       an entry of the affixes table ({text, restriction = ..., pos = ...});
--               a plain string is also accepted
--   affix_type  "pre" for prefixes, anything else is treated as a suffix
--   pos         part-of-speech string, searched with the entry's `pos` pattern
--   depth       component depth; at depth 1 the affix must be hyphenated
-- Returns a truthy value (the match position) when the affix is valid and
-- false otherwise.
local function is_valid_affix(string, affix, affix_type, pos, depth)
	-- affix entries are tables: the affix text is their first element
	local affix_text = type(affix) == "table" and affix[1] or affix
	local is_prefix = affix_type == "pre"

	-- cheapest test first: the word must actually carry the affix (followed or
	-- preceded by a hyphen at depth 1); only then run the costlier checks
	local hyphen = is_hyphen_depth(depth)
	local pattern = is_prefix and ("^" .. affix_text .. hyphen) or (hyphen .. affix_text .. "$")
	local found = find(string, pattern)
	if not found then
		return false
	end

	-- get rest of string
	local rest = get_rest_string(string, affix_text, affix_type)

	-- existing pos restriction not satisfied
	if affix.pos and not find(pos, affix.pos) then
		return false
	end

	if is_prefix then
		-- explicit restriction on what may follow the prefix
		if affix.restriction and not find(rest, affix.restriction) then
			return false
		end
		-- the remainder must start with a matching syllable onset
		if not is_valid_onset(syllabify(rest)) then
			return false
		end
		-- explicit word boundary right after the prefix
		if find(rest, "^%+") then
			return false
		end
		-- a remainder without any vowel cannot be a stem
		if not find(rest, V) then
			return false
		end
	end

	-- a remainder of two or fewer characters is too short to be a stem
	if find(rest, "^..?$") then
		return false
	end

	return found
end

-- affix application function
-- Marks at most one prefix (with ">") and at most one suffix (with "<") on
-- `string`, taking the first valid entry of each list. At depth 1 the
-- explicit hyphen next to the affix is replaced by the marker. Returns the
-- marked string.
local function apply_affixes(string, depth, pos)
	-- match hyphen at appropriate depth
	local hyphen = is_hyphen_depth(depth)

	-- process prefixes
	for _, affix in ipairs(affixes.pre) do
		if is_valid_affix(string, affix, "pre", pos, depth) then
			-- the entry is a table; its text is the first element
			local affix_text = affix[1]
			-- add prefix marker >
			string = rsub(string, "^" .. affix_text .. hyphen, affix_text .. ">")
			break
		end
	end

	-- process suffixes
	for _, affix in ipairs(affixes.suf) do
		if is_valid_affix(string, affix, "suf", pos, depth) then
			local affix_text = affix[1]
			-- add suffix marker <
			string = rsub(string, hyphen .. affix_text .. "$", "<" .. affix_text)
			break
		end
	end

	return string
end
-- stress assignment function (does not apply to depth zero)
-- Takes one syllabified component and returns it with stress accents added:
--   1. if the component ends in one of stressed_endings, an acute is placed
--      on that ending (or on the syllable before it for stress = "pre");
--   2. otherwise, if no acute is present, an acute is added after a ">" prefix marker;
--   3. otherwise an acute is added to the first syllable;
--   4. finally a grave is added after a "<" suffix marker.
-- `etyl` and `pos` are accepted but not used by the active code (the
-- etymology check is commented out).
-- NOTE(review): several things in this function look truncated in this copy
-- of the file and should be confirmed against the module's history:
--   * for table entries `pattern = ending` assigns the table itself, which
--     cannot be concatenated into the pattern below (presumably the entry's
--     first element was meant);
--   * the second capture "(*)" in both inner substitutions, and the ">*" /
--     "<*" at the start of the marker patterns, are presumably missing a
--     character class before the "*".
local function assign_stress(string, etyl, pos)
-- get string without syllables for pattern matching
local string_no_syll = decomp(rsub(string, syll, ""))
-- check for stressed endings (first match wins)
for _, ending in ipairs(stressed_endings) do
-- handle table entries with additional properties
local pattern
if type(ending) == "table" then
pattern = ending
-- FIXME - breaks stress
-- if ending.orig and ending.orig ~= etyl then
-- break
-- end
else
pattern = ending
end
-- find and stress the ending if matched in string
local ending_match = match(string_no_syll, "(" .. pattern .. ")$")
if ending_match then
-- escape special characters in ending_match for pattern matching
local escaped_ending = str_util.pattern_escape(ending_match)
-- find corresponding ending in original string
local before_ending, full_ending = match(string, "(.*)(" .. syll .. "?" .. escaped_ending .. ")$")
if full_ending then
-- add acute accent on final syllable before ending if pre-stressed
-- (for plain string entries `ending.stress` is simply nil)
if ending.stress == "pre" then
string = rsub(string, "(" .. before_ending .. ")", function(a)
return rsub(a, "(" .. non_V .. "*" .. V .. "+" .. A .. "*)(*)$", "%1" .. acute .. "%2")
end)
-- otherwise add acute accent on initial syllable of ending
else
string = rsub(string, "(" .. full_ending .. ")", function(a)
return rsub(a, "^(" .. non_V .. "*" .. V .. "+" .. A .. "*)(*)", "%1" .. acute .. "%2")
end)
end
end
-- break after successful match
break
end
end
-- check for > and add acute accent on the following syllable
if not find(string, acute) then
string = rsub(string, "(>*" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
return a .. acute
end)
end
-- otherwise add acute accent to first syllable if no stress marks present
-- (the trailing 1 limits rsubn to a single replacement; only its first
-- return value is kept by the assignment)
if not find(string, acute) then
string = rsubn(string, "(" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
return a .. acute
end, 1)
end
-- likewise check for < and add grave accent on the following syllable
string = rsub(string, "(<*" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
return a .. grave
end)
return string
end
-- component generation function
-- Processes every word of `words` through three nested levels and returns the
-- results joined by spaces, each word wrapped in "#" boundary markers:
--   depth 0  split on double hyphens ("--")
--   depth 1  mark hyphenated affixes, then split on single hyphens
--   depth 2  mark unhyphenated affixes, syllabify and (unless hide_stress is
--            set) assign stress
-- The `depth` parameter of this outer function is not used; every word starts
-- at depth 0.
local function to_components(words, etyl, pos, depth, hide_stress)
	local split_components

	-- split `word` on `separator`, process every part one level deeper and
	-- rejoin the parts with `joiner`; an unsplit word simply moves one level deeper
	local function descend(word, separator, joiner, etyl, pos, depth, hide_stress)
		local parts = split(word, separator)
		if #parts <= 1 then
			return split_components(word, etyl, pos, depth + 1, hide_stress)
		end
		local processed = {}
		for i, part in ipairs(parts) do
			processed[i] = split_components(part, etyl, pos, depth + 1, hide_stress)
		end
		return table.concat(processed, joiner)
	end

	-- parse a single component at the given depth
	split_components = function(word, etyl, pos, depth, hide_stress)
		-- handle initial calls with no depth or part of speech
		depth = depth or 0
		pos = pos or ".*"

		if depth == 0 then
			-- double hyphen compounds come first
			return descend(word, "%-%-", "--", etyl, pos, depth, hide_stress)
		elseif depth == 1 then
			-- hyphenated prefixes and suffixes are marked with > and <
			-- before the remaining single hyphens are split
			word = apply_affixes(word, depth, pos)
			return descend(word, "%-", "-", etyl, pos, depth, hide_stress)
		elseif depth == 2 then
			-- unhyphenated affixes, then syllabification, then stress
			word = syllabify(apply_affixes(word, depth, pos))
			if hide_stress then
				return word
			end
			return assign_stress(word, etyl, pos)
		end

		return word
	end

	-- loop over every word
	local results = {}
	for _, word in ipairs(words) do
		local w = split_components(word, etyl, pos, 0, hide_stress)
		-- remove leading initial syllable boundary after splitting components
		w = rsub(w, "^(" .. C .. "*)" .. syll, "%1")
		results[#results + 1] = "#" .. w .. "#"
	end

	-- join processed words
	return table.concat(results, " ")
end
-- generate substitutions function
-- Selects the respelling rules from `subs` that apply to a term and returns
-- them, in list order, as an array of {pattern, replacement} pairs.
--   term  accepted for interface compatibility, not used
--   etyl  etymology code of the term; rules tagged with this code are
--         included in addition to the default ("-") rules
--   pos   accepted for interface compatibility, not used
-- For any given pattern only the first applicable rule is kept, so an
-- etymology-specific rule listed before the default rule for the same
-- pattern overrides it.
local function generate_subs(term, etyl, pos)
	local to_sub = {}
	-- patterns already taken, keyed by pattern string
	local seen_patterns = {}

	for _, s in ipairs(subs) do
		-- each rule is {pattern, replacement, etymology}
		local s_patt, s_repl, s_etyl = s[1], s[2], s[3]

		-- only add if pattern wasn't added already
		if not seen_patterns[s_patt] then
			local is_default = s_etyl == "-"
			local is_etymology_specific = etyl ~= "-" and s_etyl == etyl
			if is_etymology_specific or is_default then
				to_sub[#to_sub + 1] = {s_patt, s_repl}
				seen_patterns[s_patt] = true
			end
		end
	end

	return to_sub
end
-- hyphenation function (FIXME: make it more dynamic depending on how it's inputted)
-- Canonicalises the text, splits it into syllabified components with stress
-- assignment switched off, and returns the result in composed (NFC) form.
-- The component markers are left in the returned string.
function export.hyphenation(text, etyl, pos)
	local hide_stress = true
	local term = to_components(canonicalise(text), etyl, pos, 0, hide_stress)
	return recomp(term)
end
-- stress assignment function
-- Canonicalises the text, splits it into syllabified components with stress
-- assignment switched on, and returns the result in composed (NFC) form.
-- The component markers are left in the returned string.
function export.stress(text, etyl, pos)
	local hide_stress = false
	local term = to_components(canonicalise(text), etyl, pos, 0, hide_stress)
	return recomp(term)
end
-- pronunciation function
-- Canonicalises the text, builds the stress-marked syllabified components,
-- lowercases them and turns the stress accents into IPA stress marks placed
-- in front of the accented stretch. Returns the resulting string.
-- NOTE: the stage that substitutes graphemes with phonemes is commented out
-- below, so the returned string is still based on the spelling.
function export.toIPA(text, etyl, pos)
-- canonicalise term as array of words
local words = canonicalise(text)
-- mark text with appropriate components
local term = to_components(words, etyl, pos, 0, false)
-- make text lowercase
term = lower(term)
-- convert accents to stress marks: the accent is dropped and the mark is
-- put before the run of non-boundary characters that precedes it
term = rsub(term, "(" .. non_S .. "*)(" .. AS .. ")", function(ns, s)
-- secondary stress if grave accent, primary stress otherwise
return (s == grave or s == auto_grave) and ("ˌ" .. ns) or ("ˈ" .. ns)
end)
-- NOTE(review): the disabled stage below is work in progress. Before it is
-- re-enabled: the result of rsub(term, k, v) is discarded, `k, v = s, s`
-- presumably lost its indexing, and several patterns and table keys are
-- empty in this copy of the file.
--[[
-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
local to_sub = generate_subs(term, etyl, pos)
-- go over substitution table
for _, s in ipairs(to_sub) do
local k, v = s, s
rsub(term, k, v)
end
-- make text lowercase again
term = lower(term)
-- substitute graphemes
for graph, phoneme in pairs(graphemes) do
term = rsub(term, graph, phoneme)
end
-- substitute single-letter vowels
term = rsub(term, "()()", function(a, b)
if match("", b) then
return sets.vowel_length .. b -- for open syllables
else
return sets.vowel_length .. b -- for closed syllables
end
end)
-- replace į, ů, ü with their actual phonetic values
term = rsub(term, "", { = "i", = "u", = "y"})
-- remove double consonants
term = rsub(term, "(.)(‧?)%1", "%2%1")
]]--
-- final adjustments
-- term = rsub(term, "‧", ".")
term = rsub(term, "#" .. syll, "#") -- re-remove syllable boundaries after analysing components
return term
-- return rsub(term, "]", "")
end
-- main export function
-- Entry point for {{#invoke:}}. Every non-empty positional argument is a term
-- to transcribe; the named arguments `etyl` and `pos` apply to all of them.
-- When no non-empty positional argument is given, the current page title is
-- transcribed instead. Returns the formatted IPA line followed by a bulleted
-- syllabification line.
function export.show(frame)
	-- frame.args is always a table, so "no arguments" has to be detected by
	-- counting the usable positional arguments, not by testing the table
	local args = frame.args or {}

	-- etymology and part of speech are the same for every term
	local etyl, pos = args.etyl, args.pos

	-- collect the terms, skipping empty arguments
	local terms = {}
	for _, value in ipairs(args) do
		if value ~= "" then
			terms[#terms + 1] = value
		end
	end

	-- fall back to the page title when nothing usable was passed
	if #terms == 0 then
		terms[1] = mw.title.getCurrentTitle().text
	end

	-- initialise output containers
	local ipa_items = {}
	local hyph_items = {
		lang = lang,
		sc = sc,
		hyphs = {},
		caption = "Syllabification"
	}

	for _, term in ipairs(terms) do
		-- get hyphenation and transcription
		local syllables = export.hyphenation(term, etyl, pos)
		local pron = export.toIPA(term, etyl, pos)
		-- add to results
		table.insert(ipa_items, {pron = "/" .. pron .. "/"})
		table.insert(hyph_items.hyphs, {hyph = split(syllables, syll)})
	end

	-- format final output
	return ipa.format_IPA_full{ lang = lang, items = ipa_items } .. "\n* " .. hyph.format_hyphenations(hyph_items)
end
return export