--[[
Author: AmazingJus
This module automatically generates the IPA transcription of Afrikaans terms
based on their grapheme structure, syllabification, affixation and etymology.
It is based on the phonological rules of Afrikaans and the grapheme-phoneme
correspondence of the language. The module processes the input text in several
steps:
1. It canonicalises the input text, decomposing accents and removing extraneous spaces.
2. It syllabifies the words, marking syllable boundaries and handling digraphs and trigraphs.
3. It applies affixes, marking prefixes and suffixes with special characters.
4. It assigns stress to syllables based on predefined rules.
5. It generates the IPA transcription by substituting graphemes with their phonetic equivalents.
Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]
local export = {}
local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str = require("Module:string")
local tbl = require("Module:table")
local ipa = require("Module:IPA") -- IPA display module
-- Tag `text` as Afrikaans in Latin script through Module:script utilities.
-- `face` is handed to tag_text() unchanged.
function export.tag_text(text, face)
    local script_utilities = require("Module:script utilities")
    return script_utilities.tag_text(text, lang, sc, face)
end
-- Build a link for `term` through Module:links, using the module-level
-- language and script objects. `face` is forwarded as the second argument
-- of full_link().
function export.link(term, face)
    local link_data = { term = term, lang = lang, sc = sc }
    return require("Module:links").full_link(link_data, face)
end
-- code point -> character helper; used below to build the accent constants
local u = require("Module:string/char")
-- short local aliases for the mw.ustring / mw.text library functions used
-- throughout this module
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local gsplit = mw.text.gsplit
local sub = mw.ustring.sub
-- rsubn is the raw gsub (returns the new string AND the substitution count);
-- see rsub() below for the single-result wrapper
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch
-- rsubn() wrapper that hands back only the substituted string; the
-- substitution count is dropped
local function rsub(term, foo, bar)
    -- the extra parentheses truncate the call to its first result
    return (rsubn(term, foo, bar))
end
-- keep applying the same rsub() substitution until the string stops
-- changing, then return that fixed point
local function rsub_repeatedly(term, foo, bar)
    local previous
    repeat
        previous = term
        term = rsub(previous, foo, bar)
    until term == previous
    return term
end
-- list of accents (combining marks)
local grave = u(0x0300) -- grave
local acute = u(0x0301) -- acute
local circ = u(0x0302) -- circumflex
local dia = u(0x0308) -- diaeresis
local syll = "‧" -- syllable dot
-- for automatically generated stress
local auto_grave = u(0xFFF0) -- automatic grave
local auto_acute = u(0xFFF1) -- automatic acute
-- list of char classes (raw members only, without the surrounding brackets)
local accent = grave .. acute .. auto_grave .. auto_acute .. circ .. dia
local stress_accent = grave .. acute .. auto_grave .. auto_acute
local vowel = "aeiouyAEIOUY"
-- NOTE(review): the upper-case half of this list has no "R" - confirm whether
-- that is deliberate
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQSTVWXZ"
-- "+" and "." are %-escaped so they stay literal inside a pattern set
local bound = syll .. "#<>%+%."
-- put them into classes
-- These must be bracketed pattern sets: an empty string here would turn e.g.
-- "(" .. V .. "+)" into "(+)", which matches a literal "+" instead of a run
-- of vowels. The sets contain non-ASCII characters, so they are meant for the
-- mw.ustring-based helpers, not the byte-oriented string library.
local A = "[" .. accent .. "]" -- all accents
local V = "[" .. vowel .. "]" -- all vowels
local non_V = "[^" .. vowel .. "]" -- all non-vowels
local C = "[" .. cons .. "]" -- all consonants
local non_C = "[^" .. cons .. "]" -- all non-consonants
local CV = "[" .. cons .. vowel .. "]" -- all consonants and vowels
local S = "[" .. bound .. "]" -- any syllable boundary
-- list of valid trigraphs and digraphs, including diphthongs and long vowels
-- Intended shape: spelling -> phoneme string. The keys are consumed by the
-- graphemes_sorted loop below (to mark multi-letter graphemes) and the values
-- by the currently commented-out substitution pass in toIPA().
-- NOTE(review): as shown here every entry has lost its key (each line starts
-- with a bare "="), which is not valid Lua. The spellings must be restored
-- from the upstream module; they cannot be derived reliably from the values
-- alone because several values repeat.
local graphemes = {
= "ɑːɪ̯",
= "iʊ̯",
= "iʊ̯",
= "uɪ̯",
= "oːɪ̯",
= "ɑː",
= "ɑː",
= "aɪ̯",
= "œʊ̯",
= "ɪə̯",
= "əɪ̯",
= "iʊ̯",
= "į", -- temporary value
= "ů", -- temporary value
= "ɔɪ̯",
= "ʊə̯",
= "œʊ̯",
= "uɪ̯",
= "ü" -- temporary value
}
-- sort trigraphs and digraphs in descending order of length, so that
-- syllabify() marks the longer graphemes before the shorter ones
local graphemes_sorted = {}
for grapheme in pairs(graphemes) do
    graphemes_sorted[#graphemes_sorted + 1] = grapheme
end
table.sort(graphemes_sorted, function(a, b)
    local len_a, len_b = len(a), len(b)
    if len_a ~= len_b then
        return len_a > len_b
    end
    -- pairs() visits keys in an unspecified order and table.sort is not
    -- stable, so break ties explicitly to make the order reproducible
    return a < b
end)
-- list of various grapheme sets
-- The first group pairs two vowel qualities per entry and the second group
-- pairs a voiced consonant with its voiceless counterpart.
-- NOTE(review): as shown here the group names and the per-vowel keys are
-- missing (lines starting with a bare "="), which is not valid Lua. The only
-- reference to this table visible in this file is `sets.vowel_length`, inside
-- the commented-out section of toIPA(); the name of the second group and the
-- inner keys need to be confirmed against the upstream module.
local sets = {
= { -- long-short vowels
= {"a", "ɑː"},
= {"ɛ", "ɪə̯"},
= {"ə", "i"},
= {"ɔ", "ʊə̯"},
= {"œ", "y"}
},
= { -- voiced/voiceless consonants
{"b", "p"},
{"d", "t"},
{"ʤ", "ʧ"},
{"ɡ", "k"},
{"v", "f"},
{"z", "s"},
{"ʒ", "ʃ"},
}
}
-- list of defined affixes
-- Each entry is a table whose first element is the affix spelling. Optional
-- fields read by is_valid_affix():
--   restriction  pattern the remainder of the word must match (prefixes only)
--   pos          pattern that must be found in the part-of-speech string
-- The sub-tables are keyed `pre` and `suf` because apply_affixes() iterates
-- over affixes.pre and affixes.suf.
local affixes = {
    -- prefixes
    pre = {
        {"aan"},
        {"agter"},
        -- NOTE(review): "^" matches any remainder, so this restriction never
        -- rejects anything as written - confirm the intended pattern
        {"be", restriction = "^"},
        {"deur"},
        {"er"},
        {"ge", restriction = "^"}, -- NOTE(review): same as "be" above
        {"her"},
        {"om"},
        {"ont"},
        {"onder"},
        {"uit"},
        {"van", pos = "d"},
        {"ver"},
        {"voor"}
    },
    -- suffixes
    suf = {
        {"agtig"},
        {"baar"},
        {"dom"},
        {"end"},
        {"heid"},
        {"lik"},
        {"loos"},
        {"nis"},
        {"sel"},
        {"skap"}
    }
}
-- words that are listed as unstressed
local unstressed = { "die", "dit", "is", "nie", "'n" }
-- list of stressed endings (mostly in loanwords)
-- Each entry is either a pattern string or a table whose first element is the
-- pattern, with optional fields read by assign_stress():
--   orig    the ending only applies when the word's etymology equals this
--   stress  "pre" puts the stress on the vowel before the ending
-- assign_stress() walks this list in order and anchors every pattern at the
-- end of the word, so the order of the entries matters.
-- NOTE(review): "ie" is listed twice below, and some of the very short
-- entries ("a", "e", "u") may have lost part of their pattern - confirm
-- against the upstream module before de-duplicating or reordering.
local stressed_endings = {
"aa",
"aans?",
"aard?",
"ant",
"a",
"ee?",
"ein",
{"el", orig = "loan"}, -- only in loanwords
"ent",
"eu",
"e",
"ieel",
"ie",
"ine",
"ie",
{"o", orig = "fr"}, -- only in french loanwords
"oen",
"on",
"oo",
{"sie", stress = "pre"},
"teek",
"teit",
"uu",
"u",
"y?",
}
-- list of respelling substitutions
-- Each entry is {pattern, replacement, etymology code}. generate_subs()
-- treats the code "-" as a default rule and any other code as a rule that
-- only applies to words of that etymology. Upper-case letters in replacements
-- (e.g. "OU", "EI", "A") and "į"/"ů" are intermediate symbols, not final IPA.
-- The patterns are assembled at load time from the class variables non_V, CV,
-- S and C defined above.
-- NOTE(review): several patterns below contain an empty capture or look
-- shorter than their comment implies ("ch(?)", "c()", "n(‧?)", and the plain
-- "s" / "jie" rules under the SH/SJ and DJ/TJ headings). They look as if a
-- character set is missing - confirm against the upstream module before
-- enabling the substitution pass in toIPA().
local subs = {
-- 'N
{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise
-- CH
{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
{"ch(?)", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
{"ch", "k", "-"}, -- otherwise /k/
-- NG
{"ng", "ŋ", "-"}, -- pronounced /ŋ/
-- SH/SJ
{"s", "ʃ", "-"}, -- pronounced /ʃ/
-- DJ/TJ
{"jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/
-- C
{"c()", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
{"c", "k", "-"}, -- otherwise /k/
-- GH
{"gh", "ɡ", "-"}, -- pronounced /ɡ/
-- G
{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
{"g", "χ", "-"}, -- otherwise /χ/
{"n(‧?)", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/
-- V
{"v", "f", "af"}, -- pronounced /f/ in native words
-- W
{"w", "w", "en"}, -- pronounced /w/ in english loans
{"w", "v", "-"}, -- otherwise /v/
-- EAU
{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans
-- OI
{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans
-- IJ
{"ij(" .. non_V .. ")", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names
-- X
{"#x", "#s", "-"}, -- pronounced /s/ word-initially
{"x", "ks", "-"}, -- otherwise /ks/
-- H
{"(" .. CV .. ")h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
{"h", "ɦ", "-"}, -- otherwise /ɦ/
-- O
{"o(" .. S .. ")", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position
-- U
{"u(" .. C .. ")", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
{"u", "jů", "en"}, -- otherwise /ju/ in english loans
-- Y
{"y", "j", "en"}, -- pronounced /j/ in english loans
{"y", "EI", "-"}, -- otherwise /əɪ̯/
-- circumflex accent
{circ, "ː", "-"} -- lengthens a vowel with its short quality
}
-- canonicalisation function
-- Decomposes accents, lower-cases the text, turns every comma into a
-- stand-alone "|" pause token and normalises whitespace.
-- Returns the text as an array of words (split on single spaces).
local function canonicalise(text)
    -- decompose accents
    text = decomp(text)
    -- make text lowercase
    text = lower(text)
    -- treat commas as a pause; one pass is enough because the replacement
    -- contains no comma. This runs BEFORE the whitespace clean-up: the
    -- replacement adds spaces of its own, and doing it afterwards left
    -- leading/trailing/double spaces (and therefore empty words) behind for
    -- input such as "a," or "a,,b".
    text = rsub(text, "%s*,%s*", " | ")
    -- remove extraneous spaces
    text = rsub(text, "%s+", " ")
    text = rsub(text, "^ ", "")
    text = rsub(text, " $", "")
    -- return as array of words
    return split(text, " ")
end
-- syllabification function
-- Inserts the syllable dot (`syll`) into `word` through a fixed sequence of
-- pattern substitutions and returns the result. The steps depend on each
-- other's output, so their order matters. `etyl` and `pos` are accepted but
-- not used in the body.
-- NOTE(review): several patterns below look incomplete as shown here ("{*}",
-- the empty "()" captures in the cluster rules, and the final
-- rsub(word, "", "")); they read as if a character set has been lost.
-- Confirm them against the upstream module before relying on the output.
local function syllabify(word, etyl, pos)
-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenated form)
word = rsub(word, "(" .. V .. ")" .. dia, syll .. "%1")
-- mark trigraphs and digraphs with curly braces
-- (graphemes_sorted is ordered longest-first)
for _, graph in ipairs(graphemes_sorted) do
word = rsub(word, graph, "{" .. graph .. "}")
end
-- add dot before consonant + vowel
word = rsub(word, "(" .. C .. "?{?" .. V .. A .. "?)", syll .. "%1")
-- remove any dots inside brackets
-- NOTE(review): "{*}" matches a run of "{" followed by "}", not a whole
-- braced group - presumably something like "anything but }" belongs between
-- the braces
word = rsub(word, "{*}", function(a) return rsub(a, syll, "") end)
-- shift dot before certain consonant clusters and digraphs
-- NOTE(review): "()" is an empty position capture here, so %1 is an offset
-- rather than a consonant - presumably a consonant set is missing
word = rsub(word, "()‧l", syll .. "%1l") -- clusters with l
word = rsub(word, "()‧r", syll .. "%1r") -- clusters with r
word = rsub(word, "()‧j", syll .. "%1j") -- digraphs with j
word = rsub(word, "()‧h", syll .. "%1h") -- digraphs with h
word = rsub(word, "n‧g", "ng‧") -- ng is syllable-final
-- remove leading dots and brackets
word = rsub(word, "(" .. S .. ")(" .. non_V .. "+)" .. syll, "%1" .. syll .. "%2")
-- word = rsub(word, "%.", syll)
-- NOTE(review): with an empty pattern this substitution changes nothing;
-- judging by the comment above it was meant to strip the curly braces
word = rsub(word, "", "") -- comment out to debug
-- collapse runs of syllable dots into a single dot
return rsub(word, syll .. "+", syll)
end
-- hyphen depth check function
-- Returns the pattern for a literal hyphen ("%-") at depth 1 and an empty
-- pattern at every other depth.
local function is_hyphen_depth(depth)
    if depth == 1 then
        return "%-"
    end
    return ""
end
-- onset validation function
-- Returns true when `string` starts with the syllable dot, or with "s"
-- followed by the syllable dot; false otherwise.
-- NOTE(review): both patterns look shorter than the "matching syllable
-- onset" wording suggests (note the trailing `.. ""`); confirm whether a
-- consonant set is missing.
local function is_valid_onset(string)
    local starts_at_boundary = find(string, "^" .. syll) ~= nil
    if starts_at_boundary then
        return true
    end
    -- onsets starting with s
    return find(string, "^s" .. syll .. "") ~= nil
end
-- rest of string function
-- Returns `string` with as many characters as `affix` is long cut off: from
-- the front when affix_type is "pre", from the back otherwise. It does not
-- check that the removed part really equals the affix.
local function get_rest_string(string, affix, affix_type)
    local affix_length = len(affix)
    if affix_type ~= "pre" then
        return sub(string, 1, -affix_length - 1)
    end
    return sub(string, affix_length + 1)
end
-- affix validation function
-- Decides whether `affix` (one entry of the `affixes` table: affix[1] is the
-- spelling, with optional `pos` and `restriction` pattern fields) may be
-- split off `string`.
--   affix_type  "pre" for prefixes, anything else is treated as a suffix
--   pos         part-of-speech string that affix.pos is searched in
--   depth       component depth; at depth 1 the affix must be set off by a
--               hyphen
-- Returns true or false.
local function is_valid_affix(string, affix, affix_type, pos, depth)
    -- entries are tables, so the spelling has to be taken from affix[1];
    -- concatenating or measuring the entry itself raises an error
    local affix_text = affix[1]
    local is_prefix = affix_type == "pre"
    -- match hyphen at appropriate depth
    local hyphen = is_hyphen_depth(depth)
    -- match appropriate pattern; done first so the costlier checks below
    -- (which syllabify the remainder) only run for affixes that are present
    local pattern = is_prefix and "^" .. affix_text .. hyphen or hyphen .. affix_text .. "$"
    if not find(string, pattern) then
        return false
    end
    -- get rest of string
    local rest = get_rest_string(string, affix_text, affix_type)
    -- check for existing pos restriction
    if affix.pos and not find(pos, affix.pos) then
        return false
    end
    if is_prefix then
        -- then for explicit non-boundaries
        if affix.restriction and not find(rest, affix.restriction) then
            return false
        end
        -- then for matching syllable onset
        if not is_valid_onset(syllabify(rest)) then
            return false
        end
        -- then for explicit word boundary
        if find(rest, "^%+") then
            return false
        end
        -- then for no vowels
        if not find(rest, V) then
            return false
        end
    end
    -- then only for two or less chars
    if find(rest, "^..?$") then
        return false
    end
    return true
end
-- affix application function
-- Marks at most one prefix (with ">") and at most one suffix (with "<") on
-- `string`; the first valid entry of each list wins. At depth 1 the marker
-- replaces the hyphen that separated the affix. Returns the marked string.
local function apply_affixes(string, depth, pos)
    -- match hyphen at appropriate depth
    local hyphen = is_hyphen_depth(depth)
    -- process prefixes
    for _, affix in ipairs(affixes.pre) do
        if is_valid_affix(string, affix, "pre", pos, depth) then
            -- add prefix marker >
            string = rsub(string, "^" .. affix[1] .. hyphen, affix[1] .. ">")
            break
        end
    end
    -- process suffixes
    for _, affix in ipairs(affixes.suf) do
        if is_valid_affix(string, affix, "suf", pos, depth) then
            -- add suffix marker <
            string = rsub(string, hyphen .. affix[1] .. "$", "<" .. affix[1])
            break
        end
    end
    return string
end
-- stress assignment function (does not apply to depth zero)
-- Adds stress accents to `string`:
--   * for every entry of `stressed_endings` whose pattern matches the end of
--     the string, an acute accent after the first vowel run of the ending
--     (or, for entries with stress = "pre", after the vowel run before it);
--   * an acute accent after the first vowel run following a ">" marker;
--   * a grave accent after the first vowel run following a "<" marker.
-- `etyl` is compared with the `orig` field of table entries; `pos` is not
-- used. Returns the accented string.
local function assign_stress(string, etyl, pos)
    -- check for stressed endings
    for _, ending in ipairs(stressed_endings) do
        -- handle table entries with additional properties
        local pattern, origin
        local pre_stressed = false
        if type(ending) == "table" then
            -- the pattern is the first element; the entry itself is a table
            -- and cannot be concatenated into a pattern
            pattern = ending[1]
            origin = ending.orig
            pre_stressed = ending.stress == "pre"
        else
            pattern = ending
        end
        -- an ending restricted to another etymology is skipped; this must not
        -- leave the loop, or every ending listed after it would be ignored
        if origin == nil or origin == etyl then
            -- find and stress the ending if matched
            local ending_match = match(string, "(" .. pattern .. ")$")
            if ending_match then
                if pre_stressed then
                    -- add acute accent on the vowel in final syllable before ending
                    string = rsub(string, "(" .. V .. "+)(" .. non_V .. "*" .. ending_match .. ")$", "%1" .. acute .. "%2")
                else
                    -- otherwise add acute accent on the vowel in initial syllable of ending
                    -- (only the first result of rsubn, the new string, is kept)
                    local stressed_ending = rsubn(ending_match, "(" .. V .. "+)", "%1" .. acute, 1)
                    string = rsub(string, ending_match .. "$", stressed_ending)
                end
            end
        end
    end
    -- check for > and add acute accent on the following vowel
    string = rsub(string, "(>" .. non_V .. "*" .. V .. "+)", "%1" .. acute)
    -- likewise check for < and add grave accent on the following vowel
    string = rsub(string, "(<" .. non_V .. "*" .. V .. "+)", "%1" .. grave)
    return string
end
-- components parsing function
-- Recursively breaks `word` into components and processes each one:
--   depth 0  split on double hyphens
--   depth 1  mark hyphenated affixes, then split on single hyphens
--   depth 2  mark unhyphenated affixes, syllabify and assign stress
-- The processed parts are joined again with the separator they were split
-- on. `depth` defaults to 0 and `pos` to ".*"; any other depth returns the
-- word untouched.
local function split_components(word, depth, etyl, pos)
    depth = depth or 0
    pos = pos or ".*"

    -- innermost level: this is where the actual processing happens
    if depth == 2 then
        word = apply_affixes(word, depth, pos)
        word = syllabify(word, etyl, pos)
        word = assign_stress(word, etyl, pos)
        return word
    end

    local separator, separator_pattern
    if depth == 0 then
        separator, separator_pattern = "--", "%-%-"
    elseif depth == 1 then
        -- explicitly hyphenated affixes are turned into < and > before the
        -- remaining hyphens are used as split points
        word = apply_affixes(word, depth, pos)
        separator, separator_pattern = "-", "%-"
    else
        return word
    end

    -- a word without the separator comes back as a single piece, so one
    -- code path covers both the compound and the simple case
    local pieces = split(word, separator_pattern)
    for index, piece in ipairs(pieces) do
        pieces[index] = split_components(piece, depth + 1, etyl, pos)
    end
    return table.concat(pieces, separator)
end
-- component generation function
-- Runs every word through split_components() (starting at depth 0), wraps
-- each result in "#" word-boundary markers and joins them with spaces.
local function to_components(words, etyl, pos)
    local marked = {}
    for _, word in ipairs(words) do
        marked[#marked + 1] = "#" .. split_components(word, 0, etyl, pos) .. "#"
    end
    return table.concat(marked, " ")
end
-- generate substitutions function
-- Selects the rules from `subs` that apply to a word of etymology `etyl`.
-- Each rule is {pattern, replacement, etymology code}; a rule applies when
-- its code equals `etyl` or is the default code "-". Rules are visited in
-- list order and only the first applicable rule per pattern is kept, so an
-- etymology-specific rule listed before the default rule for the same
-- pattern takes precedence.
-- `term` and `pos` are accepted for interface compatibility but not used.
-- Returns an array of {pattern, replacement} pairs in list order.
local function generate_subs(term, etyl, pos)
    local to_sub = {}
    local seen_patterns = {}
    for _, s in ipairs(subs) do
        -- unpack the rule by position (assigning `s` itself to all three
        -- would make every comparison below fail)
        local s_patt, s_repl, s_etyl = s[1], s[2], s[3]
        -- only add if pattern wasn't added already; the set is keyed by
        -- pattern so that one match does not block every later rule
        if not seen_patterns[s_patt] and (s_etyl == "-" or s_etyl == etyl) then
            to_sub[#to_sub + 1] = {s_patt, s_repl}
            seen_patterns[s_patt] = true
        end
    end
    return to_sub
end
-- hyphenation function (FIXME: make it more dynamic depending on how it's inputted)
-- Canonicalises `text`, marks its components (affixes, syllable dots,
-- stress), drops a syllable dot that directly follows a "#" word marker and
-- returns the result recomposed to NFC.
function export.hyphenation(text, etyl, pos)
    -- canonicalise into words and mark them with the appropriate components
    local marked = to_components(canonicalise(text), etyl, pos)
    -- format hyphenation
    -- local data = { lang = lang, sc = sc, hyphs = {{hyph = rsub(syllabify(term), "<>]", ""), "%.")}} }
    -- return hyphen.format_hyphenations(data)
    -- re-remove syllable boundaries after analysing components
    local cleaned = rsub(marked, "#" .. syll, "#")
    return recomp(cleaned)
    -- return rsub(recomp(syllabify(term)), "<>]", "")
end
-- pronunciation function
-- Canonicalises `text`, marks its components through to_components() and
-- removes a syllable dot that directly follows a "#" word marker.
-- `etyl` and `pos` are passed through to to_components().
-- NOTE: the grapheme-to-phoneme pass is disabled (it sits inside the block
-- comment below), so the returned string is still the marked-up spelling,
-- not an IPA transcription.
function export.toIPA(text, etyl, pos)
-- canonicalise term as array of words
local words = canonicalise(text)
-- mark text with appropriate components
local term = to_components(words, etyl, pos)
-- shift stress rightwards to a syllable boundary
-- term = rsub(term, "(*)ˈ", "ˈ%1")
--[[
-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
local to_sub = generate_subs(term, etyl, pos)
-- go over substitution table
for _, s in ipairs(to_sub) do
local k, v = s, s
rsub(term, k, v)
end
-- make text lowercase again
term = lower(term)
-- substitute graphemes
for graph, phoneme in pairs(graphemes) do
term = rsub(term, graph, phoneme)
end
-- substitute single-letter vowels
term = rsub(term, "()()", function(a, b)
if match("", b) then
return sets.vowel_length .. b -- for open syllables
else
return sets.vowel_length .. b -- for closed syllables
end
end)
-- replace į, ů, ü with their actual phonetic values
term = rsub(term, "", { = "i", = "u", = "y"})
-- remove double consonants
term = rsub(term, "(.)(‧?)%1", "%2%1")
]]--
-- NOTE(review): before re-enabling the draft above, check that (a) the result
-- of rsub(term, k, v) is assigned back to `term`, (b) k and v are taken from
-- the two elements of each pair returned by generate_subs(), and (c) the
-- empty patterns and missing table keys are restored.
-- final adjustments
-- term = rsub(term, "‧", ".")
term = rsub(term, "#" .. syll, "#") -- re-remove syllable boundaries after analysing components
return term
-- return rsub(term, "]", "")
end
-- main export function
-- Template entry point. Every non-empty positional argument is a term to
-- transcribe; when none is given, the current page title is used. The named
-- arguments `etyl` and `pos` are passed to toIPA() for every term.
-- Returns the output of ipa.format_IPA_full for the collected items.
function export.show(frame)
    -- get arguments
    local args = frame.args or {}
    -- collect the non-empty positional arguments
    local terms = {}
    for _, v in ipairs(args) do
        if v ~= "" then
            terms[#terms + 1] = v
        end
    end
    -- frame.args is always a table, so testing it for nil never reaches the
    -- page-title default; fall back when no term was actually supplied
    if #terms == 0 then
        terms[1] = mw.title.getCurrentTitle().text
    end
    -- get etymology and part of speech (the same for every term)
    local etyl = args.etyl
    local pos = args.pos
    local results = {}
    for _, term in ipairs(terms) do
        -- get IPA transcription
        local pron = export.toIPA(term, etyl, pos)
        -- add to results
        results[#results + 1] = { pron = "/" .. pron .. "/" }
    end
    return ipa.format_IPA_full { lang = lang, items = results }
end
return export