Module:User:AmazingJus/af

The following documentation is located at Module:User:AmazingJus/af/documentation. Categories were auto-generated by Module:documentation.

Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox

IPA^(key): /#ˈa‧fri‧ka‧ner#/, /#ˈa‧fri‧ka‧ners#/
Syllabification: #A‧fri‧ka‧ner#, #A‧fri‧ka‧ners#

249 of 249 tests failed. (refresh)

test_hyphen:
Text	Expected	Actual
gegee--gegee-gegee--gegee	ge>gee--ge>gee-ge>gee--ge>gee	#ge>‧gee--‧ge>‧gee-‧ge>‧gee--‧ge>‧gee#
gegee-gegee--gegee-gegee	ge>gee-ge>gee--ge>gee-ge>gee	#ge>‧gee-‧ge>‧gee--‧ge>‧gee-‧ge>‧gee#
aanleerders--woorde-boek	aan‧leer‧ders‧woor‧de‧boek	#aan>‧leer‧ders--‧woor‧de-‧boek#
Afrika	A‧fri‧ka	#A‧fri‧ka#
Afrikaans	A‧fri‧kaans	#A‧fri‧kaans#
Afrikaner	A‧fri‧ka‧ner	#A‧fri‧ka‧ner#
Amerikaner	A‧me‧ri‧ka‧ner	#A‧me‧ri‧ka‧ner#
asyn	a‧syn	#a‧syn#
belangrik	be‧lang‧rik	#be>‧lang‧rik#
berg	berg	#berg#
berge	ber‧ge	#ber‧ge#
berg-reeks	berg‧reeks	#berg-‧reeks#
bos-bedryf	bos‧be‧dryf	#bos-‧be>‧dryf#
beskou	be‧skou	#be>s‧kou#
be+ter	be‧ter	#be+‧ter#
beton	be‧ton	#be>‧ton#
betoon	be‧toon	#be>‧toon#
Botha	Bo‧tha	#Bo‧tha#
braai	braai	#braai#
dokumentasie	do‧ku‧men‧ta‧sie	#do‧ku‧men‧ta‧sie#
eiendoms-belasting	ei‧en‧doms‧be‧las‧ting	#ei‧en‧doms-‧be>‧las‧ting#
eggo	eg‧go	#eg‧go#
feste	fes‧te	#fes‧te#
geëet	ge‧eet	#ge‧eet#
gegee	ge‧gee	#ge>‧gee#
gesigs-uitdrukking	ge‧sigs‧uit‧druk‧king	#ge>‧sigs-‧uit>‧druk‧king#
ghitaar	ghi‧taar	#ghi‧taar#
hondjie	hon‧djie	#hon‧djie#
hoog--geregs-hof	hoog‧ge‧regs‧hof	#hoog--‧ge>‧regs-‧hof#
Johannesburg	Jo‧han‧nes‧burg	#Jo‧han‧nes‧burg#
karretjie	kar‧re‧tjie	#kar‧re‧tjie#
klu	klub	#klu#
Macedonië	Ma‧ce‧do‧ni‧e	#Ma‧ce‧do‧ni‧e#
moeder-taal--spreker	moe‧der‧taal‧spre‧ker	#moe‧der-‧taal--s‧pre‧ker#
'n	'n	#'n#
omstandigheid	om‧stan‧dig‧heid	#om>s‧tan‧dig<‧heid#
onweer	on‧weer	#on‧weer#
ondergaan	on‧der‧gaan	#on‧der>‧gaan#
Paraguay	Pa‧ra‧guay	#Pa‧ra‧gu‧a‧y#
Pretoria	Pre‧to‧ri‧a	#Pre‧to‧ri‧a#
sjokolade	sjo‧ko‧la‧de	#sjo‧ko‧la‧de#
s'n	s'n	#s'n#
spieël	spie‧el	#spie‧el#
Suid-Afrika	Suid-‧A‧fri‧ka	#Suid-‧A‧fri‧ka#
vanaand	va‧naand	#va‧naand#
Venesië	Ve‧ne‧si‧e	#Ve‧ne‧si‧e#
vinger	ving‧er	#ving‧er#
wîe	wî‧e	#wî‧e#
zero	ze‧ro	#ze‧ro#
André	An‧dré	#An‧dré#
Barnard	Bar‧nard	#Bar‧nard#
Blignaut	Blig‧naut	#Blig‧naut#
Blignault	Blig‧nault	#Blig‧nault#
Cilliers	Cil‧liers	#Cil‧liers#
Coetzee	Coet‧zee	#Coet‧zee#
Coetzer	Coet‧zer	#Coet‧zer#
de Villiers	de Vil‧liers	#de# #Vil‧liers#
du Plessis	du Ples‧sis	#du# #Ples‧sis#
du Preez	du Preez	#du# #Preez#
du Toit	du Toit	#du# #Toit#
Fouché	Fou‧ché	#Fou‧ché#
Fourie	Fou‧rie	#Fou‧rie#
Grové	Gro‧vé	#Gro‧vé#
Jean Pierre	Jean Pierre	#Je‧an# #Pier‧re#
Joubert	Jou‧bert	#Jou‧bert#
La.bus.chag.ne	La‧bus‧chag‧ne	#La.‧bus.‧chag.‧ne#
La.bu.schagne	La‧bu‧schagne	#La.‧bu.s‧chag‧ne#
Labu.schagnė	La‧bu‧schagne	#La‧bu.s‧chag‧nė#
le Gran.ge	le Gran‧ge	#le# #Gran.‧ge#
le Roux	le Roux	#le# #Roux#
Malan	Ma‧lan	#Ma‧lan#
Malherbe	Mal‧her‧be	#Mal‧her‧be#
Marais	Ma‧rais	#Ma‧rais#
Meintjes	Mein‧tjes	#Mein‧tjes#
Naudé	Nau‧dé	#Nau‧dé#
Nortje	Nor‧tje	#Nor‧tje#
Pienaar	Pie‧naar	#Pie‧naar#
Schalk	Schalk	#Schalk#
Terblanche	Ter‧blanche	#Ter‧blan‧che#
Theron	The‧ron	#The‧ron#
Viljoen	Vil‧joen	#Vil‧joen#
Visagie	Vi‧sa‧gie	#Vi‧sa‧gie#
Viviers	Vi‧vi‧ers	#Vi‧viers#

test_hyphen_stress:
Text	Expected	Actual
gegee--gegee-gegee--gegee		#ge>‧geé--‧ge>‧geé-‧ge>‧geé--‧ge>‧geé#
gegee-gegee--gegee-gegee		#ge>‧geé-‧ge>‧geé--‧ge>‧geé-‧ge>‧geé#
aanleerders--woorde-boek		#aan>‧leér‧ders--‧woór‧de-‧boék#
Afrika	Á‧fri‧ka	#Á‧fri‧ka#
Afrikaans	À‧fri‧kaáns	#A‧fri‧kaáns#
Afrikaner	À‧fri‧ká‧ner	#Á‧fri‧ka‧ner#
Amerikaner	A‧mè‧ri‧ká‧ner	#Á‧me‧ri‧ka‧ner#
asyn	a‧sýn	#a‧sýn#
belangrik	be‧láng‧rik	#be>‧láng‧rik#
berg	bérg	#bérg#
berge	bér‧ge	#bér‧ge#
berg-reeks	bérg‧reeks	#bérg-‧reéks#
bos-bedryf	bós‧be‧drỳf	#bós-‧be>‧drýf#
beskou	be‧skoú	#be>s‧koú#
be+ter	bé‧ter	#bé+‧ter#
beton	be‧tón	#be>‧tón#
betoon	be‧toón	#be>‧toón#
Botha	Bó‧tha	#Bó‧tha#
braai	braaí	#braaí#
dokumentasie	dò‧ku‧men‧tá‧sie	#dó‧ku‧men‧ta‧sie#
eiendoms-belasting	eí‧en‧doms‧be‧làs‧ting	#eí‧en‧doms-‧be>‧lás‧ting#
eggo	ég‧go	#eg‧gó#
feste	fés‧te	#fés‧te#
geëet	ge‧eét	#ge‧eét#
gegee	ge‧geé	#ge>‧geé#
gesigs-uitdrukking	ge‧sígs‧uit‧drùk‧king	#ge>‧sígs-‧uit>‧drúk‧king#
ghitaar	ghi‧taár	#ghi‧taár#
hondjie	hón‧djie	#hón‧djie#
hoog--geregs-hof	hoóg‧ge‧règs‧hof	#hoóg--‧ge>‧régs-‧hóf#
Johannesburg	Jo‧hán‧nes‧bùrg	#Jó‧han‧nes‧burg#
karretjie	kár‧re‧tjie	#kár‧re‧tjie#
klu	klúb	#klú#
Macedonië	Mà‧ce‧dó‧ni‧e	#Má‧ce‧do‧ni‧e#
moeder-taal--spreker	moé‧der‧taal‧sprè‧ker	#moé‧der-‧taál--s‧pré‧ker#
'n	'n	#'n#
omstandigheid	om‧stán‧dig‧heíd	#om>s‧tán‧dig<‧heìd#
onweer	on‧weér	#on‧weér#
ondergaan	òn‧der‧gaán	#on‧der>‧gaán#
Paraguay	Pá‧ra‧guay	#Pa‧ra‧gu‧a‧ý#
Pretoria	Pre‧tó‧ri‧a	#Pré‧to‧ri‧a#
sjokolade	sjò‧ko‧lá‧de	#sjó‧ko‧la‧de#
s'n	s'n	#s'n#
spieël	spié‧el	#spié‧el#
Suid-Afrika	Suid-‧Á‧fri‧ka	#Suíd-‧Á‧fri‧ka#
vanaand	va‧naánd	#vá‧naand#
Venesië	Ve‧né‧si‧e	#Vé‧ne‧si‧e#
vinger	víng‧er	#víng‧er#
wîe	wî́‧e	#wî́‧e#
zero	zé‧ro	#ze‧ró#
André		#An‧dré#
Barnard		#Bár‧nard#
Blignaut		#Blíg‧naut#
Blignault		#Blíg‧nault#
Cilliers		#Cíl‧liers#
Coetzee		#Coet‧zeé#
Coetzer		#Coét‧zer#
de Villiers		#dé# #Víl‧liers#
du Plessis		#dú# #Plés‧sis#
du Preez		#dú# #Preéz#
du Toit		#dú# #Toít#
Fouché		#Fou‧ché#
Fourie	Fou‧rié	#Foú‧rie#
Grové		#Gro‧vé#
Jean Pierre		#Jé‧an# #Piér‧re#
Joubert		#Joú‧bert#
La.bus.chag.ne		#Lá.‧bus.‧chag.‧ne#
La.bu.schagne		#Lá.‧bu.s‧chag‧ne#
Labu.schagnė		#Lá‧bu.s‧chag‧nė#
le Gran.ge		#lé# #Grán.‧ge#
le Roux		#lé# #Roúx#
Malan		#Má‧lan#
Malherbe		#Mál‧her‧be#
Marais		#Má‧rais#
Meintjes		#Mein‧tjés#
Naudé		#Nau‧dé#
Nortje		#Nór‧tje#
Pienaar		#Pie‧naár#
Schalk		#Schálk#
Terblanche		#Tér‧blan‧che#
Theron		#The‧rón#
Viljoen		#Vil‧joén#
Visagie		#Ví‧sa‧gie#
Viviers		#Ví‧viers#

test_pron:
Text	Expected	Actual
gegee--gegee-gegee--gegee		#ge>‧ˈgee--‧ge>‧ˈgee-‧ge>‧ˈgee--‧ge>‧ˈgee#
gegee-gegee--gegee-gegee		#ge>‧ˈgee-‧ge>‧ˈgee--‧ge>‧ˈgee-‧ge>‧ˈgee#
aanleerders--woorde-boek	ˈɑːnˌlɪə̯rdərsˌvʊə̯rdəbuk	#aan>‧ˈleer‧ders--‧ˈwoor‧de-‧ˈboek#
Afrika	ˈɑː.fri.ka	#ˈa‧fri‧ka#
Afrikaans	ˌa.friˈkɑ̃ːs, ˌa.friˈkɑːns	#a‧fri‧ˈkaans#
Afrikaner	ˌa.friˈkɑː.nər	#ˈa‧fri‧ka‧ner#
Amerikaner	aˌmɪə̯.riˈkɑː.nər	#ˈa‧me‧ri‧ka‧ner#
asyn	aˈsəɪ̯n	#a‧ˈsyn#
belangrik	bəˈlaŋ.rək	#be>‧ˈlang‧rik#
berg	ˈbɛrχ	#ˈberg#
berge	ˈbɛr.ɡə	#ˈber‧ge#
berg-reeks	ˈbɛrχ.rɪə̯ks	#ˈberg-‧ˈreeks#
bos-bedryf	ˈbɔs.bəˌdrəɪ̯f	#ˈbos-‧be>‧ˈdryf#
beskou	bəˈskœʊ̯	#be>s‧ˈkou#
be+ter	ˈbɪə̯.tər	#ˈbe+‧ter#
beton	bəˈtɔn	#be>‧ˈton#
betoon	bəˈtʊə̯n	#be>‧ˈtoon#
Botha	ˈbʊə̯.ta	#ˈbo‧tha#
braai	brɑːɪ̯	#ˈbraai#
dokumentasie	ˌdɔ.kju.mɛnˈtɑː.si, ˌdɔ.ky.mɛnˈtɑː.si	#ˈdo‧ku‧men‧ta‧sie#
eiendoms-belasting	ˈəɪ̯.ən.dɔms.bəˌlas.təŋ	#ˈei‧en‧doms-‧be>‧ˈlas‧ting#
eggo	ˈɛ.χu	#eg‧ˈgo#
feste	ˈfɛs.tə	#ˈfes‧te#
geëet	χəˈɪə̯t	#ge‧ˈeet#
gegee	χəˈχɪə̯	#ge>‧ˈgee#
gesigs-uitdrukking	χəˈsəχsˌəɪ̯(t).drœ.kəŋ	#ge>‧ˈsigs-‧uit>‧ˈdruk‧king#
ghitaar	ɡiˈtɑːr	#ghi‧ˈtaar#
hondjie	ˈɦœi̯ɲ.ci	#ˈhon‧djie#
hoog--geregs-hof	ˈɦuəχ.χəˌrɛχs.ɦɔf	#ˈhoog--‧ge>‧ˈregs-‧ˈhof#
Johannesburg	jʊə̯ˈɦa.nəsˌbœrχ	#ˈjo‧han‧nes‧burg#
karretjie	ˈka.rəi̯.ci	#ˈkar‧re‧tjie#
klu	klab, klœb	#ˈklu#
Macedonië	ˌma.səˈdʊə̯.ni.ə	#ˈma‧ce‧do‧ni‧e#
moeder-taal--spreker	ˈmudərtɑːlˌspreə̯kər	#ˈmoe‧der-‧ˈtaal--s‧ˈpre‧ker#
'n	ə(n)	#'n#
omstandigheid	ɔmˈstan.dəχˌɦəɪ̯t	#om>s‧ˈtan‧dig<‧ˌheid#
onweer	ˈɔn.vɪə̯r	#on‧ˈweer#
ondergaan	ˌɔn.dərˈχɑːn	#on‧der>‧ˈgaan#
Paraguay	ˈpa.ra.ɡwaɪ̯	#pa‧ra‧gu‧a‧ˈy#
Pretoria	prəˈtʊə̯.ri.a	#ˈpre‧to‧ri‧a#
sjokolade	ˌʃɔ.kɔˈlɑː.də	#ˈsjo‧ko‧la‧de#
s'n	sən	#s'n#
spieël	spiːl	#ˈspie‧el#
Suid-Afrika	səɪ̯tˈɑː.fri.ka	#ˈsuid-‧ˈa‧fri‧ka#
vanaand	fəˈnɑːnt	#ˈva‧naand#
Venesië	vəˈniː.si.ə	#ˈve‧ne‧si‧e#
vinger	ˈfəŋ.ər	#ˈving‧er#
wîe	ˈvəː.(ɦ)ə	#ˈwî‧e#
zero	ˈzɪə̯.ru	#ze‧ˈro#
André	ˈan.drəɪ̯	#an‧ˈdre#
Barnard	ˈbar.nart	#ˈbar‧nard#
Blignaut	ˈbləχ.nœʊ̯t, ˈbli.nœʊ̯	#ˈblig‧naut#
Blignault	ˈbləχ.nœʊ̯t, ˈbli.nœʊ̯	#ˈblig‧nault#
Cilliers	səlˈjeə̯	#ˈcil‧liers#
Coetzee	kutˈseə̯	#coet‧ˈzee#
Coetzer	ˈkut.sər	#ˈcoet‧zer#
de Villiers	də.fəlˈjeə̯	#ˈde# #ˈvil‧liers#
du Plessis	dy.pləˈsi	#ˈdu# #ˈples‧sis#
du Preez	dəˈpreə̯	#ˈdu# #ˈpreez#
du Toit	dəˈtoːɪ̯	#ˈdu# #ˈtoit#
Fouché	fuˈʃeə̯	#fou‧ˈche#
Fourie	fuˈri	#ˈfou‧rie#
Grové	χruˈveə̯	#gro‧ˈve#
Jean Pierre	anˈpiːr	#ˈje‧an# #ˈpier‧re#
Joubert	juˈbæːr	#ˈjou‧bert#
La.bus.chag.ne	la.busˈkaχ.nə	#ˈla.‧bus.‧chag.‧ne#
La.bu.schagne	ˈla.bu.ʃəɪ̯n	#ˈla.‧bu.s‧chag‧ne#
Labu.schagnė	ˈla.bu.ʃəɪ̯n	#ˈla‧bu.s‧chag‧nė#
le Gran.ge	ləˈχran.si	#ˈle# #ˈgran.‧ge#
le Roux	ləˈruː	#ˈle# #ˈroux#
Malan	maˈlan, maˈlaŋ	#ˈma‧lan#
Malherbe	malˈɦɛr.bə	#ˈmal‧her‧be#
Marais	maˈrɛː	#ˈma‧rais#
Meintjes	məɪ̯ɲˈcis	#mein‧ˈtjes#
Naudé	nœʊ̯ˈdeə̯	#nau‧ˈde#
Nortje	nɔrˈkɪə̯	#ˈnor‧tje#
Pienaar	ˈpi.nɑːr	#pie‧ˈnaar#
Schalk	skalk	#ˈschalk#
Terblanche	tərˈblɑːnʃ	#ˈter‧blan‧che#
Theron	t(ə)ˈron	#the‧ˈron#
Viljoen	fəlˈjun	#vil‧ˈjoen#
Visagie	fəˈsɑː.χi, fəˈsɑː.si	#ˈvi‧sa‧gie#
Viviers	fə.fəˈjeə̯	#ˈvi‧viers#

--[[
Author: AmazingJus

This module automatically generates the IPA transcription of Afrikaans terms
based on their grapheme structure, syllabification, affixation and etymology.

It is based on the phonological rules of Afrikaans and the grapheme-phoneme
correspondence of the language. The module processes the input text in several
steps:
1. It canonicalises the input text, decomposing accents and removing extraneous spaces.
2. It syllabifies the words, marking syllable boundaries and handling digraphs and trigraphs.
3. It applies affixes, marking prefixes and suffixes with special characters.
4. It assigns stress to syllables based on predefined rules.
5. It generates the IPA transcription by substituting graphemes with their phonetic equivalents.

Sources:
- Donaldson, Bruce C. (1993). A Grammar of Afrikaans.
- Wissing, Daan (2016). "Afrikaans phonology". Taalportaal.
--]]

local export = {}

local lang = require("Module:languages").getByCode("af")
local sc = require("Module:scripts").getByCode("Latn")
local hyph = require("Module:hyphenation")
local str_util = require("Module:string utilities")
local tbl = require("Module:table")
local ipa = require("Module:IPA")  -- IPA display module

-- Tag `text` with this module's language (`lang`) and script (`sc`) objects by
-- delegating to Module:script utilities; `face` is passed through unchanged.
function export.tag_text(text, face)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, lang, sc, face)
end

-- Build a full link for `term` via Module:links, using this module's language
-- (`lang`) and script (`sc`) objects; `face` is passed through unchanged.
function export.link(term, face)
	local link_data = { term = term, lang = lang, sc = sc }
	return require("Module:links").full_link(link_data, face)
end

local u = require("Module:string/char")
local decomp = mw.ustring.toNFD
local recomp = mw.ustring.toNFC
local lower = mw.ustring.lower

local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local split = mw.text.split
local sub = mw.ustring.sub

local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.gmatch

-- Single-result wrapper around rsubn(): performs the substitution and returns
-- only the resulting string, dropping the replacement count.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub() with the same pattern/replacement until a pass leaves
-- the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- list of accents
-- (U+0300..U+0308 are Unicode combining marks; `u` comes from
-- Module:string/char and is presumably a code-point-to-string helper — confirm)
local grave = u(0x0300) -- grave
local acute = u(0x0301) -- acute
local circ = u(0x0302) -- circumflex
local dia = u(0x0308) -- diaeresis
local syll = "‧" -- syllable dot

-- for automatically generated stress
-- (kept distinct from the user-typed grave/acute above; toIPA treats
-- auto_grave like grave when choosing the secondary stress mark)
local auto_grave = u(0xFFF0) -- automatic grave
local auto_acute = u(0xFFF1) -- automatic acute

-- list of char classes
-- (raw member lists, without surrounding pattern brackets; the "%"-escaped
-- entries in syll_bound are Lua-pattern escapes for "+", "-" and ".")
local accent = grave .. acute .. auto_grave .. auto_acute .. circ .. dia
local stress_accent = grave .. acute .. auto_grave .. auto_acute
local vowel = "aeiouyAEIOUY"
local cons = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ"
local syll_bound = syll .. "#<>%+%-%."

-- put them into classes
-- NOTE(review): every class below is an empty string as written, and the raw
-- lists above (accent, stress_accent, vowel, cons, syll_bound) are never
-- referenced in the code shown. The names and trailing comments suggest each
-- was meant to be a bracketed pattern class built from those lists, with the
-- bracketed text lost from this copy — restore and confirm before relying on
-- any pattern that concatenates these.
local A = "" -- all accents
local AS = "" -- all stress accents
local V = "" -- all vowels
local non_V = "" -- all non-vowels
local C = "" -- all consonants
local non_C = "" -- all non-consonants
local CV = "" -- all consonants and vowels
local S = "" -- all syllable boundaries
local non_S = "" -- all non-syllable boundaries

-- list of valid trigraphs and digraphs, including diphthongs and long vowels
-- Maps a spelling (key) to its IPA value; the keys are iterated by the sort
-- below and used by syllabify() to bracket multi-letter graphemes.
-- NOTE(review): the keys are missing from every entry in this copy (each line
-- starts at " = "), so the table is not valid Lua as shown. They look like
-- bracketed string keys that were stripped — restore from the live module.
local graphemes = {
	 = "ɑːɪ̯",
	 = "iʊ̯",
	 = "iʊ̯",
	 = "uɪ̯",
	 = "oːɪ̯",
	 = "ɑː",
	 = "ɑː",
	 = "aɪ̯",
	 = "œʊ̯",
	 = "ɪə̯",
	 = "əɪ̯",
	 = "iʊ̯",
	 = "į", -- temporary value
	 = "ů", -- temporary value
	 = "ɔɪ̯",
	 = "ʊə̯",
	 = "œʊ̯",
	 = "uɪ̯",
	 = "ü" -- temporary value
}
-- collect the grapheme spellings and order them longest-first, so that
-- trigraphs are tried before the digraphs they contain
local graphemes_sorted = {}
for grapheme in pairs(graphemes) do
	table.insert(graphemes_sorted, grapheme)
end
table.sort(graphemes_sorted, function(x, y)
	return len(x) > len(y)
end)

-- list of various grapheme sets
-- Only referenced from the commented-out section of export.toIPA (as
-- sets.vowel_length) in the code shown here.
-- NOTE(review): the keys of both sub-tables, and of each vowel pair, are
-- missing in this copy (lines start at " = "), so this is not valid Lua as
-- shown. Presumably bracketed string keys were stripped — restore from the
-- live module.
local sets = {
	 = { -- long-short vowels
		 = {"a", "ɑː"},
		 = {"ɛ", "ɪə̯"},
		 = {"ə", "i"},
		 = {"ɔ", "ʊə̯"},
		 = {"œ", "y"}
	},
	 = { -- voiced/voiceless consonants
		{"b", "p"},
		{"d", "t"},
		{"ʤ", "ʧ"},
		{"ɡ", "k"},
		{"v", "f"},
		{"z", "s"},
		{"ʒ", "ʃ"},
	}
}

-- list of defined affixes
-- Each entry is a table whose first positional element is the affix spelling.
-- Optional fields read by is_valid_affix():
--   restriction - pattern the remainder of the word must match (prefixes only)
--   pos         - pattern that the caller-supplied part of speech must match
-- apply_affixes() walks each list in order and stops at the first valid
-- affix, so an earlier entry wins over a later one.
-- NOTE(review): the two sub-table keys are missing in this copy (lines start
-- at " = "); apply_affixes() reads them as affixes.pre and affixes.suf.
local affixes = {
	-- prefixes
	 = {
		{"aan"},
		{"agter"},
		{"be", restriction = "^"},
		{"deur"},
		{"er"},
		{"ge", restriction = "^"},
		{"her"},
		{"om"},
		{"ont"},
		{"onder"},
		{"uit"},
		{"van", pos = "d"},
		{"ver"},
		{"voor"}
	},
	-- suffixes
	 = {
		{"agtig"},
		{"baar"},
		{"dom"},
		{"end"},
		{"heid"},
		{"lik"},
		{"loos"},
		{"nis"},
		{"sel"},
		{"skap"}
	}
}

-- words to be treated as unstressed
-- (not referenced by any function in the code shown here)
local unstressed = { "die", "dit", "is", "nie", "'n" }

-- list of stressed endings (mostly in loanwords)
-- Entries are either a plain pattern string or a table whose first element is
-- the pattern plus optional fields:
--   orig   - etymology code the ending is limited to (the check in
--            assign_stress() is currently commented out)
--   stress - "pre" puts the stress on the syllable before the ending
-- ORDER MATTERS: assign_stress() tests the entries top to bottom against the
-- end of the word and stops at the first match.
-- NOTE(review): "ie" is listed twice; the second occurrence can never be
-- reached. Some patterns may also have lost bracketed character classes in
-- this copy — confirm against the live module.
local stressed_endings = {
	"aa",
	"aans?",
	"aard?",
	"ant",
	"a",
	"ee?",
	"ein",
	{"el", orig = "loan"}, -- only in loanwords
	"ent",
	"eu",
	"e",
	"ieel",
	"ie",
	"ine",
	"ie",
	{"o", orig = "fr"}, -- only in french loanwords
	"oen",
	"on",
	"oo",
	{"sie", stress = "pre"},
	"teek",
	"teit",
	"uu",
	"u",
	"y?",
}

-- list of respelling substitutions
-- Each entry appears to be {pattern, replacement, etymology code}, where "-"
-- marks a default rule and any other code ("fr", "en", "af") limits the rule
-- to that etymology (see generate_subs). The list is only consumed from the
-- commented-out section of export.toIPA in the code shown here.
-- NOTE(review): several patterns look truncated in this copy — e.g. "ch(?)",
-- "c()", "n(‧?)", and the lone "s" under SH/SJ do not match what their
-- comments describe. Bracketed character classes were probably stripped.
-- The entries built from non_V, CV, S and C also depend on those class
-- strings, which are empty as defined above. Restore from the live module.
local subs = {
	-- 'N
	{"#'n#", "#ə(n)#", "-"}, -- pronounced /ə(n)/ as the article 'n
	{"'n#", "ən#", "-"}, -- pronounced /ən/ otherwise

	-- CH
	{"ch", "ʃ", "fr"}, -- pronounced /ʃ/ in french loans
	{"sch", "sk", "-"}, -- pronounced /sk/ in the sequence "sch"
	{"ch(?)", "χ%1", "-"}, -- pronounced /χ/ before optional consonant cluster and "e" or "i"
	{"ch", "k", "-"}, -- otherwise /k/

	-- NG
	{"ng", "ŋ", "-"}, -- pronounced /ŋ/

	-- SH/SJ
	{"s", "ʃ", "-"}, -- pronounced /ʃ/

	-- DJ/TJ
	{"jie", "kį", "-"}, -- pronounced /-ci/ in the suffix "-djie"/"-tjie"
	{"dj", "ʤ", "-"}, -- "dj" is otherwise /d͡ʒ/
	{"tj", "ʧ", "-"}, -- "tj" is otherwise /t͡ʃ/

	-- C
	{"c()", "s%1", "-"}, -- pronounced /s/ before "e" or "i"
	{"c", "k", "-"}, -- otherwise /k/

	-- GH
	{"gh", "ɡ", "-"}, -- pronounced /ɡ/

	-- G
	{"g", "ɡ", "en"}, -- pronounced /ɡ/ in english loans
	{"r‧ge", "r‧ɡe", "-"}, -- pronounced /ɡ/ between /r/ and /ə/
	{"g", "χ", "-"}, -- otherwise /χ/
	{"n(‧?)", "ŋ%1", "-"}, -- /ŋ/ is an allophone of /n/ before /ɡ/ and /k/

	-- V
	{"v", "f", "af"}, -- pronounced /f/ in native words

	-- W
	{"w", "w", "en"}, -- pronounced /w/ in english loans
	{"w", "v", "-"}, -- otherwise /v/

	-- EAU
	{"eaux?", "OU", "fr"}, -- pronounced /œʊ̯/ in french loans

	-- OI
	{"oi", "wA", "fr"}, -- pronounced /wa/ in french loans

	-- IJ
	{"ij(" .. non_V .. ")", "EI%1", "-"}, -- pronounced /əɪ̯/ in dutch-based names

	-- X
	{"#x", "#s", "-"}, -- pronounced /s/ word-initially
	{"x", "ks", "-"}, -- otherwise /ks/

	-- H
	{"(" .. CV .. ")h", "%1", "-"}, -- silent if part of consonant digraph or syllable-final
	{"h", "ɦ", "-"}, -- otherwise /ɦ/

	-- O
	{"o(" .. S .. ")", "OU%1", "en"}, -- pronounced /œʊ̯/ in open syllables in english loans
	{"o#", "ů#", "-"}, -- otherwise /u/ in word-final position

	-- U
	{"u(" .. C .. ")", "A%1", "en"}, -- pronounced /a/ in closed syllables in english loans
	{"u", "jů", "en"}, -- otherwise /ju/ in english loans

	-- Y
	{"y", "j", "en"}, -- pronounced /j/ in english loans
	{"y", "EI", "-"}, -- otherwise /əɪ̯/

	-- circumflex accent
	{circ, "ː", "-"} -- lengthens a vowel with its short quality
}

-- canonicalisation function
-- Normalises raw input and returns it as an array of space-separated words:
-- accents are decomposed (NFD), whitespace runs are collapsed and trimmed, and
-- a comma followed by whitespace becomes a standalone "|" pause token.
local function canonicalise(text)
	local normalised = decomp(text)

	-- remove extraneous spaces: collapse runs first, then strip both ends
	normalised = rsub(normalised, "%s+", " ")
	normalised = rsub(rsub(normalised, "^%s+", ""), "%s+$", "")

	-- treat commas as a pause, but only if there is a space afterwards
	normalised = rsub_repeatedly(normalised, "%s?,%s", " | ")

	return split(normalised, " ")
end

-- syllabification function
-- Inserts the syllable separator `syll` into `word` and returns the result.
-- Multi-letter graphemes are wrapped in curly braces first so that separators
-- are not left inside them, then separators are shifted around certain
-- consonant sequences and runs of separators are collapsed to one.
-- NOTE(review): several patterns here look damaged in this copy (see inline
-- notes), and V, C and A are empty strings as defined above. Restore from the
-- live module before changing any logic.
local function syllabify(word)
	-- remove diaeresis and split syllable (note: diaeresis shouldn't be displayed in its hyphenated form)
	word = rsub(word, "(" .. V .. ")" .. dia, syll .. "%1")

	-- mark trigraphs and digraphs with curly braces
	-- (graphemes_sorted is longest-first, so trigraphs are wrapped before digraphs)
	for _, graph in ipairs(graphemes_sorted) do
		word = rsub(word, graph, "{" .. graph .. "}")
	end

	-- add dot before consonant + vowel
	word = rsub(word, "(" .. C .. "?{?" .. V .. A .. "?)", syll .. "%1")

	-- remove any dots inside brackets
	-- NOTE(review): as written "{*}" matches zero or more "{" followed by "}";
	-- presumably a class between the braces was stripped — confirm
	word = rsub(word, "{*}", function(a) return rsub(a, syll, "") end)

	-- shift dot before certain consonant clusters and digraphs
	-- NOTE(review): "()" is a position capture as written, so %1 would be a
	-- number; a consonant class was presumably inside the parentheses — confirm
	word = rsub(word, "()‧l", syll .. "%1l") -- clusters with l
	word = rsub(word, "()‧r", syll .. "%1r") -- clusters with r
	word = rsub(word, "()‧j", syll .. "%1j") -- digraphs with j
	word = rsub(word, "()‧h", syll .. "%1h") -- digraphs with h
	word = rsub(word, "n‧g", "ng‧") -- ng is syllable-final

	-- remove leading dots and brackets
	-- word = rsub(word, "(" .. S .. ")(" .. non_V .. "+)" .. syll, "%1" .. syll .. "%2")
	-- word = rsub(word, "^(" .. non_V .. "*)" .. syll, syll .. "%1")
	-- word = rsub(word, "%.", syll)
	-- NOTE(review): the pattern below is empty as written; given the comment
	-- above it presumably stripped the curly braces — confirm
	word = rsub(word, "", "") -- comment out to debug
	return rsub(word, syll .. "+", syll) -- remove multiple syllable dots
end

-- hyphen depth check function
-- Returns the pattern fragment for a literal hyphen ("%-") when `depth` is 1
-- and an empty string for every other depth.
local function is_hyphen_depth(depth)
	if depth == 1 then
		return "%-"
	end
	return ""
end

-- onset validation function
-- Takes an already-syllabified string and returns true when it begins with
-- the syllable separator, or with "s" followed by the separator; false
-- otherwise. is_valid_affix() calls it on the remainder after a prefix.
-- NOTE(review): the trailing `.. ""` in the second pattern suggests a
-- character class was stripped from this copy — confirm against the live
-- module. The parameter name also shadows the global `string` library
-- inside this function.
local function is_valid_onset(string)
	-- check if matching syllable onset (including ones starting with s)
	if find(string, "^" .. syll) or find(string, "^s" .. syll .. "") then
		return true
	end
	return false
end

-- rest of string function
-- Returns `string` with `affix` cut off by length: from the front when
-- affix_type is "pre", from the end for any other affix_type. Only the length
-- of `affix` is used; it is not checked that `string` actually contains it.
local function get_rest_string(string, affix, affix_type)
	local affix_len = len(affix)
	if affix_type ~= "pre" then
		-- drop the last affix_len characters
		return sub(string, 1, -affix_len - 1)
	end
	-- drop the first affix_len characters
	return sub(string, affix_len + 1)
end

-- affix validation function
-- Decides whether `affix` may be split off `string` as a prefix
-- (affix_type == "pre") or suffix (any other affix_type).
-- Each empty if/elseif branch below is a rejection that falls through to the
-- final `return false`; only the `else` branch tests the affix against the
-- string.
-- Returns false when rejected, otherwise the (truthy) result of find() on a
-- match, or false when the affix pattern is not found.
-- NOTE(review): `affix` is read as a table (affix.pos, affix.restriction) but
-- also concatenated into a pattern and passed to get_rest_string(), which
-- takes its length. An index selecting the affix spelling (the first element
-- of the entry) appears to be missing in this copy — confirm against the
-- live module.
local function is_valid_affix(string, affix, affix_type, pos, depth)
	-- get rest of string
	local rest = get_rest_string(string, affix, affix_type)

	-- check for existing pos restriction
	if affix.pos and not find(pos, affix.pos) then
	-- then for explicit non-boundaries
	elseif affix.restriction and not find(rest, affix.restriction) and affix_type == "pre" then
	-- then for matching syllable onset
	elseif not is_valid_onset(syllabify(rest)) and affix_type == "pre" then
	-- then for explicit word boundary
	elseif find(rest, "^%+") and affix_type == "pre" then
	-- then for no vowels
	elseif not find(rest, V) and affix_type == "pre" then
	-- then only for two or less chars
	elseif find(rest, "^..?$") then
	else
		-- match hyphen at appropriate depth
		local hyphen = is_hyphen_depth(depth)
		-- match appropriate pattern
		-- (prefix: anchored at the start, hyphen after; suffix: anchored at the end, hyphen before)
		local pattern = affix_type == "pre" and "^" .. affix .. hyphen or hyphen .. affix .. "$"
		return true and find(string, pattern) or false
	end

	return false
end

-- affix application function
-- Marks at most one prefix and at most one suffix in `string`: the first
-- prefix in affixes.pre accepted by is_valid_affix() gets ">" appended, and
-- the first accepted suffix in affixes.suf gets "<" prepended. At depth 1 the
-- affix must be joined to the word by a hyphen, and the marker replaces that
-- hyphen. Returns the (possibly unchanged) string.
-- NOTE(review): `affix` is a table entry of `affixes` but is concatenated
-- into the patterns below as if it were a string. An index selecting the
-- spelling appears to be missing in this copy — confirm against the live
-- module.
local function apply_affixes(string, depth, pos)
	-- match hyphen at appropriate depth
	local hyphen = is_hyphen_depth(depth)
	-- process prefixes
	for _, affix in ipairs(affixes.pre) do
		if is_valid_affix(string, affix, "pre", pos, depth) then
			-- add prefix marker >
			string = rsub(string, "^" .. affix .. hyphen, affix .. ">")
			break
		end
	end
	-- process suffixes
	-- (validated against the string after any prefix marker was inserted)
	for _, affix in ipairs(affixes.suf) do
		if is_valid_affix(string, affix, "suf", pos, depth) then
			-- add suffix marker <
			string = rsub(string, hyphen .. affix .. "$", "<" .. affix)
			break
		end
	end
	return string
end

-- stress assignment function (does not apply to depth zero)
-- Adds combining stress accents to an already-syllabified component and
-- returns it. The passes run in this order:
--   1. stressed endings: the first entry of stressed_endings matching the end
--      of the word gets an acute (or the syllable before it does, when the
--      entry has stress == "pre");
--   2. if no acute is present yet, a pass keyed on the ">" prefix marker;
--   3. if there is still no acute, the first match only is stressed;
--   4. a grave pass keyed on the "<" suffix marker, which always runs.
-- `etyl` and `pos` are accepted but not read by any active code here.
-- NOTE(review): several patterns look damaged in this copy (the "(*)" groups,
-- and ">*" / "<*" where the comments imply a required marker), and V, non_V
-- and A are empty strings as defined above. Restore from the live module
-- before changing any logic.
local function assign_stress(string, etyl, pos)
	-- get string without syllables for pattern matching
	local string_no_syll = decomp(rsub(string, syll, ""))

	-- check for stressed endings
	for _, ending in ipairs(stressed_endings) do
		-- handle table entries with additional properties
		-- NOTE(review): both branches assign `ending` itself; for table entries
		-- an index selecting the pattern string appears to be missing — confirm
		local pattern
		if type(ending) == "table" then
			pattern = ending
			-- FIXME - breaks stress
			-- if ending.orig and ending.orig ~= etyl then
			-- 	break
			-- end
		else
			pattern = ending
		end

		-- find and stress the ending if matched in string
		local ending_match = match(string_no_syll, "(" .. pattern .. ")$")
		if ending_match then
			-- escape special characters in ending_match for pattern matching
			local escaped_ending = str_util.pattern_escape(ending_match)
			-- find corresponding ending in original string
			local before_ending, full_ending = match(string, "(.*)(" .. syll .. "?" .. escaped_ending .. ")$")

			if full_ending then
				-- add acute accent on final syllable before ending if pre-stressed
				-- (before_ending / full_ending are spliced into the patterns below unescaped)
				if ending.stress == "pre" then
					string = rsub(string, "(" .. before_ending .. ")", function(a)
						return rsub(a, "(" .. non_V .. "*" .. V .. "+" .. A .. "*)(*)$", "%1" .. acute .. "%2")
					end)
				-- otherwise add acute accent on initial syllable of ending
				else
					string = rsub(string, "(" .. full_ending .. ")", function(a)
						return rsub(a, "^(" .. non_V .. "*" .. V .. "+" .. A .. "*)(*)", "%1" .. acute .. "%2")
					end)
				end
			end
			-- break after successful match
			-- (also taken when full_ending was not found in the syllabified string)
			break
		end
	end

	-- check for > and add acute accent on the following syllable
	if not find(string, acute) then
		string = rsub(string, "(>*" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
			return a .. acute
		end)
	end

	-- otherwise add acute accent to first syllable if no stress marks present
	-- (rsubn with a limit of 1; only its first return value, the string, is kept)
	if not find(string, acute) then
		string = rsubn(string, "(" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
			return a .. acute
		end, 1)
	end

	-- likewise check for < and add grave accent on the following syllable
	string = rsub(string, "(<*" .. syll .. "?" .. non_V .. "*" .. V .. "+" .. A .. "*)", function(a)
		return a .. grave
	end)

	return string
end

-- component generation function
-- Analyses every word in `words` and returns them joined by single spaces,
-- each wrapped in "#" word-boundary markers. A word is processed in three
-- nested levels:
--   depth 0 - split on "--" (double-hyphen compounds)
--   depth 1 - mark hyphenated affixes, then split on "-"
--   depth 2 - mark unhyphenated affixes, syllabify, and (unless `hide_stress`
--             is set) assign stress
-- The `depth` argument is accepted for interface compatibility; analysis of
-- every word always starts at depth 0.
local function to_components(words, etyl, pos, depth, hide_stress)
	local split_components

	-- split `word` on `separator`, process each piece one level deeper and glue
	-- the results back together with `joiner`; a word with no separator is
	-- passed down whole
	local function descend(word, separator, joiner, etyl, pos, depth, hide_stress)
		local pieces = split(word, separator)
		if #pieces <= 1 then
			return split_components(word, etyl, pos, depth + 1, hide_stress)
		end
		local processed = {}
		for _, piece in ipairs(pieces) do
			table.insert(processed, split_components(piece, etyl, pos, depth + 1, hide_stress))
		end
		return table.concat(processed, joiner)
	end

	-- parse a single component at the given depth
	split_components = function(word, etyl, pos, depth, hide_stress)
		-- defaults for calls made without depth or part of speech
		depth = depth or 0
		pos = pos or ".*"

		if depth == 0 then
			-- double hyphen compounds come first
			return descend(word, "%-%-", "--", etyl, pos, depth, hide_stress)
		elseif depth == 1 then
			-- mark hyphen-attached prefixes/suffixes with > and < before splitting further
			word = apply_affixes(word, depth, pos)
			return descend(word, "%-", "-", etyl, pos, depth, hide_stress)
		elseif depth == 2 then
			-- mark non-hyphenated affixes, then syllabify
			word = syllabify(apply_affixes(word, depth, pos))
			if hide_stress then
				return word
			end
			-- stress is assigned only after syllabification
			return assign_stress(word, etyl, pos)
		end

		return word
	end

	-- process every word in turn
	local results = {}
	for i, word in ipairs(words) do
		local analysed = split_components(word, etyl, pos, 0, hide_stress)
		-- drop the syllable boundary left in front of the word's first syllable
		analysed = rsub(analysed, "^(" .. C .. "*)" .. syll, "%1")
		results[i] = "#" .. analysed .. "#"
	end

	return table.concat(results, " ")
end

-- generate substitutions function
-- Intended to filter `subs` down to the rules that apply for `etyl`: rules
-- tagged with that etymology, plus the default ("-") rules, as a list of
-- {pattern, replacement} pairs.
-- `term` and `pos` are accepted but not read. The only call site in the code
-- shown here is inside the commented-out section of export.toIPA.
-- NOTE(review): as written this always returns an empty list, because
-- `seen_patterns` is a table (truthy) and `not seen_patterns` is never true.
-- `s_patt, s_repl, s_etyl = s, s, s` also assigns the same entry three times.
-- Indexes on `s` and a key on `seen_patterns` appear to be missing in this
-- copy — confirm against the live module.
local function generate_subs(term, etyl, pos)
	local to_sub = {}
	local seen_patterns = {}

	for _, s in ipairs(subs) do
		local s_patt, s_repl, s_etyl = s, s, s

		-- only add if pattern wasn't added already
		if not seen_patterns then
			-- add substitution for etymology-specific rules
			if etyl ~= "-" and s_etyl == etyl then
				table.insert(to_sub, {s_patt, s_repl})
				seen_patterns = true
			-- otherwise add substitution for default rules
			elseif s_etyl == "-" then
				table.insert(to_sub, {s_patt, s_repl})
				seen_patterns = true
			end
		end
	end

	return to_sub
end

-- hyphenation function (FIXME: make it more dynamic depending on how it's inputted)
-- Canonicalises `text`, runs the component analysis with stress assignment
-- switched off, and returns the marked-up string recomposed to NFC. The
-- result still carries the "#", ">" and "<" markers.
function export.hyphenation(text, etyl, pos)
	-- final argument true = hide stress (syllabify and mark affixes only)
	local marked = to_components(canonicalise(text), etyl, pos, 0, true)
	return recomp(marked)
end

-- stress assignment function
-- Canonicalises `text`, runs the component analysis with stress assignment
-- enabled, and returns the marked-up string recomposed to NFC. The result
-- still carries the "#", ">" and "<" markers.
function export.stress(text, etyl, pos)
	-- final argument false = keep the stress accents
	local marked = to_components(canonicalise(text), etyl, pos, 0, false)
	return recomp(marked)
end

-- pronunciation function
-- Canonicalises `text`, runs the component analysis with stress assignment,
-- lowercases the result and replaces each stress accent with an IPA stress
-- mark placed in front of the run of non-boundary characters preceding it.
-- The grapheme-to-phoneme substitution stage is currently commented out, so
-- the returned string is still spelling-based rather than phonemic.
function export.toIPA(text, etyl, pos)
	-- analyse the canonicalised words (stress shown), then lowercase
	local term = to_components(canonicalise(text), etyl, pos, 0, false)
	term = lower(term)

	-- convert accents to stress marks
	local stress_pattern = "(" .. non_S .. "*)(" .. AS .. ")"
	term = rsub(term, stress_pattern, function(run, mark)
		if mark == grave or mark == auto_grave then
			-- grave accent: secondary stress
			return "ˌ" .. run
		end
		-- anything else: primary stress
		return "ˈ" .. run
	end)

	--[[
	-- prepare table to substitute the appropriate phonemes based on etymology and part of speech
	local to_sub = generate_subs(term, etyl, pos)

	-- go over substitution table
	for _, s in ipairs(to_sub) do
		local k, v = s, s
		rsub(term, k, v)
	end

	-- make text lowercase again
	term = lower(term)

	-- substitute graphemes
	for graph, phoneme in pairs(graphemes) do
		term = rsub(term, graph, phoneme)
	end

	-- substitute single-letter vowels
	term = rsub(term, "()()", function(a, b)
		if match("", b) then
			return sets.vowel_length .. b -- for open syllables
		else
			return sets.vowel_length .. b -- for closed syllables
		end
	end)

	-- replace į, ů, ü with their actual phonetic values
	term = rsub(term, "", { = "i",  = "u",  = "y"})

	-- remove double consonants
	term = rsub(term, "(.)(‧?)%1", "%2%1")
	]]--

	-- final adjustments
	-- term = rsub(term, "‧", ".")
	-- re-remove syllable boundaries directly after "#" left over from analysing components
	return rsub(term, "#" .. syll, "#")
	-- return rsub(term, "]", "")
end

-- main export function
-- Entry point for {{#invoke:}}. Every non-empty positional argument is
-- treated as a term to transcribe; when there are none, the current page
-- title is used instead. The named arguments `etyl` and `pos` are passed
-- through to the hyphenation and IPA functions for every term.
-- Returns the formatted IPA line followed by a "* " bullet with the
-- syllabification line.
function export.show(frame)
	-- get arguments and page title (tolerate a frame without an args table)
	local args = frame.args or {}
	local pagetitle = mw.title.getCurrentTitle().text

	-- initialise parameters
	local ipa_items = {}
	local hyph_items = {
		lang = lang,
		sc = sc,
		hyphs = {},
		caption = "Syllabification"
	}

	-- collect the non-empty positional arguments
	local terms = {}
	for _, v in ipairs(args) do
		if v ~= "" then
			terms[#terms + 1] = v
		end
	end

	-- fall back to the page title when no term was supplied; previously this
	-- fallback sat in the else-branch of `if args`, which is never reached
	-- when the frame has an args table, so nothing was processed
	if #terms == 0 then
		terms[1] = pagetitle
	end

	-- etymology and part of speech are the same for every term
	local etyl = args.etyl
	local pos = args.pos

	for _, term in ipairs(terms) do
		-- get hyphenation and transcription
		local syllables = export.hyphenation(term, etyl, pos)
		local pron = export.toIPA(term, etyl, pos)

		-- add to results
		table.insert(ipa_items, {pron = "/" .. pron .. "/"})
		table.insert(hyph_items.hyphs, {hyph = split(syllables, syll)})
	end

	-- format final output
	return ipa.format_IPA_full{ lang = lang, items = ipa_items } .. "\n* " .. hyph.format_hyphenations(hyph_items)
end

return export

Module:User:AmazingJus/af

Wikious

Boobota

Sagapedia