Module:sla-common

The following documentation is located at Module:sla-common/documentation. Categories were auto-generated by Module:module categorization.
Useful links: subpage list • links • transclusions • testcases • sandbox
This module contains common helper functions for Proto-Slavic that are needed by other modules.
local export = {}

local m_links = require("Module:links")
local m_table_tools = require("Module:table tools")

local lang = require("Module:languages").getByCode("sla-pro")

local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local toNFD = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub

-- Combining diacritics, built from their Unicode code points.
local AC = u(0x0301) -- U+0301 combining acute accent
local GR = u(0x0300) -- U+0300 combining grave accent
local CFLEX = u(0x0302) -- U+0302 combining circumflex accent
local TILDE = u(0x0303) -- U+0303 combining tilde
local BREVE = u(0x0306) -- U+0306 combining breve
local INVBREVE = u(0x0311) -- U+0311 combining inverted breve
local DOUBLEAC = u(0x030B) -- U+030B combining double acute accent
local DOUBLEGR = u(0x030F) -- U+030F combining double grave accent
local MACRON = u(0x0304) -- U+0304 combining macron
local CARON = u(0x030C) -- U+030C combining caron
local OGONEK = u(0x0328) -- U+0328 combining ogonek

-- Character sets, each paired with a "_c" pattern character class that matches
-- one character from the set. The "_c" values must be bracketed classes: an
-- empty string here would make every pattern built from them match the wrong
-- thing (e.g. `"^" .. non_vowels_c .. "*$"` would degenerate to "^*$").
local stressed_accents = AC .. GR .. INVBREVE .. DOUBLEGR .. DOUBLEAC .. TILDE
local stressed_accents_c = "[" .. stressed_accents .. "]"
-- All accents: the stress accents plus the (non-stress) macron.
local accents = stressed_accents .. MACRON
local accents_c = "[" .. accents .. "]"
local vowels = "aeiouyьъěęǫ"
local vowels_c = "[" .. vowels .. "]"
-- Anything that is not a vowel letter; this includes combining accents.
local non_vowels_c = "[^" .. vowels .. "]"
local short_vowels = "eoьъ"
local short_vowels_c = "[" .. short_vowels .. "]"
local long_vowels = "aiuyěęǫ"
local long_vowels_c = "[" .. long_vowels .. "]"
-- A consonant: neither a vowel letter nor one of the accents above.
-- NOTE(review): reconstructed definition; confirm the exact exclusion set
-- against the upstream module.
local cons_c = "[^" .. vowels .. accents .. "]"
local iotated_cons = "čďjľňřšťž"
local iotated_cons_c = "[" .. iotated_cons .. "]"

-- Wrapper around rsubn() that yields only the substituted string; the extra
-- parentheses truncate the call to its first return value, dropping the
-- replacement count.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- Wrap FORM, prefixed with "*", in an HTML element tagged as Proto-Slavic
-- (lang="sla-pro"). TAG names the element and defaults to "span". An empty
-- FORM is shown as an em dash instead.
function export.tag_form(form, tag)
	if form == "" then
		return "&mdash;"
	end
	local element = tag or "span"
	return "<" .. element .. " lang=\"sla-pro\" class=\"Unicode\">*" .. form .. "</" .. element .. ">"
end

-- Make a link out of a form, or show a dash if empty.
--
-- FORM may be:
--   * a string: linked as the reconstructed term "*" .. FORM, or rendered as
--     an em dash when it is the empty string;
--   * a table without a `notesym` field: treated as a list of forms, each of
--     which is linked recursively; the results are joined with ", ";
--   * a table with a `notesym` field: the first element is the term to link,
--     and the note symbols are appended as superscript.
--     NOTE(review): the term is read from form[1]; the previous code
--     concatenated the table itself ("*" .. form), which raises an error.
--     Confirm the element layout against the callers.
-- TAG is only passed along to the recursive calls.
function export.link_form(form, tag)
	if type(form) == "table" then
		if form.notesym then
			return m_links.full_link({ lang = lang, term = "*" .. form[1] }) ..
				m_table_tools.superscript_notes(form.notesym)
		end
		local links = {}
		for _, subform in ipairs(form) do
			table.insert(links, export.link_form(subform, tag))
		end
		return table.concat(links, ", ")
	elseif form ~= "" then
		return m_links.full_link({ lang = lang, term = "*" .. form })
	else
		return "&mdash;"
	end
end

-- Maps a base letter followed by a combining diacritic (the shape produced by
-- NFD) back to the precomposed letter. These are the letters that are treated
-- as single units rather than as letter + accent.
local recomposer = {
	["e" .. CARON] = "ě", -- Latin e and E
	["E" .. CARON] = "Ě",
	["e" .. OGONEK] = "ę", -- Latin e and E
	["E" .. OGONEK] = "Ę",
	["o" .. OGONEK] = "ǫ", -- Latin o and O
	["O" .. OGONEK] = "Ǫ",
	["c" .. CARON] = "č",
	["C" .. CARON] = "Č",
	["d" .. CARON] = "ď",
	["D" .. CARON] = "Ď",
	["l" .. CARON] = "ľ",
	["L" .. CARON] = "Ľ",
	["n" .. CARON] = "ň",
	["N" .. CARON] = "Ň",
	["r" .. CARON] = "ř",
	["R" .. CARON] = "Ř",
	["s" .. AC] = "ś",
	["S" .. AC] = "Ś",
	["s" .. CARON] = "š",
	["S" .. CARON] = "Š",
	["t" .. CARON] = "ť",
	["T" .. CARON] = "Ť",
	["z" .. CARON] = "ž",
	["Z" .. CARON] = "Ž",
}

-- Decompose acute, grave, etc. on letters into individual character +
-- combining accent. But recompose characters that we want to treat
-- as units and get caught in the crossfire.
--
-- The pattern matches any character followed by a caron, ogonek or acute, so
-- that the two-character keys of `recomposer` can actually be looked up (a
-- bare "." only ever yields one-character matches). Pairs that are not keys of
-- `recomposer` (e.g. a vowel + acute) are left unchanged, because gsub keeps
-- the original match when the table lookup yields nil.
function export.decompose(text)
	return (rsub(toNFD(text), ".[" .. CARON .. OGONEK .. AC .. "]", recomposer))
end

-- Same as export.decompose(), with one extra canonicalization step: any
-- circumflex is rewritten to an inverse breve, in case the former was used by
-- accident. Returns only the resulting string.
function export.canon_decompose(text)
	local decomposed = export.decompose(text)
	-- Assigning to a single local discards gsub's replacement count.
	local canonical = decomposed:gsub(CFLEX, INVBREVE)
	return canonical
end

-- Raise an assertion failure unless TEXT is already in the canonical
-- decomposed form, i.e. unless export.canon_decompose() leaves it unchanged.
function export.assert_decomposed(text)
	local canonical = export.canon_decompose(text)
	assert(text == canonical)
end

-- Apply the first palatalization to the end of STEM and return the result.
-- The final three-, two- and one-character sequences are looked up in turn;
-- a sequence that is not a key of the table is left unchanged.
--
-- NOTE(review): the table keys were missing from this copy of the code (only
-- the values survived, leaving invalid table constructors). They have been
-- reconstructed from the values: the velars k/g/x and the clusters sk/zg, plus
-- c/dz/ś/sc/zdz, assumed here to be the second-palatalization outcomes mapped
-- onto the first-palatalization ones. Verify against the upstream module.
function export.first_palatalization(stem)
	stem = rsub(stem, "...$", {["zdz"]="ždž"})
	stem = rsub(stem, "..$", {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"})
	stem = rsub(stem, ".$", {["k"]="č", ["g"]="ž", ["x"]="š", ["c"]="č", ["ś"]="š"})
	return stem
end

-- Apply the second palatalization to the final character of STEM and return
-- the result: k -> c, g -> dz, x -> ś. Any other final character is left
-- unchanged.
--
-- NOTE(review): the table keys were missing from this copy of the code and
-- have been reconstructed from the surviving values; verify against the
-- upstream module.
function export.second_palatalization(stem)
	return rsub(stem, ".$", {["k"]="c", ["g"]="dz", ["x"]="ś"})
end

-- Iotate the end of STEM and return the result. The final three-, two- and
-- one-character sequences are looked up in turn; a sequence that is not a key
-- of the table is left unchanged. If the result still does not end in one of
-- the iotated consonants, "j" is appended.
--
-- NOTE(review): the table keys were missing from this copy of the code (only
-- the values survived). The single-letter keys are reconstructed from the
-- values, which are listed in alphabetical order of their source consonant.
-- The cluster keys are assumed to be the same as in
-- export.first_palatalization(), whose value lists are identical; in
-- particular the key for the second "šč" ("sc" vs. "st") is uncertain.
-- Verify against the upstream module.
function export.iotate(stem)
	stem = rsub(stem, "...$", {["zdz"]="ždž"})
	stem = rsub(stem, "..$", {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"})
	stem = rsub(stem, ".$", {
		["b"]="bľ",
		["c"]="č",
		["d"]="ď",
		["g"]="ž",
		["k"]="č",
		["l"]="ľ",
		["m"]="mľ",
		["n"]="ň",
		["p"]="pľ",
		["r"]="ř",
		["s"]="š",
		["ś"]="š",
		["t"]="ť",
		["v"]="vľ",
		["x"]="š",
		["z"]="ž",
	})
	
	-- Nothing above produced an iotated consonant (e.g. the stem ends in a
	-- vowel), so mark the iotation with an explicit "j".
	if not rfind(stem, iotated_cons_c .. "$") then
		stem = stem .. "j"
	end
	return stem
end

-- Report whether WORD carries a stress accent. WORD must already be in
-- canonical decomposed form (asserted). The result is whatever rfind()
-- returns: truthy match positions when a stress accent is present, nil
-- otherwise.
function export.is_stressed(word)
	export.assert_decomposed(word)
	local stress_pattern = stressed_accents_c
	return rfind(word, stress_pattern)
end

-- Return WORD with every stress accent deleted. WORD must already be in
-- canonical decomposed form (asserted). The macron is not a stress accent and
-- is therefore kept.
function export.make_unstressed(word)
	export.assert_decomposed(word)
	local stripped = rsub(word, stressed_accents_c, "")
	return stripped
end

-- Report whether WORD is nonsyllabic, i.e. consists solely of non-vowel
-- characters (the empty string qualifies). WORD must already be in canonical
-- decomposed form (asserted). The result is whatever rfind() returns.
function export.is_nonsyllabic(word)
	export.assert_decomposed(word)
	local no_vowel_pattern = "^" .. non_vowels_c .. "*$"
	return rfind(word, no_vowel_pattern)
end

-- Report whether WORD is monosyllabic, i.e. contains exactly one vowel
-- character with any number of non-vowel characters on either side. WORD must
-- already be in canonical decomposed form (asserted). The result is whatever
-- rfind() returns.
function export.is_monosyllabic(word)
	export.assert_decomposed(word)
	local margin = non_vowels_c .. "*"
	local one_vowel_pattern = "^" .. margin .. vowels_c .. margin .. "$"
	return rfind(word, one_vowel_pattern)
end

-- Set the accent in STEM to ACCENT, replacing any stressed accent already
-- there. If there isn't such an accent already then:
-- (1) If the accent is inverse breve (= old circumflex or short accent) or
--     double grave (= old short accent), put it on the first syllable;
-- (2) If the accent is tilde (= neoacute), put it on the last syllable;
-- (3) If the accent is a single grave (= old acute), put it on the vowel if
--     there's only one, otherwise don't add it as it can go anywhere.
-- Placing the accent will replace any unstressed accent already there
-- (specifically the macron).
--
-- In addition, if the accent is tilde (= neoacute), we put the accent on the
-- last syllable of the stem, regardless of any existing accent. The logic here
-- is that, in nouns at least, a neoacute on the stem that we request (i.e. not
-- already in the stem) is always retracted from the ending, and thus should
-- go on the last syllable if there is more than one. FIXME: May not apply to
-- verbs.
--
-- Also apply certain conversions to the result:
-- (1) Original short vowels e o ь ъ can't get a macron. Per Derksen 2008,
--     this also includes liquid diphthongs, which normally behave like
--     long vowels; cf. 'borzdà' "burrow" in class b, where you expect the
--     preceding vowel to be long if possible. However, we go against
--     Derksen in this respect when the first vowel is e or o because Czech,
--     Slovak and Polish show clear length distinctions (or reflections thereof)
--     in original pre-tonic syllables in class b vs. c. (Serbo-Croat reflects
--     length in both classes but this can be a later development due to
--     analogy.) Per Kortlandt, the metathesis of liquid diphthongs preceded
--     Dybo's law and (probably) the shortening of pre-tonic vowels.
-- (2) Original long vowels a i u y ě ę ǫ can't get a double grave, nor can
--     liquid diphthongs; instead, convert to inverse breve (circumflex accent).
-- (3) Original short vowels e o ь ъ not in liquid diphthongs can't receive a
--     tilde (neoacute) per a May 2019 discussion (the link to it is missing
--     from this copy of the comment); instead we convert to single grave.
--
-- STEM must already be in canonical decomposed form (asserted). ACCENT is one
-- of the combining accents, or "-" to strip all accents. Passing the double
-- grave raises an error. Returns the modified stem.
--
-- NOTE(review): the "[eo]" and "[lr]" character classes below were missing
-- from this copy of the code and have been restored from the description of
-- liquid diphthongs above (short vowel + l/r + consonant; e/o only for the
-- macron case). Without them the macron substitution used "()", a position
-- capture, so "%1" spliced a number into the stem, and the liquid-diphthong
-- checks tested for any following consonant. Verify against the upstream
-- module.
function export.set_accent(stem, accent)
	export.assert_decomposed(stem)
	-- string containing a hyphen is the value of UNK = unknown, and removes
	-- all accents including macrons
	if accent == "-" then
		return rsub(stem, accents_c, "")
	end
	if accent == DOUBLEGR then
		error("Double grave should not be specified as an accent; use inverted breve instead")
	end
	if not export.is_stressed(stem) and accent ~= TILDE and
		(accent ~= GR or export.is_monosyllabic(stem)) then
		-- If no stressed accent, put one on the first syllable, removing any
		-- non-stress accent, i.e. macron (it doesn't matter which accent we put
		-- as long as it's a stress accent, as it will be overwritten in the
		-- next clause). But don't do this if accent is a tilde (no point, it
		-- will be ignored and removed in the next clause), and if the accent is
		-- a grave, only do this if the stem is monosyllabic.
		stem = rsub(stem, "^(.-" .. vowels_c .. ")" .. accents_c .. "*",
			"%1" .. INVBREVE)
	end
	if accent == TILDE then
		-- If a tilde, cancel out any existing stressed accent and put the tilde
		-- on the last syllable. (FIXME, might not apply to verbs.) Later on
		-- we will convert this to a single grave if it's on a short monophthong.
		stem = export.make_unstressed(stem)
		stem = rsub(stem, "^(.*" .. vowels_c .. ")" .. accents_c .. "*",
			"%1" .. TILDE)
	else
		-- Otherwise just replace the stressed accent, if any, with the given
		-- accent. There will always be such an accent except in multisyllabic
		-- words where the accent is a single grave; in other circumstances
		-- we added an accent on the first syllable if it was missing.
		stem = rsub(stem, stressed_accents_c .. "+", accent)
	end
	if accent == MACRON then
		-- hack to handle liquid diphthongs: generate two macrons, since the
		-- following regex will remove one.
		stem = rsub(stem, "([eo])" .. MACRON .. "([lr]" .. cons_c .. ")",
			"%1" .. MACRON .. MACRON .. "%2")
		stem = rsub(stem, "(" .. short_vowels_c .. ")" .. MACRON, "%1")
	end
	-- Convert inverse breve after short vowel not in liquid diphthong to
	-- double grave.
	if rfind(stem, short_vowels_c .. INVBREVE) and
		not rfind(stem, short_vowels_c .. INVBREVE .. "[lr]" .. cons_c) then
		stem = rsub(stem, INVBREVE, DOUBLEGR)
	end
	-- Convert tilde after short vowel not in liquid diphthong to single grave.
	if rfind(stem, short_vowels_c .. TILDE) and
		not rfind(stem, short_vowels_c .. TILDE .. "[lr]" .. cons_c) then
		stem = rsub(stem, TILDE, GR)
	end
	return stem
end

-- Work out where the accent goes in the unstressed WORD for accent pattern AP
-- ("a", "b" or "c"). WORD is split into stem, desinence and final accent, and
-- a list of candidate analyses is returned. The list is empty when nothing
-- could be inferred, and has more than one entry when several accentuations
-- are possible (e.g. with *voľa-type nouns). Each candidate is a three-entry
-- list {stem, desinence, final_accent}. Asserts that WORD is unstressed.
local function infer_accent(word, ap)
	assert(not export.is_stressed(word))
	local candidates = {}
	local stem, desinence, final_accent = export.split_stem_desinence(word)

	-- Candidate with the stem untouched and ENDING_ACCENT in the final-accent
	-- slot (after the desinence).
	local function accent_after_desinence(ending_accent)
		table.insert(candidates, {stem, desinence, ending_accent})
	end

	-- Candidate with STEM_ACCENT applied to the stem via export.set_accent()
	-- and the original final accent kept.
	local function accent_on_stem(stem_accent)
		table.insert(candidates, {export.set_accent(stem, stem_accent), desinence, final_accent})
	end

	if ap == "a" then
		-- Only inferable when the stem has a single vowel.
		if export.is_monosyllabic(stem) then
			accent_on_stem(GR)
		end
	elseif ap == "b" then
		if export.is_nonsyllabic(stem) then
			accent_after_desinence(GR)
		elseif desinence == "ь" or desinence == "ъ" then
			accent_on_stem(TILDE)
		else
			accent_after_desinence(GR)
			if desinence == "a" and rfind(stem, iotated_cons_c .. "$") then
				-- *voľa-type accent
				accent_on_stem(TILDE)
			end
		end
	elseif ap == "c" then
		if export.is_nonsyllabic(stem) then
			accent_after_desinence(INVBREVE)
		elseif desinence == "a" then
			accent_after_desinence(GR)
		else
			accent_on_stem(INVBREVE)
		end
	end
	return candidates
end

-- If WORD is unstressed, add the appropriate accent for the accent pattern AP
-- if possible (it won't be possible with accent pattern a in words with a
-- multisyllabic stem). If WORD is stressed, check that the accent on the word
-- is appropriate for the accent pattern, and throw an error if not. In either
-- case, return three values, STEM, DESINENCE and FINAL_ACCENT, which when
-- concatenated together produce the (possibly newly accented) word.
--
-- When no accent can be inferred for AP, WORD is simply split with
-- export.split_stem_desinence() and returned as is. When several
-- accentuations are possible for an unstressed WORD, the first one is used.
--
-- Each entry of the list returned by infer_accent() is a three-entry list
-- {stem, desinence, final_accent}; its elements must be read by index.
-- Returning or concatenating the entry table itself would hand tables to the
-- caller and fail on the ".." below.
function export.auto_accent_and_check_accents(word, ap)
	local unstressed = not export.is_stressed(word)
	if unstressed then
		local possible_accents = infer_accent(word, ap)
		if #possible_accents == 0 then
			return export.split_stem_desinence(word)
		end
		local first = possible_accents[1]
		return first[1], first[2], first[3]
	else
		-- Infer the expected accentuation(s) from the unstressed version of
		-- the word and compare each against what was actually given.
		local uword = export.make_unstressed(word)
		local possible_accents = infer_accent(uword, ap)
		if #possible_accents == 0 then
			return export.split_stem_desinence(word)
		end
		local possible_words = {}
		for _, split_possible in ipairs(possible_accents) do
			local stem, desinence, final_accent =
				split_possible[1], split_possible[2], split_possible[3]
			local possible_word = stem .. desinence .. final_accent
			if possible_word == word then
				return stem, desinence, final_accent
			end
			table.insert(possible_words, possible_word)
		end
		error("For accent pattern " .. ap .. ", accented lemma should look like " ..
			table.concat(possible_words, " or ") .. " but is actually " .. word)
	end
end

-- Split WORD into three parts and return them: the stem (everything before
-- the last character), the desinence (a single character) and the final
-- accent (an optional accent following the desinence; the empty string if
-- there is none). WORD must already be in canonical decomposed form
-- (asserted). Raises an error when WORD cannot be split this way.
function export.split_stem_desinence(word)
	export.assert_decomposed(word)
	local split_pattern = "^(.-)(.)(" .. accents_c .. "?)$"
	local stem, desinence, final_accent = rmatch(word, split_pattern)
	if stem and desinence then
		return stem, desinence, final_accent
	end
	error("Something wrong with '" .. word .. "', probably too short")
end

return export
Module:sla-common

Wikious

Boobota

Sagapedia