-- Module documentation (page heading "Modül belgelemesi", kept as a comment so the file stays valid Lua)

-- Yazarlar: Benwing, ZxxZxxZ, Atitarev

local export = {}

local U = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local gcodepoint = mw.ustring.gcodepoint

-- Version of rsubn() that discards all but the first return value (rsubn
-- also returns the number of substitutions made), so the result can be
-- assigned or passed along as a single string.
--   term: the string to operate on
--   foo:  the pattern, passed to rsubn unchanged
--   bar:  the replacement (string or table), passed to rsubn unchanged
-- NOTE(review): this is declared as a global, as in the original; consider
-- `local function` if nothing outside this module relies on it.
function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

-- Named one-character strings, each built from its Unicode code point with
-- U (mw.ustring.char). They are concatenated into the patterns and
-- replacements used further down in this module.
local zwnj = U(0x200c) -- zero-width non-joiner
local alif_madda = U(0x622) -- U+0622
local alif_hamza_below = U(0x625) -- U+0625
local alif = U(0x627) -- U+0627
local taa_marbuuTa = U(0x629) -- U+0629
local laam = U(0x644) -- U+0644
local waaw = U(0x648) -- U+0648
local alif_maqSuura = U(0x649) -- U+0649
local yaa = U(0x64A) -- U+064A
local fatHataan = U(0x64B) -- U+064B
local Dammataan = U(0x64C) -- U+064C
local kasrataan = U(0x64D) -- U+064D
local fatHa = U(0x64E) -- U+064E
local Damma = U(0x64F) -- U+064F
local kasra = U(0x650) -- U+0650
local shadda = U(0x651) -- U+0651
local sukuun = U(0x652) -- U+0652
local dagger_alif = U(0x670) -- U+0670
local alif_waSl = U(0x671) -- U+0671
-- The following characters are currently unused and left disabled.
--local zwj = U(0x200d) -- zero-width joiner
--local lrm = U(0x200e) -- left-to-right mark
--local rlm = U(0x200f) -- right-to-left mark

-- Transliteration table. It is passed as the replacement table in
-- rsub(text, ".", tt), i.e. it is looked up one character at a time, so
-- each entry is meant to map a single source character to its Latin
-- transliteration.
-- NOTE(review): every entry below appears to have lost its `["…"]` key
-- (only `="value"` remains) and the closing `}` of the table is missing;
-- the keys must be restored from the upstream module before this can run.
local tt = {
	-- consonants
	="b", ="t", ="s̱", ="c", ="ḥ", ="ḫ",
	="d", ="ẕ", ="r", ="z", ="s", ="ş",
	="ṣ", ="ḍ", ="ṭ", ="ẓ", ="ʿ", ="ġ",
	="f", ="ḳ", ="k", ="k", ="l", ="m", ="n",
	-- tāʾ marbūṭa (special) - always after a fatḥa (a), silent at the end of
	-- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn. We catch
	-- most instances of tāʾ marbūṭa before we get to this stage.
	="t", -- tāʾ marbūṭa = ة
	-- control characters
	="-", -- ZWNJ (zero-width non-joiner)
	-- ="", -- ZWJ (zero-width joiner)
	-- rare letters
	="p", ="ç", ="v", ="v", ="g", ="g", ="ḳ",
	-- semivowels or long vowels, alif, hamza, special letters
	="ā", -- alif
	-- letters carrying hamza
	="ʾ", -- alif with hamza
	="ʾ", -- hamza below alif
	="ʾ", -- waw with hamza
	="ʾ", -- yā with hamza
	="ʾ", -- hamza on the line (standalone)
	-- long vowels
	="v", --"ū" after ḍamma (u) and not before diacritic
	="y", --"ī" after kasra (i) and not before diacritic
	="ā", -- ʾalif maqṣūra
	="ʾā", -- ʾalif madda
	= "", -- hamzat al-waṣl
	 = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
	-- short vowels, shadda and sukūn
	="an", -- fatḥatān
	="un", -- ḍammatān
	="in", -- kasratān
	="a", -- fatḥa
	="u", -- ḍamma
	="i", -- kasra
	-- shadda - doubled consonant
	="", --sukūn - no vowel
	-- ligatures
	-- taṭwīl
	="", -- taṭwīl, silent
	-- digits
	="1", ="2", ="3", ="4", ="5",
	="6", ="7", ="8", ="9", ="0",
	-- punctuation (leave on separate lines)
	="?", -- question mark
	='“', -- quotation mark
	='”', -- quotation mark
	=".", -- decimal point
	=",", -- thousands separator
	="%", -- percent sign
	=",", -- comma
	=";" -- semicolon

-- Letters that assimilate the l of the definite article ("sun letters").
local sun_letters = "تثدذرزسشصضطظلن"
-- "ince harfler" / "kalın harfler" are Turkish for "thin letters" /
-- "thick letters".
-- NOTE(review): presumably these choose between the e/ü and a/u vowel
-- renderings in the transliteration function — confirm against its patterns.
local ince_harfler = "ءإأبتثجدذزسشفكلمنوؤهيئة"
local kalin_harfler = "حخرصضطظعغق"
-- For use in implementing sun-letter assimilation of ال (al-).
--   ttsun1[letter] = transliteration of the sun letter
--   ttsun2[letter] = transliteration, a hyphen, then the letter itself
--   ttsun3         = list of the sun-letter transliterations
local ttsun1 = {}
local ttsun2 = {}
local ttsun3 = {}
for cp in gcodepoint(sun_letters) do
	local ch = U(cp)
	-- Index by the current letter. Assigning or concatenating the whole
	-- `tt` table would overwrite the lookup tables on every iteration and
	-- raise "attempt to concatenate a table value".
	ttsun1[ch] = tt[ch]
	ttsun2[ch] = tt[ch] .. "-" .. ch
	table.insert(ttsun3, tt[ch])
end
-- For use in implementing elision of al-
local sun_letters_tr = table.concat(ttsun3, "")

-- Character sets kept as plain strings so they can be spliced into pattern
-- character classes.
-- NOTE(review): none of these names is referenced in the code as it stands;
-- presumably they were used inside bracket expressions that have been lost
-- from the patterns below — confirm against the upstream module.
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچڤگڨڧأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels .. "وي"
-- Arabic semicolon, comma, question mark; taṭwīl; period, exclamation point,
-- single quote for bold/italic
local punctuation = "؟،؛" .. "ـ" .. ".!'"
-- Arabic-Indic digits
local numbers = "١٢٣٤٥٦٧٨٩٠"

-- Ordered list of {pattern, replacement} pairs that the transliteration
-- function applies, in sequence, before it checks whether the text carries
-- diacritics. Order matters: several rules rely on earlier ones having run.
-- NOTE(review): many patterns here contain an empty capture `()` or an empty
-- string where a `[...]` character class was evidently intended, and the
-- table's closing `}` is missing; restore these from the upstream module
-- before relying on any individual rule.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	-- convert llh for allāh into ll+shadda+dagger-alif+h
	{"لله", "للّٰه"},
	-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
	-- replaced with short-vowel+shadda during NFC normalisation, which
	-- MediaWiki does for all Unicode strings; however, it makes the
	-- transliteration process inconvenient, so undo it.
	{"()" .. shadda, shadda .. "%1"},
	-- ignore alif jamīla (otiose alif in 3pl verb forms)
	--     #1: handle ḍamma + wāw + alif (final -ū)
	{Damma .. waaw .. alif, Damma .. waaw},
	--     #2: handle wāw + sukūn + alif (final -w in -aw in defective verbs)
	--     this must go before the generation of w, which removes the waw here.
	{waaw .. sukuun .. alif, waaw .. sukuun},
	-- ignore final alif or alif maqṣūra following fatḥatan (e.g. in accusative
	-- singular or words like عَصًا "stick" or هُذًى "guidance"; this is called
	-- tanwin nasb)
	{fatHataan .. "", fatHataan},
	-- same but with the fatḥatan placed over the alif or alif maqṣūra
	-- instead of over the previous letter (considered a misspelling but
	-- common)
	{"" .. fatHataan, fatHataan},
	-- tāʾ marbūṭa should always be preceded by fatḥa, alif, alif madda or
	-- dagger alif; infer fatḥa if not
	{"()" .. taa_marbuuTa, "%1" .. fatHa .. taa_marbuuTa},
	-- similarly for alif between consonants, possibly marked with shadda
	-- (does not apply to initial alif, which is silent when not marked with
	-- hamza, or final alif, which might be pronounced as -an)
	{"(" .. shadda .. "?)" .. alif .. "()",
		"%1" .. fatHa .. alif .. "%2"},
	-- infer fatḥa in case of non-fatḥa + alif/alif-maqṣūra + dagger alif
	{"()(" .. dagger_alif .. ")", "%1" .. fatHa .. "%2"},
	-- infer kasra in case of hamza-under-alif not + kasra
	{alif_hamza_below .. "()", alif_hamza_below .. kasra .. "%1"},
	-- ignore dagger alif placed over regular alif or alif maqṣūra
	{"()" .. dagger_alif, "%1"},

	----------- rest of these concern definite article alif-lām ----------
	-- in kasra/ḍamma + alif + lam, make alif into hamzatu l-waṣl, so we
	-- handle cases like بِالتَّوْفِيق (bi-t-tawfīq) correctly
	{"()" .. alif .. laam, "%1" .. alif_waSl .. laam},
	-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
	{"^(" .. alif .. fatHa .. "?" .. laam .. ")" .. shadda, "%1"},
	{"%s(" .. alif .. fatHa .. "?" .. laam .. ")" .. shadda, " %1"},
	{"(" .. alif_waSl .. fatHa .. "?" .. laam .. ")" .. shadda, "%1"},
	-- handle l- hamzatu l-waṣl or word-initial al-
	-- (this adaptation writes the article as "el-")
	{"^" .. alif .. fatHa .. "?" .. laam, "el-"},
	{"%s" .. alif .. fatHa .. "?" .. laam, " el-"},
	-- next one for bi-t-tawfīq
	{"()" .. alif_waSl .. fatHa .. "?" .. laam, "%1-l-"},
	-- next one for remaining hamzatu l-waṣl (at beginning of word)
	{alif_waSl .. fatHa .. "?" .. laam, "l-"},
	-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
	-- so we don't mistakenly double the dash
	{"l%-" .. shadda, "ll"},
	-- implement assimilation of sun letters
	-- NOTE(review): the replacement is a table, which gsub looks up by the
	-- first capture (or the whole match); the pattern as written has no
	-- capture for the following letter — presumably lost, confirm.
	{"l%-", ttsun2},

-- Transliterate the word(s) in TEXT. LANG (the language) and SC (the script)
-- are ignored. OMIT_I3RAAB means leave out final short vowels (ʾiʿrāb).
-- GRAY_I3RAAB means render transliterated short vowels (ʾiʿrāb) in gray.
-- FORCE_TRANSLIT causes even non-vocalized text to be transliterated
-- (normally the function checks for non-vocalized text and returns nil,
-- since such text is ambiguous in transliteration).
-- Returns the transliterated string, or nil as described above.
-- NOTE(review): this function is damaged as it stands: the header below has
-- lost the function name and first parameter, block terminators (`end`,
-- `else`, `})`) are missing, and most patterns have lost their `[...]`
-- character classes (leaving empty captures `()` or empty strings). Restore
-- from the upstream module; the comments describe the intended behaviour.
function, lang, sc, omit_i3raab, gray_i3raab, force_translit)
	-- make it possible to call this function from a template
	if type(text) == "table" then
		-- treat empty-string arguments as absent
		local function f(x) return (x ~= "") and x or nil end
		-- NOTE(review): the argument indices on text.args appear to be
		-- missing; also note gray_i3raab is not assigned on this path.
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args), f(text.args), f(text.args), f(text.args), f(text.args)

	-- apply the pre-check substitutions in order
	-- NOTE(review): each entry is a {pattern, replacement} pair, so this
	-- presumably should pass the two elements separately — confirm.
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = rsub(text, sub, sub)

	-- refuse to transliterate text judged unvocalized unless forced
	if not force_translit and not has_diacritics(text) then
		return nil
	------------ transformations after checking for diacritics --------------
	-- Replace plain alif with hamzatu l-waṣl when followed by fatḥa/ḍamma/kasra.
	-- Must go after handling of initial al-, which distinguishes alif-fatḥa
	-- from alif w/hamzatu l-waṣl. Must go before generation of ū and ī, which
	-- eliminate the ḍamma/kasra.
	text = rsub(text, alif .. "()", alif_waSl .. "%1")
	-- ḍamma + waw not followed by a diacritic is ū, otherwise w
	text = rsub(text, Damma .. waaw .. "()", "ū%1")
	text = rsub(text, Damma .. waaw .. "$", "ū")
	-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
	text = rsub(text, kasra .. yaa .. "()", "ī%1")
	text = rsub(text, kasra .. yaa .. "$", "ī")
	-- convert shadda to double letter.
	text = rsub(text, "(.)" .. shadda, "%1%1")
	if not omit_i3raab and gray_i3raab then -- show ʾiʿrāb grayed in transliteration
		-- decide whether to gray out the t in ﺓ. If word begins with al- or l-, yes.
		-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
		text = rsub(text, "^(a?l%-+)" .. taa_marbuuTa .. "()",
			'%1<span style="color: #888888">t</span>%2')
		text = rsub(text, "(%sa?l%-+)" .. taa_marbuuTa .. "()",
			'%1<span style="color: #888888">t</span>%2')
		text = rsub(text, taa_marbuuTa .. "()", "t%1")
		text = rsub(text, taa_marbuuTa .. "()",
			'<span style="color: #888888">t</span>%1')
		-- gray out tanwīn endings, then word-final short vowels
		-- NOTE(review): the keys of the three replacement tables below and
		-- their closing `})` are missing.
		text = rsub(text, ".", {
			 = '<span style="color: #888888">an</span>',
			 = '<span style="color: #888888">in</span>',
			 = '<span style="color: #888888">un</span>'
		text = rsub(text, "()%s", {
			 = '<span style="color: #888888">a</span> ',
			 = '<span style="color: #888888">i</span> ',
			 = '<span style="color: #888888">u</span> '
		text = rsub(text, "$", {
			 = '<span style="color: #888888">a</span>',
			 = '<span style="color: #888888">i</span>',
			 = '<span style="color: #888888">u</span>'
		-- merge adjacent gray spans into one
		text = rsub(text, '</span><span style="color: #888888">', "")
	elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
		text = rsub(text, "", "")
		text = rsub(text, "%s", " ")
		text = rsub(text, "$", "")
	-- tāʾ marbūṭa should not be rendered by -t if word-final even when
	-- ʾiʿrāb (desinential inflection) is shown; instead, use (t) before
	-- whitespace, nothing when final; but render final -ﺍﺓ and -ﺁﺓ as -āh,
	-- consistent with Wehr's dictionary
	text = rsub(text, "()" .. taa_marbuuTa .. "$", "%1h")
	-- Ignore final tāʾ marbūṭa (it appears as "a" due to the preceding
	-- short vowel). Need to do this after graying or omitting word-final
	-- ʾiʿrāb.
	text = rsub(text, taa_marbuuTa .. "$", "")
	if not omit_i3raab then -- show ʾiʿrāb in transliteration
		text = rsub(text, taa_marbuuTa .. "%s", "(t) ")
		-- NOTE(review): the comment below describes the omit-ʾiʿrāb case, so
		-- an `else` presumably belongs before it — confirm.
		-- When omitting ʾiʿrāb, show all non-absolutely-final instances of
		-- tāʾ marbūṭa as (t), with trailing ʾiʿrāb omitted.
		text = rsub(text, taa_marbuuTa, "(t)")
	-- tatwīl should be rendered as - at beginning or end of word. It will
	-- be rendered as nothing in the middle of a word (FIXME, do we want
	-- this?)
	text = rsub(text, "^ـ", "-")
	text = rsub(text, "%sـ", " -")
	text = rsub(text, "ـ$", "-")
	text = rsub(text, "ـ%s", "- ")
	-- "thin letters" (Turkish: ince harfler)
	-- NOTE(review): presumably the first four rules turn short vowels and
	-- tanwīn into e/ü/en/ün in some letter context and the next four restore
	-- the original diacritic in another; the character classes that
	-- define those contexts are missing — confirm.
	text = rsub(text, "()()", "%1" .. "e")
	text = rsub(text, "()()", "%1" .. "ü")
	text = rsub(text, "()()", "%1" .. "en")
	text = rsub(text, "()()", "%1" .. "ün")
	text = rsub(text, "()(e)()", "%1" .. fatHa .. "%3")
	text = rsub(text, "()(ü)()", "%1" .. Damma .. "%3")
	text = rsub(text, "()(en)()", "%1" .. fatHataan .. "%3")
	text = rsub(text, "()(ün)()", "%1" .. Dammataan .. "%3")
	-- Now convert remaining Arabic chars according to table.
	text = rsub(text, ".", tt)
	-- collapse a vowel followed by ā into a single ā
	text = rsub(text, "()(ā)", "ā")
	text = rsub(text, "aā", "ā")
	-- Implement elision of al- after a final vowel. We do this
	-- conservatively, only handling elision of the definite article rather
	-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
	-- or form-VII and above verbal nouns) partly because elision in
	-- these cases isn't so common in MSA and partly to avoid excessive
	-- elision in case of words written with initial bare alif instead of
	-- properly with hamzated alif. Possibly we should reconsider.
	-- At the very least we currently don't handle elision of الَّذِي (allaḏi)
	-- correctly because we special-case it to appear without the hyphen;
	-- perhaps we should reconsider that.
	-- NOTE(review): the two rsub calls below are cut off after the pattern
	-- (no replacement argument, no closing parenthesis).
	text = rsub(text, "('* +'*)a(%-)",
	if gray_i3raab then
		text = rsub(text, "('*</span>'* +'*)a(%-)",
	-- Special-case the transliteration of allāh, without the hyphen
	text = rsub(text, "^(a?)l%-lāh", "%1llāh")
	text = rsub(text, "(%sa?)l%-lāh", "%1llāh")

	return text

-- Ordered list of {pattern, replacement} pairs used by has_diacritics().
-- The rules progressively delete everything that is acceptable in vocalized
-- text; has_diacritics() then tests whether anything is left over.
-- NOTE(review): most patterns below are empty strings or empty captures
-- where a `[...]` character class was evidently intended, and the table's
-- closing `}` is missing; restore from the upstream module.
local has_diacritics_subs = {
	-- FIXME! What about lam-alif ligature?
	-- remove punctuation and shadda
	-- must go before removing final consonants
	{"", ""},
	-- Remove consonants at end of word or utterance, so that we're OK with
	-- words lacking iʿrāb (must go before removing other consonants).
	-- If you want to catch places without iʿrāb, comment out the next two lines.
	{"$", ""},
	{"%s", " "},
	-- remove consonants (or alif) when followed by diacritics
	-- must go after removing shadda
	-- do not remove the diacritics yet because we need them to handle
	-- long-vowel sequences of diacritic + pseudo-consonant
	{"()", "%1"},
	-- the following two must go after removing consonants w/diacritics because
	-- we only want to treat vocalic wāw/yā' in them (we want to have removed
	-- wāw/yā' followed by a diacritic)
	-- remove ḍamma + wāw
	{Damma .. waaw, ""},
	-- remove kasra + yā'
	{kasra .. yaa, ""},
	-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
	{"", ""},
	-- remove diacritics
	{"", ""},
	-- remove numbers, hamzatu l-waṣl, alif madda
	{"", ""},
	-- remove non-Arabic characters
	-- (anything outside U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF
	-- and U+FE70-FEFF)
	{"[^" .. U(0x0600) .. "-" .. U(0x06FF) .. U(0x0750) .. "-" .. U(0x077F) ..
			 U(0x08A0) .. "-" .. U(0x08FF) .. U(0xFB50) .. "-" .. U(0xFDFF) ..
			 U(0xFE70) .. "-" .. U(0xFEFF) .. "]", ""}

-- Report whether TEXT is considered vocalized.
-- Applies every {pattern, replacement} pair of has_diacritics_subs to TEXT
-- in order and returns true when nothing is left afterwards, false
-- otherwise. The transliteration function uses a false result to refuse
-- unvocalized input.
-- Kept global (not local) because it is referenced from a function defined
-- earlier in the file.
function has_diacritics(text)
	for _, sub in ipairs(has_diacritics_subs) do
		-- sub[1] is the pattern and sub[2] the replacement; passing the pair
		-- table itself as the pattern would make the substitution fail.
		text = rsub(text, sub[1], sub[2])
	end
	return #text == 0
end

-- Return true if transliteration TR is an irregular transliteration of
-- ARABIC. Return false if ARABIC can't be transliterated. For purposes of
-- establishing regularity, hyphens are ignored and word-final tāʾ marbūṭa
-- can be transliterated as "(t)", "" or "t".
-- Also returns false when either argument is nil or empty, and true when
-- the word counts of ARABIC, TR and the automatic transliteration differ.
-- NOTE(review): this function is damaged as it stands: block terminators
-- (`end`) are missing, the right-hand side of `regtr` is absent, the
-- per-word `[i]` indices are missing, and the clitic character class has
-- been lost. Restore from the upstream module.
function export.irregular_translit(arabic, tr)
	if not arabic or arabic == "" or not tr or tr == "" then
		return false
	-- NOTE(review): presumably the automatic transliteration of `arabic`
	-- is computed here — the expression is missing.
	local regtr =
	if not regtr or regtr == tr then
		return false
	-- compare word by word, splitting all three strings on single spaces
	local arwords = rsplit(arabic, " ")
	local regwords = rsplit(regtr, " ")
	local words = rsplit(tr, " ")
	if #regwords ~= #words or #regwords ~= #arwords then
		return true
	for i=1,#regwords do
		-- NOTE(review): these presumably should select element i of each
		-- list; as written they bind the whole tables.
		local regword = regwords
		local word = words
		local arword = arwords
		-- Resolve final (t) in auto-translit to t, h or nothing
		if rfind(regword, "%(t%)$") then
			regword = rfind(word, "āh$") and rsub(regword, "%(t%)$", "h") or
				rfind(word, "t$") and rsub(regword, "%(t%)$", "t") or
				rsub(regword, "%(t%)$", "")
		-- Resolve clitics + short a + alif-lām, which may get auto-transliterated
		-- to contain long ā, to short a if the manual translit has it; note
		-- that currently in cases with assimilated l, the auto-translit will
		-- fail, so we won't ever get here and don't have to worry about
		-- auto-translit l against manual-translit assimilated char.
		local clitic_chars = "^" -- separate line to avoid L2R display weirdness
		if rfind(arword, clitic_chars .. fatHa .. "?" .. laam) and rfind(word, "^a%-") then
			regword = rsub(regword, "^()ā", "%1a")
		-- Ignore hyphens when comparing
		if rsub(regword, "%-", "") ~= rsub(word, "%-", "") then
			return true
	return false

return export

-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet: