Module:pa-Arab-translit/sandbox

The following documentation is located at Module:pa-Arab-translit/sandbox/documentation. Categories were auto-generated by Module:module categorization.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
Issues

Bari ye isn't working with anything other than an alif
Middle y paired with zer should return "īy"
Middle y stand-alone or paired with a jazm should return "ey"
دُھواں should return "dhūāṉ", not "dhūvāṉ"
Hamza should be transliterated as: [symbol missing in this copy], in contrast to the ain
local U = require("Module:string/char")
local gsub = mw.ustring.gsub
local export = {}

-- Diacritics and special marks. Those written as U(0x...) are produced from
-- their code point by Module:string/char; the others are literal characters.
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- U+200C ZERO WIDTH NON-JOINER. Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- U+0651 ARABIC SHADDA
local jazm = "ْ" -- literal combining mark; presumably U+0652 ARABIC SUKUN -- TODO confirm
local he = "ہ" -- literal letter; presumably U+06C1 ARABIC LETTER HEH GOAL -- TODO confirm
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF

-- Letter inventories kept as plain strings.
-- NOTE(review): no pattern visible in this copy of the module refers to these
-- names; they were presumably spliced into character classes that are missing
-- here -- confirm against the canonical module page.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ" 
local semivowel = "یو"
-- note: starts with Latin "ā" in addition to the Arabic-script letters
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
local diacritics = "َُِّْٰ"
-- three combining marks; presumably zabar, zer and pesh (hence the name) -- TODO confirm
local ZZP = "َُِ"
-- lrm and rlm are stripped from the text near the end of export.tr
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark

-- Base consonant inventory from which the right-side and left-side sets below
-- are derived.
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
-- (the base set plus the three extra letters appended here)
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- pattern fragment: whitespace class (%s) plus the single and double quote characters
local space_like = "%s'" .. '"'
-- Interpolated into one of the has_diacritics_subs patterns.
-- NOTE(review): this is an empty string in this copy, so that pattern gets an
-- empty capture; presumably it should wrap space_like in a character class --
-- confirm against the canonical module page.
local space_like_class = ""

-- Lookup table from a single source character to its Latin transliteration.
-- export.tr applies it with gsub(text, '.', mapping), i.e. one lookup per
-- character.
-- NOTE(review): every entry below has lost its key in this copy (only
-- " = value" remains), so the table is not valid Lua as shown; restore the
-- keys from the canonical module page before using this text.
-- not all letters here are used by urdu
local mapping = {
	 = 'ā',  = 'b',  = 'ḇ',  = 'p',  = 't',  = 'ṭ',  = 's̱',
	 = 'j',  = 'ǰ',  = 'c',  = 'ḥ',  = 'x', 
	 = 'd',  = 'ḍ',  = 'ḏ',  = 'ẕ',  = 'r',  = "ṛ",  = 'z',  = 'ž',
	 = 's',  = 'ś',  = 'ṣ',  = 'ẓ', 
	 = 't̤',  = 'z̤',  = 'ʻ',  = 'ġ',  = 'f',  = 'q',
	 = 'k',  = 'g',  = 'g̈',  = 'ṇ',  = 'ḷ',
	 = 'l',  = 'm',  = 'n',  = 'v',  = 'h',  = 'y',  = ".",  = 'ṉ',

	 = "h",
	 = '',
	
	
	-- diacritics
	 = 'ṉ',
	 = "a",
	 = "i",
	 = "u",
	 = "", -- also sukun - no vowel
	 = "-", -- ZWNJ (zero-width non-joiner)
	
	-- ligatures
	 = "lā",
	 = "allāh",
	
	-- kashida
	 = "-", -- kashida, no sound
	
	-- numerals
	 = "1",  = "2",  = "3",  = "4",  = "5",
	 = "6",  = "7",  = "8",  = "9",  = "0",
	
	-- punctuation (leave on separate lines)
	 = "?", -- question mark
	 = ".", -- period
	 = ",", -- comma
	 = ";", -- semicolon
	 = '“', -- quotation mark
	 = '”', -- quotation mark
	 = "%", -- percent
	 = "‰", -- per mille
	 = ".", -- decimals
	 = ",", -- thousand
	 = "-ye", 
	 = "-yi",
}

-- Punctuation and digit inventories. The punctuation string already carries
-- Lua-pattern escapes (%-, %(, %), %*), so it is written to be placed inside a
-- pattern rather than compared literally.
-- NOTE(review): no pattern visible in this copy refers to `punctuation` or
-- `numbers` by name -- confirm their use against the canonical module page.
local punctuation = "%-:%(%)%*&٫؛؟،ـ«\".\'!»٪؉۔"
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in has_diacritics_subs and export.tr.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- Latin long vowels plus alif madda.
local aiu = "āīūآ"
local n_exceptions = "" -- for nasalization exceptions

-- Ordered list of {pattern, replacement} pairs. has_diacritics applies them
-- one after another with gsub, each rule running on the output of the previous
-- one, so the order of the entries matters.
-- NOTE(review): many patterns below are an empty string or contain an empty
-- `()` capture in this copy; they look like character classes whose contents
-- were lost -- confirm against the canonical module page before relying on
-- any individual rule.
local has_diacritics_subs = {
	-- remove arabic ye (ruins conversions)
	-- NOTE(review): the rules directly below match "لل" followed by he, not ye;
	-- the comment above may be stale -- confirm.
	{"لل" ..  he , ""},
	{"لل" .. tashdid ..  he , ""},
	{"لل" .. tashdid .. dagger_alif ..  he , ""},
	{"ۃ" , ""},
	-- aspirated consonants should count as 1 consonant not two
	{"()" ..  aspirate , "%1"},
	{"()" ..  aspirate , "%1"},
	{ aspirate , ""},
	-- remove punctuation and tashdid
	{"", ""},
	-- noon gunna and silent consonants can be removed
	{ "..  .. ()" .. "()" .. "()"  , ""},
	{ "()" .. ghunna , ""},
	{ "()" .. jazm , ""},
	{ "()" .. "یٰ" , ""},
	-- must go before removing final consonants
	{"" .. alif , alif },
	{fatHataan , "" },
	{ "()" .. "" .. "()", "" },
	{ "()", "" },
	{ "()" .. dagger_alif, alif},
	{ dagger_alif .. ye , alif},
	{ alif .. "" , ""},
	{ "" .. alif , alif},
	{ dagger_alif .. "()", alif},
	-- Remove consonants at end of word or utterance, so that we're OK with
	-- words lacking iʿrāb (must go before removing other consonants).
	-- If you want to catch places without iʿrāb, comment out the next two lines.
	{"$", ""},
	-- closed consonants
	{"()", ""},
	-- remove consonants (or alif) when followed by diacritics
	-- must go after removing tashdid
	-- do not remove the diacritics yet because we need them to handle
	-- long-vowel sequences of diacritic + pseudo-consonant
	{"()", "%1"},
	-- the following two must go after removing consonants w/diacritics because
	-- (NOTE(review): this sentence is cut off here and appears to continue in
	-- the "we only want to treat vocalic wāw/yā'" comment further down)
	{"()()()", ""},
	{"()", ""},
	{"()", ""},
	{"()", ""},
	{"()(" .. space_like_class .. ")", ""},
	{"" .. zabar .. "", ""},
	-- we only want to treat vocalic wāw/yā' in them (we want to have removed
	-- remove vaw
	{ "" .. vao, ""},
	{"ؤ" .. pesh , ""},
	{"ؤ", ""},
	-- remove ye
	{ "" .. ye, ""},
	{ye3, ""},
	{"()" .. he,""},
	-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
	{"", ""},
	-- remove diacritics and independent vowels
	{"", ""},
	{ "" , ""},
	{ "" .. "" , ""},
	-- remove numbers, hamzatu l-waṣl, alif madda
	{"", ""},
	-- remove any remaining whitespace
	{"%s", ""},
}

-- Reports whether `text` is fully consumed by the rules in
-- has_diacritics_subs: every {pattern, replacement} pair is applied in order,
-- and the result is true only if nothing is left afterwards.
-- Before the rules run, one preliminary gsub is performed; if it replaces
-- anything, the "ur-translit/lrm or rlm" tracking key is recorded.
-- NOTE(review): that preliminary pattern is an empty string in this copy;
-- given the tracking key it presumably matched LRM/RLM marks -- confirm.
local function has_diacritics(text)
	local remaining, hits = gsub(text, "", "")
	if hits > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for index = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[index]
		remaining = gsub(remaining, rule[1], rule[2])
	end
	return remaining == ""
end

-- Transliterates `text` into Latin script.
--
-- The function is one long pipeline of substitutions, each applied to the
-- output of the previous one, so the order of the statements matters:
--   1. literal "#" is escaped as "HASHTAG", then "#" markers are inserted
--      around spaces, " | ", newlines and ZWNJ, and "##" is added at both ends;
--   2. whole-word exceptions and character normalisations are applied;
--   3. tashdid, alif, vao, ye, final he and izafat are rewritten;
--   4. the "#" markers are dropped, "HASHTAG" is turned back into "#", and
--      LRM/RLM are stripped;
--   5. every remaining character is looked up in `mapping`;
--   6. final vowel/consonant clean-ups run and the result is NFC-normalised.
--
-- NOTE(review): several patterns below contain an empty `()` capture in this
-- copy; they look like character classes whose contents were lost -- confirm
-- against the canonical module page.
--
-- Parameters:
--   text  the string to transliterate
--   lang  not referenced in the function body
--   sc    not referenced in the function body
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	
	--define the "end" of a word
	-- protect literal "#" first, because "#" is used as the boundary marker below
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "()" , "#".."%1" .. "#")
	-- only the first result of gsub (the string) takes part in the concatenation
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, zwnj, "#"..zwnj.."#")
	-- hastags now mark the beginning and end of a word
	
	--exceptions
	-- whole words (delimited by "#" on both sides) with a fixed transliteration
	text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
	text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
	text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
	text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
	text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
	text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
	text = gsub(text, "ن٘", "ṉ")
	
	--character reformatting
	--to make an exceptions for a word, put hashtags on both sides
	text = gsub(text, "ۂ", he .. highhmz)
	text = gsub(text, highhmz, "#"..highhmz.."#")
	--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
	text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
	text = gsub(text, 'ٰ', "ā")
	text = gsub(text, 'ا' .. fatHataan, "an")
	text = gsub(text, 'لا', "ﻻ")
	text = gsub(text, "ة" 	, "ۃ")
	text = gsub(text, "ۃ" .. "()", "ت%1")
	text = gsub(text, "ۃ" , he)
	
	-- Tashdeed
	text = gsub(text, '()' .. tashdid, "%1%1")
	text = gsub(text, '()' .. tashdid .. '()', "%1%1%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '()' .. '()' .. tashdid, "%1%1%2")
	text = gsub(text, '()' .. aspirate, aspirate.."%1") 
	-- NOTE(review): the pattern on the next line has no capture group, yet the
	-- replacement uses %1 -- confirm this is intended.
	text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
	text = gsub(text, ye .. '()' .. tashdid, "yy%1")
	text = gsub(text,  vao .. '()' .. tashdid, "vv%1")
	text = gsub(text, ye .. tashdid .. '()', "yy%1")
	text = gsub(text, vao .. tashdid .. '()', "vv%1")
	

    --initial alif
    text = gsub(text, "()" .. alif, "%1ā") 
    --alifs paired to a consonant are a vowel
    text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
    text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
    text = gsub(text, "()" .. "آ", "%1'ā") 
    	text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
    text = gsub(text, zabar .. alif, "ā")
    text = gsub(text, "()" .. alif, "%1")
    text = gsub(text, "()" .. alif, "%1")
    --alifs not paired to a consonant are a glottal stop (not shown currently)
    text = gsub(text, alif.."()".. "()", "%1%2")
    text = gsub(text, alif..ye.."#", "ī")
    text = gsub(text, alif..ye, "e")
    text = gsub(text, alif..ye3, "e")
    text = gsub(text, alif..zabar..ye3, "ai")
    text = gsub(text, alif..vao, "o")
    text = gsub(text, alif..zer..ye, "ī")
    text = gsub(text, alif..pesh..vao, "ū")
    text = gsub(text, alif.."()", "%1")
    
    
    -- convert semi vowels
    text = gsub(text, vao.. "()", "v%1")
    text = gsub(text, ye.. "()", "y%1")
    text = gsub(text, ye .. "ā", "yā")
    text = gsub(text, vao.. "ā", "vā")
    text = gsub(text, ye .. "(?)" .. ye3, "y%1"..ye3.."")
    text = gsub(text, vao .. "(?)" .. ye3, "v%1"..ye3.."")
    text = gsub(text, ye .. "()()", "e%1%2")
    text = gsub(text, vao .. "()()", "o%1%2")
    text = gsub(text, ye .. "()", "y%1")
    text = gsub(text, vao .. "()", "v%1")
    
    -- conversions for vaav/vaw/vao
    text = gsub(text, pesh.. vao, "ū")
    text = gsub(text, zabar .. vao, "au")
    text = gsub(text, vao.. "()", "v%1")
    text = gsub(text, "()" .. vao, "%1v")
    -- conversions for ye
    text = gsub(text, zer.. ye, "ī")
    text = gsub(text, ye .. "#", "ī#")
    text = gsub(text, zabar.. ye, "ai")
    text = gsub(text, zabar.. ye3, "ai")
    text = gsub(text, ye .. "()", "y%1")
    text = gsub(text, "()" .. ye , "%1y")
    
    -- final he and izafa/ezafe
    text = gsub(text, "e" .. zer .. "#", "e-yi#")
    text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
    text = gsub(text, "y" .. zer .. "#", "-yi#")
    text = gsub(text, zer .. "#", "-i#")
    text = gsub(text, "()" .. he .. "#" .. zwnj, "%1-")
    text = gsub(text, "()" .. he .. "#", "%1#")
    text = gsub(text, zabar .. he .. "#", "a#")
    
    -- get rid of hashtags (not needed)
    text = gsub(text, "#", "")
    -- restore the literal "#" characters that were escaped at the top
    text = gsub(text, "HASHTAG", "#")
    -- these two calls use the byte-based string.gsub, not the ustring alias
    text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
    -- convert all characters
    -- (table lookup per character; a character with no entry is left unchanged)
    text = gsub(text, '.', mapping)
    
    -- vowel fixes
	
	-- alif
	-- Final corrections
	text = gsub(text, "hh", "h")
	text = gsub(text, "lll", "ll")
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")
	
	--now get rid of the zero consonants
	text = gsub(text, "ئ", "")
	text = gsub(text, "u" .. "ؤ" , "u")
	text = gsub(text, "ؤ" .. "u" .. "$", "ū")  -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
	text = gsub(text, "ؤ" .. "u" .. "()", "ū%1")
	text = gsub(text, "ؤ" .. "u" , "u")
	text = gsub(text, "ؤ", "o")
	
	-- normalise to Unicode NFC before returning
	text = mw.ustring.toNFC(text)
	
	return text
end

return export
Module:pa-Arab-translit/sandbox

Issues

Wikious

Boobota

Sagapedia