local U = require("Module:string/char")
local gsub = mw.ustring.gsub
-- Table of functions returned by this module.
local export = {}
-- Combining marks, built from their Unicode code points with `U`
-- (the value returned by require("Module:string/char")).
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- U+200C zero-width non-joiner. Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- U+0651 ARABIC SHADDA; also called shadda
local jazm = "ْ"
local he = "ہ"
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF
-- Letter sets kept as plain strings.
-- NOTE(review): presumably these are meant to be spliced into "[...]"
-- character classes; no such use is visible in this file as shown - confirm.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ"
local semivowel = "یو"
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
local diacritics = "َُِّْٰ"
-- short-vowel marks (the name abbreviates zabar, zer, pesh)
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- Characters treated like a space: Lua's whitespace class (%s), the
-- apostrophe and the double quote. Written without brackets so it can be
-- embedded in a larger character class.
local space_like = "%s'" .. '"'
-- Pattern character class matching exactly one space-like character.
-- It was previously an empty string, which turned "(" .. space_like_class .. ")"
-- into the position capture "()" instead of a match on a space-like character.
local space_like_class = "[" .. space_like .. "]"
-- Per-character transliteration table. `export.tr` passes it as the
-- replacement argument of gsub(text, '.', mapping), so each character still
-- present at that point is looked up here.
-- not all letters here are used by urdu
-- NOTE(review): every entry below begins with `=` and has no key, which is not
-- a valid Lua table constructor. Presumably the `["<character>"]` keys were
-- lost when this text was extracted - restore them from the upstream module
-- before relying on this table.
local mapping = {
= 'ā', = 'b', = 'ḇ', = 'p', = 't', = 'ṭ', = 's̱',
= 'j', = 'ǰ', = 'c', = 'ḥ', = 'x',
= 'd', = 'ḍ', = 'ḏ', = 'ẕ', = 'r', = "ṛ", = 'z', = 'ž',
= 's', = 'ś', = 'ṣ', = 'ẓ',
= 't̤', = 'z̤', = 'ʻ', = 'ġ', = 'f', = 'q',
= 'k', = 'g', = 'g̈', = 'ṇ', = 'ḷ',
= 'l', = 'm', = 'n', = 'v', = 'h', = 'y', = ".", = 'ṉ',
= "h",
= '',
-- diacritics
= 'ṉ',
= "a",
= "i",
= "u",
= "", -- also sukun - no vowel
= "-", -- ZWNJ (zero-width non-joiner)
-- ligatures
= "lā",
= "allāh",
-- kashida
= "-", -- kashida, no sound
-- numerals
= "1", = "2", = "3", = "4", = "5",
= "6", = "7", = "8", = "9", = "0",
-- punctuation (leave on separate lines)
= "?", -- question mark
= ".", -- period
= ",", -- comma
= ";", -- semicolon
= '“', -- quotation mark
= '”', -- quotation mark
= "%", -- percent
= "‰", -- per mille
= ".", -- decimals
= ",", -- thousand
-- NOTE(review): the keys for the two entries below are not visible; the
-- values look like izafat-style suffixes - confirm against upstream.
= "-ye",
= "-yi",
}
-- Punctuation characters, written with pattern escapes (%-, %(, %), %*) so the
-- string can be embedded in a character class.
local punctuation = "%-:%(%)%*&٫؛؟،ـ«\".\'!»٪؉۔"
-- Extended Arabic-Indic digits.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referred to by name in the substitution rules.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- `highhmz` is not redeclared here: it is already defined with the same value
-- (U(0x654)) among the diacritic constants at the top of the file, and a
-- second `local` only shadowed it.
local aiu = "āīūآ"
local n_exceptions = "" -- for nasalization exceptions
-- Ordered list of {pattern, replacement} pairs. `has_diacritics` applies them
-- one after another with gsub; the order matters, as the comments below say.
-- NOTE(review): many patterns here are an empty string or a bare "()" (a Lua
-- position capture). Presumably "[...]" character classes were lost when this
-- text was extracted - compare against the upstream module before editing.
local has_diacritics_subs = {
-- remove arabic ye (ruins conversions)
{"لل" .. he , ""},
{"لل" .. tashdid .. he , ""},
{"لل" .. tashdid .. dagger_alif .. he , ""},
{"ۃ" , ""},
-- aspirated consonants should count as one consonant, not two
{"()" .. aspirate , "%1"},
{"()" .. aspirate , "%1"},
{ aspirate , ""},
-- remove punctuation and tashdid
{"", ""},
-- noon ghunna and silent consonants can be removed
{ ".. .. ()" .. "()" .. "()" , ""},
{ "()" .. ghunna , ""},
{ "()" .. jazm , ""},
{ "()" .. "یٰ" , ""},
-- must go before removing final consonants
{"" .. alif , alif },
{fatHataan , "" },
{ "()" .. "" .. "()", "" },
{ "()", "" },
{ "()" .. dagger_alif, alif},
{ dagger_alif .. ye , alif},
{ alif .. "" , ""},
{ "" .. alif , alif},
{ dagger_alif .. "()", alif},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next entry.
{"$", ""},
-- closed consonants
{"()", ""},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing tashdid
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"()", "%1"},
-- the following entries must go after removing consonants w/diacritics,
-- because we only want to treat vocalic wāw/yā' in them
-- (the rest of this original comment is truncated in the source)
{"()()()", ""},
{"()", ""},
{"()", ""},
{"()", ""},
{"()(" .. space_like_class .. ")", ""},
{"" .. zabar .. "", ""},
-- remove vao
{ "" .. vao, ""},
{"ؤ" .. pesh , ""},
{"ؤ", ""},
-- remove ye
{ "" .. ye, ""},
{ye3, ""},
{"()" .. he,""},
-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
{"", ""},
-- remove diacritics and independent vowels
{"", ""},
{ "" , ""},
{ "" .. "" , ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"", ""},
-- remove any remaining whitespace
{"%s", ""},
}
--- Check whether the substitution list consumes all of `text`.
-- Directional marks (LRM/RLM) are stripped first; if any were present the
-- tracking key "ur-translit/lrm or rlm" is recorded. Each {pattern, replacement}
-- pair of `has_diacritics_subs` is then applied in order.
-- @param text string to examine
-- @return true when nothing is left after all substitutions, false otherwise
local function has_diacritics(text)
	local count
	-- Match only LRM/RLM. The pattern used to be the empty string, which matches
	-- at every position: it removed nothing and made `count` positive for every
	-- input, so the tracking call fired unconditionally.
	text, count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if count > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for _, sub in ipairs(has_diacritics_subs) do
		-- each entry is {pattern, replacement}
		text = gsub(text, unpack(sub))
	end
	return #text == 0
end
--- Transliterate `text` into Latin script.
-- The function marks word boundaries with "#", rewrites letter/diacritic
-- sequences through a fixed series of gsub passes (the order of the passes
-- matters), maps the remaining characters through `mapping`, applies final
-- clean-ups and returns the result NFC-normalized.
-- @param text string to transliterate
-- @param lang not read by this function
-- @param sc not read by this function
-- @return the transliterated string
-- NOTE(review): many patterns below contain a bare "()" or "(?)". In Lua
-- patterns "()" is a position capture, not a character match. Presumably
-- "[...]" character classes (built from the letter-set constants at the top of
-- the file) were lost when this text was extracted - compare against the
-- upstream module before changing any of these lines.
function export.tr(text, lang, sc)
--define the "end" of a word
-- protect literal "#" in the input, since "#" is used as the boundary marker
text = gsub(text, "#", "HASHTAG")
text = gsub(text, " | ", "# | #")
text = gsub(text, "\n" , "#".."\n" .. "#")
text = gsub(text, "()" , "#".."%1" .. "#")
text = "##" .. gsub(text, " ", "# #") .. "##"
text = gsub(text, zwnj, "#"..zwnj.."#")
-- hashtags now mark the beginning and end of a word
--exceptions
-- whole words (delimited by "#" on both sides) with a fixed reading
text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
text = gsub(text, "ن٘", "ṉ")
--character reformatting
--to make an exception for a word, put hashtags on both sides
text = gsub(text, "ۂ", he .. highhmz)
text = gsub(text, highhmz, "#"..highhmz.."#")
--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
text = gsub(text, 'ٰ', "ā")
text = gsub(text, 'ا' .. fatHataan, "an")
text = gsub(text, 'لا', "ﻻ")
text = gsub(text, "ة" , "ۃ")
text = gsub(text, "ۃ" .. "()", "ت%1")
text = gsub(text, "ۃ" , he)
-- Tashdeed
text = gsub(text, '()' .. tashdid, "%1%1")
text = gsub(text, '()' .. tashdid .. '()', "%1%1%2")
-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
text = gsub(text, '()' .. '()' .. tashdid, "%1%1%2")
text = gsub(text, '()' .. aspirate, aspirate.."%1")
text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
text = gsub(text, ye .. '()' .. tashdid, "yy%1")
text = gsub(text, vao .. '()' .. tashdid, "vv%1")
text = gsub(text, ye .. tashdid .. '()', "yy%1")
text = gsub(text, vao .. tashdid .. '()', "vv%1")
--initial alif
text = gsub(text, "()" .. alif, "%1ā")
--alifs paired to a consonant are a vowel
text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
text = gsub(text, "()" .. "آ", "%1'ā")
text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
text = gsub(text, zabar .. alif, "ā")
text = gsub(text, "()" .. alif, "%1")
text = gsub(text, "()" .. alif, "%1")
--alifs not paired to a consonant are a glottal stop (not shown currently)
text = gsub(text, alif.."()".. "()", "%1%2")
text = gsub(text, alif..ye.."#", "ī")
text = gsub(text, alif..ye, "e")
text = gsub(text, alif..ye3, "e")
text = gsub(text, alif..zabar..ye3, "ai")
text = gsub(text, alif..vao, "o")
text = gsub(text, alif..zer..ye, "ī")
text = gsub(text, alif..pesh..vao, "ū")
text = gsub(text, alif.."()", "%1")
-- convert semi vowels
text = gsub(text, vao.. "()", "v%1")
text = gsub(text, ye.. "()", "y%1")
text = gsub(text, ye .. "ā", "yā")
text = gsub(text, vao.. "ā", "vā")
text = gsub(text, ye .. "(?)" .. ye3, "y%1"..ye3.."")
text = gsub(text, vao .. "(?)" .. ye3, "v%1"..ye3.."")
text = gsub(text, ye .. "()()", "e%1%2")
text = gsub(text, vao .. "()()", "o%1%2")
text = gsub(text, ye .. "()", "y%1")
text = gsub(text, vao .. "()", "v%1")
-- conversions for vaav/vaw/vao
text = gsub(text, pesh.. vao, "ū")
text = gsub(text, zabar .. vao, "au")
text = gsub(text, vao.. "()", "v%1")
text = gsub(text, "()" .. vao, "%1v")
-- conversions for ye
text = gsub(text, zer.. ye, "ī")
text = gsub(text, ye .. "#", "ī#")
text = gsub(text, zabar.. ye, "ai")
text = gsub(text, zabar.. ye3, "ai")
text = gsub(text, ye .. "()", "y%1")
text = gsub(text, "()" .. ye , "%1y")
-- final he and izafa/ezafe
text = gsub(text, "e" .. zer .. "#", "e-yi#")
text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
text = gsub(text, "y" .. zer .. "#", "-yi#")
text = gsub(text, zer .. "#", "-i#")
text = gsub(text, "()" .. he .. "#" .. zwnj, "%1-")
text = gsub(text, "()" .. he .. "#", "%1#")
text = gsub(text, zabar .. he .. "#", "a#")
-- get rid of the boundary hashtags (no longer needed) and restore literal "#"
text = gsub(text, "#", "")
text = gsub(text, "HASHTAG", "#")
-- byte-level string.gsub is used here rather than mw.ustring.gsub
text = string.gsub(text, lrm, "")
text = string.gsub(text, rlm, "")
-- convert all characters
-- each character is looked up in `mapping`; characters without an entry are kept
text = gsub(text, '.', mapping)
-- vowel fixes
-- alif
-- Final corrections
-- collapse doubled letters and vowel sequences produced by the passes above
text = gsub(text, "hh", "h")
text = gsub(text, "lll", "ll")
text = gsub(text, "āa", "ā")
text = gsub(text, "aaa", "ā")
text = gsub(text, "āā", "ā")
text = gsub(text, "aa", "ā")
--now get rid of the zero consonants
text = gsub(text, "ئ", "")
text = gsub(text, "u" .. "ؤ" , "u")
text = gsub(text, "ؤ" .. "u" .. "$", "ū") -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
text = gsub(text, "ؤ" .. "u" .. "()", "ū%1")
text = gsub(text, "ؤ" .. "u" , "u")
text = gsub(text, "ؤ", "o")
-- normalize to NFC so base letters and combining marks are composed
text = mw.ustring.toNFC(text)
return text
end
return export