This module will transliterate Urdu language text per WT:UR TR. It is also used to transliterate Brahui and Gojri.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:ur-translit/testcases.
tr(text, lang, sc): Transliterates the given text, written in the script specified by the code sc and the language specified by the code lang. Returns nil when the text cannot be transliterated.
THIS REQUIRES DIACRITICS (USED CORRECTLY). Diacritics can be found at http://udb.gov.pk (which is NOT always correct). This should work correctly in the majority of cases, although it is still in progress.
Read #Usage notes for tips on how to use the module correctly:
Test Urdu:
تَرْکِ تَعَلُقّات پِہ رویا نَہ تُو نَہ مَیں لیکِن یِہ کْیا کَہ چَین سے سویا نَہ تُو نَہ مَیں
وُہ ہَمْسَفَر تھا مَگَر اُس سے ہَمْنَوَائی نَہ تھی کَہ دُھوپ چھاؤں کا عالَم رَہا جُدائی نَہ تھی
عَداوَتیں تِھیں، تَغافُل تھا رَنْجِشیں تِھیں مَگَر بِچَھڑْنے والے میں سَب کُچھ تھا بے وَفائی نَہ تھی
کاجَل ڈالو کُرْکُرا سُرْمَہ سَہا نَہ جائے جِن نَین میں پِی بَسے دُوجا کَون سَمائے؟
بِچَھڑْتے وَقْت اُن آن٘کھوں میں
تھی ہَماری غَزَل
غَزَل تھی وُہ جو
کِسی کو کَبھی سُنائی نَہ تھی
Result:
tark-i ta'aluqqāt pe royā na tū na ma͠i lekin ye kyā ka cain se soyā na tū na ma͠i
vo hamsafar thā magar us se hamnavāī na thī ka dhūp chāõ kā 'ālam rahā judāī na thī
'adāvatẽ thī̃, taġāful thā ranjiśẽ thī̃ magar bichaṛne vāle mẽ sab kuch thā be vafāī na thī
kājal ḍālo kurkurā surma sahā na jāe jin nain mẽ pī base dūjā kaun samāe?
bichaṛte vaqt un ā̃khõ mẽ thī hamārī ġazal ġazal thī vo jo kisī ko kabhī sunāī na thī
Expected:
tark-i ta'aluqqāt pe royā na tū na ma͠i lekin ye kyā ka cain se soyā na tū na ma͠i
vo hamsafar thā magar us se hamnavāī na thī ka dhūp chāõ kā 'ālam rahā judāī na thī
'adāvatẽ thī̃, taġāful thā rañjiśẽ thī̃ magar bichaṛne vāle mẽ sab kuch thā be vafāī na thī
kājal ḍālo kurkurā surma sahā na jāe jin nain mẽ pī base dūjā kaun samāe?
bichaṛte vaqt un ā̃khõ mẽ thī hamārī ġazal ġazal thī vo jo kisī ko kabhī sunāī na thī
3 of 69 tests failed. (refresh)
Text | Expected | Actual | Differs at | |
---|---|---|---|---|
اِیرانِی | īrānī | īrānī | ||
ماشاءاَللّٰہ | māśā'allāh | māśā'allāh | ||
پَیدائِش | paidāiś | paidāiś | ||
بَرْقِیات | barqiyāt | barqiyāt | ||
عَقْل | 'aql | 'aql | ||
عِزَّت | 'izzat | 'izzat | ||
عَین | 'ain | 'ain | ||
عالَم | 'ālam | 'ālam | ||
عَورَت | 'aurat | 'aurat | ||
شُرُوع | śurū' | śurū' | ||
اِشْعاع | iś'ā' | iś'ā' | ||
تَعَلُّقات | ta'alluqāt | ta'alluqāt | ||
تَعَلُّق | ta'alluq | ta'alluq | ||
مُتَعَلِّق | muta'alliq | muta'alliq | ||
متعلق | (nil) | (nil) | N/A | |
عُمْر | 'umr | 'umr | ||
دَفْعَہ | daf'a | daf'a | ||
بَچَّہ | bacca | bacca | ||
قُوَّت | quvvat | quvvat | ||
مَۓ عِشْق | ma-ye 'iśq | ma-ye 'iśq | ||
شیرِ پَن٘جاب | śer-i pañjāb | śer-i pañjāb | ||
مَلْکَۂ دُنْیا | malka-yi dunyā | malka-yi dunyā | ||
جَمُّوں | jammū̃ | jammū̃ | ||
آم | ām | ām | ||
اِشْتِراکِیَّت | iśtirākiyyat | iśtirākiyyat | ||
سِسَکْنا | sisaknā | sisaknā | ||
پُل | pul | pul | ||
عِیسیٰ | 'īsā | 'īsā | ||
اَعْلیٰ | a'lā | a'lā | ||
لَفْظ | lafz | lafz | ||
حاضِر | hāzir | hāzir | ||
بَہورا | bahorā | bahorā | ||
نَہِیں | nahī̃ | nahī̃ | ||
اِشْتِمالِیَت | iśtimāliyat | iśtimāliyat | ||
چَوڑا | cauṛā | cauṛā | ||
تِھیں | thī̃ | thī̃ | ||
کُتّا | kuttā | kuttā | ||
پَہْلے | pahle | pahle | ||
کِھلائی | khilāī | khilāī | ||
کھلائی | (nil) | (nil) | N/A | |
ٹَھہَرْنا | ṭhaharnā | ṭhaharnā | ||
تَیمُور | taimūr | taimūr | ||
فَوراً | fauran | fauran | ||
کوئے | koe | koe | ||
مَنَّتوں | mannatõ | mannatõ | ||
گان٘وں | gā̃õ | gā̃õ | ||
مَیں | ma͠i | ma͠i | ||
آئی | āī | āī | ||
مَکَّھن | makkhan | makkhan | ||
خُدا | xudā | xudā | ||
کَئی | kaī | kaī | ||
کُئی | kuī | kuī | ||
چائے | cāe | cāe | ||
کُھلْواؤ | khulvāo | khulvāo | ||
غَدّار | ġaddār | ġaddār | ||
بَیٹھو | baiṭho | baiṭho | ||
بَطَّخ | battax | battax | ||
مُتَّحِدَۂ | muttahida-yi | muttahida-yi | ||
ساؤُتھ اَفْرِیقَہ | sāuth afrīqa | sāuth afrīqa | ||
کُلِّیَّہ | kulliyya | kulliyya | ||
دائِرَۃُ | dāiratu | dāiratu | ||
سُورَۃ | sūra | sūra | ||
بِلّا | billā | billā | ||
دائِرَۃُ الْمَعارِف | dāiratu l-ma'ārif | dāiratu اlma'ārif | 9 | |
دائِرَۃْ اُلْمَعارِف | dāirah ulma'ārif | dāirat ulma'ārif | 6 | |
آیَتُ اْللّٰہ | āyatu llāh | āyatu llāh | ||
صَیّاد | saiyād | saiyād | ||
گُرْدَہ | gurda | gurda | ||
کہاں | (nil) | khā̃ | N/A |
--[=[
FIXME:
1. support for Arabic al- (copy from fa-cls-translit)
]=]
local U = require("Module:string/char")
local gsub = mw.ustring.gsub
-- Table returned by this module; export.tr is attached to it further down.
local export = {}
-- Combining marks and control characters, built from their code points.
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- U+0651 ARABIC SHADDA
local jazm = "ْ"
local he = "ہ"
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF
-- Character sets kept as plain strings.
-- NOTE(review): none of consonants, consonantS, consonantS2, semivowel, vowels,
-- indvowels, hes, diacritics, ZZP, rconsonants or lconsonants is referenced by
-- the code visible in this file; presumably they were spliced into pattern
-- character classes that have since been lost -- confirm against upstream.
-- NOTE(review): `consonants` and `consonantS` differ only in letter case and
-- hold slightly different letter lists; easy to mix up.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ"
local semivowel = "یو"
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
local diacritics = "َُِّْٰ"
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- Members of the "space-like" set, written so they can sit inside a pattern
-- character class: %s (any whitespace), apostrophe and double quote.
local space_like = "%s'" .. '"'
-- Character class matching exactly one space-like character. It has to be
-- bracketed: with an empty string here, a pattern built as
-- "(" .. space_like_class .. ")" collapses into the position capture "()"
-- and no longer matches any character.
local space_like_class = "[" .. space_like .. "]"
-- Lookup table handed to gsub(text, '.', mapping) in export.tr for the final
-- character-by-character conversion into Latin script.
-- NOTE(review): every entry below has lost its `[key]` part (only `= value`
-- remains), so this constructor does not parse as Lua. The keys cannot be
-- recovered from this file; restore them from the upstream module.
-- not all letters here are used by urdu
local mapping = {
= 'ā', = 'b', = 'p', = 't', = 'ṭ', = 's',
= 'j', = 'c', = 'h', = 'x',
= 'd', = 'ḍ', = 'z', = 'r', = "ṛ", = 'z', = 'ź',
= 's', = 'ś', = 's', = 'z',
= 't', = 'z', = 'ġ', = 'f', = 'q',
= 'k', = 'g', = 'ṇ', = 'ḷ', = 'ł',
= 'l', = 'm', = 'n', = 'o', = 'h', = 'e', = 'e', = ".", = '̃',
= "h",
= '\'',
= '\'',
= '',
-- diacritics
= "a",
= "i",
= "u",
= "", -- also sukun - no vowel
= "-", -- ZWNJ (zero-width non-joiner)
-- ligatures
= "lā",
= "allāh",
-- kashida
= "-", -- kashida, no sound
-- numerals
= "1", = "2", = "3", = "4", = "5",
= "6", = "7", = "8", = "9", = "0",
-- punctuation (leave on separate lines)
= "?", -- question mark
= ".", -- period
= ",", -- comma
= ";", -- semicolon
= '“', -- quotation mark
= '”', -- quotation mark
= "%", -- percent
= "‰", -- per mille
= ".", -- decimals
= ",", -- thousand
= "-ye",
= "-yi",
}
-- Punctuation characters (ASCII and Arabic-script) with pattern-magic
-- characters %-escaped.
-- NOTE(review): presumably meant to be spliced into a [...] character class;
-- not referenced by the code visible in this file -- confirm.
local punctuation = "%-:%(%)%*&٫؛؟،ـ«\".\'!»٪؉۔"
-- Extended Arabic-Indic digits, in the order 1-9 then 0.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referred to by name in the substitution rules.
local ain = 'ع'
local alif = 'ا'
local ye = 'ی'
local ye2 = 'ئ'
local ye3 = "ے"
local vao = "و"
local aspirate = 'ھ'
-- highhmz is not redeclared here: it is already defined with the other
-- combining marks near the top of the file with the same value, U(0x654).
local aiu = "āīūآ"
-- NOTE(review): empty, so "(" .. n_exceptions .. ")" in export.tr is a position
-- capture rather than a character match; the intended contents are unknown.
local n_exceptions = "" -- for nasalization exceptions
-- Ordered list of {pattern, replacement} pairs that export.tr applies to the
-- input before it calls has_diacritics().
-- NOTE(review): the leading "()" in the last two patterns is an empty position
-- capture; presumably a bracketed character class was lost there -- confirm
-- against the upstream module.
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
-- U+06E5 ARABIC SMALL WAW becomes a regular vao
{U(0x06E5), "و"},
-- U+06E6 ARABIC SMALL YEH becomes a regular ye
{U(0x06E6), "ی"},
-- ignore dagger alif placed over regular alif or alif maqṣūra
{"()" .. dagger_alif, alif},
{"()" .. fatHataan, alif .. fatHataan},
}
-- Ordered list of {pattern, replacement} pairs used by has_diacritics(). The
-- pairs are applied one after another; the text counts as fully vocalised
-- only if nothing is left once all of them have run, so the order matters.
-- NOTE(review): many patterns below are empty strings or contain only empty
-- "()" position captures. They look like they have lost bracketed character
-- classes; the originals cannot be recovered from this file -- confirm against
-- the upstream module before relying on this table.
local has_diacritics_subs = {
-- remove arabic ye (ruins conversions)
-- (the four entries below drop lam-lam(-tashdid)(-dagger alif)-he sequences and ۃ)
{"لل" .. he , ""},
{"لل" .. tashdid .. he , ""},
{"لل" .. tashdid .. dagger_alif .. he , ""},
{"ۃ" , ""},
-- aspirated consonants should count as 1 consonant not two
{"()" .. aspirate , "%1"},
{"()" .. aspirate , "%1"},
{ aspirate , ""},
-- remove punctuation and tashdid
{"", ""},
-- noon ghunna and silent consonants can be removed
{ ".. .. ()" .. "()" .. "()" , ""},
{ "()" .. ghunna , ""},
{ "()" .. jazm , ""},
{ "()" .. "یٰ" , ""},
-- must go before removing final consonants
{"" .. alif , alif },
{fatHataan , "" },
{ "()" .. "" .. "()", "" },
{ "()", "" },
{ "()" .. dagger_alif, alif},
{ dagger_alif .. ye , alif},
{ alif .. "" , ""},
{ "" .. alif , alif},
{ dagger_alif .. "()", alif},
-- Remove consonants at end of word or utterance, so that we're OK with
-- words lacking iʿrāb (must go before removing other consonants).
-- If you want to catch places without iʿrāb, comment out the next line.
{"$", ""},
-- closed consonants
{"()", ""},
-- remove consonants (or alif) when followed by diacritics
-- must go after removing tashdid
-- do not remove the diacritics yet because we need them to handle
-- long-vowel sequences of diacritic + pseudo-consonant
{"()", "%1"},
-- the following entries must go after removing consonants w/diacritics
-- (NOTE(review): the rest of this explanation is cut off in the file)
{"()()()", ""},
{"()", ""},
{"()", ""},
{"()", ""},
{"()(" .. space_like_class .. ")", ""},
{"" .. zabar .. "", ""},
-- we only want to treat vocalic wāw/yā' in them
-- (NOTE(review): sentence fragment, apparently displaced from the comment above)
-- remove vao
{ "" .. vao, ""},
{"ؤ" .. pesh , ""},
{"ؤ", ""},
-- remove ye
{ "" .. ye, ""},
{ye3, ""},
{"()" .. he,""},
-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
{"", ""},
-- remove diacritics and independent vowels
{"", ""},
{ "" , ""},
{ "" .. "" , ""},
-- remove numbers, hamzatu l-waṣl, alif madda
{"", ""},
-- remove any remaining whitespace
{"%s", ""},
}
--- Report whether `text` is vocalised fully enough to be transliterated.
-- Strips left-to-right / right-to-left marks, then applies every
-- {pattern, replacement} pair of has_diacritics_subs in order. export.tr
-- treats a false result as "lacking diacritics".
-- @param text string to check
-- @return true when nothing is left after all substitutions, false otherwise
local function has_diacritics(text)
	local count
	-- The pattern has to be a character class. An empty pattern matches at
	-- every position: it strips nothing, yet reports a positive count for any
	-- input, which would make the tracking call below fire on every call.
	text, count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if count > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for _, sub in ipairs(has_diacritics_subs) do
		-- each entry is a {pattern, replacement} pair
		text = gsub(text, sub[1], sub[2])
	end
	return #text == 0
end
--- Transliterate Urdu-script text into Latin script.
-- Can be called as tr(text, lang, sc), or with a single frame-like table whose
-- `args` field holds the positional arguments (empty strings count as absent).
-- @param text  string to transliterate, or a table with an `args` field
-- @param lang  language code (only stored; not used by the body)
-- @param sc    script code (only stored; not used by the body)
-- @return the transliteration, or nil when there is no text, or when the text
--         fails has_diacritics() and transliteration was not forced
--
-- NOTE(review): many patterns below contain an empty "()" (a position capture
-- in Lua patterns) or "(" .. n_exceptions .. ")" with an empty n_exceptions.
-- They look like they have lost a bracketed character class. They are left
-- untouched because the intended classes cannot be recovered from this file;
-- confirm against the upstream module.
function export.tr(text, lang, sc)
	-- Kept local on purpose: as a global, a value set by one template call
	-- would still be visible to every later call in the same Lua state.
	local force_translit
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		local args = text.args
		-- Each target gets its own positional argument. The fourth argument
		-- used to be read into an unused `omit_i3raab` and is ignored.
		-- NOTE(review): positions 1, 2, 3, 5 assumed from the order of the
		-- assignment targets -- confirm against the calling template.
		text, lang, sc, force_translit = f(args[1]), f(args[2]), f(args[3]), f(args[5])
	end
	-- Nothing to transliterate (missing or empty argument).
	if text == nil then
		return nil
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		-- each entry is a {pattern, replacement} pair
		text = gsub(text, sub[1], sub[2])
	end
	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ur-translit/lacking diacritics")
		return nil
	end
	--define the "end" of a word
	-- literal "#" is protected first because "#" is used as the word-boundary marker
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "()" , "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, zwnj, "#"..zwnj.."#")
	-- hashtags now mark the beginning and end of a word
	--exceptions
	text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
	text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
	text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
	text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
	text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
	text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")
	--character reformatting
	--to make an exception for a word, put hashtags on both sides
	text = gsub(text, "ۂ", he .. highhmz)
	text = gsub(text, highhmz, "#"..highhmz.."#")
	--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
	text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
	text = gsub(text, 'ٰ', "ā")
	text = gsub(text, 'ا' .. fatHataan, "an")
	text = gsub(text, 'لا', "ﻻ")
	text = gsub(text, "ة" , "ۃ")
	text = gsub(text, "ۃ" .. "()", "ت%1")
	text = gsub(text, "ۃ" , he)
	-- Tashdeed
	text = gsub(text, '()' .. tashdid, "%1%1")
	text = gsub(text, '()' .. tashdid .. '()', "%1%1%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '()' .. '()' .. tashdid, "%1%1%2")
	text = gsub(text, '()' .. aspirate, aspirate.."%1")
	text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
	text = gsub(text, ye .. '()' .. tashdid, "yy%1")
	text = gsub(text, vao .. '()' .. tashdid, "vv%1")
	text = gsub(text, ye .. tashdid .. '()', "yy%1")
	text = gsub(text, vao .. tashdid .. '()', "vv%1")
	--initial alif
	text = gsub(text, pesh .. vao .. alif, "uā")
	text = gsub(text, "()" .. alif, "%1ā")
	--alifs paired to a consonant are a vowel
	text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
	text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
	text = gsub(text, "()" .. "آ", "%1'ā")
	text = gsub(text, pesh .. vao .. zabar .. alif , "ūā" )
	text = gsub(text, zabar .. alif, "ā")
	text = gsub(text, "(" .. ghunna .. ")" .. alif, "%1ā")
	text = gsub(text, "()" .. alif, "%1")
	text = gsub(text, "()" .. alif, "%1")
	--alifs not paired to a consonant are a glottal stop (not shown currently)
	text = gsub(text, alif.."()".. "()", "%1%2")
	text = gsub(text, alif..ye.."#", "ī")
	text = gsub(text, alif..ye, "e")
	text = gsub(text, alif..ye3, "e")
	text = gsub(text, alif..zabar..ye3, "ai")
	text = gsub(text, alif..vao, "o")
	text = gsub(text, alif..zer..ye, "ī")
	text = gsub(text, alif..pesh..vao, "ū")
	text = gsub(text, alif.."()", "%1")
	-- convert semi vowels
	text = gsub(text, vao.. "()", "v%1")
	text = gsub(text, ye.. "()", "y%1")
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao.. "ā", "vā")
	text = gsub(text, ye .. "(?)" .. ye3, "y%1"..ye3.."")
	text = gsub(text, vao .. "(?)" .. ye3, "v%1"..ye3.."")
	text = gsub(text, ye .. "()()", "e%1%2")
	text = gsub(text, vao .. "()()", "o%1%2")
	text = gsub(text, ye .. "()", "y%1")
	text = gsub(text, vao .. "()", "v%1")
	-- conversions for vaav/vaw/vao
	text = gsub(text, pesh.. vao, "ū")
	text = gsub(text, zabar .. vao, "au")
	text = gsub(text, vao.. "()", "v%1")
	text = gsub(text, "()" .. vao, "%1v")
	-- conversions for ye
	text = gsub(text, zer.. ye, "ī")
	text = gsub(text, ye .. "#", "ī#")
	text = gsub(text, zabar.. ye, "ai")
	text = gsub(text, zabar.. ye3, "ai")
	text = gsub(text, ye .. "()", "y%1")
	text = gsub(text, "()" .. ye , "%1y")
	-- final he and izafa/ezafe
	text = gsub(text, "e" .. zer .. "#", "e-yi#")
	text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
	text = gsub(text, "y" .. zer .. "#", "-yi#")
	text = gsub(text, zer .. "#", "-i#")
	text = gsub(text, "()" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "()" .. he .. "#", "%1#")
	text = gsub(text, zabar .. he .. "#", "a#")
	-- noon ghunna assimilation/nasalization
	--remove impossible nasal vowels
	text = gsub(text, "ن" .. ghunna .. "()", "m%1") -- nasal vowels are impossible before b
	text = gsub(text, "ن" .. ghunna .. "ت" .. aspirate, "nth")
	text = gsub(text, "ن" .. ghunna .. "()", "ṅ%1") -- impossible before q and g
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ٹ" .. aspirate , "%1ṇṭh")
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "پ" .. aspirate, "%1mph")
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ک" .. aspirate, "%1ṅkh")
	text = gsub(text, "ن" .. ghunna .. "()", "ñ%1") -- impossible before j
	text = gsub(text, "ن".. ghunna .. "ڈ" .. aspirate, "ṇḍh") -- aspirated d/D cant be nasalized
	text = gsub(text, "ن".. ghunna .. "د" .. aspirate, "ndh") -- aspirated d/D cant be nasalized
	--other nasals
	text = gsub(text, "ن" .. jazm .. "()" .. "#", "ṅ%1#")
	text = gsub(text, "ن" .. ghunna .. "()" .. jazm .. "#", "ṅ%1#")
	text = gsub(text, "ن" .. jazm .. "()", "n%1") -- dental
	text = gsub(text, "ن" .. ghunna .. "()" .. jazm .. "#", "ṇ%1#")
	text = gsub(text, "ن" .. ghunna .. "()" .. jazm .. "#", "ñ%1#") -- postalveolar
	text = gsub(text, "ن" .. ghunna .. "(".. aspirate ..")" .. jazm .. "#", "ñ%1#")
	-- if noon ghunna cannot assimilate, it becomes a nasal vowel.
	text = gsub(text, "ن" .. ghunna, "ں")
	text = gsub(text, "ؤ" .. pesh .. "ں" .. "#", ye2 .. "ū" .. "ں" .. "#")
	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	-- restore the literal "#" characters protected at the start
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	text = gsub(text, '.', mapping)
	-- vowel fixes
	-- nasalized diphthongs
	text = gsub(text, 'a()̃', 'a͠%1')
	-- alif
	-- Final corrections
	text = gsub(text, "lll", "ll")
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")
	--now get rid of the zero consonants
	text = gsub(text, "ئ", "")
	text = gsub(text, "u" .. "ؤ" , "u")
	text = gsub(text, "ؤ" .. "u" .. "$", "ū") -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
	text = gsub(text, "ؤ" .. "u" .. "()", "ū%1")
	text = gsub(text, "ؤ" .. "u" , "u")
	text = gsub(text, "ؤ", "o")
	text = mw.ustring.toNFC(text)
	return text
end
return export