local export = {}
local m_str_utils = require("Module:string utilities")
local makeDisplayText -- defined below
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = m_str_utils.char
local ugsub = mw.ustring.gsub
local usub = m_str_utils.sub
-- Lua pattern character class matching one combining mark or invisible
-- joiner. Callers append "+" to match a whole run of them.
local DIACRITICS = "[" .. table.concat({
	u(0x0307), -- combining dot above
	u(0x0308), -- combining diaeresis
	u(0x034F), -- combining grapheme joiner
	u(0x200C), -- zero width non-joiner
	u(0x200D), -- zero width joiner
	u(0x0591) .. "-" .. u(0x05BD), -- range: Hebrew accents and points
	u(0x05BF), -- rafe
	u(0x05C1), -- shin dot
	u(0x05C2), -- sin dot
	u(0x05C4), -- upper dot
	u(0x05C5), -- lower dot
	u(0x05C7), -- qamats qatan
	(u(0xFB1E)), -- Judeo-Spanish varika (parenthesised to keep a single value)
}) .. "]"
-- Cantillation accents, declared as single/double pairs.
-- U+059C accent geresh, U+059E accent gershayim (double geresh).
local GERESH, GERSHAYIM = u(0x059C), u(0x059E)
-- U+05A5 accent merkha, U+05A6 accent merkha kefula (double merkha).
local MERCHA, MERCHA_KEFULA = u(0x05A5), u(0x05A6)
-- U+05A0 telisha gedola followed by U+05A9 telisha qetana, and the single
-- U+059F qarney para mark.
local TELISHA, KARNE_PARAH = u(0x05A0) .. u(0x05A9), u(0x059F)
-- Vowel points: U+05B0 sheva, U+05B9 holam, U+05BA holam haser for vav.
local SHEVA, HOLAM, HOLAM_HASER_FOR_VAV = u(0x05B0), u(0x05B9), u(0x05BA)
-- U+FB21 Hebrew letter wide alef.
local WIDE_ALEF = u(0xFB21)
-- Replacement table passed to gsub by makeDisplayText and makeEntryName:
-- each matched string is looked up here and replaced by the mapped value.
-- NOTE(review): the keys of every entry are missing in this copy of the file
-- (each line starts at "="), so the table is not valid Lua as shown. Restore
-- the keys from the upstream source before relying on this table.
local substitutes = {
= "׳", -- value: Hebrew punctuation geresh
= "״", -- value: Hebrew punctuation gershayim
= "־", -- value: Hebrew punctuation maqaf
= "׀", -- value: Hebrew punctuation paseq
= u(0x05B1), -- value: hataf segol
= u(0x05B2), -- value: hataf patah
= u(0x05B3), -- value: hataf qamats
= u(0x05B3), -- value: hataf qamats (second key mapping to the same point)
}
--- Normalise Hebrew-script text for display.
-- The text is decomposed to NFD, run through the substitution passes below,
-- and recomposed to NFC.
-- @param text string to normalise
-- @param lang not used by this function
-- @param sc not used by this function
-- @return the normalised text in NFC
function export.makeDisplayText(text, lang, sc)
	local shown = toNFD(text)
	-- NOTE(review): the pattern literals in the next two calls are empty in
	-- this copy of the file; presumably a character class was lost. They are
	-- kept exactly as found.
	shown = shown:gsub("", substitutes)
	shown = shown:gsub(SHEVA .. "", substitutes)
	-- Collapse doubled accents into their dedicated single code points.
	shown = shown:gsub(GERESH .. GERESH, GERSHAYIM)
	shown = shown:gsub(MERCHA .. MERCHA, MERCHA_KEFULA)
	shown = shown:gsub(TELISHA, KARNE_PARAH)
	-- Plain (non-pattern) search: nothing more to do without U+05BA.
	if not shown:find(HOLAM_HASER_FOR_VAV, nil, true) then
		return toNFC(shown)
	end
	-- Holam haser for vav (U+05BA) can only be placed on vav; otherwise,
	-- replace with holam (U+05B9). Each run of diacritics is inspected
	-- together with the character immediately before it.
	local fixed = ugsub(shown, "()(" .. DIACRITICS .. "+)", function(start, marks)
		local before = start - 1
		if usub(shown, before, before) == "ו" then
			-- Returning nothing leaves the matched run unchanged.
			return
		end
		-- Parentheses drop gsub's second result (the replacement count).
		return (marks:gsub(HOLAM_HASER_FOR_VAV, HOLAM))
	end)
	return toNFC(fixed)
end
-- Fill in the local that was forward-declared at the top of the file.
makeDisplayText = export.makeDisplayText
-- Set-like table (every value is true) consulted by makeEntryName to decide
-- whether the entry name keeps its diacritics.
-- NOTE(review): the keys of every entry are missing in this copy of the file,
-- so the table is not valid Lua as shown; presumably they are language
-- codes - confirm against the upstream source.
local retain_diacritics = {
= true,
= true,
= true,
= true,
}
--- Build the entry (page) name for a term.
-- For languages listed in retain_diacritics the result is the same as
-- makeDisplayText. For all others, runs of diacritics are stripped from the
-- NFD form before the substitution pass, and the result is returned in NFC.
-- @param text string to convert
-- @param lang language object; its code (lang:getCode()) selects the branch
-- @param sc passed through to makeDisplayText, otherwise unused
-- @return the entry name in NFC
function export.makeEntryName(text, lang, sc)
	-- Look the language up in the set. Testing the table itself is always
	-- truthy in Lua, which made the stripping path below unreachable.
	-- NOTE(review): assumes retain_diacritics is keyed by language code.
	if retain_diacritics[lang:getCode()] then
		return makeDisplayText(text, lang, sc)
	end
	-- NOTE(review): the pattern literal in the second gsub is empty in this
	-- copy of the file; presumably a character class was lost. Kept as found.
	text = ugsub(toNFD(text), DIACRITICS .. "+", "")
		:gsub("", substitutes)
	return toNFC(text)
end
-- Replacement table passed to ugsub by makeSortKey: each matched string is
-- looked up here and replaced by the mapped value.
-- NOTE(review): the keys of every entry are missing in this copy of the file
-- (each line starts at "="), so the table is not valid Lua as shown. The
-- grouping comments below describe the values only; restore the keys from
-- the upstream source.
local sortkey_substitutes = {
-- Values: Hebrew punctuation (geresh, gershayim, maqaf, paseq).
= "׳",
= "״",
= "־",
= "׀",
-- Values: non-final letter forms (presumably keyed by the final forms).
= "כ",
= "מ",
= "נ",
= "פ",
= "צ",
-- Value: vav followed by U+F000, a private-use code point (presumably so the
-- key sorts after a plain vav - confirm).
= "ו" .. u(0xF000),
-- Values: two-letter sequences (presumably keyed by ligature characters).
= "וו",
= "וי",
= "יי",
-- Values: single base letters.
= "א",
= "ב",
= "ג",
= "ד",
= "ע",
= "א",
= "ד",
= "ה",
= "כ",
= "ל",
= "ם",
= "ר",
= "ת",
-- Values: ASCII plus sign and the alef-lamed sequence.
= "+",
= "אל"
}
-- Sort after U+FB21 HEBREW LETTER WIDE ALEF, so that it sorts after Arabic script titles.
-- Set-like table (every value is true) consulted by makeSortKey, which
-- prefixes the sort key with WIDE_ALEF.
-- NOTE(review): the keys of every entry are missing in this copy of the file,
-- so the table is not valid Lua as shown; presumably they are language
-- codes - confirm against the upstream source.
local sort_after_wide_alef = {
= true,
= true,
= true,
= true,
}
--- Build the sort key for a term.
-- Runs of diacritics are stripped from the NFD form, the sort-key
-- substitution pass is applied, and the result is returned in NFC. For
-- languages listed in sort_after_wide_alef the key is prefixed with
-- U+FB21 HEBREW LETTER WIDE ALEF.
-- @param text string to convert
-- @param lang language object; its code (lang:getCode()) selects the prefix
-- @param sc not used by this function
-- @return the sort key in NFC
function export.makeSortKey(text, lang, sc)
	text = ugsub(toNFD(text), DIACRITICS .. "+", "")
	-- NOTE(review): the pattern literal is empty in this copy of the file;
	-- presumably a character class was lost. Kept as found.
	text = ugsub(text, "", sortkey_substitutes)
	-- Look the language up in the set. Testing the table itself is always
	-- truthy in Lua, which prefixed every sort key regardless of language.
	-- NOTE(review): assumes sort_after_wide_alef is keyed by language code.
	if sort_after_wide_alef[lang:getCode()] then
		text = WIDE_ALEF .. text
	end
	return toNFC(text)
end
return export