-- Trying to reimplement ].
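-- Normalizes, tokenizes, and transliterates pointed Biblical Hebrew using a
-- Tiberian-style romanization: ɔ for qamats, ɛ for segol, and macrons on the
-- bgdkpt letters for their spirantized (non-dagesh) values.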
local export = {}
local Array = require "Module:array"
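-- Debugging helpers: show the Unicode names of the code points in a string,
-- or of each token in a token list.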
local function show_code_point_names(text)
if not text then return "" end
local names = Array()
for cp in mw.ustring.gcodepoint(text) do
-- Remove HEBREW LETTER, HEBREW POINT, etc.
local name = require "Module:Unicode data".lookup_name(cp)
:gsub(
"^HEBREW (%w+) ",
function(type)
if type == "ACCENT" then return "ACCENT " else return "" end
end)
:lower()
names:insert(name)
end
return names:concat ", "
end
local function show_tokens(tokens, i, j)
return table.concat(Array(tokens):map(show_code_point_names), " | ", i, j)
end
export.show_tokens = show_tokens
local U = mw.ustring.char
local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local ulen = mw.ustring.len
local umatch = mw.ustring.match
local usub = mw.ustring.sub
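-- Code points for the Hebrew points (niqqud), dots, and the Latin combining
-- marks used in the romanization.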
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)
local macron_above = U(0x0304)
local macron_below = U(0x0331)
local macron = "[" .. macron_above .. macron_below .. "]"
local circumflex = U(0x0302)
local acute = U(0x0301)
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
local vowel_letters = alef .. he .. waw .. yod
local shin_sin = 'ש'
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam
local schwa = 'ə'
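-- Romanizations of the vowels. The single-code-point keys make up the class of
-- vowel diacritics below; holam male and shuruq are two code points long.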
local vowel_map = {
[sheva] = '',
[hataf_segol] = 'ɛ̆',
[hataf_patah] = 'ă',
[hataf_qamats] = 'ɔ̆',
[hiriq] = 'i',
[tsere] = 'e',
[segol] = 'ɛ',
[patah] = 'a',
[qamats] = 'ɔ',
[qamats_qatan] = 'ɔ',
[qubuts] = 'u',
[holam] = 'o',
[holam_male] = 'ō',
[holam_haser_for_waw] = 'o',
[shuruq] = 'ū',
}
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()
local short_vowel_map = {
[qamats] = 'o',
[qamats_qatan] = 'o',
[hataf_qamats] = 'o',
[qubuts] = 'u',
}
local plene_map = {
[hiriq] = 'ī',
[tsere] = 'ē',
[qamats] = 'ɔ' .. macron_above,
-- [holam] = 'o', -- if plene, then misspelling?
}
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local lamed = 'ל'
local mem = 'מ'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav
local het = 'ח'
local ayn = 'ע'
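-- Romanizations of the consonants. The bgdkpt letters carry a macron for
-- their spirantized value; transliterate() strips it when the letter has a
-- dagesh.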
local letter_map = {
[alef] = 'ʾ',
[bet] = 'b' .. macron_below,
[gimel] = 'g' .. macron_above,
[dalet] = 'd' .. macron_below,
[he] = 'h',
[waw] = 'w',
['ז'] = 'z',
[het] = 'ḥ',
['ט'] = 'ṭ',
[yod] = 'y',
[kaf] = 'k' .. macron_below,
[kaf_final] = 'k' .. macron_below,
[lamed] = 'l',
[mem] = 'm',
['ם'] = 'm',
['נ'] = 'n',
['ן'] = 'n',
['ס'] = 's',
[ayn] = 'ʿ',
[pe] = 'p' .. macron_above,
[pe_final] = 'p' .. macron_above,
['צ'] = 'ṣ',
['ץ'] = 'ṣ',
['ק'] = 'q',
['ר'] = 'r',
[tav] = 't' .. macron_below,
}
local shin_sin_map = {
= "š",
= "ś",
}
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()
local punctuation_map = {
= "-",
= ".",
}
-- First and last code point called "HEBREW ACCENT ...".
local first_accent_cp, last_accent_cp = 0x0591, 0x05AE
local meteg_cp = 0x05BD
local meteg = U(meteg_cp)
local combining_grapheme_joiner_cp = 0x034F
local cgj = U(combining_grapheme_joiner_cp)
local accents = { U(meteg_cp) }
for cp = first_accent_cp, last_accent_cp do
table.insert(accents, U(cp))
end
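-- The order in which marks should follow a consonant, from first to last;
-- used by normalize() to re-sort them.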
local diacritic_order = {
{shin_dot, sin_dot},
{dagesh_mappiq},
Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end),
accents,
{cgj},
}
local accent_pattern = U(first_accent_cp) .. "-" .. U(last_accent_cp) .. U(meteg_cp)
local diacritic_pattern = "["
.. shin_dot .. sin_dot
.. dagesh_mappiq
.. vowel_diacritics
.. accent_pattern
.. cgj
.. "]"
local diacritics_pattern = diacritic_pattern .. diacritic_pattern .. "+"
local diacritic_order_map = {}
for i, diacritics in ipairs(diacritic_order) do
for _, diacritic in ipairs(diacritics) do
diacritic_order_map[diacritic] = i
end
end
local function is_accent(token)
if not token then
return false
end
local cp = mw.ustring.codepoint(token)
return first_accent_cp <= cp and cp <= last_accent_cp
or cp == combining_grapheme_joiner_cp
end
-- Fix illogical order of diacritics in Unicode normalization.
-- The default order:
-- consonant, vowel points, dagesh or mappiq, accent, shin or sin dot.
-- The desired order:
-- consonant, shin or sin dot, dagesh or mappiq, first vowel point, accent,
-- maybe second vowel point if first vowel point is sheva or hiriq.
function export.normalize(text)
text = ugsub(
text,
diacritics_pattern,
function(diacritics)
local diacritics_list = mw.text.split(diacritics, "")
table.sort(
diacritics_list,
function(a, b)
return (diacritic_order_map[a] or 0) < (diacritic_order_map[b] or 0)
end)
-- For now remove combining grapheme joiners... though this might be wrong.
while diacritics_list[#diacritics_list] == cgj do
table.remove(diacritics_list)
end
-- If there are two vowels, put hiriq or sheva after other vowels.
-- If there is also an accent, put it after the first vowel.
-- Assume Unicode normalization:
-- sheva before hiriq before patah before either qamats.
-- This code works for the combinations that are in the testcases.
-- יְרוּשָׁלִַם, יְרוּשָׁלְַמָה
local i = 0
local first_vowel
repeat
i = i + 1
first_vowel = diacritics_list[i]
until not first_vowel or vowel_diacritics:find(first_vowel)
if first_vowel then
local second_vowel = diacritics_list[i + 1]
if second_vowel and vowel_diacritics:find(second_vowel) then
if first_vowel == hiriq or first_vowel == sheva then
diacritics_list[i], diacritics_list[i + 1] = diacritics_list[i + 1], diacritics_list[i]
end
if is_accent(diacritics_list[i + 2]) then
diacritics_list[i + 1], diacritics_list[i + 2] = diacritics_list[i + 2], diacritics_list[i + 1]
end
end
end
return table.concat(diacritics_list)
end)
return text
end
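-- For example, a shin whose marks arrive in the order qamats, dagesh, shin dot
-- comes out reordered as shin dot, dagesh, qamats, per diacritic_order above.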
local function match_alt_one(text, code_point_pos, patterns)
for _, pattern in ipairs(patterns) do
local start_pos, end_pos, capture = ufind(text, pattern, code_point_pos)
if start_pos == code_point_pos then
-- Return first capture (if any) and end of match
return capture, end_pos
end
end
end
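-- Token patterns, tried in order: waw plus holam (holam male), a letter with
-- optional shin or sin dot and optional dagesh or mappiq, or any other single
-- code point.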
local token_patterns = {
"(" .. holam_male .. ")",
"(?" .. dagesh_mappiq .. "?)",
"(.)",
}
local function next_token(text, code_point_pos)
return match_alt_one(text, code_point_pos, token_patterns)
end
-- Validate shin dot and sin dot?
local function tokenize(text)
local pos = 1
local tokens = {}
while true do
local token, next_pos = next_token(text, pos)
if not next_pos then
break
end
pos = next_pos + 1
table.insert(tokens, token)
end
return tokens
end
export.tokenize = tokenize
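-- For instance, בָּא should tokenize into three tokens: bet with dagesh,
-- qamats, alef.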
local function may_be_silent(token)
return token ~= nil and vowel_letters:find(token, 1, true) ~= nil
end
-- Indicates that a token might be a vowel.
-- Use only after determining that it is not a consonant.
local function is_vowel(token)
return token == holam_male or token == shuruq or (token ~= nil and vowel_diacritics:find(token, 1, true) ~= nil)
end
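-- True if the token before position i is an unchangeable long vowel:
-- shuruq, holam male, or hiriq/tsere/segol followed by yod.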
local function is_preceded_by_unchangeable_vowel(tokens, i)
local token1, token2 = tokens[i - 2], tokens[i - 1]
return token2 == shuruq -- Don't check that this is waw with dagesh.
or token2 == holam_male
or token2 == yod and (token1 == hiriq or token1 == tsere or token1 == segol)
end
local function is_short_vowel(token)
return token == patah or token == segol or token == hiriq or token == qubuts
end
local function is_open_vowel(token)
return token == patah or token == qamats
end
local function has_dagesh(token)
return token ~= nil and token:find(dagesh_mappiq, 1, true) ~= nil
end
local function is_waw(token)
return token ~= nil and token:find(waw, 1, true) == 1
end
local function is_he(token)
return token ~= nil and token:find(he, 1, true) == 1
end
local function is_hataf(token)
return token == hataf_segol or token == hataf_patah or token == hataf_qamats
end
local function get_letter(token)
-- assert(ufind(token, "[" .. letters .. "]") == 1)
if token ~= nil then
return usub(token, 1, 1)
end
end
local function is_guttural(token)
local letter = get_letter(token)
return letter == alef or letter == he or letter == het or letter == ayn
end
local function is_bgdkpt(token)
return token ~= nil and ufind(token, "^[" .. bgdkpt .. "]") == 1
end
-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from ].
local bidirectional_control_characters =
U(0x061C) .. U(0x200E) .. U(0x200F) .. U(0x202A) .. "-" .. U(0x202E)
.. U(0x2066) .. "-" .. U(0x2069)
local word_boundary_character = "^[%s%p" .. bidirectional_control_characters .. "]$"
local function is_word_boundary(token)
return token == nil or ufind(token, word_boundary_character) ~= nil
end
local function get_dot(token)
return token and umatch(token, "[" .. shin_dot .. sin_dot .. "]")
end
local function is_followed_by_vowel(tokens, i)
repeat
i = i + 1
until not is_accent(tokens[i])
return is_vowel(tokens[i])
end
local function is_preceded_by_vowel(tokens, i)
repeat
i = i - 1
until not (may_be_silent(tokens[i]) or is_accent(tokens[i]))
return is_vowel(tokens[i])
end
local function get_previous_vowel_pos(tokens, i)
while true do
i = i - 1
local token = tokens[i]
if is_vowel(token) then
return i
elseif is_word_boundary(token) then
return nil
end
end
end
local function get_previous_vowel(tokens, i)
local pos = get_previous_vowel_pos(tokens, i)
if pos then return tokens[pos] end
end
local function get_previous_neighboring_vowel(tokens, i)
while true do
i = i - 1
local token = tokens[i]
if is_vowel(token) then
return token
elseif not is_accent(token) then
return nil
end
end
end
local function get_next_vowel(tokens, i)
while true do
i = i + 1
local token = tokens[i]
if is_vowel(token) then
return token
elseif is_word_boundary(token) then
return nil
end
end
end
-- Defined below.
local is_consonant
local function skip_before_accent(tokens, i)
repeat
i = i - 1
until not is_accent(tokens[i])
return i
end
local function skip_after_accent(tokens, i)
repeat
i = i + 1
until not is_accent(tokens[i])
return i
end
local function is_preceded_by_consonant(tokens, i)
return is_consonant(tokens, skip_before_accent(tokens, i))
end
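-- A word-final het, ayin, or he with mappiq takes a furtive patah, which is
-- pronounced before the consonant rather than after it.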
local function makes_furtive_patah(token)
local pos = ufind(token, "[" .. het .. ayn .. he .. "]")
return pos == 1 and (token ~= he or has_dagesh(token))
end
-- Indicates that a token may be a consonant.
-- Declared as local above.
function is_consonant(tokens, i)
local token = tokens[i]
if is_waw(token) then
return token == waw
or (token == shuruq and not (is_preceded_by_consonant(tokens, i) or is_word_boundary(tokens[i - 1])))
else
return token ~= nil and ufind(token, "[" .. letters .. "]", 1) == 1
end
end
-- Don't double he.
-- Don't double bgdkpt after sheva or at beginning of word.
local function is_double(tokens, i)
local token = tokens[i]
return token ~= nil
and has_dagesh(token)
and not is_he(token)
and not (is_bgdkpt(token) and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1])))
end
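-- True if the two tokens before position i look like a one-letter prefix:
-- mi- with doubling of the next consonant, a sheva prefix (bet, dalet, waw,
-- kaf, lamed), the article (patah plus doubling), or the relative shin.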
local function is_preceded_by_prefix(tokens, i)
local consonant, vowel = tokens[i - 2], tokens[i - 1]
local letter = get_letter(consonant)
local letter_is_shin = (letter == shin_sin and get_dot(consonant) == shin_dot)
local next_cons_has_dagesh = has_dagesh(tokens[i])
return (vowel == hiriq and letter == mem and next_cons_has_dagesh)
or (vowel == sheva and (
letter == bet or letter == dalet or letter == waw
or letter == kaf or letter == lamed
)
) or (vowel == patah and next_cons_has_dagesh and (
letter == bet or letter == he or letter == kaf or letter == lamed
or letter_is_shin -- very archaic, says ]
)
) or (vowel == segol and next_cons_has_dagesh and letter_is_shin)
end
local function is_in_last_syllable(tokens, i)
while true do
local token = tokens[i + 1]
if is_word_boundary(token)
-- A sequence of consonant sheva consonant (sheva) does not have a vowel:
-- וַיֵּבְךְּ wayyēḇk, וַיַּרְא wayyar
or token == sheva and (
is_consonant(tokens, i + 2)
-- and no full vowel follows that consonant (a word-final sheva does not count)
and not (is_vowel(tokens[i + 3])
and not (tokens[i + 3] == sheva and is_word_boundary(tokens[i + 4])))
) then
return true
elseif is_vowel(token) then
return false
end
i = i + 1
end
end
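-- Main entry point: normalize and tokenize the text, then walk the tokens,
-- romanizing consonants (doubled for dagesh forte), vowels (including matres
-- lectionis, furtive patah, and qamats qatan detection), and punctuation,
-- and adding an acute accent for non-final stress.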
function export.transliterate(text)
local tokens = export.tokenize(export.normalize(text))
local transliteration = {}
local function add_tr(val)
assert(type(val) == "string")
table.insert(transliteration, val)
end
-- Use a manually incremented loop so we can skip
-- furtive patah and matres lectionis tokens.
local i = 1
while true do
local token = tokens[i]
if not token then
break
end
if is_consonant(tokens, i) then
local letter = get_letter(token)
local tr = assert(letter_map[letter] or shin_sin_map[get_dot(token)] or letter == shin_sin and shin_sin_map[shin_dot], token)
if has_dagesh(token) then
tr = ugsub(tr, macron, "")
if is_double(tokens, i) then
tr = tr .. tr
end
end
-- Transcribe furtive patah before its consonant and skip it.
if makes_furtive_patah(token) and tokens[i + 1] == patah and is_word_boundary(tokens[i + 2]) then
local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
if not is_accent(tokens[previous_vowel_pos + 1]) then
add_tr(acute)
end
add_tr(vowel_map[patah])
i = i + 1
end
add_tr(tr)
elseif is_vowel(token) then
-- Genuine waw holam. Handle the waw and leave the holam to the next
-- bit of code.
-- מִצְוֹת miṣwōṯ
local waw_is_consonant = false
if token == holam_male and tokens[i - 1] == sheva then
add_tr(letter_map[waw])
waw_is_consonant = true
end
local next_i = skip_after_accent(tokens, i)
local has_accent = next_i > i + 1
-- Handle sheva.
if tokens[i] == sheva then
local previous_vowel = get_previous_vowel(tokens, i)
local previous_neighboring_vowel = get_previous_neighboring_vowel(tokens, i)
-- implicit ktiv/qre from ]:
-- יְרוּשָׁלְָמָה yərūšālāymā, יְרוּשָׁלְַמָה yərūšālaymā
if tokens[i + 1] == meteg then
add_tr(schwa)
elseif is_open_vowel(previous_neighboring_vowel) then
add_tr("y")
elseif
is_word_boundary(tokens[i + 1])
or (tokens[i + 1] == alef and is_word_boundary(tokens[i + 2]))
or has_dagesh(tokens[i + 1]) -- check for bgdkpt?
then
add_tr("")
elseif
-- after another sheva
previous_vowel == sheva
-- after initial consonant unless following consonant has dagesh
or previous_vowel == nil
-- between identical consonants
or get_letter(tokens[i - 1]) == get_letter(tokens[i + 1])
or is_preceded_by_unchangeable_vowel(tokens, i - 1)
or is_double(tokens, i - 1)
then
add_tr(schwa)
elseif is_short_vowel(previous_vowel)
or is_guttural(tokens[i - 1]) then
add_tr("")
else
add_tr("")
end
-- implicit ktiv/qre from ]:
-- יְרוּשָׁלִַם yərūšālayim, יְרוּשָׁלִָם yərūšālāyim
elseif token == hiriq and is_open_vowel(get_previous_neighboring_vowel(tokens, i)) then
add_tr("yi")
-- qamats in possibly closed syllable,
-- as long as following two consonants are not identical, in which
-- case the sheva has to be pronounced, putting the qamats
-- in an open syllable
elseif token == qamats
and (
(is_guttural(tokens[i + 1]) and (tokens[i + 2] == sheva or is_hataf(tokens[i + 2])))
or (tokens[i + 2] == sheva and has_dagesh(tokens[i + 3]))
-- כָּל kol, on its own and with prefixes
or ((get_letter(tokens[i - 1]) == kaf and get_letter(tokens[i + 1]) == lamed)
and (is_word_boundary(tokens[i + 2])
and (
is_word_boundary(tokens[i - 2])
or is_preceded_by_prefix(tokens, i - 1)
)
)
)
) then
add_tr(vowel_map[qamats_qatan])
else
if waw_is_consonant then
add_tr(vowel_map[holam])
else
add_tr(vowel_map[token])
end
local letter = tokens[next_i]
if (letter == yod
and (token == hiriq or token == tsere or token == segol or token == qamats))
and not is_vowel(tokens[next_i + 1]) then
add_tr(macron_above)
i = next_i
elseif letter == he and not is_vowel(tokens[next_i + 1]) then
add_tr(circumflex)
i = next_i
end
end
-- This is not completely correct because not all accents indicate stress.
-- I haven't sorted out their functions though.
if has_accent and not is_in_last_syllable(tokens, i)
or (token == segol and get_next_vowel(tokens, i) == segol) then
add_tr(acute)
end
else
if not (is_accent(token) or token == meteg) then
add_tr(punctuation_map[token] or token)
end
end
i = i + 1
end
return table.concat(transliteration)
end
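-- A minimal usage sketch, assuming this page is loaded as a module named
-- Module:User:Erutuon/hbo:
-- local hbo = require("Module:User:Erutuon/hbo")
-- local tr = hbo.transliterate(mw.ustring.toNFC("מִצְוֹת"))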
return export