-- Currently based on
-- https://ia803104.us.archive.org/7/items/A_Students_Vocabulary_For_Biblical_Hebrew_And_Aramaic/A%20Student%27s%20Vocabulary%20for%20Biblical%20Hebrew%20and%20Aramaic_text.pdf
local export = {}
local Array = require "Module:array"
local U = mw.ustring.char
local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local ulen = mw.ustring.len
local umatch = mw.ustring.match
local usub = mw.ustring.sub
-- Hebrew vowel points and other combining marks, each built from its code
-- point with U (mw.ustring.char).
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
-- One code point serves as both dagesh and mappiq.
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)
-- Combining macrons appended to Latin letters in the letter_map values.
local macron_above = U(0x0304)
local macron_below = U(0x0331)
-- Pattern handed to ugsub in export.transliterate to strip the macron from
-- the transliteration of a letter that carries a dagesh.
-- NOTE(review): this is an empty string in this copy; it presumably should be
-- a character class made of macron_above and macron_below -- confirm against
-- the live module.
local macron = ""
-- The four letters collected in vowel_letters; may_be_silent tests tokens
-- against that string.
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
local vowel_letters = alef .. he .. waw .. yod
-- Base letter shared by shin and sin (see shin_sin_map for the two outputs).
local shin_sin = 'ש'
-- local vowel_letter = ""
-- -- '0' represents silent sheva
-- local vowel_points = (
-- sheva .. hataf_segol .. hataf_patah .. hataf_qamats .. hiriq .. tsere ..
-- segol .. patah .. qamats .. qamats_qatan .. holam .. qubuts .. '0' ..
-- holam_haser_for_waw
-- )
-- local vowel_point = ""
-- Neither short_vowels nor short_vowel is referenced in the visible code.
local short_vowels = segol .. patah .. hiriq .. qubuts .. qamats_qatan
-- NOTE(review): empty in this copy; presumably a character class built from
-- short_vowels -- confirm against the live module.
local short_vowel = ""
-- Two-code-point vowel signs written with waw. The tokenizer has a dedicated
-- pattern for holam_male; export.transliterate compares tokens against both.
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam
-- Latin output for a vocalic sheva and for a furtive patah respectively
-- (see the add_tr(schwa) and add_tr(superscript_a) calls).
local schwa = 'ə'
local superscript_a = 'ᵃ'
-- Latin transliteration for each vowel sign.
-- NOTE(review): every entry below has lost its key in this copy (the lines
-- start with "="), so the table is not valid Lua as shown. Judging by the
-- values and by the vowel constants declared earlier, the keys are presumably
-- sheva, hataf_segol, hataf_patah, hataf_qamats, hiriq, tsere, segol, patah,
-- qamats, qamats_qatan, qubuts, holam, and finally the two-code-point signs
-- holam_male and shuruq -- confirm against the live module before relying on
-- this.
local vowel_map = {
= '',
= 'ĕ',
= 'ă',
= 'ŏ',
= 'i',
= 'ē',
= 'e',
= 'a',
= 'ā',
= 'o',
= 'u',
= 'ō',
-- = '',
-- = '',
= 'ô',
= 'û',
}
-- Transliteration used by export.transliterate when a vowel is followed by a
-- letter that is skipped as a mater lectionis (tsere or hiriq before yod,
-- qamats before he).
-- NOTE(review): the keys of the three live entries are missing in this copy;
-- given the call site they are presumably hiriq, tsere and qamats, in that
-- order -- confirm against the live module.
local plene_map = {
-- = '', -- ə
-- = 'ĕ',
-- = 'ă',
-- = 'ŏ',
= 'î',
= 'ê',
-- = 'ệ', -- Lambdin's Introduction to Biblical Hebrew uses this.
-- = 'a',
= 'â',
-- = 'o', -- if plene, then misspelling?
-- = 'u',
-- = 'ō',
-- = 'ô',
-- = 'û',
}
-- String of every vowel_map key that is exactly one code point long
-- (multi-code-point keys are filtered out). is_vowel does a plain substring
-- search in this string.
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()
-- The letters collected in bgdkpt, including the final forms of kaf and pe.
-- export.transliterate treats a dagesh on these letters specially (see the
-- "Don't double bgdkpt" comment there).
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav
-- NOTE(review): het and ayn are not referenced in the visible code; they were
-- presumably part of the pattern in makes_furtive_patah, which is empty in
-- this copy -- confirm.
local het = 'ח'
local ayn = 'ע'
-- Latin transliteration for each consonant. Values for several letters carry
-- a combining macron, which export.transliterate strips again when the letter
-- has a dagesh.
-- NOTE(review): every entry has lost its key in this copy, so the table is
-- not valid Lua as shown. The keys are presumably the Hebrew letters in
-- alphabetical order, with final forms following their base letter and
-- without shin/sin (handled by shin_sin_map) -- confirm against the live
-- module.
local letter_map = {
= 'ʾ',
= 'b' .. macron_below,
= 'g' .. macron_above,
= 'd' .. macron_below,
= 'h',
= 'w',
= 'z',
= 'ḥ',
= 'ṭ',
= 'y',
= 'k' .. macron_below,
= 'k' .. macron_below,
= 'l',
= 'm',
= 'm',
= 'n',
= 'n',
= 's',
= 'ʿ',
= 'p' .. macron_above,
= 'p' .. macron_above,
= 'ṣ',
= 'ṣ',
= 'q',
= 'r',
= 't' .. macron_below,
}
-- Transliteration of the shin/sin letter depending on its dot.
-- NOTE(review): keys are missing in this copy; presumably shin_dot and
-- sin_dot respectively -- confirm.
local shin_sin_map = {
= "š",
= "ś",
}
-- String containing shin_sin followed by every letter_map key that is exactly
-- one code point long.
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()
-- Replacements for punctuation tokens; export.transliterate falls back to the
-- token itself when no entry applies.
-- NOTE(review): keys are missing in this copy, so which Hebrew punctuation
-- marks map to "-" and "." cannot be told from here -- confirm.
local punctuation_map = {
= "-",
= ".",
}
-- Fix illogical order of diacritics in Unicode normalization.
--
-- Takes a string and returns it after two ugsub passes:
--   1. three adjacent captured groups are re-emitted in reverse order
--      ("%3%2%1");
--   2. within each matched run, a pair of exactly two code points is swapped
--      when it is hiriq followed by anything other than hiriq, or sheva
--      followed by patah, qamats or qamats_qatan.
-- NOTE(review): both patterns are damaged in this copy. The character classes
-- are missing (the first pattern even has unbalanced quotes), so the code is
-- not valid Lua as shown. Per the comment below, the first pattern presumably
-- captured vowel points, dagesh/mappiq and shin/sin dot, and the second
-- matched runs of vowel points -- confirm against the live module.
function export.normalize(text)
-- Comment quoted from another module (its name is missing in this copy):
-- The default order is: consonant, vowel point, dagesh or mappiq, shin or sin dot.
-- The desired order is: consonant, shin or sin dot, dagesh or mappiq, vowel point.
text = ugsub(text, "(*)(' .. dagesh_mappiq .. "*)(*)", "%3%2%1")
text = ugsub(
text,
"+",
function(vowels)
-- Only two-code-point runs are reordered; anything else falls through and
-- returns nothing, which leaves the match unchanged.
if ulen(vowels) == 2 then
local first, second = umatch(vowels, "^(.)(.)$")
-- יְרוּשָׁלִַם
if (first == hiriq and second ~= hiriq)
-- יְרוּשָׁלְַמָה
or (first == sheva and (second == patah or second == qamats or second == qamats_qatan)) then
return second .. first
end
end
end)
return text
end
-- Tries each pattern in `patterns`, in order, against `text` starting the
-- search at code point `code_point_pos`. The first pattern whose match begins
-- exactly at that position wins.
-- Returns the pattern's first capture (nil if it has none) and the code point
-- index where the match ends; returns nothing when no pattern matches there.
local function match_alt_one(text, code_point_pos, patterns)
	for _, candidate in ipairs(patterns) do
		local match_start, match_end, first_capture = ufind(text, candidate, code_point_pos)
		-- A match found further along in the text does not count.
		local starts_here = match_start == code_point_pos
		if starts_here then
			return first_capture, match_end
		end
	end
end
-- Patterns tried in order by next_token; each one captures a single token.
local token_patterns = {
-- waw followed by holam, kept together as one token
"(" .. holam_male .. ")",
-- NOTE(review): this pattern is damaged in this copy (it starts with a bare
-- "?"). It presumably matched a letter, an optional shin/sin dot and an
-- optional dagesh/mappiq -- confirm against the live module.
"(?" .. dagesh_mappiq .. "?)",
-- fallback: any single code point
"(.)",
}
-- Returns the token starting at code_point_pos and the index of its last code
-- point, as produced by match_alt_one with token_patterns.
local function next_token(text, code_point_pos)
return match_alt_one(text, code_point_pos, token_patterns)
end
-- Splits `text` into a sequence of tokens by calling next_token repeatedly,
-- resuming each time right after the previous token, until next_token reports
-- no further match. Returns the tokens as an array.
-- Open question carried over from the original: validate shin dot and sin dot?
local function tokenize(text)
	local tokens = {}
	local cursor = 1
	local token, last_pos = next_token(text, cursor)
	while last_pos do
		table.insert(tokens, token)
		-- Continue with the code point following the token just consumed.
		cursor = last_pos + 1
		token, last_pos = next_token(text, cursor)
	end
	return tokens
end
export.tokenize = tokenize
-- Indicates that a token may be a consonant.
-- Returns false for nil; otherwise true when the pattern matches at the first
-- code point of the token.
-- NOTE(review): the pattern is an empty string in this copy; it presumably
-- was a character class built from `letters` -- confirm against the live
-- module.
local function is_consonant(token)
return token ~= nil and ufind(token, "", 1) == 1
end
-- Reports whether `token` occurs (as a plain substring) in vowel_letters,
-- i.e. whether it is one of the letters that may stand without a consonantal
-- value. A nil token yields false.
local function may_be_silent(token)
	if token == nil then
		return false
	end
	local found_at = vowel_letters:find(token, 1, true)
	return found_at ~= nil
end
-- Tells whether a token is certainly a vowel: either holam male, or a token
-- found (plain substring search) in vowel_diacritics. A nil token yields
-- false.
-- Shuruq is deliberately left out, since the same token may stand for "ww".
local function is_vowel(token)
	if token == holam_male then
		return true
	end
	if token == nil then
		return false
	end
	local found_at = vowel_diacritics:find(token, 1, true)
	return found_at ~= nil
end
-- Reports whether the position is preceded by a vowel treated as
-- unchangeable: shuruq, holam male, or yod following hiriq, tsere or segol.
-- NOTE(review): the index expressions on `tokens` are missing in this copy
-- (both locals are assigned the whole table and `i` is unused as shown).
-- token1 and token2 presumably held the two tokens just before position i --
-- confirm against the live module.
local function is_preceded_by_unchangeable_vowel(tokens, i)
local token1, token2 = tokens, tokens
return token2 == shuruq -- Don't check that this is waw with dagesh.
or token2 == holam_male
or token2 == yod and (token1 == hiriq or token1 == tsere or token1 == segol)
end
-- Reports whether `token` contains the dagesh/mappiq code point anywhere
-- (plain substring search). `token` must be a string.
local function has_dagesh(token)
	local dagesh_pos = token:find(dagesh_mappiq, 1, true)
	if dagesh_pos == nil then
		return false
	end
	return true
end
-- Reports whether `token` begins with the letter waw. `token` must be a
-- string.
local function is_waw(token)
	local first_hit = token:find(waw, 1, true)
	-- Only an occurrence at the very start of the token counts.
	if first_hit == 1 then
		return true
	end
	return false
end
-- Reports whether `token` begins with the letter he. `token` must be a
-- string.
local function is_he(token)
	local first_hit = token:find(he, 1, true)
	-- Only an occurrence at the very start of the token counts.
	if first_hit == 1 then
		return true
	end
	return false
end
-- Reports whether the anchored pattern matches at the start of `token`.
-- NOTE(review): the pattern is just "^" in this copy; it presumably was "^"
-- followed by a character class built from `bgdkpt`, making this a test for
-- tokens that begin with one of those letters -- confirm against the live
-- module.
local function is_bgdkpt(token)
return ufind(token, "^") == 1
end
-- Reports whether `token` marks a word boundary. A nil token (a position
-- outside the token list) always counts as a boundary.
-- NOTE(review): the pattern is "^$" in this copy, which as shown only matches
-- an empty string; a character class between the anchors (presumably
-- whitespace/punctuation) appears to be missing -- confirm against the live
-- module.
local function is_word_boundary(token)
return token == nil or ufind(token, "^$") ~= nil
end
-- Returns the first code point of `token`, or nothing when `token` is nil.
-- No check is made that this code point is actually a letter.
local function get_letter(token)
	if token == nil then
		return
	end
	return usub(token, 1, 1)
end
-- Returns whatever umatch yields for the pattern on `token`.
-- NOTE(review): the pattern is an empty string in this copy; given the name
-- it presumably was a character class of shin_dot and sin_dot, so that the
-- function returned the token's dot (or nil) -- confirm against the live
-- module. The function is not called in the visible code.
local function get_dot(token)
return umatch(token, "")
end
-- Reports whether the following token is a vowel according to is_vowel, or is
-- shuruq.
-- NOTE(review): the index on `tokens` is missing in this copy (the whole
-- table is assigned and `i` is unused as shown); presumably the token right
-- after position i was meant -- confirm. The function is not called in the
-- visible code.
local function is_followed_by_vowel(tokens, i)
local next_token = tokens
return is_vowel(next_token) or next_token == shuruq
end
-- Walks backwards from the position before i, stepping over entries for which
-- may_be_silent holds, and reports whether the entry it stops on is a vowel
-- according to is_vowel, or is shuruq.
-- NOTE(review): the index expressions on `tokens` are missing in this copy;
-- each bare `tokens` presumably was the token at the current value of i --
-- confirm. The function is not called in the visible code.
local function is_preceded_by_vowel(tokens, i)
i = i - 1
while may_be_silent(tokens) do
i = i - 1
end
return is_vowel(tokens) or tokens == shuruq
end
-- Reports whether `token` is a consonant that can take a furtive patah: the
-- pattern must match at the first code point, and a bare he is excluded
-- unless the token contains a dagesh/mappiq.
-- NOTE(review): the character class inside the capture is missing in this
-- copy; it presumably listed he, het and ayn -- confirm against the live
-- module.
local function makes_furtive_patah(token)
-- `letter` receives ufind's second result and is not used afterwards.
local pos, letter = ufind(token, "()")
return pos == 1 and (token ~= he or has_dagesh(token))
end
-- Transliterates pointed Hebrew `text` into Latin script and returns the
-- result as a single string.
--
-- The text is passed through export.normalize and export.tokenize, then the
-- tokens are visited one at a time. Each token falls into one of four
-- branches -- waw, other consonant, vowel, anything else -- and the output
-- pieces are collected in a table that is concatenated at the end.
--
-- NOTE(review): in this copy every index expression on `tokens` and every key
-- lookup on letter_map, shin_sin_map, vowel_map, plene_map and
-- punctuation_map is missing, so bare table names appear where a neighbouring
-- token or a map entry was presumably meant. The code is left untouched
-- below; consult the live module for the exact offsets and keys.
function export.transliterate(text)
local tokens = export.tokenize(export.normalize(text))
local transliteration = {}
-- Appends one piece of output; anything other than a string is a bug.
local function add_tr(val)
assert(type(val) == "string")
table.insert(transliteration, val)
end
-- Use a manually incremented loop so we can skip
-- furtive patah and matres lectionis tokens.
local i = 1
while true do
local token = tokens
if not token then
break
end
-- Branch 1: tokens that start with waw (holam male, shuruq/doubled waw,
-- or plain consonantal waw).
if is_waw(token) then
if token == holam_male then
if tokens == sheva then
add_tr(letter_map .. vowel_map)
else
add_tr(vowel_map)
end
-- waw with dagesh, shuruq
elseif has_dagesh(token) then
if is_consonant(tokens) or is_word_boundary(tokens) then
add_tr(vowel_map)
else
add_tr("ww")
end
else
add_tr("w")
end
-- Branch 2: any other consonant.
elseif is_consonant(token) then
local letter = get_letter(token)
local tr = assert(letter_map or shin_sin_map or letter == shin_sin and shin_sin_map, token)
if has_dagesh(token) then
-- A dagesh removes the macron from the transliterated letter.
tr = ugsub(tr, macron, "")
-- Don't double he.
-- Don't double bgdkpt after sheva or at beginning of word.
if not is_he(token) and not (is_bgdkpt(token) and (tokens == sheva or is_word_boundary(tokens))) then
tr = tr .. tr
end
end
-- Transcribe furtive patah before its consonant and skip it.
if makes_furtive_patah(token) and tokens == patah and is_word_boundary(tokens) then
add_tr(superscript_a)
i = i + 1
end
add_tr(tr)
-- Branch 3: vowel points. The conditions are tried in order; the first
-- one that holds decides the output.
elseif is_vowel(token) then
if ((token == tsere or token == hiriq) and tokens == yod)
or (token == qamats and tokens == he and not is_vowel(tokens)) then
add_tr(plene_map)
i = i + 1 -- Skip mater lectionis.
-- Handle vocalic sheva
elseif token == sheva
and (
-- after initial consonant unless following consonant has dagesh
(is_word_boundary(tokens) and not has_dagesh(tokens))
-- after another sheva not at end of word
or (tokens == sheva and not is_word_boundary(tokens))
-- between identical consonants
or get_letter(tokens) == get_letter(tokens)
-- after unchangeable vowel
or is_preceded_by_unchangeable_vowel(tokens, i - 1)
) then
add_tr(schwa)
elseif
-- implicit ktiv/qre (the source module name is missing in this copy):
-- יְרוּשָׁלְַמָה, יְרוּשָׁלְָמָה
token == sheva
and (tokens == patah or tokens == qamats
or tokens == qamats_qatan)
then
add_tr("y")
elseif
-- implicit ktiv/qre (the source module name is missing in this copy):
-- יְרוּשָׁלִַם, יְרוּשָׁלִָם
token == hiriq
and (tokens == patah or tokens == qamats
or tokens == qamats_qatan)
then
add_tr("yi")
-- qamats in possibly closed syllable,
-- as long as following two consonants are not identical, in which
-- case the sheva has to be pronounced, putting the qamats
-- in an open syllable
elseif token == qamats and tokens == sheva and not (is_consonant(tokens) and is_consonant(tokens) and tokens == tokens) then
add_tr(vowel_map)
elseif (token == patah or token == qamats) and tokens == yod and is_consonant(tokens) then
add_tr(vowel_map)
add_tr("i") -- ???
i = i + 1
else
add_tr(vowel_map)
end
-- Branch 4: everything else is emitted through punctuation_map, falling
-- back to the token itself.
else
add_tr(punctuation_map or token)
end
i = i + 1
end
return table.concat(transliteration)
end
return export