Module:User:Mzajac/transform

This module sandbox lacks a documentation subpage. Please create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
-- Transliterations
--
-- invoke with {{#invoke:User:Mzajac/transform|romanize|TEXT|method=METHOD}}
--
-- where method=
--     scholarly (DEFAULT): Scholarly (scientific) transliteration, following Daniels and 
--         Bright (1996) ''World’s Writing Systems''.
--     alaloc: ALA–LC, 1997 http://www.loc.gov/catdir/cpso/roman.html
--     bgn: BGN/PCGN 1965 http://libraries.ucsd.edu/bib/fed/USBGN_romanization.pdf
--     iso-1968: ISO/R 9:1968, (Ukrainian language-specific) variant 1.
--     iso: ISO 9:1995
--     ungegn: UNGEGN, after the Ukrainian National system, 2012 http://www.eki.ee/wgrs/rom1_uk.pdf.

-- Bugs
--     

-- To do
--
--    Distinguish all caps from initial caps in context, 
--       e.g., ХАТА = KHATA (not KhATA); Хата = Khata (not KHata); хата = khata
--    Convert only single apostrophes within words, or before soft vowels, to distinguish them from single 
--       quotation marks and wikitext emphasis.
--    Set the order for method="all"
-- 
--   Tables for 
--      uk-Latn-t-uk-Cyrl-m0-iso-1968-v2 ?= uk-Latn-x-british (British Standard)


-- Configuration

-- default romanization method
-- Used by transform.romanize when the "method" argument is absent.
local methodDefault = "scholarly"

-- characters to be replaced
-- Used as the search pattern of the default one-letter rule in each table below.
-- NOTE(review): an empty pattern looks like a bracketed character class that
-- was lost when the page was copied — confirm against the original module.
local searchDefault = ""

-- Module table; the functions attached to it below are what the module returns.
local transform = {}

-- load transliteration tables from a data module
-- * Cyrillic characters as table indexes seem to fail when imported through mw.loadData *
-- * USING LOCAL DATA INSTEAD *
--[[ 
local ttable = mw.loadData('Module:User:Mzajac/transform/uk-Latn-t-uk-Cyrl') 
]]--


-- Ukrainian Romanization tables
--
-- One entry per romanization system. As visible here, each entry holds three
-- strings (a short label, a longer label and a language tag, judging by their
-- values) followed by a list of rules; each rule pairs a search pattern with
-- either a replacement string or a table of replacements.
-- NOTE(review): every table key in this constructor is missing (each field
-- begins with a bare "="), which is not valid Lua. Presumably bracketed keys
-- such as ["..."] were stripped when the page was copied — restore them from
-- the original module before use.
local ttable = { 
    
    -- Scholarly system
     = {
         = "Scholarly", 
         = "Scholarly", 
         = "uk-Latn-t-uk-Cyrl-x-scholarly", 
         = {
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "H",
                     = "h",
                     = "G",
                     = "g",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "Je",
                     = "je",
                     = "Ž",
                     = "ž",
                     = "Z",
                     = "z",
                     = "Y",
                     = "y",
                     = "I",
                     = "i",
                     = "Ji",
                     = "ji",
                     = "J",
                     = "j",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "X",
                     = "x",
                     = "C",
                     = "c",
                     = "Č",
                     = "č",
                     = "Š",
                     = "š",
                     = "Šč",
                     = "šč",
                     = "ʹ",
                     = "ʹ",
                     = "Ju",
                     = "ju",
                     = "Ja",
                     = "ja",
                     = "ʺ", -- apostrophe
                     = "ʺ", -- right single quotation mark
                     = "ʺ", -- modifier letter apostrophe
        
                    -- Archaic letters
                     = "ʺ",
                     = "ʺ",
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "È",
                     = "è",
                    -- NOTE(review): the six values below are U+FFFD replacement
                    -- characters; the intended romanizations were presumably lost
                    -- to an encoding error — TODO restore.
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                }
            }
        }
    }, 
    
    -- Ukrainian National / UNGEGN system
     = {
         = "UNGEGN", 
         = "Ukrainian National/UNGEGN", 
         = "uk-Latn-t-uk-Cyrl-m0-ungegn-2012", 
         = {
            -- note 1 : “gh is used in the romanization of зг (zgh)”
            -- NOTE(review): the search pattern of this rule is empty as written;
            -- presumably a bracketed pattern was lost — TODO confirm.
             = {
                 = "", 
                 = {
                     = "ZGH",
                     = "Zgh",
                     = "zgh",
                }
            }, 
            -- note 2 : “The second variant is used at the beginning of a word”
            -- NOTE(review): in the patterns below "()" is an empty (position)
            -- capture as written, yet the replacements re-insert it with %1;
            -- presumably a character class inside the parentheses was lost —
            -- TODO confirm.
             = {
                 = "()Є",
                 = "%1Ye", 
            },
             = {
                 = "()є",
                 = "%1ye", 
            },
             = {
                 = "()Ї",
                 = "%1Yi", 
            },
             = {
                 = "()ї",
                 = "%1yi", 
            },
             = {
                 = "()Й",
                 = "%1Y", 
            },
             = {
                 = "()й",
                 = "%1y", 
            },
             = {
                 = "()Ю",
                 = "%1Yu", 
            },
             = {
                 = "()ю",
                 = "%1yu", 
            },
             = {
                 = "()Я",
                 = "%1Ya", 
            },
             = {
                 = "()я",
                 = "%1ya", 
            },
            -- Default 1-letter replacements
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "H",
                     = "h",
                     = "G",
                     = "g",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "Ie",
                     = "ie",
                     = "Zh",
                     = "zh",
                     = "Z",
                     = "z",
                     = "Y",
                     = "y",
                     = "I",
                     = "i",
                     = "I",
                     = "i",
                     = "I",
                     = "i",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "Kh",
                     = "kh",
                     = "Ts",
                     = "ts",
                     = "Ch",
                     = "ch",
                     = "Sh",
                     = "sh",
                     = "Shch",
                     = "shch",
                     = "",
                     = "",
                     = "Iu",
                     = "iu",
                     = "Ia",
                     = "ia",
                     = "", -- apostrophe
                     = "", -- right single quotation mark
                     = "", -- modifier letter apostrophe
        
                    -- Archaic letters (non-standard)
                     = "",
                     = "",
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "E",
                     = "e",
                    -- NOTE(review): the six values below are U+FFFD replacement
                    -- characters; the intended romanizations were presumably lost
                    -- to an encoding error — TODO restore.
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                }
            } 
        }
    }, 
    
    -- ISO/R 9:1968 system (Ukrainian variant)
     = {
         = "ISO 1968", 
         = "ISO/R 9:1968, Ukrainian variant", 
         = "uk-Latn-t-uk-Cyrl-m0-iso-1968", 
         = {
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "H",
                     = "h",
                     = "G",
                     = "g",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "Je",
                     = "je",
                     = "Ž",
                     = "ž",
                     = "Z",
                     = "z",
                     = "Y",
                     = "y",
                     = "I",
                     = "i",
                     = "Ï",
                     = "ï",
                     = "J",
                     = "j",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "Ch",
                     = "ch",
                     = "C",
                     = "c",
                     = "Č",
                     = "č",
                     = "Š",
                     = "š",
                     = "Šč",
                     = "šč",
                     = "’",
                     = "’",
                     = "Ju",
                     = "ju",
                     = "Ja",
                     = "ja",
                     = "", -- apostrophe
                     = "", -- right single quotation mark
                     = "", -- modifier letter apostrophe
        
                    -- Archaic letters
                     = "Ǎ",
                     = "ǎ",
        
                    -- Archaic letters (borrowed from other language columns in ISO/R 9:1968)
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "Ė",
                     = "ė",
                     = "ʺ̣",
                     = "ʺ̣",
                     = "Ḟ",
                     = "ḟ",
                     = "Ẏ",
                     = "ẏ",
                }
            }
        }
    },
    
    -- ISO 9:1995 system
     = {
         = "ISO", 
         = "ISO 9:1995", 
         = "uk-Latn-t-uk-Cyrl-m0-iso-1995", 
         = {
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "G",
                     = "g",
                     = "G̀",
                     = "g̀",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "Ê",
                     = "ê",
                     = "Ž",
                     = "ž",
                     = "Z",
                     = "z",
                     = "I",
                     = "i",
                     = "Ì",
                     = "ì",
                     = "Ï",
                     = "ï",
                     = "J",
                     = "j",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "H",
                     = "h",
                     = "C",
                     = "c",
                     = "Č",
                     = "č",
                     = "Š",
                     = "š",
                     = "Ŝ",
                     = "ŝ",
                     = "ʹ",
                     = "ʹ",
                     = "Û",
                     = "û",
                     = "Â",
                     = "â",
                     = "ˋ", -- apostrophe
                     = "ˋ", -- right single quotation mark
                     = "ˋ", -- modifier letter apostrophe
        
                    -- Archaic letters
                     = "ʺ",
                     = "ʺ",
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "È",
                     = "è",
                     = "Ǎ",
                     = "ǎ",
                     = "F̀",
                     = "f̀",
                     = "Ỳ",
                     = "ỳ",
                }
            }
        }
    },
    
    -- ALA–LC (US Library of Congress) system
     = {
         = "ALA–LC", 
         = "US Library of Congress", 
         = "uk-Latn-t-uk-Cyrl-m0-alaloc-1997", 
         = {
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "H",
                     = "h",
                     = "G",
                     = "g",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "I͡e",
                     = "i͡e",
                     = "Z͡h",
                     = "z͡h",
                     = "Z",
                     = "z",
                     = "Y",
                     = "y",
                     = "I",
                     = "i",
                     = "Ï",
                     = "ï",
                     = "Ĭ",
                     = "ĭ",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "Kh",
                     = "kh",
                     = "T͡s",
                     = "t͡s",
                     = "Ch",
                     = "ch",
                     = "Sh",
                     = "sh",
                     = "Shch",
                     = "shch",
                     = "ʹ",
                     = "ʹ",
                     = "I͡u",
                     = "i͡u",
                     = "I͡a",
                     = "i͡a",
                     = "", -- apostrophe
                     = "", -- right single quotation mark
                     = "", -- modifier letter apostrophe
        
                    -- Archaic letters (non-standard)
                     = "",
                     = "",
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "E",
                     = "e",
                    -- NOTE(review): the six values below are U+FFFD replacement
                    -- characters; the intended romanizations were presumably lost
                    -- to an encoding error — TODO restore.
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                }
            }
       }
    }, 
    
    -- BGN/PCGN system
     = {
         = "BGN/PCGN", 
         = "US Board on Geographic Names and British Permanent Committee on Geographical Names", 
         = "uk-Latn-t-uk-Cyrl-m0-bgn-1965", 
         = {
            -- note 1: “The character sequences зг, кг, сг, тс, and цг may be romanized z·h, k·h, s·h, t·s, and ts·h 
            -- in order to differentiate those romanizations from the digraphs zh, kh, sh, ts, and the letter 
            -- sequence tsh, which are used to render the characters ж, х, ш, ц, and the character sequence тш.”
            -- NOTE(review): the search patterns of the next two rules are empty
            -- as written; presumably bracketed patterns were lost — TODO confirm.
             = {
                 = "", 
                 = {
                     = "Z·H",
                     = "Z·h",
                     = "z·h",
                     = "K·H",
                     = "K·h",
                     = "k·h",
                     = "S·H",
                     = "S·h",
                     = "s·h",
                     = "TS·H",
                     = "Ts·h",
                     = "ts·h",
                }
            }, 
             = {
                 = "", 
                 = {
                     = "T·S",
                     = "T·s",
                     = "t·s",
                }
            }, 
            -- Default 1-letter replacements
             = {
                 = searchDefault, 
                 = {
                     = "A",
                     = "a",
                     = "B",
                     = "b",
                     = "V",
                     = "v",
                     = "H",
                     = "h",
                     = "G",
                     = "g",
                     = "D",
                     = "d",
                     = "E",
                     = "e",
                     = "Ye",
                     = "ye",
                     = "Zh",
                     = "zh",
                     = "Z",
                     = "z",
                     = "Y",
                     = "y",
                     = "I",
                     = "i",
                     = "Yi",
                     = "yi",
                     = "Y",
                     = "y",
                     = "K",
                     = "k",
                     = "L",
                     = "l",
                     = "M",
                     = "m",
                     = "N",
                     = "n",
                     = "O",
                     = "o",
                     = "P",
                     = "p",
                     = "R",
                     = "r",
                     = "S",
                     = "s",
                     = "T",
                     = "t",
                     = "U",
                     = "u",
                     = "F",
                     = "f",
                     = "Kh",
                     = "kh",
                     = "Ts",
                     = "ts",
                     = "Ch",
                     = "ch",
                     = "Sh",
                     = "sh",
                     = "Shch",
                     = "shch",
                     = "’",
                     = "’",
                     = "Yu",
                     = "yu",
                     = "Ya",
                     = "ya",
                     = "ˮ", -- apostrophe
                     = "ˮ", -- right single quotation mark
                     = "ˮ", -- modifier letter apostrophe
        
                    -- Archaic letters (non-standard)
                     = "",
                     = "",
                     = "Ë",
                     = "ë",
                     = "Y",
                     = "y",
                     = "Ě",
                     = "ě",
                     = "E",
                     = "e",
                    -- NOTE(review): the six values below are U+FFFD replacement
                    -- characters; the intended romanizations were presumably lost
                    -- to an encoding error — TODO restore.
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                     = "�",
                }
            }
       }
    }, 

}

-- handle input
--
-- Entry point for {{#invoke:...|romanize|TEXT|method=METHOD}}.
--
-- Parameters:
--   frame  the frame object passed by #invoke. The text to romanize is read
--          from its first positional argument and the romanization system
--          from the named argument "method" (methodDefault when that
--          argument is absent or blank).
-- Returns:
--   the markup produced by transform.convert for the chosen method; for
--   method "all", one <span>-wrapped, labelled conversion per entry of
--   ttable, separated by ", ".
-- Raises:
--   an error when the method is neither an entry of ttable nor "all".
function transform.romanize(frame)
    -- frame.args is the whole argument table; the text is its first
    -- positional entry. Fall back to "" so a missing argument does not
    -- break the string operations in transform.convert.
    local inputText = frame.args[1] or ""

    -- A blank "method=" reaches the module as an empty string, which is
    -- truthy in Lua, so `or methodDefault` alone would not catch it.
    local method = frame.args.method
    if method == nil or method == "" then
        method = methodDefault
    end

    if ttable[method] then -- a known method: just do a conversion
        return transform.convert(inputText, method)
    elseif method == "all" then -- loop through all methods
        -- pairs() visits keys in unspecified order; sort the method names
        -- so that the output is stable from one rendering to the next.
        local methods = {}
        for theMethod in pairs(ttable) do
            methods[#methods + 1] = theMethod
        end
        table.sort(methods)

        local parts = {}
        for i, theMethod in ipairs(methods) do
            parts[i] = "<span>" .. transform.convert(inputText, theMethod, true) .. "</span>"
        end
        return table.concat(parts, ", ")
    else
        error("Transliteration method “" .. method .. "” is not supported")
    end
end

-- do the conversion
--
-- Builds a <span> element containing inputText after the replacement rules
-- have been applied, optionally followed by a parenthesised label.
--
-- Parameters:
--   inputText  text to convert
--   method     romanization method name, as passed by transform.romanize
--   showLabel  when truthy, " (label)" is appended after the closing </span>
-- Returns:
--   the assembled markup string
--
-- NOTE(review): `method` is never read in the body as written, and `ttable`
-- is used whole wherever a string or pattern is expected (concatenation,
-- gsub arguments, comparisons). Concatenating a plain table raises an error
-- in Lua, so index expressions such as ttable[method][...] were presumably
-- lost when the page was copied — TODO restore from the original module.
function transform.convert(inputText, method, showLabel)
    local result = ""
    
    -- opening tag with language and title attributes
    result = result .. "<span"
        result = result .. " lang='" .. ttable .. "'"
        result = result .. " title='Romanized Ukrainian (" .. ttable .. ")'"
        result = result .. ">"
    
    inputText = " " .. inputText .. " " -- pad with spaces to allow boundary patterns (working around lack of %f pattern)
    
    -- Apply one gsub per iteration; the outer parentheses keep only gsub's
    -- first return value (the resulting string).
    -- NOTE(review): neither loop variable is used in the body as written.
    for thePattern, theReplacements in ipairs(ttable) do
        inputText = (mw.ustring.gsub(inputText, ttable, ttable))
    end
    
    -- drop the first and last characters, i.e. the padding added above
    inputText = mw.ustring.sub(inputText, 2, mw.ustring.len(inputText) - 1) -- un-pad with spaces
    
    result = result .. inputText
    
    result = result .. "</span>"
    
    if showLabel then
        result = result .. " ("
        
        -- NOTE(review): as written this compares ttable with itself, which is
        -- always false, so the <abbr> wrapper is never emitted — presumably
        -- two different fields were compared originally.
        if ttable ~= ttable then -- Use an abbr element if the short name doesn’t match name
            result = result .. "<abbr title='" .. ttable .. "'>"
        end
        
        result = result .. ttable
        
        if ttable ~= ttable then
            result = result .. "</abbr>"
        end
        
        result = result .. ")"
    end

    return result
end

return transform
Module:User:Mzajac/transform

Wikious

Boobota

Sagapedia