local export = {}
local combining_classes_module = "Module:Unicode data/combining classes"
local data_module = "Module:scripts/data"
local debug_track_module = "Module:debug/track"
local languages_error_module = "Module:languages/error"
local scripts_by_name_module = "Module:scripts/by name"
local scripts_chartoscript_module = "Module:scripts/charToScript"
local string_utilities_module = "Module:string utilities"
local writing_systems_module = "Module:writing systems"
local concat = table.concat
local insert = table.insert
local load_data = mw.loadData
local match = string.match
local select = select
local setmetatable = setmetatable
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local toNFKC = mw.ustring.toNFKC
local toNFKD = mw.ustring.toNFKD
local type = type
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-- Lazy loader for the debug-tracking function: the first call loads the module, rebinds this local to the loaded
-- function, and forwards the arguments to it.
local function debug_track(...)
	local loaded = require(debug_track_module)
	debug_track = loaded
	return loaded(...)
end
-- Lazy loader for `explode_utf8` from the string utilities module; replaces itself with the target on first call.
local function explode(...)
	local loaded = require(string_utilities_module).explode_utf8
	explode = loaded
	return loaded(...)
end
-- Lazy loader for `getByCode` from the writing systems module; replaces itself with the target on first call.
local function get_writing_system(...)
	local loaded = require(writing_systems_module).getByCode
	get_writing_system = loaded
	return loaded(...)
end
-- Lazy loader for the function returned by the languages error module; replaces itself with the target on first call.
local function languages_error(...)
	local loaded = require(languages_error_module)
	languages_error = loaded
	return loaded(...)
end
-- Lazy loader for `split` from the string utilities module; replaces itself with the target on first call.
local function split(...)
	local loaded = require(string_utilities_module).split
	split = loaded
	return loaded(...)
end
-- Lazy loader for `gsub` from the string utilities module; replaces itself with the target on first call.
local function ugsub(...)
	local loaded = require(string_utilities_module).gsub
	ugsub = loaded
	return loaded(...)
end
-- Lazy loader for `match` from the string utilities module; replaces itself with the target on first call.
local function umatch(...)
	local loaded = require(string_utilities_module).match
	umatch = loaded
	return loaded(...)
end
--[==[
Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
-- Scripts data table; nil until get_data() has run. Access it as "m_data or get_data()".
local m_data
local function get_data()
	local loaded = load_data(data_module)
	m_data = loaded
	-- The loader is only ever reached while m_data is unset, so it can be discarded once the data is in place.
	get_data = nil
	return loaded
end
-- Data loaded from the scripts-by-name module; nil until get_scripts_by_name() has run. Access it as
-- "scripts_by_name or get_scripts_by_name()".
local scripts_by_name
local function get_scripts_by_name()
	local loaded = load_data(scripts_by_name_module)
	scripts_by_name = loaded
	-- The loader is only ever reached while scripts_by_name is unset, so it can be discarded afterwards.
	get_scripts_by_name = nil
	return loaded
end
-- Prototype table holding the methods of script objects; export.makeObject sets it as the metatable of every object
-- it creates.
local Script = {}
--[==[Returns the IETF subtag for the script. An explicit <code>ietf_subtag</code> value in the script's raw data takes
precedence; otherwise the subtag is the part of the script code after its last hyphen (the whole code if it contains
no hyphen), so a code of the form <code>xx-Abcd</code> gives <code>Abcd</code>. The result is cached on the object.]==]
function Script:getIETFSubtag()
	local code = self._ietf_subtag
	if code == nil then
		-- "[^-]+$" matches the final run of non-hyphen characters. (A bare "+$" would only ever match a literal
		-- trailing plus sign, leaving the fallback permanently nil and uncached.)
		code = self._rawData.ietf_subtag or match(self._code, "[^-]+$")
		self._ietf_subtag = code
	end
	return code
end
--[==[Returns the value of the <code>parent</code> field in the script's raw data, or {nil} if the data has no such
field.]==]
function Script:getParent()
	local raw_data = self._rawData
	return raw_data.parent
end
-- Returns the list of writing-system codes for this script. The list is computed on first use and cached in
-- `self._systemCodes`: a table value is used as-is, a string value is passed to `split` with "," as the separator,
-- and any other value yields an empty list.
-- NOTE(review): `system_codes` is assigned the whole raw data record, which the rest of this file always indexes as
-- a table. As written, the string and fallback branches therefore look unreachable and the record itself is returned
-- as the list of codes. Presumably one specific entry of the record was meant here - confirm against the scripts
-- data module before relying on this method.
function Script:getSystemCodes()
if not self._systemCodes then
local system_codes = self._rawData
if type(system_codes) == "table" then
self._systemCodes = system_codes
elseif type(system_codes) == "string" then
self._systemCodes = split(system_codes, ",", true, true)
else
self._systemCodes = {}
end
end
return self._systemCodes
end
-- Returns a list with one writing-system object per code returned by getSystemCodes(), each obtained through
-- `get_writing_system`. The list is built on first use and cached in `self._systemObjects`.
function Script:getSystems()
	local cached = self._systemObjects
	if cached then
		return cached
	end
	local objects = {}
	-- Store the list before filling it, so the cache field is set even while the lookups are still in progress.
	self._systemObjects = objects
	for _, system_code in ipairs(self:getSystemCodes()) do
		insert(objects, get_writing_system(system_code))
	end
	return self._systemObjects
end
--[==[Checks whether the script uses at least one of the given writing systems. Each argument may be either a
writing-system code or an object with a {getCode} method, in which case its code is used. Returns {true} as soon as
one argument equals one of the codes from {getSystemCodes()}, and {false} if none does.]==]
function Script:isSystem(...)
	for _, wanted in ipairs{...} do
		local wanted_code = wanted
		if type(wanted) == "table" then
			-- Objects are compared by their code.
			wanted_code = wanted:getCode()
		end
		for _, own_code in ipairs(self:getSystemCodes()) do
			if wanted_code == own_code then
				return true
			end
		end
	end
	return false
end
--function Script:getAllNames()
-- return self._rawData.names
--end
--[==[Returns the name of the script's main category: the canonical name, followed by <code>" script"</code> unless the name already ends in "script", "code" or "semaphore".
Unless optional argument <code>nocap</code> is given, the script name at the beginning of the returned value will be capitalized. This capitalization is correct for category names, but not if the script name is lowercase and the returned value of this function is used in the middle of a sentence. (For example, the script with the code <code>Semap</code> has the name <code>"flag semaphore"</code>, which should remain lowercase when it appears in the middle of a category name but should be capitalized when it begins one.) If you are considering using <code>getCategoryName("nocap")</code>, use <code>getDisplayForm()</code> instead.]==]
function Script:getCategoryName(nocap)
	local name = self:getCanonicalName()
	-- If the name already has "script", "code" or "semaphore" at the end, don't add it. The patterns leave out the
	-- first letter of each word, so the check does not depend on that letter's case.
	if not (
		name:find("cript$") or
		name:find("ode$") or
		name:find("emaphore$")
	) then
		name = name .. " script"
	end
	if not nocap then
		name = mw.getContentLanguage():ucfirst(name)
	end
	return name
end
--[==[Returns the charset defining the script's characters, as copied from the script's data when the object was
created, or {nil} if there is none.
This can be used to search for words consisting only of this script, but see the warning in the documentation of
{countCharacters}.]==]
function Script:getCharacters()
	local charset = self.characters
	if charset then
		return charset
	end
	return nil
end
--[==[Returns the number of characters in the text that are part of this script. Returns 0 if the script's data defines
no charset.
'''Note:''' You should never assume that text consists entirely of the same script. Strings may contain spaces, punctuation and even wiki markup or HTML tags. HTML tags will skew the counts, as they contain Latin-script characters. So it's best to avoid them.]==]
function Script:countCharacters(text)
	local charset = self._rawData.characters
	if charset == nil then
		return 0
	end
	-- Wrap the charset in brackets to form a character class, so that each character of this script is one match.
	-- The second value returned by gsub is the number of substitutions made, i.e. the number of such characters.
	local _, count = ugsub(text, "[" .. charset .. "]", "")
	return count
end
-- Returns true if the `capitalized` field of the script's raw data is truthy, and false otherwise.
function Script:hasCapitalization()
	if self._rawData.capitalized then
		return true
	end
	return false
end
-- Returns false only if the `spaces` field of the script's raw data is explicitly false; a missing field counts as
-- true.
function Script:hasSpaces()
	local spaces = self._rawData.spaces
	return not (spaces == false)
end
-- Returns false only if the `translit` field of the script's raw data is explicitly false; a missing field counts as
-- true.
function Script:isTransliterated()
	local translit = self._rawData.translit
	return not (translit == false)
end
--[==[Returns {true} if the <code>sort_by_scraping</code> field of the script's raw data is truthy, and {false}
otherwise.]==]
function Script:sortByScraping()
	if self._rawData.sort_by_scraping then
		return true
	end
	return false
end
--[==[Returns the <code>direction</code> field of the script's raw data, defaulting to {"ltr"} when the data does not
specify one.]==]
function Script:getDirection()
	local direction = self._rawData.direction
	if direction then
		return direction
	end
	return "ltr"
end
--[==[Returns {true} if the script's raw data has a truthy <code>normalizationFixes</code> field, and {false}
otherwise.]==]
function Script:hasNormalizationFixes()
	if self._rawData.normalizationFixes then
		return true
	end
	return false
end
--[==[Applies the script's discouraged-sequence replacements to `text` and returns the result. Each entry of
<code>normalizationFixes.from</code> is used as a pattern, and its matches are replaced with the entry at the same
position in <code>normalizationFixes.to</code> (or removed, if <code>to</code> has no entry at that position). The text
is returned unchanged if the script has no normalization fixes or no <code>to</code> list.]==]
function Script:fixDiscouragedSequences(text)
	if not self:hasNormalizationFixes() then
		return text
	end
	local norm_fixes = self._rawData.normalizationFixes
	local to = norm_fixes.to
	if to then
		for i, from in ipairs(norm_fixes.from) do
			-- `from` and `to` are parallel lists: pick the replacement with the same index as the pattern.
			text = ugsub(text, from, to[i] or "")
		end
	end
	return text
end
do
	local combining_classes
	-- Obtain the list of default combining classes.
	local function get_combining_classes()
		combining_classes, get_combining_classes = load_data(combining_classes_module), nil
		return combining_classes
	end

	-- Returns the combining class to use for `char`: the script-specific override from `new_classes` if there is
	-- one, otherwise the default class, or nil if neither table lists the character.
	-- NOTE(review): the default table is assumed to be keyed by codepoint. If it is keyed by the character itself,
	-- index it with `char` instead - confirm against the combining classes data module.
	local function get_class(char, new_classes)
		return new_classes[char] or (combining_classes or get_combining_classes())[mw.ustring.codepoint(char)]
	end

	-- Implements a modified form of Unicode normalization for instances where there are identified deficiencies in
	-- the default Unicode combining classes. `text` is expected to be already normalized; the characters are
	-- re-sorted using the overrides in `normalizationFixes.combiningClasses`. Returns `text` unchanged if the script
	-- has no normalization fixes or no class overrides.
	local function fixNormalization(text, self)
		if not self:hasNormalizationFixes() then
			return text
		end
		local norm_fixes = self._rawData.normalizationFixes
		local new_classes = norm_fixes.combiningClasses
		-- NOTE(review): the pattern given to umatch is empty, so this guard effectively only tests `new_classes`.
		-- Presumably it was meant to test whether `text` contains any character with an overridden class, so that
		-- unaffected text can be returned early - confirm and restore the intended pattern.
		if not (new_classes and umatch(text, "")) then
			return text
		end
		local chars = explode(text)
		-- Manual sort based on new combining classes.
		-- We can't use table.sort, as it compares the first/last values in an array as a shortcut, which messes things up.
		-- This is an insertion sort: each character with a non-zero class is moved backwards past every preceding
		-- character whose class is strictly greater. Characters of equal class keep their relative order, and
		-- class 0 (or unlisted) characters are never moved or passed, as in Unicode canonical ordering.
		for i = 2, #chars do
			local char = chars[i]
			local class = get_class(char, new_classes)
			if class and class > 0 then
				local j = i
				repeat
					j = j - 1
					local prev = chars[j]
					if (get_class(prev, new_classes) or 0) <= class then
						break
					end
					chars[j], chars[j + 1] = char, prev
				until j == 1
			end
		end
		return concat(chars)
	end

	-- Each of the following normalizes `text` to the named Unicode form and then applies fixNormalization.
	function Script:toFixedNFC(text)
		return fixNormalization(toNFC(text), self)
	end

	function Script:toFixedNFD(text)
		return fixNormalization(toNFD(text), self)
	end

	function Script:toFixedNFKC(text)
		return fixNormalization(toNFKC(text), self)
	end

	function Script:toFixedNFKD(text)
		return fixNormalization(toNFKD(text), self)
	end
end
-- Adds the script-specific fields `direction`, `characters`, `parent` and `systems` to the table `data`, using the
-- corresponding getters, and returns that same table.
function Script:_additionalJSONfields(data)
	local direction = self:getDirection()
	data.direction = direction
	local characters = self:getCharacters()
	data.characters = characters
	local parent = self:getParent()
	data.parent = parent
	local systems = self:getSystemCodes()
	data.systems = systems
	return data
end
-- Hand the Script table to makePrototype from the objects module, under the name "script".
-- NOTE(review): presumably this is what supplies the methods used above but not defined in this file (e.g.
-- getCanonicalName, getCode) and makes Script usable as a metatable for method lookup - confirm against that module.
require("Module:User:Theknightwho/objects").makePrototype(Script, "script")
-- Creates a script object for `code` from its raw data record, with Script as its metatable. Returns nil if
-- `raw_data` is nil or false.
function export.makeObject(code, raw_data)
	if not raw_data then
		return nil
	end
	local object = {
		_rawData = raw_data,
		_code = code,
		characters = raw_data.characters
	}
	return setmetatable(object, Script)
end
--[==[Looks up `code` in the scripts data module and returns the corresponding script object, or {nil} if the data has
no entry for that code.
If `code` is {nil}, {nil} is returned immediately unless `disallowNil` is set. If no script is found and `paramForError`
is given, the failure is passed on to the languages error module together with `code` and `paramForError`.]==]
function export.getByCode(code, paramForError, disallowNil)
	-- Track uses of paramForError, ultimately so it can be removed, as error-handling should be done by the code
	-- that processes the parameters, not here.
	if paramForError ~= nil then
		debug_track("scripts/paramForError")
	end
	if code == nil and not disallowNil then
		return nil
	end
	-- Pass only the record for this code: makeObject returns nil when there is no record, which is what makes the
	-- "not found" handling below reachable.
	local retval = export.makeObject(code, (m_data or get_data())[code])
	if not retval and paramForError then
		languages_error(code, paramForError, "script code", nil, "not real lang")
	end
	return retval
end
-- Returns the script object whose canonical name is `name`, or nil if the name is not listed. The name is looked up
-- in the scripts-by-name data and the result is then resolved through export.getByCode (which returns nil for nil).
function export.getByCanonicalName(name)
	return export.getByCode((scripts_by_name or get_scripts_by_name())[name])
end
--[==[
Takes a codepoint or a character and finds the script code (if any) that is
appropriate for it based on the codepoint, using a precomputed data module
(see [[Module:scripts/charToScript]], which implements this function). The
data module was generated from the scripts' character patterns.
Converts the character to a codepoint. Returns a script code if the codepoint
is in the list of individual characters, or if it is in one of the defined
ranges in the 4096-character block that it belongs to, else returns "None".
]==]
-- Stub that loads the real implementation on first use: it replaces export.charToScript with the function of the
-- same name from the charToScript module, then forwards the call to it.
function export.charToScript(char)
	local implementation = require(scripts_chartoscript_module).charToScript
	export.charToScript = implementation
	return implementation(char)
end
--[==[
Returns the code for the script that has the greatest number of characters in `text`. Useful for script tagging text
that is unspecified for language. Uses [[Module:scripts/charToScript]] to determine a script code for a character
language-agnostically. Specifically, it works as follows:
Convert each character to a codepoint. Iterate the counter for the script code if the codepoint is in the list
of individual characters, or if it is in one of the defined ranges in the 4096-character block that it belongs to.
Each script has a two-part counter, for primary and secondary matches. Primary matches are when the script is the
first one listed; otherwise, it's a secondary match. When comparing scripts, first the total of both are compared
(i.e. the overall number of matches). If these are the same, the number of primary and then secondary matches are
used as tiebreakers. For example, this is used to ensure that `Grek` takes priority over `Polyt` if no characters
which exclusively match `Polyt` are found, as `Grek` is a subset of `Polyt`.
If `none_is_last_resort_only` is specified, this will never return {"None"} if any characters in `text` belong to a
script. Otherwise, it will return {"None"} if there are more characters that don't belong to a script than belong to
any individual script. (FIXME: This behavior is probably wrong, and `none_is_last_resort_only` should probably
become the default.)
]==]
-- Stub that loads the real implementation on first use: it replaces export.findBestScriptWithoutLang with the
-- function of the same name from the charToScript module, then forwards the call to it.
function export.findBestScriptWithoutLang(text, none_is_last_resort_only)
	local implementation = require(scripts_chartoscript_module).findBestScriptWithoutLang
	export.findBestScriptWithoutLang = implementation
	return implementation(text, none_is_last_resort_only)
end
return export