-- Table of functions exported by this module (returned at the end of the file).
local subexport = {}
-- Alias for mw.ustring.char.
-- NOTE(review): the name `cp` suggests "codepoint", but it is bound to the
-- codepoint-to-string function - confirm the intended name.
local cp = mw.ustring.char
-- Alias for mw.text.split.
local split = mw.text.split
--[=[
Takes a codepoint or a character and finds the script code(s) (if any) that are appropriate for it based on the codepoint, using the data module [[Module:User:Theknightwho/recognition data]].
By default, it returns only the first script code if there are multiple matches (i.e. the code we take to be the default). If `all_scripts` is set, then a table of all matching codes is returned.
]=]
-- Module-level cache table used by charToScript.
local memo = {}
-- Read-only lookup data loaded with mw.loadData; charToScript walks it one byte at a time.
local charToScriptData = mw.loadData("Module:User:Theknightwho/recognition data")
-- Looks up the script code(s) for a single character by walking the
-- recognition data one byte at a time.
--
-- @param char         a string holding exactly one UTF-8 character, or a
--                     codepoint number
-- @param all_scripts  if truthy, return the table of every matching script
--                     code; otherwise return only the first (default) code
-- @return a script code string, or a table of script codes when `all_scripts`
--         is set. "None" / {"None"} is returned when nothing matches.
--         The returned table is the cached one: callers must not modify it.
-- Raises an error if `char` is neither a string nor a number, or is a string
-- containing more than one character.
function subexport.charToScript(char, all_scripts)
	local t = type(char)
	if t == "string" then
		-- Two bytes that are not UTF-8 continuation bytes (\128-\191) mean the
		-- string holds more than one character.
		if char:find("[^\128-\191][\128-\191]*[^\128-\191]") then
			error("bad argument #1 to 'charToScript' (expected a single character)")
		end
	elseif t == "number" then
		-- `cp` is this file's alias for mw.ustring.char (codepoint -> string).
		char = cp(char)
	else
		error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)")
			:format(t))
	end

	-- The cache always stores the complete list of codes, keyed by character,
	-- so a call with `all_scripts` and a call without it cannot poison each
	-- other's results.
	local codes = memo[char]
	if not codes then
		local data = charToScriptData
		for byte in char:gmatch(".") do
			-- Exact single-byte key first.
			local new_data = data[byte]
			if not new_data then
				-- NOTE(review): keys longer than one byte are assumed to be
				-- byte-range specifications usable inside a pattern character
				-- class (e.g. "\128-\191") - confirm against the data module.
				for k, v in pairs(data) do
					if #k > 1 and byte:find("[" .. k .. "]") then
						new_data = v
						break
					end
				end
			end
			if not new_data then
				codes = {"None"}
				break
			elseif type(new_data) == "string" then
				-- A string leaf is a comma-separated list of script codes.
				codes = split(new_data, "%s*,%s*")
				break
			else
				-- Descend one level for the next byte.
				data = new_data
			end
		end
		-- The bytes ran out before a leaf was reached (this includes the empty
		-- string): treat that as "no script" instead of failing later on nil.
		codes = codes or {"None"}
		memo[char] = codes
	end

	if all_scripts then
		return codes
	end
	return codes[1]
end
--[=[
Finds the best script for a string in a language-agnostic way.
Converts each character to a codepoint. Increments the counter for the script code if the codepoint is in the list of individual characters, or if it is in one of the defined ranges in the 4096-character block that it belongs to.
Each script has a two-part counter, for primary and secondary matches. Primary matches are when the script is the first one listed; otherwise, it's a secondary match. When comparing scripts, first the total of both are compared (i.e. the overall number of matches). If these are the same, the number of primary and then secondary matches are used as tiebreakers. For example, this is used to ensure that `Grek` takes priority over `polytonic` if no characters which exclusively match `polytonic` are found, as `Grek` is a subset of `polytonic`.
]=]
-- Finds the best-matching script for `text` without reference to a language.
--
-- Every character of the plain text is passed to subexport.charToScript with
-- `all_scripts` set. Each script code returned gets a two-part counter:
-- slot 1 counts primary matches (the code was listed first for the character)
-- and slot 2 counts secondary matches (listed in any later position).
--
-- @param text  the text to examine; it is first passed through
--              get_plaintext from Module:utilities
-- @return the result of getByCode (Module:scripts) for the winning script
--         code, or for "None" if no character was counted
function subexport.findBestScriptWithoutLang(text)
	local min = math.min

	-- True if counter `a` ranks strictly below counter `b`. Totals are
	-- compared first; primary matches break ties. If total and primary are
	-- both equal the secondary counts must be equal too, so no further
	-- comparison is needed.
	local function ranks_below(a, b)
		local total_a, total_b = a[1] + a[2], b[1] + b[2]
		if total_a ~= total_b then
			return total_a < total_b
		end
		return a[1] < b[1]
	end

	-- Script code -> {primary matches, secondary matches}.
	local scripts = {}

	text = require("Module:utilities").get_plaintext(text)

	-- One UTF-8 character per iteration: a lead byte followed by any number of
	-- continuation bytes (%z is the Lua 5.1 class for the NUL byte).
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		for i, script in ipairs(subexport.charToScript(character, true)) do
			local counter = scripts[script]
			if not counter then
				counter = {0, 0}
				scripts[script] = counter
			end
			-- Position 1 is a primary match; every later position is secondary.
			local weight = min(i, 2)
			counter[weight] = counter[weight] + 1
		end
	end

	local bestScript
	local greatestCount
	for script, count in pairs(scripts) do
		if (not greatestCount) or ranks_below(greatestCount, count) then
			bestScript = script
			greatestCount = count
		end
	end

	bestScript = bestScript or "None"
	return require("Module:scripts").getByCode(bestScript)
end
-- Export the module's functions.
return subexport