-- This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.
-- Error reported for this module: Lua error at line 409: attempt to index local 'overrider' (a nil value)
-- Table of functions returned at the end of the file.
local export = {}

-- Local aliases for library functions used throughout the file.
local floor = math.floor
local getCodepoint, U = mw.ustring.codepoint, mw.ustring.char

-- Full title of the current page; log() matches against it to decide whether
-- to emit a message.
local title = mw.title.getCurrentTitle().fullText
--- Build an argument validator bound to a function name and an expected type.
-- The returned closure forwards (funcName, argIndex, arg, expectType) to
-- libraryUtil.checkType.
-- @param funcName   name handed to checkType as its first argument
-- @param expectType type name handed to checkType as its fourth argument
-- @return function(argIndex, arg)
local function check(funcName, expectType)
    local function validate(argIndex, arg)
        local libraryUtil = require("libraryUtil")
        libraryUtil.checkType(funcName, argIndex, arg, expectType)
    end
    return validate
end
--- Format a message with string.format and hand it to mw.log.
-- Stored on the mw table, so it is reachable as mw.logf everywhere.
-- @param ... format string followed by its arguments
-- @return whatever mw.log returns
mw.logf = function(...)
    local message = string.format(...)
    return mw.log(message)
end
-- Minimal string buffer: the pieces live in the array part of the object and
-- the field `n` counts how many pieces have been appended.
local output_mt = {}
output_mt.__index = output_mt

--- Append one string to the buffer.
-- @param str string to append
function output_mt:insert(str)
    self.n = self.n + 1
    -- Store the piece in the next array slot; assigning to `self` itself would
    -- only rebind the local and leave the buffer empty.
    self[self.n] = str
end

--- Append the result of string.format(...) to the buffer.
function output_mt:insert_format(...)
    self:insert(string.format(...))
end

-- buffer:join(sep) is table.concat(buffer, sep).
output_mt.join = table.concat

--- Create an empty buffer.
local function Output()
    return setmetatable({ n = 0 }, output_mt)
end
--- Serialize a codepoint-to-script lookup table as Lua source text and return
-- it passed through Module:debug's highlight function.
-- Reads three parts of `val`:
--   val[i]          array of { lower, upper, script } ranges, with an optional
--                   `length` field (written as -1 when absent)
--   val.individual  map from codepoint to script code
--   val.blocks      array of { firstIndex, lastIndex, script }
-- @param val lookup table with the fields listed above
local function dump(val)
    local output = Output()
    output:insert('{\n')

    local range_format =
[[
{ 0x%05X, 0x%05X, "%s" },
]]

    local length_format = -- also close range array
[[
length = %d,
},
]]

    for i = 0, 0x10FFFF / 0x100 do
        -- Each numeric key holds the ranges for one slice of the codepoint space.
        local ranges = val[i]
        if ranges then
            -- The key is written in hex so the emitted text is a valid table
            -- constructor entry.
            output:insert_format(
[[
[0x%03X] = {
]], i)
            for _, range in ipairs(ranges) do
                output:insert_format(range_format, unpack(range))
            end
            output:insert_format(length_format, ranges.length or -1)
        end
    end

    output:insert(
[[
individual = {
]])

    -- sortedPairs keeps the emitted codepoints in a stable order.
    for codepoint, script in require "Module:table".sortedPairs(val.individual) do
        output:insert_format(
[[
[0x%05X] = "%s",
]], codepoint, script)
    end

    output:insert(
[[
},
blocks = {
]])

    for _, blockRange in ipairs(val.blocks) do
        output:insert_format(
[[
{ 0x%02X, 0x%02X, "%s" },
]], unpack(blockRange))
    end

    output:insert(
[[
},
}]])

    return require "Module:debug".highlight(table.concat(output))
end
--- Write a human-readable listing of `ranges` to the debug log via mw.log.
-- Every entry is unpacked into the format "U+XXXX-U+XXXX: name".
-- @param ranges array of tables whose unpacked values fit that format
local function printRanges(ranges)
    local line_format = '\n\tU+%04X-U+%04X: %s'
    local buffer = Output()
    buffer:insert("Ranges:")
    for _, entry in ipairs(ranges) do
        buffer:insert(string.format(line_format, unpack(entry)))
    end
    mw.log(table.concat(buffer))
end
--- Report whether a table has at least one entry.
-- Compares the first key against nil rather than testing its truthiness, so a
-- table whose only key is `false` still counts as non-empty.
-- @param t table to inspect
-- @return true if `t` has any key, false otherwise
local function hasContents(t)
    return next(t) ~= nil
end
--- Forward `message` to mw.log, but only when the current page title ends in
-- "testcases/documentation".
-- @param message value passed through to mw.log
local function log(message)
    local onTestcasesDocumentation = title:match("testcases/documentation$")
    if not onTestcasesDocumentation then
        return
    end
    mw.log(message)
end
--- Map a codepoint to the key of the 0x1000-wide slice that contains it.
-- @param codepoint number
-- @return floor(codepoint / 0x1000)
local function makeRangeKey(codepoint)
    local quotient = codepoint / 0x1000
    return floor(quotient)
end
--- Test whether `value` lies in the closed interval [lower, upper].
-- All three arguments are first validated as numbers through check().
-- @param value number to test
-- @param lower inclusive lower bound
-- @param upper inclusive upper bound
-- @return boolean
local function isInRange(value, lower, upper)
    local requireNumber = check("isInRange", "number")
    requireNumber(1, value)
    requireNumber(2, lower)
    requireNumber(3, upper)
    if not (value >= lower) then
        return false
    end
    return value <= upper
end
--- Test whether a character belongs to a character lookup table.
-- @param characterLookup table with an array part of { lower, upper } ranges
--        and optional fields `values` (set keyed by codepoint), `smallest`
--        and `largest`
-- @param character a one-character string or a codepoint number
-- @return true if the codepoint is in `values` or inside one of the ranges,
--         false otherwise
-- Raises an error for a string that is not exactly one character long and for
-- any argument that is neither a string nor a number.
local function lookupCharacter(characterLookup, character)
    local codepoint
    if type(character) == "string" then
        if mw.ustring.len(character) == 1 then
            codepoint = getCodepoint(character)
        else
            error("Character " .. character .. " has length " .. mw.ustring.len(character) .. ". It is supposed to be a single character.")
        end
    elseif type(character) == "number" then
        codepoint = character
    else
        error("Character is the wrong type: " .. type(character) .. ".")
    end

    -- Quick rejection when the overall bounds are recorded.
    if characterLookup.smallest and not isInRange(codepoint, characterLookup.smallest, characterLookup.largest) then
        return false
    end

    -- Individually listed codepoints are looked up by key.
    if characterLookup.values and characterLookup.values[codepoint] then
        return true
    end

    -- Otherwise scan the ranges, comparing against each range's two bounds.
    for _, range in ipairs(characterLookup) do
        if isInRange(codepoint, range[1], range[2]) then
            return true
        end
    end
    return false
end
--- Call `func` once for every character of `str`, in order.
-- Characters are taken with mw.ustring.sub, one position at a time.
-- Does nothing when `func` is not a function.
-- @param str  string to walk
-- @param func callback receiving one single-character string
local function forEachChar(str, func)
    if type(func) ~= "function" then
        return
    end
    for i = 1, mw.ustring.len(str) do
        -- Kept local so the loop does not create or clobber a global `char`.
        local char = mw.ustring.sub(str, i, i)
        func(char)
    end
end
-- Build a character lookup table for one script from `pattern`, its character
-- pattern string. The returned table collects { codepoint1, codepoint2 } pairs
-- in its array part and may carry the fields `values`, `smallest` and
-- `largest`.
-- NOTE(review): several statements below look as if a bracketed index or
-- character class has been lost: the "()%-()" pattern, `allValues = true`,
-- `values = true`, `item1 < item2` on tables, and the comparisons of
-- `smallest`/`largest` with `characterLookup`. They are left untouched here
-- and need to be restored from the upstream module -- TODO confirm.
function export.makeCharacterLookup(pattern)
local characterLookup = {}
local values = {}
local allValues = {}
local i = 1
-- Create ranges in which all characters belong to the script.
-- The callback returns "" for each match, so workingString keeps only the
-- text that the range pattern did not match.
local workingString = mw.ustring.gsub(
pattern,
"()%-()",
function(item1, item2)
local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
--[[
if not (codepoint1 < codepoint2) then
error("Wrong codepoint order with " .. U(codepoint1) .. " and " .. U(codepoint2) .. "!")
end
]]
table.insert(characterLookup, { codepoint1, codepoint2 })
allValues = true
allValues = true
return ""
end
)
-- Whatever is left over is visited one character at a time.
if workingString ~= "" then
workingString = mw.ustring.gsub(
workingString,
".",
function(char)
local codepoint = getCodepoint(char)
values = true
allValues = true
end
)
end
--[[
Place the tables of ranges in the Unicode order (the patterns
should already be in that order, but just to be safe).
]]
table.sort(
characterLookup,
function(item1, item2)
return item1 < item2
end)
-- presumably numKeys returns the numeric keys of allValues in sorted order;
-- verify against Module:table.
local allValuesKeys = require("Module:table").numKeys(allValues)
local smallest, largest = allValuesKeys, allValuesKeys
-- Don't create an empty values table.
if hasContents(values) then
characterLookup.values = values
end
--[[
Don't record the smallest and largest values if they're found in the
first range.
]]
if not (smallest == characterLookup and largest == characterLookup) then
characterLookup.smallest, characterLookup.largest = smallest, largest
end
return characterLookup
end
--- Build a character lookup table for every script in Module:scripts/data
-- that has a `characters` pattern and whose code contains no hyphen.
-- Scripts that share the same pattern string share one lookup table.
-- @return table mapping script code to the result of makeCharacterLookup
function export.makeAllScriptsCharacterLookup()
    local allScriptsCharacterLookup = {}
    -- Remembers which script code already produced a lookup for a pattern.
    local patternToScript = {}

    for code, data in pairs(require("Module:scripts/data")) do
        local pattern = data.characters
        if pattern and not code:find("-", 1, true) then
            -- Don't generate identical lookup table twice.
            local scriptWithPattern = patternToScript[pattern]
            if scriptWithPattern then
                allScriptsCharacterLookup[code] = allScriptsCharacterLookup[scriptWithPattern]
            else
                allScriptsCharacterLookup[code] = export.makeCharacterLookup(pattern)
            end
            patternToScript[pattern] = code
        end
    end

    return allScriptsCharacterLookup
end
-- fa-Arab → Arab-fa
--- Move the part of a script code before its first hyphen to the end.
-- Codes without a hyphen are returned unchanged.
-- @param scriptCode string such as "fa-Arab"
-- @return the rearranged code (a single value; gsub's match count is dropped)
local function switchLangSc(scriptCode)
    -- The first capture takes everything up to the first hyphen.
    return (scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1"))
end
-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
local scriptCodeReplacements = {
    polytonic = "Grek2",
    Latinx = "Latnx",
    Latf = "Latnf",
}

--- Produce the form of a script code that is used as a sort key.
-- A code listed in scriptCodeReplacements is replaced by its entry. Any other
-- code that contains no uppercase letter followed by three lowercase letters
-- gets a "~" prefix. Everything else is returned unchanged.
-- @param code script code string
-- @return the adjusted code (a single value)
local function modifyAdHocCode(code)
    local replacement = scriptCodeReplacements[code]
    if replacement then
        return replacement
    elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then
        -- Parentheses drop gsub's second return value (the match count).
        return (code:gsub("^(.+)$", "~%1"))
    else
        return code
    end
end
--- Ordering function for keys that may be numbers or script-code strings.
-- A number sorts before a string. Two strings are compared after both have
-- gone through modifyAdHocCode, switchLangSc and mw.ustring.lower. Any other
-- combination falls back to the plain `<` operator.
-- @return true when key1 should come before key2
local function keySort(key1, key2)
    local kind1, kind2 = type(key1), type(key2)
    if kind1 == "number" and kind2 == "string" then
        return true
    end
    if kind1 == "string" and kind2 == "number" then
        return false
    end
    if kind1 ~= "string" then
        return key1 < key2
    end

    local function normalize(code)
        local adjusted = modifyAdHocCode(code)
        adjusted = switchLangSc(adjusted)
        return mw.ustring.lower(adjusted)
    end

    local sortKey1, sortKey2 = normalize(key1), normalize(key2)
    return sortKey1 < sortKey2
end
--- Render a number as an uppercase hexadecimal literal with a "0x" prefix.
-- @param number value to format
-- @return string such as "0xFF"
local function hex(number)
    local template = "0x%X"
    return template:format(number)
end
--- Split the closed range [lower, upper] at multiples of `width`.
-- The result is keyed by floor(sliceLowerBound / width), which for a width of
-- 0x1000 is the same key makeRangeKey produces. Every slice is stored, rather
-- than each one replacing the previous result.
-- @param lower   inclusive lower bound
-- @param upper   inclusive upper bound
-- @param width   slice width
-- @param testing when truthy, bounds are stored as hex strings (see hex())
-- @return table mapping slice key to { sliceLower, sliceUpper }, or nil (after
--         logging) when `lower` or `upper` is missing
local function divideRange(lower, upper, width, testing)
    if not (lower and upper) then
        mw.log("divideRange failed:", lower, upper, width, testing)
        return nil
    end

    local ranges = {}
    local position = floor(lower / width)
    local start = position * width
    local i = 0
    local increment = 0

    repeat
        -- Clamp the slice to the requested bounds.
        local range1 = start + increment
        local range2 = range1 + width - 1
        if range1 < lower then
            range1 = lower
        end
        if range2 > upper then
            range2 = upper
        end

        if testing then
            range1, range2 = hex(range1), hex(range2)
        end

        ranges[position + i] = { range1, range2 }

        i = i + 1
        increment = i * width
    until start + increment > upper

    return ranges
end
--- Debug entry point: divide the fixed range 0x2A700-0x2B73F into 0x1000-wide
-- slices in testing mode and return the two bounds in hex followed by dump()
-- of the result.
-- @param frame not used
function export.showDividedRange(frame)
    local lower, higher, width = 0x2A700, 0x2B73F, 0x1000
    local dividedRange = divideRange(lower, higher, width, true)
    local heading = hex(lower) .. ", " .. hex(higher)
    return heading .. dump(dividedRange)
end
-- Scripts that consist entirely of characters from another script.
-- NOTE(review): the key of every entry below is missing (each line reads
-- `= true;`), so this table constructor does not parse as written. The seven
-- script codes need to be restored from the upstream module -- TODO confirm.
local scriptBlacklist = {
= true;
= true;
= true;
= true;
= true;
= true;
= true;
}
--- Ordering function for { lower, upper, script } ranges.
-- Ranges are ordered by lower bound. tonumber() is applied because in testing
-- mode the bounds are hex strings. Ranges with equal lower bounds are ordered
-- by script code through keySort.
-- @return true when range1 should come before range2
local function sortRange(range1, range2)
    local number1, number2 = tonumber(range1[1]), tonumber(range2[1])
    if number1 == number2 then
        return keySort(range1[3], range2[3])
    else
        return number1 < number2
    end
end
--- Format a { lower, upper, script } range for log messages.
-- @param range          table holding lower bound, upper bound and script code
-- @param hideScriptName when truthy, only the bounds are printed
-- @return "U+XXXX-U+XXXX" or "script (U+XXXX-U+XXXX)"
local function printScriptRange(range, hideScriptName)
    local lower, upper, script = range[1], range[2], range[3]
    if hideScriptName then
        return ("U+%04X-U+%04X"):format(lower, upper)
    else
        return ("%s (U+%04X-U+%04X)"):format(script, lower, upper)
    end
end
-- Precedence between scripts whose ranges overlap: for each entry, the script
-- named by the key overrides the script named by the value.
local overrides = {
    ["Beng"] = "as-Beng",
    ["Cyrl"] = "Cyrs",
    ["Grek"] = "polytonic",
    ["Latn"] = "Latinx",
}
-- Walk `ranges` and resolve overlaps between an entry and its predecessor,
-- consulting the `overrides` table to decide which of the two wins. Depending
-- on the branch taken, the losing entry is removed, has its lower bound
-- replaced, or is replaced by newly inserted entries; some branches write to
-- `individual` instead.
-- NOTE(review): the bracketed index expressions in this function appear to be
-- missing (e.g. `while ranges do`, `overrides] == prev`,
-- `individual] = overridden`), so the body does not parse as written and has
-- to be restored from the upstream module -- TODO confirm.
-- NOTE(review): `range` is assigned without `local`, and `individual` is not
-- declared in this function or earlier at file level.
-- NOTE(review): when neither `overrides` test matches, `overrider` and
-- `overridden` stay nil, yet the code after the logging branch still uses
-- them. This is presumably the "attempt to index local 'overrider' (a nil
-- value)" error quoted at the top of the file -- verify.
local function fixRangeOverlaps(ranges)
local prev
local i = 1
while ranges do
range = ranges
prev = ranges
if prev and (range <= prev or range <= prev) then
-- mw.logf("%s in conflict with %s",
-- printScriptRange(prev), printScriptRange(range))
-- Decide which of the two entries takes precedence.
local overrider, overridden
if overrides] == prev then
overrider, overridden = range, prev
elseif overrides] == range then
overrider, overridden = prev, range
end
if overrider and overridden then
mw.logf("%s overrides %s", printScriptRange(overrider),
printScriptRange(overridden))
else
-- No precedence is defined for this pair; only a message is logged.
mw.logf("Should %s override %s or the other way around?",
printScriptRange(prev), printScriptRange(range))
end
if overrider <= overridden then -- low end of overridden is inside overrider
if overridden <= overrider then -- overridden entirely within overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
-- Removing the previous entry shifts the current one down by one.
if overridden == prev then
i = i - 1
end
else -- upper part of overridden outside of overrider
if overridden - overrider == 1 then -- one codepoint of overridden is outside overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
if overridden == prev then
i = i - 1
end
individual] = overridden
else
overridden = overrider + 1
end
end
else -- overridden < overrider: low end of overridden is outside overrider
-- single codepoint at low end of overridden is outside overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
if overridden == prev then
i = i - 1
end
if overrider - overridden == 1 then
individual] = overridden
else -- multiple codepoints at low end of overridden are outside overrider
ranges:insert(i,
{ overridden, overrider - 1, overridden })
i = i + 1
end
if overrider < overridden then -- high end of overridden is outside overrider
-- single codepoint at high end of overridden is outside of overrider
if overridden - overrider == 1 then
individual] = overridden
else
ranges:insert(i,
{ overrider + 1, overridden, overridden })
i = i + 1
end
end
end
end
i = i + 1
end
end
--- Log every pair of neighbouring ranges that overlap.
-- Two neighbours overlap when the upper bound of the earlier one reaches the
-- lower bound of the later one. Nothing is modified.
-- @param ranges array of { lower, upper, script } tables
local function checkRangeOverlaps(ranges)
    local prev
    for _, range in ipairs(ranges) do
        if prev and prev[2] >= range[1] then
            mw.logf("%s overlaps with %s",
                printScriptRange(prev), printScriptRange(range))
        end
        prev = range
    end
end
-- Build the codepoint-to-script lookup table from the per-script character
-- lookups. The returned `output` table has numeric keys holding arrays of
-- ranges (each given a `length` field), plus the fields `individual` and
-- `blocks`.
-- `testing` is passed straight through to divideRange.
-- NOTE(review): many bracketed index expressions in this function appear to be
-- missing (e.g. `if not scriptBlacklist then`, `divideRange(value, value, ...)`,
-- `while ranges do`, `individual = scriptCode`, `output = ranges`). The
-- statements are left untouched here and need to be restored from the upstream
-- module -- TODO confirm.
-- NOTE(review): `range` inside the merge loop is assigned without `local`.
local function makeCodepointToScriptLookup(testing)
local output = {}
-- Methods shared by the range arrays created through the metatable below:
-- `insert` with logging, and `remove`.
local ranges_mt = {
insert = function (self, i, value)
if value ~= nil then
if self < value then
i = i + 1
end
mw.logf("Inserting %s below %s",
printScriptRange(value), printScriptRange(self))
table.insert(self, i, value)
else
-- Called with a single argument: `i` is the value, and it is appended.
value = i
table.insert(self, value)
end
end,
remove = table.remove,
}
ranges_mt.__index = ranges_mt
-- Reading a missing key of `output` creates and returns a new table that
-- carries ranges_mt (presumably also stored under that key -- verify).
setmetatable(output,
{
__index = function (self, key)
local val = setmetatable({}, ranges_mt)
self = val
return val
end,
})
output.individual = {}
local individual = output.individual
local rangeStrings = {}
local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
-- Pass 1: visit the scripts in keySort order and record their ranges (cut by
-- divideRange with a width of 0x1000) and their individual codepoints,
-- logging a message whenever something has already been recorded.
for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
if not scriptBlacklist then
for key, value in ipairs(lookup) do
if type(value) == "table" then
local newRanges = divideRange(value, value, 0x1000, testing)
if newRanges then
for position, newRange in pairs(newRanges) do
local rangeString = newRange .. newRange
if rangeStrings then
mw.logf("The range U+%04X-U+%04X is already "
.. "recorded as belonging to the script "
.. "code %s, conflicting with %s.",
newRange, newRange, rangeStrings, scriptCode)
else
rangeStrings = scriptCode
output:insert({ newRange, newRange, scriptCode })
end
end
end
end
end
if lookup.values then
for codepoint in pairs(lookup.values) do
if individual then
mw.logf("The codepoint %s is already recorded as " ..
"belonging to the script code %s, conflicting with %s.",
hex(codepoint), individual, scriptCode)
else
individual = scriptCode
end
end
end
end
end
-- Pass 2: within each numeric key, merge entries with their predecessor where
-- the condition below holds, then sort the array with sortRange.
for position, ranges in pairs(output) do
if type(position) == "number" then
local prevRange
local i = 1
while ranges do
range = ranges
if prevRange and range == prevRange and prevRange == range - 1 then
mw.logf("Merged %s with %s",
printScriptRange(range), printScriptRange(prevRange))
prevRange = range
table.remove(ranges, i)
i = i - 1 -- to compensate for removed element
end
prevRange = range
i = i + 1
end
table.sort(ranges, sortRange)
end
end
-- Pass 3: turn runs of codepoints in the individual map that share a script
-- into ranges, provided the run has at least minimumCodepointRange codepoints
-- and both of its ends have the same range key.
local individualCodepoints = require "Module:table".numKeys(individual)
local minimumCodepointRange = 3
local i = 1
while individualCodepoints do
local codepoint = individualCodepoints
local script = individual
if not script then
error(("No script for U+%04X"):format(codepoint))
end
local startOfRun = codepoint
while individual == script do
codepoint = codepoint + 1
i = i + 1
end
if codepoint - startOfRun + 1 >= minimumCodepointRange
and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then
for j = startOfRun, codepoint do
individual = nil
end
local rangeKey = makeRangeKey(startOfRun)
local ranges = output
if not ranges then
ranges = {}
output = ranges
end
ranges:insert({ startOfRun, codepoint, script })
mw.logf("Added range %s from a run in individual map",
printScriptRange { startOfRun, codepoint, script })
table.sort(ranges, sortRange)
end
i = i + 1
end
-- Pass 4: resolve overlaps inside each numeric key.
for position, ranges in pairs(output) do
if type(position) == "number" then
if ranges then
fixRangeOverlaps(ranges)
end
end
end
-- Add length field to range arrays and check that there are no overlaps
-- between ranges.
-- The same loop fills output.blocks with entries of the form
-- { index - 1, index, script }, created when a key and the one before it
-- contain only ranges of the same script.
output.blocks = {}
local prevScript, blockRange
for index, ranges in pairs(output) do
if type(index) == "number" then
ranges.length = #ranges
if ranges then
checkRangeOverlaps(ranges)
end
local firstScript = ranges
if not ranges or require "Module:fun".all(
function (range)
return range == firstScript
end,
ranges) then -- All ranges contain the same script.
if prevScript and firstScript == prevScript then
if not blockRange then
blockRange = { index - 1, index, prevScript }
table.insert(output.blocks, blockRange)
else
blockRange = index
end
else
blockRange = nil
prevScript = firstScript
end
else
prevScript = nil
end
end
end
-- Drop the auto-creating metatable so the returned table is plain.
setmetatable(output, nil)
return output
end
--[[
Binary search: more efficient for the longer lists of codepoint ranges than
for the shorter ones.
]]
--- Find the range in `ranges` that contains `value`.
-- @param ranges array of { lower, upper, ... } tables; assumed to be sorted by
--        lower bound and free of overlaps
-- @param value  number to look for
-- @return the matching range table, or nil when no range contains `value`
local function binarySearch(ranges, value)
    local iStart = 1
    -- Can't use # because table is loaded by mw.loadData.
    -- Prefer the `length` field stored next to the ranges when the table was
    -- generated; fall back to counting when it is absent or negative.
    local iEnd = ranges.length
    if not iEnd or iEnd < 0 then
        iEnd = require("Module:table").size(ranges)
    end

    while iStart <= iEnd do
        local iMid = floor((iStart + iEnd) / 2)
        local range = ranges[iMid]
        -- Return matching range. Assumes there are no duplicates.
        if isInRange(value, range[1], range[2]) then
            return range
        elseif value < range[1] then
            iEnd = iMid - 1
        else
            iStart = iMid + 1
        end
    end
    return nil
end
--- Scan `ranges` from the start and return the script of the first range that
-- contains `number`.
-- The scan stops early as soon as a range starts above `number`, so it assumes
-- the ranges are sorted by lower bound.
-- @param number value to look up (a codepoint or a range key)
-- @param ranges array of { lower, upper, script } tables
-- @return the script code, or nil when nothing matches
local function lookupInOrder(number, ranges)
    for _, range in ipairs(ranges) do
        if isInRange(number, range[1], range[2]) then
            return range[3]
        end
        if number < range[1] then
            return nil
        end
    end
    return nil
end
-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}

--- Return the script code recorded for a character, or nil if none is found.
-- Lookup order: the `individual` map, the cache of ranges matched earlier,
-- the `blocks` list (by range key), and finally a binary search in the range
-- list stored under the character's range key.
-- @param char string whose first codepoint is looked up
-- @return script code string, or nil
function export.charToScript(char)
    local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
    local codepoint = mw.ustring.codepoint(char)

    local individualMatch = lookup.individual[codepoint]
    if individualMatch then
        return individualMatch
    end

    local script = lookupInOrder(codepoint, rangesCache)
    if script then
        return script
    end

    local index = makeRangeKey(codepoint)
    script = lookupInOrder(index, lookup.blocks)
    if script then
        return script
    end

    -- A range key with no recorded ranges has no entry in the data table.
    local ranges = lookup[index]
    if ranges then
        local range = binarySearch(ranges, codepoint)
        if range then
            table.insert(rangesCache, range)
            -- Keep the cache sorted; lookupInOrder relies on that order.
            table.sort(rangesCache, sortRange)
            return range[3]
        end
    end

    return nil
end
--- List the script found for every character of a string.
-- NOTE: a second definition of export.show at the end of the file replaces
-- this one when the module is loaded.
-- @param frame frame object; frame.args[1] is the text to analyse, with a
--        built-in sample used when it is absent
-- @return the text, a colon, and the comma-separated script codes ("nil" for
--         characters without a script)
function export.show(frame)
    local str = frame.args[1] or "ABCD一丨丶丿乙亅"
    local result = {}

    forEachChar(
        str,
        function(char)
            table.insert(result, tostring(export.charToScript(char)))
        end
    )

    return str .. ": " .. table.concat(result, ", ")
end
--- Entry point: generate the codepoint-to-script lookup table and return it
-- as dumped text.
-- @param frame not used
export.show = function(frame)
    local lookup = makeCodepointToScriptLookup()
    return dump(lookup)
end
return export