Module:User:Erutuon/script recognition

Hello, you have come here looking for the meaning of the word Module:User:Erutuon/script recognition. In DICTIOUS you will not only get to know all the dictionary meanings for the word Module:User:Erutuon/script recognition, but we will also tell you about its etymology, its characteristics and you will know how to say Module:User:Erutuon/script recognition in singular and plural. Everything you need to know about the word Module:User:Erutuon/script recognition you have here. The definition of the word Module:User:Erutuon/script recognition will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of Module:User:Erutuon/script recognition, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.

This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.

Lua error at line 409: attempt to index local 'overrider' (a nil value)


-- Table of functions exported by this module; returned at the end of the file.
local export = {}

-- Local aliases for library functions used throughout the module.
local getCodepoint = mw.ustring.codepoint
local U = mw.ustring.char
local floor = math.floor

-- Full title of the page currently being rendered; read by log() below.
local title = mw.title.getCurrentTitle().fullText

-- Returns a validator bound to funcName and expectType. Calling the validator
-- with (argIndex, arg) forwards all four values to libraryUtil.checkType.
-- libraryUtil is only loaded when the validator is actually called.
local function check(funcName, expectType)
	local function validator(argIndex, arg)
		local libraryUtil = require("libraryUtil")
		libraryUtil.checkType(funcName, argIndex, arg, expectType)
	end
	return validator
end

-- Formats the arguments with string.format and passes the result to mw.log.
mw.logf = function(...)
	local message = string.format(...)
	return mw.log(message)
end

--[[
	Minimal string buffer. Pieces are stored in the array part of the table
	and counted in the field "n".
]]
local output_mt = {}

-- Appends str as the next piece of the buffer.
function output_mt:insert(str)
	self.n = self.n + 1
	-- Store the piece in the buffer; assigning to "self" itself would only
	-- rebind the local variable and silently drop the string.
	self[self.n] = str
end

-- Appends string.format(...) to the buffer.
function output_mt:insert_format(...)
	self:insert(string.format(...))
end

-- Concatenates the stored pieces; takes the same arguments as table.concat.
output_mt.join = table.concat

output_mt.__index = output_mt

-- Creates an empty buffer.
local function Output()
	return setmetatable({ n = 0 }, output_mt)
end

--[[
	Serializes a lookup table as Lua source and returns it syntax-highlighted
	by Module:debug.

	val is expected to have:
	* numeric keys holding arrays of { lower, upper, script } plus an optional
	  "length" field (-1 is written when the field is absent);
	* an optional "individual" map from codepoint to script;
	* an optional "blocks" array of { first, last, script }.

	NOTE(review): the exact number formats of the emitted table keys were lost
	from the source this was restored from; "[0x%03X]" and "[0x%05X]" are
	reconstructions that produce valid Lua — confirm against the module's
	previous output if byte-identical data is required.
]]
local function dump(val)
	local output = Output()
	
	output:insert('{\n')
	local range_format =
[[
		{ 0x%05X, 0x%05X, "%s" },
]]
	local length_format = -- also close range array
[[
		length = %d,
	},
]]
	-- The upper bound is larger than the highest key produced by makeRangeKey;
	-- keys without data are simply skipped.
	for i = 0, 0x10FFFF / 0x100 do
		local ranges = val[i]
		if ranges then
			output:insert_format(
[[
	[0x%03X] = {
]], i)
			for _, range in ipairs(ranges) do
				output:insert_format(range_format, unpack(range))
			end
			output:insert_format(length_format, ranges.length or -1)
		end
	end
	
	output:insert
[[

	individual = {
]]
	
	-- Tolerate tables that only contain ranges.
	for codepoint, script in require "Module:table".sortedPairs(val.individual or {}) do
		output:insert_format(
[[
		[0x%05X] = "%s",
]],		codepoint, script)
	end
	output:insert [[
	},

	blocks = {
]]
	
	for _, blockRange in ipairs(val.blocks or {}) do
		output:insert_format(
[[
		{ 0x%02X, 0x%02X, "%s" },
]],		unpack(blockRange))
	end
	
	output:insert
[[
	},
}]]
	
	return require "Module:debug".highlight(output:join())
end

-- Writes the ranges to the debug log under a "Ranges:" heading, one line per
-- range, formatted from the range's first three elements.
local function printRanges(ranges)
	local buffer = Output()
	buffer:insert("Ranges:")
	for _, entry in ipairs(ranges) do
		local line = string.format('\n\tU+%04X-U+%04X: %s', unpack(entry))
		buffer:insert(line)
	end
	mw.log(buffer:join())
end

-- Returns true if next(t) yields a truthy key, false otherwise.
local function hasContents(t)
	local firstKey = next(t)
	return not not firstKey
end

-- Passes message to mw.log, but only when the current page title ends in
-- "testcases/documentation".
local function log(message)
	local onTestcasesDocumentation = title:match("testcases/documentation$")
	if not onTestcasesDocumentation then
		return
	end
	mw.log(message)
end

-- Returns the key of the group of 0x1000 codepoints that contains codepoint.
local function makeRangeKey(codepoint)
	local groupWidth = 0x1000
	return floor(codepoint / groupWidth)
end

-- Returns true if lower <= value <= upper. All three arguments are first
-- validated as numbers through the validator returned by check().
local function isInRange(value, lower, upper)
	local checkNumber = check("isInRange", "number")
	checkNumber(1, value)
	checkNumber(2, lower)
	checkNumber(3, upper)
	
	if value >= lower then
		return value <= upper
	end
	return false
end

--[[
	Tests whether a character is covered by a lookup table such as the one
	built by export.makeCharacterLookup.

	characterLookup: array of { lower, upper } codepoint ranges, optionally
		with a "values" set of single codepoints and "smallest"/"largest"
		bounds used as a quick rejection test.
	character: a one-character string or a codepoint number.

	Returns true or false. Raises an error if character is a string that is
	not exactly one character long, or is neither a string nor a number.
]]
local function lookupCharacter(characterLookup, character)
	local codepoint
	local characterType = type(character)
	if characterType == "string" then
		local length = mw.ustring.len(character)
		if length == 1 then
			codepoint = getCodepoint(character)
		else
			-- tostring: mw.ustring.len may not return a number for invalid input.
			error("Character " .. character .. " has length " .. tostring(length) .. ". It is supposed to be a single character.")
		end
	elseif characterType == "number" then
		codepoint = character
	else
		error("Character is the wrong type: " .. characterType .. ".")
	end
	
	if characterLookup.smallest and not isInRange(codepoint, characterLookup.smallest, characterLookup.largest) then
		return false
	elseif characterLookup.values and characterLookup.values[codepoint] then
		-- The set must be indexed by the codepoint; testing only that the
		-- set exists would accept every character.
		return true
	else
		for _, range in ipairs(characterLookup) do
			if isInRange(codepoint, range[1], range[2]) then
				return true
			end
		end
	end
	
	return false
end

-- Calls func once for every character of str, in order, passing the character
-- as a string. Does nothing if func is not a function.
local function forEachChar(str, func)
	if type(func) == "function" then
		for i = 1, mw.ustring.len(str) do
			-- Declared local so that the loop does not create a global "char".
			local char = mw.ustring.sub(str, i, i)
			func(char)
		end
	end
end

--[[
	Converts a character-set pattern (the contents of a Lua character class,
	such as "A-Za-z") into a lookup table.

	The returned table contains:
	* an array of { lower, upper } codepoint pairs, one per "x-y" range in the
	  pattern, sorted by lower bound;
	* values: a set (codepoint -> true) of the characters that are not part
	  of a range; omitted when there are none;
	* smallest, largest: the lowest and highest codepoints mentioned in the
	  pattern; omitted when they coincide with the bounds of the first range.
]]
function export.makeCharacterLookup(pattern)
	local characterLookup = {}
	local values = {}
	local allValues = {}
	
	-- Create ranges in which all characters belong to the script.
	-- NOTE(review): the character classes of this pattern were lost from the
	-- source this was restored from; "[^%-]" (any character except a hyphen)
	-- is a reconstruction — confirm against the original module.
	local workingString = mw.ustring.gsub(
		pattern,
		"([^%-])%-([^%-])",
		function(item1, item2)
			local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
			--[[
			if not (codepoint1 < codepoint2) then
				error("Wrong codepoint order with " .. U(codepoint1) .. " and " .. U(codepoint2) .. "!")
			end
			]]
			table.insert(characterLookup, { codepoint1, codepoint2 })
			allValues[codepoint1] = true
			allValues[codepoint2] = true
			return ""
		end
	)
	-- Whatever is left after removing the ranges is a list of single characters.
	if workingString ~= "" then
		mw.ustring.gsub(
			workingString,
			".",
			function(char)
				local codepoint = getCodepoint(char)
				values[codepoint] = true
				allValues[codepoint] = true
			end
		)
	end
	
	--[[
		Place the tables of ranges in the Unicode order (the patterns
		should already be in that order, but just to be safe).
	]]
	table.sort(
		characterLookup,
		function(item1, item2)
			return item1[1] < item2[1]
		end)
	
	local allValuesKeys = require("Module:table").numKeys(allValues)
	
	local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys]
	
	-- Don't create an empty values table.
	if hasContents(values) then
		characterLookup.values = values
	end
	
	--[[
		Don't record the smallest and largest values if they're found in the
		first range. A pattern without any range has no first range, so the
		bounds are always recorded in that case.
	]]
	local firstRange = characterLookup[1]
	if not (firstRange and smallest == firstRange[1] and largest == firstRange[2]) then
		characterLookup.smallest, characterLookup.largest = smallest, largest
	end
	
	return characterLookup
end

--[[
	Builds a character lookup table (see export.makeCharacterLookup) for every
	script in Module:scripts/data that has a "characters" pattern and whose
	code does not contain a hyphen.

	Returns a map from script code to lookup table. Scripts with an identical
	pattern share one lookup table.
]]
function export.makeAllScriptsCharacterLookup()
	local allScriptsCharacterLookup = {}
	-- Maps a pattern to a script code whose lookup was already built from it.
	local patternToScript = {}
	for code, data in pairs(require("Module:scripts/data")) do
		if not code:find("-", 1, true) then
			local characters = data.characters
			if characters then
				-- Don't generate identical lookup table twice.
				local scriptWithPattern = patternToScript[characters]
				if scriptWithPattern then
					allScriptsCharacterLookup[code] = allScriptsCharacterLookup[scriptWithPattern]
				else
					allScriptsCharacterLookup[code] = export.makeCharacterLookup(characters)
				end
				patternToScript[characters] = code
			end
		end
	end
	return allScriptsCharacterLookup
end

-- fa-Arab → Arab-fa
-- Moves everything before the first hyphen to the end of the code. Codes
-- without a hyphen are returned unchanged. As with string.gsub, the number of
-- substitutions is returned as a second value.
local function switchLangSc(scriptCode)
	-- "[^-]+" matches the part before the first hyphen; a bare "+" inside the
	-- capture would only ever match a literal plus sign.
	return scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1")
end

-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
local scriptCodeReplacements = {
	polytonic = "Grek2",
	Latinx = "Latnx",
	Latf = "Latnf",
}

-- Returns the code to sort by in place of a script code: the replacement from
-- scriptCodeReplacements if there is one; the code prefixed with "~" if it
-- contains neither an uppercase letter followed by three lowercase letters
-- nor three lowercase letters, a hyphen and such a group; otherwise the code
-- unchanged.
local function modifyAdHocCode(code)
	-- Look up this particular code; the table itself is always truthy.
	local replacement = scriptCodeReplacements[code]
	if replacement then
		return replacement
	elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then
		return code:gsub("^(.+)$", "~%1")
	else
		return code
	end
end
	
-- Comparison function for sorting keys. A number sorts before a string. When
-- the first key is a string and the second is not a number, both keys are
-- passed through modifyAdHocCode and switchLangSc and compared in lowercase.
-- Anything else is compared with "<".
local function keySort(key1, key2)
	local kind1, kind2 = type(key1), type(key2)
	
	if kind1 == "string" then
		if kind2 == "number" then
			return false
		end
		local code1, code2 = modifyAdHocCode(key1), modifyAdHocCode(key2)
		code1, code2 = switchLangSc(code1), switchLangSc(code2)
		local folded1 = mw.ustring.lower(code1)
		local folded2 = mw.ustring.lower(code2)
		return folded1 < folded2
	end
	
	if kind1 == "number" and kind2 == "string" then
		return true
	end
	
	return key1 < key2
end

-- Formats a number as uppercase hexadecimal with a "0x" prefix.
local function hex(number)
	local template = "0x%X"
	return template:format(number)
end

--[[
	Splits the range lower..upper at every multiple of width.

	Returns a table in which the key floor(x / width) holds the pair
	{ first, last } for the part of the range that falls between
	key * width and (key + 1) * width - 1. If testing is true, the bounds are
	returned as hexadecimal strings (see hex) instead of numbers.

	Returns nil, after logging, if lower or upper is missing.
]]
local function divideRange(lower, upper, width, testing)
	if not (lower and upper) then
		mw.log("divideRange failed:", lower, upper, width, testing)
		return nil
	end
	
	local ranges = {}
	
	local position = floor(lower / width)
	local start = position * width
	
	local i = 0
	repeat
		local range1 = start + i * width
		local range2 = range1 + width - 1
		
		-- Clamp the first and last piece to the requested range.
		if range1 < lower then
			range1 = lower
		end
		
		if range2 > upper then
			range2 = upper
		end
		
		if testing then
			range1, range2 = hex(range1), hex(range2)
		end
		
		-- Store each piece under the key of the group it falls into;
		-- assigning to "ranges" itself would discard all earlier pieces.
		ranges[position + i] = { range1, range2 }
		
		i = i + 1
	until
		start + i * width > upper
	
	return ranges
end

-- Demonstration entry point: divides the fixed range 0x2A700-0x2B73F into
-- pieces of width 0x1000 (in testing mode) and returns the two bounds in
-- hexadecimal followed by the dump of the result. frame is not used.
function export.showDividedRange(frame)
	local lowerBound, upperBound, pieceWidth = 0x2A700, 0x2B73F, 0x1000
	local pieces = divideRange(lowerBound, upperBound, pieceWidth, true)
	local heading = hex(lowerBound) .. ", " .. hex(upperBound)
	return heading .. dump(pieces)
end

-- Scripts that consist entirely of characters from another script.
-- makeCodepointToScriptLookup tests this table before adding a script's
-- ranges and individual characters to the lookup.
-- NOTE(review): every entry below is missing its key (presumably a bracketed
-- script code that was lost when this file was extracted), so the table
-- constructor does not parse as written. The seven keys cannot be recovered
-- from this file; restore them from the original module.
local scriptBlacklist = {
			= true;
			= true;
			= true;
			= true;
			= true;
			= true;
			= true;
}

-- Comparison function for arrays of { lower, upper, script }: orders ranges
-- by lower bound, and ranges with the same lower bound by script code (see
-- keySort). The lower bound may be a number or a numeric string.
local function sortRange(range1, range2)
	-- Compare the lower bounds; tonumber() of the range tables themselves is
	-- always nil.
	local number1, number2 = tonumber(range1[1]), tonumber(range2[1])
	if number1 == number2 then
		return keySort(range1[3], range2[3])
	else
		return number1 < number2
	end
end

-- Formats a range { lower, upper, script } as "script (U+XXXX-U+XXXX)", or as
-- just "U+XXXX-U+XXXX" if hideScriptName is true.
local function printScriptRange(range, hideScriptName)
	-- The format arguments are the elements of the range, not the table itself.
	if hideScriptName then
		return ("U+%04X-U+%04X"):format(range[1], range[2])
	else
		return ("%s (U+%04X-U+%04X)"):format(range[3], range[1], range[2])
	end
end
	
-- When there is overlap between ranges belonging to two different scripts,
-- the key in this table overrides the value.
local overrides = {
	Beng = "as-Beng",
	Cyrl = "Cyrs",
	Grek = "polytonic",
	Latn = "Latinx",
}

--[[
	Resolves overlaps between neighbouring ranges, in place.

	ranges: array of { lower, upper, script }, expected to be sorted by lower
		bound.
	individual: optional map from codepoint to script. When given, a single
		codepoint left over from an overridden range is stored here instead
		of being kept as a one-codepoint range.

	For each pair of neighbours that overlap, the "overrides" table decides
	which script wins. The losing range is removed and the parts of it that
	lie outside the winning range are put back. If the table has no rule for
	the pair, the conflict is only logged and both ranges are left untouched.
]]
local function fixRangeOverlaps(ranges, individual)
	-- Inserts newRange at the first position at or after startIndex that
	-- keeps the array ordered by lower bound.
	local function insertSorted(startIndex, newRange)
		local position = startIndex
		while ranges[position] and ranges[position][1] < newRange[1] do
			position = position + 1
		end
		table.insert(ranges, position, newRange)
	end
	
	-- Keeps the codepoints lower..upper assigned to script.
	local function keep(startIndex, lower, upper, script)
		if lower == upper and individual then
			individual[lower] = script
		else
			insertSorted(startIndex, { lower, upper, script })
		end
	end
	
	local i = 2
	while ranges[i] do
		local prev, range = ranges[i - 1], ranges[i]
		local nextIndex = i + 1
		
		if range[1] <= prev[2] and prev[1] <= range[2] then
			local overrider, overridden
			if overrides[range[3]] == prev[3] then
				overrider, overridden = range, prev
			elseif overrides[prev[3]] == range[3] then
				overrider, overridden = prev, range
			end
			
			if overrider then
				mw.logf("%s overrides %s", printScriptRange(overrider),
					printScriptRange(overridden))
				
				local removedIndex = overridden == range and i or i - 1
				table.remove(ranges, removedIndex)
				
				-- Part of the overridden range below the overrider.
				if overridden[1] < overrider[1] then
					keep(removedIndex, overridden[1], overrider[1] - 1, overridden[3])
				end
				-- Part of the overridden range above the overrider.
				if overrider[2] < overridden[2] then
					keep(removedIndex, overrider[2] + 1, overridden[2], overridden[3])
				end
				
				-- Re-examine the neighbourhood of the change. Every
				-- resolution shrinks the total number of codepoints covered,
				-- so the loop terminates.
				nextIndex = math.max(2, removedIndex)
			else
				-- No rule for this pair: report it and move on rather than
				-- dereferencing a nil overrider.
				mw.logf("Should %s override %s or the other way around?",
					printScriptRange(prev), printScriptRange(range))
			end
		end
		
		i = nextIndex
	end
end

-- Logs a message for every range in the array whose lower bound is not above
-- the upper bound of the range before it. The array is not modified.
local function checkRangeOverlaps(ranges)
	local prev
	for _, range in ipairs(ranges) do
		-- Compare bounds, not the range tables themselves.
		if prev and prev[2] >= range[1] then
			mw.logf("%s overlaps with %s",
				printScriptRange(prev), printScriptRange(range))
		end
		prev = range
	end
end

--[[
	Builds the codepoint-to-script lookup table from the lookup tables
	returned by export.makeAllScriptsCharacterLookup.

	The returned table contains:
	* numeric keys (see makeRangeKey; one key per group of 0x1000
	  codepoints): arrays of { lower, upper, scriptCode } sorted with
	  sortRange, each with a "length" field;
	* individual: map from codepoint to script code for characters that are
	  not part of a range;
	* blocks: array of { firstKey, lastKey, scriptCode } for runs of two or
	  more consecutive keys whose ranges all belong to the same script.

	testing is passed on to divideRange.

	NOTE(review): the index expressions in this function were restored from
	context after being lost from the source; confirm against the original
	module, in particular the position logged by ranges:insert.
]]
local function makeCodepointToScriptLookup(testing)
	local output = {}
	local ranges_mt = {
		-- ranges:insert(value) appends value. ranges:insert(i, value) inserts
		-- value at position i, or one position later if the element
		-- currently at i has a lower start.
		insert = function (self, i, value)
			if value ~= nil then
				local current = self[i]
				if current and current[1] < value[1] then
					i = i + 1
				end
				local above = self[i - 1]
				if above then
					mw.logf("Inserting %s below %s",
						printScriptRange(value), printScriptRange(above))
				end
				table.insert(self, i, value)
			else
				value = i
				table.insert(self, value)
			end
		end,
		remove = table.remove,
	}
	ranges_mt.__index = ranges_mt
	
	-- Reading a key that does not exist yet creates an empty range array.
	setmetatable(output,
		{
			__index = function (self, key)
				local val = setmetatable({}, ranges_mt)
				self[key] = val
				return val
			end,
		})
	output.individual = {}
	local individual = output.individual
	-- Maps "lower-upper" to the script code that first claimed that range.
	local rangeStrings = {}
	
	local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
	for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
		if not scriptBlacklist[scriptCode] then
			for _, value in ipairs(lookup) do
				if type(value) == "table" then
					local newRanges = divideRange(value[1], value[2], 0x1000, testing)
					if newRanges then
						for position, newRange in pairs(newRanges) do
							-- The separator keeps e.g. 1..23 and 12..3 apart.
							local rangeString = newRange[1] .. "-" .. newRange[2]
							if rangeStrings[rangeString] then
								mw.logf("The range U+%04X-U+%04X is already "
									.. "recorded as belonging to the script "
									.. "code %s, conflicting with %s.",
									newRange[1], newRange[2], rangeStrings[rangeString], scriptCode)
							else
								rangeStrings[rangeString] = scriptCode
								
								output[position]:insert({ newRange[1], newRange[2], scriptCode })
							end
						end
					end
				end
			end
			
			if lookup.values then
				for codepoint in pairs(lookup.values) do
					if individual[codepoint] then
						mw.logf("The codepoint %s is already recorded as " ..
							"belonging to the script code %s, conflicting with %s.",
							hex(codepoint), individual[codepoint], scriptCode)
					else
						individual[codepoint] = scriptCode
					end
				end
			end
		end
	end
	
	-- Merge neighbouring ranges of the same script that touch each other,
	-- then sort each array.
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			local i = 2
			while ranges[i] do
				local prevRange, range = ranges[i - 1], ranges[i]
				if range[3] == prevRange[3] and prevRange[2] == range[1] - 1 then
					mw.logf("Merged %s with %s",
						printScriptRange(range), printScriptRange(prevRange))
					prevRange[2] = range[2]
					-- The merged range stays at i - 1, so a following range
					-- is compared against it and not against the removed one.
					table.remove(ranges, i)
				else
					i = i + 1
				end
			end
			table.sort(ranges, sortRange)
		end
	end
	
	-- Turn runs of consecutive individual codepoints with the same script
	-- into ranges, if the run is long enough and does not cross a group
	-- boundary.
	local individualCodepoints = require "Module:table".numKeys(individual)
	local minimumCodepointRange = 3
	local i = 1
	while individualCodepoints[i] do
		local codepoint = individualCodepoints[i]
		local script = individual[codepoint]
		if not script then
			error(("No script for U+%04X"):format(codepoint))
		end
		
		local startOfRun = codepoint
		while individual[codepoint + 1] == script do
			codepoint = codepoint + 1
			i = i + 1
		end
		
		if codepoint - startOfRun + 1 >= minimumCodepointRange
			and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then
			for j = startOfRun, codepoint do
				individual[j] = nil
			end
			
			-- Created on demand by the __index metamethod above.
			local ranges = output[makeRangeKey(startOfRun)]
			ranges:insert({ startOfRun, codepoint, script })
			mw.logf("Added range %s from a run in individual map",
				printScriptRange { startOfRun, codepoint, script })
			table.sort(ranges, sortRange)
		end
		
		i = i + 1
	end
	
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			if ranges[2] then
				fixRangeOverlaps(ranges, individual)
			end
		end
	end
	
	-- Collect the numeric keys in ascending order: detecting runs of
	-- consecutive keys below depends on the order, and pairs() guarantees none.
	local rangeKeys = {}
	for key in pairs(output) do
		if type(key) == "number" then
			rangeKeys[#rangeKeys + 1] = key
		end
	end
	table.sort(rangeKeys)
	
	-- Add length field to range arrays and check that there are no overlaps
	-- between ranges.
	output.blocks = {}
	local prevScript, prevIndex, blockRange
	for _, index in ipairs(rangeKeys) do
		local ranges = rawget(output, index)
		ranges.length = #ranges
		if ranges[2] then
			checkRangeOverlaps(ranges)
		end
		
		-- An array emptied by fixRangeOverlaps has no first script.
		local firstScript = ranges[1] and ranges[1][3]
		local sameScript = firstScript ~= nil
		for j = 2, #ranges do
			if ranges[j][3] ~= firstScript then
				sameScript = false
				break
			end
		end
		
		if sameScript then -- All ranges contain the same script.
			if prevScript == firstScript and prevIndex == index - 1 then
				if not blockRange then
					blockRange = { prevIndex, index, prevScript }
					table.insert(output.blocks, blockRange)
				else
					blockRange[2] = index
				end
			else
				blockRange = nil
			end
			prevScript, prevIndex = firstScript, index
		else
			prevScript, prevIndex, blockRange = nil, nil, nil
		end
	end
	
	setmetatable(output, nil)
	
	return output
end

--[[
	Binary search: more efficient for the longer lists of codepoint ranges than
	for the shorter ones.

	ranges: array of { lower, upper, ... } sorted by lower bound and without
		overlaps. Its "length" field gives the number of ranges; if the field
		is absent the ranges are counted.
	value: the number to look for.

	Returns the range that contains value, or nil if there is none.
]]
local function binarySearch(ranges, value)
	local iStart = 1
	-- Can't use # because table is loaded by mw.loadData. Counting every key
	-- would include the "length" field itself and overshoot by one.
	local iEnd = ranges.length
	if not iEnd then
		iEnd = 0
		while ranges[iEnd + 1] do
			iEnd = iEnd + 1
		end
	end
	
	-- Do search.
	while iStart <= iEnd do
		-- Calculate middle.
		local iMid = floor((iStart + iEnd) / 2)
		
		-- Get compare value.
		local range = ranges[iMid]
		
		-- Keep searching.
		if value < range[1] then
			iEnd = iMid - 1
		elseif value > range[2] then
			iStart = iMid + 1
		
		-- Return matching range. Assumes there are no duplicates.
		else
			return range
		end
	end
	return nil
end

-- Scans an array of { lower, upper, value } sorted by lower bound and returns
-- the third element of the first entry whose bounds contain number. Returns
-- nil if there is no such entry; the scan stops as soon as an entry starts
-- above number.
local function lookupInOrder(number, ranges)
	for _, range in ipairs(ranges) do
		if number < range[1] then
			-- Every later entry starts higher still.
			return nil
		end
		if number <= range[2] then
			return range[3]
		end
	end
	return nil
end

-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}

--[[
	Returns the script code recorded for a character in
	Module:User:Erutuon/script recognition/data, or nil if there is none.

	The data is consulted in this order: the map of individual codepoints,
	the ranges cached from earlier calls, the list of blocks, and finally a
	binary search of the ranges stored under the codepoint's range key.
]]
function export.charToScript(char)
	local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
	local codepoint = mw.ustring.codepoint(char)
	
	local individualMatch = lookup.individual[codepoint]
	if individualMatch then
		return individualMatch
	end
	
	local script = lookupInOrder(codepoint, rangesCache)
	if script then
		return script
	end
	
	local index = makeRangeKey(codepoint)
	
	script = lookupInOrder(index, lookup.blocks)
	if script then
		return script
	end
	
	-- There may be no data at all for this group of codepoints.
	local ranges = lookup[index]
	local range = ranges and binarySearch(ranges, codepoint)
	if range then
		-- lookupInOrder relies on the cache staying sorted.
		table.insert(rangesCache, range)
		table.sort(rangesCache, sortRange)
		return range[3]
	end
	
	return nil
end

--[[
	Returns the text of the first argument (or a default sample if it is
	missing or empty), followed by the script code found by
	export.charToScript for each of its characters.

	This function was originally also named "show", but it was always replaced
	by the definition of export.show that follows it, so it could never be
	called. It has been given its own name; export.show is unaffected.
]]
function export.showCharScripts(frame)
	local str = frame.args[1]
	if not str or str == "" then
		str = "ABCD一丨丶丿乙亅"
	end
	
	local result = {}
	forEachChar(
		str,
		function(char)
			table.insert(result, tostring(export.charToScript(char)))
		end
	)

	return str .. ": " .. table.concat(result, ", ")
end

-- Entry point: builds the codepoint-to-script lookup table and returns its
-- dump. frame is not used.
function export.show(frame)
	local lookup = makeCodepointToScriptLookup()
	return dump(lookup)
end

return export