Module:User:Erutuon/UTF-8

Hello, you have come here looking for the meaning of the word Module:User:Erutuon/UTF-8. In DICTIOUS you will not only get to know all the dictionary meanings for the word Module:User:Erutuon/UTF-8, but we will also tell you about its etymology, its characteristics and you will know how to say Module:User:Erutuon/UTF-8 in singular and plural. Everything you need to know about the word Module:User:Erutuon/UTF-8 you have here. The definition of the word Module:User:Erutuon/UTF-8 will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of Module:User:Erutuon/UTF-8, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.

Function for making a byte pattern that will match any UTF-8 character in a range between and including two characters. May be useful for making transliteration modules more efficient. Then again, it's cryptic and some random person can vandalize the pattern or just accidentally misuse it, making the module fail.

  • Ა-ჿ (0x1C90-0x10FF):
  • Lua error at line 64: The first character to makeUTF8Pattern (U+10140) should have a lower codepoint than the second (U+3FF).
  • ⿿-〿 (0x2FFF-0x303F): \226

local export = {}

local m_debug = require("Module:debug")

-- Byte classes for matching raw UTF-8 with the byte-oriented string library.
-- A continuation byte has the form 10xxxxxx, i.e. 0x80-0xBF (128-191).
local continuationByte = "[\128-\191]"
---[[
-- One whole character: an ASCII byte or a leading byte (194-244), followed by
-- any continuation bytes.
-- Excludes null byte, which is supposed to be able to be included in Lua strings,
-- but causes patterns to fail.
local UTF8Char = "[\1-\127\194-\244]" .. continuationByte .. "*"
-- One whole multi-byte (non-ASCII) character.
local nonASCII = "[\194-\244]" .. continuationByte .. "+"
--]]
-- Format for showing a byte as a Lua decimal escape, e.g. \226.
local escapePatt = "\\%d"
-- Format for showing a codepoint in hexadecimal, e.g. 0x303F.
local hexPatt = "0x%X"

local floor = math.floor

-- Passes `text` to the debug module's highlight function with the
-- `inline` option switched on, and returns whatever that function returns.
local function highlight(text)
	local options = { inline = true }
	return m_debug.highlight(text, options)
end

-- Renders `number` through the module's hexadecimal format string.
local function hex(number)
	return string.format(hexPatt, number)
end

-- Renders a byte value through the module's decimal-escape format string.
local function byteEscape(number)
	return string.format(escapePatt, number)
end

-- Returns `str` with every byte of value 128 or above replaced by its
-- escaped form (via byteEscape); bytes below 128 are kept as they are.
-- Uses a single gsub rather than unpacking all bytes with string.byte,
-- which raises "string slice too long" on long strings.
local function escapeBytes(str)
	-- The outer parentheses drop gsub's second result (the replacement count).
	return (string.gsub(str, "[\128-\255]", function(char)
		return byteEscape(string.byte(char))
	end))
end

-- Returns the leading byte of the UTF-8 encoding of `codepoint`, followed by
-- the number of continuation bytes that come after it.
-- Based on the UTF-8 byte layout:
--   U+0080-U+07FF      110xxxxx + 1 continuation byte
--   U+0800-U+FFFF      1110xxxx + 2 continuation bytes
--   U+10000-U+10FFFF   11110xxx + 3 continuation bytes
-- Each continuation byte carries 6 bits, so the leading byte holds the
-- codepoint divided by 0x40, 0x1000 or 0x40000 respectively.
-- Raises an error for ASCII codepoints and for values above U+10FFFF.
local function getLeadingContinuation(codepoint)
	if codepoint < 0x80 then
		error("ASCII does not have leading bytes.")
	elseif codepoint < 0x800 then
		return 0xC0 + floor(codepoint / 0x40), 1
	elseif codepoint < 0x10000 then
		-- Divide the whole codepoint; subtracting 0x800 first would give a
		-- leading byte one too low for e.g. U+1000 (E1 80 80).
		return 0xE0 + floor(codepoint / 0x1000), 2
	elseif codepoint < 0x110000 then
		-- Upper bound is U+10FFFF, the last Unicode codepoint.
		return 0xF0 + floor(codepoint / 0x40000), 3
	else
		error(("Codepoint U+%X is outside valid range."):format(codepoint))
	end
end

-- Builds a byte pattern for the UTF-8 characters between `lower` and `higher`
-- and returns it inside a display string of the form
--   lower-higher (0xLOW-0xHIGH): pattern
-- where the hex range and the pattern are each passed through highlight().
-- The pattern is the escaped leading byte (only when both characters share
-- the same leading byte, otherwise nothing) followed by one continuation-byte
-- class per continuation byte of `lower`; a "+" is appended when `higher`
-- has more continuation bytes than `lower`.
-- Raises an error if either character is ASCII or out of range (from
-- getLeadingContinuation), or if `lower` has a higher codepoint than `higher`.
function export.makeUTF8Pattern(lower, higher)
	local codepoint1, codepoint2 = mw.ustring.codepoint(lower), mw.ustring.codepoint(higher)
	local leading1, continuationCount1 = getLeadingContinuation(codepoint1)
	local leading2, continuationCount2 = getLeadingContinuation(codepoint2)
	-- Compare the codepoints themselves: comparing only the continuation
	-- counts lets a reversed range of equal byte length slip through.
	if codepoint1 > codepoint2 then
		error(string.format("The first character to makeUTF8Pattern (U+%X) should have a lower codepoint than the second (U+%X).", codepoint1, codepoint2))
	end
	local continuationSequence = string.rep(continuationByte, continuationCount1)
	if continuationCount1 < continuationCount2 then
		continuationSequence = continuationSequence .. "+"
	end
	-- byteEscape always returns a non-empty string, so and/or is safe here.
	local leading = leading1 == leading2 and byteEscape(leading1) or ""
	return lower .. "-" .. higher ..
		" (" .. highlight(hex(codepoint1) .. "-" .. hex(codepoint2)) .. "): " ..
		highlight(leading .. escapeBytes(continuationSequence))
end

-- Scans `characters` (with every hyphen removed) one UTF8Char match at a time
-- and returns the smallest and largest match by string comparison.
-- Returns nil, nil when nothing matches.
function export.makeRange(characters)
	local stripped = string.gsub(characters, "%-", "")
	local lowest, highest
	for char in string.gmatch(stripped, UTF8Char) do
		if lowest == nil then
			-- First match seeds both ends of the range.
			lowest, highest = char, char
		elseif char < lowest then
			lowest = char
		elseif char > highest then
			highest = char
		end
	end
	return lowest, highest
end

-- Looks up the script object for `scCode` in Module:scripts, takes its
-- character list, reduces it to a lowest/highest pair with makeRange, and
-- returns the result of makeUTF8Pattern for that pair.
function export.charPatternForScript(scCode)
	local script = require("Module:scripts").getByCode(scCode)
	local lowest, highest = export.makeRange(script:getCharacters())
	return export.makeUTF8Pattern(lowest, highest)
end

-- Template entry point: returns the pattern description for the script code
-- given as the first positional argument, defaulting to "polytonic" when the
-- argument is missing or empty.
-- Reads frame.args[1] rather than frame.args: the args table itself is
-- always truthy, so `frame.args or "polytonic"` never used the default and
-- passed a table where a script code string is expected.
function export.show(frame)
	local scCode = frame.args[1]
	if scCode == nil or scCode == "" then
		scCode = "polytonic"
	end
	return export.charPatternForScript(scCode)
end

return export