-- This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.
local char = string.char
local concat = table.concat
local floor = math.floor
local insert = table.insert
local ipairs = ipairs
local split = require("Module:string utilities").split
local sub = string.sub
local tonumber = tonumber
--[==[local skipped_ranges = {}
do
local hex = require("hex").to_hex
local udata = require("Module:User:Theknightwho/UnicodeData.txt")
local last = 0
for line in udata:gmatch("+") do
local cp = line:match("^%x+")
local a = cp
if cp then
cp = tonumber(cp, 16)
if cp - last > 0x1000 then
insert(skipped_ranges, {hex(last), hex(cp - 1)})
end
last = cp
end
end
return skipped_ranges
end]==]
local ducet = require("Module:User:Theknightwho/UCA/DUCET")
-- Codepoint ranges, each written as {first, last} with the Unicode block names
-- (or "Unassigned") noted alongside. export.weights collects the entries that
-- belong to the requested plane into the `ranges` upvalue.
-- NOTE(review): judging by the name, these are presumably the ranges that have
-- no explicit DUCET entries and receive implicit weights instead - confirm.
local implicit_ranges = {
{0x3400, 0x4DBF}, -- CJK Unified Ideographs Extension A
{0x4E00, 0x9FFF}, -- CJK Unified Ideographs
{0xAC00, 0xD7AF}, -- Hangul Syllables
{0xD800, 0xF8FF}, -- Surrogates, Private Use Area
{0x12550, 0x12F8F}, -- Unassigned
{0x13460, 0x143FF}, -- Unassigned
{0x14680, 0x167FF}, -- Unassigned
{0x17000, 0x1AFEF}, -- Tangut, Tangut Components, Khitan Small Script, Tangut Supplement, Unassigned
{0x1B170, 0x1BBFF}, -- Nushu, Unassigned
{0x1BCB0, 0x1CEFF}, -- Unassigned
{0x1DAB0, 0x1DEFF}, -- Unassigned
{0x20000, 0x2A6DF}, -- CJK Unified Ideographs Extension B
{0x2A700, 0x2B73F}, -- CJK Unified Ideographs Extension C
{0x2B740, 0x2B81F}, -- CJK Unified Ideographs Extension D
{0x2B820, 0x2CEAF}, -- CJK Unified Ideographs Extension E
{0x2CEB0, 0x2EBEF}, -- CJK Unified Ideographs Extension F
{0x2EBF0, 0x2EE5F}, -- CJK Unified Ideographs Extension I
{0x30000, 0x3134F}, -- CJK Unified Ideographs Extension G
{0x31350, 0x323AF}, -- CJK Unified Ideographs Extension H
}
local export = {}
do
-- Maps each character that has a named escape in a double-quoted Lua string
-- literal to the source text of that escape. export.weights uses it to replace
-- such characters in its output; characters without an entry are left as-is.
local escapes = {
	["\a"] = "\\a", ["\b"] = "\\b", ["\t"] = "\\t",
	["\n"] = "\\n", ["\v"] = "\\v", ["\f"] = "\\f",
	["\r"] = "\\r", ["\""] = "\\\"", ["\\"] = "\\\\"
}
local ranges
-- Converts a hexadecimal string in the range "0000".."FFFF" into a two-byte
-- big-endian string: the high byte followed by the low byte
-- (e.g. "FFFD" becomes "\255\253").
local function base_256(w)
	w = tonumber(w, 16)
	-- The quotient must be floored explicitly: string.char needs an integer,
	-- and passing a fraction either rounds unpredictably (Lua 5.1, depending
	-- on the platform) or raises an error (Lua 5.3+).
	return char(floor(w / 0x100)) .. char(w % 0x100)
end
-- Parses one line of the DUCET data and, if it describes a single codepoint in
-- the requested plane with a single collation element, writes that element's
-- 6-byte record into `output`.
--
-- line:   one line of text; assumed to follow the allkeys.txt layout
--         "XXXX ; [.PPPP.SSSS.TTTT] # comment" - TODO confirm against the data
--         module.
-- plane:  plane number (0x0 to 0x10) currently being generated.
-- output: table of single-character strings, indexed so that the record for
--         in-plane codepoint `cp` occupies output[cp * 6 + 1] .. output[cp * 6 + 6].
--
-- Returns nothing. Lines that do not match, belong to another plane, or need
-- handling that is not implemented yet (see the TODOs) are skipped silently.
local function process_line(line, plane, output)
	-- Get the codepoint field (everything before the first ";"), and return if not found.
	local field = line:match("^([%x%s]+);")
	if not field then
		return
	end
	local cps = {}
	for hex in field:gmatch("%x+") do
		insert(cps, hex)
	end
	-- If there is more than one codepoint (a contraction), skip the line (TODO).
	-- A field with no hex digits at all is skipped as well.
	if #cps ~= 1 then
		return
	end
	-- Check this is the correct plane, and return if not. Planes range from 0x0 to 0x10 (17 in total), and each has 0x10000 characters, from U+(X)0000 to U+(X)FFFF.
	local cp = tonumber(cps[1], 16)
	if floor(cp / 0x10000) ~= plane then
		return
	end
	-- Normalize the codepoint by removing the plane.
	cp = cp % 0x10000
	-- Get the weights. Each collation element is stored as 6 characters: the variable-weighting flag ("." or "*"), then the first two weights (ranging from 0x0000 to 0xFFFF) as 2-digit base-256 with each digit stored as the corresponding character (e.g. 0xFFFD is "\255" and "\253"), then the final weight as 1 digit, because it only ranges from 0x0000 to 0x001F.
	local weights = {}
	for var, w1, w2, w3 in line:gmatch("%[([.*])(%x+)%.(%x+)%.(%x+)%]") do
		insert(weights, var .. base_256(w1) .. base_256(w2) .. char(tonumber(w3, 16)))
	end
	-- If there is more than one collation element (an expansion), skip the line (TODO).
	-- Lines with no collation element are skipped too, rather than raising an error.
	if #weights ~= 1 then
		return
	end
	local record = weights[1]
	-- Each codepoint owns a fixed 6-slot window of `output`.
	local offset = cp * 6
	for i = 1, #record do
		output[offset + i] = sub(record, i, i)
	end
end
-- Builds the weight data for one plane as the body of a double-quoted Lua
-- string literal.
--
-- plane: plane number (0x0 to 0x10). It is compared against a number, so it
--        must itself be a number.
--
-- Returns a string holding 0x10000 six-character records (one per codepoint in
-- the plane, zero-filled where process_line wrote nothing), with every
-- character that cannot appear literally replaced by an escape sequence.
-- Raises an error if `plane` is nil or false.
function export.weights(plane)
	if not plane then
		error("Please enter a plane.")
	end
	-- Collate the implicit ranges for this plane (if any), judged by where each range starts.
	-- NOTE(review): `ranges` is only assigned here in the code visible in this file - confirm where it is consumed.
	ranges = {}
	for _, range in ipairs(implicit_ranges) do
		if floor(range[1] / 0x10000) == plane then
			insert(ranges, range)
		end
	end
	local output = {}
	-- Feed the data to process_line one non-empty line at a time.
	for line in ducet:gmatch("[^\r\n]+") do
		process_line(line, plane, output)
	end
	-- Fill in any blanks with zeroes (0x10000 codepoints, 6 characters each).
	for i = 1, 0x60000 do
		output[i] = output[i] or "\0"
	end
	-- Escape from the end backwards, so that output[i + 1] is already in its
	-- final form when output[i] is handled.
	for i = #output, 1, -1 do
		local b = output[i]:byte()
		if b > 0x7E then
			-- Always three decimal digits here, so a following digit cannot be misread.
			output[i] = "\\" .. b
		elseif b < 0x07 or b > 0x0D and b < 0x20 then
			-- Control characters with no named escape. Pad to three digits when
			-- the next character is a literal digit, which would otherwise be
			-- read as part of this escape.
			local nxt = output[i + 1]
			if nxt and nxt:match("^%d$") then
				b = ("%03d"):format(b)
			end
			output[i] = "\\" .. b
		else
			output[i] = escapes[output[i]] or output[i]
		end
	end
	return concat(output)
end
end
return export