This module is used for accessing the Unicode character database (version 16.0), derived from files in the Unicode Character Database (UCD).
All of these functions can only be used in a module. For template-invokable functions, see Module:Unicode data/templates.
lookup_name
lookup_image
enum_blocks
Used in a for ... in loop in Module:character list to iterate through the array of block ranges.
lookup_plane
get_block_range
is_valid_pagename
Returns true or false depending on whether the string can be used as a page title.
add_dotted_circle
lookup_script
get_entry_title
The following functions relate to a code point's General Category. They receive a code point and return two values: a boolean (which expresses whether the code point fulfills the condition) and a string describing the General Category, if the character is in the General Categories of Separator (Z) or Other (C), or the string "assigned" otherwise. They use Module:Unicode data/control.
is_assigned
is_printable
is_whitespace
The data used by functions in this module is found on subpages.
The name and image data modules (Module:Unicode data/names/xxx and Module:Unicode data/images/xxx) were compiled from UnicodeData.txt. Each one contains, at maximum, code points U+xxx000 to U+xxxFFF. The names in the following ranges, however, are auto-generated by this base module and do not require separate name modules:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | A | B | C | D | E | F | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
00x | names | names | names | names | names | names | names | names | ||||||||
images | images | images | images | images | images | images | images | images | images | images | images | images | images | images | ||
emoji images | emoji images | emoji images | ||||||||||||||
01x | names | names | names | names | names | names | names | names | names | names | names | names | ||||
images | images | images | images | images | images | images | images | images | images | images | images | images | images | |||
emoji images | ||||||||||||||||
02x | ||||||||||||||||
images | images | images | images | images | images | images | images | images | images | images | images | images | images | images | images | |
03x | ||||||||||||||||
images | images | images | ||||||||||||||
When each Unicode version is released, several submodules, and sometimes the name rules in the main module, need to be updated. The name, block, control character, and combining character category modules were constructed by the scripts given in User:Kephir/Unicode; the script and General Category modules were generated by modules on Wikipedia. Most of the name rules are given in chapter 4, section 4.8 (Name) of the Unicode Core Specification; the others were added manually, and at each Unicode version the ranges of characters that they apply to may need to be expanded.
Module:Unicode data/Hangul and the Hangul Syllable name rule never need to be updated.
After these modules have been updated, the version number in Module:character list needs to be updated as well.
The Unicode database is released by Unicode Inc. under the following terms:
Copyright © 1991-2022 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.
local m_str_utils = require("Module:string utilities")
local cp = m_str_utils.codepoint
local gcodepoint = m_str_utils.gcodepoint
local gsub = string.gsub
local u = m_str_utils.char
local export = {}
local udata = mw.loadData("Module:Unicode data/data")
local floor = math.floor
-- Raise a formatted error.
-- If the first argument is a number it is taken as the error level (relative
-- to the caller of errorf) and the remaining arguments are the format string
-- and its values. Otherwise the first argument is the format string itself
-- and the error is reported at level 2.
local function errorf(first_arg, ...)
    local message, level
    if type(first_arg) ~= "number" then
        message, level = string.format(first_arg, ...), 2
    else
        -- One extra level so that errorf itself is skipped.
        message, level = string.format(...), first_arg + 1
    end
    return error(message, level)
end
-- Binary search over "ranges", an array of arrays whose first and second
-- items are the low and high code points of a range. The array must be sorted
-- by code point and the ranges must not overlap.
-- Returns the range containing "codepoint" and its index. If no range
-- contains it, returns nil and the index of the last range that was examined
-- (nil if "ranges" is empty).
-- The array length is taken from "ranges.length" when that field is present;
-- otherwise it is computed with the length function of Module:table.
local function binary_range_search(codepoint, ranges)
    local low, mid, high
    low, high = 1, ranges.length or require "Module:table".length(ranges)
    while low <= high do
        mid = floor((low + high) / 2)
        local range = ranges[mid]
        if codepoint < range[1] then
            -- Code point lies below this range: continue in the lower half.
            high = mid - 1
        elseif codepoint <= range[2] then
            return range, mid
        else
            -- Code point lies above this range: continue in the upper half.
            low = mid + 1
        end
    end
    return nil, mid
end
export.binary_range_search = binary_range_search
-- Linear counterpart of the binary search: walks "ranges" (an array of
-- { low, high, ... } arrays sorted by code point) and returns the first range
-- that contains "codepoint". Returns nothing if there is no such range; the
-- scan stops early once a range starts above the code point.
local function linear_range_search(codepoint, ranges)
    for _, range in ipairs(ranges) do
        if codepoint < range[1] then
            break
        elseif codepoint <= range[2] then
            return range
        end
    end
end
-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns the data
-- of Module:Unicode data/blocks. If a module cannot be loaded, false will be
-- returned.
-- The result (including false) is stored in the loader table under the same
-- key, so the __index handler runs at most once per subpage.
local loader = setmetatable({}, {
    __index = function (self, key)
        local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
        if not success then
            data = false
        end
        self[key] = data
        return data
    end
})
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/latest/ch03.pdf
-- For most of the name rules given here, see the subsection
-- "Unicode Name Property" in section 4.8 (Name) and the table 4-8
-- (Name Derivation Rule Prefix Strings):
-- https://www.unicode.org/versions/latest/ch04.pdf
--
-- Each entry is { low, high, rule }, where "rule" is either a format string
-- that receives the code point or a function that is called with the code
-- point. Entries must stay sorted by code point and must not overlap, because
-- the table is searched with binary_range_search.
local name_hooks = {
    { 0x0000, 0x001F, "<control-%04X>" }, -- C0 control characters
    { 0x007F, 0x009F, "<control-%04X>" }, -- DEL and C1 control characters
    { 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
    { 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
    { 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables
        local Hangul_data = loader.Hangul
        local syllable_index = codepoint - 0xAC00
        -- Decomposition into lead, vowel and trail indices as in the Unicode
        -- Hangul name algorithm.
        -- NOTE(review): assumes final_count is the number of syllables per
        -- leading consonant, trail_count the number of trailing consonants,
        -- and that the three name arrays are indexed from 0 -- confirm
        -- against Module:Unicode data/Hangul.
        return ("HANGUL SYLLABLE %s%s%s"):format(
            Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
            Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
                / Hangul_data.trail_count)],
            Hangul_data.trails[syllable_index % Hangul_data.trail_count]
        )
    end },
    -- High Surrogates, High Private Use Surrogates, Low Surrogates
    { 0xD800, 0xDFFF, "<surrogate-%04X>" },
    { 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
    -- CJK Compatibility Ideographs
    { 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0xFDD0, 0xFDEF, "<noncharacter-%04X>" },
    { 0xFE00, 0xFE0F, function (codepoint) -- Variation Selectors
        return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1)
    end},
    { 0x13460, 0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
    { 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
    { 0x18800, 0x18AFF, function (codepoint) -- Tangut Components, numbered from 001
        return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
    end },
    { 0x18B00, 0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
    { 0x18CFF, 0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" },
    { 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Supplement
    { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
    { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
    { 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
    { 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
    { 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
    { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
    { 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
    -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
    { 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
    { 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
    { 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement
        return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
    end},
    { 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
    { 0x100000, 0x10FFFD, "<private-use-%04X>" }, -- Plane 16 Private Use
}
-- Stored so that binary_range_search does not have to compute the length.
name_hooks.length = #name_hooks
--[[ Add another - in this line to test the code point ordering in name_hooks.
local i = 1
local function print_it(a, b, c)
if type(c) == "string" then
mw.log(c:format(a), c:format(b))
else
mw.log(c(a), c(b))
end
end
while true do
local first, second = name_hooks[i], name_hooks[i + 1]
if not (first and second) then break end
local message
if not (first[1] < first[2] and first[2] < second[1] and second[1] < second[2]) then
message = "Bad name label ordering at index " .. i .. ":"
elseif second[1] == first[2] + 1 and second[3] == first[3] then
message = "Name hooks can be merged at index " .. i .. ":"
end
if message then
mw.log(message)
print_it(unpack(first))
print_it(unpack(second))
end
i = i + 1
end
--]]
-- The most recently matched entry of name_hooks, kept so that consecutive
-- lookups in the same range can skip the search.
local name_range_cache
-- Produce a name from a name rule: a format string is filled in with the code
-- point, anything else is called with the code point as its only argument.
local function generate_name(data, codepoint)
    if type(data) ~= "string" then
        return data(codepoint)
    end
    return data:format(codepoint)
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
if val < 0 or 0x10FFFF < val then
errorf("Codepoint %04X out of range", val)
end
end
--]]
-- https://www.unicode.org/versions/latest/ch04.pdf, section 4.8
-- Return the name of a code point (a number).
-- The name comes, in order of precedence, from: the noncharacter rule, the
-- algorithmic rules in name_hooks, the per-4096-code-point name data modules
-- (Module:Unicode data/names/xxx), and finally the "<reserved-XXXX>" label.
function export.lookup_name(codepoint)
    -- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
    -- (Cn) and specifically noncharacters:
    -- https://www.unicode.org/faq/private_use.html#nonchar4
    -- (U+FDD0-U+FDEF are handled by an entry in name_hooks.)
    if codepoint >= 0xFFFE and floor(codepoint % 0x10000) >= 0xFFFE then
        return ("<noncharacter-%04X>"):format(codepoint)
    end
    if name_range_cache -- Check if previously used "name hook" applies to this code point.
        and codepoint >= name_range_cache[1]
        and codepoint <= name_range_cache[2] then
        return generate_name(name_range_cache[3], codepoint)
    end
    local range = binary_range_search(codepoint, name_hooks)
    if range then
        name_range_cache = range
        return generate_name(range[3], codepoint)
    end
    -- Name modules are keyed by the code point with its last three
    -- hexadecimal digits removed (U+xxx000 to U+xxxFFF).
    local data = loader[("names/%03X"):format(floor(codepoint / 0x1000))]
    if data and data[codepoint] then
        return data[codepoint]
    -- Unassigned (Cn) consists of noncharacters and reserved characters.
    -- The character has been established not to be a noncharacter,
    -- and if it were assigned, its name would already have been retrieved,
    -- so it must be reserved.
    else
        return ("<reserved-%04X>"):format(codepoint)
    end
end
-- Return the image data recorded for a code point in the matching
-- Module:Unicode data/images/xxx module (xxx being the code point without its
-- last three hexadecimal digits). Returns nothing if that module cannot be
-- loaded, and nil if it has no entry for the code point.
function export.lookup_image(codepoint)
    local data = loader[("images/%03X"):format(floor(codepoint / 0x1000))]
    if data then
        return data[codepoint]
    end
end
-- Return the emoji image data recorded for a code point, looked up in the
-- same way as export.lookup_image but in the emoji image data modules.
-- Returns nothing if the module cannot be loaded, and nil if it has no entry
-- for the code point.
-- NOTE(review): the subpage prefix "emoji images/" is inferred from the
-- module documentation -- confirm against the actual subpage titles.
function export.lookup_image_emoji(codepoint)
    local data = loader[("emoji images/%03X"):format(floor(codepoint / 0x1000))]
    if data then
        return data[codepoint]
    end
end
-- Load Module:Unicode data/blocks if needed and assign it to this variable.
local blocks
-- Stateless iterator function for export.enum_blocks. Given the block array
-- and the previous index, returns the next index followed by the first three
-- items of that block entry, or nothing when the array is exhausted.
local function block_iter(blocks, i)
    i = i + 1
    local data = blocks[i]
    if data then
        -- Unpack doesn't work on tables loaded with mw.loadData.
        return i, data[1], data[2], data[3]
    end
end
-- Iterator generator for the list of blocks, in the style of ipairs: returns
-- the iterator function, the block array and the starting index 0, ready to
-- be used in a generic for loop.
function export.enum_blocks()
    return block_iter, loader.blocks, 0
end
-- Return the first and last code point of the block with the given name.
-- Returns nothing if no block has that name.
function export.get_block_range(name)
    for _, block in ipairs(loader.blocks) do
        -- Block entries are { low, high, name }.
        if block[3] == name then
            return block[1], block[2]
        end
    end
end
-- Return the name of the plane that contains the code point: the entry for
-- the plane number in udata.planes if there is one, otherwise a generic
-- "Plane N" label.
function export.lookup_plane(codepoint)
    local i = floor(codepoint / 0x10000)
    return udata.planes[i] or ("Plane %u"):format(i)
end
-- Return the name of the block that contains the code point, or the string
-- "No Block" if it is not inside any block.
function export.lookup_block(codepoint)
    local blocks = loader.blocks
    local range = binary_range_search(codepoint, blocks)
    if range then
        -- Block entries are { low, high, name }.
        return range[3]
    else
        return "No Block"
    end
end
-- Return the whole entry ({ low, high, name }) of the block with the given
-- name. Returns nothing if no block has that name.
function export.get_block_info(name)
    for _, block in ipairs(loader.blocks) do
        if block[3] == name then
            return block
        end
    end
end
do
    -- Individual code points that make a page name invalid.
    local forbidden = {
        [0x0023] = true, -- #
        [0x005B] = true, -- [
        [0x005D] = true, -- ]
        [0x007B] = true, -- {
        [0x007C] = true, -- |
        [0x007D] = true, -- }
        [0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
        [0xFFFD] = true, -- REPLACEMENT CHARACTER
    }

    -- Return true if the string can be used as a page title and false
    -- otherwise. A name is rejected if it contains a forbidden or
    -- non-printable code point, or if it contains nothing except space
    -- separators (which includes the empty string).
    function export.is_valid_pagename(pagename)
        local seen_non_space = false
        for codepoint in gcodepoint(pagename) do
            if forbidden[codepoint] then
                return false
            end
            -- Spaces in the General Punctuation block.
            if codepoint >= 0x2000 and codepoint <= 0x200A then
                return false
            end
            local printable, result = export.is_printable(codepoint)
            if not printable then
                return false
            end
            if result ~= "space-separator" then
                seen_non_space = true
            end
        end
        return seen_non_space
    end
end
-- Return the items of the array "what" as multiple values, starting at index
-- "from" (default 1). The items are first copied into a fresh table, walking
-- the array with ipairs, and that copy is what gets unpacked. A nil "what" is
-- returned unchanged.
local function manual_unpack(what, from)
    if what == nil then
        return nil
    end
    local first = from or 1
    local items, count = {}, 0
    for index, value in ipairs(what) do
        if index >= first then
            count = count + 1
            items[count] = value
        end
    end
    return unpack(items)
end
-- Sort comparator for arrays of ranges: orders them by their low code point
-- (the first item of each range).
local function compare_ranges(range1, range2)
    return range1[1] < range2[1]
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the name of the data module minus the
-- "Module:Unicode data/" part; it is loaded through "loader" the first time
-- the returned function is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function codepoint_lookup(data_module_subpage, match_func, ...)
    local dots = { ... }
    local cache = {}
    local singles, ranges
    return function (codepoint)
        if not singles then
            local data_module = loader[data_module_subpage]
            singles, ranges = data_module.singles, data_module.ranges
        end
        if singles[codepoint] then
            return match_func(codepoint, singles[codepoint])
        end
        local range = binary_range_search(codepoint, cache)
        if range then
            return match_func(codepoint, manual_unpack(range, 3))
        end
        -- "index" is the position of the last range examined by the search;
        -- on a miss it is a neighbour of the gap that holds the code point.
        local index
        range, index = binary_range_search(codepoint, ranges)
        if range then
            table.insert(cache, range)
            table.sort(cache, compare_ranges)
            return match_func(codepoint, manual_unpack(range, 3))
        end
        if ranges[index] then
            -- Cache the whole gap between two data ranges as a range that
            -- carries the default data.
            local dots_range
            if codepoint > ranges[index][2] then
                dots_range = {
                    ranges[index][2] + 1,
                    ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
                    unpack(dots)
                }
            else -- codepoint < ranges[index][1]
                dots_range = {
                    ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
                    ranges[index][1] - 1,
                    unpack(dots)
                }
            end
            table.insert(cache, dots_range)
            table.sort(cache, compare_ranges)
        end
        return match_func(codepoint, unpack(dots))
    end
end
-- Return a character's combining class value from
-- Module:Unicode data/combining classes, or otherwise 0, which is treated as
-- the default value.
-- "ch" is a string; it is converted to a code point with the codepoint
-- function of Module:string utilities before the lookup.
do
    -- Loaded on first use.
    local combining
    function export.combining_class(ch)
        combining = combining or mw.loadData("Module:Unicode data/combining classes")
        return combining[cp(ch)] or 0
    end
end
-- Report whether a character has a combining class other than 0.
-- FIXME: Some combining characters have a combining class of 0, so this needs rethinking.
function export.is_combining(ch)
    local class = export.combining_class(ch)
    return class ~= 0
end
do
    -- gsub callback: prefix a character with a dotted circle if its combining
    -- class is not 0. Returning nothing leaves the character unchanged.
    local function dotted_circle(ch)
        if export.combining_class(ch) ~= 0 then
            return "◌" .. ch
        end
    end
    -- Return "str" with a dotted circle inserted before every character whose
    -- combining class is not 0.
    function export.add_dotted_circle(str)
        -- string.gsub works on bytes, so match one UTF-8 character at a time:
        -- a leading byte followed by any number of continuation bytes
        -- (\128-\191). The parentheses drop gsub's second return value.
        return (gsub(str, ".[\128-\191]*", dotted_circle))
    end
end
-- Look up the entry for a code point in the "control" data module. Code
-- points without an entry, or with an empty one, yield the string "assigned".
local lookup_control
do
    local function control_or_assigned(codepoint, ccc)
        return ccc or "assigned"
    end
    lookup_control = codepoint_lookup("control", control_or_assigned, "assigned")
end
export.lookup_control = lookup_control
-- True unless the control lookup reports the code point as "unassigned".
function export.is_assigned(codepoint)
    local status = lookup_control(codepoint)
    return status ~= "unassigned"
end
-- Returns two values: whether the control lookup result for the code point is
-- "assigned" or "space-separator", and that lookup result itself.
function export.is_printable(codepoint)
    local status = lookup_control(codepoint)
    local printable = status == "assigned" or status == "space-separator"
    return printable, status
end
-- Returns two values: whether the control lookup result for the code point is
-- "space-separator", and that lookup result itself.
function export.is_whitespace(codepoint)
    local status = lookup_control(codepoint)
    local is_space = status == "space-separator"
    return is_space, status
end
-- Look up the entry for a code point in the "category" data module; code
-- points without an entry yield "Cn".
do
    local function pass_category(codepoint, category)
        return category
    end
    export.lookup_category = codepoint_lookup("category", pass_category, "Cn")
end
-- Return the long name recorded for a General Category value (the kind of
-- value returned by export.lookup_category), or nil if there is none.
-- NOTE(review): assumes the long names are stored in the "long_names" table
-- of the "category" data module, keyed by category value -- confirm against
-- Module:Unicode data/category.
function export.get_category_long_name(category)
    return loader.category.long_names[category]
end
-- Look up the entry for a code point in the "scripts" data module; code
-- points without an entry yield "Zzzz".
do
    local function pass_script(codepoint, script)
        return script
    end
    export.lookup_script = codepoint_lookup("scripts", pass_script, "Zzzz")
end
-- Return the alias recorded for a script value (the kind of value returned by
-- export.lookup_script), or nil if there is none.
-- NOTE(review): assumes the aliases are stored in the "aliases" table of the
-- "scripts" data module, keyed by script value -- confirm against
-- Module:Unicode data/scripts.
function export.get_script_alias(script)
    return loader.scripts.aliases[script]
end
-- Return the page title for the entry on a code point:
-- the override from udata.unsupported_title if there is one for this code
-- point; otherwise nil if the control lookup does not report the code point
-- as "assigned"; otherwise the character itself.
function export.get_entry_title(codepoint)
    local override = udata.unsupported_title[codepoint]
    if override then
        return override
    end
    if lookup_control(codepoint) ~= "assigned" then
        return nil
    end
    return u(codepoint)
end
return export