-- This data submodule defines part of Wiktionary's category structure.
--
-- For an introduction to the poscatboiler system and a description of how to
-- add or modify categories, see Module:category tree/poscatboiler/data/documentation.
-- The three tables returned by the final `return` statement of this module.
-- Entries are added to them by the sections further down.
local raw_categories = {}
local handlers = {}
local raw_handlers = {}
-- String helper module; presumably Unicode-aware counterparts of the `string`
-- library (TODO confirm against Module:string utilities).
local m_str_utils = require("Module:string utilities")
-- Local aliases for functions that are used repeatedly below.
local insert = table.insert
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = m_str_utils.match
-- Unicode normalization helpers (composed / decomposed forms).
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- Records a tracking entry for this module.
-- The given page name is appended to the "poscatboiler-characters/" prefix and
-- passed to the function returned by Module:debug/track.
-- Always returns true.
local function track(page)
	local record = require("Module:debug/track")
	record("poscatboiler-characters/" .. page)
	return true
end
-----------------------------------------------------------------------------
-- --
-- RAW CATEGORIES --
-- --
-----------------------------------------------------------------------------
-- Each raw category is stored under its own category name. Assigning a new
-- table to `raw_categories` itself would discard the previously defined
-- entries and export a table of the wrong shape.
--
-- NOTE(review): the category-name keys were restored from the parent names
-- used by the handlers in this module ("Terms by their individual characters
-- subcategories by language" and "Letters") — confirm against the live
-- category tree.

-- Umbrella category that the per-character umbrella categories are placed in.
raw_categories["Terms by their individual characters subcategories by language"] = {
	description = "Umbrella categories covering terms categorized by unusual characters contained in them.",
	additional = "{{{umbrella_meta_msg}}}",
	parents = {
		"Umbrella metacategories",
		{name = "terms by their individual characters", is_label = true, sort = " "},
	},
}

-- Parent category of the per-letter categories produced by the raw handler.
-- FIXME! This should probably be deleted.
raw_categories["Letters"] = {
	description = "Categories specifying individual letters, containing the languages that use those letters.",
	additional = "{{{umbrella_meta}}}",
	parents = {
		"Fundamental",
	},
}
-----------------------------------------------------------------------------
-- --
-- HANDLERS --
-- --
-----------------------------------------------------------------------------
-- Returns `char` prefixed with a dotted circle (◌) when `combining` is truthy,
-- so that a combining character has a visible base to attach to; otherwise
-- returns `char` unchanged.
local function add_dotted_circle(char, combining)
	if combining then
		return "◌" .. char
	end
	return char
end
-- Handler for labels of the form "terms spelled with CHAR".
--
-- `data` is the handler input; the fields read here are `data.label`,
-- `data.args` and `data.lang` (which may be nil, in which case the language
-- with code "mul" is used).
--
-- Returns nil when the label is not of the expected form. Otherwise returns a
-- category specification table followed by `true` (presumably signalling that
-- the handler consumed `data.args` — TODO confirm against the poscatboiler
-- documentation).
--
-- Raises an error when the title uses a single non-uppercase character whose
-- uppercase form is not accepted by the language's standard characters.
insert(handlers, function(data)
	-- NOTE: The "character" in the title may actually be a description such as
	-- "gershayim". In that case, char= is specified as a parameter indicating the
	-- actual character.
	local titlechar = data.label:match("^terms spelled with (.+)$")
	if not titlechar then
		return nil
	end

	-- The parameter names are the ones read from `args` further down.
	local args = require("Module:parameters").process(data.args, {
		["char"] = true,
		["sort"] = true,
		-- Not sure what used to be done with the following parameters.
		["context"] = true,
		["context2"] = true,
	})
	if args.context or args.context2 then
		track("terms-spelled-with-context")
	end

	-- Titles that name a class of characters rather than a single character.
	-- NOTE(review): the "square brackets" and "angle brackets" keys were
	-- reconstructed from their sort keys — confirm the exact category wording.
	local special_cases = {
		numbers = {
			sort = "#",
			desc = "numeric digits",
		},
		emoji = {
			sort = "⌚", -- the first emoji in our list
		},
		parentheses = {
			sort = "(",
		},
		["square brackets"] = {
			sort = "[",
		},
		["angle brackets"] = {
			sort = "<",
		},
		braces = {
			sort = "{",
		},
	}

	-- Look up the entry for this particular title; testing the table itself
	-- would always succeed and never find `sort`/`desc`.
	local special_case = special_cases[titlechar]
	if special_case then
		local sortkey = args.sort or special_case.sort
		return {
			description = "{{{langname}}} terms spelled with one or more " .. (special_case.desc or titlechar) .. ".",
			parents = {{name = "terms by their individual characters", sort = sortkey }},
			breadcrumb = titlechar,
			umbrella = {
				breadcrumb = titlechar,
				parents = {{name = "Terms by their individual characters subcategories by language", sort = " " .. sortkey }}
			},
		}, true
	end

	local char = args.char or titlechar
	local titlechar_is_desc = args.char and args.char ~= titlechar
	if titlechar_is_desc then
		track("titlechar_is_desc")
	end
	local lang = data.lang or require("Module:languages").getByCode("mul")
	local combining = ulen(char) == 1 and require("Module:Unicode_data").is_combining(char)

	-- Characters whose uppercase form is given explicitly instead of coming
	-- from the default case mapping.
	-- NOTE(review): the keys were reconstructed — "ß" for the capital sharp s,
	-- and the combining iota subscript mapped to itself — confirm.
	local specials = {["ß"] = "ẞ", ["ͅ"] = "ͅ"}
	-- Uppercase the decomposed form one UTF-8 character at a time, then
	-- recompose. Only the first result of gsub (the string) is kept.
	-- NOTE(review): the pattern is the usual Lua 5.1 "one UTF-8 character"
	-- pattern, reconstructed because only the trailing "*" survived — confirm.
	local upper = toNFD(char)
		:gsub("[%z\1-\127\194-\244][\128-\191]*", function(m)
			return specials[m] or m:uupper()
		end)
	upper = toNFC(upper)

	local standard_chars = lang:getStandardCharacters()
	-- Character-set pattern built from the language's standard characters, or
	-- nil/false when the language defines none.
	local standard_set = standard_chars and "[" .. standard_chars .. "]"

	-- FIXME: This should be able to handle non-atomic single characters (e.g. "Q̓").
	if char ~= upper and ulen(char) == 1 then
		-- We want uppercase characters; but unless we're careful, we run into an issue with
		-- the category for dotless "ı" due to the weird behavior of this character,
		-- which has standard "I" as its uppercase equivalent.
		if standard_chars then
			local function err()
				error("Category titles should use uppercase characters: '" .. data.label .. "'", 2)
			end
			if lang:getCode() ~= "hi" and lang:getCode() ~= "lo" then
				if not umatch(standard_chars, upper) then
					err()
				end
			elseif not umatch(upper, standard_set) then
				err()
			end
		end
	end

	-- Compute description.
	local character = require("Module:links").full_link(
		{
			term = char,
			alt = combining and add_dotted_circle(char, true) or nil,
			lang = lang,
			tr = combining and "-" or nil,
		},
		"term"
	)

	-- If the letter has a lowercase form that's also not in the standard characters,
	-- show it. This time, it's dotted "İ" that causes issues, because the lowercase
	-- equivalent is standard "i".
	-- Note that ulower("İ") has a bug where it outputs "i" with a combining dot, instead
	-- of plain "i", so this has to be accounted for.
	-- The extra parentheses drop gsub's second result (the replacement count)
	-- so that only the string is passed to ulower.
	local lower = ulower((char:gsub("İ", "I")))
	if lower ~= char and not (standard_chars and umatch(lower, standard_set)) then
		character = "upper case " .. character .. " or lower case " ..
			require("Module:links").full_link(
				{
					term = lower,
					lang = lang
				},
				"term"
			)
	end

	if titlechar_is_desc then
		character = character .. " (" .. titlechar .. ")"
	end

	local description = "{{{langname}}} terms spelled with " .. character .. "."

	-- Set tagged character for displaytitle and breadcrumb.
	local tagged_titlechar = not titlechar_is_desc and
		require("Module:script utilities").tag_text(titlechar, lang, nil, "term") or nil
	local tagged_char = titlechar_is_desc and titlechar or
		require("Module:script utilities").tag_text(add_dotted_circle(char, combining), lang, nil, "term")

	-- True when `char` consists solely of Han characters.
	-- NOTE(review): the character set inside this pattern was lost; it is rebuilt
	-- here from the Hani script's character data — confirm this is the intended source.
	local han = umatch(char, "^[" .. require("Module:scripts").getByCode("Hani"):getCharacters() .. "]+$")

	-- Make the sortkey. Always use Hani-sortkey for Han characters, as this circumvents any reading-based sortkey methods.
	local sortkey = args.sort or han and require("Module:Hani-sortkey").makeSortKey(char) or lang:makeSortKey(char)
	-- Use the char as a fallback.
	if sortkey == "" then
		sortkey = char
	end

	return {
		description = description,
		-- The following doesn't apply to Sinitic or Japonic, where we categorize all characters.
		additional = not lang:inFamily("zhx", "jpx") and
			"Note that categories of the form '''''LANG terms spelled with CHAR''''' are intended for characters not "
			.. "part of the standard repertoire of a language (e.g. Cyrillic characters in English or Latin characters in Russian)." or nil,
		displaytitle = not titlechar_is_desc and "{{{langname}}} terms spelled with " .. tagged_titlechar or nil,
		parents = {{name = "terms by their individual characters", sort = sortkey }},
		breadcrumb = tagged_char,
		umbrella = {
			displaytitle = not titlechar_is_desc and "Terms spelled with " .. tagged_titlechar .. " by language" or nil,
			breadcrumb = tagged_char,
			parents = {{name = "Terms by their individual characters subcategories by language", sort = " " .. sortkey }}
		},
	}, true
end)
-----------------------------------------------------------------------------
-- --
-- RAW HANDLERS --
-- --
-----------------------------------------------------------------------------
-- Special-cased categories that we allow, for Turkish letters.
-- Built with listToSet so that membership can be tested by indexing with the
-- category name (presumably each listed name maps to true — TODO confirm).
local letter_cat_allow_list = require("Module:table").listToSet {
	"İi",
}

-- Raw handler for letter categories such as "Aa".
--
-- Only recognize cases consisting of an uppercase letter followed by the
-- corresponding lowercase letter, either as the entire category name or
-- followed by a colon. Cases that don't fit this profile (e.g. for other
-- Turkish letter pairs) need to call {{letter cat}} directly. Formerly this
-- handler was much less restrictive and would fire on unrelated category names.
--
-- Reads `data.category`. Returns nil when the category is not recognized,
-- otherwise a category specification table.
insert(raw_handlers, function(data)
	-- The frontier set [:%z] requires the letter pair to be followed by a
	-- colon or by the end of the string.
	-- NOTE(review): the set after %f was reconstructed from the description
	-- above — confirm.
	local upper, lower = umatch(data.category, "^(%u)(%l)%f[:%z]")
	if not upper then
		return nil
	end
	-- Reject pairs whose lowercase letter does not uppercase to the first
	-- letter, unless this exact category is explicitly allowed. The set must
	-- be indexed by the category name: testing the table itself is always
	-- truthy and would disable the check entirely.
	if not letter_cat_allow_list[data.category] and lower:uupper() ~= upper then
		return nil
	end
	return {
		description = ('Languages that use the uppercase letter "%s" (lowercase equivalent "%s").'):format(upper, lower),
		parents = {"Letters"},
	}
end)
-- Export the three tables built above.
return {
	RAW_CATEGORIES = raw_categories,
	HANDLERS = handlers,
	RAW_HANDLERS = raw_handlers,
}