-- A module for collation (alphabetization) that is used by Module:columns and {{sort}}.
-- Contains functions to alphabetize lists of terms.
local export = {}
local require = require
local byte = string.byte
local concat = table.concat
local find = string.find
local get_plaintext = require("Module:utilities").get_plaintext
local match = string.match
local memoize = require("Module:memoize")
local remove = table.remove
local sort = table.sort
local string_sort -- defined below as export.string_sort
local sub = string.sub
local trim = mw.text.trim
local type = type
-- Custom functions for generating a sortkey that will achieve the desired sort
-- order, keyed by language code.
-- Each value is a two-element array:
--   [1] name of the module (without the "Module:" prefix)
--   [2] name of the function exported by that module
local custom_funcs = {
	ahk = { "Mymr-sortkey", "makeSortKey" },
	aio = { "Mymr-sortkey", "makeSortKey" },
	blk = { "Mymr-sortkey", "makeSortKey" },
	egy = { "egy-utilities", "make_sortkey" },
	kac = { "Mymr-sortkey", "makeSortKey" },
	kht = { "Mymr-sortkey", "makeSortKey" },
	ksw = { "Mymr-sortkey", "makeSortKey" },
	kyu = { "Mymr-sortkey", "makeSortKey" },
	-- NOTE(review): the key of this entry was missing; restored as "mkh-mmn",
	-- which fits the alphabetical ordering of the list -- confirm.
	["mkh-mmn"] = { "Mymr-sortkey", "makeSortKey" },
	mnw = { "Mymr-sortkey", "makeSortKey" },
	my = { "Mymr-sortkey", "makeSortKey" },
	phk = { "Mymr-sortkey", "makeSortKey" },
	pwo = { "Mymr-sortkey", "makeSortKey" },
	omx = { "Mymr-sortkey", "makeSortKey" },
	shn = { "Mymr-sortkey", "makeSortKey" },
	tjl = { "Mymr-sortkey", "makeSortKey" },
}
-- Duck-type test: returns true only when `lang` is a table whose
-- `getCanonicalName` field is a function; returns false for anything else.
local function is_lang_object(lang)
	if type(lang) ~= "table" then
		return false
	end
	return type(lang.getCanonicalName) == "function"
end
-- Argument validator: returns true when `func` is a function. Otherwise
-- raises a "bad argument" error naming `funcName` and the argument position
-- `argIdx`; level 2 attributes the error to the caller of this validator.
local function check_function(funcName, argIdx, func)
	local actual_type = type(func)
	if actual_type == "function" then
		return true
	end
	error("bad argument #" .. argIdx .. " to " .. funcName
		.. ": expected function object, got " .. actual_type .. ".", 2)
end
-- Builds a function that maps a list element to its sortkey for `lang`.
--
-- `lang` must provide a `getCode` method and `makeDisplayText` /
-- `makeSortKey` fields that are called as `f(lang, text)`.
-- `make_sortbase` (optional) is applied to each element first, and its result
-- is used in place of the element; a truthy non-function value raises an
-- error via check_function.
--
-- If the language code has an entry in `custom_funcs`, the function named
-- there is loaded and called as `f(text, langcode)` instead of
-- `lang.makeSortKey`.
--
-- The returned function yields exactly one value: the first result of the
-- sortkey function.
local function make_sortkey_func(lang, make_sortbase)
	local langcode = lang:getCode()
	local makeDisplayText = lang.makeDisplayText
	-- Look up the per-language override; nil for most languages.
	local custom_func = custom_funcs[langcode]
	local makeSortKey
	if custom_func then
		-- custom_func is { module name, exported function name }.
		local _makeSortKey = require("Module:" .. custom_func[1])[custom_func[2]]
		-- Adapt to the (lang, text) calling convention of lang.makeSortKey.
		function makeSortKey(_, text)
			return _makeSortKey(text, langcode)
		end
	else
		makeSortKey = lang.makeSortKey
	end
	if make_sortbase then
		check_function("make_sortkey_func", 2, make_sortbase)
		return function(element)
			-- Assigning to a single local keeps only the first return value.
			local display = makeDisplayText(lang, get_plaintext(make_sortbase(element)))
			return (makeSortKey(lang, display))
		end
	end
	return function(element)
		local display = makeDisplayText(lang, get_plaintext(element))
		return (makeSortKey(lang, display))
	end
end
-- When comparing two elements with code points outside the BMP, the less-than
-- operator treats all code points above U+FFFF as equal because of a bug in
-- glibc (see the bug report linked below). Instead, the comparator below
-- compares bytes, which always yields the same result as comparing code
-- points in valid UTF-8 strings.
-- UTF-8-encoded characters that do not belong to the Basic Multilingual Plane
-- (that is, with code points greater than U+FFFF) have byte sequences that
-- begin with the bytes 240 to 244.
--
-- Update 2025-01-10: The < operator also fails with some codepoints in the BMP, seemingly esp. if they are unassigned.
-- See https://sourceware.org/bugzilla/show_bug.cgi?id=21302#c11, quoted here:
--[=[
Carlos O'Donell 2017-10-28 02:26:30 UTC
OK, I have fixed the code-point collation sorting issue.
There are 2 problems:
(a) The collation table builder and thus the weights ignores characters in the collation specification if they do not
exactly match the hash of the symbolic name from the charmap. This is arguably a QoI issue, but it needs an explicit
warning for all UTF-8 locales to catch typos in the collation tables.
(b) Since the UTF-8 charmap uses 4 or 8 character code point names, the collation must also use *identically* matching
symbols or those symbols are silently ignored and have no weights. This is where the Debian and Fedora collations got it
wrong, effectively we have giant ranges of typos (and ellipsis generating typos in the thousands) that do not have
correct weights.
Once I added the new warnings for (a), I could find all the problems with the locale file and fix (b).
To solve this I'm adding a new --warning=missingcollchar warning which I plan to turn on for all locales being compiled
with UTF-8, it will also be turned on by verbose, such that users can see these warnings when developing a locale. We
cannot turn them on by default because it is entirely allowed to have a collation sequence whose characters may not
exist in the charmap you are using, and so can be safely ignored.
After that I'm going to send my C.UTF-8 patch upstream for review so all the distros can have a harmonized C.UTF-8 to
use with correct collation.
]=]
-- So for now I'm making the use of < contingent on there being only ASCII chars in both strings, which seems to be
-- fairly safe.
do
	-- Memoize match with the `simple` flag, which means it should only be used
	-- with fixed additional arguments (in this case, the pattern).
	local sortkey_match = memoize(match, true)

	-- Strict "less than" comparator for two strings, suitable for table.sort.
	-- Returns true if `item1` sorts before `item2`, false otherwise (including
	-- when they are equal).
	function export.string_sort(item1, item2)
		-- Only trust the built-in `<` when both strings are pure ASCII, i.e.
		-- contain no bytes in the range 128-255.
		if sortkey_match(item1, "^[^\128-\255]*$") and sortkey_match(item2, "^[^\128-\255]*$") then
			return item1 < item2
		end
		-- Otherwise compare byte by byte.
		local i = 0
		while true do
			i = i + 1
			local b1, b2 = byte(item1, i, i), byte(item2, i, i)
			if not b1 then
				-- item1 is exhausted: it sorts first only if item2 is longer.
				return b2 and true or false
			elseif b1 ~= b2 then
				-- First differing byte decides; if item2 is exhausted, item1
				-- is the longer string and does not sort first.
				return b2 and b1 < b2 or false
			end
		end
	end
	string_sort = export.string_sort
end
-- Sorts the array `elems` in place.
-- When `lang` is a language object, elements are ordered by the sortkeys
-- produced by make_sortkey_func(lang, make_sortbase), compared with
-- string_sort; sortkeys are memoized so each element is converted once.
-- When `lang` is anything else, falls back to table.sort with its default
-- comparison.
function export.sort(elems, lang, make_sortbase)
	if is_lang_object(lang) then
		local make_sortkey = memoize(make_sortkey_func(lang, make_sortbase), true)
		local function by_sortkey(a, b)
			return string_sort(make_sortkey(a), make_sortkey(b))
		end
		return sort(elems, by_sortkey)
	end
	return sort(elems)
end
-- Template entry point: sorts the positional arguments and returns them
-- joined by `sep` (default "|"). Raises an error unless being substed.
--
-- Arguments are taken from the parent frame when `frame.args.parent` is set,
-- otherwise from `frame.args`. The language comes from the `lang` argument
-- if present; otherwise the first positional argument is removed from the
-- list and used as the language code.
function export.sort_template(frame)
	if not mw.isSubsting() then
		error("This template must be substed.")
	end
	local args
	if frame.args.parent then
		args = frame:getParent().args
	else
		args = frame.args
	end
	local m_table = require("Module:table")
	local elems = m_table.shallowCopy(args)
	local m_languages = require("Module:languages")
	local lang
	if args.lang then
		lang = m_languages.getByCode(args.lang) or m_languages.err(args.lang, "lang")
	else
		local code = remove(elems, 1)
		code = code and trim(code)
		lang = m_languages.getByCode(code) or m_languages.err(code, 1)
	end
	local i = 1
	while true do
		local elem = elems[i]
		-- Drop entries that are empty after trimming; removal shifts the next
		-- entry into slot i, so i is not advanced here.
		while elem do
			elem = trim(elem, "%s")
			if elem ~= "" then
				break
			end
			remove(elems, i)
			elem = elems[i]
		end
		if not elem then
			break
		end
		-- Strip redundant wikilinks: the element starts with "[[", contains no
		-- further "[[", its first "]]" is at the very end, and it has no "|".
		if match(elem, "^()%[%[")
			and not find(elem, "[[", 3, true)
			and find(elem, "]]", 3, true) == #elem - 1
			and not find(elem, "|", 3, true)
		then
			elem = sub(elem, 3, -3)
			elem = trim(elem, "%s")
		end
		elems[i] = elem .. "\n"
		i = i + 1
	end
	elems = m_table.removeDuplicates(elems)
	export.sort(elems, lang)
	return concat(elems, args.sep or "|")
end
return export