Module:collation

Hello, you have come here looking for the meaning of the word Module:collation. In DICTIOUS you will not only get to know all the dictionary meanings for the word Module:collation, but we will also tell you about its etymology, its characteristics and you will know how to say Module:collation in singular and plural. Everything you need to know about the word Module:collation you have here. The definition of the word Module:collation will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of Module:collation, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.

A module for collation (alphabetization) that is used by Module:columns and {{sort}}. Contains functions to alphabetize lists of terms.


local export = {}

local require = require
local byte = string.byte
local concat = table.concat
local find = string.find
local get_plaintext = require("Module:utilities").get_plaintext
local match = string.match
local memoize = require("Module:memoize")
local remove = table.remove
local sort = table.sort
local string_sort -- defined below as export.string_sort
local sub = string.sub
local trim = mw.text.trim
local type = type

-- Custom functions for generating a sortkey that will achieve the desired sort
-- order.
-- name of module and name of exported function
-- Maps a language code to a pair:
--   [1] the name of a module (without the "Module:" prefix), and
--   [2] the name of the sortkey function exported by that module.
-- Languages listed here use that function instead of the language object's
-- own makeSortKey method.
local custom_funcs = {
	ahk = { "Mymr-sortkey", "makeSortKey" },
	aio = { "Mymr-sortkey", "makeSortKey" },
	blk = { "Mymr-sortkey", "makeSortKey" },
	egy = { "egy-utilities", "make_sortkey" },
	kac = { "Mymr-sortkey", "makeSortKey" },
	kht = { "Mymr-sortkey", "makeSortKey" },
	ksw = { "Mymr-sortkey", "makeSortKey" },
	kyu = { "Mymr-sortkey", "makeSortKey" },
	-- NOTE(review): the key of this entry was missing (the line read
	-- ` = { ... }`, which does not parse). A hyphenated code needs the
	-- bracketed-string form, and "mkh-mmn" fits the alphabetical slot between
	-- "kyu" and "mnw" -- confirm against the upstream module.
	["mkh-mmn"] = { "Mymr-sortkey", "makeSortKey" },
	mnw = { "Mymr-sortkey", "makeSortKey" },
	my  = { "Mymr-sortkey", "makeSortKey" },
	phk = { "Mymr-sortkey", "makeSortKey" },
	pwo = { "Mymr-sortkey", "makeSortKey" },
	omx = { "Mymr-sortkey", "makeSortKey" },
	shn = { "Mymr-sortkey", "makeSortKey" },
	tjl = { "Mymr-sortkey", "makeSortKey" },
}

-- Returns true if `lang` is a table whose `getCanonicalName` field is a
-- function, and false otherwise.
local function is_lang_object(lang)
	if type(lang) ~= "table" then
		return false
	end
	local getter = lang.getCanonicalName
	return type(getter) == "function"
end

-- Argument validator: returns true when `func` is a function; otherwise
-- raises an error naming the offending argument index (`argIdx`) and the
-- function being called (`funcName`). The error is raised at level 2, i.e.
-- attributed to the caller of check_function.
local function check_function(funcName, argIdx, func)
	local actual_type = type(func)
	if actual_type == "function" then
		return true
	end
	local message = "bad argument #" .. argIdx .. " to " .. funcName
		.. ": expected function object, got " .. actual_type .. "."
	error(message, 2)
end

-- Builds and returns a function that maps an element to its sortkey in `lang`.
--
-- Parameters:
--   lang           language object; its getCode, makeDisplayText and
--                  makeSortKey fields are used.
--   make_sortbase  optional function applied to each element before it is
--                  converted to plain text. If given and not a function, an
--                  error is raised via check_function.
--
-- The returned function computes
--   makeSortKey(lang, makeDisplayText(lang, get_plaintext(base)))
-- where `base` is make_sortbase(element) if make_sortbase was supplied, and
-- the element itself otherwise. Only the first return value of each call is
-- kept (hence the extra parentheses).
local function make_sortkey_func(lang, make_sortbase)
	local langcode = lang:getCode()
	local makeDisplayText = lang.makeDisplayText
	-- Look up the per-language override. Indexing by the language code matters:
	-- the bare table is always truthy and would force the custom branch for
	-- every language.
	local custom_func = custom_funcs[langcode]

	local makeSortKey
	if custom_func then
		-- custom_func is { module name, exported function name }.
		local _makeSortKey = require("Module:" .. custom_func[1])[custom_func[2]]
		-- Adapt the custom function to the (lang, text) calling convention of
		-- lang.makeSortKey; the custom function takes (text, langcode).
		function makeSortKey(_, text)
			return _makeSortKey(text, langcode)
		end
	else
		makeSortKey = lang.makeSortKey
	end

	if make_sortbase then
		check_function("make_sortkey_func", 2, make_sortbase)
		return function(element)
			return (makeSortKey(
				lang,
				(makeDisplayText(
					lang,
					get_plaintext(make_sortbase(element))
				))
			))
		end
	end

	return function(element)
		return (makeSortKey(
			lang,
			(makeDisplayText(
				lang,
				get_plaintext(element)
			))
		))
	end
end

-- When comparing two elements with code points outside the BMP, the less-than
-- operator treats all code points above U+FFFF as equal because of a bug in
-- glibc (see the glibc bug report cited below). Instead, this compares bytes, which always
-- yields the same result as comparing code points in valid UTF-8 strings.
-- UTF-8-encoded characters that do not belong to the Basic Multilingual Plane
-- (that is, with code points greater than U+FFFF) have byte sequences that
-- begin with the bytes 240 to 244.
--
-- Update 2025-01-10: The < operator also fails with some codepoints in the BMP, seemingly esp. if they are unassigned.
-- See https://sourceware.org/bugzilla/show_bug.cgi?id=21302#c11, quoted here:
--[=[
Carlos O'Donell 2017-10-28 02:26:30 UTC
OK, I have fixed the code-point collation sorting issue.

There are 2 problems:

(a) The collation table builder and thus the weights ignores characters in the collation specification if they do not
exactly match the hash of the symbolic name from the charmap. This is arguably a QoI issue, but it needs an explicit
warning for all UTF-8 locales to catch typos in the collation tables.

(b) Since the UTF-8 charmap uses 4 or 8 character code point names, the collation must also use *identically* matching
symbols or those symbols are silently ignored and have no weights. This is where the Debian and Fedora collations got it
wrong, effectively we have giant ranges of typos (and ellipsis generating typos in the thousands) that do not have
correct weights.

Once I added the new warnings for (a), I could find all the problems with the locale file and fix (b).

To solve this I'm adding a new --warning=missingcollchar warning which I plan to turn on for all locales being compiled
with UTF-8, it will also be turned on by verbose, such that users can see these warnings when developing a locale. We
cannot turn them on by default because it is entirely allowed to have a collation sequence whose characters may not
exist in the charmap you are using, and so can be safely ignored.

After that I'm going to send my C.UTF-8 patch upstream for review so all the distros can have a harmonized C.UTF-8 to
use with correct collation.
]=]

-- So for now I'm making the use of < contingent on there being only ASCII chars in both strings, which seems to be
-- fairly safe.
do
	-- Memoize match with the `simple` flag, which means it should only be used
	-- with fixed additional arguments (in this case, the pattern).
	local sortkey_match = memoize(match, true)

	-- Matches strings made up solely of ASCII bytes (no byte in 128-255), i.e.
	-- strings for which the built-in < operator is considered safe to use.
	local ascii_only = "^[^\128-\255]*$"

	-- Comparison function suitable for table.sort: returns true if `item1`
	-- sorts strictly before `item2`, and false otherwise (including when the
	-- two strings are equal).
	function export.string_sort(item1, item2)
		-- Fast path: use < only when both strings are pure ASCII. An earlier
		-- version excluded only non-BMP lead bytes (240 to 244), but < also
		-- misbehaves for some code points inside the BMP.
		if sortkey_match(item1, ascii_only) and sortkey_match(item2, ascii_only) then
			return item1 < item2
		end
		-- Otherwise compare byte by byte, which gives the same order as
		-- comparing code points in valid UTF-8.
		local i = 0
		while true do
			i = i + 1
			local b1, b2 = byte(item1, i, i), byte(item2, i, i)
			if not b1 then
				-- item1 is exhausted: it sorts first only if item2 is longer.
				return b2 and true or false
			elseif b1 ~= b2 then
				-- First differing position; if item2 is exhausted here, item1
				-- is the longer string and does not sort first.
				return b2 and b1 < b2 or false
			end
		end
	end
	string_sort = export.string_sort
end

-- Sorts the list `elems` in place.
--
-- If `lang` is not a language object, the list is sorted with the default
-- table.sort ordering. Otherwise each element is converted to a sortkey for
-- `lang` (optionally passing it through `make_sortbase` first) and the
-- sortkeys are compared with string_sort. Sortkey computation is memoized.
function export.sort(elems, lang, make_sortbase)
	if is_lang_object(lang) then
		local sortkey_of = memoize(make_sortkey_func(lang, make_sortbase), true)

		local function by_sortkey(left, right)
			return string_sort(sortkey_of(left), sortkey_of(right))
		end

		return sort(elems, by_sortkey)
	end

	return sort(elems)
end

-- Template entry point: sorts the positional arguments for a language and
-- returns them joined into one string.
--
-- Raises an error unless the invocation is being substed.
--
-- Arguments are read from the parent frame if `frame.args.parent` is set,
-- otherwise from `frame.args`:
--   lang  language code; if absent, the first positional argument is removed
--         from the list and used as the language code instead.
--   sep   separator used when joining the result (defaults to "|").
--   1..n  the terms to sort.
--
-- Each term is trimmed, empty terms are dropped, a wikilink that wraps the
-- whole term (with no pipe and no nested brackets) is unwrapped, and a newline
-- is appended. Duplicates are then removed and the list is sorted.
function export.sort_template(frame)
	if not mw.isSubsting() then
		error("This template must be substed.")
	end
	
	local args
	if frame.args.parent then
		args = frame:getParent().args
	else
		args = frame.args
	end
	
	local m_table = require("Module:table")
	local elems = m_table.shallowCopy(args)
	local m_languages = require("Module:languages")
	local lang
	if args.lang then
		lang = m_languages.getByCode(args.lang) or m_languages.err(args.lang, "lang")
	else
		local code = remove(elems, 1)
		code = code and trim(code)
		lang = m_languages.getByCode(code) or m_languages.err(code, 1)
	end
	
	local i = 1
	while true do
		-- Read the element at the current position; the list itself must not be
		-- passed to trim or overwritten.
		local elem = elems[i]
		while elem do
			elem = trim(elem, "%s")
			if elem ~= "" then
				break
			end
			-- Drop the empty element; the next one shifts down into slot i.
			remove(elems, i)
			elem = elems[i]
		end
		if not elem then
			break
		elseif not ( -- Strip redundant wikilinks.
			not match(elem, "^()%[%[") or
			find(elem, "[[", 3, true) or
			find(elem, "]]", 3, true) ~= #elem - 1 or
			find(elem, "|", 3, true)
		) then
			-- The term is exactly "[[...]]" with no pipe and no further
			-- brackets inside: keep only the link target.
			elem = sub(elem, 3, -3)
			elem = trim(elem, "%s")
		end
		elems[i] = elem .. "\n"
		i = i + 1
	end
	
	elems = m_table.removeDuplicates(elems)
	export.sort(elems, lang)
	
	return concat(elems, args.sep or "|")
end

return export