local export = {}
local concat = table.concat
local get_etym_lang = require("Module:etymology languages").getByCanonicalName
local gsub = mw.ustring.gsub
local insert = table.insert
local set = require("Module:utilities/set")
local split = mw.text.split
local trim = mw.text.trim
local type_or_class = require("Module:parser").type_or_class
local u = require("Module:string/char")
-- Convert a numeric list of characters and ranges to the equivalent Lua pattern content (no surrounding brackets).
-- Each entry of `ranges` is either a single codepoint (emitted as that character) or a {first, last} table
-- (emitted as "first-last", a pattern range). WARNING: This destructively modifies the contents of `ranges`.
local function char_ranges_to_pattern(ranges)
	for j, range in ipairs(ranges) do
		if type(range) == "table" then
			-- Convert each endpoint codepoint to its character, then join
			-- the endpoints with "-" to form a character-class range.
			for k, char in ipairs(range) do
				range[k] = u(char)
			end
			ranges[j] = table.concat(range, "-")
		else
			ranges[j] = u(range)
		end
	end
	return table.concat(ranges)
end
-- Combining character data used when categorising unusual characters. These resolve into two patterns, used to find
-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
-- diacritic(s) + character).
-- Each entry is either a single codepoint or a {first, last} inclusive range;
-- char_ranges_to_pattern() later converts each list into pattern-class content.
local comb_chars = {
-- Single combining characters (attach to one preceding base character).
single = {
{0x0300, 0x034E},
-- Exclude combining grapheme joiner.
{0x0350, 0x035B},
{0x0363, 0x036F},
{0x0483, 0x0489},
{0x0591, 0x05BD},
0x05BF,
{0x05C1, 0x05C2},
{0x05C4, 0x05C5},
0x05C7,
{0x0610, 0x061A},
{0x064B, 0x065F},
0x0670,
{0x06D6, 0x06DC},
{0x06DF, 0x06E4},
{0x06E7, 0x06E8},
{0x06EA, 0x06ED},
0x0711,
{0x0730, 0x074A},
{0x07A6, 0x07B0},
{0x07EB, 0x07F3},
0x07FD,
{0x0816, 0x0819},
{0x081B, 0x0823},
{0x0825, 0x0827},
{0x0829, 0x082D},
{0x0859, 0x085B},
{0x0898, 0x089F},
{0x08CA, 0x08E1},
{0x08E3, 0x0903},
{0x093A, 0x093C},
{0x093E, 0x094F},
{0x0951, 0x0957},
{0x0962, 0x0963},
{0x0981, 0x0983},
0x09BC,
{0x09BE, 0x09C4},
{0x09C7, 0x09C8},
{0x09CB, 0x09CD},
0x09D7,
{0x09E2, 0x09E3},
0x09FE,
{0x0A01, 0x0A03},
0x0A3C,
{0x0A3E, 0x0A42},
{0x0A47, 0x0A48},
{0x0A4B, 0x0A4D},
0x0A51,
{0x0A70, 0x0A71},
0x0A75,
{0x0A81, 0x0A83},
0x0ABC,
{0x0ABE, 0x0AC5},
{0x0AC7, 0x0AC9},
{0x0ACB, 0x0ACD},
{0x0AE2, 0x0AE3},
{0x0AFA, 0x0AFF},
{0x0B01, 0x0B03},
0x0B3C,
{0x0B3E, 0x0B44},
{0x0B47, 0x0B48},
{0x0B4B, 0x0B4D},
{0x0B55, 0x0B57},
{0x0B62, 0x0B63},
0x0B82,
{0x0BBE, 0x0BC2},
{0x0BC6, 0x0BC8},
{0x0BCA, 0x0BCD},
0x0BD7,
{0x0C00, 0x0C04},
0x0C3C,
{0x0C3E, 0x0C44},
{0x0C46, 0x0C48},
{0x0C4A, 0x0C4D},
{0x0C55, 0x0C56},
{0x0C62, 0x0C63},
{0x0C81, 0x0C83},
0x0CBC,
{0x0CBE, 0x0CC4},
{0x0CC6, 0x0CC8},
{0x0CCA, 0x0CCD},
{0x0CD5, 0x0CD6},
{0x0CE2, 0x0CE3},
0x0CF3,
{0x0D00, 0x0D03},
{0x0D3B, 0x0D3C},
{0x0D3E, 0x0D44},
{0x0D46, 0x0D48},
{0x0D4A, 0x0D4D},
0x0D57,
{0x0D62, 0x0D63},
{0x0D81, 0x0D83},
0x0DCA,
{0x0DCF, 0x0DD4},
0x0DD6,
{0x0DD8, 0x0DDF},
{0x0DF2, 0x0DF3},
0x0E31,
{0x0E34, 0x0E3A},
{0x0E47, 0x0E4E},
0x0EB1,
{0x0EB4, 0x0EBC},
{0x0EC8, 0x0ECE},
{0x0F18, 0x0F19},
0x0F35,
0x0F37,
0x0F39,
{0x0F3E, 0x0F3F},
{0x0F71, 0x0F84},
{0x0F86, 0x0F87},
{0x0F8D, 0x0F97},
{0x0F99, 0x0FBC},
0x0FC6,
{0x102B, 0x103E},
{0x1056, 0x1059},
{0x105E, 0x1060},
{0x1062, 0x1064},
{0x1067, 0x106D},
{0x1071, 0x1074},
{0x1082, 0x108D},
0x108F,
{0x109A, 0x109D},
{0x135D, 0x135F},
{0x1712, 0x1715},
{0x1732, 0x1734},
{0x1752, 0x1753},
{0x1772, 0x1773},
{0x17B4, 0x17D3},
0x17DD,
-- Exclude Mongolian variation selectors.
{0x1885, 0x1886},
0x18A9,
{0x1920, 0x192B},
{0x1930, 0x193B},
{0x1A17, 0x1A1B},
{0x1A55, 0x1A5E},
{0x1A60, 0x1A7C},
0x1A7F,
{0x1AB0, 0x1ACE},
{0x1B00, 0x1B04},
{0x1B34, 0x1B44},
{0x1B6B, 0x1B73},
{0x1B80, 0x1B82},
{0x1BA1, 0x1BAD},
{0x1BE6, 0x1BF3},
{0x1C24, 0x1C37},
{0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8},
0x1CED,
0x1CF4,
{0x1CF7, 0x1CF9},
{0x1DC0, 0x1DCC},
{0x1DCE, 0x1DFB},
{0x1DFD, 0x1DFF},
{0x20D0, 0x20F0},
{0x2CEF, 0x2CF1},
0x2D7F,
{0x2DE0, 0x2DFF},
{0x302A, 0x302F},
{0x3099, 0x309A},
{0xA66F, 0xA672},
{0xA674, 0xA67D},
{0xA69E, 0xA69F},
{0xA6F0, 0xA6F1},
0xA802,
0xA806,
0xA80B,
{0xA823, 0xA827},
0xA82C,
{0xA880, 0xA881},
{0xA8B4, 0xA8C5},
{0xA8E0, 0xA8F1},
0xA8FF,
{0xA926, 0xA92D},
{0xA947, 0xA953},
{0xA980, 0xA983},
{0xA9B3, 0xA9C0},
0xA9E5,
{0xAA29, 0xAA36},
0xAA43,
{0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D},
0xAAB0,
{0xAAB2, 0xAAB4},
{0xAAB7, 0xAAB8},
{0xAABE, 0xAABF},
0xAAC1,
{0xAAEB, 0xAAEF},
{0xAAF5, 0xAAF6},
{0xABE3, 0xABEA},
{0xABEC, 0xABED},
0xFB1E,
{0xFE20, 0xFE2F},
0x101FD,
0x102E0,
{0x10376, 0x1037A},
{0x10A01, 0x10A03},
{0x10A05, 0x10A06},
{0x10A0C, 0x10A0F},
{0x10A38, 0x10A3A},
0x10A3F,
{0x10AE5, 0x10AE6},
{0x10D24, 0x10D27},
{0x10EAB, 0x10EAC},
{0x10EFD, 0x10EFF},
{0x10F46, 0x10F50},
{0x10F82, 0x10F85},
{0x11000, 0x11002},
{0x11038, 0x11046},
0x11070,
{0x11073, 0x11074},
{0x1107F, 0x11082},
{0x110B0, 0x110BA},
0x110C2,
{0x11100, 0x11102},
{0x11127, 0x11134},
{0x11145, 0x11146},
0x11173,
{0x11180, 0x11182},
{0x111B3, 0x111C0},
{0x111C9, 0x111CC},
{0x111CE, 0x111CF},
{0x1122C, 0x11237},
0x1123E,
0x11241,
{0x112DF, 0x112EA},
{0x11300, 0x11303},
{0x1133B, 0x1133C},
{0x1133E, 0x11344},
{0x11347, 0x11348},
{0x1134B, 0x1134D},
0x11357,
{0x11362, 0x11363},
{0x11366, 0x1136C},
{0x11370, 0x11374},
{0x11435, 0x11446},
0x1145E,
{0x114B0, 0x114C3},
{0x115AF, 0x115B5},
{0x115B8, 0x115C0},
{0x115DC, 0x115DD},
{0x11630, 0x11640},
{0x116AB, 0x116B7},
{0x1171D, 0x1172B},
{0x1182C, 0x1183A},
{0x11930, 0x11935},
{0x11937, 0x11938},
{0x1193B, 0x1193E},
0x11940,
{0x11942, 0x11943},
{0x119D1, 0x119D7},
{0x119DA, 0x119E0},
0x119E4,
{0x11A01, 0x11A0A},
{0x11A33, 0x11A39},
{0x11A3B, 0x11A3E},
0x11A47,
{0x11A51, 0x11A5B},
{0x11A8A, 0x11A99},
{0x11C2F, 0x11C36},
{0x11C38, 0x11C3F},
{0x11C92, 0x11CA7},
{0x11CA9, 0x11CB6},
{0x11D31, 0x11D36},
0x11D3A,
{0x11D3C, 0x11D3D},
{0x11D3F, 0x11D45},
0x11D47,
{0x11D8A, 0x11D8E},
{0x11D90, 0x11D91},
{0x11D93, 0x11D97},
{0x11EF3, 0x11EF6},
{0x11F00, 0x11F01},
0x11F03,
{0x11F34, 0x11F3A},
{0x11F3E, 0x11F42},
0x13440,
{0x13447, 0x13455},
{0x16AF0, 0x16AF4},
{0x16B30, 0x16B36},
0x16F4F,
{0x16F51, 0x16F87},
{0x16F8F, 0x16F92},
-- Exclude Khitan Small Script filler.
{0x16FF0, 0x16FF1},
{0x1BC9D, 0x1BC9E},
{0x1CF00, 0x1CF2D},
{0x1CF30, 0x1CF46},
{0x1D165, 0x1D169},
{0x1D16D, 0x1D172},
{0x1D17B, 0x1D182},
{0x1D185, 0x1D18B},
{0x1D1AA, 0x1D1AD},
{0x1D242, 0x1D244},
{0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C},
0x1DA75,
0x1DA84,
{0x1DA9B, 0x1DA9F},
{0x1DAA1, 0x1DAAF},
{0x1E000, 0x1E006},
{0x1E008, 0x1E018},
{0x1E01B, 0x1E021},
{0x1E023, 0x1E024},
{0x1E026, 0x1E02A},
0x1E08F,
{0x1E130, 0x1E136},
0x1E2AE,
{0x1E2EC, 0x1E2EF},
{0x1E4EC, 0x1E4EF},
{0x1E8D0, 0x1E8D6},
{0x1E944, 0x1E94A},
},
-- Double combining characters (span two base characters, e.g. U+035C-U+0362).
double = {
{0x035C, 0x0362},
0x1DCD,
0x1DFC,
},
vs = { -- variation selectors; separated out so that we don't get categories for them
{0xFE00, 0xFE0F},
{0xE0100, 0xE01EF},
}
}
-- Convert each codepoint list into the content of a Lua character class.
for key, charset in pairs(comb_chars) do
	comb_chars[key] = char_ranges_to_pattern(charset)
end
-- Concatenation of all three classes (see docstring: FIXME, should be `all`).
comb_chars.both = comb_chars.single .. comb_chars.double .. comb_chars.vs
-- NOTE(review): the class contents of these patterns were lost; they have been
-- reconstructed from the documented contract below — confirm against history.
-- A spacing (non-combining) character followed by one or more single combining
-- characters/variation selectors; %f ends the match at the next non-combining char.
comb_chars.combined_single = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. comb_chars.vs .. "]+%f[^" .. comb_chars.single .. comb_chars.vs .. "]"
-- Two spacing characters joined by one or more double combining characters,
-- possibly also carrying single combining characters/variation selectors.
comb_chars.combined_double = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. comb_chars.vs .. "]*[" .. comb_chars.double .. "]+[" .. comb_chars.single .. comb_chars.vs .. "]*.[" .. comb_chars.both .. "]*"
-- Bracketed (ready-to-use) classes matching one combining character.
comb_chars.diacritics_single = "[" .. comb_chars.single .. "]"
comb_chars.diacritics_double = "[" .. comb_chars.double .. "]"
-- From https://unicode.org/Public/emoji/15.1/emoji-sequences.txt
-- Codepoint list (single codepoints or {first, last} ranges) of emoji,
-- converted below into the content of a Lua character class.
local emoji_chars = {
{0x231A, 0x231B}, -- watch..hourglass done # E0.6 (⌚..⌛)
{0x23E9, 0x23EC}, -- fast-forward button..fast down button # E0.6 (⏩..⏬)
0x23F0, -- alarm clock # E0.6 (⏰)
0x23F3, -- hourglass not done # E0.6 (⏳)
{0x25FD, 0x25FE}, -- white medium-small square..black medium-small square # E0.6 (◽..◾)
{0x2614, 0x2615}, -- umbrella with rain drops..hot beverage # E0.6 (☔..☕)
{0x2648, 0x2653}, -- Aries..Pisces # E0.6 (♈..♓)
0x267F, -- wheelchair symbol # E0.6 (♿)
0x2693, -- anchor # E0.6 (⚓)
0x26A1, -- high voltage # E0.6 (⚡)
{0x26AA, 0x26AB}, -- white circle..black circle # E0.6 (⚪..⚫)
{0x26BD, 0x26BE}, -- soccer ball..baseball # E0.6 (⚽..⚾)
{0x26C4, 0x26C5}, -- snowman without snow..sun behind cloud # E0.6 (⛄..⛅)
0x26CE, -- Ophiuchus # E0.6 (⛎)
0x26D4, -- no entry # E0.6 (⛔)
0x26EA, -- church # E0.6 (⛪)
{0x26F2, 0x26F3}, -- fountain..flag in hole # E0.6 (⛲..⛳)
0x26F5, -- sailboat # E0.6 (⛵)
0x26FA, -- tent # E0.6 (⛺)
0x26FD, -- fuel pump # E0.6 (⛽)
0x2705, -- check mark button # E0.6 (✅)
{0x270A, 0x270B}, -- raised fist..raised hand # E0.6 (✊..✋)
0x2728, -- sparkles # E0.6 (✨)
0x274C, -- cross mark # E0.6 (❌)
0x274E, -- cross mark button # E0.6 (❎)
{0x2753, 0x2755}, -- red question mark..white exclamation mark # E0.6 (❓..❕)
0x2757, -- red exclamation mark # E0.6 (❗)
{0x2795, 0x2797}, -- plus..divide # E0.6 (➕..➗)
0x27B0, -- curly loop # E0.6 (➰)
0x27BF, -- double curly loop # E1.0 (➿)
{0x2B1B, 0x2B1C}, -- black large square..white large square # E0.6 (⬛..⬜)
0x2B50, -- star # E0.6 (⭐)
0x2B55, -- hollow red circle # E0.6 (⭕)
{0x1F300, 0x1FAFF}, -- emoji in Plane 1
-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
-- (yet?) handle.
}
-- Destructively convert the list into pattern-class content (a string).
emoji_chars = char_ranges_to_pattern(emoji_chars)
-- Get the table of unsupported characters and invert it, so the keys are the
-- `...` escape sequences and the values are the literal characters. Used below
-- to decode escapes in "Unsupported titles/" pagenames.
local unsupported_characters = {}
for k, v in pairs(require("Module:links/data").unsupported_characters) do
	unsupported_characters[v] = k
end
-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
-- Invert the unsupported-titles map so the keys are pagenames and the values
-- are canonical titles (see the comment above).
local unsupported_titles = {}
for k, v in pairs(require("Module:links/data").unsupported_titles) do
	unsupported_titles[v] = k
end
--[==[
Given a pagename (or {nil} for the current page), create and return a data structure describing the page. The returned
object includes the following fields:
* `comb_chars`: A table containing various Lua character class patterns for different types of combined characters
(those that decompose into multiple characters in the NFD decomposition). The patterns are meant to be used with
{mw.ustring.find()}. The keys are:
** `single`: Single combining characters (character + diacritic), without surrounding brackets;
** `double`: Double combining characters (character + diacritic + character), without surrounding brackets;
** `vs`: Variation selectors, without surrounding brackets;
** `both`: Concatenation of `single` + `double` + `vs` (FIXME: should be named `all`), without surrounding brackets;
** `diacritics_single`: Like `single` but with surrounding brackets;
** `diacritics_double`: Like `double` but with surrounding brackets;
** `combined_single`: Lua pattern for matching a spacing character followed by one or more single combining characters;
** `combined_double`: Lua pattern for matching a combination of two spacing characters separated by one or more double
combining characters, possibly also with single combining characters;
* `emoji_pattern`: A Lua character class pattern (including surrounding brackets) that matches emojis. Meant to be used
with {mw.ustring.find()}.
* `unsupported_titles`: Map from pagenames to canonical titles for unsupported-title pages.
* `namespace`: Namespace of the pagename.
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
including the namespace and the root (portion before the slash).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
are treated as single characters.
* `explode_pagename`: Set of characters found in `pagename`. The keys are characters (where combinations of spacing
character + decomposed diacritic are treated as single characters).
* `encoded_pagename`: FIXME: Document me.
* `pagename_defaultsort`: FIXME: Document me.
* `raw_defaultsort`: FIXME: Document me.
* `page_L2s`: FIXME: Document me.
* `pagename_defaultsort_conflict`: FIXME: Document me.
* `pagename_displaytitle_conflict`: FIXME: Document me.
* `unsupported_title`: FIXME: Document me.
* `tab_characters`: FIXME: Document me.
* `unencoded_char`: FIXME: Document me.
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
* `raw_sortkey`: FIXME: Document me.
]==]
function export.process_page(pagename)
	-- Build and return the page-description structure documented above.
	local data = {}
	data.comb_chars = comb_chars
	-- Bracketed character class matching any emoji, for mw.ustring.find().
	data.emoji_pattern = "[" .. emoji_chars .. "]"
	data.unsupported_titles = unsupported_titles
	-- We cannot store `raw_title` in `data` because it contains a metatable.
	local raw_title
	if pagename then -- for testing, doc pages, etc.
		raw_title = mw.title.new(pagename)
		if not raw_title then
			error(("Bad value for `data.pagename`: '%s'"):format(pagename))
		end
	else
		raw_title = mw.title.getCurrentTitle()
	end
	data.namespace = raw_title.nsText
	data.full_raw_pagename = raw_title.fullText
	local frame = mw.getCurrentFrame()
	local content = raw_title:getContent()
	local content_lang = mw.getContentLanguage()
	-- Get the pagename, canonicalizing unsupported titles: look the subpage up
	-- in the inverted unsupported-titles table, or else decode any `...`
	-- escapes via the inverted unsupported-characters table.
	pagename = raw_title.subpageText
		:gsub("^Unsupported titles/(.*)", function(m)
			data.unsupported_title = true
			-- Extra parentheses truncate gsub's two return values to one.
			return unsupported_titles[m] or (m:gsub("`.-`", unsupported_characters))
		end)
	-- Save pagename, as local variable will be destructively modified.
	data.pagename = pagename
	-- Decompose the pagename in Unicode normalization form D.
	data.decompose_pagename = mw.ustring.toNFD(pagename)
	-- Explode the current page name into a character table, taking decomposed combining characters into account:
	-- a base character plus its combining sequence counts as one "character".
	local explode_pagename = {}
	local pagename_len = 0
	local function explode(char)
		explode_pagename[char] = true
		pagename_len = pagename_len + 1
		return ""
	end
	pagename = gsub(pagename, comb_chars.combined_double, explode)
	-- After removing combined sequences, consume each remaining UTF-8 character
	-- byte-wise (lead byte followed by continuation bytes), so string.gsub is safe.
	pagename = gsub(pagename, comb_chars.combined_single, explode)
		:gsub("[\1-\127\194-\244][\128-\191]*", explode)
	data.explode_pagename = explode_pagename
	data.pagename_len = pagename_len
	-- Generate DEFAULTSORT.
	data.encoded_pagename = require("Module:string/encode entities")(data.pagename, nil, true)
	data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
	frame:callParserFunction(
		"DEFAULTSORT",
		data.pagename_defaultsort
	)
	data.raw_defaultsort = raw_title.text:uupper()
	-- Get section numbers for the page, and note raw wikitext use of {{DEFAULTSORT:}} and {{DISPLAYTITLE:}}.
	-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
	do
		local page_L2s, defaultsort, displaytitle = {}
		local function iterate(node)
			local node_type = type_or_class(node)
			if node_type == "heading" and node.level == 2 then
				local name = node:get_name()
				-- Headings containing raw newlines are not valid L2 sections.
				if name:find("\n", 1, true) then
					return
				end
				page_L2s[#page_L2s + 1] = name
			elseif node_type == "template" and not (defaultsort and displaytitle) then
				local name = node:get_name()
				if name == "DEFAULTSORT" then
					defaultsort = frame:expandTemplate{
						title = "tracking category",
						args = {"Pages with DEFAULTSORT conflicts"}
					}
				elseif name == "DISPLAYTITLE" then
					displaytitle = frame:expandTemplate{
						title = "tracking category",
						args = {"Pages with DISPLAYTITLE conflicts"}
					}
				end
			end
		end
		for node in require("Module:template parser").parse(content):__pairs("next_node") do
			iterate(node)
		end
		data.page_L2s = page_L2s
		data.pagename_defaultsort_conflict = defaultsort
		data.pagename_displaytitle_conflict = displaytitle
	end
	------ 4. Parse page for maintenance categories. ------
	-- Use of tab characters.
	if content:find("\t") then
		data.tab_characters = frame:expandTemplate{
			title = "tracking category",
			args = {"Pages with tab characters"}
		}
	end
	-- Unencoded character(s) in title: flag ideographic description characters
	-- (U+2FF0-U+2FFF, U+31EF), unless the page consists of just that character.
	-- NOTE(review): the last five characters were lost in this set and have been
	-- restored as U+2FFC-U+2FFF and U+31EF — confirm against history.
	local IDS = set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"}
	for char in pairs(explode_pagename) do
		if IDS[char] and char ~= data.pagename then
			data.unencoded_char = true
			break
		end
	end
	-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
	do
		-- All chars treated as spaces in links (including categories).
		local spaces = " _" ..
			"\194\160" .. -- U+00A0 no-break space
			"\225\154\128" .. -- U+1680 ogham space mark
			"\225\160\142" .. -- U+180E Mongolian vowel separator
			"\226\128\128-\226\128\138" .. -- U+2000-U+200A spaces
			"\226\128\168" .. -- U+2028 line separator
			"\226\128\169" .. -- U+2029 paragraph separator
			"\226\128\175" .. -- U+202F narrow no-break space
			"\226\129\159" .. -- U+205F medium mathematical space
			"\227\128\128" -- U+3000 ideographic space
		local wikitext_topic_cat = {}
		local wikitext_langname_cat = {}
		local raw_sortkey
		local langnames = mw.loadData("Module:languages/canonical names")
		local etym_langnames = mw.loadData("Module:etymology languages/canonical names")
		-- If a raw sortkey has been found, add it to the relevant table.
		-- If there's no table (or the index is just `true`), create one first.
		local function add_cat_table(marker, sortkey, tbl)
			if not sortkey then
				tbl[marker] = tbl[marker] or true
				return true
			elseif type(tbl[marker]) ~= "table" then
				tbl[marker] = {}
			end
			insert(tbl[marker], sortkey)
			return true
		end
		-- Try to interpret `name` as a full or etymology-only language name;
		-- on success record the category and return true.
		local function do_iteration(name, sortkey, wikitext_langname_cat)
			if langnames[name] then
				return add_cat_table(name, sortkey, wikitext_langname_cat)
			end
			name = etym_langnames[name] and name or content_lang:lcfirst(name)
			if etym_langnames[name] then
				name = get_etym_lang(name):getFullName()
				return add_cat_table(name, sortkey, wikitext_langname_cat)
			end
		end
		local function process_category(cat)
			cat = trim(cat, spaces)
			-- A language-code prefix (e.g. "en:") marks a topic category.
			local code = cat:match("^([%w%-.]+):")
			local sortkey = cat:match("|(.*)")
			if sortkey then
				raw_sortkey = raw_sortkey or frame:expandTemplate{
					title = "tracking category",
					args = {"Pages with raw sortkeys"}
				}
			end
			if code then
				return add_cat_table(code, sortkey, wikitext_topic_cat)
			end
			-- Remove sortkey and split by word.
			cat = split(cat:gsub("|.*", ""), "[" .. spaces .. "]+")
			-- Iterate over the category name, starting with the longest possible name and shaving off the first word until we find one. We do it this way because:
			-- (a) Going from shortest to longest risks falsely matching (e.g.) German Low German categories as German.
			-- (b) Checking the start of category names first risks falsely match (e.g.) Alsatian French as Alsatian (a variety of Alemannic German), not French.
			-- If no matches are found, then check the start of the category name, shaving off the last word each iteration.
			local cat_len = #cat
			local n, name, done = 1
			repeat
				name = concat(cat, " ", n, cat_len)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n + 1
			until n > cat_len
			n = cat_len - 1
			if n <= 0 then
				return
			end
			repeat
				name = concat(cat, " ", 1, n)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n - 1
			until n == 0
		end
		-- Scan [[Category:...]]/[[cat:...]] links: strip HTML comments, turn the
		-- link brackets into \1/\2 sentinels, then capture prefix and body.
		-- NOTE(review): the capture classes were reconstructed — confirm against history.
		for prefix, cat in content:gsub("<!%-%-.-%-%->", "")
			:gsub("<!%-%-.*", "")
			:gsub("%[%[", "\1")
			:gsub("]]", "\2")
			:gmatch("\1([^\1\2]-):([^\1\2]-)\2") do
			prefix = trim(prefix, spaces):lower()
			if prefix == "cat" or prefix == "category" then
				process_category(cat)
			end
		end
		data.wikitext_topic_cat = wikitext_topic_cat
		data.wikitext_langname_cat = wikitext_langname_cat
		data.raw_sortkey = raw_sortkey
	end
	return data
end
return export