-- Module:User:Theknightwho/Jpan-sortkey
--
-- This module sandbox lacks a documentation subpage. Please create it.
-- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
-- Module table; functions are attached to it below and it is returned at the
-- end of the file.
local export = {}

-- Data from Module:ja/data/range, loaded once and shared by the four pattern
-- locals below.
local range_data = mw.loadData("Module:ja/data/range")
local kanji_pattern = range_data.kanji
local ideograph_pattern = range_data.ideograph
local kana_graph_pattern = range_data.kana_graph
local latin_pattern = range_data.latin

-- Local aliases for the Unicode-aware string functions used in this module.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char

-- Byte-level Lua pattern for exactly one UTF-8 encoded character: one lead byte
-- (NUL, ASCII, or 0xC2-0xF4) followed by any continuation bytes (0x80-0xBF).
-- It is spliced into patterns run with the byte-oriented string library, where
-- "." would match a single byte rather than a whole character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

--- Builds a sort key from a string, without looking anything up on other pages.
-- Steps, in order:
--   1. convert katakana to hiragana (Module:ja `kata_to_hira`) and decompose to
--      NFD, so that dakuten (U+3099) and handakuten (U+309A) become separate
--      combining characters;
--   2. if the first character carries one of those marks, drop the mark and
--      append one or two apostrophes to the end of the string instead;
--   3. replace each long vowel mark "ー" with the vowel of the kana before it;
--   4. pass the result to Module:Hani-sortkey's `makeSortKey`.
-- @param text  string to convert
-- @param lang  language code; concatenated into the tracking key and forwarded
-- @param sc    forwarded unchanged to Module:Hani-sortkey
-- @return      whatever Module:Hani-sortkey's `makeSortKey` returns
function export.sortkey_from_string(text, lang, sc)
	text = toNFD(require("Module:ja").kata_to_hira(text))

	-- If the first character has dakuten, replace it with the corresponding character without dakuten and add an apostrophe to the end, e.g. がす > かす'
	-- (UTF8_char is expected to match one whole UTF-8 character, since this is a byte-level gsub.)
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x3099) .. "(.*)", "%1%2'")
	-- Similar thing, but with handakuten and two apostrophes, e.g. ぱす -> はす''
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x309A) .. "(.*)", "%1%2''")

	-- Replace the long vowel mark with the vowel that it stands for
	if text:find("ー", 1, true) then
		-- vowel_rows[i] lists the kana whose vowel is vowels[i].
		local vowel_rows = {
			"あぁかさたなはまやゃらわ",
			"いぃきしちにひみり",
			"うぅくすつぬふむゆゅる",
			"えぇけせてねへめれ",
			"おぉこそとのほもよょろ",
			"ん"
		}
		local vowels = {"あ", "い", "う", "え", "お", "ん"}
		-- After NFD a voiced kana is base kana + combining mark, so an optional
		-- mark is allowed between the kana and the long vowel mark.
		local voicing_marks = u(0x3099) .. u(0x309A)
		for i, row in ipairs(vowel_rows) do
			text = gsub(text, "([" .. row .. "][" .. voicing_marks .. "]?)ー", "%1" .. vowels[i])
		end
	end

	-- NOTE(review): the pattern here is empty as written, so it does not select
	-- any particular characters; a character class appears to be missing from
	-- this copy. Left unchanged - confirm the intended pattern against the
	-- upstream module.
	text = gsub(text, "", " ")

	local ret = require("Module:Hani-sortkey").makeSortKey(text, lang, sc)

	-- Record a tracking entry whenever Module:Hani-sortkey changed the string.
	if ret ~= text then
		require("Module:debug/track"){"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end

	return ret
end

-- Entry point: derives a sort key for `text` in the language with code `lang`.
-- While `lang` is not "mul", `seen_pages` is falsy, and `text` still matches
-- the pattern tested in the loop at the bottom, `text` is replaced by whatever
-- scrape_page() returns for the page titled `text`. The final `text` is then
-- handed to export.sortkey_from_string() together with `lang` and `sc`.
--
-- NOTE(review): many expressions in this function look truncated: table
-- constructors with no keys, empty patterns, and whole tables used where an
-- indexed element would be expected. It reads as if bracketed text (`[...]`)
-- was lost when this copy was made. The notes below mark the affected spots;
-- compare against the upstream module before relying on any of them.
function export.makeSortKey(text, lang, sc)
	-- Canonical name of the language, used below to find its heading on a page.
	local langname = require("Module:languages").getByCode(lang):getCanonicalName()
	local seen_pages = {}
	-- Never assigned in this scope: the `local section` inside scrape_page
	-- shadows it.
	local section
	
	-- Reads the wikitext of the page titled `text` and tries to replace `text`
	-- with a reading taken from templates in this language's section.
	-- Returns `text`, changed or not.
	local function scrape_page(text)
		-- NOTE(review): this overwrites the table with `true`; presumably the
		-- intent was to mark this title, e.g. `seen_pages[text] = true` - confirm.
		seen_pages = true
		local content = mw.title.new(toNFC(text)):getContent()
		if content then
			-- This `local` declares a new variable, so the outer `section` stays nil
			-- and get_header1() is called on every invocation.
			local section = section or require("Module:User:Theknightwho/get_header").get_header1()
			-- `i` is incremented once per heading visited; the loop stops at the
			-- first heading that matches the language name.
			-- NOTE(review): the heading pattern looks truncated - confirm.
			local i = 1
			for heading in content:gmatch("(%f(=+)-%S+*%2%f)") do
				i = i + 1
				-- Hyphens in the language name are escaped so they are literal in the pattern.
				if heading:find("==%s*" .. langname:gsub("%-", "%%%-") .. "%s*==") then
					break
				end
			end
			-- loc1/loc2: start and end of the language heading match. loc2 is then
			-- redeclared as the start of the next match of the second pattern found
			-- from there on (nil if there is none).
			-- NOTE(review): both patterns look truncated - confirm.
			local loc1, loc2 = content:find("%f==*" .. langname:gsub("%-", "%%%-") .. "*==()")
			local loc2 = content:find("%f==+==", loc2)
			if loc1 then
				-- Keep only the span from the language heading to loc2 (or to the end
				-- of the page when loc2 is nil).
				content = content:sub(loc1, loc2)
				-- Presumably rebases the header index from get_header1() so that it is
				-- relative to the trimmed content - TODO confirm get_header1 semantics.
				section = section - i + 1
				local findTemplates = require("Module:templateparser").findTemplates
				-- NOTE(review): every entry below has a value but no key, which is not
				-- valid Lua. The keys (presumably template names) are missing from this
				-- copy and cannot be recovered from here.
				local templates = {
					 = true,
					 = true,
					 = true,
					 = true,
					 = true,
					 = true,
					 = true,
					 = true,
					 = true,
				}
				local templates2 = {
					 = true,
					 = true,
				}
				-- Scans the templates in one section and updates the enclosing `text`
				-- with the first reading it finds. If no template supplies one, it falls
				-- back to assembling a reading from a "<lang>-kanjitab" template.
				local function parse_section(section_content)
					local kanjitab, br
					for template, args, _, temp_start in findTemplates(section_content) do
						-- NOTE(review): `templates` / `templates2` are tested as whole
						-- tables (always truthy) and `args` is used as if it were a string;
						-- presumably these were lookups such as `templates[template]` and
						-- an indexed argument - confirm.
						if templates and args then
							text = args:gsub("", "")
							br = true
							break
						elseif templates2 and args then
							text = args:gsub("", "")
							br = true
							break
						elseif (template == "head" or template == "head-lite") and args == lang then
							-- Look for a positional argument equal to "kana".
							-- NOTE(review): `kana = args` assigns the whole argument table;
							-- presumably a neighbouring argument was meant - confirm.
							for i, arg in ipairs(args) do
								if arg == "kana" then
									local kana = args
									if kana then
										text = kana
										br = true
										break
									end
								end
							end
						elseif template == lang .. "-kanjitab" then
							-- Remember the arguments of the first kanjitab template seen.
							kanjitab = kanjitab or args
						end
					end
					-- Fallback: no template supplied a reading, but a kanjitab was found.
					if (not br) and kanjitab then
						require("Module:debug/track"){"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
						-- Note: the caller below discards parse_section's return value, so
						-- this early return only ends the function without changing `text`.
						if kanjitab.sortkey then
							return kanjitab.sortkey
						end
						-- extract kanji and non-kanji
						local kanji = {}
						local non_kanji = {}
						
						-- The callback receives a start position, the matched text and an end
						-- position. Text between matches goes into `non_kanji`, matches into
						-- `kanji`.
						-- NOTE(review): the pattern's middle capture is empty as written;
						-- presumably it held a kanji character class - confirm.
						local kanji_border = 1
						mw.ustring.gsub(text, "()()()", function(p1, w1, p2)
							table.insert(non_kanji, mw.ustring.sub(text, kanji_border, p1 - 1))
							kanji_border = p2
							table.insert(kanji, w1)
						end)
						table.insert(non_kanji, mw.ustring.sub(text, kanji_border))
						-- 々
						-- NOTE(review): `kanji = kanji` is a no-op as written; presumably the
						-- iteration mark was meant to copy the preceding entry - confirm.
						for i, v in ipairs(kanji) do
							if v == "々" then kanji = kanji end
						end
						-- process readings
						local readings = {}
						local readings_actual = {}
						local reading_length_total = 0
						for i in ipairs(kanjitab) do
							-- Split an argument into a kana part and a numeric length part;
							-- the length defaults to 1 when there is no kana part.
							-- NOTE(review): `kanjitab` is passed whole and the capture classes
							-- are missing; presumably an indexed argument and two character
							-- classes were meant - confirm.
							local reading_kana, reading_length
							_, _, reading_kana, reading_length = mw.ustring.find(kanjitab or "", "^(*)(*)$")
							reading_kana = reading_kana ~= "" and reading_kana or nil
							reading_length = reading_kana and tonumber(reading_length) or 1
	
							table.insert(readings, {reading_kana, reading_length})
							reading_length_total = reading_length_total + reading_length
							-- Pad `readings` with empty single-length entries for kanji positions
							-- beyond the running total.
							for i = reading_length_total + 1, #kanji do
								table.insert(readings, {nil, 1})
							end
							-- NOTE(review): `readings_actual` is overwritten on every pass rather
							-- than indexed, and `actual_reading` / `okurigana` are both the whole
							-- `kanjitab` table; presumably indexed accesses were meant - confirm.
							if reading_kana then
								local actual_reading = kanjitab
								local okurigana = kanjitab
								readings_actual = {(actual_reading or reading_kana) .. (okurigana or ""), reading_length}
							else
								readings_actual = {nil, 1}
							end
						end
						-- Assemble the sort key from the readings and the non-kanji pieces.
						-- NOTE(review): `v` and `non_kanji` are used whole where indexed
						-- elements would be expected - confirm.
						local sortkey = {non_kanji}
						local id = 1
						for _, v in ipairs(readings_actual) do
							id = id + v
							v = v ~= "-" and v
							table.insert(sortkey, (v or "") .. (non_kanji or ""))
						end
						sortkey = table.concat(sortkey)
						-- Only replace `text` when something was actually assembled.
						if sortkey ~= "" then
							text = sortkey
						end
					end
				end
				-- Record the start position of each heading match in the trimmed content.
				-- NOTE(review): `sections = pos` replaces the table on each pass;
				-- presumably `sections[i] = pos` was meant - confirm.
				local sections, i = {}, 0
				for pos in content:gmatch("()%f(=+)-%S+*%2%f") do
					i = i + 1
					sections = pos
				end
				-- Walk from index `section` down to 1, parsing each section and stopping
				-- as soon as `text` no longer matches the pattern in the `find` call.
				-- NOTE(review): that pattern is empty and the `sub` bounds are
				-- un-indexed - confirm.
				for i = section, 1, -1 do
					local section_content = content:sub(sections, sections)
					parse_section(section_content)
					if not find(text, "") then
						break
					end
				end
			end
		end
		return text
	end
	
	-- NOTE(review): `seen_pages` starts out as a table, which is truthy, so
	-- `not seen_pages` is false and this loop body never runs as written. The
	-- condition was presumably `not seen_pages[text]`, and the empty `find`
	-- pattern presumably a kanji character class - confirm.
	while lang ~= "mul" and (not seen_pages) and find(text, "") do
		text = scrape_page(text)
	end
	
	return export.sortkey_from_string(text, lang, sc)
end

return export
-- Module:User:Theknightwho/Jpan-sortkey
--
-- Wikious
--
-- Boobota
--
-- Sagapedia