-- This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.
-- Module table; functions are attached to it below and it is returned at the end of the file.
local export = {}

-- Character-range data for Japanese text. Loaded once and shared by the aliases below
-- rather than calling mw.loadData separately for each field.
local range = mw.loadData("Module:ja/data/range")
local kanji_pattern = range.kanji
local ideograph_pattern = range.ideograph
local kana_graph_pattern = range.kana_graph
local latin_pattern = range.latin

-- Local aliases for the Unicode-aware string functions used throughout the module.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char

-- Byte-level Lua pattern matching exactly one UTF-8 encoded character: a lead byte
-- (NUL/ASCII or 0xC2-0xF4) followed by any number of continuation bytes (0x80-0xBF).
-- It is spliced into byte-oriented string.gsub patterns, so it has to describe a whole
-- character; the previous value "*" only ever matched a literal asterisk.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
--- Build a sort key from a string that is already a reading (kana).
-- Steps: convert katakana to hiragana and decompose to NFD; move a leading
-- dakuten/handakuten to trailing apostrophe(s); expand the long vowel mark;
-- then hand the result to Module:Hani-sortkey.
-- @param text string to convert
-- @param lang language code; forwarded to Module:Hani-sortkey and appended to the tracking key
-- @param sc forwarded unchanged to Module:Hani-sortkey
-- @return the value returned by Module:Hani-sortkey's makeSortKey
function export.sortkey_from_string(text, lang, sc)
	-- NFD splits voiced kana into base kana + combining mark (U+3099 / U+309A),
	-- which the substitutions below rely on.
	text = toNFD(require("Module:ja").kata_to_hira(text))

	-- If the first character has dakuten, replace it with the corresponding character without dakuten and add an apostrophe to the end, e.g. がす > かす'
	-- (Parentheses drop gsub's second return value, the substitution count.)
	text = (text:gsub("^(" .. UTF8_char .. ")" .. u(0x3099) .. "(.*)", "%1%2'"))
	-- Similar thing, but with handakuten and two apostrophes, e.g. ぱす -> はす''
	text = (text:gsub("^(" .. UTF8_char .. ")" .. u(0x309A) .. "(.*)", "%1%2''"))

	-- Replace the long vowel mark with the vowel that it stands for
	if text:match("ー") then
		-- Each row lists the kana that end in the vowel at the same index of `vowels`.
		local rows = {
			"あぁかさたなはまやゃらわ",
			"いぃきしちにひみり",
			"うぅくすつぬふむゆゅる",
			"えぇけせてねへめれ",
			"おぉこそとのほもよょろ",
			"ん"
		}
		local vowels = {"あ", "い", "う", "え", "お", "ん"}
		-- Combining dakuten and handakuten, which may sit between the kana and "ー" after NFD.
		local marks = u(0x3099) .. u(0x309A)

		for i, row in ipairs(rows) do
			-- Match a kana from this row (plus an optional combining mark) followed by "ー",
			-- and replace the "ー" with this row's vowel. Previously the pattern had no
			-- character class and the replacement concatenated the whole `to` table,
			-- which raises "attempt to concatenate a table value".
			text = gsub(text, "([" .. row .. "][" .. marks .. "]?)ー", "%1" .. vowels[i])
		end
	end

	-- NOTE(review): the pattern here is empty, which looks like a character class that was
	-- lost from the source. The intended set cannot be determined from this file, so the
	-- line is left unchanged — confirm against the upstream module.
	text = gsub(text, "", " ")

	local ret = require("Module:Hani-sortkey").makeSortKey(text, lang, sc)
	-- Track the cases where Module:Hani-sortkey changed the text.
	if ret ~= text then
		require("Module:debug/track"){"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end
	return ret
end
--- Build a sort key for `text`.
-- While the loop condition at the bottom holds, `text` is replaced by whatever
-- scrape_page derives from the wikitext of the page titled `text`; the final
-- string is then passed to export.sortkey_from_string.
-- NOTE(review): many patterns, table keys and index expressions in this function
-- appear to have lost their bracketed parts (empty patterns such as "", keyless
-- entries such as `= true`, whole tables used where one element was presumably
-- indexed). As shown the function is not valid Lua; the comments below describe
-- only what is visible and mark the suspicious lines. Restore from the upstream
-- revision before relying on any of it.
-- @param text page title / term to build the key for
-- @param lang language code; passed to Module:languages getByCode and compared with "mul"
-- @param sc forwarded unchanged to export.sortkey_from_string
-- @return the result of export.sortkey_from_string(text, lang, sc)
function export.makeSortKey(text, lang, sc)
-- Canonical language name, used below to find that language's heading in the page source.
local langname = require("Module:languages").getByCode(lang):getCanonicalName()
-- Starts as a table but is overwritten with `true` in scrape_page.
-- NOTE(review): presumably meant to be a set keyed by page title (seen_pages[text] = true) — confirm.
local seen_pages = {}
local section
-- Reads the wikitext of the page titled `text` and tries to derive a replacement
-- string from templates in the `langname` section. Returns the (possibly updated) text.
local function scrape_page(text)
-- NOTE(review): replaces the whole table with `true`, so the outer while loop can run at most once.
seen_pages = true
-- Page titles are looked up in NFC form.
local content = mw.title.new(toNFC(text)):getContent()
if content then
-- Shadows the outer `section`, so the outer variable is never assigned here.
-- NOTE(review): get_header1 is an opaque helper; the arithmetic and loop below treat
-- its result as a numeric heading index — confirm.
local section = section or require("Module:User:Theknightwho/get_header").get_header1()
-- Count headings until the one naming `langname` is reached.
-- NOTE(review): the heading pattern looks truncated (frontier sets missing).
local i = 1
for heading in content:gmatch("(%f(=+)-%S+*%2%f)") do
i = i + 1
-- langname:gsub returns two values; both are concatenated/passed as written.
if heading:find("==%s*" .. langname:gsub("%-", "%%%-") .. "%s*==") then
break
end
end
-- Locate the start of the language heading (loc1) and the position just after it (loc2).
-- NOTE(review): pattern looks truncated.
local loc1, loc2 = content:find("%f==*" .. langname:gsub("%-", "%%%-") .. "*==()")
-- Redeclares loc2 as the start of the next match of the heading pattern after the
-- language heading; nil if there is none.
local loc2 = content:find("%f==+==", loc2)
if loc1 then
-- Restrict `content` to the span from loc1 to loc2, and rebase the heading index
-- by the number of headings counted above.
content = content:sub(loc1, loc2)
section = section - i + 1
local findTemplates = require("Module:templateparser").findTemplates
-- NOTE(review): the keys of both tables are missing (`= true` is not valid Lua).
-- Presumably each key was a template name in square brackets — restore from upstream.
local templates = {
= true,
= true,
= true,
= true,
= true,
= true,
= true,
= true,
= true,
}
local templates2 = {
= true,
= true,
}
-- Scans one section's templates and may overwrite the enclosing `text`.
-- Note that the value returned at the `kanjitab.sortkey` branch is not used by the caller below.
local function parse_section(section_content)
local kanjitab, br
for template, args, _, temp_start in findTemplates(section_content) do
-- NOTE(review): `templates`/`templates2` are tested as whole tables (always truthy) and
-- `args` is used as a string; presumably these were lookups like templates[template]
-- and a specific args[...] entry — confirm. The gsub patterns are empty.
if templates and args then
text = args:gsub("", "")
br = true
break
elseif templates2 and args then
text = args:gsub("", "")
br = true
break
-- Generic headword templates: look for a positional argument equal to "kana".
-- NOTE(review): `args == lang` compares the whole args table with the language code,
-- and `local kana = args` takes the whole table; indices appear to be missing.
elseif (template == "head" or template == "head-lite") and args == lang then
for i, arg in ipairs(args) do
if arg == "kana" then
local kana = args
if kana then
text = kana
br = true
-- This break only leaves the inner ipairs loop, not the template loop.
break
end
end
end
-- Remember the arguments of the first "<lang>-kanjitab" template seen.
elseif template == lang .. "-kanjitab" then
kanjitab = kanjitab or args
end
end
-- Fallback: no template set `br`, but a kanjitab was found.
if (not br) and kanjitab then
require("Module:debug/track"){"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
-- An explicit sortkey argument on the kanjitab is returned as-is.
if kanjitab.sortkey then
return kanjitab.sortkey
end
-- extract kanji and non-kanji
-- `kanji` collects the captured characters and `non_kanji` the text between them;
-- `kanji_border` tracks where the next in-between run starts.
-- NOTE(review): the pattern "()()()" has lost its middle character class.
local kanji = {}
local non_kanji = {}
local kanji_border = 1
mw.ustring.gsub(text, "()()()", function(p1, w1, p2)
table.insert(non_kanji, mw.ustring.sub(text, kanji_border, p1 - 1))
kanji_border = p2
table.insert(kanji, w1)
end)
table.insert(non_kanji, mw.ustring.sub(text, kanji_border))
-- 々
-- NOTE(review): `kanji = kanji` is a no-op as written; presumably it copied the
-- preceding entry over the iteration mark (kanji[i] = kanji[i - 1]) — confirm.
for i, v in ipairs(kanji) do
if v == "々" then kanji = kanji end
end
-- process readings
-- One {reading, length} pair is built per positional kanjitab argument.
-- NOTE(review): `kanjitab` is used where an indexed argument is presumably meant, the
-- find pattern has lost its character classes, and `readings_actual` is reassigned
-- rather than appended to — confirm all indices against upstream.
local readings = {}
local readings_actual = {}
local reading_length_total = 0
for i in ipairs(kanjitab) do
local reading_kana, reading_length
_, _, reading_kana, reading_length = mw.ustring.find(kanjitab or "", "^(*)(*)$")
-- An empty reading is treated as absent; the length defaults to 1.
reading_kana = reading_kana ~= "" and reading_kana or nil
reading_length = reading_kana and tonumber(reading_length) or 1
table.insert(readings, {reading_kana, reading_length})
reading_length_total = reading_length_total + reading_length
-- Pads `readings` with empty one-character entries up to #kanji.
-- NOTE(review): this runs inside the per-argument loop; confirm that is intended.
for i = reading_length_total + 1, #kanji do
table.insert(readings, {nil, 1})
end
if reading_kana then
local actual_reading = kanjitab
local okurigana = kanjitab
readings_actual = {(actual_reading or reading_kana) .. (okurigana or ""), reading_length}
else
readings_actual = {nil, 1}
end
end
-- Assemble the sort key from the collected pieces.
-- NOTE(review): `{non_kanji}` nests the table, `id + v` adds an entry rather than a
-- length, and `non_kanji` is concatenated whole; indices appear to be missing.
local sortkey = {non_kanji}
local id = 1
for _, v in ipairs(readings_actual) do
id = id + v
-- A "-" value is treated as no reading.
v = v ~= "-" and v
table.insert(sortkey, (v or "") .. (non_kanji or ""))
end
sortkey = table.concat(sortkey)
-- Only replace `text` when something was produced.
if sortkey ~= "" then
text = sortkey
end
end
end
-- Record the start position of every heading in the trimmed content.
-- NOTE(review): `sections = pos` overwrites the table with a number; presumably sections[i] = pos.
local sections, i = {}, 0
for pos in content:gmatch("()%f(=+)-%S+*%2%f") do
i = i + 1
sections = pos
end
-- Walk from index `section` down to 1, parsing each section until the condition
-- below no longer matches `text`.
-- NOTE(review): content:sub(sections, sections) has presumably lost its indices, and
-- the find pattern is empty — confirm.
for i = section, 1, -1 do
local section_content = content:sub(sections, sections)
parse_section(section_content)
if not find(text, "") then
break
end
end
end
end
return text
end
-- Skipped entirely for lang "mul"; otherwise repeats while no page has been marked
-- seen and the pattern matches `text`.
-- NOTE(review): the find pattern is empty, and `seen_pages` starts as a table (truthy),
-- so `not seen_pages` is false on entry and the loop body never runs as written.
while lang ~= "mul" and (not seen_pages) and find(text, "") do
text = scrape_page(text)
end
return export.sortkey_from_string(text, lang, sc)
end
-- Return the module table so callers can use the functions attached to `export` above.
return export