-- Table that collects this module's public functions; it is returned at the end of the file.
local export = {}
-- Characters treated as kanji, written as ranges (e.g. 一-鿿) and individually
-- listed characters, i.e. in the form used inside a Lua pattern character set.
-- NOTE(review): the string ends with a dangling '-' after 𰀀, so the last range
-- seems to have lost its upper bound (and 𪜀-𰀀 may originally have been two
-- separate ranges) — confirm the intended code point ranges.
local kanji_pattern = "々一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-"
-- Returns an array of every possible alignment between the kanji runs in
-- `kanji` and the kana string `kana`, written as `[kanji](reading)` markup.
-- For example, simple_match('物の哀れ', 'もののあわれ') returns
-- { '[物](も)の[哀](のあわ)れ', '[物](もの)の[哀](あわ)れ' }.
-- Returns an empty array when `kana` cannot be aligned with `kanji` at all.
local function simple_match(kanji, kana)
    -- Wrap every maximal run of kanji in backticks so the recursive helper can
    -- peel off "literal text, kanji run, rest" one step at a time.
    -- The outer parentheses discard gsub's second result (the match count).
    local kanji_segments = (mw.ustring.gsub(kanji, '([' .. kanji_pattern .. ']+)', '`%1`'))

    local function simple_match_rec(segments, remaining_kana)
        if not segments:find('`', 1, true) then
            -- No kanji run left: the tail has to match the kana literally.
            if segments == remaining_kana then
                return { remaining_kana }
            end
            return {}
        end

        local literal, kanji_portion, rest = mw.ustring.match(segments, '(.-)`(.-)`(.*)')

        -- The text in front of the kanji run must be a literal prefix of the
        -- kana. Compare bytes directly instead of building a pattern from
        -- `literal`, so magic characters in the title cannot break the match
        -- (a byte prefix test is exact for valid UTF-8).
        if remaining_kana:sub(1, #literal) ~= literal then
            return {}
        end
        remaining_kana = remaining_kana:sub(#literal + 1)

        -- Try every non-empty prefix of the remaining kana as the reading of
        -- this kanji run and keep the splits for which the rest also aligns.
        local candidates = {}
        for i = 1, mw.ustring.len(remaining_kana) do
            local head = literal .. '[' .. kanji_portion .. '](' .. mw.ustring.sub(remaining_kana, 1, i) .. ')'
            for _, candidate in ipairs(simple_match_rec(rest, mw.ustring.sub(remaining_kana, i + 1))) do
                candidates[#candidates + 1] = head .. candidate
            end
        end
        return candidates
    end

    return simple_match_rec(kanji_segments, kana)
end
-- Public wrapper around simple_match that always returns a single string.
-- When the alignment is unambiguous (exactly one candidate) that candidate is
-- returned; otherwise the whole of `kanji` is annotated with the whole of
-- `kana`, i.e. '[kanji](kana)'.
function export.simple_match(kanji, kana)
    local simple_results = simple_match(kanji, kana)
    if #simple_results == 1 then
        -- Return the candidate itself, not the array that holds it.
        return simple_results[1]
    end
    return '[' .. kanji .. '](' .. kana .. ')'
end
-- Reads the wikitext of the entry `entry_title` and returns an array with one
-- `[kanji](reading)` string per {{ja-kanjitab}} call found in it.
-- For example, extract_kanjitab_from_entry('書留') returns { '[書](かき)[留](とめ)' }
-- if the 書留 entry contains {{ja-kanjitab|か|と|o1=き|o2=め|yomi=k}}.
-- Returns an empty array when the title is invalid, the page has no content,
-- or the page contains no kanjitab.
local function extract_kanjitab_from_entry(entry_title)
    -- mw.title.new yields nil for an invalid title and getContent yields nil
    -- for a missing page; both cases degrade to "no wikitext".
    local title = mw.title.new(entry_title)
    local entry_wikicode = title and title:getContent() or ''
    local results = {}

    for kanjitab in mw.ustring.gmatch(entry_wikicode, '{{ja%-kanjitab|(.-)}}') do
        -- Drop the pipe inside piped links ([[target|text]]) so that it is not
        -- mistaken for a template argument separator below.
        -- NOTE(review): keeping the link target rather than the display text
        -- is an assumption — confirm which half callers expect.
        kanjitab = mw.ustring.gsub(kanjitab, '%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1]]')

        -- Parse the template arguments: positional ones are numbered from 1,
        -- named ones keep their name, with bare `k` / `o` meaning `k1` / `o1`.
        local args, counter = {}, 1
        for arg in mw.text.gsplit(kanjitab, '|') do
            local k, v = mw.ustring.match(arg, '^(.-)=(.*)$')
            if k then
                k = mw.text.trim(k)
                k = ({ k = 'k1', o = 'o1' })[k] or tonumber(k) or k
                args[k] = mw.text.trim(v)
            else
                args[counter] = mw.text.trim(arg)
                counter = counter + 1
            end
        end

        -- Walk over the kanji of the title and attach one reading to each.
        -- A positional argument such as 'きょう2' covers two kanji: the kanji
        -- after the first are tagged with <CONCAT> and merged afterwards.
        local argpos, skip = 1, 0
        local result = mw.ustring.gsub(entry_title, '[' .. kanji_pattern .. ']', function(kanji)
            if skip > 0 then
                skip = skip - 1
                return '<CONCAT>' .. kanji
            end

            local reading_kana, reading_length = '', nil
            local positional = args[argpos]
            if positional then
                local kana_part, length_part = mw.ustring.match(positional, '^([^0-9]*)([0-9]*)$')
                -- Keep the empty default when the argument has an unexpected
                -- shape, so the concatenations below never see nil.
                if kana_part then
                    reading_kana, reading_length = kana_part, length_part
                end
            end
            -- k<N> replaces the reading, o<N> appends trailing kana to it.
            local override, trailing = args['k' .. argpos], args['o' .. argpos]
            if override then reading_kana = override end
            if trailing then reading_kana = reading_kana .. trailing end

            -- A missing or empty length suffix means the reading covers one kanji.
            skip = (tonumber(reading_length) or 1) - 1
            argpos = argpos + 1
            return '[' .. kanji .. '](' .. reading_kana .. ')'
        end)

        -- Fold each '<CONCAT>' kanji into the preceding [kanji](reading) group.
        -- One kanji per group is merged per pass; stop once a pass changes
        -- nothing (every successful pass removes at least one marker).
        local merged
        repeat
            result, merged = mw.ustring.gsub(result, '%[([^%[%]]+)%]%(([^%(%)]+)%)<CONCAT>(.)', '[%1%3](%2)')
        until merged == 0

        results[#results + 1] = result
    end

    return results
end
-- Returns a single `[kanji](reading)` string for `kanji` read as `kana`.
-- Try simple match first. If the result is not accurate, that is,
-- if there are zero results, or more than one result, or the result contains
-- consecutive kanji like [書留](かきとめ), then read the entry's wikitext and
-- look for a kanjitab whose readings add up to `kana`.
-- If nothing fits, the whole of `kanji` is annotated with the whole of `kana`.
function export.accurate_match(kanji, kana)
    local simple_results = simple_match(kanji, kana)
    local consecutive_kanji = '[' .. kanji_pattern .. '][' .. kanji_pattern .. ']'

    if #simple_results == 1 and not mw.ustring.find(simple_results[1], consecutive_kanji) then
        return simple_results[1]
    end

    for _, result in ipairs(extract_kanjitab_from_entry(kanji)) do
        -- Reduce every [kanji](reading) group to its reading; the kanjitab is
        -- accepted only when that reproduces the requested kana exactly.
        -- The parentheses keep only gsub's first result (the string).
        local reading = (mw.ustring.gsub(result, '%[([^%[%]]+)%]%(([^%(%)]+)%)', '%2'))
        if reading == kana then
            return result
        end
    end

    -- if all fails
    return '[' .. kanji .. '](' .. kana .. ')'
end
-- Hand the table holding simple_match and accurate_match to whoever loads this module.
return export