Module:ja-parse

Hello, you have come here looking for the meaning of the word Module:ja-parse. In DICTIOUS you will not only get to know all the dictionary meanings for the word Module:ja-parse, but we will also tell you about its etymology, its characteristics and you will know how to say Module:ja-parse in singular and plural. Everything you need to know about the word Module:ja-parse you have here. The definition of the word Module:ja-parse will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of Module:ja-parse, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.

Auxiliary functions to parse the source of Japanese entries. Currently only used by Module:ja-see.


-- Table of public functions; it is returned at the end of the module.
local export = {}

-- Local shorthands for the mw.ustring string functions.
-- NOTE(review): `len` and `sub` are not referenced in the visible code.
local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch

local m_ja = require('Module:ja')

-- Auxiliary functions

-- Character ranges for kanji and kana. These look like the *contents* of a
-- pattern character class rather than complete patterns -- presumably they
-- are meant to be wrapped in [...] by the caller; confirm at the use sites.
local kanji_pattern = "一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𮯯𰀀-𱍏"
local kana_pattern = 'ぁ-ゖァ-ヺー'
-- Kana and kanji ranges plus ASCII letters, ASCII digits, 〆 and 々.
local japanese_pattern = kana_pattern .. kanji_pattern .. 'a-zA-Z0-9〆々'

-- Set of headword-line template names (template name -> true).
-- The keys mirror, one for one, the seven templates that
-- find_headword_template searches for.
local headword_templates = {
	['ja-noun'] = true, ['ja-adj'] = true, ['ja-pos'] = true, ['ja-phrase'] = true,
	['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true,
}

-- Lua patterns for the supported headword template names, in lookup
-- priority order ("-" is escaped because it is a pattern quantifier).
local headword_name_patterns = {
	'ja%-noun', 'ja%-adj', 'ja%-pos', 'ja%-phrase',
	'ja%-verb', 'ja%-verb form', 'ja%-verb%-suru',
}

--- Find a headword template invocation in a piece of wikitext.
-- Template kinds are tried in the priority order above; the first kind that
-- occurs anywhere in the text wins, regardless of its position.
-- @param wikitext string to search
-- @return the full "{{...}}" invocation, or nil when no supported template
--         is present (or its braces are unbalanced)
local function find_headword_template(wikitext)
	for _, name in ipairs(headword_name_patterns) do
		-- The name must be followed by "|" or "}" (optionally after
		-- whitespace). Without this terminator "{{ja-verb" would also match
		-- "{{ja-verb-suru" and any unrelated template sharing the prefix.
		local index = wikitext:find('{{' .. name .. '%s*[|}]')
		if index then
			-- This assumes that the template has matching braces.
			return wikitext:match('%b{}', index)
		end
	end
end

--- Split a template invocation into its name and arguments.
-- Only supports the simplest format: nested templates are not handled.
-- @param wikitext string of the form "{{name|arg|key=value|...}}"
-- @return name  the text before the first "|"
-- @return args  table holding positional arguments at 1..n and named
--               arguments under their (string) key
local function parse_template(wikitext)
	-- Shield the pipe inside piped wikilinks ([[target|label]]) with a
	-- backtick so that splitting on "|" does not cut the link in half.
	local template = wikitext:gsub('%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')
	-- Drop the enclosing braces.
	local inner = template:gsub('^{{', ''):gsub('}}$', '')

	local name
	local args = {}
	for glob in mw.text.gsplit(inner, '|', true) do
		if not name then
			name = glob
		else
			-- Restore the pipes that were shielded above.
			glob = glob:gsub('`', '|')
			local key, value = match(glob, "(.-)=(.*)")
			if key then
				args[key] = value
			else
				table.insert(args, glob)
			end
		end
	end
	return name, args
end

--- Report whether a sequence holds a given value.
-- @param list  array-like table
-- @param item  value to look for (compared with ==)
-- @return true if some element of `list` equals `item`, false otherwise
local function contains(list, item)
	for i = 1, #list do
		-- Compare the element, not the table itself.
		if list[i] == item then return true end
	end
	return false
end

-- Part I: functions to parse entries into words

--- Parse the Japanese section of a page into its words.
-- With numbered "Etymology N" sections each of them is one word; otherwise
-- the whole section (minus any "Kanji" sections) is a single word.
-- @param page_title title of the page to read
-- @return list of entries; each entry holds the word's wikitext at [1] and
--         the fields `type` ('', 'redirect' or 'lemma'), `kana_spellings`,
--         `kanji_spellings` and `historical_spellings` (lists of strings)
function export.words(page_title)
	-- mw.title.new returns nil for an invalid title; treat it like a missing page.
	local title = mw.title.new(page_title)
	local page = title and title:getContent() or ''
	-- The Japanese L2 section runs up to the next L2 heading, or to the end of the page.
	local l2 = match(page, '==Japanese==\n(.-)\n==[^=\n]+==\n') or match(page, '==Japanese==\n(.*)') or ''

	-- split into L3 sections
	local l3_sections = {}
	local multi_etym = false

	-- special hack for kanji entries: give the part that starts at
	-- {{ja-kanjitab}} its own (blank-titled) L3 section so that it is not
	-- discarded together with the Kanji section
	if not find(l2, '===Etymology 1===') and (find(l2, '===Kanji===') or find(l2, '===Kanji %d+===')) then
		l2 = gsub(l2, '{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
	end

	local current_l3_title = ''
	local current_l3_content = {}
	for line in l2:gmatch('[^\n]+') do
		-- Exactly three "=": deeper headings stay inside the current L3 section.
		if find(line, '^===[^=]') then
			table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
			current_l3_title = match(line, '^===([^=]+)')
			if current_l3_title == 'Etymology 1' then multi_etym = true end
			current_l3_content = {}
		end
		table.insert(current_l3_content, line)
	end
	table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })

	-- group the L3 sections into words
	local words = {}
	if multi_etym then
		for _, section in ipairs(l3_sections) do
			local header, content = section[1], section[2]
			if find(header, '^Etymology %d+$') then
				table.insert(words, content)
			end
		end
	else
		local parts = {}
		for _, section in ipairs(l3_sections) do
			local header, content = section[1], section[2]
			if not (header == 'Kanji' or find(header, '^Kanji %d+$')) then
				table.insert(parts, content)
			end
		end
		table.insert(words, table.concat(parts, '\n'))
	end

	local result = {}

	-- Append `item` to `list` unless it is nil or already present.
	local function add(list, item)
		if item ~= nil and not contains(list, item) then table.insert(list, item) end
	end

	-- File a spelling under kanji or kana according to the script reported by Module:ja.
	local function insert_spelling(entry, spelling)
		if spelling then
			if find(m_ja.script(spelling), 'Hani') then
				add(entry.kanji_spellings, spelling)
			else
				add(entry.kana_spellings, spelling)
			end
		end
	end

	for _, word in ipairs(words) do
		local entry = {
			word,
			type = '',
			kana_spellings = {},
			kanji_spellings = {},
			historical_spellings = {},
		}
		insert_spelling(entry, page_title)

		local ja_see = find(word, '{{ja%-see[|}]') or find(word, '{{ja%-see%-kango[|}]')
		if ja_see then
			entry.type = 'redirect'
			local invocation = match(word, '.-}}', ja_see)
			if invocation then
				-- The first piece is the template name, not a link target.
				local is_template_name = true
				for piece in gmatch(invocation, '[^|{}]+') do
					if is_template_name then
						is_template_name = false
					else
						insert_spelling(entry, piece)
					end
				end
			end
		else
			local ja_kanjitab = word:find('{{ja%-kanjitab')
			if ja_kanjitab then
				entry.type = 'lemma'
				-- nil when the braces are unbalanced; skip the arguments then
				local kanjitab = word:match('%b{}', ja_kanjitab)
				if kanjitab then
					local _, args = parse_template(kanjitab)
					local alt = args.alt
					if alt and alt ~= "" and alt ~= "-" then
						for alt_spelling in mw.text.gsplit(alt, ',') do
							-- drop the ":qualifier" suffix of an alternative spelling
							insert_spelling(entry, (alt_spelling:gsub(':.+', '')))
						end
					end
				end
			end

			local headword_template = find_headword_template(word)
			if headword_template then
				entry.type = 'lemma'
				local _, args = parse_template(headword_template)
				for i = 1, #args do
					-- only positional arguments containing Japanese text are spellings
					if find(args[i], '[' .. japanese_pattern .. ']') then
						insert_spelling(entry, m_ja.remove_ruby_markup(args[i]))
					end
				end
				add(entry.historical_spellings, args.hhira)
				add(entry.historical_spellings, args.hkata)
			end
		end
		table.insert(result, entry)
	end
	return result
end

-- Part II: functions to extract definitions and categories from a word

--- Extract the definitions and categories of a word for a non-lemma entry.
-- @param wikitext  wikitext of the word (one item produced by export.words)
-- @param lemma     title of the lemma entry
-- @param nonlemma  title of the non-lemma entry the result is shown on
-- @param frame     frame object, used to preprocess the category wikitext
-- @param reading   value passed to makeSortKey to build the category sort key
-- @return def  list of { line, pos = section title } definition lines
-- @return cat  string of concatenated category links with sort keys
function export.parse_word(wikitext, lemma, nonlemma, frame, reading)
	local def = {}
	local other_lines = {}
	local current_section = ''

	-- Loop-invariant: does the non-lemma title contain kanji?
	local nonlemma_is_kanji = find(nonlemma, '[' .. kanji_pattern .. ']') ~= nil

	-- True when the line lists `nonlemma` as a complete template argument.
	-- Plain finds are used so that magic characters in the title are harmless.
	local function lists_nonlemma(line)
		return line:find('|' .. nonlemma .. '|', 1, true) ~= nil
			or line:find('|' .. nonlemma .. '}', 1, true) ~= nil
	end

	for line in wikitext:gmatch('[^\n]+') do
		if line:find('^#+ ') then
			-- A definition is hidden when the nonlemma entry is a kanji spelling and
			-- it is not listed in {{ja-def}} or the lemma entry has <!-- kana only -->
			local hidden = nonlemma_is_kanji and (
				(line:find('{{ja%-def|') and not lists_nonlemma(line))
				or line:find('<!%-%- kana only %-%->')
			)
			if not line:find('{{rfdef') and not hidden then
				table.insert(def, { (line:gsub("<ref.-</ref>", "")), pos = current_section })
			end
		elseif line:find('^===') then
			current_section = (line:gsub("^=*(.-)=*$", "%1"))
		else
			table.insert(other_lines, line)
		end
	end

	-- expand the other parts for categories
	local cat = table.concat(other_lines, '\n')
	cat = gsub(cat, '<ref', '')

	-- {{ja-pron}} is turned straight into categories. This has to happen
	-- before the generic rewrite below, which would otherwise neutralise it.
	-- NOTE(review): the category names in this function were lost in
	-- extraction and are reconstructed; confirm them against the live module.
	cat = gsub(cat, '{{ja%-pron.-}}', function(pron)
		local result = ''
		if not find(pron, '|noipa=') then
			result = result .. '[[Category:Japanese terms with IPA pronunciation]]'
		end
		if find(pron, '|a=') or find(pron, '|audio=') then
			result = result .. '[[Category:Japanese terms with audio links]]'
		end
		return result
	end)

	local templates_to_include = {
		-- Categories generated by these templates are copied.
		-- It is currently empty here.
		-- ['template name'] = true,
	}
	-- if the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
	local function process_template_header(a, b)
		if headword_templates[a] then
			-- kana lemmas are passed on to the headword template as hira=
			local source_script = m_ja.script(lemma)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				return '{{' .. a .. '|hira=' .. lemma .. b
			else
				return '{{' .. a .. b
			end
		elseif a:find('^R%:') then
			-- the template name is replaced by "=", keeping its arguments
			return '{{=' .. b
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			-- NOTE(review): reconstructed category name; confirm.
			return '[[Category:Japanese terms with usage examples]]{{=' .. b
		elseif not templates_to_include[a] then
			return '{{=' .. b
		else
			return '{{' .. a .. b
		end
	end
	cat = gsub(cat, '{{([^|}\n]+)\n?([|}])', process_template_header)
	cat = frame:preprocess(cat)

	local links = {}
	local sort_suffix -- built on first use so nothing is loaded without categories
	for link in gmatch(cat, '%[%[Category:.-%]%]') do
		-- drop any sort key the expansion produced
		link = gsub(link, '|.*', ']]')
		-- NOTE(review): reconstructed category names; confirm.
		if link == '[[Category:Japanese lemmas]]' then
			link = '[[Category:Japanese non-lemma forms]]'
		end
		if not sort_suffix then
			sort_suffix = '|' .. (require("Module:languages").getByCode("ja"):makeSortKey(reading)) .. ']]'
		end
		-- `link` always ends in "]]"; swap that for "|sortkey]]" without
		-- gsub so that a "%" in the sort key cannot be misread.
		table.insert(links, link:sub(1, -3) .. sort_suffix)
	end
	cat = table.concat(links)
	-- one might want to modify the sortkeys here

	return def, cat
end

return export