Module:okm-translit

The following documentation is generated by Module:documentation/functions/translit.
Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate Middle Korean language text. It is also used to transliterate Early Modern Korean. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:okm-translit/testcases.
Functions

tr(text, lang, sc): Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns nil.
-- Table returned at the end of the module; `tr` is attached to it.
local export = {}
-- Shorthand for the Unicode-aware substitution function.
local gsub = mw.ustring.gsub

-- Character sets of the Han and Hangul scripts, as reported by Module:scripts.
local scripts = require('Module:scripts')
local chars_Hani = scripts.getByCode('Hani'):getCharacters()
local chars_Hang = scripts.getByCode('Hang'):getCharacters()

-- https://github.com/szc126/rime-slg-korean/blob/main/slg_break_jamo.yaml
-- https://github.com/szc126/rime-slg-korean/blob/main/soolegi_yethangeul.custom.yaml
-- Replacement table applied one character at a time by export.tr
-- (`mw.ustring.gsub(text, '.', tt_complex)`): each value is the sequence of
-- jamo that a single input character is broken into. '＠' is a placeholder
-- used inside vowel sequences; the rule table `tt` later maps it to "y".
-- NOTE(review): every entry below is written as `='…'` with no `[key]` in
-- front of it, which is not valid Lua; the keys look like they were lost when
-- this copy was made. Restore them from the upstream module before use.
local tt_complex = {
='ᄇᄉᄀ',
='ᄇᄉᄃ',
='ᄇᄉᄇ',
='ᄇᄉᄉ',
='ᄇᄉᄌ',
='ᄉᄇᄀ',
='ᄉᄉᄉ',
='ᄅᄀᄀ',
='ᄅᄃᄃ',
='ᄅᄇᄇ',
='ᄇᄉᄐ',
='ᄉᄉᄇ',
='ᄌᄌᄒ',
='ᄀᄀ',
='ᄃᄃ',
='ᄇᄇ',
='ᄉᄉ',
='ᄌᄌ',
='ᄂᄀ',
='ᄂᄂ',
='ᄂᄃ',
='ᄂᄇ',
='ᄃᄀ',
='ᄅᄂ',
='ᄅᄅ',
='ᄅᄒ',
='ᄆᄇ',
='ᄇᄀ',
='ᄇᄂ',
='ᄇᄃ',
='ᄇᄉ',
='ᄇᄌ',
='ᄇᄎ',
='ᄇᄐ',
='ᄇᄑ',
='ᄫᄫ',
='ᄉᄀ',
='ᄉᄂ',
='ᄉᄃ',
='ᄉᄅ',
='ᄉᄆ',
='ᄉᄇ',
='ᄉᄋ',
='ᄉᄌ',
='ᄉᄎ',
='ᄉᄏ',
='ᄉᄐ',
='ᄉᄑ',
='ᄉᄒ',
='ᄼᄼ',
='ᄾᄾ',
='ᄋᄀ',
='ᄋᄃ',
='ᄋᄆ',
='ᄋᄇ',
='ᄋᄉ',
='ᄋᅀ',
='ᄋᄋ',
='ᄋᄌ',
='ᄋᄎ',
='ᄋᄐ',
='ᄋᄑ',
='ᄌᄋ',
='ᅎᅎ',
='ᅐᅐ',
='ᄎᄏ',
='ᄎᄒ',
='ᄑᄇ',
='ᄒᄒ',
='ᄀᄃ',
='ᄂᄉ',
='ᄂᄌ',
='ᄂᄒ',
='ᄃᄅ',
='ᄃᄆ',
='ᄃᄇ',
='ᄃᄉ',
='ᄃᄌ',
='ᄅᄀ',
='ᄅᄃ',
='ᄅᄆ',
='ᄅᄇ',
='ᄅᄫ',
='ᄅᄉ',
='ᄅᄌ',
='ᄅᄏ',
='ᄆᄀ',
='ᄆᄃ',
='ᄆᄉ',
='ᄇᄏ',
='ᄇᄒ',
='ᄋᄅ',
='ᄋᄒ',
='ᄐᄐ',
='ᄑᄒ',
='ᄒᄉ',
='ᅙᅙ',

='＠ᅩᅡ＠',
='＠ᅮᅥ＠',
='＠ᅡ＠ᅩ',
='＠ᅩᅡ',
='＠ᅩ＠ᅥ',
='＠ᅮᅥ',
='＠ᅥ＠ᅡ',
='＠ᅮᅥ＠',
='＠ᅩᅡ＠',
='＠ᅮᅡ＠',
='ᅩ＠ᅥ＠',
='ᅮ＠ᅥ＠',
='ᅩ＠ᅡ＠',
='ᅵ＠ᅡᅩ',
='ᅵ＠ᅡ＠',
='ᅵ＠ᅥ＠',
='＠ᅡ＠',
='＠ᅥ＠',
='＠ᅡᅩ',
='＠ᅥᅩ',
='＠ᅥᅮ',
='＠ᅩᅩ',
='＠ᅩ＠',
='＠ᅮᅡ',
='＠ᅮᅥ',
='＠ᅮᅮ',
='＠ᅮ＠',
='＠ᅡᅮ',
='＠ᅩᅡ',
='＠ᅩᅥ',
='＠ᅮᅩ',
='ᅵ＠ᅡ',
='ᅩ＠ᅡ',
='ᅩ＠ᅥ',
='ᅮ＠ᅥ',
='ᅵ＠ᅥ',
='ᅵ＠ᅩ',
='ᅵ＠ᅮ',
='ᅩᅡ＠',
='ᅮᅥ＠',
='ᅩᅥ＠',
='ᅮᅡ＠',
='ᅮᅥᅳ',
='ᅳᅵᅮ',
='ᅩᅩᅵ',
='ᅮᅵ＠',
='ᅳᅥ＠',
='ᅵᅩᅵ',
='ᆞᅥ＠',
='＠ᅡ',
='＠ᅥ',
='＠ᅩ',
='＠ᅮ',
='ᅡ＠',
='ᅥ＠',
='ᅩᅡ',
='ᅩ＠',
='ᅮᅥ',
='ᅮ＠',
='ᅳ＠',
='ᅡᅩ',
='ᅡᅮ',
='ᅥᅩ',
='ᅥᅮ',
='ᅥᅳ',
='ᅩᅥ',
='ᅩᅩ',
='ᅩᅮ',
='ᅮᅡ',
='ᅮᅮ',
='ᅳᅮ',
='ᅳᅳ',
='ᅵᅡ',
='ᅵᅩ',
='ᅵᅮ',
='ᅵᅳ',
='ᅵᆞ',
='ᆞᅥ',
='ᆞᅮ',
='ᆞ＠',
='ᆞᆞ',
='ᅡᅳ',
='ᅳᅡ',
='ᅳᅥ',
='ᅳᅩ',
='ᅵ＠',
='ᆞᅡ',

='ᆨᆺᆨ',
='ᆯᆨᆺ',
='ᆯᆮᇂ',
='ᆯᆷᆨ',
='ᆯᆷᆺ',
='ᆯᆸᆺ',
='ᆯᆸᇂ',
='ᆯᆺᆺ',
='ᆷᆺᆺ',
='ᇰᆨᆨ',
='ᆮᆮᆸ',
='ᆮᆺᆨ',
='ᆯᆨᆨ',
='ᆯᆨᇂ',
='ᆯᆯᆿ',
='ᆯᆷᇂ',
='ᆯᆸᆮ',
='ᆯᆸᇁ',
='ᆯᇹᇂ',
='ᆷᆫᆫ',
='ᆷᆸᆺ',
='ᆸᆯᇁ',
='ᆸᆺᆮ',
='ᆺᆺᆨ',
='ᆺᆺᆮ',
='ᆽᆸᆸ',
='ᆨᆨ',
='ᆨᆺ',
='ᆫᆽ',
='ᆫᇂ',
='ᆯᆨ',
='ᆯᆷ',
='ᆯᆸ',
='ᆯᆺ',
='ᆯᇀ',
='ᆯᇁ',
='ᆯᇂ',
='ᆸᆺ',
='ᆺᆺ',
='ᆨᆯ',
='ᆫᆨ',
='ᆫᆮ',
='ᆫᆺ',
='ᆫᇫ',
='ᆫᇀ',
='ᆮᆨ',
='ᆮᆯ',
='ᆯᆫ',
='ᆯᆮ',
='ᆯᆯ',
='ᆯᇦ',
='ᆯᇫ',
='ᆯᆿ',
='ᆯᇹ',
='ᆷᆨ',
='ᆷᆯ',
='ᆷᆸ',
='ᆷᆺ',
='ᆷᇫ',
='ᆷᆾ',
='ᆷᇂ',
='ᆸᆯ',
='ᆸᇁ',
='ᆸᇂ',
='ᆺᆨ',
='ᆺᆮ',
='ᆺᆯ',
='ᆺᆸ',
='ᇰᆨ',
='ᇰᇰ',
='ᇰᆿ',
='ᇰᆺ',
='ᇰᇫ',
='ᇁᆸ',
='ᇂᆫ',
='ᇂᆯ',
='ᇂᆷ',
='ᇂᆸ',
='ᆨᆫ',
='ᆨᆸ',
='ᆨᆾ',
='ᆨᆿ',
='ᆨᇂ',
='ᆫᆫ',
='ᆫᆯ',
='ᆫᆾ',
='ᆮᆮ',
='ᆮᆸ',
='ᆮᆺ',
='ᆮᆽ',
='ᆮᆾ',
='ᆮᇀ',
='ᆯᇰ',
='ᆷᆫ',
='ᆷᆷ',
='ᆷᆽ',
='ᆸᆮ',
='ᆸᆷ',
='ᆸᆸ',
='ᆸᆽ',
='ᆸᆾ',
='ᆺᆷ',
='ᆺᇦ',
='ᆺᇫ',
='ᆺᆽ',
='ᆺᆾ',
='ᆺᇀ',
='ᆺᇂ',
='ᇫᆸ',
='ᇫᇦ',
='ᇰᆷ',
='ᇰᇂ',
='ᆽᆸ',
='ᆽᆽ',
='ᇁᆺ',
='ᇁᇀ',

-- compatibility jamo
='ᄅᄀᄉ',
='ᄅᄇᄉ',
='ᄇᄉᄀ',
='ᄇᄉᄃ',
='ᄀᄀ',
='ᄃᄃ',
='ᄇᄇ',
='ᄀᄉ',
='ᄂᄌ',
='ᄂᄒ',
='ᄅᄀ',
='ᄅᄆ',
='ᄅᄇ',
='ᄅᄉ',
='ᄅᄐ',
='ᄅᄑ',
='ᄅᄒ',
='ᄇᄉ',
='ᄉᄉ',
='ᄌᄌ',
='ᄂᄂ',
='ᄂᄃ',
='ᄂᄉ',
='ᄂᅀ',
='ᄅᄃ',
='ᄅᅀ',
='ᄅᅙ',
='ᄆᄇ',
='ᄆᄉ',
='ᄆᅀ',
='ᄇᄀ',
='ᄇᄃ',
='ᄇᄌ',
='ᄇᄐ',
='ᄫᄫ',
='ᄉᄀ',
='ᄉᄂ',
='ᄉᄃ',
='ᄉᄇ',
='ᄉᄌ',
='ᄋᄋ',
='ᅌᄉ',
='ᅌᅀ',
='ᄒᄒ',
='ᄀ',
='ᄂ',
='ᄃ',
='ᄅ',
='ᄆ',
='ᄇ',
='ᄉ',
='ᄋ',
='ᄌ',
='ᄎ',
='ᄏ',
='ᄐ',
='ᄑ',
='ᄒ',
='ᅟ', -- filler
='ᄝ',
='ᄫ',
='ᅀ',
='ᅌ',
='ᅗ',
='ᅙ',

='＠ᅩ＠ᅡᅵ',
='＠ᅮ＠ᅥᅵ',
='＠ᅩ＠ᅡ',
='＠ᅮ＠ᅥ',
='＠ᅡᅵ',
='＠ᅥᅵ',
='ᅩᅡᅵ',
='ᅮᅥᅵ',
='＠ᅩᅵ',
='＠ᅮᅵ',
='ᅡᅵ',
='＠ᅡ',
='ᅥᅵ',
='＠ᅥ',
='ᅩᅡ',
='ᅩᅵ',
='＠ᅩ',
='ᅮᅥ',
='ᅮᅵ',
='＠ᅮ',
='ᅳᅵ',
='ᅡ',
='ᅥ',
='ᅩ',
='ᅮ',
='ᅳ',
='ᅵ',
='ᆞ',
}

-- Ordered rewrite rules consumed line by line in export.tr. Each rule is
-- `pattern<TAB>replacement`; a replacement of `×` means "delete the match",
-- `#` starts a comment, and the `BREAK<TAB>n` rows mark the points where
-- export.tr runs hand-written processing instead of a plain substitution.
-- NOTE(review): several patterns in this string (e.g. `%(+%)`, `(+)(?)()`
-- and the "hyphens within syllables (legacy)" rows) appear to have lost
-- their `[...]` character classes; compare with the upstream module.
local tt = [==[
BREAK	1

# remove hanja from (ex.) 사뎐(辭典)
# caps prob. isn't necessary since the "base" text is actually hangeul?
# Hani regex is a reasonable subset of Hani from ],
# last checked on 20220221
%(+%)	×

# to yale

# non-simple
gᄋ	Ğ # voiced velar fricative /ɣ/
ᄋᄋ	Ő
＠ᅩᅡ	ywa
＠ᅮᅥ	ywe
＠ᅮ	ywu
＠ᅩ	ywo
ᅩᅡ	wa
ᅮᅥ	we
ᅵᆞ	yo
ᆞᆞ	yo

# choseong
ᄀ	K
ᄂ	N
ᄃ	T
ᄅ	L
ᄆ	M
ᄇ	P
ᄉ	S
ᄋ	Ø
ᄌ	C
ᄎ	CH
ᄏ	KH
ᄐ	TH
ᄑ	PH
ᄒ	H
ᄝ	◆
ᄫ	Ƃ
ᅗ	◆
ᄛ	◆
ᅌ	Ŋ
ᅀ	Z
ᅙ	Q
ᄼ	◆
ᅎ	◆
ᅔ	◆
ᄾ	◆
ᅐ	◆
ᅕ	◆
ᅟ	× # filler

# jungseong
＠	y
ᅡ	a
ᅥ	e
ᅩ	wo
ᅮ	wu
ᅳ	u
ᅵ	i
ᆞ	o
ᅠ	× # filler

# jongseong
ᆨ	k
ᆫ	n
ᆮ	t
ᆯ	l
ᆷ	m
ᆸ	p
ᆺ	s
ᆼ	ø
ᆽ	c
ᆾ	ch
ᆿ	kh
ᇀ	th
ᇁ	ph
ᇂ	h
ᇢ	W
ᇦ	ƃ
ᇴ	◆
ퟝ	◆
ᇰ	ŋ
ᇫ	z
ᇹ	q

# tone
〮	↑
〯	→

# tone diacritic location
(+)(?)()	%1%3%2

# hyphens within syllables
BREAK	2

# hyphens within syllables (legacy)
# CV-y
# CVC-C
# CV-C
# C-V
%-%-%-%-(.-+)(y)	%1-%2
%-%-%-(.-+)()	%1-%2
%-%-%-(.-+)	%1-
%-%-(.-)()	%1-%2

# 子(ᄌᆞ)ㅣ
(%))(%-?)i	%1%2y

Ø	×

BREAK	3

↑	́
→	̌
↓	̀

ğ	G
ő	OO
Ø	NG # capitalized hanja readings
ø	ng
ƃ	W
Ŋ	NG # capitalized hanja readings
ŋ	ng
]==]

-- Normalise the rule table before export.tr splits it into lines.
tt = mw.text.trim(tt)
-- Strip comments: a `#`, any whitespace in front of it, and everything after
-- it up to (not including) the end of the line. Matching only the `#`
-- characters would leave the comment text inside the replacement column.
tt = mw.ustring.gsub(tt, '%s*#[^\n]*', '')
tt = mw.ustring.gsub(tt, '\n+', '\n') -- remove empty lines

-- Jamo inventories used to build character classes in export.tr; they list
-- the same characters as the corresponding sections of `tt`:
-- a = choseong, b = jungseong (plus the '＠' placeholder), c = jongseong,
-- d = tone marks.
local a, b, c, d = 'ᄀᄂᄃᄅᄆᄇᄉᄋᄌᄎᄏᄐᄑᄒᄝᄫᅗᄛᅌᅀᅙᄼᅎᅔᄾᅐᅕᅟ', '＠ᅡᅥᅩᅮᅳᅵᆞᅠ', 'ᆨᆫᆮᆯᆷᆸᆺᆼᆽᆾᆿᇀᇁᇂᇢᇦᇴퟝᇰᇫᇹ', '〮〯'

--- Transliterate Middle Korean text written in Hangul.
-- The text is decomposed (NFD), complex jamo are broken apart via
-- `tt_complex`, and the rules in `tt` are then applied in order, with
-- hand-written processing at each `BREAK` row.
-- @param text the text to transliterate
-- @param lang language code (not read by this function)
-- @param sc script code (not read by this function)
-- @return the transliterated string, or nil when `text` contains no Hangul
function export.tr(text, lang, sc)
	local sub = mw.ustring.sub

	-- NOTE(review): the `r` in the first pattern may originally have been
	-- followed by a character class (e.g. for <rb>/<rp>/<rt>); confirm
	-- against the upstream module.
	text = gsub(text, "%<%/?r%>", "")
	text = gsub(text, "%<%/?ruby%>", "")

	-- Nothing to do unless the text contains at least one Hangul character.
	-- Assumes getCharacters() yields text usable inside a `[...]` class --
	-- TODO confirm against Module:scripts.
	if not mw.ustring.match(text, '[' .. chars_Hang .. ']') then
		return nil
	end

	-- Truthy when the input carries any of the tone marks listed in `d`.
	local bool_tone_marking = mw.ustring.find(text, ('[%s]'):format(d))

	text = mw.ustring.toNFD(text)

	-- Break complex jamo apart, one input character at a time.
	text = mw.ustring.gsub(text, '.', tt_complex)

	for line in mw.text.gsplit(tt, '\n') do
		local _, __, pattern, repl = mw.ustring.find(line, '(.+)\t(.+)')
		local rule = pattern .. repl

		if rule == 'BREAK1' then
			-- add period between hanja readings
			local initials_pat = ('([%s]+)'):format(a)
			text = mw.ustring.gsub(text, '([' .. chars_Hani .. '])%((.-)%)', function(hanja, reading)
				-- keep only the first result of gsub (the rewritten string)
				local dotted = mw.ustring.gsub(reading, initials_pat, '.%1')
				return hanja .. '(' .. dotted .. ')'
			end)

			if bool_tone_marking then
				-- move the location of tone marks for easier handling and
				-- mark low tone
				local syllable_pat = ('([%s]+)([%s]+)([%s]*)([%s]*)'):format(a, b, c, d)
				text = mw.ustring.gsub(text, syllable_pat, function(initial, medial, final, tone)
					-- a syllable without an explicit tone mark gets '↓'
					return initial .. medial .. (tone == '' and '↓' or tone) .. final
				end)
			end
		elseif rule == 'BREAK2' then
			-- handle shifted hyphens (e.g. ->>>곬 "kwol-s")
			-- NOTE(review): the groups written `(+)` / `(*)` in the pattern
			-- below look like they have lost their character classes
			-- (initials, vowel, finals); restore them from upstream.
			local count
			for _ = 1, 10 do
				text, count = mw.ustring.gsub(
					text,
					'%-(>+)(+)(%-?)(y?)(%-?)(+)(%-?)(y?)(%-?)(*)', 
					function(shifts, initials, h1, y1, h2, vowel, h3, y2, h4, finals)
						-- number of '>' = how many letters to the right the
						-- hyphen has to move
						local num_shifts = mw.ustring.len(shifts)

						local initial_len = mw.ustring.len(initials)
						if initials == 'Ø' then
							-- 'Ø' is deleted by a later rule, so it takes up
							-- no letter of the final output
							initial_len = 0
						end
						local y1_len = mw.ustring.len(y1)
						local y2_len = mw.ustring.len(y2)
						-- vowels starting with "w" (wo, wu, ...) count as two letters
						local vowel_len = 1
						if mw.ustring.find(vowel, '^w') then
							vowel_len = 2
						end

						if num_shifts <= initial_len then
							initials = sub(initials, 1, num_shifts) .. '-' .. sub(initials, num_shifts + 1, -1)
						elseif num_shifts <= initial_len + y1_len then
							y1 = y1 .. '-'
						elseif num_shifts <= initial_len + y1_len + vowel_len then
							num_shifts = num_shifts - (initial_len + y1_len)
							if num_shifts == vowel_len then
								vowel = vowel .. '-'
							else
								vowel = sub(vowel, 1, num_shifts) .. '-' .. sub(vowel, num_shifts + 1, -1)
							end
						elseif num_shifts <= initial_len + y1_len + vowel_len + y2_len then
							y2 = y2 .. '-'
						else
							num_shifts = num_shifts - (initial_len + y1_len + vowel_len + y2_len)
							finals = sub(finals, 1, num_shifts) .. '-' .. sub(finals, num_shifts + 1, -1)
						end

						return initials .. h1 .. y1 .. h2 .. vowel .. h3 .. y2 .. h4 .. finals
					end
				)
				-- stop as soon as a pass finds no shifted hyphen
				if count == 0 then
					break
				end
			end

		elseif rule == 'BREAK3' then
			text = mw.ustring.lower(text)

			-- hanja readings
			-- ref. ]
			text = mw.ustring.gsub(text, '()([' .. chars_Hani .. ']+)%((.-)%)()', function(_start_pos, _hanja, reading, _end_pos)
				-- treat final ieung as null if tones are marked (is this a safe assumption?)
				if bool_tone_marking then
					reading = mw.ustring.gsub(reading, 'ø', '')
				end
				-- the hanja are replaced by their reading, in uppercase
				return mw.ustring.upper(reading)
			end)
			-- remove hanja reading leading period
			text = mw.ustring.gsub(text, '^%.', '')
			text = mw.ustring.gsub(text, "'''%.", "'''")
			text = mw.ustring.gsub(text, '(%s)%.', '%1')
		else
			-- ordinary rule row: '×' stands for an empty replacement
			if repl == '×' then
				repl = ''
			end
			text = mw.ustring.gsub(text, pattern, repl)
		end
	end

	-- track failed romanizations
	-- (black diamond instead of U+FFFD to avoid warnings when saving this page)
	if mw.ustring.match(text, '◆') then
		require('Module:debug').track('okm-translit/failed romanization')
	end

	return text
end

return export
Module:okm-translit

Functions

Wikious

Boobota

Sagapedia