-- Template:User:kc kennylau/zh-usex/testcases
-- sandbox of [[Module:zh-usex]]
-- Module table returned at the end of the file; export.show is attached to it
-- further down.
local export = {}
-- Helper modules: m_zh provides check_pron, ts and st; m_languages provides
-- getByCode (all called below).
local m_zh = require("Module:zh")
local m_languages = require("Module:languages")
-- Short local aliases for mw.ustring functions and mw.text.split.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local sub = mw.ustring.sub
local split = mw.text.split
-- Use this when the actual title needs to be known.
local actual_title = mw.title.getCurrentTitle()
-- Use this when testcases need to be able to override the title (for bolding,
-- for instance).
local title = actual_title
-- Page name used for automatic bolding and for suppressing self-links. An
-- already-defined PAGENAME global takes precedence over the title text, and
-- export.show reassigns this when a pagename argument is supplied.
local PAGENAME = PAGENAME or title.text
-- Shared data tables, loaded with mw.loadData.
local data = mw.loadData("Module:zh-usex/data")
local punctuation = data.punctuation
local ref_list = data.ref_list
local pron_correction = data.pron_correction
local polysyllable_pron_correction = data.polysyllable_pron_correction
-- Closing tag paired with the zh_format_start_* spans built in export.show.
local zh_format_end = "</span>"
-- Han_pattern is searched for in the generated transcription (export.show);
-- UTF8_char is spliced into patterns in get_tr and UTF8_char2 in convert.
-- NOTE(review): in this copy Han_pattern is an empty string and both UTF8_char
-- values are a bare '*'; the pattern text (presumably bracketed character
-- classes) appears to have been lost -- confirm against the upstream module.
--local Han_pattern = ""
local Han_pattern = ""
local UTF8_char = '*'
local UTF8_char2 = '*' -- not ""
-- Per-variety settings for segmentation and transcription.
--
-- Fields of each entry:
--   segment_c      characters that delimit links (read by export.show)
--   separator_conv table mapping a separator to what is emitted in its place
--   link_ignore    characters that do not affect links
--   tr_cap         whether the transliteration can be capitalised
--   combine        function called by get_tr (via tr_datapoint) on a stored
--                  or looked-up reading; the `default` entry defines none
--
-- NOTE(review): the keys inside every separator_conv table and the key of the
-- third entry are missing in this copy (`={...}` / `= {`), so the table does
-- not parse as written; presumably bracketed keys were lost -- confirm against
-- the upstream module.
local tr_data = {
cmn = {
segment_c = " %-",
separator_conv = {="",=" ",="",="-"},
link_ignore = "\1.^",
tr_cap = true,
-- Note: returns gsub's result directly, i.e. both the string and the
-- substitution count.
combine = function(t)
return t:gsub("^%f","\3") -- temporary substitute of the apostrophe
end,
},
yue = {
segment_c = " ",
separator_conv = {="",=" "},
link_ignore = "\1",
tr_cap = false,
-- Prefixes the reading with a space.
combine = function(t) return " "..t end,
},
= {
segment_c = " ~",
separator_conv = {="",=" ",="-"},
link_ignore = "\1%%.^",
tr_cap = true,
-- Prefixes the reading with a hyphen.
combine = function(t) return "-"..t end,
},
hak = {
segment_c = " ~",
separator_conv = {="",=" ",="-"},
link_ignore = "\1.^",
tr_cap = true,
-- Prefixes the reading with a hyphen.
combine = function(t) return "-"..t end,
},
-- Used by export.show as the fallback set of segmentation settings.
default = {
segment_c = " ",
separator_conv = {="",=" "},
link_ignore = "\1",
tr_cap = false,
},
}
-- Builds the automatic romanisation for the display text of one segment.
--
-- display:   segment text; it may contain \1 placeholders, <b> tags and
--            `{...}` groups holding a manually given reading.
-- norm_code: variety code handed to m_zh.check_pron; "cmn" and "yue" also
--            change how "." is post-processed below.
-- Returns the romanised string after the stored manual readings and
-- punctuation have been substituted back in.
--
-- NOTE(review): several statements look as if a bracketed subscript or a
-- character class was lost from this copy (`tr_datapoint = tr_data`,
-- `given = ...`, `punc = punctuation`, `if punctuation then`,
-- `res = polysyllable_pron_correction or pron_correction`, `word_regex = "+"`,
-- the "%" pattern). As written they do not do what the adjacent comments
-- describe -- confirm against the upstream module before relying on them.
local function get_tr(display, norm_code)
local given, given_pos = {}, 1 -- record the characters with given transcription
local punc, punc_pos = {}, 1 -- record the punctuations with given transcription
local tr_datapoint = tr_data
local word_regex = "+" -- regex that matches words
-- Single chained pass: placeholders -> spaces, manual readings -> "{",
-- punctuation -> "}", bold tags -> \1/\2, then each word -> its reading.
local tr_word = display:gsub("\1", " ")
:gsub("%", "")
:gsub("("..UTF8_char.."){(*)}", function(a,b) -- record given tr and replace with "{"
-- A reading attached to a single alphanumeric is stored as given;
-- otherwise it goes through the variety's combine function.
given = a:find("^%w$") and b or tr_datapoint.combine(b)
given_pos = given_pos + 1
return "{"
end)
:gsub("%f+%f", function(a) -- record punctuation and replace with "}"
if punctuation then
punc = punctuation
punc_pos = punc_pos + 1
return "}"
end
return a
end)
:gsub("<b>","\1"):gsub("</b>","\2") -- substitute bold tags for further processing
:gsub(word_regex,function(word)
-- first attempt to get the pronunciation of the whole word
local res = polysyllable_pron_correction
or pron_correction
if res then return res end
local length = 0 -- for check_pron (a bit hacky because check_pron only checks if length == 1)
if word:find("^"..UTF8_char.."$") then length = 1 end
res = m_zh.check_pron(word, norm_code, length)
-- Everything from the first "/" onwards is dropped from the result.
if res then return tr_datapoint.combine(res:gsub("/.+","")) end
-- if it fails, get pronunciation of each character
return word:gsub(UTF8_char, function(ch)
local ch_res = pron_correction
if ch_res then return ch_res end
ch_res = m_zh.check_pron(ch, norm_code, 1)
-- A character with no known reading is passed through unchanged.
return ch_res and tr_datapoint.combine(ch_res:gsub("/.+","")) or ch
end)
end)
-- For cmn a ".." becomes "-"; for every variety except yue a remaining "."
-- becomes a space.
if norm_code == "cmn" then
tr_word = tr_word:gsub("%.%.","-")
end
if norm_code ~= "yue" then
tr_word = tr_word:gsub("%."," ")
end
-- Restart both counters before replaying the "{" and "}" placeholders.
given_pos, punc_pos = 0,0
tr_word = tr_word:gsub("{",function() -- substitute back the stored results
given_pos = given_pos + 1
return given
end)
:gsub("}",function() -- substitute back the punctuations
punc_pos = punc_pos + 1
return punc
end)
return tr_word
end
-- Builds the link text for one segment.
--
-- target:  link target; an empty string means "use display as the target".
-- display: text to be shown for the link.
-- Returns the string held in `result`.
--
-- NOTE(review): in this copy `result` is the bare string "]" and uses neither
-- target nor display; the link construction appears to have been lost --
-- confirm against the upstream module.
local function make_link(target, display)
target = target == "" and display or target
-- Remove bold tags from target
target = target:gsub("</?b>","")
-- Generate link to Chinese section
local result = "]"
-- For debugging purposes
--if actual_title.nsText == "Module" then mw.log(display, target, "->", result) end
return result
end
-- Applies conv_fun to the parts of text selected by the pattern below: for
-- each match the first capture is converted and the second is appended
-- unchanged. A sentinel "A" is appended before matching and stripped again by
-- the final :sub(1,-2), which also discards gsub's substitution count.
--
-- NOTE(review): the pattern looks truncated in this copy (unbalanced "]" and a
-- trailing "%"), and the callback expects two captures where only one group is
-- visible -- confirm against the upstream module.
local function convert(conv_fun, text)
return (text .. "A")
:gsub("(]*)"..UTF8_char2.."%",
function(a,b) return conv_fun(a)..b end)
:sub(1,-2)
end
-- Template entry point: formats one Chinese usage example or quotation.
--
-- frame: the invocation frame. The category is read from `frame.args` and the
--        template arguments from `frame:getParent().args`, which are validated
--        by Module:parameters against `params`.
-- Returns a string of HTML/wikitext: either a `<dl class="zhusex">` block
-- layout or a one-line inline layout whose parts are joined by " ― ".
-- Raises an error when the category or the example is missing, when the
-- variety is not recognised, when `simp` is used where it is not allowed, or
-- when a segment separator has no mapping.
-- Side effects: reassigns the module-level `title` and `PAGENAME` when a
-- pagename argument is given, sets args.tr_nocap for some varieties, and
-- records tracking entries through Module:debug.
--
-- NOTE(review): this copy has lost text that stood inside square brackets.
-- Many statements read a whole table where a subscript is expected (`args`,
-- `frame.args`, `tr_data`, `punctuation`, `cache`, `ref_list`, ...), several
-- table constructors have keys with no name (`= {...}`, `="..."`), and a
-- number of patterns and link strings are truncated. Those lines are kept
-- verbatim; confirm them against the upstream module.
function export.show(frame)
	local params = {
		= { required = true }, -- example
		= {}, -- translation
		= {}, -- variety
		lit = {},
		tr = {},
		ref = {}, r = { alias_of = "ref" },
		inline = {},
		audio = {}, a = { alias_of = "audio" },
		collapsed = { type = "boolean" },
		-- Allow specifying pagename in testcases on documentation page.
		pagename = actual_title.nsText == "Template" and {} or nil,
		nocat = { type = "boolean" },
		tr_nocap = { type = "boolean" },
		simp = { type = "boolean" }
	}
	local category = frame.args or error("Please specify the category.")
	local args, unrecognized_args = require("Module:parameters").process(frame:getParent().args, params, true)
	if args.pagename then
		-- Override title in Module namespace.
		title = mw.title.new(args.pagename)
		PAGENAME = title.text
	end
	local example = args or error("Example unspecified.")
	local translation = args
	local literal = args
	local reference = args
	local manual_tr = args
	local display = args
	local inline = args
	local audio_file = args
	local collapsed = args
	local simp = args
	local phonetic = ""
	-- Byte length of the example after stripping; used further down to choose
	-- between the inline and the block layout.
	local original_length = example:gsub("+",""):len()
	local variety = args or frame.args or (ref_list and ref_list or false) or "cmn"
	local variety_data = data.varieties_by_code or data.varieties_by_old_code or error("Variety " .. variety .. " not recognized.")
	-- unpack() doesn't work here because the data was loaded using mw.loadData()
	local std_code, norm_code, desc, tr_desc = variety_data, variety_data, variety_data, variety_data
	norm_code = norm_code or std_code
	variety = std_code
	local lang_obj_wikt = m_languages.getByCode(variety, 3, "allow etym")

	-- Tracking of unusual input.
	if next(unrecognized_args) then
		--]
		require("Module:debug").track_unrecognized_args(unrecognized_args, "zh-usex")
	end
	if reference then
		require("Module:debug").track("zh-usex/ref")
	end
	if example:find("") then
		require("Module:debug").track("zh-usex/parentheses")
	end
	if example:find("&#") then
		require("Module:debug").track("zh-usex/html")
	end
	-- future escape character?
	if example:find("`") then
		require("Module:debug").track("zh-usex/backtick")
	end
	if example:find(" ") then
		require("Module:debug").track("zh-usex/double-space")
	end
	if (norm_code == "nan-hbl" or norm_code:find("^hak")) and example:find("%-") then
		require("Module:debug").track("zh-usex/hyphen")
	end
	if example:find("%w%{") then
		require("Module:debug").track("zh-usex/rom-text")
	end

	-- Placeholder text when no translation was supplied.
	if not translation or translation == '' then -- per standard ]
		translation = '<small>(please add an English translation of this ' .. (category == "quotations" and "quotation" or "usage example") .. ')</small> ]'
	end

	-- should we generate the other (simp/trad) form
	-- (in the end, only actually display if the converted text is different)
	local do_conv = true
	if norm_code == "vi" or norm_code == "ko" then
		do_conv = false
	end
	-- Conversion direction: m_zh.ts by default, m_zh.st when the input is
	-- declared to be simplified.
	local conv_fun = m_zh.ts
	if simp then
		if category ~= "quotations" then error("parameter simp cannot be true in ] or ].") end
		if norm_code == "vi" or norm_code == "ko" or norm_code == "lzh" or variety == "yue-HK" or variety == "cmn-TW" or
			variety == "nan-hbl-TW" or variety == "lzh-cmn-TW" or variety == "hak-hai" or variety == "hak-dab" or
			variety == "hak-zha" then
			error(("Parameter simp= cannot be specified for variety '%s'"):format(variety))
		end
		conv_fun = m_zh.st
	end

	-- should we generate the transcription
	local generate_tr = false
	if tr_data then
		if manual_tr then
			require("Module:debug").track("zh-usex/manual-tr")
		else
			generate_tr = true
		end
	end

	local boldify = false
	-- automatically boldify pagetitle if nothing is in bold
	if not example:find("'''") and not punctuation then
		boldify = true
	end

	-- tidying up the example, making it ready for transcription
	example = gsub(example, "", " %0 ")
	example = example:gsub("— —", "——") -- double em-dash (to be converted to single em-dash later)
		:gsub("<br */?>"," <br> ") -- process linebreaks
		:gsub("^ *",""):gsub(" *$",""):gsub(" +"," ") -- process spaces
		:gsub("%%]%f]",function(a) -- process ]
			return a:gsub(" ","\1")
		end)
		:gsub("'''(+)'''", "<b>%1</b>") -- normalise bold syntax
		:gsub("%^<b>","<b>^")
		:gsub("</b>(%)","%1</b>")
		:gsub("</b>({*})","%1</b>")

	-- parsing: convert "-", "--", "---" to "-", "..", "--" respectively
	-- so that "-" is the character that delimits links
	-- further explanation will use the replacement result to refer to the commands
	if norm_code == "cmn" then
		example = example:gsub("%-+",{="..",="--"})
		if example:find("%-+\\") then
			require("Module:debug").track("zh-usex/extra-pinyin")
		end
	end

	local regex_data = tr_data or tr_data.default
	local segment_c = regex_data.segment_c -- the characters that delimit links
	local separator_conv = regex_data.separator_conv -- the table for separator mapping
	local link_ignore = regex_data.link_ignore -- the characters that do not affect links
	local tr_cap = regex_data.tr_cap -- transliteration can be capitalised
	local segment_regex = "(*)(+)" -- the regex that matches each segment and the separator before it
	local cache = {} -- store the result of each segment
	local trad_text = ""
	local simp_text = ""

	-- generate the transliteration
	-- but store the results in the cache
	-- and also build up trad_text and simp_text
	local tr_text = example:gsub(segment_regex, function(separator,seg)
		separator = separator_conv or error('Invalid separator: "'..separator..'"')
		-- Reuse an earlier result when one is cached.
		if cache then
			trad_text = trad_text .. cache.trad
			simp_text = simp_text .. cache.simp
			return separator..cache.tr
		end
		-- Punctuation is copied to both scripts and transcribed from the
		-- punctuation table.
		if punctuation then
			cache = {
				trad = seg,
				simp = seg,
				tr = punctuation
			}
			trad_text = trad_text .. seg
			simp_text = simp_text .. seg
			return separator..punctuation
		end
		-- An "@" anywhere in the segment suppresses the link; the "@" itself
		-- is removed.
		local generate_link = 0
		seg, generate_link = seg:gsub("@","")
		generate_link = (generate_link == 0)
		local target, display = "", seg
		local pos = seg:find("\\",1,true)
		if generate_link and pos then
			-- move formatting from start of target to display
			-- e.g. <b>^甲\乙 --> 甲\<b>^乙
			local bold = ""
			local caret = ""
			local start = 1
			if seg:sub(1,3) == "<b>" then
				bold,start = "<b>",4
			end
			-- Fix: test only the character at `start`. The previous
			-- `seg:sub(start)` returned the whole remainder of the segment,
			-- which always contains the backslash and so never equalled "^".
			if tr_cap and seg:sub(start,start) == "^" then
				caret,start = "^",start+1
			end
			target, display = seg:sub(start,pos-1), bold..caret..seg:sub(pos+1,-1)
			if target:find("</?b>") then -- Check for bold tags in target.
				require("Module:debug").track("zh-usex/bold-target")
			end
		end
		local target_trad = target:gsub("%","")
		local target_simp = do_conv and convert(conv_fun, target)
		local occurrences = 0
		if boldify then
			-- Fix: the page name is matched literally. Punctuation in it is
			-- escaped so it is not read as pattern syntax, and the
			-- replacement is a function so "%" in it is not read as a
			-- capture reference.
			local pagename_pattern = (PAGENAME:gsub("%p", "%%%0"))
			display, occurrences = display:gsub(pagename_pattern, function(found)
				return "<b>" .. found .. "</b>"
			end)
		end
		if occurrences > 0 then
			display = display:gsub("%","%")
				:gsub("%^<b>","<b>^")
				:gsub("</b>(%)","%1</b>")
				:gsub("</b>({*})","%1</b>")
		end
		-- Display text with the `{...}` manual readings removed.
		local display_derom = display:gsub("{*}","")
			:gsub("+","")
		local display_trad = display_derom:gsub("%","")
		local display_simp = do_conv and convert(conv_fun, display_derom) or ""
		local seg_tr = generate_tr and get_tr(display, norm_code) or ""
		-- Never link a segment to the current page; bold it instead when
		-- automatic bolding is on and nothing was bolded above.
		if display_trad:gsub("</?b>","") == PAGENAME or target_trad == PAGENAME then
			generate_link = false
			if boldify and occurrences == 0 then
				display_trad = "<b>" .. display_trad .. "</b>"
				display_simp = "<b>" .. display_simp .. "</b>"
				seg_tr = "<b>" .. seg_tr .. "</b>"
			end
		end
		local seg_trad = generate_link and make_link(target_trad, display_trad) or display_trad
		local seg_simp = generate_link and do_conv and make_link(target_simp, display_simp) or display_simp
		cache = {
			trad = seg_trad,
			simp = seg_simp,
			tr = seg_tr
		}
		trad_text = trad_text .. seg_trad
		simp_text = simp_text .. seg_simp
		return separator..seg_tr
	end)

	-- Only show the second script form when it differs from the first.
	if trad_text == simp_text then
		do_conv = false
		simp_text = nil
	end
	if not trad_text:find("</?b>") then
		require("Module:debug").track("zh-usex/no-bold")
	end

	-- format generated tr
	-- at this point we have three temporary substitutions:
	-- <b>:\1, </b>:\2, ':\3
	if generate_tr then
		if norm_code == "cmn" then -- format apostrophe
			tr_text = tr_text
				:gsub("%f(*)\3", "%1")
				:gsub("\1\3","\3\1") -- <b>' → '<b>
				:gsub("^\3","\3^") -- ^' → '^ (shouldn't occur)
		elseif norm_code == "nan-hbl" or norm_code == "hak" then -- format hyphens
			mw.log(tr_text)
			tr_text = tr_text
				:gsub("%^%-","-^")
				:gsub("\1%-","-\1") -- <b>- → -<b>
				:gsub("%-\2","\2-") -- -</b> → </b>-
				:gsub("%f%-%f","") -- "-chhek" at beginning -> "chhek"
				:gsub("%f%-%f","") -- "shi-" at the end -> "shi"
				:gsub("%-+","-")
				:gsub("%-?%%%-?", "--")
			mw.log(tr_text)
		end
		-- Turn the temporary substitutions back into their real text.
		tr_text = tr_text:gsub("",{="<b>",="</b>",="'"})
		if tr_text:find(Han_pattern) then
			require("Module:debug").track("zh-usex/character without transliteration")
		end
	end

	local tag_start = " <span style=\"color:darkgreen; font-size:x-small;\">[" -- HTML entity since "]" is interpreted poorly
	local tag_end = "]</span>"
	local simp_link = "<i>]</i>"
	local trad_link = "<i>]</i>"
	if simp then
		simp_link, trad_link = trad_link, simp_link
	end

	-- Fix: auto_spaces used to be assigned as a global.
	local auto_spaces
	trad_text, auto_spaces = trad_text:gsub("(%]%])(%)", "%1 %2")
	simp_text = do_conv and simp_text:gsub("(%]%])(%)", "%1 %2") or false
	-- A manual transcription wins over the generated one; phonetic is false
	-- when neither exists.
	phonetic = manual_tr or (generate_tr and tr_text)
	if auto_spaces > 0 then
		require("Module:debug").track("zh-usex/auto-spaces")
	end

	-- overall transcription formatting
	if phonetic then
		phonetic = gsub(phonetic, " </b>", "</b> ")
		phonetic = gsub(phonetic, " ", " ")
		if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "nan-tws" or norm_code == "nan-hnm" or
			norm_code == "zhx-sic" or norm_code == "cjy" or norm_code == "hsn" or norm_code == "gan" or
			variety == "hak-mei" then
			phonetic = gsub(phonetic, "(+)(+)", "%1<sup>%2</sup>") -- superscript tones
		end
		phonetic = gsub(phonetic, " ()", "%1") -- remove excess spaces from punctuation
		phonetic = gsub(phonetic, "() ", "%1")
		phonetic = phonetic:gsub(" <br> ", "<br>")
		if not manual_tr then
			if norm_code == "nan-hbl" then
				phonetic = gsub(phonetic, " +%-%-", "--")
			end
		end
		-- capitalisation
		if not manual_tr then
			-- These varieties are never capitalised.
			if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "cjy" or norm_code == "hsn" or
				norm_code == "cmn-wuh" or norm_code == "nan-tws" or norm_code == "wxa" or norm_code == "wuu" or
				variety == "hak-mei" then
				args.tr_nocap = true
			end
			-- "^" marks the next letter for upper-casing; the markers are
			-- consumed by the three gsub calls at the end of this block.
			if not args.tr_nocap and match(example, "") then
				phonetic = "^" .. gsub(phonetic, "() ", "%1 ^")
			end
			if not args.tr_nocap then
				phonetic = gsub(phonetic, "() (.)", "%1 ^%2")
				phonetic = gsub(phonetic, "<br>(.)", "<br>^%1")
				phonetic = gsub(phonetic, ": ()(.)", ": %1^%2")
			end
			phonetic = gsub(phonetic, "%^<b>", "<b>^")
			phonetic = gsub(phonetic, "%^+.", mw.ustring.upper)
			phonetic = gsub(phonetic, "%^", "")
		end
		-- Variety-specific wrapping of the transcription.
		if norm_code == "wuu" then
			local wuu_pron = require("Module:wuu-pron")
			if phonetic:find(":") then
				phonetic = "''" .. wuu_pron.wugniu_format(phonetic:sub(4)) .. "''"
			else
				phonetic = "''" .. wuu_pron.wugniu_format(wuu_pron.wikt_to_wugniu(phonetic)) .. "''"
			end
		elseif norm_code == "cmn-wuh" or norm_code == "wxa" then
			phonetic = "<span class=\"IPA\"></span>"
		elseif norm_code == "cdo" then
			local cdo_pron = require("Module:cdo-pron")
			phonetic = "<i>" .. phonetic .. "</i>" ..
				(not match(phonetic, "-+-+-+-")
				and " / <span class=\"IPA\"><small></small></span>"
				or "")
		else
			phonetic = "<i>" .. phonetic .. "</i>"
		end
		phonetic = "<span lang=\"zh-Latn\" style=\"color:#404D52\">" .. phonetic .. "</span>"
	end

	-- Collapsible-layout pieces; all empty unless `collapsed` is set.
	local collapse_start, collapse_end, collapse_tag, collapse_border_div, collapse_border_div_end = '', '', '', '', ''
	-- Fix: simplified_start used to be assigned as a global.
	local simplified_start = '<br>'
	if collapsed then
		collapse_start = '<span class="vsHide">'
		collapse_end = '</span>'
		collapse_tag = '<span class="vsToggleElement" style="color:darkgreen; font-size:x-small;padding-left:10px"></span>'
		collapse_border_div = '<div class="vsSwitcher" data-toggle-category="usage examples" style="border-left: 1px solid #930; border-left-width: 2px; padding-left: 0.8em;">'
		collapse_border_div_end = '</div>'
		simplified_start = '<hr>'
	end

	-- Fix: cat used to be assigned as a global, so a value set by one call
	-- could still be visible to a later call in the same Lua state.
	local cat
	if actual_title.nsText == '' and (not args.nocat) then -- fixme: probably categorize only if text contains the actual word
		if reference then
			cat = "]"
		else
			cat = "]"
		end
	end

	local zh_format_start_simp = "<span lang=\"zh-Hans\" class=\"Hans\">"
	local zh_format_start_trad = "<span lang=\"zh-Hant\" class=\"Hant\">"
	if simp then zh_format_start_simp, zh_format_start_trad = zh_format_start_trad, zh_format_start_simp end

	-- indentation, font and identity tags
	-- Fix: the last test used to read `(inline or "" ~= "")`, which Lua parses
	-- as `inline or ("" ~= "")` because `~=` binds tighter than `or`.
	if ((norm_code == "cmn" and original_length > 7)
		or (norm_code ~= "cmn" and original_length > 5)
		or reference
		or collapsed
		or (match(example, "") and norm_code == "wuu")
		or (norm_code == "cdo" and original_length > 3)
		or ((inline or "") ~= "")) then
		-- Block layout. Fix: these four were assigned as globals.
		local tr_tag, audio, trad_tag, simp_tag
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end
		if not phonetic then
			translation = "<i>" .. translation .. "</i>"
		end
		if phonetic then
			phonetic = "<dd>" .. collapse_start .. phonetic
			translation = "<dd>" .. translation .. "</dd>"
			tr_tag = tag_start .. tr_desc .. tag_end .. collapse_end .. "</dd>"
		else
			translation = "<dd>" .. translation .. "</dd>"
		end
		if audio_file then
			audio = "<dd>]</dd>"
		end
		if do_conv then
			trad_tag = collapse_start .. tag_start .. desc .. ", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
			simp_text = simplified_start .. collapse_start .. zh_format_start_simp .. simp_text .. zh_format_end
			simp_tag = tag_start .. desc .. ", " .. simp_link .. tag_end .. collapse_end
		elseif norm_code == "vi" or norm_code == "ko" then
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
		else
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. " and " .. simp_link .. tag_end .. collapse_end .. collapse_tag
		end
		if reference then
			reference = "<dd>" .. collapse_start .. "<small><i>From:</i> " ..
				(ref_list and ref_list or reference) .. "</small>" .. collapse_end .. "</dd>"
		end
		return collapse_border_div .. "<dl class=\"zhusex\">" .. trad_text .. trad_tag .. (simp_text or "") .. (simp_tag or "") .. (reference or "") ..
			(phonetic and phonetic .. tr_tag or "") .. (audio or "") .. translation .. "</dl>" .. (cat or "") .. collapse_border_div_end
	else
		-- Inline layout. Fix: these four were assigned as globals; ts_tag and
		-- tr_tag stay nil for variety "cmn" so their `or ""` fallbacks apply.
		local divider = " ― "
		local ts_tag, tr_tag, audio
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end
		if variety ~= "cmn" then
			ts_tag = tag_start .. desc .. tag_end
			tr_tag = tag_start .. tr_desc .. tag_end
		end
		if not phonetic then
			translation = "<i>" .. translation .. "</i>"
		end
		if do_conv then
			simp_text = "<span lang=\"zh-Hani\" class=\"Hani\">/</span>" .. zh_format_start_simp .. simp_text .. zh_format_end
		end
		if audio_file then
			audio = " ]"
		end
		return trad_text .. (simp_text or "") .. (ts_tag or "") .. divider ..
			(phonetic and phonetic .. (tr_tag or "") .. (audio or "") .. divider or "") .. translation .. (literal and " (literally, “" .. literal .. "”)" or "") ..
			(cat or "")
	end
end
-- function export.migrate(text, translation, ref)
-- if type(text) == "table" then
-- if not text.args or not text.args then
-- text = text:getParent()
-- end
-- if text.args and text.args ~= '' then
-- ref = text.args
-- translation = text.args
-- text = text.args
-- else
-- text = text.args
-- end
-- end
-- text = text:gsub('^+', ''):gsub('+$', ''):gsub(' +', ' '):gsub('\n+', '<br>'):gsub('|', '\\'):gsub('\'\'\'%%]\'\'\'', ' '):gsub('%]%]%%]', ''):gsub('%[%[', '')
-- :gsub('\'\'\'', ''):gsub(',', ','):gsub('!', '!'):gsub('%?', '?')
-- if translation then
-- if ref and ref ~= '' then
-- return '{{zh-x|' .. text .. '|' .. translation .. '|ref=' .. ref .. '}}'
-- else
-- return '{{zh-x|' .. text .. '|' .. translation .. '}}'
-- end
-- else
-- return text
-- end
-- end
-- Hand the module table (with export.show defined above) to the caller.
return export