-- Table of values exported by this module (returned at the end of the file).
local export = {}
-- List of handler functions; each handler in this file is appended with table.insert() and receives a `data` table.
export.postprocess_handlers = {}
-- Name of the labels module, loaded lazily with require() inside the handler.
local labels_module = "Module:labels"
-- Remove duplicated labels like 'Taiwanese' in 'Taiwanese Hokkien|and|Taiwanese Hakka'. Also remove duplicated labels
-- in things like
-- * 'Quanzhou|_|Hokkien' (which canonicalizes to 'Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Quanzhou|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Anxi|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Anxi|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Quanzhou|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|Zhangzhou Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Anxi|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|Zhangzhou Hokkien|and|Anxi|_|Hokkien').
-- We do two passes. The first pass fixes cases like 'Quanzhou Hokkien|_|Hokkien', irrespective of whether there's an
-- "and" present. The second pass looks for a stretch of labels where (a) all of the labels have the same prefix or
-- suffix, and (b) in between the labels is at least one occurrence of "and" (which can also start out as "&" but is
-- canonicalized to "and"); but (c) we count two labels separated by "_" (which is canonicalized to a blank label) as a
-- single label.
-- Register a postprocess handler that removes redundant repeated prefixes/suffixes from runs of labels.
-- `data.labels` is a list of label objects; the `label` field of individual objects is rewritten in place.
table.insert(export.postprocess_handlers,
	function(data)
		local labels = data.labels
		if #labels == 1 then
			return
		end
		local m_labels = require(labels_module)

		-- First, split the labels into `link` and `display` component parts (done only once). The split values are
		-- indexed by label position and are NOT updated when a label is later rewritten; each affix check therefore
		-- always works from the original form of the label.
		local split_labels = {}
		for i, label in ipairs(labels) do
			local link, display = m_labels.split_display_form(label.label)
			split_labels[i] = {link = link, display = display}
		end

		-- Then compute "label starts" (indices of label sets to consider when looking for runs with the same prefix or
		-- suffix), where a label start is either a single label or a set of two labels separated by an underscore,
		-- and where we take occurrences of "and" into consideration. Each entry records:
		--   `start`: index of the first label of the set;
		--   `followed_by_and`: whether an "and" separates this set from the next one;
		--   `after_underscore`: index of the label following a "_" (blank label) directly after `start`, if any.
		local label_starts = {}
		local i = 1
		while i <= #labels do
			local start = i
			local followed_by_and = false
			local after_underscore
			if i <= #labels - 4 and labels[i + 1].label == "" and labels[i + 2].label == "and"
				and labels[i + 3].label == "" then
				-- 'Foo|_|and|_|Bar'; redundant underscores
				followed_by_and = true
				i = i + 3
			elseif i <= #labels - 2 and labels[i + 1].label == "and" then
				followed_by_and = true
				i = i + 1
			elseif i <= #labels - 2 and labels[i + 1].label == "" then
				-- The label after the underscore is recorded here and is also visited as its own label start on
				-- the next iteration.
				after_underscore = i + 2
				i = i + 1
			end
			table.insert(label_starts, {
				start = start,
				followed_by_and = followed_by_and,
				after_underscore = after_underscore
			})
			i = i + 1
		end

		-- Now the main loop.
		-- Each spec is {"affix", `at_beginning`}, or {{"affix", "affix"}, `at_beginning`} where "affix" is a prefix or
		-- suffix to remove and `at_beginning` indicates whether "affix" is a prefix or suffix. If more than one affix
		-- is listed, any affix counts, e.g. 'Taiwan Mandarin|and|Taiwanese Hokkien'.
		for _, affix_spec in ipairs {
			{{"Taiwanese", "Taiwan"}, true}, {"Chinese"}, {"Gan"}, {"Hakka"}, {"Hokkien"}, {"Mandarin"},
			-- Min needs to go before Southern Min, Eastern Min, etc. because the later check for e.g. Eastern Min
			-- will overwrite the value set by Min if both match. With Min later, we'll end up with e.g.
			-- "Fuqing Eastern Eastern Min".
			{"Min"}, {"Southern Min"}, {"Eastern Min"}, {"Northern Min"}, {"Central Min"}, {"Wu"}, {"Xiang"}
		} do
			local affixes, at_beginning = unpack(affix_spec)
			if type(affixes) == "string" then
				affixes = {affixes}
			end

			-- Does `item` match against the prefix or suffix when both prefix/suffix and something else are
			-- present? If so, return the something else, which is what we need to set the label to if we remove
			-- the prefix/suffix. Otherwise return false. (The affixes are interpolated into a Lua pattern
			-- unescaped; this is safe only because none of them contains a pattern magic character.)
			local function matches_affix_with_space(item)
				for _, affix in ipairs(affixes) do
					local space_regex = at_beginning and "^" .. affix .. " (.+)$" or "^(.+) " .. affix .. "$"
					local rest = item:match(space_regex)
					if rest then
						return rest
					end
				end
				return false
			end

			-- Does `item` match against the prefix or suffix exactly? If so, return an empty string, which is what
			-- we need to set the label to if we remove the prefix/suffix. Otherwise return false.
			local function matches_affix_exactly(item)
				for _, affix in ipairs(affixes) do
					if item == affix then
						return ""
					end
				end
				return false
			end

			-- Does the link or display at `label_index` match with `match_function`? If so, return a three-element
			-- list of `label_index`, `component` (either "link" or "display") and the return value of
			-- `match_function`. The display is tried first, then the link. Otherwise return nil.
			local function check_match(label_index, match_function)
				local split = split_labels[label_index]
				local link, display = split.link, split.display
				local rest = display and match_function(display)
				if rest then
					return {label_index, "display", rest}
				end
				-- Use the caller-supplied matcher for the link as well, so that an "exact" check is not silently
				-- relaxed into an "affix plus something else" check.
				rest = link and match_function(link)
				if rest then
					return {label_index, "link", rest}
				end
				return nil
			end

			-- Given {`label_index`, `component`, `value`}, set the link or display component (depending on
			-- `component`) of the label at `label_index` to `value`. An empty `value` blanks the whole label.
			local function set_component_value(to_erase)
				local label_index, component, value = unpack(to_erase)
				if value == "" then
					labels[label_index].label = ""
				else
					local split = split_labels[label_index]
					local link, display = split.link, split.display
					if component == "display" then
						display = value
					else
						link = value
					end
					labels[label_index].label = m_labels.combine_display_form_parts(link, display)
				end
			end

			-- First pass: Look for two labels separated by an underscore, with the suffix occurring on both parts.
			-- (This shouldn't happen with prefixes.)
			if not at_beginning then
				for _, label_start in ipairs(label_starts) do
					local to_erase = check_match(label_start.start, matches_affix_with_space)
					if to_erase and label_start.after_underscore and
						check_match(label_start.after_underscore, matches_affix_exactly) then
						set_component_value(to_erase)
					end
				end
			end

			-- Second pass.

			-- Check whether a prefix or suffix matches the given label start index (index of a label set in the
			-- `label_starts` list; see above). If it matches, return value is {`index`, `component`, `value`}, i.e.
			-- the label index to change, the component ("link" or "display") to change and the value to set the
			-- component to. Otherwise, return nil.
			local function affix_matches(label_start_index)
				local label_start = label_starts[label_start_index]
				-- If we're dealing with a suffix, there are two cases: (1) 'Quanzhou Hokkien';
				-- (2) 'Quanzhou|_|Hokkien'. If we're dealing with a prefix, there are similarly (1) 'Taiwanese Hakka';
				-- (2) 'Taiwanese|_|Hakka'. In addition, we have to check both the link and the display.
				local to_erase = check_match(label_start.start, matches_affix_with_space)
				if to_erase then
					return to_erase
				end
				local after_underscore = label_start.after_underscore
				if not after_underscore then
					return nil
				end
				return check_match(at_beginning and label_start.start or after_underscore, matches_affix_exactly)
			end

			-- Now, try to find a run of two or more label sets with the same prefix or suffix, with at least one "and"
			-- in the middle.
			local j = 1
			while j <= #label_starts - 1 do
				local saw_and = false
				local run = {}
				local match = affix_matches(j)
				if match then
					table.insert(run, match)
					local k = j + 1
					while k <= #label_starts do
						match = affix_matches(k)
						if not match then
							break
						end
						table.insert(run, match)
						-- An "and" between label set k - 1 and label set k is recorded on the earlier set.
						if label_starts[k - 1].followed_by_and then
							saw_and = true
						end
						k = k + 1
					end
					if #run > 1 and saw_and then
						-- We saw a run of two or more with at least one 'and' in the middle. Remove the prefix or
						-- suffix from all but the last (if we're dealing with a suffix) or all but the first (if we're
						-- dealing with a prefix).
						if at_beginning then
							table.remove(run, 1)
						else
							table.remove(run)
						end
						for _, to_erase in ipairs(run) do
							set_component_value(to_erase)
						end
					end
					-- Label set k (if any) failed to match, so it cannot begin a new run; resume after it.
					j = k + 1
				else
					j = j + 1
				end
			end
		end
	end
)
return export