local export = {}
local rfind = mw.ustring.find
local rsplit = mw.text.split
local u = mw.ustring.char
local rsubn = mw.ustring.gsub
-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
local retval = rsubn(term, foo, bar)
return retval
end
--[==[
-- Reimplementation of mw.ustring.split() that includes any capturing
-- groups in the splitting pattern. This works like Python's re.split()
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
]==]
function export.capturing_split(str, pattern)
local ret = {}
-- (.-) corresponds to (.*?) in Python or Perl; () captures the
-- current position after matching.
pattern = "(.-)" .. pattern .. "()"
local start = 1
while true do
-- Did we reach the end of the string?
if start > #str then
table.insert(ret, "")
return ret
end
-- match() returns all captures as multiple return values;
-- we need to insert into a table to get them all.
local captures = {export.match(str, pattern, start)}
-- If no match, add the remainder of the string.
if #captures == 0 then
table.insert(ret, export.sub(str, start))
return ret
end
local newstart = table.remove(captures)
-- Special case: If we don't advance by any characters, then advance
-- by one character; this avoids an infinite loop, and makes splitting
-- by an empty string work the way mw.ustring.split() does. If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string.
if newstart == start then
table.insert(ret, export.sub(str, start, start))
table.remove(captures, 1)
start = start + 1
if start > #str then
return ret
end
else
table.insert(ret, table.remove(captures, 1))
start = newstart
end
-- Insert any captures from the splitting pattern.
for _, x in ipairs(captures) do
table.insert(ret, x)
end
end
end
--[=[
In order to understand the following parsing code, you need to understand how inflected text specs work. They are
intended to work with inflected text where individual words to be inflected may be followed by inflection specs in
angle brackets. The format of the text inside of the angle brackets is up to the individual language and part-of-speech
specific implementation. A real-world example is as follows: "]<+> ]<*,*#.pr>". This is the
inflection of a multiword expression "меди́чна сестра́", which means "nurse" in Ukrainian (literally "medical sister"),
consisting of two words: the adjective меди́чна ("medical" in the feminine singular) and the noun сестра́ ("sister"). The
specs in angle brackets follow each word to be inflected; for example, <+> means that the preceding word should be
declined as an adjective.
The code below works in terms of balanced expressions, which are bounded by delimiters such as < > or . The
intention is to allow separators such as spaces to be embedded inside of delimiters; such embedded separators will not
be parsed as separators. For example, Ukrainian noun specs allow footnotes in brackets to be inserted inside of angle
brackets; something like "меди́чна<+> сестра́<pr.>" is legal, as is
"]<+> ]<pr.>", and the parsing code should not be
confused by the embedded brackets, spaces or angle brackets.
The parsing is done by two functions, which work in close concert: parse_balanced_segment_run() and
split_alternating_runs(). To illustrate, consider the following:
parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") =
{"foo", "<M.proper noun>", " bar", "<F>", ""}
then
split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") =
{{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}
Here, we start out with a typical inflected text spec "foo<M.proper noun> bar<F>", call parse_balanced_segment_run() on
it, and call split_alternating_runs() on the result. The output of parse_balanced_segment_run() is a list where
even-numbered segments are bounded by the bracket-like characters passed into the function, and odd-numbered segments
consist of the surrounding text. split_alternating_runs() is called on this, and splits *only* the odd-numbered
segments, grouping all segments between the specified character. Note that the inner lists output by
split_alternating_runs() are themselves in the same format as the output of parse_balanced_segment_run(), with
bracket-bounded text in the even-numbered segments. Hence, such lists can be passed again to split_alternating_runs().
]=]
--[==[
Parse a string containing matched instances of parens, brackets or the like. Return a list of strings, alternating
between textual runs not containing the open/close characters and runs beginning and ending with the open/close
characters. For example,
{parse_balanced_segment_run("foo(x(1)), bar(2)", "(", ")") = {"foo", "(x(1))", ", bar", "(2)", ""}}
]==]
function export.parse_balanced_segment_run(segment_run, open, close)
return capturing_split(segment_run, "(%b" .. open .. close .. ")")
-- Повторная реализация mw.ustring.split(), которая включает любые группы захвата -- в шаблоне разделения.
-- Это работает аналогично функции Python re.split() -- за исключением того, что она ведет себя как Lua,
-- когда шаблон разделения -- пуст (т. е. продвигается по одному символу за раз; Python возвращает -- весь остаток строки).
end
--[==[
Split a list of alternating textual runs of the format returned by `parse_balanced_segment_run` on `splitchar`. This
only splits the odd-numbered textual runs (the portions between the balanced open/close characters). The return value
is a list of lists, where each list contains an odd number of elements, where the even-numbered elements of the sublists
are the original balanced textual run portions. For example, if we do
{parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") =
{"foo", "<M.proper noun>", " bar", "<F>", ""}}
then
{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") =
{{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}}
Note that we did not touch the text "<M.proper noun>" even though it contains a space in it, because it is an
even-numbered element of the input list. This is intentional and allows for embedded separators inside of
brackets/parens/etc. Note also that the inner lists in the return value are of the same form as the input list (i.e.
they consist of alternating textual runs where the even-numbered segments are balanced runs), and can in turn be passed
to split_alternating_runs().
If `preserve_splitchar` is passed in, the split character is included in the output, as follows:
{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ", true) =
{{"foo", "<M.proper noun>", ""}, {" "}, {"bar", "<F>", ""}}}
Consider what happens if the original string has multiple spaces between brackets, and multiple sets of brackets
without spaces between them.
{parse_balanced_segment_run("foo baz-bat quux xyzzy", "") =
{"foo", "", "", "", " baz-bat quux xyzzy", "", ""}}
then
{split_alternating_runs({"foo", "", "", "", " baz-bat quux xyzzy", "", ""}, "") =
{{"foo", "", "", "", ""}, {"baz"}, {"bat"}, {"quux"}, {"xyzzy", "", ""}}}
If `preserve_splitchar` is passed in, the split character is included in the output,
as follows:
{split_alternating_runs({"foo", "", "", "", " baz bat quux xyzzy", "", ""}, "", true) =
{{"foo", "", "", "", ""}, {" "}, {"baz"}, {"-"}, {"bat"}, {" "}, {"quux"}, {" "}, {"xyzzy", "", ""}}}
As can be seen, the even-numbered elements in the outer list are one-element lists consisting of the separator text.
]==]
function export.split_alternating_runs(segment_runs, splitchar, preserve_splitchar)
local grouped_runs = {}
local run = {}
for i, seg in ipairs(segment_runs) do
if i % 2 == 0 then
table.insert(run, seg)
else
local parts =
preserve_splitchar and capturing_split(seg, "(" .. splitchar .. ")") or
rsplit(seg, splitchar)
table.insert(run, parts)
for j=2,#parts do
table.insert(grouped_runs, run)
run = {parts}
end
end
end
if #run > 0 then
table.insert(grouped_runs, run)
end
return grouped_runs
end
--[==[
Like split_alternating_runs() but applies an arbitrary function `frob` to "raw-text" segments in the result (i.e.
not stuff within balanced delimiters such as footnotes and inflection specs, and not splitchars if present). `frob`
is a function of one argument (the string to frob) and should return one argument (the frobbed string).
]==]
function export.split_alternating_runs_and_frob_raw_text(run, splitchar, frob, preserve_splitchar)
local split_runs = export.split_alternating_runs(run, splitchar, preserve_splitchar)
--[==[
Apply an arbitrary function `frob` to the "raw-text" segments in a split run set (the output of
split_alternating_runs()). We leave alone stuff within balanced delimiters (footnotes, inflection specs and the
like), as well as splitchars themselves if present. `preserve_splitchar` indicates whether splitchars are present
in the split run set. `frob` is a function of one argument (the string to frob) and should return one argument (the
frobbed string). We operate by only frobbing odd-numbered segments, and only in odd-numbered runs if
preserve_splitchar is given.
]==]
local function frob_raw_text_alternating_runs(split_run_set, frob, preserve_splitchar)
for i, run in ipairs(split_run_set) do
if not preserve_splitchar or i % 2 == 1 then
for j, segment in ipairs(run) do
if j % 2 == 1 then
run = frob(segment)
end
end
end
end
end
frob_raw_text_alternating_runs(split_runs, frob, preserve_splitchar)
return split_runs
end
--[==[
Split the non-modifier parts of an alternating run (after parse_balanced_segment_run() is called) on a Lua pattern,
but not on certain sequences involving characters in that pattern (e.g. comma+whitespace). `splitchar` is the pattern
to split on; `preserve_splitchar` indicates whether to preserve the delimiter and is the same as in
split_alternating_runs(). `escape_fun` is called beforehand on each run of raw text and should return two values:
the escaped run and whether unescaping is needed. If any call to `escape_fun` indicates that unescaping is needed,
`unescape_fun` will be called on each run of raw text after splitting on `splitchar`. The return value of this
function is as in split_alternating_runs().
]==]
function export.split_alternating_runs_escaping(run, splitchar, preserve_splitchar, escape_fun, unescape_fun)
-- First replace comma with a temporary character in comma+whitespace sequences.
local need_unescape = false
for i, seg in ipairs(run) do
if i % 2 == 1 then
local this_need_unescape
run, this_need_unescape = escape_fun(run)
need_unescape = need_unescape or this_need_unescape
end
end
if need_unescape then
return export.split_alternating_runs_and_frob_raw_text(run, splitchar, unescape_fun, preserve_splitchar)
else
return export.split_alternating_runs(run, splitchar, preserve_splitchar)
end
end
--[==[
Replace comma with a temporary char in comma + whitespace.
]==]
function export.escape_comma_whitespace(run, tempcomma)
tempcomma = tempcomma or u(0xFFF0)
local escaped = false
if run:find("\\,") then
run = run:gsub("\\,", "\\" .. tempcomma) -- assign to temp to discard second return value
escaped = true
end
if run:find(",%s") then
run = run:gsub(",(%s)", tempcomma .. "%1") -- assign to temp to discard second return value
escaped = true
end
return run, escaped
end
--[==[
Undo the replacement of comma with a temporary char.
]==]
function export.unescape_comma_whitespace(run, tempcomma)
tempcomma = tempcomma or u(0xFFF0)
run = run:gsub(tempcomma, ",") -- assign to temp to discard second return value
return run
end
--[==[
Split the non-modifier parts of an alternating run (after parse_balanced_segment_run() is called) on comma, but not
on comma+whitespace. See `split_on_comma()` above for more information and the meaning of `tempcomma`.
]==]
function export.split_alternating_runs_on_comma(run, tempcomma)
tempcomma = tempcomma or u(0xFFF0)
-- Replace comma with a temporary char in comma + whitespace.
local function escape_comma_whitespace(seg)
return export.escape_comma_whitespace(seg, tempcomma)
end
-- Undo replacement of comma with a temporary char in comma + whitespace.
local function unescape_comma_whitespace(seg)
return export.unescape_comma_whitespace(seg, tempcomma)
end
return export.split_alternating_runs_escaping(run, ",", false, escape_comma_whitespace, unescape_comma_whitespace)
end
--[==[
Split text on comma, but not on comma+whitespace. This is similar to `mw.text.split(text, ",")` but will not split
on commas directly followed by whitespace, to handle embedded commas in terms (which are almost always followed by
a space). `tempcomma` is the Unicode character to temporarily use when doing the splitting; normally U+FFF0, but
you can specify a different character if you use U+FFF0 for some internal purpose.
]==]
function export.split_on_comma(text, tempcomma)
-- Don't do anything if no comma. Note that split_escaping() has a similar check at the beginning, so if there's a
-- comma we effectively do this check twice, but this is worth it to optimize for the common no-comma case.
if not text:find(",") then
return {text}
end
tempcomma = tempcomma or u(0xFFF0)
-- Replace comma with a temporary char in comma + whitespace.
local function escape_comma_whitespace(run)
return export.escape_comma_whitespace(run, tempcomma)
end
-- Undo replacement of comma with a temporary char in comma + whitespace.
local function unescape_comma_whitespace(run)
return export.unescape_comma_whitespace(run, tempcomma)
end
--[==[
Split text on a Lua pattern, but not on certain sequences involving characters in that pattern (e.g.
comma+whitespace). `splitchar` is the pattern to split on; `preserve_splitchar` indicates whether to preserve the
delimiter between split segments. `escape_fun` is called beforehand on the text and should return two values: the
escaped run and whether unescaping is needed. If the call to `escape_fun` indicates that unescaping is needed,
`unescape_fun` will be called on each run of text after splitting on `splitchar`. The return value of this a list
of runs, interspersed with delimiters if `preserve_splitchar` is specified.
]==]
local function split_escaping(text, splitchar, preserve_splitchar, escape_fun, unescape_fun)
if not rfind(text, splitchar) then
return {text}
end
--[==[
Like parse_balanced_segment_run() but accepts multiple sets of delimiters. For example,
{parse_multi_delimiter_balanced_segment_run("foo)], quux<glorp>", {{""}, {"(", ")"}, {"<", ">"}}) =
{"foo", ")]", ", quux", "<glorp>", ""}}.
Each element in the list of delimiter pairs is a string specifying an equivalence class of possible delimiter
characters. You can use this, for example, to allow either "[" or "&#91;" to be treated equivalently, with either
one closed by either "]" or "&#93;". To do this, first replace "&#91;" and "&#93;" with single Unicode
characters such as U+FFF0 and U+FFF1, and then specify a two-character string containing "[" and U+FFF0 as the opening
delimiter, and a two-character string containing "]" and U+FFF1 as the corresponding closing delimiter.
If `no_error_on_unmatched` is given and an error is found during parsing, a string is returned containing the error
message instead of throwing an error.
]==]
local function parse_multi_delimiter_balanced_segment_run(segment_run, delimiter_pairs, no_error_on_unmatched)
local escaped_delimiter_pairs = {}
local open_to_close_map = {}
local open_close_items = {}
local open_items = {}
for _, open_close in ipairs(delimiter_pairs) do
local open, close = unpack(open_close)
open = rsub(open, "(%%%%-])", "%%%1")
close = rsub(close, "(%%%%-])", "%%%1")
table.insert(open_close_items, open)
table.insert(open_close_items, close)
table.insert(open_items, open)
open = ""
close = ""
open_to_close_map = close
table.insert(escaped_delimiter_pairs, {open, close})
end
local open_close_pattern = "()"
local open_pattern = "()"
local break_on_open_close = capturing_split(segment_run, open_close_pattern)
local text_and_specs = {}
local level = 0
local seg_group = {}
local open_at_level_zero
for i, seg in ipairs(break_on_open_close) do
if i % 2 == 0 then
table.insert(seg_group, seg)
if level == 0 then
if not rfind(seg, open_pattern) then
local errmsg = "Unmatched close sign " .. seg .. ": '" .. segment_run .. "'"
if no_error_on_unmatched then
return errmsg
else
error(errmsg)
end
end
assert(open_at_level_zero == nil)
for _, open_close in ipairs(escaped_delimiter_pairs) do
local open, close = unpack(open_close)
if rfind(seg, open) then
open_at_level_zero = open
break
end
end
if open_at_level_zero == nil then
error(("Internal error: Segment %s didn't match any open regex"):format(seg))
end
level = level + 1
elseif rfind(seg, open_at_level_zero) then
level = level + 1
elseif rfind(seg, open_to_close_map) then
level = level - 1
assert(level >= 0)
if level == 0 then
table.insert(text_and_specs, table.concat(seg_group))
seg_group = {}
open_at_level_zero = nil
end
end
elseif level > 0 then
table.insert(seg_group, seg)
else
table.insert(text_and_specs, seg)
end
end
if level > 0 then
local errmsg = "Unmatched open sign " .. open_at_level_zero .. ": '" .. segment_run .. "'"
if no_error_on_unmatched then
return errmsg
else
error(errmsg)
end
end
return text_and_specs
end
-- If there are square or angle brackets, we don't want to split on delimiters inside of them. To effect this, we
-- use parse_multi_delimiter_balanced_segment_run() to parse balanced brackets, then do delimiter splitting on the
-- non-bracketed portions of text using split_alternating_runs_escaping(), and concatenate back to a list of
-- strings. When calling parse_multi_delimiter_balanced_segment_run(), we make sure not to throw an error on
-- unbalanced brackets; in that case, we fall through to the code below that handles the case without brackets.
if text:find("") then
local runs = parse_multi_delimiter_balanced_segment_run(text, {{""}, {"<", ">"}},
"no error on unmatched")
if type(runs) ~= "string" then
local split_runs = export.split_alternating_runs_escaping(runs, splitchar, preserve_splitchar, escape_fun,
unescape_fun)
for i = 1, #split_runs, (preserve_splitchar and 2 or 1) do
split_runs = table.concat(split_runs)
end
return split_runs
end
end
-- First escape sequences we don't want to count for splitting.
local need_unescape
text, need_unescape = escape_fun(text)
local parts =
preserve_splitchar and capturing_split(text, "(" .. splitchar .. ")") or
rsplit(text, splitchar)
if need_unescape then
for i = 1, #parts, (preserve_splitchar and 2 or 1) do
parts = unescape_fun(parts)
end
end
return parts
end
return split_escaping(text, ",", false, escape_comma_whitespace, unescape_comma_whitespace)
end
return export