-- Module:User:Gyfo/parse utilities
--
-- This module sandbox lacks a documentation subpage. Please create it.
-- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
local export = {}

local rfind = mw.ustring.find
local rsplit = mw.text.split
local u = mw.ustring.char
local rsubn = mw.ustring.gsub

-- Thin wrapper around rsubn() that yields only the substituted string; the
-- replacement count that rsubn() returns as a second value is dropped.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to exactly one return value.
	return (rsubn(term, foo, bar))
end

--[==[
Reimplementation of mw.ustring.split() that includes any capturing groups in the splitting pattern. This works like
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one
character at a time; Python returns the whole remainder of the string).

`str` is the string to split and `pattern` is a Lua (ustring) pattern. The return value is a list in which the
odd-numbered elements are the pieces of text between matches and the elements in between are the captures of
`pattern` for each match. If `pattern` contains no capturing groups, only the pieces of text are returned.
]==]
function export.capturing_split(str, pattern)
	-- This module does not define export.match()/export.sub(), so use the Unicode-aware library functions directly.
	local umatch = mw.ustring.match
	local usub = mw.ustring.sub
	-- All positions below are code-point positions, so the end-of-string checks must use the code-point length
	-- rather than the byte length. mw.ustring.len() returns nil for invalid UTF-8; fall back to the byte length then.
	local str_len = mw.ustring.len(str) or #str
	local ret = {}
	-- (.-) corresponds to (.*?) in Python or Perl; () captures the
	-- current position after matching.
	pattern = "(.-)" .. pattern .. "()"
	local start = 1
	while true do
		-- Did we reach the end of the string?
		if start > str_len then
			table.insert(ret, "")
			return ret
		end
		-- match() returns all captures as multiple return values;
		-- we need to insert into a table to get them all.
		local captures = {umatch(str, pattern, start)}
		-- If no match, add the remainder of the string.
		if #captures == 0 then
			table.insert(ret, usub(str, start))
			return ret
		end
		-- The last capture is the position just past the match.
		local newstart = table.remove(captures)
		-- Special case: If we don't advance by any characters, then advance
		-- by one character; this avoids an infinite loop, and makes splitting
		-- by an empty string work the way mw.ustring.split() does. If we
		-- reach the end of the string this way, return immediately, so we
		-- don't get a final empty string.
		if newstart == start then
			table.insert(ret, usub(str, start, start))
			-- Discard the (empty) leading-text capture.
			table.remove(captures, 1)
			start = start + 1
			if start > str_len then
				return ret
			end
		else
			-- The first capture is the text preceding the separator.
			table.insert(ret, table.remove(captures, 1))
			start = newstart
		end
		-- Insert any captures from the splitting pattern.
		for _, x in ipairs(captures) do
			table.insert(ret, x)
		end
	end
end

--[=[
In order to understand the following parsing code, you need to understand how inflected text specs work. They are
intended to work with inflected text where individual words to be inflected may be followed by inflection specs in
angle brackets. The format of the text inside of the angle brackets is up to the individual language and part-of-speech
specific implementation. A real-world example is as follows: "[[меди́чна]]<+> [[сестра́]]<*,*#.pr>". This is the
inflection of a multiword expression "меди́чна сестра́", which means "nurse" in Ukrainian (literally "medical sister"),
consisting of two words: the adjective меди́чна ("medical" in the feminine singular) and the noun сестра́ ("sister"). The
specs in angle brackets follow each word to be inflected; for example, <+> means that the preceding word should be
declined as an adjective.

The code below works in terms of balanced expressions, which are bounded by delimiters such as < > or [ ]. The
intention is to allow separators such as spaces to be embedded inside of delimiters; such embedded separators will not
be parsed as separators. For example, Ukrainian noun specs allow footnotes in brackets to be inserted inside of angle
brackets; something like "меди́чна<+> сестра́<pr.[this is a footnote]>" is legal, as is
"[[меди́чна]]<+> [[сестра́]]<pr.[this is an <i>italicized footnote</i>]>", and the parsing code should not be
confused by the embedded brackets, spaces or angle brackets.

The parsing is done by two functions, which work in close concert: parse_balanced_segment_run() and
split_alternating_runs(). To illustrate, consider the following:

parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") =
  {"foo", "<M.proper noun>", " bar", "<F>", ""}

then

split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") =
  {{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}

Here, we start out with a typical inflected text spec "foo<M.proper noun> bar<F>", call parse_balanced_segment_run() on
it, and call split_alternating_runs() on the result. The output of parse_balanced_segment_run() is a list where
even-numbered segments are bounded by the bracket-like characters passed into the function, and odd-numbered segments
consist of the surrounding text. split_alternating_runs() is called on this, and splits *only* the odd-numbered
segments, grouping all segments between the specified character. Note that the inner lists output by
split_alternating_runs() are themselves in the same format as the output of parse_balanced_segment_run(), with
bracket-bounded text in the even-numbered segments. Hence, such lists can be passed again to split_alternating_runs().
]=]


--[==[
Parse a string containing matched instances of parens, brackets or the like. Return a list of strings, alternating
between textual runs not containing the open/close characters and runs beginning and ending with the open/close
characters. For example,

{parse_balanced_segment_run("foo(x(1)), bar(2)", "(", ")") = {"foo", "(x(1))", ", bar", "(2)", ""}}

`open` and `close` are spliced directly after `%b` in a Lua pattern, which takes exactly two characters, so each
should be a single character.
]==]
function export.parse_balanced_segment_run(segment_run, open, close)
	-- capturing_split() is a reimplementation of mw.ustring.split() that includes any capturing groups in the
	-- splitting pattern. It works like Python's re.split(), except that it behaves like Lua when the splitting
	-- pattern is empty (i.e. it advances one character at a time; Python returns the whole remainder of the string).
	-- It is a member of `export`, not a global, so it must be called through the module table.
	return export.capturing_split(segment_run, "(%b" .. open .. close .. ")")
end


--[==[
Split a list of alternating textual runs of the format returned by `parse_balanced_segment_run` on `splitchar`. This
only splits the odd-numbered textual runs (the portions between the balanced open/close characters).  The return value
is a list of lists, where each list contains an odd number of elements, where the even-numbered elements of the sublists
are the original balanced textual run portions. For example, if we do

{parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") =
  {"foo", "<M.proper noun>", " bar", "<F>", ""}}

then

{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") =
  {{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}}

Note that we did not touch the text "<M.proper noun>" even though it contains a space in it, because it is an
even-numbered element of the input list. This is intentional and allows for embedded separators inside of
brackets/parens/etc. Note also that the inner lists in the return value are of the same form as the input list (i.e.
they consist of alternating textual runs where the even-numbered segments are balanced runs), and can in turn be passed
to split_alternating_runs().

If `preserve_splitchar` is passed in, the split character is included in the output, as follows:

{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ", true) =
  {{"foo", "<M.proper noun>", ""}, {" "}, {"bar", "<F>", ""}}}

Consider what happens if the original string has multiple spaces between brackets, and multiple sets of brackets
without spaces between them.

{parse_balanced_segment_run("foo[dated][low colloquial] baz-bat quux xyzzy[archaic]", "[", "]") =
  {"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}}

then

{split_alternating_runs({"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}, "[ %-]") =
  {{"foo", "[dated]", "", "[low colloquial]", ""}, {"baz"}, {"bat"}, {"quux"}, {"xyzzy", "[archaic]", ""}}}

If `preserve_splitchar` is passed in, the split character is included in the output,
as follows:

{split_alternating_runs({"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}, "[ %-]", true) =
  {{"foo", "[dated]", "", "[low colloquial]", ""}, {" "}, {"baz"}, {"-"}, {"bat"}, {" "}, {"quux"}, {" "}, {"xyzzy", "[archaic]", ""}}}

As can be seen, the even-numbered elements in the outer list are one-element lists consisting of the separator text.
]==]
function export.split_alternating_runs(segment_runs, splitchar, preserve_splitchar)
	local grouped_runs = {}
	local run = {}
	for i, seg in ipairs(segment_runs) do
		if i % 2 == 0 then
			-- Balanced (delimited) segment: never split, just append to the current group.
			table.insert(run, seg)
		else
			-- Raw-text segment: split it. With `preserve_splitchar`, the separator is wrapped in a capture so that
			-- capturing_split() returns it as an element between the pieces.
			local parts =
				preserve_splitchar and export.capturing_split(seg, "(" .. splitchar .. ")") or
				rsplit(seg, splitchar)
			-- The first piece continues the current group ...
			table.insert(run, parts[1])
			-- ... and every further piece closes the current group and starts a new one.
			for j = 2, #parts do
				table.insert(grouped_runs, run)
				run = {parts[j]}
			end
		end
	end
	if #run > 0 then
		table.insert(grouped_runs, run)
	end
	return grouped_runs
end

--[==[
Like split_alternating_runs() but applies an arbitrary function `frob` to "raw-text" segments in the result (i.e.
not stuff within balanced delimiters such as footnotes and inflection specs, and not splitchars if present). `frob`
is a function of one argument (the string to frob) and should return one argument (the frobbed string).

`run`, `splitchar` and `preserve_splitchar` are passed through to split_alternating_runs(), and the return value has
the same shape as that function's return value.
]==]
function export.split_alternating_runs_and_frob_raw_text(run, splitchar, frob, preserve_splitchar)
	local split_runs = export.split_alternating_runs(run, splitchar, preserve_splitchar)
	for i, split_run in ipairs(split_runs) do
		-- When splitchars are preserved, the even-numbered groups are the one-element separator lists; leave them
		-- alone.
		if not preserve_splitchar or i % 2 == 1 then
			-- Within a group, only odd-numbered segments are raw text; even-numbered ones are balanced runs.
			for j = 1, #split_run, 2 do
				-- Store the result back into the group so the frobbed text actually reaches the caller.
				split_run[j] = frob(split_run[j])
			end
		end
	end
	return split_runs
end


--[==[
Split the non-modifier parts of an alternating run (after parse_balanced_segment_run() is called) on a Lua pattern,
but not on certain sequences involving characters in that pattern (e.g. comma+whitespace). `splitchar` is the pattern
to split on; `preserve_splitchar` indicates whether to preserve the delimiter and is the same as in
split_alternating_runs(). `escape_fun` is called beforehand on each run of raw text and should return two values:
the escaped run and whether unescaping is needed. If any call to `escape_fun` indicates that unescaping is needed,
`unescape_fun` will be called on each run of raw text after splitting on `splitchar`. The return value of this
function is as in split_alternating_runs().

Note that the raw-text (odd-numbered) elements of `run` are replaced in place by their escaped versions.
]==]
function export.split_alternating_runs_escaping(run, splitchar, preserve_splitchar, escape_fun, unescape_fun)
	-- First escape the sequences that must not be split, in the raw-text (odd-numbered) segments only.
	local need_unescape = false
	for i = 1, #run, 2 do
		local this_need_unescape
		-- Escape the individual segment and store it back into its slot of the list.
		run[i], this_need_unescape = escape_fun(run[i])
		need_unescape = need_unescape or this_need_unescape
	end

	if need_unescape then
		return export.split_alternating_runs_and_frob_raw_text(run, splitchar, unescape_fun, preserve_splitchar)
	else
		-- Nothing was escaped, so skip the unescaping pass.
		return export.split_alternating_runs(run, splitchar, preserve_splitchar)
	end
end


--[==[
Hide commas that must not act as separators by replacing them with the placeholder character `tempcomma` (U+FFF0 if
not given). Two cases are handled, in this order: a comma preceded by a backslash, and a comma followed by a
whitespace character. Returns two values: the resulting string, and a boolean that is true if anything was replaced.
]==]
function export.escape_comma_whitespace(run, tempcomma)
	if not tempcomma then
		tempcomma = u(0xFFF0)
	end

	-- Backslash + comma: keep the backslash, swap the comma.
	local had_backslash_comma = run:find("\\,") ~= nil
	if had_backslash_comma then
		-- Parentheses drop gsub()'s second return value (the replacement count).
		run = (run:gsub("\\,", "\\" .. tempcomma))
	end

	-- Comma + whitespace: swap the comma, keep the whitespace character.
	local had_comma_whitespace = run:find(",%s") ~= nil
	if had_comma_whitespace then
		run = (run:gsub(",(%s)", tempcomma .. "%1"))
	end

	return run, had_backslash_comma or had_comma_whitespace
end


--[==[
Reverse escape_comma_whitespace(): turn every occurrence of the placeholder character `tempcomma` (U+FFF0 if not
given) in `run` back into a comma and return the resulting string.
]==]
function export.unescape_comma_whitespace(run, tempcomma)
	-- Parentheses keep only the substituted string, dropping gsub()'s replacement count.
	return (run:gsub(tempcomma or u(0xFFF0), ","))
end


--[==[
Comma-splitting variant of split_alternating_runs_escaping(): split the raw-text parts of an alternating run (the
output of parse_balanced_segment_run()) on comma, but not on comma+whitespace. `tempcomma` is the placeholder
character used while splitting (U+FFF0 if not given); see `split_on_comma()` for more information.
]==]
function export.split_alternating_runs_on_comma(run, tempcomma)
	local placeholder = tempcomma or u(0xFFF0)

	return export.split_alternating_runs_escaping(
		run,
		",",
		false,
		-- Hide commas that must not split.
		function(seg)
			return export.escape_comma_whitespace(seg, placeholder)
		end,
		-- Restore the hidden commas after splitting.
		function(seg)
			return export.unescape_comma_whitespace(seg, placeholder)
		end
	)
end


--[==[
Like parse_balanced_segment_run() but accepts multiple sets of delimiters. For example,

{parse_multi_delimiter_balanced_segment_run("foo[bar(baz[bat])], quux<glorp>", {{"[", "]"}, {"(", ")"}, {"<", ">"}}) =
	{"foo", "[bar(baz[bat])]", ", quux", "<glorp>", ""}}.

Each element in the list of delimiter pairs is a string specifying an equivalence class of possible delimiter
characters. You can use this, for example, to allow either "[" or "&amp;#91;" to be treated equivalently, with either
one closed by either "]" or "&amp;#93;". To do this, first replace "&amp;#91;" and "&amp;#93;" with single Unicode
characters such as U+FFF0 and U+FFF1, and then specify a two-character string containing "[" and U+FFF0 as the opening
delimiter, and a two-character string containing "]" and U+FFF1 as the corresponding closing delimiter.

If `no_error_on_unmatched` is given and an unmatched delimiter is found during parsing, a string is returned
containing the error message instead of throwing an error.
]==]
local function parse_multi_delimiter_balanced_segment_run(segment_run, delimiter_pairs, no_error_on_unmatched)
	local escaped_delimiter_pairs = {}
	local open_to_close_map = {}
	local open_close_items = {}
	local open_items = {}
	for _, open_close in ipairs(delimiter_pairs) do
		local open, close = unpack(open_close)
		-- Escape the characters that are special inside a Lua character class.
		open = rsub(open, "([%[%]%%%-%^])", "%%%1")
		close = rsub(close, "([%[%]%%%-%^])", "%%%1")
		table.insert(open_close_items, open)
		table.insert(open_close_items, close)
		table.insert(open_items, open)
		-- Turn each equivalence class into a character-class pattern.
		open = "[" .. open .. "]"
		close = "[" .. close .. "]"
		open_to_close_map[open] = close
		table.insert(escaped_delimiter_pairs, {open, close})
	end
	-- Pattern matching (and capturing) any single delimiter, and one matching only opening delimiters.
	local open_close_pattern = "([" .. table.concat(open_close_items) .. "])"
	local open_pattern = "([" .. table.concat(open_items) .. "])"
	-- Odd-numbered elements are plain text, even-numbered elements are single delimiter characters.
	local break_on_open_close = export.capturing_split(segment_run, open_close_pattern)
	local text_and_specs = {}
	local level = 0
	local seg_group = {}
	-- Character-class pattern of the opening delimiter that started the current top-level balanced run.
	local open_at_level_zero

	for i, seg in ipairs(break_on_open_close) do
		if i % 2 == 0 then
			table.insert(seg_group, seg)
			if level == 0 then
				if not rfind(seg, open_pattern) then
					local errmsg = "Unmatched close sign " .. seg .. ": '" .. segment_run .. "'"
					if no_error_on_unmatched then
						return errmsg
					else
						error(errmsg)
					end
				end
				assert(open_at_level_zero == nil)
				for _, open_close in ipairs(escaped_delimiter_pairs) do
					local open = open_close[1]
					if rfind(seg, open) then
						open_at_level_zero = open
						break
					end
				end
				if open_at_level_zero == nil then
					error(("Internal error: Segment %s didn't match any open regex"):format(seg))
				end
				level = level + 1
			elseif rfind(seg, open_at_level_zero) then
				level = level + 1
			elseif rfind(seg, open_to_close_map[open_at_level_zero]) then
				level = level - 1
				assert(level >= 0)
				if level == 0 then
					-- The outermost delimiter was closed: emit the whole balanced run as one element.
					table.insert(text_and_specs, table.concat(seg_group))
					seg_group = {}
					open_at_level_zero = nil
				end
			end
			-- Delimiters of other kinds nested inside a balanced run are kept as ordinary text of that run.
		elseif level > 0 then
			table.insert(seg_group, seg)
		else
			table.insert(text_and_specs, seg)
		end
	end
	if level > 0 then
		local errmsg = "Unmatched open sign " .. open_at_level_zero .. ": '" .. segment_run .. "'"
		if no_error_on_unmatched then
			return errmsg
		else
			error(errmsg)
		end
	end
	return text_and_specs
end


--[==[
Split text on a Lua pattern, but not on certain sequences involving characters in that pattern (e.g.
comma+whitespace). `splitchar` is the pattern to split on; `preserve_splitchar` indicates whether to preserve the
delimiter between split segments. `escape_fun` is called beforehand on the text and should return two values: the
escaped run and whether unescaping is needed. If the call to `escape_fun` indicates that unescaping is needed,
`unescape_fun` will be called on each run of text after splitting on `splitchar`. The return value is a list of
runs, interspersed with delimiters if `preserve_splitchar` is specified.
]==]
local function split_escaping(text, splitchar, preserve_splitchar, escape_fun, unescape_fun)
	if not rfind(text, splitchar) then
		return {text}
	end

	-- If there are square or angle brackets, we don't want to split on delimiters inside of them. To effect this, we
	-- use parse_multi_delimiter_balanced_segment_run() to parse balanced brackets, then do delimiter splitting on the
	-- non-bracketed portions of text using split_alternating_runs_escaping(), and concatenate back to a list of
	-- strings. When calling parse_multi_delimiter_balanced_segment_run(), we make sure not to throw an error on
	-- unbalanced brackets; in that case, we fall through to the code below that handles the case without brackets.
	if text:find("[%[<]") then
		local runs = parse_multi_delimiter_balanced_segment_run(text, {{"[", "]"}, {"<", ">"}},
			"no error on unmatched")
		if type(runs) ~= "string" then
			local split_runs = export.split_alternating_runs_escaping(runs, splitchar, preserve_splitchar, escape_fun,
				unescape_fun)
			-- Flatten every group back into a string. With `preserve_splitchar`, the separator groups are
			-- one-element lists, so concatenating them yields the bare separator string.
			for i = 1, #split_runs do
				split_runs[i] = table.concat(split_runs[i])
			end
			return split_runs
		end
	end

	-- First escape sequences we don't want to count for splitting.
	local need_unescape
	text, need_unescape = escape_fun(text)

	local parts =
		preserve_splitchar and export.capturing_split(text, "(" .. splitchar .. ")") or
		rsplit(text, splitchar)
	if need_unescape then
		-- With `preserve_splitchar`, the even-numbered parts are the separators; skip them.
		for i = 1, #parts, (preserve_splitchar and 2 or 1) do
			parts[i] = unescape_fun(parts[i])
		end
	end
	return parts
end


--[==[
Split text on comma, but not on comma+whitespace. This is similar to `mw.text.split(text, ",")` but will not split
on commas directly followed by whitespace, to handle embedded commas in terms (which are almost always followed by
a space). Commas inside balanced square or angle brackets are not split on either. `tempcomma` is the Unicode character
to temporarily use when doing the splitting; normally U+FFF0, but you can specify a different character if you use
U+FFF0 for some internal purpose. Returns a list of strings.
]==]
function export.split_on_comma(text, tempcomma)
	-- Don't do anything if no comma. Note that split_escaping() has a similar check at the beginning, so if there's a
	-- comma we effectively do this check twice, but this is worth it to optimize for the common no-comma case.
	if not text:find(",") then
		return {text}
	end

	tempcomma = tempcomma or u(0xFFF0)

	-- Replace comma with a temporary char in comma + whitespace.
	local function escape_comma_whitespace(run)
		return export.escape_comma_whitespace(run, tempcomma)
	end

	-- Undo replacement of comma with a temporary char in comma + whitespace.
	local function unescape_comma_whitespace(run)
		return export.unescape_comma_whitespace(run, tempcomma)
	end

	return split_escaping(text, ",", false, escape_comma_whitespace, unescape_comma_whitespace)
end

return export
-- Module:User:Gyfo/parse utilities
--
-- Wikious
--
-- Boobota
--
-- Sagapedia