-- Faster reimplementation of mw.text.nowiki, which also has some minor changes
-- to match the PHP equivalent used by the native parser.
local table_list_to_set_module = "Module:table/listToSet"
local byte = string.byte
local char = string.char
local find = string.find
local format = string.format
local gsub = string.gsub
local lower = string.lower
local sub = string.sub
-- Lazy loader: the first call requires the real implementation, rebinds the
-- `list_to_set` upvalue to it so later calls skip this stub, and forwards the
-- arguments.
local function list_to_set(...)
	local impl = require(table_list_to_set_module)
	list_to_set = impl
	return impl(...)
end
-- Formats a byte value as a decimal numeric character reference, e.g.
-- 0x20 -> "&#32;". The first argument is ignored; it is only there so the
-- function can be handed to list_to_set as a value generator.
local function escape_byte(_, code)
	local reference = format("&#%d;", code)
	return reference
end
-- Characters that are escaped wherever they occur in the input.
-- Presumably list_to_set maps each listed character to the value returned by
-- the generator function (here its decimal character reference); confirm
-- against Module:table/listToSet.
local absolute
-- Builds `absolute` on first use, then sets itself to nil so the
-- `absolute or get_absolute()` idiom never calls it again.
local function get_absolute()
	-- "[" and "]" must be listed explicitly. An empty-string entry would make
	-- byte() return no value and format() raise.
	absolute, get_absolute = list_to_set({"\"", "&", "'", ";", "<", "=", ">", "[", "]", "{", "|", "}"}, function(_, ch)
		return format("&#%d;", byte(ch))
	end), nil
	return absolute
end
-- Set built from the byte values below via list_to_set, with escape_byte as
-- the value generator.
local after_newline
-- Builds `after_newline` on first use, then removes itself.
local function get_after_newline()
	local bytes = {
		0x9, -- tab
		0xA, -- line feed
		0xD, -- carriage return
		0x20, -- space
		0x21, -- !
		0x23, -- #
		0x2A, -- *
		0x3A, -- :
	}
	after_newline = list_to_set(bytes, escape_byte)
	get_after_newline = nil
	return after_newline
end
-- `scheme_chars`: byte values of letters (both cases) occurring in a listed scheme.
-- `unused_scheme_chars`: byte values of letters that occur in none, plus "_".
-- `uri_schemes`: set of the scheme names themselves (built by list_to_set).
local scheme_chars, unused_scheme_chars, uri_schemes
-- Builds all three tables on first use, then removes itself. Returns
-- `scheme_chars` so it can be used as `scheme_chars or get_scheme_chars()`.
local function get_scheme_chars()
	local schemes = {"bitcoin", "geo", "magnet", "mailto", "matrix", "news", "sip", "sips", "sms", "tel", "urn", "xmpp"}
	uri_schemes, get_scheme_chars = list_to_set(schemes), nil
	-- One string containing every letter used by any scheme, for plain find().
	local scheme_letters = table.concat(schemes)
	-- Add letters used in a listed scheme to `scheme_chars`, and those that
	-- aren't to `unused_scheme_chars`. Keys are byte values.
	scheme_chars, unused_scheme_chars = {}, {}
	for i = 0x61, 0x7A do
		local chars = find(scheme_letters, char(i), nil, true) and scheme_chars or unused_scheme_chars
		chars[i] = true -- lowercase
		chars[i - 0x20] = true -- uppercase
	end
	-- NOTE(review): digits are not classified in either table, so a digit
	-- directly before a scheme name does not count as an unused scheme
	-- character; confirm this is intended.
	unused_scheme_chars[0x5F] = true -- _
	return scheme_chars
end
-- Set built from the whitespace byte values below via list_to_set, with
-- escape_byte as the value generator.
local after_magic_link
-- Builds `after_magic_link` on first use, then removes itself.
local function get_after_magic_link()
	local whitespace = {
		0x9, -- tab
		0xA, -- line feed
		0xC, -- form feed
		0xD, -- carriage return
		0x20, -- space
	}
	after_magic_link = list_to_set(whitespace, escape_byte)
	get_after_magic_link = nil
	return after_magic_link
end
-- Temporary variables. No risk of collisions across stack levels, since there's
-- no way for nowiki() to be called recursively.
-- `_str` is the string being processed. `esc_next` / `esc_next2` hold the
-- replacements queued for the next one / two matched characters: a string to
-- substitute, `false` to leave that character alone, or nil for "nothing queued".
local _str, esc, esc_next, esc_next2

-- gsub callback. `loc` is the position of the matched character in `_str` and
-- `ch` is the character itself. Returns the replacement string, or nothing to
-- keep the character unchanged.
local function nowiki(loc, ch)
	-- A previous call queued a decision for this character: consume it and
	-- shift the second queued slot forward.
	if esc_next ~= nil then
		esc, esc_next, esc_next2 = esc_next, esc_next2, nil
		return esc or nil
	end
	esc = (absolute or get_absolute())[ch]
	if esc then
		return esc
	elseif ch == "\n" or ch == "\r" then
		-- Queue an escape for the character that starts the new line, if it
		-- is one of the characters listed in `after_newline`.
		esc_next = (after_newline or get_after_newline())[byte(_str, loc + 1)]
	elseif ch == "!" then
		if byte(_str, loc + 1) == 0x21 then -- !
			esc_next = "&#33;"
		end
	elseif ch == ":" then
		if sub(_str, loc + 1, loc + 2) == "//" then
			return "&#58;"
		end
		-- Walk backwards over characters that could belong to a scheme name.
		-- byte() returns nothing before the start of the string, which ends
		-- the loop with b == nil.
		local n, b = 0
		repeat
			n = n + 1
			b = byte(_str, loc - n)
		until not (scheme_chars or get_scheme_chars())[b]
		-- Abort on an unused scheme character, as it can't be a scheme.
		if unused_scheme_chars[b] then
			return
		-- Otherwise, check if the word before the colon matches a scheme.
		elseif uri_schemes[lower(sub(_str, loc - n + 1, loc - 1))] then
			return "&#58;"
		end
	elseif ch == "-" then
		-- "----" at the start of a line.
		local prev = byte(_str, loc - 1)
		if (prev == 0xA or prev == 0xD) and sub(_str, loc + 1, loc + 3) == "---" then
			return "&#45;"
		end
	elseif ch == "I" then
		-- "ISBN" followed by whitespace: escape the whitespace.
		local esc_sp = (after_magic_link or get_after_magic_link())[byte(_str, loc + 4)]
		if esc_sp and sub(_str, loc + 1, loc + 3) == "SBN" then
			esc_next = esc_sp
		end
	elseif ch == "P" then
		-- "PMID" followed by whitespace: escape the whitespace.
		local esc_sp = (after_magic_link or get_after_magic_link())[byte(_str, loc + 4)]
		if esc_sp and sub(_str, loc + 1, loc + 3) == "MID" then
			esc_next, esc_next2 = false, esc_sp -- to avoid escaping "I" in "PMID"
		end
	elseif ch == "R" then
		-- "RFC" followed by whitespace: escape the whitespace.
		local esc_sp = (after_magic_link or get_after_magic_link())[byte(_str, loc + 3)]
		if esc_sp and sub(_str, loc + 1, loc + 2) == "FC" then
			esc_next = esc_sp
		end
	elseif ch == "_" then
		if byte(_str, loc + 1) == 0x5F then -- _
			esc_next = "&#95;"
		end
	elseif ch == "~" then
		-- "~~~": leave the second tilde alone and escape the third.
		if sub(_str, loc + 1, loc + 2) == "~~" then
			esc_next, esc_next2 = false, "&#126;"
		end
	end
end
-- Set built from the byte values below via list_to_set, with escape_byte as
-- the value generator.
local first
-- Builds `first` on first use, then removes itself.
local function get_first()
	local bytes = {
		0x9, -- tab
		0xA, -- line feed
		0xD, -- carriage return
		0x20, -- space
		0x21, -- !
		0x23, -- #
		0x2A, -- *
		0x2B, -- +
		0x2D, -- -
		0x3A, -- :
		0x5F, -- _
		0x7E, -- ~
	}
	first = list_to_set(bytes, escape_byte)
	get_first = nil
	return first
end
-- Set built from the byte values below via list_to_set, with escape_byte as
-- the value generator.
local last
-- Builds `last` on first use, then removes itself.
local function get_last()
	local bytes = {
		0x9, -- tab
		0xA, -- line feed
		0xC, -- form feed
		0xD, -- carriage return
		0x20, -- space
		0x21, -- !
		0x3A, -- :
		0x5F, -- _
		0x7E, -- ~
	}
	last = list_to_set(bytes, escape_byte)
	get_last = nil
	return last
end
-- Module entry point: returns `str` with wikitext-significant characters
-- replaced by numeric character references.
-- The empty string is returned unchanged.
return function(str)
	if #str == 0 then
		return str
	end
	-- Reset the callback's shared state for this call.
	_str, esc, esc_next, esc_next2 = str, nil, nil, nil
	-- The class must contain every character the callback inspects, plus
	-- every character it may queue an escape for (whitespace, "#", "*"),
	-- because a queued escape is applied to the next *match*.
	str = gsub(str, "()([\t\n\f\r !\"#&'*%-:;<=>IPR%[%]_{|}~])", nowiki)
	-- The first and last characters of the result get extra treatment. If
	-- either was already escaped above, it is now "&" / ";", which is in
	-- neither set.
	local b = byte(str, 1)
	local esc_first = (first or get_first())[b]
	if #str == 1 then
		return esc_first or (last or get_last())[b] or str
	end
	local esc_last = (last or get_last())[byte(str, -1)]
	if esc_first then
		if esc_last then
			return esc_first .. sub(str, 2, -2) .. esc_last
		end
		return esc_first .. sub(str, 2)
	elseif esc_last then
		return sub(str, 1, -2) .. esc_last
	end
	return str
end