-- A documentação para este módulo pode ser criada na página Módulo:languages/doc
-- Table of public functions; it is returned at the end of the file.
local export = {}
-- Shorthand for building a string from Unicode code points.
local u = mw.ustring.char

-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
-- For every match of a formatting pattern, each capture after the first is stored in `subbedChars` and replaced
-- inside the match by the placeholder character U+E000 + (its index in `subbedChars`).
-- If `keepCarets` is set, carets (and backslashes escaping them) are protected as well.
-- Returns the modified text and the (updated) `subbedChars` table.
-- TODO: Handle arbitrary number of capture groups.
local function doTempSubstitutions(text, subbedChars, keepCarets)
	-- Cloning the table locally is much faster.
	local patterns = mw.clone(require("Module:languages/data").patterns)
	if keepCarets then
		table.insert(patterns, "((\\\\)%^)")
		table.insert(patterns, "((\\)%^)")
		table.insert(patterns, "((%^))")
	end
	-- Continue numbering after any placeholders that already exist.
	local i, pe = #subbedChars, require("Module:utilities").pattern_escape
	for _, pattern in ipairs(patterns) do
		for m1, m2, m3 in text:gmatch(pattern) do
			local m, m1New = {m1, m2, m3}, m1
			for j = 2, #m do
				-- Store the individual capture under its own index; the same index
				-- selects the placeholder, so undoTempSubstitutions can map it back.
				subbedChars[i+j-1] = m[j]
				m1New = m1New:gsub(pe(m[j]), u(0xE000+i+j-1), 1)
			end
			text = text:gsub(pe(m1), pe(m1New), 1)
			i = i + #m - 1
		end
	end
	return text, subbedChars
end
-- Reinsert any formatting that was temporarily substituted.
-- `subbedChars[i]` is the original string for the placeholder character U+E000 + i.
-- Returns the text with every placeholder replaced by its original string.
local function undoTempSubstitutions(text, subbedChars)
	local pe = require("Module:utilities").pattern_escape
	for i = 1, #subbedChars do
		-- Look up the entry for this placeholder; passing the whole table to `pe` was a bug.
		text = text:gsub(u(0xE000+i), pe(subbedChars[i]))
	end
	return text
end
-- Convert any HTML entities.
-- Module:utilities is only loaded when the text contains something shaped like an entity ("&...;");
-- otherwise the text is returned unchanged.
local function noEntities(text)
	-- "&" followed by one or more non-";" characters and a closing ";".
	if text:find("&[^;]+;") then
		return require("Module:utilities").get_entities(text)
	end
	return text
end
-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
local function checkNoEntities(text)
	local textNoEnc = noEntities(text)
	-- The list is only consulted when decoding changed something, and it must be
	-- looked up by the raw text (testing the table itself is always truthy).
	if textNoEnc ~= text and mw.loadData("Module:links/data").unsupported_titles[text] then
		return text
	else
		return textNoEnc
	end
end
-- Return `sc` when it is a usable script object (a table tagged "script object" whose code is not "None");
-- otherwise ask the language object `self` to pick the best script for `text`.
local function checkScript(text, self, sc)
	local usable = type(sc) == "table" and sc._type == "script object" and sc:getCode() ~= "None"
	if usable then
		return sc
	end
	return self:findBestScript(text)
end
-- Normalize `text` for script `sc`: first fix discouraged sequences, then apply the script's fixed NFD form.
local function normalize(text, sc)
	local fixed = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFD(fixed)
end
-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
local function escapeRiskyChars(text)
	-- Each match of a formatting pattern is replaced by its first capture with only the quote characters encoded.
	local function encodeQuotes(cap1)
		return mw.text.encode(cap1, "\"'")
	end
	for _, pattern in ipairs(mw.clone(require("Module:languages/data").patterns)) do
		text = text:gsub(pattern, encodeQuotes)
	end
	-- The charset goes inside a [set] of a ustring pattern, hence "%%" for a literal percent sign and
	-- "%]" for a literal closing bracket. Square brackets and the backslash are part of the risky set.
	-- NOTE(review): the bracket/backslash entries were reinstated between "@" and "_" — confirm against the reference module.
	return mw.text.encode(text, "#%%&+/:<=>?@[\\%]_{|}")
end
-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
-- `subbedChars` may be nil, in which case no second round of temporary substitutions is performed.
-- Returns the processed text and the (possibly extended) `subbedChars` table.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
	local pe = require("Module:utilities").pattern_escape
	-- An empty separator would split the text into single characters, so an explicit placeholder class is needed.
	-- NOTE(review): the class is assumed to be the BMP Private Use Area (U+E000-U+F8FF), matching the
	-- U+E000-based placeholders produced by doTempSubstitutions — confirm.
	local puaChars = "[" .. u(0xE000) .. "-" .. u(0xF8FF) .. "]"
	-- Loop-invariant lookups are hoisted out of the section loop.
	local doSubstitutions = require("Module:languages/doSubstitutions")
	local patterns = subbedChars and mw.clone(require("Module:languages/data").patterns) or nil
	local sections = mw.text.split(text, puaChars)
	for _, section in ipairs(sections) do
		local sub = doSubstitutions(section, self, sc, substitution_data, function_name)
		-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
		if sub and subbedChars then
			local noSub = false
			for _, pattern in ipairs(patterns) do
				if section:match(pattern) then
					noSub = true
					break
				end
			end
			if not noSub then
				sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets)
			end
		end
		if sub then
			text = text:gsub(pe(section), pe(sub), 1)
		end
	end
	-- Trim end (ignoring any final formatting characters): drop trailing whitespace, and if the text
	-- then ends in placeholders, also drop whitespace sitting directly before that final run.
	-- NOTE(review): this trimming rule is rebuilt from the comment above; verify edge cases against expected output.
	text = text:gsub("%s+$", "")
	if mw.ustring.find(text, puaChars .. "$") then
		text = mw.ustring.gsub(text, "%s+(" .. puaChars .. "+)$", "%1")
	end
	return text, subbedChars
end
-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
-- Escaped sequences are parked on two supplementary private-use characters while the unescaped
-- carets are processed, then turned back into a literal backslash / caret.
local function processCarets(text, pattern, repl)
	local parkedBackslash, parkedCaret = u(0xF0000), u(0xF0001)
	-- "\\^": an escaped backslash followed by a real caret.
	text = text:gsub("\\\\^", parkedBackslash .. "^")
	-- "\^": an escaped caret.
	text = text:gsub("\\^", parkedCaret)
	text = text:gsub(pattern or "%^", repl or "")
	text = text:gsub(parkedBackslash, "\\")
	return text:gsub(parkedCaret, "^")
end
-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
-- The text is left untouched when the script has capitalization, is not transliterated, or contains no caret.
local function removeCarets(text, sc)
	if sc:hasCapitalization() or not sc:isTransliterated() or not text:find("%^") then
		return text
	end
	return processCarets(text)
end
-- Method table for language objects; objects created by export.makeObject use it as their metatable.
local Language = {}
-- Returns the language code stored on the object.
function Language:getCode()
	local code = self._code
	return code
end
-- Returns the canonical name of the language.
-- NOTE(review): assumes the canonical name is the first positional field of the raw data table
-- (callers such as getCategoryName need a string, not the whole table) — confirm against the data modules.
function Language:getCanonicalName()
	return self._rawData[1]
end
-- Returns the form used to display the language's name; for plain languages this is the canonical name.
function Language:getDisplayForm()
	return self:getCanonicalName()
end
-- Loads the extra data, then delegates to Module:language-like to collect the other names.
-- `onlyOtherNames` is passed through unchanged.
function Language:getOtherNames(onlyOtherNames)
	self:loadInExtraData()
	local m_language_like = require("Module:language-like")
	return m_language_like.getOtherNames(self, onlyOtherNames)
end
-- Returns the `aliases` entry of the extra data, or a new empty table when there is none.
function Language:getAliases()
	self:loadInExtraData()
	local aliases = self._extraData.aliases
	if aliases then
		return aliases
	end
	return {}
end
-- Loads the extra data, then delegates to Module:language-like to collect the varieties.
-- `flatten` is passed through unchanged.
function Language:getVarieties(flatten)
	self:loadInExtraData()
	local m_language_like = require("Module:language-like")
	return m_language_like.getVarieties(self, flatten)
end
-- Returns the `type` field of the raw data, defaulting to "regular" when it is absent.
function Language:getType()
	local langType = self._rawData.type
	if not langType then
		return "regular"
	end
	return langType
end
-- Returns the list of Wikimedia language codes, computing and caching it on first use.
-- The raw `wikimedia_codes` field may be a table (used as is) or a comma-separated string (split);
-- when it is neither, the language's own code is used.
function Language:getWikimediaLanguageCodes()
	local cached = self._wikimediaLanguageCodes
	if cached then
		return cached
	end
	local raw = self._rawData.wikimedia_codes
	local codes
	if type(raw) == "table" then
		codes = raw
	elseif type(raw) == "string" then
		codes = mw.text.split(raw, "%s*,%s*")
	else
		codes = {self:getCode()}
	end
	self._wikimediaLanguageCodes = codes
	return codes
end
-- Returns the Wikimedia language objects for this language's Wikimedia codes (cached after the first call).
function Language:getWikimediaLanguages()
	if self._wikimediaLanguageObjects then
		return self._wikimediaLanguageObjects
	end
	local m_wikimedia_languages = require("Module:wikimedia languages")
	local objects = {}
	self._wikimediaLanguageObjects = objects
	for _, wlangcode in ipairs(self:getWikimediaLanguageCodes()) do
		table.insert(objects, m_wikimedia_languages.getByCode(wlangcode))
	end
	return objects
end
-- Returns the title of the Wikipedia article for the language.
-- Order of preference: the `wikipedia_article` data field, a previously cached value, the 'enwiki'
-- sitelink of the Wikidata item (when mw.wikibase is available), and finally the category name with
-- "Creole language" shortened to "Creole". Values from the last two sources are cached.
function Language:getWikipediaArticle()
	local fromData = self._rawData.wikipedia_article
	if fromData then
		return fromData
	end
	if self._wikipedia_article then
		return self._wikipedia_article
	end
	if self:getWikidataItem() and mw.wikibase then
		self._wikipedia_article = mw.wikibase.sitelink(self:getWikidataItem(), 'enwiki')
	end
	if not self._wikipedia_article then
		-- Only the string result of gsub is kept; the replacement count is discarded.
		local fallback = self:getCategoryName():gsub("Creole language", "Creole")
		self._wikipedia_article = fallback
	end
	return self._wikipedia_article
end
-- Returns a wikilink to the language's Wikipedia article, labelled with the canonical name.
-- NOTE(review): the link markup was rebuilt from the method name and the sibling getters ("w:" interwiki
-- prefix, article title, pipe, canonical name) — confirm the intended prefix for this wiki.
function Language:makeWikipediaLink()
	return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end
-- Returns the Wikidata item ID for the language. A numeric value is turned into a "Q…" string;
-- any other value (including nil) is returned unchanged.
-- NOTE(review): assumes the item is the second positional field of the raw data table — confirm against the data modules.
function Language:getWikidataItem()
	local item = self._rawData[2]
	if type(item) == "number" then
		return "Q" .. item
	else
		return item
	end
end
-- Returns the list of script codes for the language (cached after the first call).
-- The raw value may be a table (used as is) or a comma-separated string (split); otherwise {"None"} is used.
-- NOTE(review): assumes the scripts are the fourth positional field of the raw data table — confirm against the data modules.
function Language:getScriptCodes()
	if not self._scriptCodes then
		local scripts = self._rawData[4]
		if type(scripts) == "table" then
			self._scriptCodes = scripts
		elseif type(scripts) == "string" then
			self._scriptCodes = mw.text.split(scripts, "%s*,%s*")
		else
			self._scriptCodes = {"None"}
		end
	end
	return self._scriptCodes
end
-- Returns the script objects for the language (cached after the first call).
-- When the first script code is "All", the whole Module:scripts/data table is used instead of
-- building one object per code.
function Language:getScripts()
	if not self._scriptObjects then
		local m_scripts = require("Module:scripts")
		self._scriptObjects = {}
		local codes = self:getScriptCodes()
		-- Compare the first code, not the list itself (a table never equals a string).
		if codes[1] == "All" then
			self._scriptObjects = mw.loadData("Module:scripts/data")
		else
			for _, sc in ipairs(codes) do
				table.insert(self._scriptObjects, m_scripts.getByCode(sc))
			end
		end
	end
	return self._scriptObjects
end
-- Find the best script to use, based on the characters of a string. If forceDetect is set, run the detection algorithm even if there's only one possible script; in that case, if the text isn't in the script, the return value will be None.
function Language:findBestScript(text, forceDetect)
	if (not text) or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None")
	end
	if table.concat(self:getScriptCodes()) == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end
	local scripts = self:getScripts()
	-- Shortcut for languages with at most one script: no detection is needed, only a check that
	-- the text actually contains characters of that script.
	if not scripts[2] and not forceDetect then
		local only = scripts[1]
		if only and only:countCharacters(text) > 0 then
			return only
		else
			return require("Module:scripts").getByCode("None")
		end
	end
	return require("Module:languages/findBestScript")(export, self, text, scripts, forceDetect)
end
-- Returns the family object for the language (cached once found), or nil when no family code is present.
-- NOTE(review): assumes the family code is the third positional field of the raw data table — confirm against the data modules.
function Language:getFamily()
	if self._familyObject then
		return self._familyObject
	end
	local familyCode = self._rawData[3]
	if familyCode then
		self._familyObject = require("Module:families").getByCode(familyCode)
	end
	return self._familyObject
end
-- Returns the list of ancestor codes from the raw `ancestors` field: a table is used as is,
-- a comma-separated string is split, and anything else yields nil. A non-nil result is cached.
function Language:getAncestorCodes()
	if self._ancestorCodes then
		return self._ancestorCodes
	end
	local raw = self._rawData.ancestors
	local codes = nil
	if type(raw) == "table" then
		codes = raw
	elseif type(raw) == "string" then
		codes = mw.text.split(raw, "%s*,%s*")
	end
	self._ancestorCodes = codes
	return codes
end
-- Returns the list of direct ancestor objects (cached after the first call).
-- With explicit `ancestors` data, each code is resolved as a language or, failing that, as an
-- etymology language. Without it, the list holds the proto-language found by walking up the
-- family tree, if any.
function Language:getAncestors()
	if self._ancestorObjects then
		return self._ancestorObjects
	end
	local objects = {}
	self._ancestorObjects = objects
	if self._rawData.ancestors then
		for _, ancestor in ipairs(self:getAncestorCodes()) do
			table.insert(objects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
		end
		return objects
	end
	local fam = self:getFamily()
	local protoLang = fam and fam:getProtoLanguage() or nil
	-- For the case where the current language is the proto-language
	-- of its family, we need to step up a level higher right from the start.
	if protoLang and protoLang:getCode() == self:getCode() then
		fam = fam:getFamily()
		protoLang = fam and fam:getProtoLanguage() or nil
	end
	-- Keep climbing until a proto-language turns up, the families run out, or "qfa-not" is reached.
	while not protoLang and fam and fam:getCode() ~= "qfa-not" do
		fam = fam:getFamily()
		protoLang = fam and fam:getProtoLanguage() or nil
	end
	table.insert(objects, protoLang)
	return objects
end
-- Depth-first walk over the ancestors of `node`. `func` is called on each ancestor before that
-- ancestor's own ancestors are visited; the first truthy value produced anywhere is returned.
-- Returns nothing when no call yields a truthy value.
local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local found = func(ancestor)
			if not found then
				found = iterateOverAncestorTree(ancestor, func)
			end
			if found then
				return found
			end
		end
	end
end
-- Returns the chain of single ancestors of the language, oldest first (cached after the first call).
-- The walk stops as soon as a language has no ancestor or more than one.
function Language:getAncestorChain()
	if not self._ancestorChain then
		self._ancestorChain = {}
		local step = self
		while true do
			local ancestors = step:getAncestors()
			-- Continue with the sole ancestor itself; the list has no getAncestors method.
			step = #ancestors == 1 and ancestors[1] or nil
			if not step then break end
			-- Inserting at the front keeps the most distant ancestor first.
			table.insert(self._ancestorChain, 1, step)
		end
	end
	return self._ancestorChain
end
-- Reports whether `otherlang` occurs anywhere in the ancestor tree of this language (compared by code).
-- Always returns a boolean.
function Language:hasAncestor(otherlang)
	local found = iterateOverAncestorTree(self, function(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end)
	return found or false
end
-- Returns the name used for the language's category: the canonical name, with " language" appended
-- unless the name already ends that way. The first letter is capitalized unless `nocap` is set.
function Language:getCategoryName(nocap)
	local name = self:getCanonicalName()
	-- If the name already has "language" in it, don't add it.
	local alreadySuffixed = name:find("anguage$")
	if not alreadySuffixed then
		name = name .. " language"
	end
	if nocap then
		return name
	end
	name = mw.getContentLanguage():ucfirst(name)
	return name
end
-- Returns a wikilink to the language's category, labelled with the display form.
-- NOTE(review): the link markup was rebuilt from the method name and the sibling getters
-- (leading colon so the page is linked rather than categorized) — confirm.
function Language:makeCategoryLink()
	return "[[:Category:" .. self:getCategoryName() .. "|" .. self:getDisplayForm() .. "]]"
end
-- Returns the `standardChars` field of the raw data (nil when absent).
function Language:getStandardCharacters()
	local standardChars = self._rawData.standardChars
	return standardChars
end
-- Make the entry name (i.e. the correct page name).
-- `text` is the term as written; `sc` is an optional script object (detected when missing or invalid).
-- Empty or missing text is returned unchanged. Interwiki/namespace links are returned after basic
-- clean-up only, and unsupported titles are mapped to their "Unsupported titles/…" page.
function Language:makeEntryName(text, sc)
	if not (text and text ~= "") then return text end
	-- Remove bold, soft hyphens, strip markers and HTML tags.
	text = text
		:gsub("('*)'''(.-'*)'''", "%1%2")
		-- U+00AD SOFT HYPHEN, spelled as UTF-8 bytes so it stays visible in the source.
		:gsub("\194\173", "")
	text = mw.text.unstrip(text)
		:gsub("<[^<>]+>", "")
	-- Don't remove italics, as that would allow people to use it instead of {{m}} etc.
	local textWithEnc = text
	-- "PATH" decoding leaves "+" alone; the default ("QUERY") would turn it into a space.
	text = mw.uri.decode(text, "PATH")
	text = noEntities(text)
	-- Check if the text is an interwiki link.
	if text:find(":") then
		-- If this is an interwiki link, a link to another namespace or there's an initial colon, return what we have.
		local check, m_utildata = text:match("^:*([^:]*):"), mw.loadData("Module:utilities/data")
		if m_utildata.interwikis[check] or m_utildata.namespaces[check] then
			return text
		else
			-- Convert any escaped colons.
			text = text:gsub("\\:", ":")
			textWithEnc = textWithEnc:gsub("\\:", ":")
		end
	end
	-- Check if the text is an unsupported title (with and without converting percent encoding/HTML entities).
	local unsupportedTitles = mw.loadData("Module:links/data").unsupported_titles
	local unsupported = unsupportedTitles[textWithEnc] or unsupportedTitles[text]
	if unsupported then
		return "Unsupported titles/" .. unsupported
	end
	sc = checkScript(text, self, sc)
	text = normalize(text, sc)
	text = iterateSectionSubstitutions(text, nil, nil, self, sc, self._rawData.entry_name, "makeEntryName")
	text = removeCarets(text, sc)
	-- NOTE(review): this pattern looks malformed — the character sets for the optional leading and
	-- trailing punctuation (and for the required inner character) appear to be missing. It is left
	-- untouched because the intended sets cannot be determined from this file; restore them from the reference module.
	text = mw.ustring.match(text, "^?(.-.-)%s*?$") or text
	return escapeRiskyChars(text)
end
-- Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.
-- The method is the module named by the `generate_forms` data field; it receives the text, the
-- language code and the code of the (checked) script.
function Language:generateForms(text, sc)
	local moduleName = self._rawData.generate_forms
	if not moduleName then
		return {text}
	end
	sc = checkScript(text, self, sc)
	return require("Module:" .. moduleName).generateForms(text, self:getCode(), sc:getCode())
end
-- Make the sort key for `text`. `sc` is an optional script object (detected when missing or invalid).
-- Empty or missing text is returned unchanged.
function Language:makeSortKey(text, sc)
	if (not text) or text == "" then return text end
	-- Remove soft hyphens, strip markers and HTML tags.
	-- (U+00AD SOFT HYPHEN is spelled as UTF-8 bytes so it stays visible in the source.)
	text = text:gsub("\194\173", "")
	text = mw.text.unstrip(text)
		:gsub("<[^<>]+>", "")
	text = checkNoEntities(text)
	-- Remove initial hyphens and *.
	-- NOTE(review): only ASCII "-" and "*" are covered; add other hyphen-like characters if the data needs them.
	text = mw.ustring.gsub(text, "^[%-*]+(.)", "%1")
	sc = checkScript(text, self, sc)
	text = normalize(text, sc)
	text = removeCarets(text, sc)
	-- For languages with dotted dotless i, ensure that "İ" is sorted as "i", and "I" is sorted as "ı".
	if self._rawData.dotted_dotless_i then
		text = text
			:gsub(mw.ustring.toNFD("İ"), "i")
			:gsub("I", "ı")
		text = sc:toFixedNFD(text)
	end
	-- Convert to lowercase, make the sortkey, then convert to uppercase. Where the language has dotted dotless i, it is usually not necessary to convert "i" to "İ" and "ı" to "I" first, because "I" will always be interpreted as conventional "I" (not dotless "İ") by any sorting algorithms, which will have been taken into account by the sortkey substitutions themselves. However, if no sortkey substitutions have been specified, then conversion is necessary so as to prevent "i" and "ı" both being sorted as "I".
	text = mw.ustring.lower(text)
	text = iterateSectionSubstitutions(text, nil, nil, self, sc, self._rawData.sort_key, "makeSortKey")
	if self._rawData.dotted_dotless_i and not self._rawData.sort_key then
		text = text
			:gsub("ı", "I")
			:gsub("i", "İ")
		text = sc:toFixedNFC(text)
	end
	text = mw.ustring.upper(text)
	-- Remove parentheses, as long as they are either preceded or followed by something.
	text = text
		:gsub("(.)[()]+", "%1")
		:gsub("[()]+(.)", "%1")
	return escapeRiskyChars(text)
end
-- Create the form used as a basis for display text and transliteration.
-- Formatting is parked on placeholders, entities are resolved, the script is checked, and the
-- language's display_text substitutions are applied. Unless `keepPrefixes` is set, leading
-- interwiki prefixes are stripped. Returns the text and the table of parked formatting.
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
	local subbedChars = {}
	text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)
	text = checkNoEntities(text)
	sc = checkScript(text, self, sc)
	text = normalize(text, sc)
	text, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")
	text = removeCarets(text, sc)
	-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
	if text:find(":") and not keepPrefixes then
		-- Park escaped backslashes and escaped colons so they are not mistaken for prefix separators.
		text = text
			:gsub("\\\\:", u(0xF0000) .. ":")
			:gsub("\\:", u(0xF0001))
		local prefix, oldText, m_utildata = text:match("^([^:]*):"), text, mw.loadData("Module:utilities/data")
		-- The prefix must be looked up in the interwiki table; testing the table itself never terminates.
		while prefix and (m_utildata.interwikis[prefix] or prefix == "") do
			oldText = text
			-- Drop the prefix and its colon by length, so that pattern-magic characters in the prefix cannot interfere.
			text = text:sub(#prefix + 2)
			prefix = text:match("^([^:]*):")
		end
		-- If the whole text has been removed (i.e. the text ends with a colon), then the final prefix is not actually a prefix.
		if text == "" then text = oldText end
		text = text
			:gsub(u(0xF0000), "\\")
			:gsub(u(0xF0001), ":")
	end
	return text, subbedChars
end
-- Make the display text (i.e. what is displayed on the page).
-- Empty or missing text is returned unchanged; otherwise the processed text has its risky characters
-- escaped before the parked formatting is put back.
function Language:makeDisplayText(text, sc, keepPrefixes)
	if (not text) or text == "" then
		return text
	end
	local display, subbedChars = processDisplayText(text, self, sc, nil, keepPrefixes)
	display = escapeRiskyChars(display)
	return undoTempSubstitutions(display, subbedChars)
end
-- Transliterate `text`. `sc` is an optional script object; `module_override` optionally replaces the
-- language's own transliteration data. Returns nil when no transliteration is available or the
-- result is incomplete; empty text and "-" are returned unchanged.
function Language:transliterate(text, sc, module_override)
	-- If there is no text, or the language doesn't have transliteration data and there's no override, return nil.
	if not (self._rawData.translit or module_override) then
		return nil
	elseif (not text) or text == "" or text == "-" then
		return text
	end
	-- If the script is not transliteratable (and no override is given), return nil.
	sc = checkScript(text, self, sc)
	if not (sc:isTransliterated() or module_override) then
		return nil
	end
	-- Remove any strip markers.
	text = mw.text.unstrip(text)
	-- Get the display text with the keepCarets flag set.
	local subbedChars
	text, subbedChars = processDisplayText(text, self, sc, true)
	-- Transliterate (using the module override if applicable).
	text, subbedChars = iterateSectionSubstitutions(text, subbedChars, true, self, sc, module_override or self._rawData.translit, "tr")
	-- Incomplete transliterations return nil.
	-- FIXME: Handle transliterations with characters that are in both Latn/Latinx and a transliteratable script (e.g. U+A700-U+A707 are in Latinx and Hani).
	if (not text) or sc:countCharacters(text) > 0 then
		return nil
	end
	text = escapeRiskyChars(text)
	text = undoTempSubstitutions(text, subbedChars)
	-- If the script does not use capitalization, then capitalize any letters of the transliteration which are immediately preceded by a caret (and remove the caret).
	if text and not sc:hasCapitalization() and text:find("%^") then
		-- The capture is the single UTF-8 encoded character after the caret (lead byte plus continuation
		-- bytes); it may be empty, so a caret at the very end is simply removed.
		-- NOTE(review): the capture was rebuilt from the comment above; confirm it against the reference module,
		-- in particular for carets followed by reinserted formatting.
		text = processCarets(text, "%^([%z\1-\127\194-\244]?[\128-\191]*)", mw.ustring.upper)
	end
	-- Track module overrides.
	if module_override then
		require("Module:debug").track("module_override")
	end
	return text
end
-- Reports, as a strict boolean, whether the `override_translit` data field is set.
function Language:overrideManualTranslit()
	if self._rawData.override_translit then
		return true
	end
	return false
end
-- Reports, as a strict boolean, whether the `translit` data field is set.
function Language:hasTranslit()
	if self._rawData.translit then
		return true
	end
	return false
end
-- Reports, as a strict boolean, whether the `link_tr` data field is set.
function Language:link_tr()
	if self._rawData.link_tr then
		return true
	end
	return false
end
-- Serializes the language's main properties. Returns the plain table when `returnTable` is set,
-- otherwise the JSON string produced by Module:JSON.
function Language:toJSON(returnTable)
	local entryNamePatterns = nil
	local entryNameRemoveDiacritics = nil
	local entryName = self._rawData.entry_name
	if entryName then
		entryNameRemoveDiacritics = entryName.remove_diacritics
		if entryName.from then
			entryNamePatterns = {}
			for i, from in ipairs(entryName.from) do
				-- Pair each pattern with the replacement at the same position (empty when there is none).
				table.insert(entryNamePatterns, {from = from, to = entryName.to and entryName.to[i] or ""})
			end
		end
	end
	local ret = {
		ancestors = self:getAncestorCodes(),
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName("nocap"),
		code = self:getCode(),
		entryNamePatterns = entryNamePatterns,
		entryNameRemoveDiacritics = entryNameRemoveDiacritics,
		-- NOTE(review): assumes the family code is the third positional field of the raw data table — confirm.
		family = self._rawData[3],
		otherNames = self:getOtherNames(true),
		aliases = self:getAliases(),
		varieties = self:getVarieties(),
		scripts = self:getScriptCodes(),
		type = self:getType(),
		wikimediaLanguages = self:getWikimediaLanguageCodes(),
		wikidataItem = self:getWikidataItem(),
	}
	if returnTable then
		return ret
	end
	return require("Module:JSON").toJSON(ret)
end
-- Do NOT use these methods!
-- All uses should be pre-approved on the talk page!
-- Returns the raw data table wrapped by this object.
function Language:getRawData()
	local rawData = self._rawData
	return rawData
end
-- Returns the raw extra data table, loading it first if necessary.
function Language:getRawExtraData()
	self:loadInExtraData()
	local extraData = self._extraData
	return extraData
end
-- Lets objects whose metatable is Language find their methods in the Language table.
Language.__index = Language
-- Returns the name (without the "Module:" prefix) of the data module that holds `code`, or nil
-- when the code has none of the recognised shapes:
--   two lowercase letters                   -> "languages/data/2"
--   three lowercase letters                 -> "languages/data/3/<first letter>"
--   other lowercase-and-hyphen combinations -> "languages/data/exceptional"
function export.getDataModuleName(code)
	if code:find("^%l%l$") then
		return "languages/data/2"
	elseif code:find("^%l%l%l$") then
		return "languages/data/3/" .. code:sub(1, 1)
	-- NOTE(review): the character set (lowercase letters and hyphens) was reinstated — confirm.
	elseif code:find("^[%l%-]+$") then
		return "languages/data/exceptional"
	else
		return nil
	end
end
-- Returns the name of the extra-data module for `code` (the data module name plus "/extra"),
-- or nil when the code has no data module.
function export.getExtraDataModuleName(code)
	local dataModule = export.getDataModuleName(code)
	if dataModule then
		return dataModule .. "/extra"
	end
	return nil
end
-- Loads the data module that holds `code`; returns nil when the code maps to no module.
local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename) or nil
end
-- Loads the extra-data module that holds `code`; returns nil when the code maps to no module.
local function getRawExtraLanguageData(code)
	local modulename = export.getExtraDataModuleName(code)
	if not modulename then
		return nil
	end
	return mw.loadData("Module:" .. modulename) or nil
end
-- Ensures `self._extraData` is populated. Does nothing when it is already set.
function Language:loadInExtraData()
	if self._extraData then
		return
	end
	-- Load the extra data for this code; fall back to an empty table when there is none.
	self._extraData = getRawExtraLanguageData(self:getCode()) or {}
end
-- Wraps `data` in a language object for `code`; returns nil when there is no data.
-- Use of data flagged as deprecated is tracked.
function export.makeObject(code, data)
	if not data then
		return nil
	end
	if data.deprecated then
		require("Module:debug").track {
			"languages/deprecated",
			"languages/deprecated/" .. code
		}
	end
	return setmetatable({_rawData = data, _code = code, _type = "language object"}, Language)
end
-- Looks up a language object by code. When nothing is found, etymology languages and families are
-- tried in turn if `allowEtymLang` / `allowFamily` are set. If there is still no result and
-- `paramForError` is given, Module:languages/errorGetBy is invoked; otherwise the (falsy) result is returned.
-- Raises an error when `code` is not a string.
function export.getByCode(code, paramForError, allowEtymLang, allowFamily)
	if type(code) ~= "string" then
		local received = code == nil and "nil" or "a " .. type(code)
		error("The function getByCode expects a string as its first argument, but received " .. received .. ".")
	end
	local lang = export.makeObject(code, getRawLanguageData(code))
	if lang then
		return lang
	end
	if allowEtymLang then
		lang = require("Module:etymology languages").getByCode(code)
	end
	if not lang and allowFamily then
		lang = require("Module:families").getByCode(code)
	end
	if not lang and paramForError then
		require("Module:languages/errorGetBy").code(code, paramForError, allowEtymLang, allowFamily)
	end
	return lang
end
-- Looks up a language object by name via Module:languages/by name (using its `all` sub-table when present).
-- Returns nil for an unknown name, or raises an error instead when `errorIfInvalid` is set.
function export.getByName(name, errorIfInvalid)
	local byName = mw.loadData("Module:languages/by name")
	-- The tables must be indexed by the name; using the table itself as the code was a bug.
	local code = byName.all and byName.all[name] or byName[name]
	if not code then
		if errorIfInvalid then
			-- NOTE(review): the link target was reinstated as the usual list-of-languages page — confirm the title on this wiki.
			error("The language name \"" .. name .. "\" is not valid. See [[Wiktionary:List of languages]].")
		else
			return nil
		end
	end
	return export.makeObject(code, getRawLanguageData(code))
end
-- Looks up a language object by canonical name via Module:languages/canonical names. When nothing is
-- found, etymology languages and families are tried in turn if `allowEtymLang` / `allowFamily` are set
-- (for families, a trailing " languages" is dropped from the name). If there is still no result and
-- `errorIfInvalid` is set, Module:languages/errorGetBy is invoked.
function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily)
	local byName = mw.loadData("Module:languages/canonical names")
	-- The table must be indexed by the name; using the table itself as the code was a bug.
	local code = byName and byName[name]
	local retval = code and export.makeObject(code, getRawLanguageData(code)) or nil
	if not retval and allowEtymLang then
		retval = require("Module:etymology languages").getByCanonicalName(name)
	end
	if not retval and allowFamily then
		local famname = name:match("^(.*) languages$")
		famname = famname or name
		retval = require("Module:families").getByCanonicalName(famname)
	end
	if not retval and errorIfInvalid then
		require("Module:languages/errorGetBy").canonicalName(name, allowEtymLang, allowFamily)
	end
	return retval
end
--[[ If language is an etymology language, iterates through parent languages
until it finds a non-etymology language. The parent code is resolved as a language,
then as an etymology language, then as a family. ]]
function export.getNonEtymological(lang)
	while lang:getType() == "etymology language" do
		local parentCode = lang:getParentCode()
		local parent = export.getByCode(parentCode)
		if not parent then
			parent = require("Module:etymology languages").getByCode(parentCode)
		end
		if not parent then
			parent = require("Module:families").getByCode(parentCode)
		end
		lang = parent
	end
	return lang
end
-- for backwards compatibility only; modules should require the /error themselves
-- Forwards all arguments to Module:languages/error and returns whatever it returns.
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	local raiseError = require("Module:languages/error")
	return raiseError(lang_code, param, code_desc, template_tag, not_real_lang)
end
-- Hand the table of public functions to whoever requires this module.
return export