Ce module définit des fonctions permettant de manipuler les données d’Unicode.
getBlocks()
Retourne la liste de tous les blocs Unicode.
getScripts()
Retourne la liste de tous les scripts Unicode.
getScriptRanges()
Retourne la liste des plages de tous les scripts Unicode.
getBlock(lowerCodepoint)
Retourne le bloc Unicode dont le point de code le plus petit correspond au paramètre ou nil
si aucun ne correspond.
lowerCodepoint
(entier) : La borne inférieure du bloc (point de code).nil
getBlockForChar(char)
Retourne le bloc Unicode contenant le caractère donné ou nil
s’il n’appartient à aucun bloc. Lance une erreur si aucun caractère ou plusieurs sont passés.
char
(chaîne) : Le caractère.nil
getScript(code)
Retourne le script Unicode correspondant au code donné ou nil
si aucun ne correspond.
code
(chaîne) : Le code du script.nil
getScriptForChar(char)
Retourne le script Unicode pour le caractère donné. Lance une erreur si aucun caractère ou plusieurs sont passés.
char
(chaîne) : Le caractère.
getScriptForText(text)
Retourne le script Unicode pour le texte donné. Si le texte est composé de caractères dans plusieurs scripts, autres que Common ou Inherited, le script retourné est Unknown.
text
(chaîne) : Le texte.
getScriptsForText(text)
Retourne les scripts Unicode pour le texte donné.
text
(chaîne) : Le texte.
textHasScript(text, scriptCode)
Indique si le texte donné est dans le script Unicode donné. Retourne faux si le code ne correspond à aucun script existant.
text
(chaîne) : Le texte.
scriptCode
(chaîne) : Le code du script.
setWritingDirection(text)
Définit le sens d’écriture pour le texte donné, à partir de son script Unicode, en l’insérant dans une balise span.
text
(chaîne) : Le texte.
blockReference
Retourne le wikicode pour le modèle Modèle:R:Bloc Unicode.
1
(entier, optionnel) : Le point de code de la borne inférieure du bloc (décimal ou hexadécimal avec préfixe « 0x »). Si ce paramètre n’est pas renseigné, le code sera extrait du titre de la page.
writingDirection
Définit le sens d’écriture pour le texte donné, à partir de son script Unicode, en l’insérant dans une balise span.
1
(chaîne) : Le texte.
codepoint
Retourne le point de code Unicode du caractère donné. Lance une erreur si aucun caractère ou plusieurs sont passés.
1
(1 seul caractère) : Le caractère dont on veut le point de code.
hexa
(booléen, défaut = faux) : Indique si le point de code doit être retourné en hexadécimal (cf. Module:paramètres pour les valeurs possibles). La valeur hexadécimale est retournée sans le préfixe « 0x ».
character
Retourne le caractère correspondant au point de code Unicode donné. Lance une erreur si le point de code est invalide.
1
(entier) : Le point de code (décimal ou hexadécimal avec préfixe « 0x »).
La documentation de ce module est générée par le modèle {{Documentation module}}.
Elle est incluse depuis la page Module:données Unicode/Documentation. Veuillez placer les catégories sur cette page-là.
Les éditeurs peuvent travailler dans le bac à sable (créer).
Voir les statistiques d'appel depuis le wikicode sur l'outil wstat et les appels depuis d'autres modules.
local m_table = require("Module:table")
local m_params = require("Module:paramètres")
local p = {}
---------------------------------
-- Functions for other modules --
---------------------------------
-- Data loading functions --
--- Loads the data table describing every Unicode block.
--- @return table The value returned by mw.loadData() for the blocks data page.
function p.getBlocks()
  local dataPage = "Module:données Unicode/data/blocks"
  return mw.loadData(dataPage)
end
--- Loads the data table describing every Unicode script.
--- @return table The value returned by mw.loadData() for the scripts data page.
function p.getScripts()
  local dataPage = "Module:données Unicode/data/scripts"
  return mw.loadData(dataPage)
end
--- Loads the table of codepoint ranges for all Unicode scripts.
--- @return table The value returned by the script ranges data module.
function p.getScriptRanges()
  -- The returned table has tables as keys, which is why the data module is
  -- loaded with require() rather than mw.loadData().
  local rangesModule = "Module:données Unicode/data/script ranges"
  return require(rangesModule)
end
--- Loads the data table describing every supported number system.
--- @return table The value returned by mw.loadData() for the numbers data page.
function p.getNumberSystems()
  local dataPage = "Module:données Unicode/data/numbers"
  return mw.loadData(dataPage)
end
-- Block-related functions --
--- Returns the Unicode block that has the given lowest codepoint.
--- The blocks table is indexed directly with the given codepoint.
--- @param lowerCodepoint number The lowest codepoint of the block.
--- @return table|nil The block or nil if none were found.
function p.getBlock(lowerCodepoint)
  -- Index the blocks table instead of returning the whole table.
  return p.getBlocks()[lowerCodepoint]
end
--- Finds the Unicode block whose [lower, upper] codepoint interval contains
--- the given character.
--- Raises an error unless exactly one character is passed.
--- @param char string A string holding a single character.
--- @return table|nil The matching block, or nil when no block contains the character.
function p.getBlockForChar(char)
  local charCount = mw.ustring.len(char)
  if charCount ~= 1 then
    error(mw.ustring.format('Un seul caractère attendu, %d donnés ("%s")', charCount, char))
  end

  local codepoint = mw.ustring.codepoint(char)
  local found = nil
  for _, candidate in pairs(p.getBlocks()) do
    local inside = candidate.lower <= codepoint and codepoint <= candidate.upper
    if inside then
      found = candidate
      break
    end
  end
  return found
end
-- Script-related functions --
--- Returns the Unicode script for the given code.
--- The scripts table is indexed directly with the given code.
--- @param code string The script’s code.
--- @return table|nil The script or nil if none were found.
function p.getScript(code)
  -- Index the scripts table instead of returning the whole table.
  return p.getScripts()[code]
end
--- Returns the Unicode script for the given character.
--- Throws an error if zero or more than 1 characters were given.
--- @param char string The character.
--- @return table The script whose range contains the character, or the
--- "Unknown" script entry if no range matches.
function p.getScriptForChar(char)
  local len = mw.ustring.len(char)
  if len ~= 1 then
    error(mw.ustring.format("Un seul caractère attendu, %d donnés", len))
  end

  local code = mw.ustring.codepoint(char)
  local scripts = p.getScripts()
  -- The ranges table is keyed by range tables; the value is the script code.
  -- NOTE(review): bounds are assumed to be stored as {lower, upper} at
  -- indices 1 and 2 -- confirm against the script ranges data module.
  for range, script_code in pairs(p.getScriptRanges()) do
    if range[1] <= code and code <= range[2] then
      return scripts[script_code]
    end
  end
  -- No range matched: fall back to the "Unknown" script, the same entry
  -- getScriptForText() obtains through p.getScript("Unknown").
  return scripts["Unknown"]
end
--- Determines a single Unicode script for the given text.
--- Common and Inherited are only reported when no other script is present
--- (Inherited taking precedence over Common). An "Unknown" candidate is
--- replaced by the next other script encountered. As soon as two distinct
--- scripts other than Common, Inherited and Unknown are seen, the Unknown
--- script is returned.
--- @param text string The text.
--- @return table|nil The script, or nil when p.getScriptsForText() yields no script at all.
function p.getScriptForText(text)
  local sawCommon = false
  local sawInherited = false
  local dominant

  for _, script in pairs(p.getScriptsForText(text)) do
    local code = script.code
    if code == "Common" then
      sawCommon = true
    elseif code == "Inherited" then
      sawInherited = true
    elseif dominant == nil or dominant.code == "Unknown" then
      -- First specific script seen, or one replacing an Unknown candidate.
      dominant = script
    elseif code ~= "Unknown" and code ~= dominant.code then
      -- Two different specific scripts: the text is mixed.
      return p.getScript("Unknown")
    end
  end

  if dominant ~= nil then
    return dominant
  end
  if sawInherited then
    return p.getScript("Inherited")
  end
  if sawCommon then
    return p.getScript("Common")
  end
  return nil
end
--- Returns the Unicode scripts for the given text.
--- Anything between a "<" and the next ">" is treated as an HTML tag and is
--- not assigned to any script.
--- @param text string The text.
--- @param getRanges boolean If true, ranges for each script will be returned
--- as a second value: a list of { script = code|nil, from = i, to = j }
--- tables, where script is nil for skipped HTML tags.
--- @return table The scripts found, indexed by script code (unsorted).
function p.getScriptsForText(text, getRanges)
  local res = {}
  local scriptsRanges = {}
  -- The text is never modified, so its length is computed only once.
  local textLength = mw.ustring.len(text)
  local i = 1

  while i <= textLength do
    local c = mw.ustring.sub(text, i, i)

    -- Skip HTML tags
    local skip = false
    if c == "<" then
      local j = mw.ustring.find(text, ">", i, true)
      if j ~= nil then
        table.insert(scriptsRanges, { script = nil, from = i, to = j })
        i = j
        skip = true
      end
    end

    if not skip then
      local script = p.getScriptForChar(c)
      local name = script.code
      if not res[name] then
        -- First occurrence of this script: register it and open a range.
        res[name] = script
        table.insert(scriptsRanges, { script = name, from = i, to = i })
      else
        -- Extend the last range when it is contiguous and in the same
        -- script, otherwise open a new one.
        local lastRange = scriptsRanges[#scriptsRanges]
        if lastRange.script == name and lastRange.to + 1 == i then
          lastRange.to = i
        else
          table.insert(scriptsRanges, { script = name, from = i, to = i })
        end
      end
    end

    i = i + 1
  end

  if getRanges then
    return res, scriptsRanges
  end
  return res
end
--- Indicates whether the given text contains characters in the given Unicode script.
--- @param text string The text.
--- @param scriptCode string The script’s code.
--- @return boolean True if the code exists and the text contains characters in this script,
--- false otherwise.
function p.textHasScript(text, scriptCode)
  local script = p.getScript(scriptCode)
  -- Look the code up in the scripts found in the text; testing the returned
  -- table itself against nil would always succeed.
  return script ~= nil and p.getScriptsForText(text)[scriptCode] ~= nil
end
--- Tells whether the given text should be rendered in italics.
--- This is the case exactly when the code of the script returned by
--- p.getScriptForText(text) is "Latin", "Common" or "Inherited".
--- @param text string The text.
--- @return boolean True if the text should be in italics.
function p.shouldItalicize(text)
  local italicScripts = { Latin = true, Common = true, Inherited = true }
  local scriptCode = p.getScriptForText(text).code
  return italicScripts[scriptCode] == true
end
--- Maps a script direction code to the value used for the CSS
--- writing-mode rule in p.setWritingDirection().
--- NOTE(review): the keys were missing from this table and have been
--- reconstructed. "lr", "rl", "i" and "m" are the direction codes used by
--- p.setWritingDirection(); "tb" is assumed for the vertical direction --
--- confirm against the scripts data module.
local directionToCss = {
  ["lr"] = "horizontal-tb",
  ["rl"] = "horizontal-tb",
  ["tb"] = "vertical-lr",
  ["i"] = "inherit",
  ["m"] = "inherit",
}
--- Maps a script direction code to the value used for the HTML dir
--- attribute in p.setWritingDirection().
--- NOTE(review): the keys were missing from this table and have been
--- reconstructed. "lr" and "rl" are the direction codes used by
--- p.setWritingDirection(); "tb" is assumed for the vertical direction --
--- confirm against the scripts data module.
local directionToDir = {
  ["lr"] = "ltr",
  ["rl"] = "rtl",
  ["tb"] = "ltr",
}
--- Sets the writing direction for the given text, based on its Unicode scripts,
--- by inserting span tags.
--- Runs whose script direction is "lr" or "rl" are left untouched. HTML tags
--- found in the text are copied as is and close any span that is open.
--- @param text string The text.
--- @return string The text which contains span tags with the writing-mode CSS rule and dir attribute.
function p.setWritingDirection(text)
  local dirsToIgnore = { lr = true, rl = true }
  local parts = {}
  local scripts, intervals = p.getScriptsForText(text, true)
  local prevScriptName
  local inSpan = false

  for i, interval in ipairs(intervals) do
    local substr = mw.ustring.sub(text, interval.from, interval.to)

    if interval.script then
      local script = scripts[interval.script]
      local scriptName = script.name
      -- A script without an explicit direction is handled like "i".
      local scriptDir = script.direction or "i"
      -- "i" and "m" runs have no direction of their own.
      local isNeutral = scriptDir == "i" or scriptDir == "m"

      -- Script of the following interval, if there is one and it is not an
      -- HTML tag (tags have no script).
      local nextInterval = intervals[i + 1]
      local nextScript = nextInterval and nextInterval.script
          and scripts[nextInterval.script] or nil
      local nextScriptDir = nextScript and (nextScript.direction or "i") or nil

      if inSpan and not isNeutral and scriptName ~= prevScriptName then
        parts[#parts + 1] = "</span>"
        inSpan = false
      end

      local nextNeedsNoSpan = nextScriptDir ~= nil
          and (dirsToIgnore[nextScriptDir] or nextScriptDir == "i" or nextScriptDir == "m")

      if dirsToIgnore[scriptDir] or prevScriptName == scriptName
          -- Special case for when text begins with i or m scripts and is followed by script that is not lr nor rl
          or (isNeutral and ((i == 1 and nextNeedsNoSpan) or i > 1 or not nextScript)) then
        parts[#parts + 1] = substr
      else
        local dir
        local cssDir
        if not isNeutral then
          dir = directionToDir[scriptDir]
          cssDir = scriptDir
          prevScriptName = scriptName
        elseif nextScriptDir then
          -- Include current run in next script’s span
          dir = directionToDir[nextScriptDir]
          cssDir = nextScriptDir
          prevScriptName = nextScript.name
        end

        if dir then
          local dirAttr = 'dir="' .. dir .. '"'
          local writingMode = directionToCss[cssDir]
          parts[#parts + 1] = mw.ustring.format('<span %s style="writing-mode:%s">', dirAttr, writingMode)
          parts[#parts + 1] = substr
          inSpan = true
        else
          -- No dir value is known for this direction: keep the text
          -- unwrapped instead of silently dropping it from the result.
          parts[#parts + 1] = substr
        end
      end
    else
      -- HTML tag: close any open span and copy the tag unchanged.
      if inSpan then
        parts[#parts + 1] = "</span>"
        inSpan = false
      end
      parts[#parts + 1] = substr
      prevScriptName = nil
    end
  end

  if inSpan then
    parts[#parts + 1] = "</span>"
  end
  return table.concat(parts)
end
--- Converts an integer into another system.
--- Throws an error if the number is negative, if the system does not exist
--- or if the number is outside of the system’s min_value/max_value bounds.
--- @param n number The number to convert.
--- @param system string The target number system.
--- @return string The converted number as a string of digits (nothing is
--- returned for a non-positional system other than "romain", "grec" or "sinogrammes").
function p.convertNumber(n, system)
  if n < 0 then
    error("Le nombre doit être un entier positif ou nul !")
  end

  -- Look up the description of the requested system.
  local s = p.getNumberSystems()[system]
  if not s then
    error("Système numérique invalide : " .. tostring(system))
  end
  if (s.min_value and n < s.min_value) or (s.max_value and n > s.max_value) then
    error("Valeur invalide : " .. tostring(n))
  end

  -- Positional systems
  if s.positional then
    local base = s.base
    local offset = s.zero_offset
    -- The character for digit i is at codepoint i + zero_offset.
    local function digit(i)
      return mw.ustring.char(i + offset)
    end

    if n == 0 then
      return digit(0)
    end

    local res = ""
    local i = n
    while i > 0 do
      res = digit(i % base) .. res
      i = math.floor(i / base)
    end
    return res

  -- Roman numerals
  elseif system == "romain" then
    -- Laid out as (unit, five) pairs for each power of ten.
    local symbols = {
      "I", "V",
      "X", "L",
      "C", "D",
      "M", "ↁ",
      "ↂ", "ↇ",
      "ↈ",
    }
    local exp = 0
    local res = ""
    local i = n
    while i > 0 do
      local d = i % 10
      -- Symbols worth 1, 5 and 10 times the current power of ten.
      local unit1 = symbols[exp * 2 + 1]
      local unit5 = symbols[exp * 2 + 2]
      local unit10 = symbols[exp * 2 + 3]
      local str = ""
      if d ~= 0 then
        if d <= 3 then
          str = string.rep(unit1, d)
        elseif d == 4 then
          str = unit1 .. unit5
        elseif d == 5 then
          str = unit5
        elseif d < 9 then
          str = unit5 .. string.rep(unit1, d - 5)
        else
          str = unit1 .. unit10
        end
      end
      res = str .. res
      i = math.floor(i / 10)
      exp = exp + 1
    end
    return res

  -- Greek numerals
  elseif system == "grec" then
    local symbols = {
      "α", "β", "γ", "δ", "ε", "ϛ", "ζ", "η", "θ", -- 1 to 9
      "ι", "κ", "λ", "μ", "ν", "ξ", "ο", "π", "ϟ", -- 10 to 90
      "ρ", "σ", "τ", "υ", "φ", "χ", "ψ", "ω", "ϡ", -- 100 to 900
    }
    local exp = 1
    local res = ""
    local i = n
    while i > 0 do
      local d = i % 10
      if d > 0 then
        -- Shift into the tens or hundreds row of the symbols table.
        if exp == 10 or exp == 10000 then
          d = d + 9
        elseif exp == 100 or exp == 100000 then
          d = d + 18
        end
        res = symbols[d] .. res
        -- Digits for thousands and above are prefixed with "͵".
        if exp >= 1000 then
          res = "͵" .. res
        end
      end
      exp = exp * 10
      i = math.floor(i / 10)
    end
    if n % 1000 ~= 0 then
      res = res .. "ʹ"
    end
    return res

  -- Chinese and japanese numerals
  elseif system == "sinogrammes" then
    -- symbols[d + 1] is the character for digit d.
    local symbols = { "〇", "一", "二", "三", "四", "五", "六", "七", "八", "九" }
    if n == 0 then
      return symbols[1]
    end
    local res = ""
    local i = n
    while i > 0 do
      res = symbols[i % 10 + 1] .. res
      i = math.floor(i / 10)
    end
    return res
  end
end
-----------------------------
-- Functions for templates --
-----------------------------
--- Returns the wikitext for template [[Modèle:R:Bloc Unicode]].
--- frame.args[1] (int, optional): The lower bound of the Unicode block
--- (decimal or hexadecimal with “0x” prefix).
--- If undefined, the block is looked up from the page’s title, which is
--- passed to p.getBlockForChar().
--- Throws an error if no block is found.
--- @return string The template’s wikicode.
function p.blockReference(frame)
  local args = m_params.process(frame.args, {
    [1] = { type = m_params.INT },
  })
  local blockCode = args[1]
  local block

  if blockCode then
    block = p.getBlock(blockCode)
  else
    block = p.getBlockForChar(mw.title.getCurrentTitle().text)
  end

  if block then
    -- "[%s %s]" is an external link: the block’s URL followed by its name.
    return mw.ustring.format("Unicode, Inc., ''[%s %s]'', The Unicode Standard, version %s, %d",
      block.url, block.name.en, block.version, block.year)
  end
  error(mw.ustring.format("Bloc Unicode « %s » invalide", blockCode or ""))
end
--- Sets the writing direction for the given text, based on its Unicode script,
--- by inserting it inside a span tag (see p.setWritingDirection()).
--- frame.args[1] (string): The text.
--- @return string The text, included in a span tag with the writing-mode CSS rule,
--- or an empty string if no text is available after parameter processing.
function p.writingDirection(frame)
  local args = m_params.process(frame.args, {
    [1] = { required = true, allow_empty = true },
  })
  local text = args[1]
  if text then
    return p.setWritingDirection(text)
  end
  return ""
end
--- Returns the Unicode codepoint of the given character.
--- The parameter checker rejects values that are not exactly 1 character long.
--- frame.args[1] (string, only one character): The character.
--- frame.args["hexa"] (boolean, default = false): Indicates whether the returned codepoint
--- will be in hexadecimal.
--- @return string|number The character’s codepoint; in hexadecimal it is a string of
--- at least 4 uppercase digits without the “0x” prefix.
function p.codepoint(frame)
  local args = m_params.process(frame.args, {
    [1] = {
      required = true,
      checker = function(value)
        return mw.ustring.len(value) == 1
      end,
    },
    ["hexa"] = { type = m_params.BOOLEAN, default = false },
  })
  local char = args[1]
  local isHex = args["hexa"]
  local code = mw.ustring.codepoint(char)

  if isHex then
    return string.format("%04X", code)
  end
  return code
end
--- Returns the character with the given Unicode codepoint.
--- Throws an error if the codepoint is not a number or if mw.ustring.char() rejects it.
--- frame.args[1] (int): The codepoint.
--- @return string The character.
function p.character(frame)
  local args = m_params.process(frame.args, {
    [1] = { required = true, type = m_params.INT },
  })
  local code = tonumber(args[1])

  if code ~= nil then
    -- pcall() turns any error raised by mw.ustring.char() into the generic
    -- error below.
    local success, char = pcall(mw.ustring.char, code)
    if success then
      return char
    end
  end
  error("Point de code incorrect")
end
--- Converts an Arabic number into the given system (see p.convertNumber()).
--- frame.args[1] (int): The number; the checker rejects negative values.
--- frame.args[2] (string, default = "latin"): The target system, one of the keys
--- of the table returned by p.getNumberSystems().
--- NOTE(review): the parameter keys were missing from this function and have
--- been reconstructed as positional parameters 1 and 2 -- confirm the key of
--- the second parameter against the calling template.
--- @return string The converted number as a string of digits.
function p.number(frame)
  local args = m_params.process(frame.args, {
    [1] = { required = true, type = m_params.INT, checker = function(i)
      return tonumber(i) >= 0
    end },
    [2] = { enum = m_table.keysToList(p.getNumberSystems()), default = "latin" },
  })
  return p.convertNumber(args[1], args[2])
end
-- Export the module table holding the functions defined on p.
return p