import bisect
import json
import unicodedata
import urllib.request
# Native-language display names, keyed by Wiktionary language code.
# NOTE(review): every value is missing in this copy of the file (each entry
# reads `"code": ,`), so the literal does not parse as written.  Judging by
# how format_language_by_code and language_tag consume the entries, each value
# is presumably a non-empty sequence of (code, text) or (code, text, kw)
# tuples -- restore the values from an authoritative copy and confirm.
# NOTE(review): the key "ky" appears twice below; one of the two entries is
# redundant or mistyped.
NATIVE_NAMES = {
"af": ,
"am": ,
"an": ,
"ang": ,
"ar": ,
"ast": ,
"ay": ,
"az": ,
"bcl": ,
"be": ,
"bg": ,
"bn": ,
"br": ,
"bs": ,
"ca": ,
"chr": ,
"co": ,
"cs": ,
"csb": ,
"cy": ,
"da": ,
"de": ,
"dv": ,
"el": ,
"eo": ,
"es": ,
"et": ,
"eu": ,
"fa": ,
"fi": ,
"fo": ,
"fr": ,
"fy": ,
"ga": ,
"gd": ,
"gl": ,
"gn": ,
"gu": ,
"gv": ,
"ha": ,
"haw": ,
"he": ,
"hi": ,
"hif": ,
"hr": ,
"hsb": ,
"hu": ,
"hy": ,
"ia": ,
"id": ,
"ie": ,
"ig": ,
"io": ,
"is": ,
"it": ,
"iu": ,
"ja": ,
"jv": ,
"ka": ,
"kk": ,
"kl": ,
"km": ,
"kn": ,
"ko": ,
"ks": ,
"ku": ,
"kw": ,
"ky": ,
"ky": ,
"la": ,
"lb": ,
"li": ,
"lmo": ,
"ln": ,
"lo": ,
"lt": ,
"lv": ,
"mg": ,
"mi": ,
"mk": ,
"ml": ,
"mn": ,
"mnw": ,
"mr": ,
"ms": ,
"mt": ,
"my": ,
"nah": ,
"nds": ,
"ne": ,
"nia": ,
"nl": ,
"nn": ,
"no": ,
"oc": ,
"om": ,
"or": ,
"pa": ,
"pl": ,
"pnb": ,
"ps": ,
"pt": ,
"qu": ,
"ro": ,
"roa-rup": ,
"ru": ,
"rw": ,
"scn": ,
"sd": ,
"sg": ,
"sh": ,
"shn": ,
"shy": ,
"si": ,
"simple": ,
"sk": ,
"sl": ,
"sm": ,
"so": ,
"sq": ,
"sr": ,
"st": ,
"su": ,
"sv": ,
"sw": ,
"ta": ,
"te": ,
"tg": ,
"th": ,
"ti": ,
"tk": ,
"tl": ,
"tn": ,
"tpi": ,
"tr": ,
"ts": ,
"tt": ,
"ug": ,
"uk": ,
"ur": ,
"uz": ,
"vi": ,
"vo": ,
"wa": ,
"wo": ,
"yi": ,
"zh-min-nan": ,
"zh": ,
"zu": ,
"bjn": ,
"blk": ,
"btm": ,
"ckb": ,
"diq": ,
"fj": ,
"gom": ,
"gor": ,
"guw": ,
"jbo": ,
"kaa": ,
"kbd": ,
"kcg": ,
"min": ,
"mni": ,
"na": ,
"sa": ,
"skr": ,
"ss": ,
"vec": ,
"zh-yue": ,
}
# English display names, keyed by Wiktionary language code.  Used both for the
# "(English name)" suffix of each link and as the sort key of the lists.
# Based on names used by Wikimedia themselves; see
# <https://meta.wikimedia.org/wiki/Wiktionary/Table>
ENGLISH_NAMES = {
    "af": "Afrikaans",
    "am": "Amharic",
    "an": "Aragonese",
    "ang": "Old English",
    "ar": "Arabic",
    "ast": "Asturian",
    "ay": "Aymara",
    "az": "Azeri",
    "bcl": "Bikol",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bjn": "Banjar",
    "blk": "Pa'O",
    "bn": "Bengali",
    "br": "Breton",
    "bs": "Bosnian",
    "btm": "Batak Mandailing",
    "ca": "Catalan",
    "chr": "Cherokee",
    "ckb": "Central Kurdish",
    "co": "Corsican",
    "cs": "Czech",
    "csb": "Kashubian",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "diq": "Zazaki",
    "dv": "Divehi",
    "el": "Greek",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fj": "Fijian",
    "fo": "Faroese",
    "fr": "French",
    "fy": "West Frisian",
    "ga": "Irish",
    "gd": "Scottish Gaelic",
    "gl": "Galician",
    "gn": "Guaraní",
    "gom": "Goan Konkani",
    "gor": "Gorontalo",
    "gu": "Gujarati",
    "guw": "Gun",
    "gv": "Manx",
    "ha": "Hausa",
    "haw": "Hawaiian",
    "he": "Hebrew",
    "hi": "Hindi",
    "hif": "Fiji Hindi",
    "hr": "Croatian",
    "hsb": "Upper Sorbian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "ia": "Interlingua",
    "id": "Indonesian",
    "ie": "Interlingue",
    "ig": "Igbo",
    "io": "Ido",
    "is": "Icelandic",
    "it": "Italian",
    "iu": "Inuktitut",
    "ja": "Japanese",
    "jbo": "Lojban",
    "jv": "Javanese",
    "ka": "Georgian",
    "kaa": "Kara-Kalpak",
    "kbd": "Kabardian",
    "kcg": "Tyap",
    "kk": "Kazakh",
    "kl": "Greenlandic",
    "km": "Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "ks": "Kashmiri",
    "ku": "Kurdish",
    "kw": "Cornish",
    "ky": "Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish",
    "li": "Limburgish",
    "lmo": "Lombard",
    "ln": "Lingala",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mi": "Māori",
    "min": "Minangkabau",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mni": "Manipuri",
    "mnw": "Mon",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "my": "Burmese",
    "na": "Nauru",
    "nah": "Nahuatl",
    "nds": "Low Saxon",
    "ne": "Nepali",
    "nia": "Nias",
    "nl": "Dutch",
    "nn": "Norwegian",
    "no": "Norwegian",
    "oc": "Occitan",
    "om": "Oromo",
    "or": "Odia",
    "pa": "Punjabi",
    "pl": "Polish",
    "pnb": "Western Punjabi",
    "ps": "Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "ro": "Romanian",
    "roa-rup": "Aromanian",
    "ru": "Russian",
    "rw": "Kinyarwanda",
    "sa": "Sanskrit",
    "scn": "Sicilian",
    "sd": "Sindhi",
    "sg": "Sängö",
    "sh": "Serbo-Croatian",
    "shn": "Shan",
    "shy": "Shawiya",
    "si": "Sinhalese",
    "simple": "Simple English",
    "sk": "Slovak",
    "skr": "Saraiki",
    "sl": "Slovene",
    "sm": "Samoan",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "ss": "Swati",
    "st": "Southern Sotho",
    "su": "Sundanese",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "ti": "Tigrinya",
    "tk": "Turkmen",
    "tl": "Tagalog",
    "tn": "Setswana",
    "tpi": "Tok Pisin",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tt": "Tatar",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "vec": "Venetian",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "wo": "Wolof",
    "yi": "Yiddish",
    "zh-min-nan": "Min Nan",
    "zh-yue": "Cantonese",
    "zh": "Chinese",
    "zu": "Zulu",
}
# Language code of the wiki this list is generated for; format_template skips
# this code so the wiki does not link to itself.
HOMEWIKI = "en"
# Wikitext skeleton filled in by format_template through str.format.
# {abbrs} receives the links for codes whose count reaches ABBR_THRESHOLD;
# each {xN} placeholder receives the formatted list of languages for that
# size bucket.
# NOTE(review): the <span> opened in front of the "100+" heading is not closed
# anywhere inside this template -- confirm that this is intended.
TEMPLATE = """
<!--All Wiktionaries with over 10,000 entries get a sidebar entry-->
{abbrs}
<strong style="font-size:larger; font-weight:normal;">1,000,000+ entries: </strong>
{x1000000}
<hr style="margin-top:0;color: #EEE4CE"><!-- ------------------------------------ -->
<strong style="font-size:larger; font-weight:normal;">100,000+: </strong>
{x100000}
<hr style="margin-top:0;color: #EEE4CE"><!-- ------------------------------------ -->
<strong style="font-size:larger; font-weight:normal;">10,000+: </strong>
{x10000}
<hr style="margin-top:0"><!-- ------------------------------------ -->
<strong style="font-weight:normal;">1,000+: </strong>
{x1000}
<hr style="margin-top:0"><!-- ---------------------------------- -->
<span style="font-size:smaller;"><strong style="font-weight:normal;">100+: </strong>
{x100}
"""
# (placeholder name, minimum entry count) pairs, one per size bucket.
# format_template sorts these by threshold and uses the names as str.format
# keys, so every name must match an {xN} placeholder in TEMPLATE.
# NOTE(review): the original literal was lost from this copy of the file; the
# pairs below are reconstructed from TEMPLATE's placeholders and headings.
VARIABLES = [
    ("x100", 100),
    ("x1000", 1_000),
    ("x10000", 10_000),
    ("x100000", 100_000),
    ("x1000000", 1_000_000),
]
def get_article_counts_per_version(
    url=(
        "https://commons.wikimedia.org/w/index.php"
        "?title=Data:Wikipedia_statistics/data.tab&action=raw"
    ),
):
    """Download the statistics table and return entry counts per Wiktionary.

    Args:
        url: Location of the tabular-data JSON document.  Defaults to the
            Wikimedia Commons statistics table; any URL that
            ``urllib.request.urlopen`` accepts works (handy for testing).

    Returns:
        A dict mapping language code (e.g. ``"fr"``) to the value of the
        "articles" column for the row ``"<code>.wiktionary"``.  The
        ``total*`` summary row and closed Wiktionaries are left out.

    Raises:
        urllib.error.URLError: If the document cannot be fetched.
        KeyError: If the document lacks the expected schema or columns.
    """
    # Projects that are closed and must not be listed.
    closed_wiktionaries = {"ik", "za"}
    suffix = ".wiktionary"

    with urllib.request.urlopen(url) as response:
        table = json.load(response)

    # Rows are positional; the schema tells us which position holds which
    # column.
    # NOTE(review): the "site" / "articles" column names were reconstructed
    # (the subscripts were lost from this copy) -- confirm against the table.
    column_index = {
        field["name"]: position
        for position, field in enumerate(table["schema"]["fields"])
    }
    column_name = column_index["site"]
    column_articles = column_index["articles"]

    counts = {}
    for row in table["data"]:
        name = row[column_name]
        if not name.endswith(suffix) or name.startswith("total"):
            continue
        code = name.removesuffix(suffix)
        if code not in closed_wiktionaries:
            counts[code] = row[column_articles]
    return counts
# Minimum entry count at which format_template also adds a language code to
# the {abbrs} section of TEMPLATE.
ABBR_THRESHOLD = 10000
def language_tag(text):
    """Wrap a native-language name in a ``{{lang|...}}`` wikitext template.

    Args:
        text: Either ``(code, name)`` or ``(code, name, kw)``, where ``kw`` is
            a dict of extra template parameters rendered as ``|key=value``
            between the code and the name.

    Returns:
        The wikitext string.  A two-element tuple whose code is ``"en"`` is
        returned as the bare name, without a template.
    """
    assert isinstance(text, tuple) and len(text) in range(2, 4)
    if len(text) == 2:
        code, label = text
        # English text needs no language markup on the English wiki.
        if code == "en":
            return label
        return "{{lang|" + code + "|" + label + "}}"
    # Three-element form: extra template parameters precede the name.
    code, label, kw = text
    kw_text = "".join("|" + key + "=" + value for key, value in kw.items())
    return "{{lang|" + code + kw_text + "|" + label + "}}"
def format_language_by_code(code):
    """Return the wikitext link to one Wiktionary, labelled with its names.

    Args:
        code: Wiktionary language code, e.g. ``"fr"``.

    Returns:
        A wikilink whose label is the native name(s) followed by the English
        name in parentheses.  When there is a single native name identical to
        the English name, the name is shown only once.  For a code missing
        from NATIVE_NAMES or ENGLISH_NAMES a message is printed and a marker
        string containing U+FFFD is returned, which format_template detects.
    """
    if code not in NATIVE_NAMES or code not in ENGLISH_NAMES:
        print("Code " + code + " not configured!")
        return "(ERROR FOR CODE " + code + chr(0xFFFD) + ")"

    native_name = NATIVE_NAMES[code]
    english_name = ENGLISH_NAMES[code]
    assert len(native_name) > 0

    # Each native entry is a (code, text[, kw]) tuple; index 1 is the text.
    if len(native_name) == 1 and native_name[0][1] == english_name:
        text = language_tag(native_name[0])
    else:
        tagged = " / ".join(language_tag(name) for name in native_name)
        text = tagged + " (" + english_name + ")"
    # NOTE(review): the link markup was lost from this copy of the file and is
    # reconstructed here as an interwiki link to the wiki's main page.
    return "[[:{code}:|{text}]]".format(code=code, text=text)
def remove_diacritics(text):
    """Return *text* with combining marks removed.

    The text is decomposed with NFKD normalisation, then every character in
    the Unicode category ``Mn`` (nonspacing mark) is dropped.  The result
    stays in decomposed form.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
def format_language_list(codes):
    """Format *codes* as a bullet-separated list of language links.

    Args:
        codes: Iterable of Wiktionary language codes.

    Returns:
        The links produced by format_language_by_code, ordered by English
        name with diacritics ignored, joined by a bullet and a newline.
        Codes without an English name sort by the code itself.
    """

    def sort_key(code):
        # The code is a tie-breaker: several codes can share one English
        # name, and the input is typically a set with no stable order.
        return (remove_diacritics(ENGLISH_NAMES.get(code, code)), code)

    return " • \n".join(
        format_language_by_code(code) for code in sorted(codes, key=sort_key)
    )
def format_template(article_counts):
    """Render TEMPLATE for the given per-language entry counts.

    Args:
        article_counts: Mapping of language code to entry count.

    Returns:
        The filled-in wikitext.  Each language is placed in the bucket with
        the largest threshold not exceeding its count; languages below the
        smallest threshold and HOMEWIKI itself are left out.

    Raises:
        ValueError: If any language code lacks a configured name (signalled
            by the U+FFFD marker that format_language_by_code emits).
    """
    # Buckets in ascending threshold order so bisect can locate them.
    variables = sorted(VARIABLES, key=lambda pair: pair[1])
    names, thresholds = zip(*variables)

    langcode_groups = {name: set() for name in names}
    abbrs = set()
    for langcode, count in article_counts.items():
        if langcode == HOMEWIKI:
            continue
        # Index of the last threshold <= count; -1 means below every bucket.
        index = bisect.bisect(thresholds, count) - 1
        if index < 0:
            continue
        if count >= ABBR_THRESHOLD:
            abbrs.add(langcode)
        langcode_groups[names[index]].add(langcode)

    kw = {
        group: format_language_list(codes)
        for group, codes in langcode_groups.items()
    }
    # NOTE(review): the link markup was lost from this copy of the file and is
    # reconstructed here as a bare interlanguage link per code.
    kw["abbrs"] = "".join("[[" + lang + ":]]" for lang in sorted(abbrs))

    result = TEMPLATE.format(**kw)
    if chr(0xFFFD) in result:
        raise ValueError("some language codes not configured")
    return result
if __name__ == "__main__":
    import sys

    data = get_article_counts_per_version()
    output = format_template(data)
    print(output)

    # Copying to the clipboard is a convenience only: pyperclip is optional,
    # and a failure to copy must not fail the run.  Unlike a bare `except`,
    # this does not swallow KeyboardInterrupt or SystemExit.
    try:
        import pyperclip
    except ImportError:
        pyperclip = None
    if pyperclip is not None:
        try:
            pyperclip.copy(output)
        except Exception as exc:
            print(f"could not copy to clipboard: {exc}", file=sys.stderr)