#!/usr/bin/env python3
import os
import re
import time
import pywikibot
import ast
from subprocess import call
# Copy of ] (the list being prefixed with "languages =")
from languages_list import languages
site = pywikibot.Site(code='fr', fam='wiktionary')
# Folder containing the previously generated results, in particular the file
# stats-trads-diffs-to-check.txt, which holds the information about the diffs to analyse.
folder = os.path.dirname(__file__)
# Output files
file_langs = 'stats-langs-after-diffs.txt'
file_dates = 'stats-months-after-diffs.txt'
file_contributors = 'stats-trads-contributors-after-diffs.txt'
file_ips = 'stats-trads-ips-after-diffs.txt'
def first_n_items_dict(d, n, first_index):
    '''Returns a list of the n top-valued items, starting at index first_index'''
    return sorted(d.items(), key=lambda x: x[1], reverse=True)[first_index:first_index + n]
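# Illustrative example (assuming a simple name -> count dict):
# first_n_items_dict({'anglais': 5, 'italien': 9, 'breton': 2}, 2, 0) -> [('italien', 9), ('anglais', 5)]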
def dict_from_file(file, folder=folder):
with open(os.path.join(folder, file), encoding='utf-8') as f:
dict_results = ast.literal_eval(f.read())
return dict_results
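# Illustrative example: the file is expected to contain a single Python dict literal,
# e.g. "{'anglais': 1234, 'italien': 567}", as written by count_trads_per_diff() below.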
def generate_graph(results, file_graph='graph_template.txt', cat='langs', label='lang'):
'''
@param results: an object containing the results
@type results: str representing a file,
or dict or list of tuples
'''
input_file = os.path.join(folder, file_graph)
output_file = os.path.join(folder, 'graph-{}.txt'.format(cat))
if isinstance(results, str):
        with open(os.path.join(folder, results), encoding='utf-8') as f:
results = ast.literal_eval(f.read())
with open(input_file, 'r', encoding='utf-8') as f:
content = f.read()
specific_results = ''
if isinstance(results, dict):
for k in results:
            specific_results += ' {{"{}": "{}", "amount": {} }},\n'.format(label, k, str(results[k]))
elif isinstance(results, list):
for item in results:
            specific_results += ' {{"{}": "{}", "amount": {} }},\n'.format(label, item[0], str(item[1]))
else:
        print('The results object should be a string, a list or a dict, but it is a {}'.format(type(results)))
        return
    specific_results = specific_results[:-2]  # remove the trailing ",\n"
content = content.replace('__TO_REPLACE__', specific_results)
content = content.replace('__LABEL__', label)
oblique_labels = '__OBLIQUE_LABELS__'
if cat == 'dates':
content = content.replace(oblique_labels, ', "properties": { "labels": {"angle": {"value": -45}, "dx": {"value": -20} } } ')
else:
content = content.replace(oblique_labels, '')
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content)
print('Graph saved to {}'.format(output_file))
def generate_langs_graph(nbLangs=15, first_index=0, filename_var='langs'):
d = dict_from_file(file_langs)
data = first_n_items_dict(d, nbLangs, first_index)
generate_graph(data, file_graph='graph_template.txt', cat=filename_var, label='lang')
def generate_dates_graph():
generate_graph(file_dates, file_graph='graph_template.txt', cat='dates', label='date')
def results_to_wikitable(results, header1='Langues', header2='Traductions ajoutées'):
    wikitext = '{{| class="wikitable sortable mw-collapsible"\n! {} !! {}'.format(header1, header2)
    for k in results:
        wikitext += '\n|-\n| {} || {}'.format(k, str(results[k]))
wikitext += '\n|}'
return wikitext
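# Illustrative example: results_to_wikitable({'anglais': 3}) produces the wikitable
# {| class="wikitable sortable mw-collapsible"
# ! Langues !! Traductions ajoutées
# |-
# | anglais || 3
# |}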
def raw_results_to_wikitable(filename, dest=None, header1='Langues', header2='Traductions ajoutées'):
if dest is None:
raise Exception('A destination file must be provided')
input_file = os.path.join(folder, filename)
output_file = os.path.join(folder, dest)
with open(input_file, encoding='utf-8') as f:
for line in f:
dict_results = ast.literal_eval(line)
break
with open(output_file, 'w+', encoding='utf-8') as f:
f.write(results_to_wikitable(dict_results, header1, header2))
print('Wikitable saved to {}'.format(output_file))
def count_trads_per_diff():
'''
    Generates statistics on added translations when the edit summaries are truncated
    and the diffs therefore need to be analysed
'''
stats_langs = {}
stats_dates = {}
stats_contribs = {}
stats_ips = {}
cpt = 0
cpt_trads = 0
count_contributors = 0
count_ips = 0
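    # Each of the four existing stats files contains one "<label> : <count>" line per entry,
    # e.g. (illustrative) "anglais : 1234" in stats-langs.txt.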
with open(os.path.join(folder, "stats-langs.txt"), encoding="utf-8") as f:
for line in f:
            match = re.search(r"(.+) : (\d+)", line)
            if match is None:
                continue
            stats_langs[match.group(1)] = int(match.group(2))
with open(os.path.join(folder, "stats-months.txt"), encoding="utf-8") as f:
for line in f:
            match = re.search(r"(.+) : (\d+)", line)
            if match is None:
                continue
            stats_dates[match.group(1)] = int(match.group(2))
with open(os.path.join(folder, "stats-trads-contributors.txt"), encoding="utf-8") as f:
for line in f:
            match = re.search(r"(.+) : (\d+)", line)
            if match is None:
                continue
            stats_contribs[match.group(1)] = int(match.group(2))
with open(os.path.join(folder, "stats-trads-ips.txt"), encoding="utf-8") as f:
for line in f:
            match = re.search(r"(.+) : (\d+)", line)
            if match is None:
                continue
            stats_ips[match.group(1)] = int(match.group(2))
with open(os.path.join(folder, "stats-trads-diffs-to-check.txt"), encoding="utf-8") as f:
for line in f:
# line = "title=lire;prev_rev_id=18770633;rev_id=18908602;contrib=Test;is_ip=false;date=2015-01"
            # title = line.split(';')[0].split('=')[1]
            from_rev = line.split(';')[1].split('=')[1]
            to_rev = line.split(';')[2].split('=')[1]
            contrib = line.split(';')[3].split('=')[1]
            if line.split(';')[4].split('=')[1] == 'true':
                is_ip = True
            else:
                is_ip = False
            date = line.split(';')[5].split('=')[1].strip()
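            # For the example line above, this yields (illustrative):
            # from_rev='18770633', to_rev='18908602', contrib='Test', is_ip=False, date='2015-01'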
diff_html = site.compare(old=int(from_rev), diff=int(to_rev))
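            # pywikibot's site.compare() returns the HTML of the diff table between the two revisions,
            # which is parsed below with regular expressions.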
            # The MediaWiki diff algorithm sometimes shows existing translations as removed lines
            # (cf. e.g. https://fr.wiktionary.org/w/index.php?title=boto&diff=prev&oldid=25504237),
            # and then shows them again in the added lines.
            # Hence we collect all language codes in the added translations (A) and in the removed translations (B),
            # and take the difference by subtracting B from A (as multisets).
            # This works because translation_editor is only used to add translations.
            # Note: the previously used algorithm (]) was almost twice as efficient,
            # with an error margin of only about ~100 entries out of 77,500, i.e. a ~0.13 % error margin.
trads_removed = re.findall('<td class="diff-deletedline diff-side-deleted"><div>(.+)</div></td>', diff_html)
trads_added = re.findall('<td class="diff-addedline diff-side-added"><div>(.+)</div></td>', diff_html)
            codes_added = []
            codes_removed = []
            for t in trads_added:
                codes_added += re.findall(r'{{trad(?:<ins class="diffchange diffchange-inline">)?[+-]{0,2}(?:</ins>)?\|(?:<ins class="diffchange diffchange-inline">)?([^|<]+)(?:</ins>)?\|', t)
            for t in trads_removed:
                codes_removed += re.findall(r'{{trad(?:<del class="diffchange diffchange-inline">)?[+-]{0,2}(?:</del>)?\|(?:<del class="diffchange diffchange-inline">)?([^|<]+)(?:</del>)?\|', t)
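            # Illustrative example: for an added cell containing "* {{T|en}} : {{trad+|en|example}}",
            # the pattern above captures the language code 'en'.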
            # Now that we have collected the codes of all removed translations, we remove one instance
            # of each such code from the added translations.
            for code in codes_removed:
                if code in codes_added:
                    codes_added.remove(code)
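            # Illustrative example: with codes_added == ['en', 'en', 'de'] and codes_removed == ['en'],
            # one 'en' is removed here, leaving codes_added == ['en', 'de'].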
for code in codes_added:
                if code in languages:
                    lang_name = languages[code]
                elif code.strip() in languages:
                    # fallback for codes captured with surrounding whitespace
                    lang_name = languages[code.strip()]
                else:
                    # Language codes missing from the languages list are ignored
                    # (1 case as of 2023-12-03: the code zh-tc, since deleted)
                    print('CODE ' + code + ' NOT FOUND (and translation ignored) - ' + line.strip())
continue
if lang_name:
                    if lang_name in stats_langs:
                        stats_langs[lang_name] += 1
                    else:
                        stats_langs[lang_name] = 1
                    if is_ip:
                        if contrib in stats_ips:
                            stats_ips[contrib] += 1
                        else:
                            stats_ips[contrib] = 1
                        count_ips += 1
                    else:
                        if contrib in stats_contribs:
                            stats_contribs[contrib] += 1
                        else:
                            stats_contribs[contrib] = 1
                        count_contributors += 1
                    if date in stats_dates:
                        stats_dates[date] += 1
                    else:
                        stats_dates[date] = 1
cpt_trads += 1
cpt += 1
if cpt % 100 == 0:
print(str(cpt) + " diffs traites (" + str(cpt_trads) + " traductions)")
with open(os.path.join(folder, "stats-trads-res-after-diffs.txt"), "w+", encoding="utf-8") as f:
res = "Résultats des stats sur les résumés d'édition tronqués (ajout de traductions par lots) :\n"
res += "Traductions ajoutées : " + str(cpt_trads) + "\n"
res += "Traductions ajoutées par des utilisateurs inscrits : " + str(count_contributors) + "\n"
res += "Traductions ajoutées par des utilisateurs non inscrits : " + str(count_ips) + "\n"
f.write(res)
with open(os.path.join(folder, "stats-langs-after-diffs.txt"), "w+", encoding="utf-8") as f:
f.write(str(stats_langs))
with open(os.path.join(folder, "stats-months-after-diffs.txt"), "w+", encoding="utf-8") as f:
f.write(str(stats_dates))
with open(os.path.join(folder, "stats-trads-contributors-after-diffs.txt"), "w+", encoding="utf-8") as f:
f.write(str(stats_contribs))
with open(os.path.join(folder, "stats-trads-ips-after-diffs.txt"), "w+", encoding="utf-8") as f:
f.write(str(stats_ips))
if __name__ == '__main__':
start_time = time.time()
count_trads_per_diff()
print("--- %s seconds ---" % (time.time() - start_time))
    # Create the stats files
raw_results_to_wikitable(file_dates, dest='wikitable-dates.txt', header1='Mois', header2='Traductions ajoutées')
raw_results_to_wikitable(file_langs, dest='wikitable-langs.txt', header1='Langue', header2='Traductions ajoutées')
raw_results_to_wikitable(file_ips, dest='wikitable-ips.txt', header1='Utilisateur non enregistré', header2='Traductions ajoutées')
raw_results_to_wikitable(file_contributors, dest='wikitable-contributors.txt', header1='Utilisateur enregistré', header2='Traductions ajoutées')
    # Graph generation is disabled, as the Graph extension has been disabled on Wikimedia wikis
    # since January 2024 due to security issues.
# generate_langs_graph()
# generate_langs_graph(15, 15, filename_var='langs2')
# generate_dates_graph()
This script took about 20 minutes to run (Windows 10, quad-core 2.3 GHz processor, 8 GB of RAM).
It refers to the following graph_template.txt file for generating the graphs:
{{#tag:graph| { "version": 4, "width": 1000, "height": 200, "padding": {"top": 20, "left": 65, "bottom": 60, "right": 10}, "data": [ { "name": "table", "values": [ __TO_REPLACE__ ] } ], "signals": [ { "name": "tooltip", "init": {}, "streams": [ {"type": "rect:mouseover", "expr": "datum"}, {"type": "rect:mouseout", "expr": "{}"} ] } ], "predicates": [ { "name": "tooltip", "type": "==", "operands": [ {"signal": "tooltip._id"}, {"arg": "id"} ] } ], "scales": [ { "name": "xscale", "type": "ordinal", "range": "width", "domain": {"data": "table", "field": "__LABEL__"} }, { "name": "yscale", "type": "linear", "range": "height", "domain": {"data": "table", "field": "amount"} } ], "axes": [ { "type": "x", "scale": "xscale"__OBLIQUE_LABELS__}, { "type": "y", "scale": "yscale" } ], "marks": [ { "type": "rect", "from": {"data":"table"}, "properties": { "enter": { "x": {"scale": "xscale", "field": "__LABEL__"}, "width": {"scale": "xscale", "band": true, "offset": -1}, "y": {"scale": "yscale", "field": "amount"}, "y2": {"field": {"group": "height"} } }, "update": { "fill": {"value": "steelblue"} }, "hover": { "fill": {"value": "red"} } } }, { "type": "text", "properties": { "enter": { "align": {"value": "center"}, "fill": {"value": "#333"} }, "update": { "x": {"scale": "xscale", "signal": "tooltip.__LABEL__"}, "dx": {"scale": "xscale", "band": true, "mult": 0.5}, "y": {"scale": "yscale", "signal": "tooltip.amount", "offset": -5}, "text": {"signal": "tooltip.amount"}, "fillOpacity": { "rule": [ { "predicate": {"name": "tooltip", "id": {"value": null} }, "value": 0 }, {"value": 1} ] } } } } ] } | mode=interactive }}
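For example (illustrative), with results = {'anglais': 1234} and label='lang', generate_graph() replaces __TO_REPLACE__ with rows of the form {"lang": "anglais", "amount": 1234 }.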