# Code utilisé pour dénombrer les traductions sur le Wiktionnaire.
import re
import time
# Dump location used when the caller does not pass a path.
_DEFAULT_DUMP_PATH = 'C:\\Users\\automatik\\Downloads\\frwiktionary-20180901-pages-articles.xml'

# Opening of a translation template: {{trad|, {{trad+|, {{trad-| or {{trad--|.
_TRAD_PATTERN = re.compile(r'\{\{trad(?:\+|--?)?\|')

# Captures the numeric value of a "<ns>N</ns>" element.
_NS_PATTERN = re.compile(r'<ns>\s*(-?\d+)\s*</ns>')


def _count_trads_in_lines(lines):
    """Count translation templates and pages in an iterable of dump lines.

    Every line holding a ``<ns>N</ns>`` element is counted as one page and
    sets the current namespace. Translation templates are only counted while
    the current namespace is 0, on lines that contain ``{{T|`` and do not
    contain ``<comment>``.

    Returns:
        A ``(translations, pages)`` tuple of ints.
    """
    counter_pages = 0
    counter_trads = 0
    ns = -1  # no page seen yet, so nothing is counted before the first <ns>
    for line in lines:
        ns_match = _NS_PATTERN.search(line)
        if ns_match:
            # Parse only the number between the tags; int() on the whole
            # line would raise ValueError.
            ns = int(ns_match.group(1))
            counter_pages += 1
            # Progress report, emitted once per 100000 pages.
            if counter_pages % 100000 == 0:
                print(str(counter_trads) + ' traductions dénombrées pour ' + str(counter_pages) + ' pages scannees')
        if ns != 0:
            continue
        # Cheap substring tests first; the regex then counts every template
        # variant present on the line.
        if '{{T|' in line and '<comment>' not in line:
            counter_trads += len(_TRAD_PATTERN.findall(line))
    return counter_trads, counter_pages


def count_all_trads(input_file=_DEFAULT_DUMP_PATH):
    """Count the translation templates found in a dump file and print the total.

    The file is read line by line as UTF-8, so it is never loaded into memory
    as a whole.

    Args:
        input_file: Path of the XML dump to scan. Defaults to the path the
            script was originally written for.

    Returns:
        A ``(translations, pages)`` tuple of ints, matching the printed total.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(input_file, encoding='utf-8') as f:
        counter_trads, counter_pages = _count_trads_in_lines(f)
    print('Total : {} traductions dans {} pages'.format(counter_trads, counter_pages))
    return counter_trads, counter_pages
if __name__ == '__main__':
    # Script entry point: run the full scan and report the wall-clock time.
    began_at = time.time()
    count_all_trads()
    elapsed = time.time() - began_at
    print('Done in {} sec'.format(str(elapsed)))  # last runtime: 90 sec
# Dump du 1/12/2023 : 1 259 193 traductions dans 5 378 163 pages
# Script exécuté en moins de 5 minutes (Windows 10, processeur quadricore 2.3GHz, 8 Go de RAM)