#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/tlang

"""
Look for the languages used in translations sections of en.wiktionary entries.

Some comments and names refer to "headers"; this code is derived from the
header analysis script.

Takes no command line arguments. Reads the XML dump "en-wikt.xml" and writes
report pages under User:Robert Ullmann/Trans languages.

NOTE(review): this copy of the script was rebuilt from a version in which
every bracketed span (subscripts, slices, regex character classes and
[[wikilinks]] inside string literals) had been stripped. Spots where the
original text could not be recovered with certainty carry their own
NOTE(review) comment.
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

# Translation lines, tried in this order:
#   * [[target|Language]]: ...   (piped link)
#   * [[Language]]: ...          (plain link)
#   * Language: ...              (no link)
# NOTE(review): the prefix of the piped-link pattern (everything between
# "[[" and the capture group) was lost; ".+?\|" is a reconstruction.
_RE_TRANS_PIPED = re.compile(r'\*\s*\[\[.+?\|([^\]]+?)\]\]\s*:(.*)')
_RE_TRANS_LINKED = re.compile(r'\*\s*\[\[([^\]]+?)\]\]\s*:(.*)')
_RE_TRANS_PLAIN = re.compile(r'\*\s*(.+?):(.*)')

# Replaces [[x]] by x inside a captured language name.
_RE_UNLINK = re.compile(r'\[\[(.*?)\]\]')

# Template names that can be language codes: lower-case letters and hyphens,
# which is exactly the alphabet used by the entries of _STOPS.
_RE_CODE = re.compile(r'[-a-z]+$')

# A well-formed code template: optional link opener, the language name,
# optional link closer, then "<noinclude".
# NOTE(review): the character class for the language name was lost; this one
# accepts any run of text that contains no wiki markup delimiters.
_RE_GOOD_TEMPLATE = re.compile(
    r'(' + re.escape('{{{l|[}}}{{{l|[}}}') +
    r'|)([^{}\[\]|<]+)(' + re.escape('{{{l|]]}}}') + r'|)<noinclude')

# Things that look like codes, but aren't; including ISO 639-2 B codes.
_STOPS = frozenset([
    'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger',
    'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo',
    'tib', 'wel', 'zh-tc', 'zh-sc', 'zh-yue', 'gko', 'rfc', 'rfd', 'rfv',
    'top', 'mid', 'pos-n', 'pie',
])

_TEMPLATE_PREFIX = 'Template:'

# Examples are no longer appended once the cell text reaches this length.
_EXAMPLES_LIMIT = 210

_TABLE_OPEN = '{| class="prettytable sortable"\n'


def safe(s):
    """Return the pickled text of `s` without its pickle framing.

    The first character and the last five characters of the pickle are cut
    off. This assumes the text pickle protocol that Python 2 uses by default,
    where a unicode string is framed by a one-character type code and a
    five-character trailer, leaving an escaped, console-printable body.
    """
    ss = pickle.dumps(s)
    l = len(ss)
    return ss[1:l - 5]


def _translation_language(line):
    """Split one translations-section line into (language, remainder).

    Returns None when the line is not a "* Language: ..." item, or when the
    cleaned-up language name is empty or starts with a character that marks
    it as markup rather than a name.
    """
    mo = (_RE_TRANS_PIPED.match(line)
          or _RE_TRANS_LINKED.match(line)
          or _RE_TRANS_PLAIN.match(line))
    if not mo:
        return None

    # Drop link brackets, then surrounding blanks and bold/italic quotes.
    lang = _RE_UNLINK.sub(r'\1', mo.group(1)).strip(" '")
    if not lang:
        return None
    if lang.startswith(('{', '*', ':', '?', '#')):
        return None
    return lang, mo.group(2)


def _file_report(site, title, text):
    """Save `text` to the wiki page `title`, noting when the page is new."""
    wikipedia.setAction('writing report')
    reportpage = wikipedia.Page(site, title)
    try:
        # Fetched only to find out whether the page already exists.
        reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())
    reportpage.put(text)


def main():
    """Scan the dump, count translation languages and file the reports.

    Pages written:
      User:Robert Ullmann/Trans languages           - every language seen
      User:Robert Ullmann/Trans languages/uncoded   - languages with no code
      User:Robert Ullmann/Trans languages/<replang> - only if replang is set
    """
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    Trans = 0

    # language name -> code, split by code length (2, 3, anything else)
    Codes2 = {}
    Codes3 = {}
    CodesW = {}

    # language name -> number of translation lines
    Occurs = {}

    # language name -> a few linked example entries; only shown for
    # languages without a code, but collected for all
    Examples = {}

    # and fix DAVilla silliness:
    # NOTE(review): the dictionary key was lost; 'Chinese' is assumed.
    Codes2['Chinese'] = 'zh'

    # Set to a language name to get an extra report listing every
    # translation line for that language.
    replang = ""
    Alltrans = set()

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d translations"
                  % (entries, words, Trans))

        # look for code templates
        if title.startswith(_TEMPLATE_PREFIX):
            code = title[len(_TEMPLATE_PREFIX):]
            if code in _STOPS:
                continue
            if not _RE_CODE.match(code):
                continue
            if 'Language templates' not in text:
                continue

            mo = _RE_GOOD_TEMPLATE.match(text)
            if not mo:
                print("bad code template %s: %s" % (safe(code), safe(text)))
                continue

            lang = mo.group(2)
            print("code %s: %s" % (safe(code), safe(lang)))
            if len(code) == 2:
                Codes2[lang] = code
            elif len(code) == 3:
                Codes3[lang] = code
            else:
                CodesW[lang] = code
            continue

        # now skip non main-name-space
        if title.find(':') >= 0:
            continue
        words += 1

        # parse text ...
        intrans = False
        for line in text.splitlines():
            # comments on the (presumed) end of lines
            line = line.partition('<!--')[0]

            # Any header ends the current section; only a Translations
            # header starts a new one.
            if line.startswith('='):
                intrans = line.find('Translations') >= 0
                continue
            if not intrans:
                continue

            found = _translation_language(line)
            if found is None:
                continue
            lang, rest = found

            Trans += 1
            Occurs[lang] = Occurs.get(lang, 0) + 1

            # accumulate report lines for a language
            if lang == replang:
                Alltrans.add('* [[' + title + ']]: ' + rest.strip())
                print(safe(lang + ': ' + title))

            # always collect examples
            if lang not in Examples:
                Examples[lang] = '[[' + title + ']]'
            elif len(Examples[lang]) < _EXAMPLES_LIMIT:
                Examples[lang] += ' [[' + title + ']]'

    print("%d entries, %d words, %d translations" % (entries, words, Trans))

    # report languages
    # NOTE(review): the link target after "See" was lost; the /uncoded
    # subpage written below is assumed. Line breaks inside the table headers
    # were also lost and are restored as wiki table syntax requires.
    report_parts = [
        'Languages used in translations sections as of '
        + xmldate.enXMLdate + '\n',
        '\nSee [[/uncoded]] for a list of entries without codes'
        ' (subset of this list).\n\n',
        _TABLE_OPEN,
        '|-\n| |Codes\n| |Language\n| |Occurs\n| |Examples\n',
    ]
    repinv_parts = [
        'List of languages without code templates as of '
        + xmldate.enXMLdate + ', to be sorted.\n\n',
        _TABLE_OPEN,
        '|-\n| |Codes\n| |Language\n| |Occurs\n| |Examples||Notes\n',
    ]

    for lang in sorted(Occurs):
        codes = ', '.join([table[lang]
                           for table in (Codes2, Codes3, CodesW)
                           if lang in table])
        if codes:
            report_parts.append("|-\n| %s || %s || %d ||\n"
                                % (codes, lang, Occurs[lang]))
        else:
            report_parts.append("|-\n| || %s || %d || %s\n"
                                % (lang, Occurs[lang], Examples[lang]))
            repinv_parts.append("|-\n| || %s || %d || %s ||\n"
                                % (lang, Occurs[lang], Examples[lang]))

    report_parts.append("|}\n")
    repinv_parts.append("|}\n")

    # write the report pages
    _file_report(site, 'User:Robert Ullmann/Trans languages',
                 ''.join(report_parts))
    _file_report(site, 'User:Robert Ullmann/Trans languages/uncoded',
                 ''.join(repinv_parts))

    print("Languages: %d" % len(Occurs))

    # report for specific language
    if not replang:
        return  # we are done

    report = (replang + ' entries in translations sections as of '
              + xmldate.enXMLdate + '\n\n\n')
    report += u'\n'.join(sorted(Alltrans)) + '\n'

    _file_report(site, 'User:Robert Ullmann/Trans languages/' + replang,
                 report)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()