#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:AutoFormat/code/langcodes

"""
This code looks for language code templates in the English Wiktionary.

The list of candidate templates is read from the live DB through the API
(Category:Language_templates); the template text is read from an XML dump.

Writes the AutoFormat (AF) control file, the language table page
User:AutoFormat/Languages, and prints progress reports.

No command line arguments.
"""

import pickle
import re
import socket
import sys

import wikipedia
import xmldate
import xmlreader

# Namespace prefix carried by every title returned from the category listing.
TEMPLATE_PREFIX = 'Template:'


def safe(s):
    """Return an ASCII-only rendering of *s* suitable for console output.

    The value is pickled with the text protocol (protocol 0) and the pickle
    framing is cut away: one leading type-opcode character and the five
    trailing characters (newline, memo opcode ``p0``, newline, stop ``.``).
    For a unicode string this leaves the raw-unicode-escaped text.

    NOTE(review): the slice is reconstructed -- the source as received
    computed the pickle length but returned the whole pickle, which looks
    like a lost ``[1:l-5]``. Confirm against the on-wiki copy.
    """
    # Protocol 0 is requested explicitly so the framing being stripped does
    # not depend on the interpreter's default pickle protocol.
    ss = pickle.dumps(s, 0)
    return ss[1:-5]


def skey(s):
    """Sort key that puts language codes in the preferred order.

    Codes sort by length first and then alphabetically, so two-letter codes
    precede three-letter ones.  Exception: 'zh' is keyed as '04zh' so that
    it lands after the three-letter codes (after '03cmn').
    """
    if s == 'zh':
        return '04zh'
    return '%02d%s' % (len(s), s)


def main():
    """Collect language code templates and write the AutoFormat table.

    Steps:
      1. Log in and page through Category:Language_templates via the API,
         keeping titles that look like language code templates.
      2. Scan the XML dump ``en-wikt.xml`` for those templates and take the
         language name from each template's text.
      3. Build a wiki table mapping codes to language names and save it to
         User:AutoFormat/Languages.
    """
    socket.setdefaulttimeout(30)

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    langs = set()       # language names seen
    lang_codes = {}     # language name -> list of codes that expand to it
    candidates = set()  # template titles still to be found in the dump

    # NOTE(review): the bracketed character classes in these three patterns
    # were missing from the source as received (the patterns did not even
    # compile); they are reconstructed here -- confirm against the on-wiki
    # copy, in particular the exact class used for the code characters.
    retitle = re.compile(r'title="([^"]*)"')
    recmatch = re.compile(r'Template:[a-z-]{2,10}$')
    reccont = re.compile(r'cmcontinue="([^"]*)"')

    found = 0

    # get category from live wikt (too much variation)
    ccont = '!'
    while ccont:
        print("getting cat from %s" % ccont)
        cats = site.getUrl("/w/api.php?action=query&list=categorymembers"
                           "&cmtitle=Category:Language_templates&cmlimit=1000"
                           "&cmcontinue=" + ccont + "&format=xml")
        for title in retitle.findall(cats):
            if not recmatch.match(title):
                print("skipped %s" % repr(title))
                continue
            candidates.add(title)
            found += 1

        # keep paging while the reply carries a continuation marker
        mo = reccont.search(cats)
        ccont = mo.group(1) if mo else ''

    print("possible templates found %d" % found)

    # now get content from XML scan

    # Strips the optional-link parameter markers, i.e. {{{l|[[}}} and
    # {{{l|]]}}}.  NOTE(review): character class reconstructed, see above.
    relink = re.compile(r"\{\{\{l\|[\[\]']*\}\}\}")
    # Drops everything from the first <noinclude onwards.
    reincl = re.compile(r'<noinclude.*$', re.S)
    # Keeps only the <onlyinclude> body when one is present.
    reonly = re.compile(r'^.*<onlyinclude>(.*)</onlyinclude>.*$', re.S)

    # Language names that override whatever the dump contains.
    overrides = {
        # special case(s)
        'see': 'Seneca',
        # bugs, fixed in next XML, 4.5.10
        'oun': '!O!ung',
        'bdf': 'Biage',
    }

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    for entry in dump.parse():
        title = entry.title
        if title not in candidates:
            continue
        candidates.remove(title)

        # The code is the title without its namespace prefix; the special
        # cases above and the report both work on bare codes.
        code = title[len(TEMPLATE_PREFIX):]

        text = entry.text
        text = relink.sub('', text)
        text = reincl.sub('', text)
        text = reonly.sub(r'\1', text)

        lang = overrides.get(code, text)

        # got one!
        print("%s %s" % (safe(code), safe(lang)))

        langs.add(lang)
        lang_codes.setdefault(lang, []).append(code)

    # not found?
    if candidates:
        print("not found: %s" % repr(candidates))

    # report for AF control file:
    rows = ['\n{| class="prettytable"\n',
            '|-\n| | codes\n| |Language\n']
    for lang in sorted(langs):
        codes = u','.join(sorted(lang_codes[lang], key=skey))
        rows.append("|-\n| " + codes + '||' + lang + '\n')
    rows.append("|}\n")
    report = ''.join(rows)

    wikipedia.setAction('writing AutoFormat language table')

    # write the AutoFormat table page
    # The page object is created outside the try so that it is always bound
    # when the NoPage handler reports on it.
    reportpage = wikipedia.Page(site, 'User:AutoFormat/Languages')
    try:
        # The current text is fetched only to learn whether the page exists;
        # the text itself is not used.
        reportpage.get(sysop=True)
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()