#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/hanform

"""
This bot formats Han entries.

No command line arguments.
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
from mwapi import getwikitext, getedit

def safe(s):
    # pickle the string so non-ASCII titles print safely on the console
    return pickle.dumps(s)

def main():

    # regex table (dict, name = tuple of compiled object and replacement)
    # (NB: the key names here are placeholders, the originals were lost;
    # they only need to be unique, and they appear in the edit summary)
    Regex = { }

    # examples
    # Regex['pagename'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}')
    # Regex['cattag'] = (re.compile(r'\{\{cattag\|'), '{{context|')

    # pad a single-digit additional-strokes value to two digits
    Regex['as 0'] = \
        (re.compile(r'^\{\{Han char(.*)\|as=(\d)([^0-9])', re.M),
         r'{{Han char\1|as=0\2\3')
    # Vietnamese Han character section with no definition line
    Regex['vi Han char'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\((\[\[.+?\]\])\)\n+(\[\[|----)", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n# {{defn|Vietnamese}}\n\n\2')
    # Vietnamese Han character section that already has definition lines
    Regex['vi Han char defs'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\(?(\[\[.+?\]\])\)?\n+#", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n#')
    Regex['ko hanja'] = \
        (re.compile(r'^==Korean==\n+===Hanja===\n+\{\{ko-hanja(.*)\}\}\n+(\[\[|----)', re.M),
         r'==Korean==\n\n===Hanja===\n{{ko-hanja\1}}\n\n# {{defn|Korean}}\n\n\2')
    Regex['zhx middle chinese'] = \
        (re.compile(r"^==Middle Chinese==\n+===Han character===\n+('''.*)\n+(\[\[|----)", re.M),
         r'==Middle Chinese==\n\n===Han character===\n\1\n\n# {{defn|Middle Chinese}}\n\n\2')
    # (format matches entries by an annoying anon IP, still needs serious work)
    Regex['viet cat'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet .+\]\]\n?'), r'')
    Regex['viet cat 2'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet\]\]\n?'), r'')
    Regex['compounds header'] = \
        (re.compile(r'^===Compounds===$', re.M), r'====Compounds====')

    # other regex precomp, not sure what is needed yet
    rehanchar = re.compile(r'\{\{Han char\|.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)')
    rerad = re.compile(r'\|rad=(.)')
    reas = re.compile(r'\|as=(\d\d?)')
    rehanref = re.compile(r'\{\{Han ref\|.*?\}\}')
    reuh = re.compile(r'\|uh=([0-9A-Fa-f]+)')
    revihantu = re.compile(r'\{\{vi-hantu.*?\}\}')

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # get problems list
    page = wikipedia.Page(site, "User:Robert Ullmann/Han/Problems")
    rwp = getwikitext(page)
    reent = re.compile(r'\* +\[\[(.+?)\]\]')

    entries = 0
    probs = 0
    fixed = 0

    for title in reent.findall(rwp):

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except wikipedia.IsRedirectPage:
            print "redirect page? bad!"
            text = ''
        except Exception, e:
            print "exception?", repr(e)
            text = ''
        if not text: continue
        origtext = text

        entries += 1
        if entries % 100 == 0:
            print "%d entries, %d tagged/replaced" % (entries, fixed)

        ishanchar = False
        if len(title) == 1:
            a = ord(title)
            # print "one character entry, code is %x" % a
            if a >= 0x3400 and a < 0xA000: ishanchar = True
            if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True
        # Extension B, in UTF-16, narrow build (although XMLreader/Python lib don't say so):
        if len(title) == 2:
            a = ord(title[0])
            b = ord(title[1])
            if a >= 0xd800 and a < 0xdc00:
                # recombine the surrogate pair into a single code point
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
                if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True

        # extraneous links on the page?:
        if not ishanchar: continue

        print "entry %s" % safe(title)

        # initialize
        radno = 0
        rad = ''
        ast = ''
        acts = set()
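        # A sketch of the wikitext shape the parse below expects; the values
        # are illustrative (U+4F11 休: radical 人, rn=9, 4 additional strokes),
        # and only the rn=, rad=, as= and uh= parameters are read here:
        #
        #   {{Han char|rn=9|rad=人|as=04}}
        #   ...
        #   {{Han ref|uh=4F11}}
        #
        # rad + as ("人04") becomes the radical/stroke sort key computed below.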
        # parse text ... find the Han char template, some params if we can
        mo = rehanchar.search(text)
        if mo:
            hanct = mo.group(0)
            mo = reradno.search(hanct)
            if mo: radno = int(mo.group(1))
            mo = rerad.search(hanct)
            if mo: rad = mo.group(1)
            mo = reas.search(hanct)
            if mo: ast = mo.group(1)
        else: hanct = '(missing)'

        # fix as=, regex will fix entry
        if len(ast) == 1: ast = '0' + ast

        # add entire Translingual section if needed
        if hanct == '(missing)' and '==Translingual==' not in text:
            text = '{{subst:xhan|uh=%X}}\n----\n{{rfc-auto}}\n' % a + text
            acts.add('added Translingual section')
            """
            print "need xhan template!"
            continue
            """

        # check/correct Unicode hex in Han ref
        mo = rehanref.search(text)
        if mo:
            hanref = mo.group(0)
            mo = reuh.search(hanref)
            if mo:
                uht = mo.group(1)
                uh = int(uht, 16)
                # note that we know the characters are in hex range, but:
                if uht != uht.upper(): uh = 0 # replace with upper case
            else:
                uht = '(nil)'
                uh = 0
            # now check, should be the same as the title ordinal
            if uh != a:
                if 'uh=|' in hanref or 'uh=}' in hanref:
                    hanrefnew = hanref.replace('|uh=', '|uh=%X' % a)
                elif 'uh=' in hanref:
                    hanrefnew = hanref.replace('|uh=' + uht, '|uh=%X' % a)
                else:
                    # no uh= at all, splice one in before the closing braces
                    hanrefnew = hanref[:-2] + '|uh=%X}}' % a
                text = text.replace(hanref, hanrefnew)
                acts.add('added/corrected Unicode hex value -%s +%X' % (uht, a))

        # compute sort key:
        if rad and ast: skey = rad + ast
        else: skey = ''
        # that gives us sort= for {{defn}} and rs= for others

        # now do regex, see if we have a substitution
        for rx in Regex:
            newtext = Regex[rx][0].sub(Regex[rx][1], text)
            if newtext != text:
                acts.add(rx)
                text = newtext

        # add sort keys if possible: (skip Japanese because key may be different,
        # can do later; the original list literal was lost, this one covers the
        # languages the replacements above generate)
        if skey:
            for lang in ['Korean', 'Vietnamese', 'Middle Chinese']:
                if '{{defn|' + lang + '}}' in text:
                    text = text.replace('{{defn|' + lang + '}}',
                               '{{defn|' + lang + '|sort=' + skey + '}}')
                    acts.add('add sort keys')

        # add sort key to template(s), vi-hantu for now
        if skey and 'vi-hantu' in text:
            mo = revihantu.search(text)
            if mo and 'rs=' not in mo.group(0):
                newv = mo.group(0)[:-2] + '|rs=' + skey + '}}'
                text = text.replace(mo.group(0), newv)
                acts.add('add sort keys')

        # changes?
        if not acts: continue
        act = ', '.join(acts)

        # some change, write it
        if act: # (redundant)
            fixed += 1
            print "replacing in %s: %s" % (safe(title), safe(act))
            act = "Han format: " + act

            # try to fix the entry
            saved = False
            while not saved:
                try:
                    currtext = getedit(page)
                    if currtext.strip('\n ') != origtext.strip('\n '):
                        print "page changed during edit?"
                        break
                    page.put(text, comment=act)
                    saved = True
                except KeyboardInterrupt:
                    print "keyboard interrupt"
                    return
                except Exception, e:
                    print "exception %s, trying again" % safe(e)

        # limit number of fixes for testing
        # if fixed > 3: break

    print "%d entries, %d fixed" % (entries, fixed)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
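# Usage sketch (an assumption, not stated on the original page): run under the
# old pywikipedia framework with a user-config.py supplying the bot account;
# mwapi (getwikitext/getedit) and xmldate are assumed to be helper modules
# kept alongside this script.
#
#   $ python hanform.py
#
# No command line arguments: the bot reads its worklist from the wiki page
# User:Robert Ullmann/Han/Problems and edits each listed entry in place.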