#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/hanform

"""
This bot formats Han entries.

No command line arguments.
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
from mwapi import getwikitext, getedit

def safe(s):
    # pickle the string so non-ASCII titles print safely on the console
    return pickle.dumps(s)

def main():

    # regex table (dict, name = tuple of compiled object and replacement)
    # (NB: the key names here are placeholders, the originals were lost;
    # they only need to be unique, and they appear in the edit summary)
    Regex = { }

    # examples
    # Regex['pagename'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}')
    # Regex['cattag'] = (re.compile(r'\{\{cattag\|'), '{{context|')

    # pad a single-digit additional-strokes value to two digits
    Regex['as 0'] = \
        (re.compile(r'^\{\{Han char(.*)\|as=(\d)([^0-9])', re.M),
         r'{{Han char\1|as=0\2\3')
    # Vietnamese Han character section with no definition line
    Regex['vi Han char'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\((\[\[.+?\]\])\)\n+(\[\[|----)", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n# {{defn|Vietnamese}}\n\n\2')
    # Vietnamese Han character section that already has definition lines
    Regex['vi Han char defs'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\(?(\[\[.+?\]\])\)?\n+#", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n#')
    Regex['ko hanja'] = \
        (re.compile(r'^==Korean==\n+===Hanja===\n+\{\{ko-hanja(.*)\}\}\n+(\[\[|----)', re.M),
         r'==Korean==\n\n===Hanja===\n{{ko-hanja\1}}\n\n# {{defn|Korean}}\n\n\2')
    Regex['zhx middle chinese'] = \
        (re.compile(r"^==Middle Chinese==\n+===Han character===\n+('''.*)\n+(\[\[|----)", re.M),
         r'==Middle Chinese==\n\n===Han character===\n\1\n\n# {{defn|Middle Chinese}}\n\n\2')
    # (format matches entries by an annoying anon IP, still needs serious work)
    Regex['viet cat'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet .+\]\]\n?'), r'')
    Regex['viet cat 2'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet\]\]\n?'), r'')
    Regex['compounds header'] = \
        (re.compile(r'^===Compounds===$', re.M), r'====Compounds====')

    # other regex precomp, not sure what is needed yet
    rehanchar = re.compile(r'\{\{Han char\|.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)')
    rerad = re.compile(r'\|rad=(.)')
    reas = re.compile(r'\|as=(\d\d?)')
    rehanref = re.compile(r'\{\{Han ref\|.*?\}\}')
    reuh = re.compile(r'\|uh=([0-9A-Fa-f]+)')
    revihantu = re.compile(r'\{\{vi-hantu.*?\}\}')

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # get problems list
    page = wikipedia.Page(site, "User:Robert Ullmann/Han/Problems")
    rwp = getwikitext(page)
    reent = re.compile(r'\* +\[\[(.+?)\]\]')

    entries = 0
    probs = 0
    fixed = 0

    for title in reent.findall(rwp):

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except wikipedia.IsRedirectPage:
            print "redirect page? bad!"
            text = ''
        except Exception, e:
            print "exception?", repr(e)
            text = ''
        if not text: continue
        origtext = text

        entries += 1
        if entries % 100 == 0:
            print "%d entries, %d tagged/replaced" % (entries, fixed)

        ishanchar = False
        if len(title) == 1:
            a = ord(title)
            # print "one character entry, code is %x" % a
            if a >= 0x3400 and a < 0xA000: ishanchar = True
            if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True
        # Extension B, in UTF-16, narrow build (although XMLreader/Python lib don't say so):
        if len(title) == 2:
            a = ord(title[0])
            b = ord(title[1])
            if a >= 0xd800 and a < 0xdc00:
                # recombine the surrogate pair into a single code point
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
                if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True

        # extraneous links on the page?:
        if not ishanchar: continue

        print "entry %s" % safe(title)

        # initialize
        radno = 0
        rad = ''
        ast = ''
        acts = set()
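        # A sketch of the wikitext shape the parse below expects; the values
        # are illustrative (U+4F11 休: radical 人, rn=9, 4 additional strokes),
        # and only the rn=, rad=, as= and uh= parameters are read here:
        #
        #   {{Han char|rn=9|rad=人|as=04}}
        #   ...
        #   {{Han ref|uh=4F11}}
        #
        # rad + as ("人04") becomes the radical/stroke sort key computed below.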
        # parse text ... find the Han char template, some params if we can
        mo = rehanchar.search(text)
        if mo:
            hanct = mo.group(0)
            mo = reradno.search(hanct)
            if mo: radno = int(mo.group(1))
            mo = rerad.search(hanct)
            if mo: rad = mo.group(1)
            mo = reas.search(hanct)
            if mo: ast = mo.group(1)
        else: hanct = '(missing)'

        # fix as=, regex will fix entry
        if len(ast) == 1: ast = '0' + ast

        # add entire Translingual section if needed
        if hanct == '(missing)' and '==Translingual==' not in text:
            text = '{{subst:xhan|uh=%X}}\n----\n{{rfc-auto}}\n' % a + text
            acts.add('added Translingual section')
            """
            print "need xhan template!"
            continue
            """

        # check/correct Unicode hex in Han ref
        mo = rehanref.search(text)
        if mo:
            hanref = mo.group(0)
            mo = reuh.search(hanref)
            if mo:
                uht = mo.group(1)
                uh = int(uht, 16)
                # note that we know the characters are in hex range, but:
                if uht != uht.upper(): uh = 0 # replace with upper case
            else:
                uht = '(nil)'
                uh = 0
            # now check, should be the same as the title ordinal
            if uh != a:
                if 'uh=|' in hanref or 'uh=}' in hanref:
                    hanrefnew = hanref.replace('|uh=', '|uh=%X' % a)
                elif 'uh=' in hanref:
                    hanrefnew = hanref.replace('|uh=' + uht, '|uh=%X' % a)
                else:
                    # no uh= at all, splice one in before the closing braces
                    hanrefnew = hanref[:-2] + '|uh=%X}}' % a
                text = text.replace(hanref, hanrefnew)
                acts.add('added/corrected Unicode hex value -%s +%X' % (uht, a))

        # compute sort key:
        if rad and ast: skey = rad + ast
        else: skey = ''
        # that gives us sort= for {{defn}} and rs= for others

        # now do regex, see if we have a substitution
        for rx in Regex:
            newtext = Regex[rx][0].sub(Regex[rx][1], text)
            if newtext != text:
                acts.add(rx)
                text = newtext

        # add sort keys if possible: (skip Japanese because key may be different,
        # can do later; the original list literal was lost, this one covers the
        # languages the replacements above generate)
        if skey:
            for lang in ['Korean', 'Vietnamese', 'Middle Chinese']:
                if '{{defn|' + lang + '}}' in text:
                    text = text.replace('{{defn|' + lang + '}}',
                               '{{defn|' + lang + '|sort=' + skey + '}}')
                    acts.add('add sort keys')

        # add sort key to template(s), vi-hantu for now
        if skey and 'vi-hantu' in text:
            mo = revihantu.search(text)
            if mo and 'rs=' not in mo.group(0):
                newv = mo.group(0)[:-2] + '|rs=' + skey + '}}'
                text = text.replace(mo.group(0), newv)
                acts.add('add sort keys')

        # changes?
        if not acts: continue
        act = ', '.join(acts)

        # some change, write it
        if act: # (redundant)
            fixed += 1
            print "replacing in %s: %s" % (safe(title), safe(act))
            act = "Han format: " + act

            # try to fix the entry
            saved = False
            while not saved:
                try:
                    currtext = getedit(page)
                    if currtext.strip('\n ') != origtext.strip('\n '):
                        print "page changed during edit?"
                        break
                    page.put(text, comment=act)
                    saved = True
                except KeyboardInterrupt:
                    print "keyboard interrupt"
                    return
                except Exception, e:
                    print "exception %s, trying again" % safe(e)

        # limit number of fixes for testing
        # if fixed > 3: break

    print "%d entries, %d fixed" % (entries, fixed)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
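# Usage sketch (an assumption, not stated on the original page): run under the
# old pywikipedia framework with a user-config.py supplying the bot account;
# mwapi (getwikitext/getedit) and xmldate are assumed to be helper modules
# kept alongside this script.
#
#   $ python hanform.py
#
# No command line arguments: the bot reads its worklist from the wiki page
# User:Robert Ullmann/Han/Problems and edits each listed entry in place.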