User:Robert Ullmann/code/hanform

Hello, you have come here looking for the meaning of the word User:Robert Ullmann/code/hanform. In DICTIOUS you will not only get to know all the dictionary meanings for the word User:Robert Ullmann/code/hanform, but we will also tell you about its etymology, its characteristics and you will know how to say User:Robert Ullmann/code/hanform in singular and plural. Everything you need to know about the word User:Robert Ullmann/code/hanform you have here. The definition of the word User:Robert Ullmann/code/hanform will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of User:Robert Ullmann/code/hanform, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/hanform


"""
This bot formats Han entries

No command line arguments.

"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
from mwapi import getwikitext, getedit

def safe(s):
    """Return the pickle serialization of *s*.

    Used by this script to render page titles, edit summaries and
    exception objects before printing them (presumably so that non-ASCII
    text cannot break console output -- confirm).

    Args:
        s: any picklable object.

    Returns:
        Whatever ``pickle.dumps`` produces for *s* with the default protocol.
    """
    pickled = pickle.dumps(s)
    return pickled

def main():
    """Reformat the Han character entries listed on the problems page.

    Fetches the wikitext of "User:Robert Ullmann/Han/Problems", extracts the
    linked titles from its bullet lines and, for every title that is a single
    character in the ranges U+3400..U+9FFF or U+20000..U+2A6D6 (the latter
    also when it arrives as a UTF-16 surrogate pair):

      * prepends a ``{{subst:xhan}}`` Translingual section when the page has
        neither a ``{{Han char|...}}`` template nor a Translingual header;
      * corrects the ``uh=`` (Unicode hex) parameter of ``{{Han ref|...}}``;
      * applies the regex substitution table below;
      * adds radical/stroke sort keys to ``{{defn|...}}`` and ``{{vi-hantu}}``.

    A page is saved only if something changed and its current text still
    matches what was read at the start.  Takes no arguments, returns None.

    NOTE(review): the copy of this script this was restored from had every
    ``[...]`` span stripped (subscripts, slices, regex character classes).
    Places where the lost text could not be recovered with certainty are
    marked NOTE(review) below.
    """

    # regex table (dict, name = tuple of compiled object and replacement);
    # the name is added to the edit summary whenever its pattern changes
    # the text.
    # NOTE(review): the original key names were lost; the names below are
    # descriptive replacements.
    Regex = {}

    # examples
    # Regex['subst PAGENAME'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}')
    # Regex['cattag to context'] = (re.compile(r'\{\{cattag\|'), '{{context|')

    # pad a one-digit additional-strokes value to two digits
    Regex['two-digit as= in Han char'] = \
          (re.compile(r'^\{\{Han char(.*)\|as=(\d)([|}])', re.M),
           r'{{Han char\1|as=0\2\3')

    # Vietnamese section with a bare bold headword and no definition line
    Regex['format Vietnamese section, add defn'] = \
          (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\((\[\[.+?\]\])\)\n+(\[\[|----)", re.M),
           r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n# {{defn|Vietnamese}}\n\n\2')

    # Vietnamese section with a bare bold headword that already has a definition
    Regex['use vi-hantu template'] = \
          (re.compile(r"^==Vietnamese==\n+===Han character===\n+'''..?''' +\(?(\[\[.+?\]\])\)?\n+#", re.M),
           r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n#')

    Regex['add Korean defn'] = \
          (re.compile(r'^==Korean==\n+===Hanja===\n+\{\{ko-hanja(.*)\}\}\n+(\[\[|----)', re.M),
           r'==Korean==\n\n===Hanja===\n{{ko-hanja\1}}\n\n# {{defn|Korean}}\n\n\2')

    Regex['add Middle Chinese defn'] = \
          (re.compile(r"^==Middle Chinese==\n+===Han character===\n+('''.*)\n+(\[\[|----)", re.M),
           r'==Middle Chinese==\n\n===Han character===\n\1\n\n# {{defn|Middle Chinese}}\n\n\2')
    # (format matches entries by annoying IP-anon, still need serious work)

    Regex['remove Category:Viet *'] = \
          (re.compile(r'\[\[[Cc]ategory:Viet .+\]\]\n?'), r'')
    Regex['remove Category:Viet'] = \
          (re.compile(r'\[\[[Cc]ategory:Viet\]\]\n?'), r'')

    Regex['Compounds header to L4'] = \
          (re.compile(r'^===Compounds===$', re.M), r'====Compounds====')

    # other regex precomp
    rehanchar = re.compile(r'\{\{Han char\|.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)')
    rerad = re.compile(r'\|rad=(.)')
    reas = re.compile(r'\|as=(\d\d?)')
    rehanref = re.compile(r'\{\{Han ref\|.*?\}\}')
    # lower-case digits are accepted here so they can be upper-cased below
    reuh = re.compile(r'\|uh=([0-9A-Fa-f]+)')

    revihantu = re.compile(r'\{\{vi-hantu.*?\}\}')

    # languages whose {{defn}} gets a sort key (Japanese is skipped because
    # its key may be different, can do later).
    # NOTE(review): the original list was lost; these are the languages for
    # which this script itself inserts {{defn|...}} -- it may have had more.
    defn_langs = ('Korean', 'Vietnamese', 'Middle Chinese')

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()

    # get problems list
    page = wikipedia.Page(site, "User:Robert Ullmann/Han/Problems")
    rwp = getwikitext(page)

    # bullet line: "* <token> [[title]]", capturing the title.
    # NOTE(review): the character class before '+' was lost (probably hex
    # digits); any run of non-space characters is accepted, and titles that
    # are not Han characters are filtered out below anyway.
    reent = re.compile(r'\* \S+ \[\[(.+?)\]\]')

    entries = 0
    fixed = 0

    # upper bound on save attempts per page so a persistent error (for
    # example a locked page) cannot hang the run
    max_tries = 5

    for title in reent.findall(rwp):

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except wikipedia.IsRedirectPage:
            print("redirect page? bad!")
            text = ''
        except Exception:
            # sys.exc_info() instead of "except ..., e" so the syntax is
            # valid on every Python version
            print("exception? %s" % repr(sys.exc_info()[1]))
            text = ''
        if not text: continue
        origtext = text

        entries += 1
        if entries % 100 == 0:
            print("%d entries, %d tagged/replaced" % (entries, fixed))

        ishanchar = False
        a = 0
        if len(title) == 1:
            a = ord(title)
            if a >= 0x3400 and a < 0xA000: ishanchar = True
            if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True

        # Extension B, in UTF-16, narrow build (although XMLreader/Python Lib don't say so):
        if len(title) == 2:
            a = ord(title[0:1])
            b = ord(title[1:2])
            # combine only a well-formed high/low surrogate pair
            if 0xd800 <= a < 0xdc00 and 0xdc00 <= b < 0xe000:
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
            if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True

        # extraneous links on the page?:
        if not ishanchar: continue

        print("entry %s" % safe(title))

        # initialize
        radno = 0
        rad = ''
        ast = ''
        acts = set()

        # parse text ... find the Han char template, some params if we can
        hanct = None
        mo = rehanchar.search(text)
        if mo:
            hanct = mo.group(0)
            mo = reradno.search(hanct)
            if mo: radno = int(mo.group(1))
            mo = rerad.search(hanct)
            if mo: rad = mo.group(1)
            mo = reas.search(hanct)
            if mo: ast = mo.group(1)

        # fix as, regex will fix entry
        if len(ast) == 1: ast = '0' + ast

        # add entire translingual section if needed
        # (an earlier version skipped such pages with "need xhan template!")
        if hanct is None and '==Translingual==' not in text:
            text = ('{{subst:xhan|uh=%X}}\n----\n{{rfc-auto}}\n' % a) + text
            acts.add('added Translingual section')

        # check/correct Unicode Hex in Han ref
        mo = rehanref.search(text)
        if mo:
            hanref = mo.group(0)
            mo = reuh.search(hanref)
            if mo:
                uht = mo.group(1)
                uh = int(uht, 16)  # note that we know the characters are in hex range
                # but:
                if uht != uht.upper(): uh = 0  # replace with UC
            else:
                uht = '(nil)'
                uh = 0
            # now check, should be the same as title ordinal
            if uh != a:
                if 'uh=|' in hanref or 'uh=}' in hanref:
                    hanrefnew = hanref.replace('|uh=', '|uh=%X' % a)
                elif 'uh=' in hanref:
                    hanrefnew = hanref.replace('|uh=' + uht, '|uh=%X' % a)
                else:
                    # insert before the closing braces, not after them
                    hanrefnew = hanref[:-2] + '|uh=%X}}' % a
                # only record the action if the template really changed,
                # otherwise we would save a null edit with a false summary
                if hanrefnew != hanref:
                    text = text.replace(hanref, hanrefnew)
                    acts.add('added/corrected Unicode hex value -%s +%X' % (uht, a))

        # compute sort key:
        if rad and ast:
            skey = rad + ast
        else:
            skey = ''
        # that gives us sort= for {defn} and rs= for others

        # now do regex, see if we have a substitution
        for rx in Regex:
            pattern, replacement = Regex[rx]
            newtext = pattern.sub(replacement, text)
            if newtext != text:
                acts.add(rx)
                text = newtext

        # add sort keys if possible
        if skey:
            for lang in defn_langs:
                bare = '{{defn|' + lang + '}}'
                if bare in text:
                    text = text.replace(bare, '{{defn|' + lang + '|sort=' + skey + '}}')
                    acts.add('add sort keys')

        # add sort key to template(s), vi-hantu for now
        if skey and 'vi-hantu' in text:
            mo = revihantu.search(text)
            if mo and 'rs=' not in mo.group(0):
                # drop the closing braces, append the parameter, close again
                newv = mo.group(0)[:-2] + '|rs=' + skey + '}}'
                text = text.replace(mo.group(0), newv)
                acts.add('add sort keys')

        # changes?
        if not acts: continue
        # sorted so the edit summary is the same from run to run
        act = ', '.join(sorted(acts))

        # some change, write it
        fixed += 1

        print("replacing in %s: %s" % (safe(title), safe(act)))

        act = "Han format: " + act

        # try to fix the entry
        for attempt in range(max_tries):
            try:
                currtext = getedit(page)
                if currtext.strip('\n ') != origtext.strip('\n '):
                    print("page changed during edit?")
                    break
                page.put(text, comment=act)
                break
            except KeyboardInterrupt:
                print("keyboard interrupt")
                return
            except Exception:
                print("exception %s, trying again" % safe(sys.exc_info()[1]))
        else:
            print("giving up on %s after %d attempts" % (safe(title), max_tries))

        # limit number of fixes for testing
        # if fixed > 3: break

    print("%d entries, %d fixed" % (entries, fixed))

    # done

    # done

# Script entry point: run the bot only when executed directly, not on import.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs whether main() returns normally or raises.
        # NOTE(review): presumably releases this bot's slot in the
        # framework's throttle bookkeeping -- confirm against pywikipedia.
        wikipedia.stopme()