User:Robert Ullmann/code/xhan
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/xhan
"""
This bot checks Han character entries in en-wikt.xml and writes report pages for each row.
No command line arguments.
Generates (replaces) User:Robert Ullmann/Han/(hexcode) for each row
Generates a problems summary (to-do list)
"""
import wikipedia
import xmlreader
import sys
import re
import pickle
from getwikitext import getwikitext
import xmldate
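
# safe() returns the ASCII pickle text of a unicode string (non-ASCII characters
# come out as \uXXXX escapes), so report text containing Han characters can be
# printed on any console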
def safe(s):
    return pickle.dumps(s)[1:-5]
def main():

    probonly = False
    recheck = False
    for arg in sys.argv[1:]:
        if arg.startswith('-probonly'):
            probonly = True
            print "only updating problems"
        elif arg.startswith('-recheck'):
            recheck = True
            print "rechecking entries from current DB"
        else: print "unknown command line argument %s" % arg
    # report dictionary
    enwikt = { 0:"blank" }

    # problems, index is character code number, value is char + text of problem
    problems = { }

    # languages recognized at level 2 headers (with codes), and allowed L3 / L4 headers
    Lang = {'Translingual':'Han char', 'Cantonese':'yue', 'Japanese':'ja', 'Korean':'ko', 'Mandarin':'cmn',
            'Min Nan':'nan', 'Hakka':'hak', 'Gan':'gan', 'Jinyu':'cjy', 'Min Bei':'mnp', 'Min Dong':'cdo',
            'Min Zhong':'czo', 'Wu':'wuu', 'Xiang':'hsn', 'Vietnamese':'vi', 'Chinese':'zh',
            'Old Chinese':'och', 'Middle Chinese':'???', 'Zhuang':'za', 'Old Korean':'oko' }
    L3 = set(['Hanzi', 'Kanji', 'Hanja', 'Han character', 'Pronunciation', 'Proper noun', 'Pronoun',
              'Noun', 'Verb', 'Adjective', 'Number', 'Counter', 'Particle', 'Prefix', 'Suffix', 'Affix', 'Adverb',
              'Etymology', 'Etymology 1', 'Etymology 2', 'Etymology 3', 'Etymology 4',
              'Related terms', 'Derived terms', 'Usage notes', 'External links', 'See also',
              'Alternative spellings', 'Alternative forms', 'Preposition', 'Adnominal',
              'References', 'Interjection', 'Measure word', 'Conjunction' ])
    L4 = set(['Compounds', 'References', 'Readings', 'Derived terms', 'Related terms', 'Antonyms',
              'Usage notes', 'Synonyms', 'See also', 'Descendants' ])
    # template list, these are the templates that are used in one specific language and section
    # and should always appear in that section. used to build Tdict and Require
    Tlist = [ ('Han char', 'Translingual', 'Han character'),
              ('Han ref', 'Translingual', 'Han character'),
              ('cmn-hanzi', 'Mandarin', 'Hanzi'),
              ('nan-hanzi', 'Min Nan', 'Hanzi'),
              ('yue-hanzi', 'Cantonese', 'Hanzi'),
              ('ja-kanji', 'Japanese', 'Kanji'),
              ('ko-hanja', 'Korean', 'Hanja'),
              ('vi-hantu', 'Vietnamese', 'Han character') ]
    # dictionary of templates, built from above
    Tdict = {}
    for t, l, s in Tlist:
        Tdict[t] = (l, s)
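    # e.g. Tdict['ja-kanji'] is ('Japanese', 'Kanji'): the ja-kanji template belongs
    # in the Kanji section of the Japanese language section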
    # checklist requirements. for each of the first in the tuple, the second must exist, or the entry
    # is in error
    Require = [ ('entry', 'Translingual'),
                ('Translingual', 'Translingual Han character section'),
                ('Mandarin', 'Mandarin Hanzi section'),
                ('Min Nan', 'Min Nan Hanzi section'),
                ('Cantonese', 'Cantonese Hanzi section'),
                ('Japanese', 'Japanese Kanji section'),
                ('Korean', 'Korean Hanja section'),
                ('Vietnamese', 'Vietnamese Han character section') ]

    # add templates to requirements
    for t, l, s in Tlist:
        Require.append( (l + ' ' + s + ' section', t + ' template in ' + l + ' ' + s + ' section') )
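    # e.g. the ja-kanji entry in Tlist adds the requirement
    # ('Japanese Kanji section', 'ja-kanji template in Japanese Kanji section')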
    # regex precomp
    rehanchar = re.compile(r'\{\{Han char.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)[|}]')
    rerad = re.compile(r'\|rad=(.)[|}]')
    reas = re.compile(r'\|as=(\d\d)[|}]')
    rehanref = re.compile(r'\{\{Han ref.*?\}\}')
    reuh = re.compile(r'\|uh=(\w+)[|}]')
    reud = re.compile(r'\|ud=(\d+)[|}]')

    # header, will treat L1 as a special case
    reheader = re.compile(r'(={2,6})\s*(.+?)={2,6}(.*)')
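    # e.g. for '===Noun===' reheader gives group(1) '===', group(2) the header text
    # (stripped at the point of use), and group(3) any junk after the closing '='s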
    retemplate = re.compile(r'\{\{([-a-zA-Z ]+)[\}\|]')
    # templater allows for '* ' before Han ref ...
    retemplater = re.compile(r'\*? ?\{\{(Han ref)[\}\|]')
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('Han character report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")
    print "reading XML dump from %s" % xmldate.enXMLdate

    entries = 0
    hanchars = 0
    kprobs = 0
    for entry in dump.parse():
        text = entry.text
        title = entry.title
        entries += 1
        if entries % 5000 == 0: print "%d entries, %d characters" % (entries, hanchars)

        # figure out if it is a Han character entry:
        ishanchar = False
        if len(title) == 1:
            a = ord(title[0:1])
            #print "one character entry, code is %x" % a
            if a >= 0x3400 and a < 0xA000: ishanchar = True
            if a > 0x4BD5 and a < 0x4E00: ishanchar = False # I Ching characters
        # Extension B, in UTF-16 (although XMLreader/Python Lib don't say so):
        if len(title) == 2:
            a = ord(title[0:1])
            b = ord(title[1:2])
            if a >= 0xd800 and a < 0xdc00:
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
                if a >= 0x20000 and a <= 0x2A6D6: ishanchar = True
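                # surrogate pair arithmetic: each surrogate carries 10 bits, e.g. U+20000
                # is stored as D840 DC00, and (0xD840-0xD800)*1024 + (0xDC00-0xDC00) + 0x10000 = 0x20000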
        if not ishanchar: continue
        hanchars += 1
        # do this twice if needed, first with XML, then with current entry if recheck
        rc = True
        reread = recheck
        while rc:
            rc = False

            #u = unichr(i)
            #ucchar = u.encode("UTF-8")
            ucs = '%X' % a
            #title = "&#%d;" % a
            han = '[[' + title + ']]'

            # initialize
            Checklist = set(['entry'])
            simple = ''
            defn = ''
            ex = ''
            wlinkfound = False
            deffound = 0
            inlevel2 = 0
            currlang = ''
            current3 = ''
            langfound = 0
            l3found = 0
            extra = 0
            detail = True
            MR = ''
            Yale = ''
            # first find Han char and Han ref templates, check a few things
            mo = rehanchar.search(text)
            if mo:
                hanct = mo.group(0)
                mo = reradno.search(hanct)
                if mo: radno = int(mo.group(1))
                else: ex += 'Radical number missing\n'
                mo = rerad.search(hanct)
                if mo: rad = mo.group(1)
                else: ex += 'Radical missing\n'
                mo = reas.search(hanct)
                if mo: ast = int(mo.group(1))
                else: ex += 'Additional strokes parameter missing or incorrect\n'
            # else: ex += 'Han char template missing\n'
            mo = rehanref.search(text)
            if mo:
                hanref = mo.group(0)
                mo = reud.search(hanref)
                if mo:
                    ud = int(mo.group(1), 10)
                    if ud != a: ex += 'Unicode decimal value incorrect\n'
                mo = reuh.search(hanref)
                if mo:
                    uh = int(mo.group(1), 16)
                    if uh != a: ex += 'Unicode hex value incorrect\n'
                else: ex += 'Unicode hex value missing\n'
            # else: ex += 'Han ref template missing\n'
            # now parse text line-by-line ...
            for line in text.splitlines():
                # print "line len is %d" % len(line)
                if line[0:1] == '#':
                    deffound = 1
                    if line.find('[[') > 0: wlinkfound = True
                    lang = line.partition('{{defn|')[2]
                    if lang != '':
                        lang = lang.split('|')[0]
                        lang = lang.split('}')[0]
                        if lang in Lang: defn += ', ' + Lang[lang]
                        else: defn += ', ' + lang
                    elif simple == '': simple = line[1:140]
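                # e.g. '# {{defn|Mandarin}}' appends ', cmn' to defn; the first plain
                # definition line instead supplies the "simple meaning" for the report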
                # look for indicators of un-revised format
                if detail:
                    #if line.find('total strokes index') > 0: ex += "NanshuBot header not formatted\n"
                    if line.find('Penkyamp') > 0: ex += "Chinese hanzi not formatted\n"
                    if line.find('McCune-Reischauer') > 0: ex += "Korean not formatted\n"
                    if line.find('Morohashi') > 0: ex += "References not formatted\n"
                if line[0:1] == '=' and line[1:2] != '=':
                    ex += "Level one header\n"
                    continue
                mo = reheader.match(line)
                if mo:
                    header = mo.group(2).strip()
                    level = len(mo.group(1))
                    if mo.group(3): ex += "Stuff after %s header\n" % header
                else: level = 0
                # check headers by level
                if level == 4:
                    if header not in L4 and header not in L3:
                        if detail: ex += "L4 header: %s\n" % header
                    # multiple etymologies:
                    if header not in L4 and header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')
                if level == 3:
                    if header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')
                    else:
                        if detail: ex += "L3 header: %s\n" % header
                        current3 = ''
                # if level is two, close L2 section
                if level == 2:
                    current3 = ''
                    if inlevel2 == 1:
                        if detail: ex += "Missing ---- to end %s section\n" % currlang

                # check, pick up new language
                if level == 2:
                    inlevel2 = 1
                    if header in Lang: newlang = header
                    else: newlang = ''
                    if newlang != '':
                        if newlang != 'Translingual': langfound = 1
                        if currlang != '':
                            # check current lang for order
                            if newlang == 'Translingual': ex += '%s before Translingual\n' % currlang
                            elif currlang != 'Translingual':
                                if currlang == newlang: ex += 'two sections for %s\n' % currlang
                                if currlang > newlang: ex += '%s out of order\n' % currlang
                    else:
                        ex += "L2 header: %s\n" % header
                        detail = False
                    # in order, or not, current is language if valid
                    currlang = newlang
                    l3found = 0
                    current3 = ''
                    deffound = 0
                    Checklist.add(currlang)
                # templates
                mo = retemplate.match(line)
                if not mo: mo = retemplater.match(line) # "* {{Han ref..." case
                if mo:
                    t = mo.group(1).strip()
                    if t in Tdict:
                        l, s = Tdict[t]
                        if currlang != l: ex += "Template %s not in %s section\n" % (t, l)
                        elif current3 != s: ex += "Template %s not in %s section\n" % (t, s)
                        # (if error, harmless to add to checklist)
                        Checklist.add(t + ' template in ' + l + ' ' + s + ' section')

                # don't require Korean Hanja section on kwukyel notes, should refer to that in Han defn:
                if line.startswith('#') and 'kwukyel' in line:
                    Checklist.add("Korean Hanja section")
                    Checklist.add("ko-hanja template in Korean Hanja section")
                # random things, cruft:
                if detail:
                    if inlevel2 and line[0:5] == "* '''":
                        ex += "Cruft: %s\n" % line[2:]
                    if "Template:substub" in line:
                        ex += "substub template\n"

                # Korean, new format:
                if line.find('ko-hanja') > 0:
                    if line.find('|mr=') > 0: MR = re.sub(r'.*\|mr=(.*?)[|}].*', r'\1', line)
                    if line.find('|y=') > 0: Yale = re.sub(r'.*\|y=(.*?)[|}].*', r'\1', line)
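                # the MR (McCune-Reischauer) and Yale readings are taken from the mr= and
                # y= parameters of {{ko-hanja}}, and cross-checked at the end of the entry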
                # line across, exit level 2
                if line[0:4] == '----':
                    if inlevel2 == 0:
                        if extra == 0: ex += 'Extraneous ----\n'
                        detail = False
                    inlevel2 = 0
                    if not l3found and detail: ex += "No L3 header in %s section\n" % currlang
                    if not deffound and detail: ex += "No definition line for %s\n" % currlang
                elif inlevel2 == 0:
                    # only other text allowed is templates or blank lines
                    if len(line) > 1:
                        if line[0:2] != '{{':
                            if extra == 0:
                                if detail: ex += "Extraneous text not in L2 section\n"
                                extra = 1

                # enough already!
                if detail and len(ex) > 200:
                    detail = False
                    ex += '...more...\n'
            # end for line

            # end of entry
            if detail:
                # close last section, should be in level 2, exit
                if inlevel2 == 0: ex += 'Extraneous ---- at end\n'
                else:
                    if l3found == 0: ex += "No L3 header in %s section\n" % currlang
                    if deffound == 0: ex += "No definition line for %s\n" % currlang

            # even if no detail, report bad Korean Yale
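            # the two romanizations pair vowels differently (MR o-breve ~ Yale e,
            # MR e ~ Yale ey, MR ae ~ Yale ay); if the MR and Yale readings disagree,
            # the Yale value was most likely entered with the wrong mapping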
            if Yale:
                yf = Yale
                if MR.find(u'y\u014f') >= 0 and Yale.find('ey') >= 0:
                    yf = re.sub('ey', 'ye', yf)
                    yf = re.sub('yye', 'yey', yf)
                elif MR.find(u'he') >= 0 and Yale.find('ye') >= 0: yf = re.sub('ye', 'ey', yf)
                if MR.find(u'ya') >= 0 and Yale.find('ay') >= 0: yf = re.sub('ay', 'ya', yf)
                elif MR.find(u'ae') >= 0 and Yale.find('ya') >= 0: yf = re.sub('ya', 'ay', yf)
                if MR.find(u"ch'e") >= 0 and Yale.find('chye') >= 0: yf = re.sub('chye', 'chey', yf)
                if MR.find(u'ke') >= 0 and Yale.find('kye') >= 0: yf = re.sub('kye', 'key', yf)
                if MR.find(u'se') >= 0 and Yale.find('sye') >= 0: yf = re.sub('sye', 'sey', yf)
                if MR.find(u're') >= 0 and Yale.find('lye') >= 0: yf = re.sub('lye', 'ley', yf)
                if MR.find(u'ne') >= 0 and Yale.find('nye') >= 0: yf = re.sub('nye', 'ney', yf)
                if MR.find(u'pe') >= 0 and Yale.find('pye') >= 0: yf = re.sub('pye', 'pey', yf)
                if MR == 'e' and Yale == 'ye': yf = 'ey'
                if yf != Yale:
                    ex += "Korean Yale %s should be %s\n" % (Yale, yf)
                    kprobs += 1
            # run checklist (regardless of detail for now)
            for r, i in Require:
                if r in Checklist and i not in Checklist: ex += i + ' missing\n'
            # if there was a problem, reread from current DB?
            if ex and reread:
                page = wikipedia.Page(site, title)
                print "Re-reading character %X" % a
                try:
                    # text = page.get()
                    text = getwikitext(site, page)
                    rc = True
                    reread = False
                    continue # go back to top once more
                except wikipedia.NoPage:
                    print "can't read current page?"
                except wikipedia.IsRedirectPage:
                    print "redirect page?"
            # add to problems
            if ex: problems[a] = han + ' ' + re.sub('\n', ', ', ex)[0:-2]
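            # ex uses one line per exception; the substitution turns it into a
            # comma-separated list and [0:-2] drops the trailing ', '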
            # more details, not reported in problem punchlist
            if detail:
                if simple and not wlinkfound: ex += "No wikilink in any definition found\n"
                if langfound == 0: ex += "No language section found\n"

            # fixups
            if defn[0:1] == ',': defn = defn[2:]

            # store report line
            enwikt[a] = '|-\n| ' + ucs + ' || ' + han + ' || ' + simple + ' || ' + defn + ' || ' + ex + '\n'
            print "Character %X %s" % (a, safe(ex))
print "%d Korean Yale problems" % kprobs
print "%d total problems" % len(problems)
print "%d entries, %d characters, writing reports" % (entries, hanchars)
# write report pages
report = '\nProblems as of ' + xmldate.enXMLdate
report += ', keep in mind while fixing entries that the check, rather than the entry, may be wrong.\n\n'
for c in sorted(problems):
report += '* %X ' % c + problems[c] + '\n'
report += '\n%d problems\n\n' % len(problems)
    # report page
    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/Problems')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()
        oldreport = ''

    # file the report
    if report.strip(' \n') != oldreport.strip(' \n'): reportpage.put(report)

    if probonly: return # we are done
    for si in range(0x3400, 0x2B000, 256):
        validentry = False
        # save some time, skip the unassigned gap between the BMP Han blocks and Extension B
        if si > 0xA000 and si < 0x20000: continue
        # blank and re-initialize
        report = '\nSummary of checks on Han character entries from UCS hex ' + "%X"%si + ' to ' + "%X"%(si+255)
        report += ', run on ' + xmldate.enXMLdate + ' XML dump of the en.wikt.\n\n'
        report += 'This is one row (sometimes called a block) of the Unified Han characters; '
        report += 'see the ' + "%X"%si + ' Unihan database for this row.\n\n'
        report += 'Notes:\n'
        report += '* The simple meaning shown is just the first # definition line in the entry, regardless of language.\n'
        report += '* Exceptions may not be errors, rather things that did not "pass" rather simple checks; some less-used level 4 headers, etc. may show up.\n'
        report += '* Some exceptions may mask others; for example, if the horizontal rule ending a section is reported missing, missing POS headers or definitions in that section will not be reported.\n'
        report += '* A major error (bad L2 header) will cause details to be suppressed, as will simply too many exceptions.\n'
        report += '* Cruft refers to the format, not the content!\n'
        report += "* This page is generated by 'bot code, and is completely over-written on each run, so it isn't very useful to edit it.\n"
        # table header (reconstructed: column titles are guesses matching the row format stored in enwikt)
        report += '\n{| border="1" cellpadding="2"\n'
        report += '! code !! character !! simple meaning !! defn !! exceptions\n'
        for i in range(si, si+256):
            if i in enwikt:
                line = enwikt[i]
                validentry = True
                report += line
            # else: line = '|-\n| ' + '%X'%i + ' || ' + "&#%d;"%i + ' || || || (entry not found)\n'
            # last rows of Han, Ext A, Ext B
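            # these are the last assigned code points of the URO, Extension A,
            # and Extension B ranges respectively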
            if i == 0x9FA5: break
            if i == 0x4DB5: break
            if i == 0x2A6D6: break
        report += '|}\n'
        if not validentry: continue
        # report page
        try:
            reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/%X' % si)
            oldreport = reportpage.get()
        except wikipedia.NoPage:
            print "No present report for %s" % reportpage.aslink()
            oldreport = ''
        if report.strip(' \n') == oldreport.strip(' \n'):
            print "No change to report for %s" % reportpage.aslink()
            continue
        wikipedia.showDiff(oldreport, report)
        # file the report
        reportpage.put(report)
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()