#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:AutoFormat/code/contexts

"""
This code looks for contexts formatted as ('' '') on definition lines

Writes AF control file, context templates list, context exceptions list

No command line arguments.

writes report
"""

# NOTE(review): this module was recovered from a wiki-rendered copy in which
# line breaks were collapsed and bracketed text ([...] subscripts, slices and
# regex character classes) was stripped.  Every spot that had to be
# reconstructed is marked NOTE(review) below; confirm each one against the
# original source before running the bot.

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
from mwapi import getwikitext


def safe(s):
    """Return an ASCII-only rendering of *s* suitable for console output.

    The value is pickled and the pickle framing is sliced off, leaving the
    escaped payload.
    """
    ss = pickle.dumps(s)
    l = len(ss)
    # NOTE(review): the recovered text read ``return ss`` with ``l`` unused.
    # The slice is reconstructed on the assumption of Python 2's default
    # protocol 0, where a unicode value pickles as 'V<escaped text>\np0\n.'
    # (1 leading and 5 trailing framing characters) -- confirm.
    return ss[1:l-5]


# parse spec

# NOTE(review): all character classes in the patterns below were lost in
# extraction (e.g. ``topcat=(+)\|``).  ``[^\|]`` is the presumed original for
# the simple "value up to the next pipe" groups; the classes in relab1, relab2,
# recc and relang are best guesses -- confirm every pattern.
relab1 = re.compile(r'label=([^\|\[]*)\[\[([^\|\]]+)\]\]([^\|]*)\|')
relab2 = re.compile(r'label=([^\|\]]+)\|')
retc = re.compile(r'topcat=([^\|]+)\|')
repc = re.compile(r'poscat=([^\|]+)\|')
rerc = re.compile(r'regcat=([^\|]+)\|')
# NOTE(review): group 1 was recovered as an empty ``()``; a one-character
# separator class is assumed so that ``tcat=`` is left for retcat -- least
# certain reconstruction in this file.
recc = re.compile(r'([ \|])cat=([^\|]+)\|')
relang = re.compile(
    r'lang=\{\{#if:\{\{\{lang\|}}}\|\{\{\{lang}}}\|([^}]+)}}\|')
respace = re.compile(r'_\|')
retcat = re.compile(r'tcat=([^\|]+)\|')

# Leading text of a context-label template's first line, and the trailing tag,
# both removed before the parameters are rewritten.
_SPEC_PREFIX = '{{context {{{sub|}}}|'
_SPEC_SUFFIX = '<noinclude>'


def respec(spec):
    """Rewrite the first line of a context-label template as a readable spec.

    *spec* is the template's first line.  Known parameters (label=, topcat=,
    poscat=, regcat=, cat=, the default-language #if construct, and a trailing
    underscore) are replaced with descriptive phrases separated by ', '.
    A line that does not contain '{{context' becomes
    "(not a context label)".  If any '{' survives the rewriting, the result
    is wrapped in nowiki tags so it displays literally in the report.
    """
    # NOTE(review): both slices were lost in extraction (the recovered code
    # was the no-op ``spec = spec``); stripping the matched prefix/suffix is
    # the presumed intent.
    if spec.startswith(_SPEC_PREFIX):
        spec = spec[len(_SPEC_PREFIX):]
    elif '{{context' not in spec:
        spec = "(not a context label)"
    if spec.endswith(_SPEC_SUFFIX):
        spec = spec[:-len(_SPEC_SUFFIX)]

    # every pattern expects a terminating pipe after the parameter value
    spec += '|'

    # NOTE(review): replacement recovered as r'label:\1]\3, '; the link
    # markup around \2 is reconstructed.
    spec = relab1.sub(r'label:\1[[\2]]\3, ', spec)
    spec = relab2.sub(r'label:\1, ', spec)
    spec = retc.sub(r'topic category:\1, ', spec)
    spec = repc.sub(r'POS category:\1, ', spec)
    spec = rerc.sub(r'regional category:\1, ', spec)
    spec = recc.sub(r'\1category:\2, ', spec)
    spec = relang.sub(r'default language:\1, ', spec)
    spec = respace.sub(r'(space), ', spec)

    spec = spec.strip(' ,|')

    # NOTE(review): the closing tag was recovered as an empty string; an
    # unbalanced opening tag would break the report table, so the matching
    # close is restored.
    if '{' in spec:
        spec = '<nowiki>' + spec + '</nowiki>'

    return spec
def main():
# make sure we are logged in site = wikipedia.getSite() site.forceLogin() wikipedia.setAction('writing report')
# get XML dump dump = xmlreader.XmlDump("en-wikt.xml")
entries = 0 words = 0 ctxs = 0
Contexts = { } Examples = { } Templates = { } Redirs = { } First = { } Specs = { } Date = { } Cats = { } Bad = { }
recontext = re.compile(r"^# *(\(|\)|\{\{italbrac\|)(.+?)(\(|\)|}})", re.M) reredir = re.compile(r"#redirect\s*\\]", re.I) recats = re.compile(r"\]*)", re.I) reiwiki = re.compile(r'\{2,9}:.+\]\]')
for entry in dump.parse(): text = entry.text title = entry.title
entries += 1 if entries % 10000 == 0: print "%d entries, %d words, %d contexts" % (entries, words, ctxs)
# skip redirects, unless Templates if text and text == '#': mo = reredir.match(text) if mo: Redirs] = mo.group(1) print "redirect: %s to %s" % (safe(title), safe(mo.group(1))) continue
# look for templates if title.startswith('Template:') and text.find('{{context') >= 0: # but not the templates themselves! if title.find('/') >= 0: continue if title.find('context') >= 0: continue if title.find('checklabel') >= 0: continue if title.find('pos-') >= 0: continue tname = title Templates = tname First = text.splitlines() Date = xmldate.enXMLdate # sans year
print "template: ", safe(tname)
spec = respec(First) # if spec is "bad" (contains { is a good indication) pick up current, also other stuff
bad = False if '{' in spec or '}' in spec: bad = True if reiwiki.search(text): bad = True if '}}\n<noinc' in text: bad = True
if bad: oldtext = text print ' getting current version' try: page = wikipedia.Page(site, 'Template:' + tname) # text = page.get(sysop = True) # for protected pages text = getwikitext(page) except wikipedia.NoPage: print "Can't get %s from en.wikt" % safe(page.aslink()) text = Date = "can't access" except wikipedia.IsRedirectPage, target: print "Page %s is now a redirect" % safe(page.aslink()) text = Date = '9 July' First = u'redirect to ' + target if not text: text = oldtext else: First = text.splitlines() Date = '9 July' spec = respec(First)
cats = # extract tcat from spec mo = retcat.search(spec + '|') if mo: spec = retcat.sub(, spec + '|').strip('|') cats += mo.group(1).capitalize() + ' context labels, '
# explicit cats for cat in recats.findall(text): cats += cat + ', ' Cats = cats.strip(', ') if cats: print " cats %s" % safe(Cats)
# trouble Bad = # look for iwikis, to report: for iw in reiwiki.findall(text): Bad += ', bad iwiki: ' + iw + '' print " bad iwiki ", safe(iw) if '}}\n<noinc' in text: Bad += ', extra line break'
Specs = spec
# skip non main-name-space
if title.find(':') >= 0: continue
words += 1
# if entries > 5000: break
# parse text ...
for ctxtup in recontext.findall(text):
ctxstr = ctxtup
for ctx in ctxstr.split(','):
ctx = ctx.strip("' ") if not ctx: continue
if ctx == 'w:' and ctx.find('|') > 0: ctx = ctx.split('|')
if ctx > 'z' or ctx < 'A': continue
if ctx in Contexts: Contexts += 1 else: Contexts = 1 print 'context: %s' % safe(ctx)
if ctx in Examples: if len(Examples) < 70: Examples += ' ' + title + '' else: Examples = '' + title + '' ctxs += 1
# end of for context string
# end of for entry
print "%d entries, %d words, %d ctxs" % (entries, words, ctxs)
# and write the AutoFormat control file
try: reportpage = wikipedia.Page(site, 'User:AutoFormat/Contexts') oldreport = reportpage.get() except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink() oldreport = "(edit above this line)\n----\n"
report = oldreport + '----\n'
report += '\nas of ' + xmldate.enXMLdate + ';\n' report += 'context templates, redirects, this section generated by bot, edit above horizontal rule\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context string\n| |Template name\n'
for ctx in sorted(Templates): tname = Templates report += "|-\n| " + ctx + " ||" + tname + '\n' report += "|}\n\nRedirects:\n\n"
report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context string\n| |Redirect\n'
print "WARNING: still writing redirects separately"
for red in sorted(Redirs): if Redirs.lower() in Templates: # redirect to a template, so valid # if just a case variant, ignore it, we match anyway if red.lower() == Redirs.lower(): continue report += "|-\n| " + red.lower() + " ||" + red + '\n'
report += "|}\n" wikipedia.setAction('writing report')
# file the report reportpage.put(report)
# add redirs to Templates, set "first line" to be the redirect for red in Redirs: if Redirs.lower() in Templates: Templates = Templates.lower()] First = u'redirect to ' + Redirs Specs = First Cats = Date = xmldate.enXMLdate Bad =
# write context labels report
report = 'Context label templates:\n\n' report += '* categories are those specified explicitly, including with tcat=, not the default cat\n\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| |Template\n| width=10% |as of\n| |Specification\n| width=15% |Template category\n'
for tname in sorted(First, key=unicode.lower): report += '|-\n| ' + tname +' || ' + Date + ' || ' \ + Specs + Bad + ' || ' + Cats + '\n' print "label %s: %s" % (safe(tname), safe(Specs))
report += "|}\n" wikipedia.setAction('writing report')
# write the report page
try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Context labels') oldreport = reportpage.get() except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink()
# file the report reportpage.put(report)
# write contexts report
thresh = 3
report = '\nas of ' + xmldate.enXMLdate + '\n' report += 'Contexts given in definition lines without templates: ' report += '\n%d different "contexts" found, report is those occuring at least %d times' % (len(Contexts), thresh) report += ' or that do have matching templates\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context\n| |Template\n| |Occurs\n| | Examples\n'
for ctx in sorted(Contexts): tname = ctx.lower().strip('') if tname in Templates: tname = Templates else: tname = if not tname and Contexts < thresh: continue report += "|-\n| " + ctx + " ||" + tname + ' ||' + str(Contexts) + '||' + Examples + '\n'
report += "|}\n" wikipedia.setAction('writing report')
# write the report page
try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Contexts') oldreport = reportpage.get() except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink()
# file the report reportpage.put(report)
# Script entry point: run the report and always shut the framework down,
# even when main() raises.  (A stray closing nowiki tag left over from the
# wiki page this file was copied from has been removed from the last line.)
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()