User:AutoFormat/code/contexts
#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:AutoFormat/code/contexts
"""
This code looks for contexts formatted as ('' '') on definition lines.
Writes the AutoFormat (AF) control file, the context templates list, and the
context exceptions list.
No command line arguments.
Writes its reports to wiki pages.
"""
import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate
from mwapi import getwikitext
def safe(s):
    """Return an escaped, printable rendering of the string *s*.

    The value is pickled with the text protocol (protocol 0) and the
    surrounding pickle framing is sliced away: the one-byte type opcode at
    the front and the five trailing bytes (memo entry plus stop opcode, i.e.
    ``\\np0\\n.``).  For a unicode string what remains is the
    raw-unicode-escape form of the text, so characters above U+00FF come out
    as ``\\uXXXX`` escapes that can be printed on any console.
    """
    # The slice offsets below are only valid for protocol 0, so request it
    # explicitly instead of relying on the interpreter's default protocol.
    ss = pickle.dumps(s, 0)
    return ss[1:len(ss) - 5]
# parse spec
# Patterns used by respec() to rewrite the parameters found on the first line
# of a context template into readable "name:value, " fragments.  Each pattern
# consumes the '|' that terminates the parameter.
# label= whose value contains a [[wikilink]]
relab1 = re.compile(r'label=([^\|]*)\[\[([^\]]+)\]\]([^\|]*)\|')
# label= with a plain value (no pipes or square brackets)
relab2 = re.compile(r'label=([^\|\[\]]+)\|')
# topcat= / poscat= / regcat= parameters
retc = re.compile(r'topcat=([^\|]+)\|')
repc = re.compile(r'poscat=([^\|]+)\|')
rerc = re.compile(r'regcat=([^\|]+)\|')
# bare cat=; the leading non-word character keeps it from matching inside
# longer names such as topcat=
recc = re.compile(r'([^\w])cat=([^\|]+)\|')
# lang= given as {{#if:{{{lang|}}}|{{{lang}}}|DEFAULT}}; captures DEFAULT
relang = re.compile(r'lang=\{\{#if:\{\{\{lang\|}}}\|\{\{\{lang}}}\|([^}]+)}}\|')
# a parameter that is just an underscore
respace = re.compile(r'_\|')
# tcat= parameter; not referenced by respec()
retcat = re.compile(r'tcat=([^\|]+)\|')


def respec(spec):
    """Summarise the first line of a context template as readable text.

    spec is the first line of the template's wikitext.  The leading
    ``{{context {{{sub|}}}|`` call and a trailing ``<noinclude>`` marker are
    removed when present, then each recognised parameter is rewritten as a
    "name:value" fragment separated by ", ".  A line that does not mention
    ``{{context`` at all yields "(not a context label)".

    Returns the summary string with surrounding spaces, commas and pipes
    stripped.
    """
    prefix = '{{context {{{sub|}}}|'
    # The published listing tested endswith('') -- always true -- and then cut
    # 11 characters off every spec.  11 is exactly len('<noinclude>'), so that
    # tag is taken to be the literal that was lost in publishing.
    noinclude = '<noinclude>'

    if spec.startswith(prefix):
        spec = spec[len(prefix):]
    elif '{{context' not in spec:
        spec = "(not a context label)"
    if spec.endswith(noinclude):
        spec = spec[:-len(noinclude)]

    # Every pattern expects a terminating '|', so give the last parameter one.
    spec += '|'
    spec = relab1.sub(r'label:\1[[\2]]\3, ', spec)
    spec = relab2.sub(r'label:\1, ', spec)
    spec = retc.sub(r'topic category:\1, ', spec)
    spec = repc.sub(r'POS category:\1, ', spec)
    spec = rerc.sub(r'regional category:\1, ', spec)
    spec = recc.sub(r'\1category:\2, ', spec)
    spec = relang.sub(r'default language:\1, ', spec)
    spec = respace.sub(r'(space), ', spec)
    spec = spec.strip(' ,|')

    # NOTE(review): the listing wrapped specs containing '{' as
    # '' + spec + '', which is a no-op as written.  The wrapper strings were
    # presumably markup lost in publishing (possibly nowiki tags); confirm
    # against the authoritative copy before restoring them.  Until then such
    # specs are returned unchanged, exactly as the listing did.
    return spec
def main():
# Scans the XML dump "en-wikt.xml", collecting context templates, template
# redirects and context strings found on definition lines, then writes three
# wiki pages: User:AutoFormat/Contexts, User:Robert Ullmann/Context labels
# and User:Robert Ullmann/Contexts.
# NOTE(review): this listing has lost its indentation and several statements
# are visibly truncated (flagged individually below); restore it from the
# authoritative copy before running.
# make sure we are logged in
site = wikipedia.getSite()
site.forceLogin()
wikipedia.setAction('writing report')
# get XML dump
dump = xmlreader.XmlDump("en-wikt.xml")
# running counters, shown in the progress line every 10000 entries
entries = 0
words = 0
ctxs = 0
# Contexts:  context string -> number of occurrences
# Examples:  context string -> text listing example entry titles
# Templates: lower-cased template name -> template name as titled
# Redirs:    redirect title (first 9 characters removed) -> redirect target
# First:     first line of a template's text (redirects get 'redirect to ...')
# Date:      name -> xmldate.enXMLdate without its last 5 characters
# Specs, Cats, Bad: per-name columns read by the context labels report; in
# this listing they are only assigned for redirects
Contexts = { }
Examples = { }
Templates = { }
Redirs = { }
First = { }
Specs = { }
Date = { }
Cats = { }
Bad = { }
# recontext: a '#' line followed by a bracket or {{italbrac| opener, the
# context text, and a closer.
# NOTE(review): the bracket alternatives look incomplete and may have lost
# characters in publishing -- confirm.
recontext = re.compile(r"^# *(\(|\)|\{\{italbrac\|)(.+?)(\(|\)|}})", re.M)
# reredir: a #redirect whose target is in the template: namespace
reredir = re.compile(r"#redirect\s*\[\[template:(.*)\]\]", re.I)
# recats: captures the category name of a [[Category:...]] link; not used
# in the code visible here
recats = re.compile(r"\[\[Category:([^\|\]]*)", re.I)
# reiwiki: a [[xx:...]] link with a 2-9 character lower-case prefix
# (presumably an interwiki link, going by the name)
reiwiki = re.compile(r'\[\[[a-z-]{2,9}:.+\]\]')
for entry in dump.parse():
text = entry.text
title = entry.title
entries += 1
if entries % 10000 == 0:
print "%d entries, %d words, %d contexts" % (entries, words, ctxs)
# skip redirects, unless Templates
if text and text[0] == '#':
mo = reredir.match(text)
if mo:
# title[9:] assumes the redirect's own title starts with 'Template:'
Redirs[title[9:]] = mo.group(1)
print "redirect: %s to %s" % (safe(title[9:]), safe(mo.group(1)))
continue
# look for templates
if title.startswith('Template:') and text.find('{{context') >= 0:
# but not the templates themselves!
if title.find('/') >= 0: continue
if title.find('context') >= 0: continue
if title.find('checklabel') >= 0: continue
if title.find('pos-') >= 0: continue
# drop the 9-character 'Template:' prefix
tname = title[9:]
Templates[tname.lower()] = tname
# NOTE(review): 'u' is not defined anywhere in this listing and First[tname]
# is read just below; this was probably u'' + tname before publishing
# removed the quotes -- confirm.
First[u + tname] = text.splitlines()[0]
Date[tname] = xmldate.enXMLdate[:-5] # sans year
print "template: ", safe(tname)
spec = respec(First[tname])
# if spec is "bad" (contains { is a good indication) pick up current, also other stuff
bad = False
if '{' in spec or '}' in spec: bad = True
if reiwiki.search(text): bad = True
# NOTE(review): the next line is cut off inside a string literal; the code
# that originally sat between this test and the 'continue' is missing from
# this listing (including whatever filled Specs, Cats and Bad for templates).
if '}}\n= 0:
continue
words += 1
# if entries > 5000: break
# parse text ...
for ctxtup in recontext.findall(text):
# group 2 of the match is the text between the brackets
ctxstr = ctxtup[1]
for ctx in ctxstr.split(','):
ctx = ctx.strip("'[] ")
if not ctx: continue
# for a piped w: link keep only the part after the first '|'
if ctx[0:2] == 'w:' and ctx.find('|') > 0: ctx = ctx.split('|')[1]
# ignore anything whose first character is outside 'A'..'z'
if ctx[0] > 'z' or ctx[0] < 'A': continue
if ctx in Contexts:
Contexts[ctx] += 1
else:
Contexts[ctx] = 1
print 'context: %s' % safe(ctx)
# stop adding example titles once the text reaches 70 characters
if ctx in Examples:
if len(Examples[ctx]) < 70: Examples[ctx] += ' ' + title + ''
else:
Examples[ctx] = '' + title + ''
ctxs += 1
# end of for context string
# end of for entry
print "%d entries, %d words, %d ctxs" % (entries, words, ctxs)
# and write the AutoFormat control file
try:
reportpage = wikipedia.Page(site, 'User:AutoFormat/Contexts')
oldreport = reportpage.get()
except wikipedia.NoPage:
print "No present report for %s" % reportpage.aslink()
oldreport = "(edit above this line)\n----\n"
# keep whatever precedes the first '----' in the old page; everything after
# it is regenerated
report = oldreport[:oldreport.find('----')] + '----\n'
report += '\nas of ' + xmldate.enXMLdate + ';\n'
report += 'context templates, redirects, this section generated by bot, edit above horizontal rule\n'
report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
report += '|-\n| | Context string\n| |Template name\n'
# one table row per template: lower-cased key, then the template name
for ctx in sorted(Templates):
tname = Templates[ctx]
report += "|-\n| " + ctx + " ||" + tname + '\n'
report += "|}\n\nRedirects:\n\n"
report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
report += '|-\n| | Context string\n| |Redirect\n'
print "WARNING: still writing redirects separately"
for red in sorted(Redirs):
if Redirs[red].lower() in Templates:
# redirect to a template, so valid
# if just a case variant, ignore it, we match anyway
if red.lower() == Redirs[red].lower(): continue
report += "|-\n| " + red.lower() + " ||" + red + '\n'
report += "|}\n"
wikipedia.setAction('writing report')
# file the report
reportpage.put(report)
# add redirs to Templates, set "first line" to be the redirect
for red in Redirs:
if Redirs[red].lower() in Templates:
Templates[red.lower()] = Templates[Redirs[red].lower()]
# NOTE(review): same undefined 'u' as in the template branch above
First[u + red] = u'redirect to ' + Redirs[red]
Specs[red] = First[red]
# NOTE(review): the right-hand sides of Cats[red] and Bad[red] below are
# missing in this listing (possibly empty string literals) -- confirm.
Cats[red] =
Date[red] = xmldate.enXMLdate[:-5]
Bad[red] =
# write context labels report
report = 'Context label templates:\n\n'
report += '* categories are those specified explicitly, including with tcat=, not the default cat\n\n'
report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
report += '|-\n| |Template\n| width=10% |as of\n| |Specification\n| width=15% |Template category\n'
# one row per key of First, sorted case-insensitively
for tname in sorted(First, key=unicode.lower):
report += '|-\n| ' + tname +' || ' + Date[tname] + ' || ' \
+ Specs[tname] + Bad[tname] + ' || ' + Cats[tname] + '\n'
print "label %s: %s" % (safe(tname), safe(Specs[tname]))
report += "|}\n"
wikipedia.setAction('writing report')
# write the report page
# the existing page is fetched but its text (oldreport) is not used again
# in the code visible here
try:
reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Context labels')
oldreport = reportpage.get()
except wikipedia.NoPage:
print "No present report for %s" % reportpage.aslink()
# file the report
reportpage.put(report)
# write contexts report
# minimum occurrence count for a context without a matching template to be listed
thresh = 3
report = '\nas of ' + xmldate.enXMLdate + '\n'
report += 'Contexts given in definition lines without templates: '
report += '\n%d different "contexts" found, report is those occuring at least %d times' % (len(Contexts), thresh)
report += ' or that do have matching templates\n'
report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
report += '|-\n| | Context\n| |Template\n| |Occurs\n| | Examples\n'
for ctx in sorted(Contexts):
# look the context up by its lower-cased, bracket-stripped form
tname = ctx.lower().strip('[]')
if tname in Templates: tname = Templates[tname]
# NOTE(review): the value assigned in the else branch is missing in this
# listing (possibly an empty string literal) -- confirm.
else: tname =
if not tname and Contexts[ctx] < thresh: continue
report += "|-\n| " + ctx + " ||" + tname + ' ||' + str(Contexts[ctx]) + '||' + Examples[ctx] + '\n'
report += "|}\n"
wikipedia.setAction('writing report')
# write the report page
# as above, oldreport is fetched but not used afterwards
try:
reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Contexts')
oldreport = reportpage.get()
except wikipedia.NoPage:
print "No present report for %s" % reportpage.aslink()
# file the report
reportpage.put(report)
if __name__ == "__main__":
    # Script entry point: run the scan and report generation.
    try:
        main()
    finally:
        # Runs whether main() returns or raises.
        # NOTE(review): presumably tells the pywikipedia framework this bot
        # has finished -- confirm against the framework documentation.
        wikipedia.stopme()