#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/level3

"""
This code looks for valid and invalid L3 headers in the en.wikt

No command line arguments.

writes reports
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

# Collapses a template call with arguments, e.g. '{{name|arg}}' -> '{{name}}',
# so that every use of the same header template is counted together.
_TEMPLATE_ARGS_RE = re.compile(r'(.*?)\|.*?\}(.*)')

# Once the examples cell of an invalid header reaches this many characters,
# no further example links are appended to it.
_EXAMPLES_LIMIT = 210

_TABLE_OPEN = ('\n{| border="1" cellpadding="4" cellspacing="0" '
               'style="border-collapse: collapse;"\n')


def safe(s):
    """Return the pickled representation of ``s``.

    NOTE(review): the source this was recovered from computed the length of
    the pickle and then returned the whole string; a trailing slice may have
    been lost in transit. Not called anywhere in this file.
    """
    return pickle.dumps(s)


def _valid_header_notes():
    """Build the table of known L3 headers, mapping header text to its note."""
    notes = {}

    def note(headers, text):
        for header in headers:
            notes[header] = text

    # standard POS, etc:
    note(('Noun', 'Verb', 'Adjective', 'Adverb', 'Pronoun', 'Proper noun',
          'Preposition', 'Conjunction', 'Interjection', 'Article',
          'Prefix', 'Suffix', 'Affix', 'Infix', 'Counter'),
         'standard POS header')
    note(('Initialism', 'Abbreviation', 'Letter', 'Symbol', 'Acronym',
          'Proverb', 'Contraction', 'Idiom', 'Phrase', 'Syllable'),
         'standard non-POS header')
    note(('Number', 'Numeral', 'Cardinal number', 'Cardinal numeral',
          'Ordinal number', 'Ordinal numeral'),
         "see note ''supra''")
    note(('Etymology', 'Pronunciation', 'Trivia', 'Alternative spellings',
          'Alternative forms', 'Anagrams', 'Usage notes'),
         "standard L3 header")
    # NOTE(review): the dictionary key was lost from the recovered source;
    # numbered etymology sections are assumed -- confirm.
    note(['Etymology %d' % num for num in range(0, 25)],
         'standard L3 header')
    note(('Related terms', 'Derived terms', 'Descendants', 'See also',
          'References', 'External links', 'Quotations'),
         "standard L4/L3 header")
    note(('Declension', 'Conjugation', 'Inflection', 'Antonyms', 'Synonyms',
          'Translations'),
         "header should be at L4")
    note(('Han character', 'Kanji', 'Hanzi', 'Hanja'),
         "valid in single Han character entries only, ''not checked''")
    # NOTE(review): the dictionary key was lost from the recovered source;
    # 'Pinyin syllable' is assumed -- confirm.
    note(('Pinyin syllable',),
         "valid only for single syllable entries, ''not checked''")
    note(('{{abbreviation}}', '{{acronym}}', '{{initialism}}', '{{numeral}}'),
         "L3 POS header templates")
    return notes


def _l3_header(line):
    """Return the level-3 header text found on ``line``, or None.

    A line qualifies when, after dropping anything from the first '<!--'
    onward, it begins with exactly three '=' characters.
    """
    # comments on the (presumed) end of lines
    if '<!--' in line:
        line = line.split('<!--', 1)[0]

    # fewer than three '=' is a shallower header (or no header at all);
    # a fourth '=' means level 4 or deeper
    if line[0:3] != '===' or line[3:4] == '=':
        return None

    # NOTE(review): the slice was lost from the recovered source; dropping
    # the three '=' on each side is assumed -- confirm.
    header = line.strip()[3:-3].strip()

    # template mess
    if header.startswith('{{'):
        header = _TEMPLATE_ARGS_RE.sub(r'\1}\2', header)
    return header


def _put_report(site, title, report):
    """Save ``report`` as the text of wiki page ``title`` on ``site``."""
    wikipedia.setAction('writing report')
    # Build the page before the try so the except branch can always name it.
    reportpage = wikipedia.Page(site, title)
    try:
        reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)


def main():
    """Scan the XML dump for L3 headers and write the two report pages.

    Headers listed in the notes table go to 'User:Robert Ullmann/L3/valid'
    with their occurrence counts; all other headers go to
    'User:Robert Ullmann/L3/invalid' with counts and example entry links.
    """
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    L3headers = 0

    # valid headers have notes
    Notes = _valid_header_notes()
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples
    Examples = {}

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d L3 headers"
                  % (entries, words, L3headers))

        # skip non main-name-space
        if ':' in title or '/' in title:
            continue
        words += 1

        # if entries > 5000: break

        if title.startswith('Glossary of'):
            continue

        # parse text ...
        for line in text.splitlines():
            header = _l3_header(line)
            if header is None:
                continue
            L3headers += 1

            Occurs[header] = Occurs.get(header, 0) + 1
            if header in Notes:
                continue

            link = '[[' + title + ']]'
            if header not in Examples:
                Examples[header] = link
            elif len(Examples[header]) < _EXAMPLES_LIMIT:
                Examples[header] += ' ' + link

    print("%d entries, %d words, %d L3 headers" % (entries, words, L3headers))

    # report valid headers
    parts = ['\nas of ' + xmldate.enXMLdate + '\n',
             _TABLE_OPEN,
             '|-\n| | Header\n| |Occurs\n| |Notes\n']
    for header in sorted(Occurs):
        if header not in Notes:
            continue
        # the nowiki/bold pair must be closed, or it swallows the table
        parts.append("|-\n| '''<nowiki>" + header + "</nowiki>''' ||"
                     + str(Occurs[header]) + '||' + Notes[header] + '\n')
    parts.append("|}\n")
    _put_report(site, 'User:Robert Ullmann/L3/valid', ''.join(parts))

    # report invalid headers
    i = k = 0
    parts = ['\nas of ' + xmldate.enXMLdate + '\n',
             _TABLE_OPEN,
             '|-\n| | Header\n| |Occurs\n| |Examples\n']
    for header in sorted(Occurs):
        if header in Notes:
            continue
        parts.append("|-\n| " + header + " ||" + str(Occurs[header])
                     + '||' + Examples[header] + '\n')
        i += 1
        k += Occurs[header]
    parts.append("|}\n\n")
    parts.append("* Number of distinct invalid headers: %d\n" % i)
    parts.append("* Total number of invalid headers: %d\n" % k)
    _put_report(site, 'User:Robert Ullmann/L3/invalid', ''.join(parts))

if __name__ == "__main__":
    # Always release the framework's resources, even when main() fails.
    try:
        main()
    finally:
        wikipedia.stopme()