User:Robert Ullmann/code/level3

#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/level3


"""
This code looks for valid and invalid L3 headers in the en.wikt

No command line arguments.

writes reports
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

def safe(s):
    """Return the pickled serialization of ``s``.

    The result is whatever ``pickle.dumps`` produces for ``s`` with the
    default protocol.
    """
    return pickle.dumps(s)

def main():
    """Scan the en.wikt XML dump for level-3 headers and file two reports.

    Every ``===Header===`` line found in a main-namespace entry is counted.
    Headers that have an entry in the table of known headers are listed in
    the "valid" report together with their note; all other headers are
    listed in the "invalid" report together with a few example entries.

    Takes no arguments and returns nothing; the results are written to the
    wiki pages 'User:Robert Ullmann/L3/valid' and
    'User:Robert Ullmann/L3/invalid'.
    """
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    L3headers = 0

    # valid headers have notes
    Notes = _valid_header_notes()
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples
    Examples = {}

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d L3 headers" % (entries, words, L3headers))

        # skip non main-name-space
        if ':' in title or '/' in title:
            continue
        words += 1

        if title.startswith('Glossary of'):
            continue

        # parse text ...
        for line in text.splitlines():
            header = _l3_header(line)
            if header is None:
                continue

            L3headers += 1
            Occurs[header] = Occurs.get(header, 0) + 1

            if header in Notes:
                continue

            # collect example entries for invalid headers; stop appending
            # once the example string has reached 210 characters
            link = '[[' + title + ']]'
            if header not in Examples:
                Examples[header] = link
            elif len(Examples[header]) < 210:
                Examples[header] += ' ' + link

    print("%d entries, %d words, %d L3 headers" % (entries, words, L3headers))

    ordered = sorted(Occurs)

    # report valid headers

    parts = ['\nas of ' + xmldate.enXMLdate + '\n']
    parts.append('\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n')
    parts.append('|-\n| | Header\n| |Occurs\n| |Notes\n')
    for header in ordered:
        if header not in Notes:
            continue
        # NOTE(review): the closing </nowiki>''' was missing from the row
        # (the opening tags were never closed); restored here -- confirm.
        parts.append("|-\n| '''<nowiki>" + header + "</nowiki>''' ||"
                     + str(Occurs[header]) + '||' + Notes[header] + '\n')
    parts.append("|}\n")
    _write_report(site, 'User:Robert Ullmann/L3/valid', ''.join(parts))

    # report invalid headers

    i = k = 0
    parts = ['\nas of ' + xmldate.enXMLdate + '\n']
    parts.append('\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n')
    parts.append('|-\n| | Header\n| |Occurs\n| |Examples\n')
    for header in ordered:
        if header in Notes:
            continue
        # every header without a note received an example on first sight
        parts.append("|-\n| " + header + " ||" + str(Occurs[header])
                     + '||' + Examples[header] + '\n')
        i += 1
        k += Occurs[header]
    parts.append("|}\n\n")
    parts.append("* Number of distinct invalid headers: %d\n" % i)
    parts.append("* Total number of invalid headers: %d\n" % k)
    _write_report(site, 'User:Robert Ullmann/L3/invalid', ''.join(parts))


def _valid_header_notes():
    """Return a dict mapping each known L3 header to its report note."""
    notes = {}

    # standard POS, etc:
    for header in ('Noun', 'Verb', 'Adjective', 'Adverb', 'Pronoun',
                   'Proper noun', 'Preposition', 'Conjunction', 'Interjection',
                   'Article', 'Prefix', 'Suffix', 'Affix', 'Infix', 'Counter'):
        notes[header] = 'standard POS header'

    for header in ('Initialism', 'Abbreviation', 'Letter', 'Symbol', 'Acronym',
                   'Proverb', 'Contraction', 'Idiom', 'Phrase', 'Syllable'):
        notes[header] = 'standard non-POS header'

    for header in ('Number', 'Numeral', 'Cardinal number', 'Cardinal numeral',
                   'Ordinal number', 'Ordinal numeral'):
        notes[header] = "see note ''supra''"

    for header in ('Etymology', 'Pronunciation', 'Trivia', 'Alternative spellings',
                   'Alternative forms', 'Anagrams', 'Usage notes'):
        notes[header] = "standard L3 header"

    # NOTE(review): the key expression for this loop was lost; numbered
    # etymology sections ('Etymology 0' .. 'Etymology 24') are assumed --
    # confirm against the original script.
    for num in range(0, 25):
        notes['Etymology ' + str(num)] = 'standard L3 header'

    for header in ('Related terms', 'Derived terms', 'Descendants', 'See also',
                   'References', 'External links', 'Quotations'):
        notes[header] = "standard L4/L3 header"

    for header in ('Declension', 'Conjugation', 'Inflection', 'Antonyms',
                   'Synonyms', 'Translations'):
        notes[header] = "header should be at L4"

    for header in ('Han character', 'Kanji', 'Hanzi', 'Hanja'):
        notes[header] = "valid in single Han character entries only, ''not checked''"

    # NOTE(review): the key for this note was lost; 'Syllable' is assumed
    # from the note text (it replaces the generic note set above) -- confirm.
    notes['Syllable'] = "valid only for single syllable entries, ''not checked''"

    for header in ('{{abbreviation}}', '{{acronym}}', '{{initialism}}', '{{numeral}}'):
        notes[header] = "L3 POS header templates"

    return notes


def _l3_header(line):
    """Return the header text if ``line`` is an L3 header line, else None."""
    # comments on the (presumed) end of lines
    if '<!--' in line:
        line = line.split('<!--')[0]

    # exactly three leading '=' marks L3; a fourth '=' means L4 or deeper
    if line[0:3] != '===' or line[3:4] == '=':
        return None

    # NOTE(review): slices reconstructed as "drop leading '===', strip,
    # drop the trailing three characters, strip" -- confirm.
    header = line[3:].strip()[:-3].strip()

    # template mess: reduce '{{name|args}}' to '{{name}}'
    if header[0:2] == '{{':
        header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1}\2', header)

    return header


def _write_report(site, page_title, report):
    """Save ``report`` as the new text of the wiki page ``page_title``."""
    wikipedia.setAction('writing report')

    reportpage = wikipedia.Page(site, page_title)
    try:
        # result is not used; a missing page is only reported, not fatal
        reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)

if __name__ == "__main__":
    # Always release the framework via stopme(), even when main() raises.
    try:
        main()
    finally:
        wikipedia.stopme()

User:Robert Ullmann/code/level3

Wikious

Boobota

Sagapedia