User:Robert Ullmann/code/level2

Hello, you have come here looking for the meaning of the word User:Robert Ullmann/code/level2. In DICTIOUS you will not only get to know all the dictionary meanings for the word User:Robert Ullmann/code/level2, but we will also tell you about its etymology, its characteristics and you will know how to say User:Robert Ullmann/code/level2 in singular and plural. Everything you need to know about the word User:Robert Ullmann/code/level2 you have here. The definition of the word User:Robert Ullmann/code/level2 will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of User:Robert Ullmann/code/level2, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/level2


"""
This code looks for valid and invalid L2 headers (languages) in the en.wikt

No command line arguments.

writes reports
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

def safe(s):
    """Return a console-safe serialized form of *s* for printing.

    Page titles and template text may contain arbitrary Unicode; pickling
    them yields a plain byte representation that can always be printed.
    The original computed len() of the pickle into an unused local, which
    has been removed.
    """
    return pickle.dumps(s)

def main():
    """Scan an en.wiktionary XML dump and tally L2 (language) headers,
    then write 'valid' and 'invalid' header reports as wikitables to
    User:Robert Ullmann/L2/valid and User:Robert Ullmann/L2/invalid.

    NOTE(review): this file was recovered from a wiki-page scrape; bracketed
    text (wiki links, regex character classes, and dict subscripts such as
    Occurs[header]) appears to have been stripped throughout, so several
    statements below are broken as-is. Flagged inline where spotted; the
    original subscripted forms would need to be reconstructed before running.
    """

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    # running totals for the progress heartbeat and final summary
    entries = 0
    words = 0
    L2headers = 0

    # valid headers have templates with codes
    Codes2 = {}
    Codes3 = {}
    CodesW = {}
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples, but we collect for all
    Examples = {}

    # things that look like codes, but aren't; including ISO 639-2 B codes (of which one is missing?):

    Stops = [ 'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger',
              'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo',
              'tib', 'wel',
              'zh-tc', 'zh-sc', 'gko',
              'rfc', 'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie' ]

    # and fix DAVilla silliness:
    # NOTE(review): this assignment clobbers the Codes2 dict built above with
    # a plain string — the scrape likely dropped a subscript or deletion,
    # e.g. Codes2['zh'] = ... or del Codes2['zh']. Verify against history.
    Codes2 = 'zh'

    # NOTE(review): r'+$' is not a valid regex — a character class such as
    # r'[a-z\-]+$' was presumably stripped by the scrape; the '(+)' group in
    # regood below looks similarly truncated.
    recmatch = re.compile(r'+$')
    regood = re.compile(r'(' + re.escape('{{{l|[}}}{{{l|[}}}') + \
                        r'|)(+)(' + re.escape('{{{l|]]}}}') + r'|)<noinclude')

    # redirect map: template redirect name -> target code (see fix-up below)
    Reds = { }
    redirect = re.compile(r'#.*\\]')

    # single pass over every page in the dump
    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        # progress heartbeat every 10k pages
        if entries % 10000 == 0:
            print "%d entries, %d words, %d L2 headers" % (entries, words, L2headers)

        # look for code templates, just ignore any that are badly formatted

        if title.startswith('Template:'):
           # NOTE(review): 'code' is presumably the title minus the
           # 'Template:' prefix; the slice/replace looks lost in the scrape.
           code = title
           if code in Stops: continue

           if not recmatch.match(code): continue
           if len(code) > 10: continue

           # NOTE(review): '#' here was probably '#REDIRECT' or startswith
           if text == '#':
               # record redirects, more breakage July 2010:
               mo = redirect.match(text)
               if mo: Reds = mo.group(1)
               continue

           # gratuitously broken July 2010, can no longer positively ID language templates
           # if 'Language templates' not in text: continue

           mo = regood.match(text)
           if not mo:
               # can't report bad templates, as we can no longer tell which are lang temps
               # print "bad code template %s: %s" % (safe(code), safe(text))
               continue
           lang = mo.group(2)

           # classify the code by length: 2 = ISO 639-1, 3 = ISO 639-3,
           # anything else = wiki-specific code
           # NOTE(review): these were presumably Codes2[lang] = code etc.
           print "code %s: %s" % (safe(code), safe(lang))
           if len(code) == 2: Codes2 = code
           elif len(code) == 3: Codes3 = code
           else: CodesW = code
           continue

        # now skip non main-name-space

        if title.find(':') >= 0:
            continue
	else:
            words += 1

            # if entries > 5000: break

            # parse text ...

            for line in text.splitlines():

                # comments on the (presumed) end of lines
                # NOTE(review): split() returns a list; '[0]' was probably
                # stripped — line = line.split('<!--')[0]
                if line.find('<!--') >= 0: line = line.split('<!--')

                # NOTE(review): these tests look mangled; likely originals
                # were slice-based, e.g. line[:2] != '==' / line[2] == '='
                # (select exactly-level-2 headers)
                if line != '==': continue
                if line == '=': continue

                L2headers += 1

                # NOTE(review): first strip was probably strip('=')
                header = line.strip().strip(' ')

                # template mess, might as well keep (from L3 code)
                if header == '{{': header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1|...}\2', header)

                # NOTE(review): Occurs[header] subscripts stripped here
                if header not in Occurs: Occurs = 0
                Occurs += 1

                # always collect examples
                # NOTE(review): Examples[header] and the '[[' + title + ']]'
                # wiki-link text appear stripped here
                if header not in Examples:
                    Examples = ']'
                    continue

                # cap example list length, except keep collecting Slovenian
                if len(Examples) < 210 or header == 'Slovenian':
                    Examples += ' ]'

                # end of for line

            # end of for entry

    print "%d entries, %d words, %d L2 headers" % (entries, words, L2headers)

    # fix up redirects, brokenness from about July 2010:
    # NOTE(review): subscripts stripped throughout this fix-up — the
    # comparisons/assignments below presumably indexed Codes2/Codes3/Reds.
    for header in Codes2:
        code = Codes2
        for red in Reds:
            if Reds == code:
                print "found redirect from %s to %s" % (red, code)
                if len(red) == 3: Codes3 = red
                else: CodesW = red
    # does this case occur?:
    for header in Codes3:
        code = Codes3
        for red in Reds:
            if Reds == code:
                print "found redirect from %s to %s" % (red, code)
                CodesW = red
    # yes, that was sloppy. But what can I do?

    nlangs = 0

    # report valid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'

    report += """
May include bogus codes/languages as ability to distinguish language templates by wikitext
was broken June/July 2010: category is now buried in doc page elsewhere in the dump.
"""

    #Codes = 'grc'
    #report += '(Ancient Greek set to grc for this run)\n'

    # fixes 8.7.10:
    # NOTE(review): subscripts stripped — these presumably mapped language
    # names to codes (e.g. Codes3['Seneca'] = 'see'); and 'del CodesW' alone
    # would unbind the dict, breaking the next line.
    Codes3 = 'see'
    Codes3 = 'ang'
    Codes3 = 'knw'
    if 'Simplified Chinese' in CodesW: del CodesW
    if 'Traditional Chinese' in CodesW: del CodesW

    # build the 'valid headers' wikitable
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | ISO 639-1\n| | ISO 639-3\n| | Wiki code\n| |Occurs\n| |Language\n| |Category\n'

    for header in sorted(Occurs):
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        report += "|-\n| "
        if header in Codes2: report += "'''" + Codes2 + "''' ||"
        else: report += " ||" 
        if header in Codes3: report += "'''" + Codes3 + "''' ||"
        else: report += " ||" 
        if header in CodesW: report += "'''" + CodesW + "''' ||"
        else: report += " ||" 
        report += str(Occurs) + '||' + header + ' || ]\n'
        # del Occurs
        nlangs += 1
    report += "|}\n"
    wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/valid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(report)

    print "valid languages: %d" % nlangs

    # now remove valid, to report all the rest (keys() allows us to delete)
    # NOTE(review): 'del Occurs' was presumably del Occurs[header]
    for header in Occurs.keys():
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        del Occurs

    # report invalid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | Language\n| |Occurs\n| |Examples\n'

    for header in sorted(Occurs):
        report += "|-\n| '''<nowiki>" + header + " ||" + str(Occurs) + '||' + Examples + '\n'
   # NOTE(review): indentation drops to 3 spaces from here to the end of the
   # function (scrape/paste damage) — an IndentationError as-is.
   report += "|}\n"
   wikipedia.setAction('writing report')
   # write the report page
   try:
       reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/invalid')
       oldreport = reportpage.get()
   except wikipedia.NoPage:
       print "No present report for %s" % reportpage.aslink()
   # file the report
   reportpage.put(report)


# Script entry point: run the report and always shut the bot framework down
# cleanly (stopme() releases the throttle even if main() raises).
# NOTE(review): removed the stray '</nowiki>' wiki-markup residue that the
# page scrape appended to the final line — it was not part of the code and
# made the file a syntax error; also normalized the 3-space indent to 4.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()