# User:Robert Ullmann/code/tlang

#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/tlang


"""
This code looks for languages in translations sections.

Some comments and names refer to headers, because this code is derived from
the header analysis.

No command line arguments.

Writes reports.
"""

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

def safe(s):
    """Return the pickle serialization of *s*.

    main() passes the result to print so that page titles and text are
    shown in serialized form rather than as raw strings.

    s -- any picklable object (callers in this file pass strings).
    """
    # NOTE(review): an earlier version computed len() of the pickled data
    # into a local that was never used.  That suggests the return value was
    # once sliced to drop the pickle framing -- confirm against the original
    # script before relying on the exact output format.
    return pickle.dumps(s)

def main():
    """Scan the XML dump for language names in Translations sections and write report pages.

    Walks every entry of "en-wikt.xml".  Entries whose title starts with
    'Template:' are examined as possible language-code templates; entries
    without ':' in the title have their text scanned line by line, and lines
    inside a section whose heading contains 'Translations' are matched
    against the retrans* patterns to extract a language name.  Afterwards a
    table of all languages and a table of languages without codes are put to
    two wiki pages, plus an optional per-language page when replang is set.

    NOTE(review): several statements below look damaged (text that was
    presumably inside square brackets appears to be missing).  They are
    flagged individually; the code is left exactly as found.
    """

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    # running totals, printed as progress every 10000 entries and at the end
    entries = 0
    words = 0
    Trans = 0

    # several different cases ...
    # NOTE(review): retrans1 and retrans2 are identical as written and contain
    # a ')' with no matching '(' ; reunlink has no capture group although the
    # substitution further down uses r'\1'.  Presumably bracketed parts of
    # these patterns were lost -- confirm against the original script.
    retrans1 = re.compile(r'\*\s*\]+?)\]\]\s*:(.*)')
    retrans2 = re.compile(r'\*\s*\]+?)\]\]\s*:(.*)')
    # "* <language>: <rest>" -- group 1 is the language, group 2 the rest of the line
    retrans3 = re.compile(r'\*\s*(.+?):(.*)')
    reunlink = re.compile(r'\\]')

    # valid headers have templates with codes
    Codes2 = {}
    Codes3 = {}
    CodesW = {}
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples, but we collect for all
    Examples = {}

    # things that look like codes, but aren't; including ISO 639-2 B codes:

    Stops = [ 'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger',
              'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo',
              'tib', 'wel',
              'zh-tc', 'zh-sc', 'zh-yue', 'gko',
              'rfc', 'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie' ]

    # and fix DAVilla silliness:
    # NOTE(review): this rebinds Codes2 from the dict created above to the
    # string 'zh'; presumably a keyed assignment whose subscript was lost.
    Codes2 = 'zh'

    # NOTE(review): both patterns below contain a '+' with nothing before it
    # to repeat (r'+$' and '(+)'); presumably a character class is missing.
    recmatch = re.compile(r'+$')
    regood = re.compile(r'(' + re.escape('{{{l|[}}}{{{l|[}}}') + \
                        r'|)(+)(' + re.escape('{{{l|]]}}}') + r'|)<noinclude')

    # try a particular language report, Romanian to start
    # (replang is empty here, so the per-language report at the end is skipped)
    replang = ""
    Alltrans = set()

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print "%d entries, %d words, %d translations" % (entries, words, Trans)


        # look for code templates

        if title.startswith('Template:'):
           # NOTE(review): code still includes the 'Template:' prefix here, so
           # the Stops test and the length tests below see the full title;
           # presumably the prefix was meant to be removed -- confirm.
           code = title
           if code in Stops: continue
           if not recmatch.match(code): continue
           if 'Language templates' not in text: continue

           mo = regood.match(text)
           if not mo:
               print "bad code template %s: %s" % (safe(code), safe(text))
               continue
           # group 2 of regood is taken as the language name
           lang = mo.group(2)

           print "code %s: %s" % (safe(code), safe(lang))
           # NOTE(review): these rebind the Codes* names to the code string
           # instead of storing an entry; presumably a subscript was lost.
           if len(code) == 2: Codes2 = code
           elif len(code) == 3: Codes3 = code
           else: CodesW = code
           continue

        # now skip non main-name-space
        # NOTE(review): the 'else:' below is indented with a tab character
        # while the surrounding lines use spaces.

        if title.find(':') >= 0:
            continue
	else:
            words += 1

            # if entries > 5000: break

            # parse text ...

            # True while we are inside a section whose heading mentions 'Translations'
            intrans = False

            for line in text.splitlines():

                # comments on the (presumed) end of lines
                # NOTE(review): split() returns a list, so line stops being a
                # string here and the startswith() call below would fail;
                # presumably an index selecting the first piece was lost.
                if line.find('<!--') >= 0: line = line.split('<!--')

                # any heading line ends the current section; a heading
                # containing 'Translations' starts a translations section
                if line.startswith('='):
                    intrans = False
                    if line.find('Translations') >= 0: intrans = True
                    continue

                if not intrans: continue

                # try the patterns in order, first match wins
                mo = retrans1.match(line)
                if not mo: mo = retrans2.match(line)
                if not mo: mo = retrans3.match(line)
                if not mo: continue

                # do some cleanup

                lang = mo.group(1)
                lang = reunlink.sub(r'\1', lang)
                lang = lang.strip(" '")
                if not lang: continue
                # skip names that begin with template, list or markup characters
                if lang.startswith( ('{', '*', ':', '?', '#') ): continue

                Trans += 1


                # NOTE(review): as written this rebinds Occurs (a dict above)
                # to an integer; presumably a per-language subscript was lost.
                if lang not in Occurs: Occurs = 0
                Occurs += 1

                # accumulate report lines for a language
                if lang == replang:
                    Alltrans.add('* ]: ' + mo.group(2).strip())
                    print safe(lang + ': ' + title)

                # always collect examples
                # NOTE(review): Examples is rebound to a literal string here and
                # extended below; presumably keyed by language and built from
                # the entry title originally -- confirm.
                if lang not in Examples:
                    Examples = ']'
                    continue

                # stop adding examples once the collected text reaches 210 characters
                if len(Examples) < 210:
                    Examples += ' ]'

                # end of for line

            # end of for entry

    print "%d entries, %d words, %d translations" % (entries, words, Trans)

    nlangs = 0

    # report languages

    report = 'Languages used in translations sections as of ' + xmldate.enXMLdate + '\n'

    #Codes = 'grc'
    #report += '(Ancient Greek set to grc for this run)\n'

    report += """
See ] for a list of entries without codes (subset of this list).

{| class="prettytable sortable"
|-\n| |Codes\n| |Language\n| |Occurs\n| |Examples
"""
    # repinv is the second table: only the languages for which no code was found
    repinv = 'List of languages without code templates as of ' + xmldate.enXMLdate + """, to be sorted.

{| class="prettytable sortable"
|-\n| |Codes\n| |Language\n| |Occurs\n| |Examples||Notes
"""

    # one table row per language; rows without codes also go to repinv
    # NOTE(review): Occurs, Codes* and Examples are used here without a
    # per-language subscript, consistent with the damage noted above.
    for lang in sorted(Occurs):
        # if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        codes = ''
        if lang in Codes2: codes += ', ' + Codes2
        if lang in Codes3: codes += ', ' + Codes3
        if lang in CodesW: codes += ', ' + CodesW
        # drop the leading ', ' separator
        codes = codes.strip(', ')
        if codes:
            report += "|-\n| %s || %s || %d ||\n" % (codes, lang, Occurs)
        else:
            report += "|-\n| || %s || %d || %s\n" % (lang, Occurs, Examples)
            repinv += "|-\n| || %s || %d || %s ||\n" % (lang, Occurs, Examples)
        # del Occurs
        nlangs += 1
    report += "|}\n"
    repinv += "|}\n"
    wikipedia.setAction('writing report')

    # write the report pages

    # The get() is only used to detect a missing page; oldreport is not read
    # afterwards, and the put() below runs whether or not the page existed.
    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(report)

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages/uncoded')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(repinv)

    print "Languages: %d" % nlangs

    # report for specific language

    if not replang: return # we are done

    report = replang + ' entries in translations sections as of ' + xmldate.enXMLdate + '\n\n\n'

    # one collected line per translation, sorted
    report += u'\n'.join(sorted(Alltrans)) + '\n'

    wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages/' + replang)
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(report)


# Script entry point: run the report only when executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # always call wikipedia.stopme(), even if main() raised
        wikipedia.stopme()
# User:Robert Ullmann/code/tlang

# Wikious

# Boobota

# Sagapedia