#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This application patrols the en.wikt
"""
import wikipedia
import sys
import re
import pickle
import time
import xmlreader
import socket
from math import floor
from Tkinter import *
import Queue
import threading
from webbrowser import open_new_tab
from difflib import ndiff
import urllib
def srep(s):
    """Coerce s to unicode and return its repr, safe for console output."""
    coerced = u'' + s
    return repr(coerced)
plock = threading.Lock()
def log(s):
    # thread-safe console logging: print the repr of s under the print lock
    with plock: print srep(s)
def unescape(s):
    """Decode the basic XML/HTML character entities in s.

    &amp; must be replaced last so that double-escaped input such as
    "&amp;lt;" decodes to "&lt;" rather than all the way to "<".
    """
    if '&' not in s: return s  # fast path: no entities present
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    s = s.replace("&#039;", "'")
    s = s.replace("&amp;", "&") # Must be last
    return s
# the wiki Site object; set in main() before any worker thread starts
site = None
# global shutdown flag, set by rats_quit(); all worker loops poll it
Quit = False
# ------------------------------------------------------------------------------------------------
class Task():
    """One unpatrolled recent-changes edit.

    Created by the recent-changes reader from the API attributes; the
    preload thread later fills in allrevs/oldlines/newlines/revlines,
    and done is set once the edit has been patrolled.
    """
    def __init__(t, revid = '', pid = '', title = '', user = '', oldid = '',
                       rcid = '', ts = '', summary = ''):
        t.revid = revid
        t.pid = pid
        t.title = title
        # URL-encoded title in the site's encoding, for building page URLs
        t.urlname = urllib.quote(title.encode(site.encoding()))
        t.user = user
        t.oldid = oldid      # previous revision id; '' for page creations
        t.rcid = rcid
        t.ts = ts
        t.summary = summary
        t.allrevs = None     # raw revisions XML, shared with other tasks on the same title
        t.oldlines = ''      # diff lines removed by this edit
        t.newlines = ''      # diff lines added by this edit
        t.revlines = ''      # short revision-history text for display
        t.done = False       # True once patrolled; checktask() then drops it
    # order tasks by timestamp (Python 2 comparison; used by the PriorityQueue)
    def __cmp__(s, o):
        if s.ts < o.ts: return -1
        if s.ts > o.ts: return 1
        return 0
# timeout set: a set that elements magically disappear from after a time
#
from weakref import WeakValueDictionary
from heapq import heappush, heappop
class tmo(float):
    """float subclass so timestamps can be weakly referenced."""
    pass

class timeoutset():
    """A set whose members expire roughly `timeout` seconds after add().

    The heap holds the only strong references to the tmo timestamps;
    wdict weakly maps key -> timestamp.  Once an expired timestamp is
    popped from the heap it is garbage collected and its key silently
    vanishes from wdict.
    """
    def __init__(s, timeout):
        s.timeout = timeout
        s.wdict = WeakValueDictionary()
        s.theap = []
    def add(s, key):
        # wall-clock time; time.clock() would be CPU time on Unix and the
        # entries would then (almost) never expire
        t = tmo(time.time())
        s.wdict[key] = t
        heappush(s.theap, t)
    def __contains__(s, key):
        # discard expired timestamps; GC then drops their keys from wdict
        while s.theap and s.theap[0] < time.time() - s.timeout:
            heappop(s.theap)
        return key in s.wdict
    def __len__(s):
        # heap size, so a re-added key counts twice until its stale
        # timestamp expires; good enough for the statistics display
        return len(s.theap)
# all active tasks, not in skip list, key is revid
# (weak values: a task vanishes once no queue, cache or UI slot holds it)
active = WeakValueDictionary()
# tasks skipped for a day or so, values are revids
skipped = timeoutset(24 * 3600)
# whitelisted users
whitelist = timeoutset(3 * 3600)
# users to skip for a while (8 hours? e.g. rest of this session?)
skipusers = timeoutset(8 * 3600)
# queues: tasks, ready to present, ready to mark
tasq = Queue.PriorityQueue()
readyq = Queue.Queue(20) # limit preloads
patrolq = Queue.Queue()
# check a task to see where it should go, used at several steps
def checktask(task):
    """Route a task at each pipeline step.

    Returns the task if it should proceed normally, or None when it has
    been handled: done tasks are dropped, whitelisted users' edits go
    straight to the patrol queue, skip-listed users' edits are skipped.
    """
    if task.done:
        # already patrolled elsewhere; drop it from the pipeline
        return None
    if task.user in whitelist:
        # trusted editor: mark without review
        patrolq.put(task)
        return None
    if task.user in skipusers:
        # operator chose not to review this user for a while
        skipped.add(task.revid)
        stat("Skipped", len(skipped))
        return None
    return task
# ------------------------------------------------------------------------------------------------
# mwapi interface, a few mods here
from StringIO import StringIO
from gzip import GzipFile
# first, our own read url routine, so we can accept gzip, and be much faster:
class MyURLopener(urllib.FancyURLopener):
    # identify as the pywikipedia framework so the server treats us as a bot
    version="PythonWikipediaBot/1.0"
# Home-grown replacement for the framework 'throttle': a "tick-tock"
# delay value shared by every thread, nudged down on each success and
# up on each network failure of any kind.
ticktocklock = threading.Lock()
ticktock = 1.0

def getticktock():
    """Return the current shared tick-tock delay, in seconds."""
    return ticktock
relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')
def readapi(site, request, sysop = True):
global ticktocklock, ticktock
url = "http://" + site.hostname() + "/w/api.php?" + request
done = False
nap = 5
maxl = 5
maxlag = "&maxlag=%d" % maxl
with ticktocklock:
ticktock *= 0.95 # is -0.025 if 5 seconds, -1.0 at 20 seconds
ticktock = max(ticktock, 0.1)
ticktock = min(ticktock, 20.0)
if ticktock >= 10.0:
with plock: print "(mwapi readapi: tick tock is %.1f)" % ticktock
time.sleep(ticktock)
ticktock -= 1.0 # undo first increment in loop
while not done:
ticktock += 1.0 # done w/o lock, race condition is rare, not a serious problem, ignored!
try:
uo = MyURLopener()
uo.addheader('Cookie', site.cookies(sysop = sysop) or '')
uo.addheader('Accept-Encoding', 'gzip')
f = uo.open(url + maxlag)
text = f.read()
try:
if 'gzip' in f.info():
text = GzipFile(fileobj=StringIO(text)).read()
else: pass
except KeyError:
pass
text = unicode(text, 'UTF-8' , errors = 'ignore')
done = True
except Exception, e:
""" report all errors for now:
if '10054' in repr(e) and nap < 15:
time.sleep(nap)
continue # quietly
"""
with plock:
print "(%s: exception reading API: %s)" % (threading.currentThread().name, repr(e))
text = ''
time.sleep(nap)
nap = min(nap + nap/2, 300)
continue
if '<api' not in text and 'NdxICC' in text:
# silently ignore bad return from Nomadix box
time.sleep(5)
done = False
continue
mo = relagged.search(text)
if mo:
replag = int(mo.group(1))
with plock: print "(%s: server lagged %s seconds)" % \
(threading.currentThread().name, replag)
# allow more lag the next time
maxl += max(maxl/4, replag/20)
maxlag = "&maxlag=%d" % maxl
# make some progress even when server crocked ...
if maxl > 600: maxlag = ""
if maxlag and maxl > 60:
with plock: print "(mwapi readapi: next with %s)" % maxlag
# sleep replag if not more than 70
time.sleep(min(replag, 70))
done = False
continue
return text
# ------------------------------------------------------------------------------------------------
# recent changes, stuff into task queue
def readrc():
# use regex. this will break sometimes if API is changed, so we will fix it (:-)
rerev = re.compile(r'<rc *title="(*)" rcid="(*)" ' +
r'pageid="(*)" revid="(*)" old_revid="(*)" user="(*)"' +
r'*timestamp="(*)" comment="(*)"')
nap = 70
limit = 5 # 100
while not Quit:
nf = 0
# add number skipped to limit so we won't get stuck if we've skipped more
rclim = min(limit + len(skipped), 5000)
with plock: print "reading rc, max", rclim
rcs = readapi(site, "action=query&list=recentchanges&rcprop=title|ids|user|timestamp|comment" +
"&rclimit=%s&rcshow=!patrolled&format=xml" % rclim)
for mo in rerev.finditer(rcs):
title = mo.group(1)
title = title.replace('"', '"')
title = title.replace('&', '&')
rcid = mo.group(2)
pid = mo.group(3)
revid = mo.group(4)
oldid = mo.group(5)
if oldid == "0": oldid = ""
user = mo.group(6)
ts = mo.group(7)
#
summary = unescape(mo.group(8))
# with plock: print "debug rc found: title", srep(title), "user", srep(user)
if revid in active: continue
if revid in skipped: continue
# probably a good task
nf += 1
task = Task(title=title, user=user, revid=revid, oldid=oldid,
rcid=rcid, pid=pid, ts=ts, summary=summary)
active = task
stat("Unpatrolled", len(active))
task = checktask(task)
if task: tasq.put(task)
# nap time ...
if nf:
nap = max(nap/2, 10)
limit = min(limit*2, 5000)
else:
nap = min(nap*2, 350)
limit = max(limit/2, 20)
with plock: print "rc found", nf, "next in", nap
for i in range(0, nap/5):
time.sleep(5)
if Quit: break
with plock: print "recent changes thread ends"
# ------------------------------------------------------------------------------------------------
# read patrol log
def readpl():
markedothers = 0
#sample <item logid="3691330" pageid="1741875" ns="0" title="proptrækkers" type="patrol"
#action="patrol" user="Leolaursen" timestamp="2009-09-14T08:36:09Z" comment="">
# <patrol auto="1" prev="0" cur="7381584" />
repat = re.compile(r'<item *title="(*)"*user="(*)"*>' +
r'\s*<patrol*prev="(*)" cur="(*)"')
while not Quit:
if len(active):
with plock: print "reading patrol log"
pats = readapi(site, "action=query&list=logevents&letype=patrol&lelimit=200&format=xml")
else: pats = '' # little point, eh? sleep some more ...
for mo in repat.finditer(pats):
title = mo.group(1)
user = mo.group(2)
prev = mo.group(3) # oldid
cur = mo.group(4) # revid
task = None
if cur in active:
try:
task = active
except KeyError:
pass # race with GC
if not task or task.done: continue
with plock: print "rev %s of %s patrolled by %s" % (srep(cur), srep(title), srep(user))
task.done = True # we have no idea where it is ... (:-)
markedothers += 1
stat("Marked by others", markedothers)
for i in range(0, 70/5):
if Quit: break
time.sleep(5)
with plock: print "read patrol log thread ends"
# ------------------------------------------------------------------------------------------------
# mark edits
def patrol():
markedbyme = 0
whitelisted = 0
#
markset = { }
while not Quit:
try:
task = patrolq.get(timeout=20)
except Queue.Empty:
continue
#
markset = task
log('mark %s as patrolled' % task.title)
if task.user not in whitelist:
markedbyme += 1
stat("Marked by me", markedbyme)
else: # presume was marked by whitelisting
whitelisted += 1
stat("Whitelisted", whitelisted)
task = None
stat("Unpatrolled", len(active))
time.sleep(5) # no hurry
with plock: print "patrol thread ends"
# ------------------------------------------------------------------------------------------------
# preload, read from task q, write to ready q
def preload():
rever = re.compile(r'<rev revid="(*)"*user="(*)"' +
r'*timestamp="(*)"\s*(comment="*"|)*>(.*?)</rev>', re.S)
# cache of previously read revisions, kept as long as some other task has them
# key is title, value is other task (can't just be unicode string, must be object)
revcache = WeakValueDictionary()
while not Quit:
try:
task = tasq.get(timeout=20)
except Queue.Empty:
continue
task = checktask(task)
if not task: continue
log('preload for ' + task.title)
# see if we have revs already
revs = None
if task.title in revcache:
try:
ot = revcache
revs = ot.allrevs # from other task
ot = None
except KeyError: pass
if not revs:
with plock: print "reading 20 revs for", srep(task.title)
revs = readapi(site,
"action=query&prop=revisions|info&rvprop=timestamp|user|comment|content|ids&format=xml"
"&titles=" + task.urlname + "&rvlimit=20")
# now we want to see if we have enough; maybe not, and maybe stale
if 'revid="' + task.revid + '"' not in revs: revs = ''
if task.oldid and 'revid="' + task.oldid + '"' not in revs: revs = ''
# if not, do it again at 200
if not revs:
with plock: print "reading 200 revs for", srep(task.title)
revs = readapi(site,
"action=query&prop=revisions|info&rvprop=timestamp|user|comment|content|ids&format=xml"
"&titles=" + task.urlname + "&rvlimit=200")
# check again!
if 'revid="' + task.revid + '"' not in revs: revs = ''
if task.oldid and 'revid="' + task.oldid + '"' not in revs: revs = ''
if not revs:
# can happen on page deletes, moves, stuff
with plock: "can't find needed old revs for %s, skipping task!" % srep(task.title)
skipped.add(task.revid)
stat("Skipped", len(skipped))
task = None
stat("Unpatrolled", len(active))
continue
task.allrevs = revs
revcache = task
# now available for other tasks on same title
# now find the revs we want, and make a list ...
oldrevtext = ''
newrevtext = ''
replinelist =
for mo in rever.finditer(revs):
revid = mo.group(1)
user = mo.group(2)
ts = mo.group(3)
comment = unescape(mo.group(4))
text = unescape(mo.group(5))
if len(replinelist) < 10:
replinelist.append("%s %s: (%s)" % (ts, user, comment))
# with plock: print "debug match rev", srep(ts), srep(user), srep(comment)
if revid == task.revid: newrevtext = text
if revid == task.oldid: oldrevtext = text
# should have always been found?
if not newrevtext:
with plock: print "what? can't match new revtext in revs?"
continue # discard task?
task.revlines = u'\n'.join(replinelist)
# differences
for delta in ndiff(oldrevtext.splitlines(), newrevtext.splitlines()):
delta = unescape(delta)
if delta.startswith('- '):
task.oldlines += delta + '\n'
elif delta.startswith('+ '):
task.newlines += delta + '\n'
# ignore ' ' and '? ' lines, might do something with context later?
while not Quit:
try:
readyq.put(task, timeout=20)
break
except Queue.Full:
continue
with plock: print "preload thread ends"
# ------------------------------------------------------------------------------------------------
# now the tkinter stuff:
# could do a fancy class with attributes and methods, but instead keep it simple, just share some stuff
root = None          # Tk root window, created in main()
status = None        # NOTE(review): never assigned in this view -- possibly unused
statboxes = { }      # statistic label -> Tk Label showing its value
tkmessages = Queue.Queue()   # (label, value) updates queued from worker threads
# queue a statistics update; safe to call from any thread
def stat(n, v): tkmessages.put( (n, v) )
# messages to update stats from other threads
def tkmess():
    """Drain the statistics queue and refresh the stat boxes.

    Runs on the Tk thread and reschedules itself every 200 ms.
    """
    while tkmessages.qsize():
        try:
            lab, val = tkmessages.get()
            statboxes[lab].config(text=val)
            statboxes[lab].update_idletasks()
        except Queue.Empty:
            pass  # qsize() raced with a consumer; nothing to do
    root.after(200, tkmess)
# oldest (?) unpatrolled page data
# shared things, just easier this way, all belong to tk run thread
oldboxes = { }        # field label -> Tk Label for the presented edit (filled in main)
oldedit = None # current task being presented
oldlines = None       # Label: lines removed by the edit
newlines = None       # Label: lines added by the edit
revlines = None       # Label: recent revision history text
showdiffb = None      # the "Show diffs" / "Show page" button
oldlineslabel = None  # header above the old-lines box
newlineslabel = None  # header above the new-lines box
def get_next_oldpage():
    """Show the next preloaded task in the UI.

    Clears the display, then takes a task from the ready queue; if none
    is ready, reschedules itself on the Tk event loop.
    """
    global oldboxes, oldedit
    global showdiffb, oldlineslabel, newlineslabel
    oldedit = None
    # blank the display while we look for the next task
    oldboxes['Title'].config(text='')
    oldboxes['User'].config(text='')
    oldboxes['Summary'].config(text='')
    oldlines.config(text='')
    newlines.config(text='')
    revlines.config(text='')
    try:
        oldedit = readyq.get_nowait()
    except Queue.Empty:
        root.after(5000, get_next_oldpage)  # nothing ready; poll again later
        return
    oldedit = checktask(oldedit)
    if not oldedit:
        # recall immediately:
        root.after(20, get_next_oldpage)
        return
    if oldedit.oldid:
        oldlineslabel.config(text='Old lines')
        newlineslabel.config(text='New lines')
        showdiffb.config(text='Show diffs')
    else:
        # page creation: no previous revision, show the whole page instead
        oldlineslabel.config(text='')
        newlineslabel.config(text='Page text (current)')
        showdiffb.config(text='Show page')
    oldboxes['Title'].config(text=oldedit.title)
    oldboxes['User'].config(text=oldedit.user)
    oldboxes['Summary'].config(text=oldedit.summary)
    oldlines.config(text=oldedit.oldlines)
    newlines.config(text=oldedit.newlines)
    revlines.config(text=oldedit.revlines)
    newlines.update_idletasks()
    return
def mark_edit():
    """Queue the currently shown edit for patrol marking, then advance."""
    if oldedit:
        patrolq.put(oldedit)
        get_next_oldpage()
def show_diff():
    """Open the shown edit's diff (or the page itself, for new pages)
    in a new browser tab."""
    if not oldedit: return
    if oldedit.oldid:
        open_new_tab("http://en.wiktionary.org/w/index.php?title=%s&diff=next&oldid=%s" %
                     (oldedit.urlname, oldedit.oldid))
    else:
        # page creation: nothing to diff against, show the page
        open_new_tab("http://en.wiktionary.org/w/index.php?title=%s" % oldedit.urlname)
    return
def edit_page():
    """Open the shown edit's page in the browser in edit mode."""
    if not oldedit: return
    open_new_tab("http://en.wiktionary.org/w/index.php?title=%s&action=edit" % oldedit.urlname)
    return
def skip_edit():
    """Skip the shown edit: remember its revid for a day and advance."""
    global oldedit
    if not oldedit:
        return
    skipped.add(oldedit.revid)
    stat("Skipped", len(skipped))
    # drop our reference so the task can be garbage collected
    oldedit = None
    stat("Unpatrolled", len(active))
    get_next_oldpage()
def skip_user():
    """Skip-list the shown edit's author for a while, then skip the edit."""
    global oldedit
    if not oldedit:
        return
    skipusers.add(oldedit.user)
    stat("Skipped users", len(skipusers))
    # the current edit is skipped along with its author
    skip_edit()
def whitelist_user():
    """Whitelist the shown edit's author, then mark the edit patrolled."""
    global oldedit
    if not oldedit:
        return
    whitelist.add(oldedit.user)
    stat("Whitelist users", len(whitelist))
    # and the current edit itself gets marked
    mark_edit()
def rats_quit():
    """Tell every worker loop to stop, then shut down the Tk UI."""
    global Quit
    Quit = True
    log("Quitting ...")
    root.quit()
# main program runs tkinter loop
def main():
    """Log in, create the worker threads, build the Tk UI, run mainloop."""
    global site
    site = wikipedia.getSite("en", "wiktionary")
    site.forceLogin(sysop = True)
    # things shared with subs
    global root, oldlines, newlines, revlines
    global showdiffb, oldlineslabel, newlineslabel
    # worker threads (daemonic, so they die with the UI)
    rct = threading.Thread(target=readrc)
    rct.daemon = True
    rct.name = 'read recent changes'
    plt = threading.Thread(target=readpl)
    plt.daemon = True
    plt.name = 'read patrol log'
    prt = threading.Thread(target=preload)
    prt.daemon = True
    prt.name = 'preload'
    pat = threading.Thread(target=patrol)
    pat.daemon = True
    pat.name = 'mark patrol'
    root = Tk()
    root.title('Rat Patrol')
    font = ('Arial', 10)
    fontb = ('Arial', 10, 'bold')
    # pack from bottom, then left to right at top:
    revlines = Label(root, width=97, height=10, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    revlines.pack(side=BOTTOM, padx=5, pady=5)
    # button bar
    bbox = Frame(root)
    bbox.pack(side=BOTTOM, fill=X, padx=10, pady=5)
    editpageb = Button(bbox, text="Edit page", width=11, font=font, command=edit_page)
    editpageb.pack(side=LEFT)
    showdiffb = Button(bbox, text="Show diffs", width=11, font=font, command=show_diff)
    showdiffb.pack(side=LEFT)
    skipuserb = Button(bbox, text="Skip user", width=11, font=font, command=skip_user)
    skipuserb.pack(side=LEFT)
    wluserb = Button(bbox, text="Whitelist", width=11, font=font, command=whitelist_user)
    wluserb.pack(side=LEFT)
    skipeditb = Button(bbox, text="Skip", width=8, font=font, command=skip_edit)
    skipeditb.pack(side=LEFT)
    markeditb = Button(bbox, text="Mark", width=8, font=font, command=mark_edit)
    markeditb.pack(side=LEFT)
    quitb = Button(bbox, text='Quit', width=10, font=font, command=rats_quit)
    quitb.pack(side=RIGHT)
    # differences
    dbox = Frame(root)
    dbox.pack(side=BOTTOM, padx=10, pady=5)
    obox = Frame(dbox)
    obox.pack(side=LEFT)
    oldlineslabel = Label(obox, text="Old lines", width=24, font=fontb, justify=LEFT, anchor=W)
    oldlineslabel.pack(side=TOP, fill=X)
    oldlines = Label(obox, width=48, height=8, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    oldlines.config(wraplength = oldlines.winfo_reqwidth() - 8)
    oldlines.pack(side=TOP)
    nbox = Frame(dbox)
    nbox.pack(side=LEFT)
    newlineslabel = Label(nbox, text="New lines", width=24, font=fontb, justify=LEFT, anchor=W)
    newlineslabel.pack(side=TOP, fill=X)
    newlines = Label(nbox, width=48, height=8, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    newlines.config(wraplength = newlines.winfo_reqwidth() - 8)
    newlines.pack(side=TOP)
    # statistics frame, boxes
    stats = Frame(root)
    stats.pack(side=LEFT, padx=10, pady=5)
    statframes = { }
    statlabels = { }
    for lab in [ 'Unpatrolled', 'Marked by me', 'Marked by others', 'Whitelisted', \
                 'Whitelist users', 'Skipped users', 'Skipped' ]:
        statframes[lab] = Frame(stats)
        statframes[lab].pack(side=TOP)
        statlabels[lab] = Label(statframes[lab], text=lab+':', width=15, font=font, justify=LEFT,
                                anchor=W)
        statlabels[lab].pack(side=LEFT)
        statboxes[lab] = Label(statframes[lab], text='0', width=10, font=font, justify=RIGHT, anchor=E,
                               bg='#fff', relief=RIDGE)
        statboxes[lab].pack(side=RIGHT)
    ebox = Frame(root)
    ebox.pack(side=LEFT, padx=10, pady=5, fill=X)
    oldframes = { }
    oldlabels = { }
    oldtoplabel = Label(ebox, text="Oldest unpatrolled edit", width=24, font=fontb, justify=LEFT,
                        anchor=W)
    oldtoplabel.pack(side=TOP, fill=X)
    for lab in [ 'Title', 'User', 'Summary' ]:
        oldframes[lab] = Frame(ebox)
        oldframes[lab].pack(side=TOP)
        oldlabels[lab] = Label(oldframes[lab], text=lab+':', width=15, font=font, justify=LEFT,
                               anchor=NW)
        oldlabels[lab].pack(side=LEFT)
        oldboxes[lab] = Label(oldframes[lab], text='', width=48, font=font, justify=LEFT, anchor=W,
                              bg='#fff', relief=RIDGE)
        oldboxes[lab].pack(side=RIGHT)
    oldlabels['Summary'].config(text='Summary:\n\n\n') # (hack ;-)
    oldboxes['Summary'].config(height = 4)
    oldboxes['Summary'].config(wraplength = oldboxes['Summary'].winfo_reqwidth() - 8)
    root.after(200, tkmess)
    root.after(200, get_next_oldpage)
    rct.start()
    plt.start()
    prt.start()
    pat.start()
    root.mainloop()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()