#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This application patrols the en.wikt
"""
import wikipedia
import sys
import re
import pickle
import time
import xmlreader
import socket
from math import floor
from Tkinter import *
import Queue
import threading
from webbrowser import open_new_tab
from difflib import ndiff
import urllib
def srep(s):
    """Coerce s to unicode and return its repr, safe for console output."""
    coerced = u'' + s
    return repr(coerced)
plock = threading.Lock()
def log(s):
    # thread-safe console logging: print the repr of s under the print lock
    with plock: print srep(s)
def unescape(s):
    """Decode the basic XML/HTML character entities in s.

    &amp; must be replaced last so that double-escaped input such as
    "&amp;lt;" decodes to "&lt;" rather than all the way to "<".
    """
    if '&' not in s: return s  # fast path: no entities present
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    s = s.replace("&#039;", "'")
    s = s.replace("&amp;", "&") # Must be last
    return s
# the wiki Site object; set in main() before any worker thread starts
site = None
# global shutdown flag, set by rats_quit(); all worker loops poll it
Quit = False
# ------------------------------------------------------------------------------------------------
class Task():
    """One unpatrolled recent-changes edit.

    Created by the recent-changes reader from the API attributes; the
    preload thread later fills in allrevs/oldlines/newlines/revlines,
    and done is set once the edit has been patrolled.
    """
    def __init__(t, revid = '', pid = '', title = '', user = '', oldid = '',
                       rcid = '', ts = '', summary = ''):
        t.revid = revid
        t.pid = pid
        t.title = title
        # URL-encoded title in the site's encoding, for building page URLs
        t.urlname = urllib.quote(title.encode(site.encoding()))
        t.user = user
        t.oldid = oldid      # previous revision id; '' for page creations
        t.rcid = rcid
        t.ts = ts
        t.summary = summary
        t.allrevs = None     # raw revisions XML, shared with other tasks on the same title
        t.oldlines = ''      # diff lines removed by this edit
        t.newlines = ''      # diff lines added by this edit
        t.revlines = ''      # short revision-history text for display
        t.done = False       # True once patrolled; checktask() then drops it
    # order tasks by timestamp (Python 2 comparison; used by the PriorityQueue)
    def __cmp__(s, o):
        if s.ts < o.ts: return -1
        if s.ts > o.ts: return 1
        return 0
# timeout set: a set that elements magically disappear from after a time
#
from weakref import WeakValueDictionary
from heapq import heappush, heappop
class tmo(float):
    """float subclass so timestamps can be weakly referenced."""
    pass

class timeoutset():
    """A set whose members expire roughly `timeout` seconds after add().

    The heap holds the only strong references to the tmo timestamps;
    wdict weakly maps key -> timestamp.  Once an expired timestamp is
    popped from the heap it is garbage collected and its key silently
    vanishes from wdict.
    """
    def __init__(s, timeout):
        s.timeout = timeout
        s.wdict = WeakValueDictionary()
        s.theap = []
    def add(s, key):
        # wall-clock time; time.clock() would be CPU time on Unix and the
        # entries would then (almost) never expire
        t = tmo(time.time())
        s.wdict[key] = t
        heappush(s.theap, t)
    def __contains__(s, key):
        # discard expired timestamps; GC then drops their keys from wdict
        while s.theap and s.theap[0] < time.time() - s.timeout:
            heappop(s.theap)
        return key in s.wdict
    def __len__(s):
        # heap size, so a re-added key counts twice until its stale
        # timestamp expires; good enough for the statistics display
        return len(s.theap)
# all active tasks, not in skip list, key is revid
# (weak values: a task vanishes once no queue, cache or UI slot holds it)
active = WeakValueDictionary()
# tasks skipped for a day or so, values are revids
skipped = timeoutset(24 * 3600)
# whitelisted users
whitelist = timeoutset(3 * 3600)
# users to skip for a while (8 hours? e.g. rest of this session?)
skipusers = timeoutset(8 * 3600)
# queues: tasks, ready to present, ready to mark
tasq = Queue.PriorityQueue()
readyq = Queue.Queue(20) # limit preloads
patrolq = Queue.Queue()
# check a task to see where it should go, used at several steps
def checktask(task):
    """Route a task at each pipeline step.

    Returns the task if it should proceed normally, or None when it has
    been handled: done tasks are dropped, whitelisted users' edits go
    straight to the patrol queue, skip-listed users' edits are skipped.
    """
    if task.done:
        # already patrolled elsewhere; drop it from the pipeline
        return None
    if task.user in whitelist:
        # trusted editor: mark without review
        patrolq.put(task)
        return None
    if task.user in skipusers:
        # operator chose not to review this user for a while
        skipped.add(task.revid)
        stat("Skipped", len(skipped))
        return None
    return task
# ------------------------------------------------------------------------------------------------
# mwapi interface, a few mods here
from StringIO import StringIO
from gzip import GzipFile
# first, our own read url routine, so we can accept gzip, and be much faster:
class MyURLopener(urllib.FancyURLopener):
    # identify as the pywikipedia framework so the server treats us as a bot
    version="PythonWikipediaBot/1.0"
# Home-grown replacement for the framework 'throttle': a "tick-tock"
# delay value shared by every thread, nudged down on each success and
# up on each network failure of any kind.
ticktocklock = threading.Lock()
ticktock = 1.0

def getticktock():
    """Return the current shared tick-tock delay, in seconds."""
    return ticktock
relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')
def readapi(site, request, sysop = True):
global ticktocklock, ticktock
url = "http://" + site.hostname() + "/w/api.php?" + request
done = False
nap = 5
maxl = 5
maxlag = "&maxlag=%d" % maxl
with ticktocklock:
ticktock *= 0.95 # is -0.025 if 5 seconds, -1.0 at 20 seconds
ticktock = max(ticktock, 0.1)
ticktock = min(ticktock, 20.0)
if ticktock >= 10.0:
with plock: print "(mwapi readapi: tick tock is %.1f)" % ticktock
time.sleep(ticktock)
ticktock -= 1.0 # undo first increment in loop
while not done:
ticktock += 1.0 # done w/o lock, race condition is rare, not a serious problem, ignored!
try:
uo = MyURLopener()
uo.addheader('Cookie', site.cookies(sysop = sysop) or '')
uo.addheader('Accept-Encoding', 'gzip')
f = uo.open(url + maxlag)
text = f.read()
try:
if 'gzip' in f.info():
text = GzipFile(fileobj=StringIO(text)).read()
else: pass
except KeyError:
pass
text = unicode(text, 'UTF-8' , errors = 'ignore')
done = True
except Exception, e:
""" report all errors for now:
if '10054' in repr(e) and nap < 15:
time.sleep(nap)
continue # quietly
"""
with plock:
print "(%s: exception reading API: %s)" % (threading.currentThread().name, repr(e))
text = ''
time.sleep(nap)
nap = min(nap + nap/2, 300)
continue
if '<api' not in text and 'NdxICC' in text:
# silently ignore bad return from Nomadix box
time.sleep(5)
done = False
continue
mo = relagged.search(text)
if mo:
replag = int(mo.group(1))
with plock: print "(%s: server lagged %s seconds)" % \
(threading.currentThread().name, replag)
# allow more lag the next time
maxl += max(maxl/4, replag/20)
maxlag = "&maxlag=%d" % maxl
# make some progress even when server crocked ...
if maxl > 600: maxlag = ""
if maxlag and maxl > 60:
with plock: print "(mwapi readapi: next with %s)" % maxlag
# sleep replag if not more than 70
time.sleep(min(replag, 70))
done = False
continue
return text
# ------------------------------------------------------------------------------------------------
# recent changes, stuff into task queue
def readrc():
# use regex. this will break sometimes if API is changed, so we will fix it (:-)
rerev = re.compile(r'<rc *title="(*)" rcid="(*)" ' +
r'pageid="(*)" revid="(*)" old_revid="(*)" user="(*)"' +
r'*timestamp="(*)" comment="(*)"')
nap = 70
limit = 5 # 100
while not Quit:
nf = 0
# add number skipped to limit so we won't get stuck if we've skipped more
rclim = min(limit + len(skipped), 5000)
with plock: print "reading rc, max", rclim
rcs = readapi(site, "action=query&list=recentchanges&rcprop=title|ids|user|timestamp|comment" +
"&rclimit=%s&rcshow=!patrolled&format=xml" % rclim)
for mo in rerev.finditer(rcs):
title = mo.group(1)
title = title.replace('"', '"')
title = title.replace('&', '&')
rcid = mo.group(2)
pid = mo.group(3)
revid = mo.group(4)
oldid = mo.group(5)
if oldid == "0": oldid = ""
user = mo.group(6)
ts = mo.group(7)
#
summary = unescape(mo.group(8))
# with plock: print "debug rc found: title", srep(title), "user", srep(user)
if revid in active: continue
if revid in skipped: continue
# probably a good task
nf += 1
task = Task(title=title, user=user, revid=revid, oldid=oldid,
rcid=rcid, pid=pid, ts=ts, summary=summary)
active = task
stat("Unpatrolled", len(active))
task = checktask(task)
if task: tasq.put(task)
# nap time ...
if nf:
nap = max(nap/2, 10)
limit = min(limit*2, 5000)
else:
nap = min(nap*2, 350)
limit = max(limit/2, 20)
with plock: print "rc found", nf, "next in", nap
for i in range(0, nap/5):
time.sleep(5)
if Quit: break
with plock: print "recent changes thread ends"
# ------------------------------------------------------------------------------------------------
# read patrol log
def readpl():
markedothers = 0
#sample <item logid="3691330" pageid="1741875" ns="0" title="proptrækkers" type="patrol"
#action="patrol" user="Leolaursen" timestamp="2009-09-14T08:36:09Z" comment="">
# <patrol auto="1" prev="0" cur="7381584" />
repat = re.compile(r'<item *title="(*)"*user="(*)"*>' +
r'\s*<patrol*prev="(*)" cur="(*)"')
while not Quit:
if len(active):
with plock: print "reading patrol log"
pats = readapi(site, "action=query&list=logevents&letype=patrol&lelimit=200&format=xml")
else: pats = '' # little point, eh? sleep some more ...
for mo in repat.finditer(pats):
title = mo.group(1)
user = mo.group(2)
prev = mo.group(3) # oldid
cur = mo.group(4) # revid
task = None
if cur in active:
try:
task = active
except KeyError:
pass # race with GC
if not task or task.done: continue
with plock: print "rev %s of %s patrolled by %s" % (srep(cur), srep(title), srep(user))
task.done = True # we have no idea where it is ... (:-)
markedothers += 1
stat("Marked by others", markedothers)
for i in range(0, 70/5):
if Quit: break
time.sleep(5)
with plock: print "read patrol log thread ends"
# ------------------------------------------------------------------------------------------------
# mark edits
def patrol():
markedbyme = 0
whitelisted = 0
#
markset = { }
while not Quit:
try:
task = patrolq.get(timeout=20)
except Queue.Empty:
continue
#
markset = task
log('mark %s as patrolled' % task.title)
if task.user not in whitelist:
markedbyme += 1
stat("Marked by me", markedbyme)
else: # presume was marked by whitelisting
whitelisted += 1
stat("Whitelisted", whitelisted)
task = None
stat("Unpatrolled", len(active))
time.sleep(5) # no hurry
with plock: print "patrol thread ends"
# ------------------------------------------------------------------------------------------------
# preload, read from task q, write to ready q
def preload():
rever = re.compile(r'<rev revid="(*)"*user="(*)"' +
r'*timestamp="(*)"\s*(comment="*"|)*>(.*?)</rev>', re.S)
# cache of previously read revisions, kept as long as some other task has them
# key is title, value is other task (can't just be unicode string, must be object)
revcache = WeakValueDictionary()
while not Quit:
try:
task = tasq.get(timeout=20)
except Queue.Empty:
continue
task = checktask(task)
if not task: continue
log('preload for ' + task.title)
# see if we have revs already
revs = None
if task.title in revcache:
try:
ot = revcache
revs = ot.allrevs # from other task
ot = None
except KeyError: pass
if not revs:
with plock: print "reading 20 revs for", srep(task.title)
revs = readapi(site,
"action=query&prop=revisions|info&rvprop=timestamp|user|comment|content|ids&format=xml"
"&titles=" + task.urlname + "&rvlimit=20")
# now we want to see if we have enough; maybe not, and maybe stale
if 'revid="' + task.revid + '"' not in revs: revs = ''
if task.oldid and 'revid="' + task.oldid + '"' not in revs: revs = ''
# if not, do it again at 200
if not revs:
with plock: print "reading 200 revs for", srep(task.title)
revs = readapi(site,
"action=query&prop=revisions|info&rvprop=timestamp|user|comment|content|ids&format=xml"
"&titles=" + task.urlname + "&rvlimit=200")
# check again!
if 'revid="' + task.revid + '"' not in revs: revs = ''
if task.oldid and 'revid="' + task.oldid + '"' not in revs: revs = ''
if not revs:
# can happen on page deletes, moves, stuff
with plock: "can't find needed old revs for %s, skipping task!" % srep(task.title)
skipped.add(task.revid)
stat("Skipped", len(skipped))
task = None
stat("Unpatrolled", len(active))
continue
task.allrevs = revs
revcache = task
# now available for other tasks on same title
# now find the revs we want, and make a list ...
oldrevtext = ''
newrevtext = ''
replinelist =
for mo in rever.finditer(revs):
revid = mo.group(1)
user = mo.group(2)
ts = mo.group(3)
comment = unescape(mo.group(4))
text = unescape(mo.group(5))
if len(replinelist) < 10:
replinelist.append("%s %s: (%s)" % (ts, user, comment))
# with plock: print "debug match rev", srep(ts), srep(user), srep(comment)
if revid == task.revid: newrevtext = text
if revid == task.oldid: oldrevtext = text
# should have always been found?
if not newrevtext:
with plock: print "what? can't match new revtext in revs?"
continue # discard task?
task.revlines = u'\n'.join(replinelist)
# differences
for delta in ndiff(oldrevtext.splitlines(), newrevtext.splitlines()):
delta = unescape(delta)
if delta.startswith('- '):
task.oldlines += delta + '\n'
elif delta.startswith('+ '):
task.newlines += delta + '\n'
# ignore ' ' and '? ' lines, might do something with context later?
while not Quit:
try:
readyq.put(task, timeout=20)
break
except Queue.Full:
continue
with plock: print "preload thread ends"
# ------------------------------------------------------------------------------------------------
# now the tkinter stuff:
# could do a fancy class with attributes and methods, but instead keep it simple, just share some stuff
root = None          # Tk root window, created in main()
status = None        # NOTE(review): never assigned in this view -- possibly unused
statboxes = { }      # statistic label -> Tk Label showing its value
tkmessages = Queue.Queue()   # (label, value) updates queued from worker threads
# queue a statistics update; safe to call from any thread
def stat(n, v): tkmessages.put( (n, v) )
# messages to update stats from other threads
def tkmess():
    """Drain the statistics queue and refresh the stat boxes.

    Runs on the Tk thread and reschedules itself every 200 ms.
    """
    while tkmessages.qsize():
        try:
            lab, val = tkmessages.get()
            statboxes[lab].config(text=val)
            statboxes[lab].update_idletasks()
        except Queue.Empty:
            pass  # qsize() raced with a consumer; nothing to do
    root.after(200, tkmess)
# oldest (?) unpatrolled page data
# shared things, just easier this way, all belong to tk run thread
oldboxes = { }        # field label -> Tk Label for the presented edit (filled in main)
oldedit = None # current task being presented
oldlines = None       # Label: lines removed by the edit
newlines = None       # Label: lines added by the edit
revlines = None       # Label: recent revision history text
showdiffb = None      # the "Show diffs" / "Show page" button
oldlineslabel = None  # header above the old-lines box
newlineslabel = None  # header above the new-lines box
def get_next_oldpage():
    """Show the next preloaded task in the UI.

    Clears the display, then takes a task from the ready queue; if none
    is ready, reschedules itself on the Tk event loop.
    """
    global oldboxes, oldedit
    global showdiffb, oldlineslabel, newlineslabel
    oldedit = None
    # blank the display while we look for the next task
    oldboxes['Title'].config(text='')
    oldboxes['User'].config(text='')
    oldboxes['Summary'].config(text='')
    oldlines.config(text='')
    newlines.config(text='')
    revlines.config(text='')
    try:
        oldedit = readyq.get_nowait()
    except Queue.Empty:
        root.after(5000, get_next_oldpage)  # nothing ready; poll again later
        return
    oldedit = checktask(oldedit)
    if not oldedit:
        # recall immediately:
        root.after(20, get_next_oldpage)
        return
    if oldedit.oldid:
        oldlineslabel.config(text='Old lines')
        newlineslabel.config(text='New lines')
        showdiffb.config(text='Show diffs')
    else:
        # page creation: no previous revision, show the whole page instead
        oldlineslabel.config(text='')
        newlineslabel.config(text='Page text (current)')
        showdiffb.config(text='Show page')
    oldboxes['Title'].config(text=oldedit.title)
    oldboxes['User'].config(text=oldedit.user)
    oldboxes['Summary'].config(text=oldedit.summary)
    oldlines.config(text=oldedit.oldlines)
    newlines.config(text=oldedit.newlines)
    revlines.config(text=oldedit.revlines)
    newlines.update_idletasks()
    return
def mark_edit():
    """Queue the currently shown edit for patrol marking, then advance."""
    if oldedit:
        patrolq.put(oldedit)
        get_next_oldpage()
def show_diff():
    """Open the shown edit's diff (or the page itself, for new pages)
    in a new browser tab."""
    if not oldedit: return
    if oldedit.oldid:
        open_new_tab("http://en.wiktionary.org/w/index.php?title=%s&diff=next&oldid=%s" %
                     (oldedit.urlname, oldedit.oldid))
    else:
        # page creation: nothing to diff against, show the page
        open_new_tab("http://en.wiktionary.org/w/index.php?title=%s" % oldedit.urlname)
    return
def edit_page():
    """Open the shown edit's page in the browser in edit mode."""
    if not oldedit: return
    open_new_tab("http://en.wiktionary.org/w/index.php?title=%s&action=edit" % oldedit.urlname)
    return
def skip_edit():
    """Skip the shown edit: remember its revid for a day and advance."""
    global oldedit
    if not oldedit:
        return
    skipped.add(oldedit.revid)
    stat("Skipped", len(skipped))
    # drop our reference so the task can be garbage collected
    oldedit = None
    stat("Unpatrolled", len(active))
    get_next_oldpage()
def skip_user():
    """Skip-list the shown edit's author for a while, then skip the edit."""
    global oldedit
    if not oldedit:
        return
    skipusers.add(oldedit.user)
    stat("Skipped users", len(skipusers))
    # the current edit is skipped along with its author
    skip_edit()
def whitelist_user():
    """Whitelist the shown edit's author, then mark the edit patrolled."""
    global oldedit
    if not oldedit:
        return
    whitelist.add(oldedit.user)
    stat("Whitelist users", len(whitelist))
    # and the current edit itself gets marked
    mark_edit()
def rats_quit():
    """Tell every worker loop to stop, then shut down the Tk UI."""
    global Quit
    Quit = True
    log("Quitting ...")
    root.quit()
# main program runs tkinter loop
def main():
    """Log in, create the worker threads, build the Tk UI, run mainloop."""
    global site
    site = wikipedia.getSite("en", "wiktionary")
    site.forceLogin(sysop = True)
    # things shared with subs
    global root, oldlines, newlines, revlines
    global showdiffb, oldlineslabel, newlineslabel
    # worker threads (daemonic, so they die with the UI)
    rct = threading.Thread(target=readrc)
    rct.daemon = True
    rct.name = 'read recent changes'
    plt = threading.Thread(target=readpl)
    plt.daemon = True
    plt.name = 'read patrol log'
    prt = threading.Thread(target=preload)
    prt.daemon = True
    prt.name = 'preload'
    pat = threading.Thread(target=patrol)
    pat.daemon = True
    pat.name = 'mark patrol'
    root = Tk()
    root.title('Rat Patrol')
    font = ('Arial', 10)
    fontb = ('Arial', 10, 'bold')
    # pack from bottom, then left to right at top:
    revlines = Label(root, width=97, height=10, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    revlines.pack(side=BOTTOM, padx=5, pady=5)
    # button bar
    bbox = Frame(root)
    bbox.pack(side=BOTTOM, fill=X, padx=10, pady=5)
    editpageb = Button(bbox, text="Edit page", width=11, font=font, command=edit_page)
    editpageb.pack(side=LEFT)
    showdiffb = Button(bbox, text="Show diffs", width=11, font=font, command=show_diff)
    showdiffb.pack(side=LEFT)
    skipuserb = Button(bbox, text="Skip user", width=11, font=font, command=skip_user)
    skipuserb.pack(side=LEFT)
    wluserb = Button(bbox, text="Whitelist", width=11, font=font, command=whitelist_user)
    wluserb.pack(side=LEFT)
    skipeditb = Button(bbox, text="Skip", width=8, font=font, command=skip_edit)
    skipeditb.pack(side=LEFT)
    markeditb = Button(bbox, text="Mark", width=8, font=font, command=mark_edit)
    markeditb.pack(side=LEFT)
    quitb = Button(bbox, text='Quit', width=10, font=font, command=rats_quit)
    quitb.pack(side=RIGHT)
    # differences
    dbox = Frame(root)
    dbox.pack(side=BOTTOM, padx=10, pady=5)
    obox = Frame(dbox)
    obox.pack(side=LEFT)
    oldlineslabel = Label(obox, text="Old lines", width=24, font=fontb, justify=LEFT, anchor=W)
    oldlineslabel.pack(side=TOP, fill=X)
    oldlines = Label(obox, width=48, height=8, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    oldlines.config(wraplength = oldlines.winfo_reqwidth() - 8)
    oldlines.pack(side=TOP)
    nbox = Frame(dbox)
    nbox.pack(side=LEFT)
    newlineslabel = Label(nbox, text="New lines", width=24, font=fontb, justify=LEFT, anchor=W)
    newlineslabel.pack(side=TOP, fill=X)
    newlines = Label(nbox, width=48, height=8, justify=LEFT, anchor=W, font=font, bg='#fff',
                     relief=RIDGE)
    newlines.config(wraplength = newlines.winfo_reqwidth() - 8)
    newlines.pack(side=TOP)
    # statistics frame, boxes
    stats = Frame(root)
    stats.pack(side=LEFT, padx=10, pady=5)
    statframes = { }
    statlabels = { }
    for lab in [ 'Unpatrolled', 'Marked by me', 'Marked by others', 'Whitelisted', \
                 'Whitelist users', 'Skipped users', 'Skipped' ]:
        statframes[lab] = Frame(stats)
        statframes[lab].pack(side=TOP)
        statlabels[lab] = Label(statframes[lab], text=lab+':', width=15, font=font, justify=LEFT,
                                anchor=W)
        statlabels[lab].pack(side=LEFT)
        statboxes[lab] = Label(statframes[lab], text='0', width=10, font=font, justify=RIGHT, anchor=E,
                               bg='#fff', relief=RIDGE)
        statboxes[lab].pack(side=RIGHT)
    ebox = Frame(root)
    ebox.pack(side=LEFT, padx=10, pady=5, fill=X)
    oldframes = { }
    oldlabels = { }
    oldtoplabel = Label(ebox, text="Oldest unpatrolled edit", width=24, font=fontb, justify=LEFT,
                        anchor=W)
    oldtoplabel.pack(side=TOP, fill=X)
    for lab in [ 'Title', 'User', 'Summary' ]:
        oldframes[lab] = Frame(ebox)
        oldframes[lab].pack(side=TOP)
        oldlabels[lab] = Label(oldframes[lab], text=lab+':', width=15, font=font, justify=LEFT,
                               anchor=NW)
        oldlabels[lab].pack(side=LEFT)
        oldboxes[lab] = Label(oldframes[lab], text='', width=48, font=font, justify=LEFT, anchor=W,
                              bg='#fff', relief=RIDGE)
        oldboxes[lab].pack(side=RIGHT)
    oldlabels['Summary'].config(text='Summary:\n\n\n') # (hack ;-)
    oldboxes['Summary'].config(height = 4)
    oldboxes['Summary'].config(wraplength = oldboxes['Summary'].winfo_reqwidth() - 8)
    root.after(200, tkmess)
    root.after(200, get_next_oldpage)
    rct.start()
    plt.start()
    prt.start()
    pat.start()
    root.mainloop()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()