#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot updates iwiki links between wiktionaries
22.1.9: try reading RC from various wikts and adding to en.wikt (just for fun)
24.1.9: try hunting down iwikis for new en.wikt entries
26.1.9: try adding reciprocals; can then use this in full run?
"""
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import randrange
from mwapi import getwikitext, getedit
from reciprocal import addrci
# borrow global:
from config import usernames
def safe(s):
    # crude but effective: pickle a (possibly unicode) string to plain ASCII
    # so titles can be printed on any console without codec errors
    return pickle.dumps(s)
# Iwiki cache:
# not used quite yet:
"""
import shelve
Iwikis = None
def iwopen(home):
global Iwikis
Iwikis = shelve.open(home + "-iwiki-cache")
cis = 0
def iwadd(title, iws, upd = True):
global cis
if safe(title) in Iwikis and not upd: return
if not iws or not len(iws): return
# print "iwikis cache %s: %s" % (safe(title), safe(u' '.join(iws)))
    Iwikis[safe(title)] = iws
cis += 1
    if cis % 100 == 0: Iwikis.sync()
return
"""
Lcode = { }    # language name -> wikt subdomain code (e.g. "French" -> "fr")
Exists = set() # codes of wikts listed in the meta table
Active = set() # codes of wikts where we've recently seen new entries
site = { }     # code -> wikipedia.Site object
naps = { }     # code -> current polling interval, in seconds
def now(): return int(time.time()) # wall-clock seconds (time.clock() is CPU time on Unix)
# generator: yield title and language code of FL wikt entries found in the recent changes of the various wikts
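# each wikt gets its own next-query time (qtime[]) and an adaptive nap
# (naps[]): finding new entries halves the nap, finding nothing stretches it
# toward maxnap, so busy wikts are polled often and quiet ones rarely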
def recent(home = 'en'):
# set up list of wikt codes to look at
qtime = { }
maxnap = 350 * 60 # almost 6 hours
for lc in Exists:
# if lc == home: continue
        site[lc] = wikipedia.getSite(lc, "wiktionary")
        qtime[lc] = now()
        naps[lc] = 60 * randrange(20, 71) # scatter 20 to 70 minutes
        if lc == home: naps[lc] = 300 # five minutes for the home wikt
# entries seen already (just let this grow?)
seen = set()
ny = 0
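    # cheap scrape: pull the title="..." attributes straight out of the API's
    # XML reply rather than doing a full parse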
rcex = re.compile(r'title="(.+?)"')
while True:
# sleep until next one
nextq = now() + 1000000
nextlc = ''
        for lc in qtime:
            if qtime[lc] < nextq:
                nextq = qtime[lc]
                nextlc = lc
st = nextq - now()
if st > 90:
print "(%d, sleeping %d minutes, %s next)" % (now(), (st+29)/60, nextlc)
if st > 0:
time.sleep(st)
lc = nextlc
# read recentchanges, new entries, namespace 0, from site:
if True: #
print "(%d, reading from %s.wikt)" % (now(), lc)
# set parameters
# one hour ago back to one day ago
rcend = '&rcend=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 86400))
rcstart = '&rcstart=' + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - 3600))
if lc == home:
rcshow = "&rcshow=patrolled|!bot" # avoid junk, large numbers of bot forms
sysop = True # need patrol right on login used
else:
rcshow = ''
sysop = False
rclimit = "&rclimit=%d" % min(1 + ny/20, 200)
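            # ask for more results the more entries we've been seeing (ny),
            # capped at 200 per request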
# print "(options " + rcend + rcshow + rclimit + ")"
try:
                rct = site[lc].getUrl("/w/api.php?action=query&list=recentchanges" +
                      "&format=xml&rcprop=title&rctype=new&rcnamespace=0" +
                      rcend + rcstart + rcshow + rclimit, sysop = sysop)
except wikipedia.NoPage:
print "can't get recentchanges from %s.wikt" % lc
# rct = ''
# time.sleep(30)
                qtime[lc] = now() + 700 # do other things for a bit
continue
if '<recentchanges />' in rct:
# no changes in recent history
pass
elif '</recentchanges>' not in rct:
print "some bad return from recentchanges, end tag not found"
print safe(rct)
# rct = ''
# time.sleep(30)
                qtime[lc] = now() + 300 # do other things for a bit
continue
found = False
for title in rcex.findall(rct):
if ':' in title: continue # other stray stuff in NS:0
if lc + ':' + title not in seen:
seen.add(lc + ':' + title)
yield title, lc
ny += 1
found = True
if found:
                naps[lc] /= 2
                # naps[lc] = max(naps[lc], 30) # thirty seconds
Active.add(lc)
else:
                mn = naps[lc]/300 # one-fifth of the current nap, in minutes
                # five to ten minutes, or longer if we keep finding nothing:
                naps[lc] += 60 * randrange(5, 11 + mn)
                naps[lc] = min(naps[lc], maxnap)
                if naps[lc] > maxnap/2: Active.discard(lc)
            qtime[lc] = now() + naps[lc]
            if naps[lc] > 90:
                print "(naptime for %s is %d minutes)" % (lc, (naps[lc]+29)/60)
            else:
                print "(naptime for %s is %d seconds)" % (lc, naps[lc])
# wiki-hunt ... see if a word is in other wikts, return list ...
# challenge here is not to take a huge amount of time, but get as many as possible
re2head = re.compile(r'^==([^=]*)==$', re.M)
def hunt(word, text, lc, home = 'en'):
    iwikis = [ ]
print " ... hunting iwikis"
totry = set()
done = set()
present = set()
fps = set()
links = { }
redirs = { }
    reiw = re.compile(r'\[\[([a-z-]{2,11}):' + re.escape(word) + '\]\]')
    # for lc in Active: totry.add(lc) ... where the magic occurs:
    totry = set( sorted(Active, key=lambda c: naps[c]) )
# if we found an FL title, start with that
if lc != home: totry.add(lc)
# language header(s) in entry are good candidates (of course!)
for lang in re2head.findall(text):
        if lang in Lcode: totry.add(Lcode[lang])
# simple scan for existing iwikis
for lc in reiw.findall(text):
if lc in site:
totry.add(lc)
present.add(lc)
# not home:
totry.discard(home)
done.add(home)
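    # worklist search: pop a wikt code, fetch the word there, and scan whatever
    # we find for still more iwiki links, until the to-try set is exhausted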
while totry:
lc = totry.pop()
try:
            fpage = wikipedia.Page(site[lc], word)
text = getwikitext(fpage)
except wikipedia.NoPage:
print " not in", lc
done.add(lc)
continue
        except wikipedia.IsRedirectPage:
            # record the redirect for addrci() below, but don't treat it as a
            # content page or scan it for further iwikis
            redirs[lc] = fpage
            done.add(lc)
            continue
        except Exception, e:
            print "exception testing existence of word", str(e)
            done.add(lc)
            continue
print " found in", lc
if lc not in present: iwikis.append(lc)
done.add(lc)
        links[lc] = fpage
# add to list to add reciprocal link, or complete set
fps.add(fpage)
# look for iwikis in the page, add to to-be-tried if not already done
for lc in reiw.findall(text):
if lc not in site: continue # (!) else who knows what junk ...
            if lc not in done and lc not in totry:
                print "    found further iwiki", lc
                totry.add(lc)
# all done, now add reciprocals, don't remove anything because hunt may be incomplete
#
for fpage in fps:
addrci(fpage, site, links=links, redirs=redirs, remove=False)
return sorted(iwikis)
def main():
socket.setdefaulttimeout(40)
home = 'en'
xml = True
# testing rc:
xml = False
""" just keep argv code for now
    for arg in sys.argv[1:]:
        if arg.startswith('-start:'):
            start = arg[7:]
            print "starting at %s" % start
        elif arg.startswith('-stop:'):
            stop = arg[6:]
            print "stopping at %s" % stop
elif arg.startswith('-new'):
newonly = True
print "new entries only"
elif arg.startswith('-sort'):
sort = True
print "do edits for sort"
elif arg.startswith('-xml'):
xml = True
print "read XML file"
elif arg.startswith('-update'):
update = True
print "update cache from XML (XML is current!)"
else: print "unknown command line argument %s" % arg
"""
mysite = wikipedia.getSite(home, 'wiktionary')
# make sure we are logged in
mysite.forceLogin()
meta = wikipedia.getSite(code = "meta", fam = "meta")
# get active wikt list
# minus crap. Tokipona? what are they thinking? Klingon? ;-)
    Lstops = [ 'tokipona', 'tlh' ] # Toki Pona, Klingon
page = wikipedia.Page(meta, "List of Wiktionaries/Table")
existtab = page.get()
""" entry looks like:
| ]
| ]
|
"""
    # reextab = re.compile(r'^\[\[([a-z-]+):')
    # reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    reextab = re.compile(r'^\| \[\[.*?\|(.+?)\]\]\n'
                         r'^\| .*\n'
                         r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
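    # group(1) is the language name (as it appears in wikt L2 headers),
    # group(2) the subdomain code, e.g. "French" -> "fr"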
for mo in reextab.finditer(existtab):
if mo.group(2) in Lstops: continue
Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)
# see if we have a login in user config, else pretend we do
# has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"
print "found %d active wikts" % len(Exists)
if len(Exists) < 150: return
# naps ... ;-)
naptime = 0
maxnap = 70
# Iwikis cache
# iwopen(home)
# build table of existing entries from xml
# note we assume since we are doing RC new entries that the iwiki will be new,
# what we want here is just an index to entries, so we don't have to do lots of en.wikt lookups
enwikt = set()
if xml:
# get XML dump
dump = xmlreader.XmlDump("../hancheck/en-wikt.xml")
ti = 0
entries = 0
reds = 0
iws = { } # in memory cache
for entry in dump.parse():
text = entry.text
title = entry.title
if ':' in title: continue
# if title < start or (stop and title > stop): continue
if text.startswith('#'): continue
entries += 1
if entries % 20000 == 0: print "prescan %d entries" % entries
enwikt.add(title)
# test:
# if entries > 100000: break
continue
print "total %d entries" % entries
# now look for iwikis needed
entries = 0
probs = 0
fixed = 0
news = 0
cbase = now() - 86400
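    # start the rate clock a day in the past so the first few entries don't
    # produce an absurd observed-creation rate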
rate = 0.0
for title, lc in recent():
if ':' in title: continue # redundant, but eh?
# temp:
# if lc == 'en' and title.startswith('Da'): continue
if title.lower() == 'main page': continue
news += 1
rate = news*3600.0/(now()-cbase)
if news % 100 == 0: print "(observed creation rate %.4f/hour)" % rate
print "%s:%s" % (safe(lc), safe(title))
# if looking at home wikt is enabled above, just add things (;-)
"""
if lc == home:
print " ... added to en.wikt"
enwikt.add(title)
continue
"""
if lc == home: tag = True
# if we are using xml? else just always look at entry
if lc != home and xml and title not in enwikt:
print " ... %s not in en.wikt" % safe(title)
continue
#
tag = True
# now see if it is something that should be tagged/replaced:
if tag:
probs += 1
naptime += 1
# ... pick up current version from en.wikt
# print '%s is possible update, getting current entry' % safe(title)
try:
page = wikipedia.Page(mysite, title)
# text = page.get()
text = getwikitext(page)
oldtext = text
except wikipedia.NoPage:
print " ... %s not in en.wikt" % safe(page.title())
text = ''
except wikipedia.IsRedirectPage:
print " ... redirect page"
text = ''
except KeyError:
# annoying local error, from crappy framework code
print "KeyError"
time.sleep(200)
continue
if not text: continue
act = ''
            if lc != home and '[[' + lc + ':' + title + ']]' in text:
print " ... iwiki %s already in %s" % (safe(lc), safe(title))
act = 'sort iwikis'
# was added manually? so probably wrong ... (;-)
                iwikis = [ ]
else:
# go hunt down some iwikis, add reciprocals when needed
iwikis = hunt(title, text, lc)
if iwikis:
act = "iwiki +" + ", ".join(iwikis)
else:
print " ... no iwikis found"
if not act: continue
linksites = wikipedia.getLanguageLinks(text)
for lc in iwikis:
                fpage = wikipedia.Page(site[lc], title)
                linksites[site[lc]] = fpage
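            # replaceLanguageLinks rewrites the whole iwiki block in canonical
            # sorted order, which also covers the 'sort iwikis' case above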
newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
newtext = newtext.replace('\r\n', '\n') # wikipedia brain-damage
if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything
# wikipedia.showDiff(text, newtext)
# update cache with links read:
# if not act: iwadd(title, oldlinks.keys())
# some change, write it
if act:
fixed += 1
naptime /= 2
print " ... updating %s: %s" % (safe(title), safe(act).strip("'"))
# try to fix the entry
try:
utext = getedit(page)
# utext = page.get()
if utext != oldtext:
print "page changed during attempted update"
continue
wikipedia.setAction(act)
page.put(newtext)
# no cache update
# iwadd(title, links.keys())
except wikipedia.EditConflict:
print "Edit conflict?"
continue
except wikipedia.PageNotSaved:
print "failed to save page"
# other action?
continue
except wikipedia.NoPage:
print "Can't get %s from en.wikt?" % safe(page.aslink())
continue
except wikipedia.IsRedirectPage:
print "Redirect page now?"
continue
except socket.timeout:
print "socket timeout, maybe not saving page"
continue
except socket.error:
print "socket error, maybe not saving page"
continue
except KeyError:
# annoying local error, from crappy framework code
print "KeyError"
time.sleep(200)
continue
# limit number of fixes for testing
# if fixed > 7: break
# pace
if naptime > maxnap: naptime = maxnap
"""
if naptime > 4:
print "sleeping %d seconds" % naptime
time.sleep(naptime)
"""
continue
#
# print "%d entries, %d possible, %d updated" % (entries, probs, fixed)
# done
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()