#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Interwicket/code/reciprocal

"""
This bot updates iwiki links between wiktionaries

26.1.9: try adding reciprocals; can then use this in full run?

This process checks for the "Interwicket" user on the FL wikt, tries to log in,
create user page, check user status, and create a reciprocal link to match an
en.wikt link just added (or about to be added)
"""

import wikipedia
import sys
import socket
import re
import pickle
from time import time, strftime, gmtime, sleep
from mwapi import getwikitext, getedit, readapi, putedit
from iwlinks import getiwlinks, replaceiwlinks

# borrow global:
from config import usernames

import Queue
import threading

# NOTE(review): this copy of the file had its line breaks collapsed and every
# "[...]" span stripped (subscripts, list literals, regex character classes).
# Restored subscripts are marked below where the value could not be derived
# from the surrounding code -- confirm those against the upstream source.

toreplink = Queue.Queue()   # was 35, soft limit
repinit = False
rthread = None

# with plock: print lock, acquired around all print statements, caller can use
# to avoid munging lines together
plock = threading.Lock()


def srep(s):
    """Return the repr() of s coerced to unicode, for console-safe printing."""
    return repr(u'' + s)


def safe(s):
    """Alias of srep(): a printable, escape-only rendering of s."""
    return srep(s)


class ufo:
    """Trivial attribute bag: ufo(a=1, b=2) gives an object with .a and .b."""

    def __init__(self, **k):
        # each attribute gets its own value, not the whole keyword dict
        for a in k:
            setattr(self, a, k[a])


class FLwikt:
    """Per-wiktionary state for one language code.

    Attributes set here and read elsewhere in this file: lc, lastcheck,
    status, userpage, mainpage, edits, limit, newikt, lockedwikt, deletecode,
    tbd, site, redirs, attop, oneline, sortorder, nolink.
    """

    def __init__(self, code):
        self.lc = code
        self.lastcheck = None
        self.status = None
        self.userpage = False
        self.mainpage = ''
        # for test mode:
        self.edits = 0
        self.limit = 2.0
        self.newikt = True
        self.lockedwikt = False
        self.deletecode = False
        self.tbd = -1   # meaning "not known", 0 is valid

        # other options; set before the site lookup so that an instance for an
        # invalid code still carries every attribute
        self.redirs = None   # may be None, False, or True, can be tested either way
        self.attop = False
        self.oneline = False
        self.sortorder = ''
        # specific pairings not to link
        self.nolink = []

        try:
            # getting site will throw exceptions for unknown code(s)
            self.site = wikipedia.getSite(code, "wiktionary")
            if code in self.site.family.obsolete:
                self.lockedwikt = True
        except Exception:
            print("(code %s is not valid)" % code)   # avoid plock I think
            self.site = None         # should not be referenced?
            self.lockedwikt = True   # or doesn't exist at all
            return                   # rest of this is invalid

        # see if we have a login in user config, else invent it
        # NOTE(review): the subscripts were stripped from this copy;
        # 'wiktionary' is assumed to be the family key -- confirm.
        if code not in usernames['wiktionary']:
            usernames['wiktionary'][code] = "Interwicket"

        # now decode what is in the family, so we can list it out
        # (we don't do anything with it!)
        family = self.site.family
        if self.site.language() in family.interwiki_attop:
            self.attop = True
        if self.site.language() in family.interwiki_on_one_line:
            self.oneline = True

        pf = self.site.interwiki_putfirst()
        if pf:
            if pf == family.alphabetic:
                self.sortorder = 'alpha by language name'
            elif pf == family.alphabetic_revised:
                self.sortorder = 'alpha by language name (revised)'
            elif pf == family.fyinterwiki:
                self.sortorder = 'code in special fy order'
            elif pf == family.dodde_order:
                self.sortorder = 'Dodde order'
            else:
                self.sortorder = '%s first' % (','.join(pf))

        # put these right here for now: (;-)
        if code == 'pl':
            self.nolink.append('ru')

        # if code in [...]: self.redirs = True
        # (the list of codes in the line above is missing from this copy)


class FLdict(dict):
    """Dict of FLwikt by language code; missing codes are created on first use."""

    def __init__(self):
        pass

    def __missing__(self, code):
        # store the new entry so that later lookups return the same object
        self[code] = FLwikt(code)
        return self[code]


# so we can just reference the dictionary (;-)
flws = FLdict()   # FLwikt by code

# Note: It is very important that flw's are NOT created for things that aren't iwiki codes!
# This takes some care on the part of calling code.
# NOTE(review): this copy of the file had its line breaks collapsed and every
# "[...]" span stripped. Subscripts, list literals and regex character classes
# below are reconstructed; items that could not be derived from the
# surrounding code are marked -- confirm them against the upstream source.

redits = re.compile(r'editcount="(\d+)"')

# noflag hack
noflagtext = None
noflaglast = 0
redirtext = None
redirlast = 0

# NOTE(review): the character class after "mainpage" was stripped; [^>] assumed
remainpage = re.compile(r'<message name="mainpage"[^>]*>(.*?)</message>')

# safety; this was not written to be re-entrant, probably is okay, but is simple to prevent
# there is a lot of lock contention here when a process like mbwa starts
gfslock = threading.Lock()

# NOTE(review): the three status lists used in getflstatus() were stripped
# from this copy. These values are a best-effort reconstruction from the
# statuses this file assigns ('bot', 'noflag', 'test', 'blocked', 'missing',
# 'exception') -- confirm before relying on them.
_USABLE_STATUSES = ("bot", "noflag", "test")
_UNSETTLED_STATUSES = ("exception", "missing")


def getflstatus(flw, nowrite = False):
    """Refresh and return the bot's status on one wiktionary.

    flw     -- the FLwikt to check; its status, edits, redirs, mainpage, limit,
               lastcheck and newikt attributes are updated in place
    nowrite -- when true, skip the status-table update on a first successful
               check (was is None)

    Returns flw.status: 'blocked' for a locked wikt, the cached value if the
    last check was under four hours ago, otherwise the freshly determined one
    ('exception' if login or the API read failed).
    """
    global noflagtext, noflaglast, redirtext, redirlast

    # before taking lock, can we just tell caller the status for this one?
    if flw.lockedwikt:
        flw.status = 'blocked'
        with plock:
            print("(wikt %s is locked)" % flw.lc)
        return flw.status

    # four hours for now
    if flw.lastcheck and flw.lastcheck > time() - (4 * 3600):
        return flw.status

    with gfslock:
        was = flw.status
        # if not a good status, start with test; in particular change exception to test
        if flw.status not in _USABLE_STATUSES:
            flw.status = "test"

        # check logged in (or not)
        # we need try/except here, take keyboard interrupt and make it status = 'exception'
        # anything else thrown will get handled
        try:
            # take print lock around this, will stall other threads,
            # we may need to respond to login prompt, and it will print messages
            with plock:
                flw.site.forceLogin()
        except KeyboardInterrupt:
            with plock:
                print("Keyboard interrupt, skipping this wikt")
            flw.status = 'exception'
            return flw.status
        except Exception as e:
            flw.status = 'exception'
            with plock:
                print("exception trying to login on %s: %s" % (flw.lc, str(e)))
            return flw.status

        try:
            ustat = readapi(flw.site,
                            "action=query&meta=userinfo"
                            "&uiprop=blockinfo|rights|editcount&format=xml")
        except Exception as e:
            with plock:
                print("exception trying to read user status from %s.wikt: %s"
                      % (flw.lc, str(e)))
            flw.status = "exception"
            return flw.status

        # edit count?
        mo = redits.search(ustat)
        if mo:
            flw.edits = int(mo.group(1))

        # we can be bot, or blocked, or not known:
        if "<r>bot</r>" in ustat:
            flw.status = "bot"
        if "blockedby=" in ustat:
            flw.status = "blocked"   # over-rides "bot", as it can be both
        if "missing=" in ustat:
            flw.status = "missing"   # ? can get here now?

        marker = "* '''" + flw.lc + "'''"

        # noflag hack
        if flw.status == 'test':
            if not noflagtext or noflaglast < time() - 3600:   # just pick up once an hour
                try:
                    nfp = wikipedia.Page(flws['en'].site, "User:Interwicket/noflags")
                    with plock:
                        print('(reading noflags list)')
                    noflagtext = getwikitext(nfp, plock = plock)
                    noflaglast = time()
                except Exception as e:
                    # best effort: fall through and use previous file text
                    with plock:
                        print("some exception getting noflag %s" % str(e))
            if noflagtext and marker in noflagtext:
                flw.status = 'noflag'

        # dyn pickup of redir configuration:
        if not redirtext or redirlast < time() - 3600:   # just pick up once an hour
            try:
                rdp = wikipedia.Page(flws['en'].site, "User:Interwicket/redirs")
                with plock:
                    print('(reading redirs list)')
                redirtext = getwikitext(rdp, plock = plock)
                redirlast = time()
            except Exception as e:
                # best effort: fall through and use previous file text
                with plock:
                    print("some exception getting redirs list %s" % str(e))
        if redirtext:
            if marker in redirtext:
                flw.redirs = True
            else:
                flw.redirs = None   # we don't use the "False" state at present

        # find main page title from WM "message":
        try:
            mtext = readapi(flw.site,
                            "action=query&meta=allmessages&ammessages=mainpage&format=xml")
            mo = remainpage.search(mtext)
            flw.mainpage = mo.group(1)   # no match raises, handled just below
        except Exception:
            flw.mainpage = '(exception)'

        if flw.status == 'test':
            if flw.lastcheck:
                # allow one more every 80 minutes (4800 seconds)
                flw.limit += (time() - flw.lastcheck) / 4800.0
                # don't accumulate too much quota
                flw.limit = min(flw.limit, flw.edits + 3.0)
            else:
                flw.limit = flw.edits   # initial state on most runs, allows one

        if flw.status != was:
            with plock:
                print("(status on %s.wikt is %s)" % (flw.lc, flw.status))
        if flw.status != 'exception':
            flw.lastcheck = time()

        # if nowrite, we are done for now (e.g. used by mbwa in initialization)
        if nowrite and flw.status in _USABLE_STATUSES and was is None:
            return flw.status
        if flw.newikt and flw.status not in _UNSETTLED_STATUSES:
            flw.newikt = False   # set up complete

    # (release gfslock)

    if flw.status != was or flw.newikt:
        updstatus(flw)
    return flw.status


# add or update user page on the FL wikt:

# NOTE(review): the wiki link targets in the two texts below were stripped from
# this copy (only a trailing "]" remains of each) and the original line breaks
# are lost; the breaks here are reconstructed. Restore the links from upstream.
userpage = """'''Wiktionary interwiki 'bot'''

User "Interwicket" is the 'bot that adds interwiki (inter-language) links to entries. It is designed for the Wiktionaries.
It is not the "wikipedia bot", it is much more efficient. It operates only in the main namespace (NS:0).

Here, user "Interwicket" will add links to all of the other wiktionaries when needed.

* If user "Interwicket" is blocked here, it will not edit (of course)
* If user "Interwicket" is given a bot flag here, it will add iwikis whenever needed

Otherwise it will operate in a test mode, doing only a very few edits, that can then be checked (by me, and by anyone else).
Most of the possible updates will not be done because of this limit.

:Discussion page for Interwicket is ].
:Code is at ].
:Status, number of edits, etc for each wikt at ].
:My talk page is ].

Finally, my sincere apologies for writing this message only in English!<!--

note that all of the text in this page is re-written by the 'bot; it is pointless to edit it.
Any templates added at the top or categories and iwikis at the bottom will be left

-->
"""

noflag = """
----
The bot has been configured to run here without a bot flag, but at full rate, '''not in test mode'''.
This is done for some small or inactive wiktionaries. If you are a user or admin here and would like
to see it flagged, please note on ] and I will resolve it.

It is sometimes hard to find the bot flag request page on various wikts; if you have one and I have not
added a request, please write me a note on ] with a link!

I strongly suggest that this wiktionary subscribe to one or both of the automatic approval policy or
global bot policy. Please see ].

Feel free to ask me any questions. ]
"""


def adduserpage(flw):
    """Write (or rewrite) User:Interwicket on the wikt described by flw.

    Does nothing for 'en' or when flw.userpage is already set. Templates found
    in the existing page are kept at the top and link-only lines at the bottom.
    On a block, flw.status becomes 'blocked' and the status table is updated.
    After a successful write flw.userpage is set and a 'missing' status is
    promoted to 'test'.
    """
    if flw.lc == 'en':
        flw.userpage = True
    if flw.userpage:
        return

    page = wikipedia.Page(flw.site, "User:Interwicket")
    try:
        op = getedit(page, plock = plock)
    except wikipedia.NoPage:
        op = ''
    except wikipedia.UserBlocked:
        flw.status = 'blocked'
        updstatus(flw)
        with plock:
            print("apparently blocked on %s / wikt may be locked" % flw.lc)
        return
    except Exception as e:
        with plock:
            # safe() as in the write path below, so odd titles cannot break the print
            print("exception trying to read %s: %s" % (safe(page.aslink()), str(e)))
        return

    wikipedia.setAction("writing user page")

    # if templates added at top of (whereever) the page (bot template, or placeholder)
    # and cats, iwikis at end (if one per line, etc), contain ':'
    # NOTE(review): the second pattern was stripped down to r'^\]$' in this
    # copy; reconstructed from the comment above -- confirm.
    templates = re.findall(r'\{\{.*?}}', op)
    trailers = re.findall(r'^\[\[.*?:.*?\]\]$', op, re.M)
    utext = (u'\n'.join(templates) + '\n\n' + userpage + '\n\n'
             + u'\n'.join(trailers)).strip('\n ')

    if flw.status == "noflag":
        utext += noflag

    try:
        page.put(utext)
        flw.userpage = True
    except Exception as e:
        with plock:
            print("exception trying to write %s: %s" % (safe(page.aslink()), str(e)))
        return

    if flw.status == "missing":
        flw.status = "test"
    # trying to re-read status won't work for a while!
# NOTE(review): this copy of the file had its line breaks collapsed and every
# "[...]" span stripped. Subscripts, list literals and regex character classes
# below are reconstructed; items that could not be derived from the
# surrounding code are marked -- confirm them against the upstream source.

# add a log entry, so we don't lose these in testing
# temporary, although might be expanded and kept

loglines = []
loglock = threading.Lock()


def addlog(link, action):
    """Buffer one log line; once 20 are saved, write them to the FL log page.

    link   -- wiki link text for the page edited
    action -- the edit summary used

    Buffered lines are inserted newest-first after each '----' line of
    User:Interwicket/FL log, and the page is cut off 179 lines after the last
    such marker. The buffer is cleared only after a successful write.
    """
    with loglock:
        # save up 20 to do in one edit:
        # NOTE(review): this expression was damaged in this copy (an
        # unterminated string before "+ action"); link placement is assumed.
        loglines.append('* ' + strftime("%d %B %H:%M", gmtime())
                        + ' ' + link + ' ' + action)
        if len(loglines) < 20:
            return

        # reversed copy: a failed write must not leave the buffer reordered
        newest_first = loglines[::-1]

        try:
            page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL log")
            text = getedit(page, plock = plock)
            k = 0
            out = []
            for line in text.splitlines():
                out.append(line)
                if line == '----':
                    out.extend(newest_first)
                    k = 1
                    continue
                if k:
                    k += 1
                if k > 180:
                    break
            newt = ''.join(l + '\n' for l in out)
            putedit(page, newt, comment = "log entry " + link, plock = plock)
            del loglines[:]
        except wikipedia.NoPage:
            pass   # no log page on this wiki: keep buffering
        except Exception as e:
            with plock:
                print("exception writing log entry %s" % str(e))


# update status table
# re-entrant, but might edit-conflict with itself or elide edits (has been noted)

updlock = threading.Lock()


def updstatus(flw):
    """Rewrite this wikt's row in User:Interwicket/FL status if it changed.

    The existing row is compared on status, date, to-be-done count and notes;
    when none differ nothing is written. Otherwise the table is regenerated
    from the other existing rows plus a fresh row for flw.
    """
    if flw.lockedwikt:
        return   # no point in listing

    with updlock:
        try:
            page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL status")
            text = getedit(page, plock = plock)

            notes = ''
            if flw.redirs is True:
                notes += 'link to redirects, '
            if flw.redirs is False:
                notes += 'no links to redirects, '
            if flw.attop:
                notes += 'iwikis at top, '
            if flw.oneline:
                notes += 'on one line, '
            if flw.nolink:
                notes += 'no links to %s added, ' % (",".join(flw.nolink))
            if flw.sortorder:
                notes += 'sort %s, ' % flw.sortorder
            notes = notes.rstrip(", ")

            # day number used to provide an invisible sort key in date column
            daynumber = "%04d" % (time()/86400 - 14700)   # days since about 1 April 2010
            today = ('<span style="display:none;">' + daynumber + '</span>'
                     + strftime(" %d %B", gmtime()).replace(' 0', ' '))

            if flw.tbd >= 0:
                tbdtext = "%d" % flw.tbd
            else:
                tbdtext = ''

            lines = []
            for line in text.splitlines():
                # keep the old lines we want:
                if not line.startswith("| "):
                    continue
                if "'''" + flw.lc + "'''" in line:
                    parts = line.split('||')   # (first will have the leading |)
                    if len(parts) < 7:
                        continue               # (bad line? will replace it)
                    # NOTE(review): the column indices were stripped from this
                    # copy; 2/4/5/6 follow the row format written below
                    # (code, language, status, edits, date, tbd, notes, main page)
                    uf = False
                    if parts[2].strip() != flw.status:
                        uf = True
                    if not parts[4].strip().startswith(today):
                        uf = True
                    if tbdtext:
                        if parts[5].strip() != tbdtext:
                            uf = True
                    else:
                        tbdtext = parts[5].strip()   # keep what was there
                    if parts[6].strip() != notes:
                        uf = True
                    # if not worth updating, we are done
                    if not uf:
                        return
                    # else elide this line, to be regenerated
                    continue
                lines.append(line)

            lines.append(
                "| '''%s''' || {{%s|l=}} || %s || %d || %s {{subst:CURRENTTIME}} || %s || %s || %s"
                % (flw.lc, flw.lc, flw.status, flw.edits, today, tbdtext, notes, flw.mainpage))

            text = """{{/header}}

{| class="wikitable sortable"
! code
! language
! status
! edits
! as of
! to be done
! width = 25% | notes
! main page
|-
""" + '\n|-\n'.join(sorted(lines)) + """
|}
"""
            putedit(page, text, comment = "update status for " + flw.lc, plock = plock)

        except wikipedia.NoPage:
            pass   # no status page: nothing to update
        except Exception as e:
            with plock:
                print("exception writing status table %s" % str(e))


# main event:

def addrci(page, mysite, links = None, redirs = None, skips = None, remove = False):
    """Queue a reciprocal-link update for one foreign-language page.

    page to add to localsite (to be always added)
    links is a dict of pages for all other links
    redirs is a dict of pages of other links that are redirects (i.e. subset of links)

    will add missing links not in redirs, will add if in redirs and allowed on FL.wikt
    will remove links that are not in links
    does not add or remove anything in skips

    only removes anything if remove; with incomplete list call with remove False

    Returns None; nothing is queued when the bot is blocked, has no usable
    status on that wikt, or has reached its test-mode edit limit.
    """
    links = {} if links is None else links
    redirs = {} if redirs is None else redirs
    skips = [] if skips is None else skips

    # NOTE(review): subscript stripped from this copy; the wikt of `page` is assumed
    flw = flws[page.site().language()]

    if getflstatus(flw) == "blocked":
        return   # no kidding ...

    # if not blocked, try writing/overwriting user page, could do on "missing" but
    # we want to update it on new runs
    # useful access confirmation anyway
    if not flw.userpage:
        adduserpage(flw)

    # valid status?
    # NOTE(review): list stripped from this copy; reconstructed -- confirm
    if flw.status not in ("bot", "noflag", "test"):
        return

    # test limit per run
    if flw.status == "test" and flw.edits > flw.limit:
        with plock:
            print("(edit limit reached for %s)" % flw.lc)
        return

    mypage = wikipedia.Page(mysite, page.title())
    links = links.copy()   # shallow copy
    # NOTE(review): subscript stripped from this copy; keyed by mysite's code (assumed)
    links[mysite.language()] = mypage

    # now drop the request into a layer of threading magic:
    replink(page = page, links = links, redirs = redirs, skips = skips, remove = remove)
    return


# number of worker threads started by replink(); also the number of extra end
# markers queued on shutdown
_REPTHREADS = 4


class _RepTask(object):
    """One queued unit of work for replinks(); end=True marks shutdown."""

    def __init__(self, page, links, redirs, skips, remove, end):
        self.page = page
        self.links = links
        self.redirs = redirs
        self.skips = skips
        self.remove = remove
        self.end = end


def replink(page = None, links = None, redirs = None, skips = None, remove = False,
            end = False):
    """Queue one reptask() call for the worker threads, starting them on first use.

    call replink(end = True) to finish up and exit
    this can be called from outside addrci (and I expect it to be)
    """
    global repinit, rthread

    if not repinit:
        if end:
            return   # no need to start
        for i in range(1, _REPTHREADS + 1):
            rthread = threading.Thread(target=replinks)
            rthread.name = 'add replinks %d' % i
            rthread.start()
        repinit = True

    rtask = _RepTask(page,
                     {} if links is None else links,
                     {} if redirs is None else redirs,
                     [] if skips is None else skips,
                     remove, end)

    if not rtask.end:
        sleep(toreplink.qsize())   # soft q limit
    toreplink.put(rtask)
    if rtask.end:
        # make sure we have one per thread, extras do not matter
        for i in range(_REPTHREADS):
            toreplink.put(rtask)


def replinks():
    """Worker loop: run reptask() for each queued task until an end marker."""
    with plock:
        print("(rep link thread started)")

    while True:
        rtask = toreplink.get()
        if rtask.end:
            break
        reptask(page = rtask.page, links = rtask.links, redirs = rtask.redirs,
                skips = rtask.skips, remove = rtask.remove)

    with plock:
        print("(rep link thread ended)")


# shortens [[code:title]] to code when echoing the edit summary
# NOTE(review): character class stripped from this copy; [a-z-] assumed
rewpr = re.compile(r'\[\[([a-z-]+):.*?\]\]')

# remove count page
# NOTE(review): character class stripped from this copy; [^}] assumed
recountpage = re.compile(r'\{\{count page\|[^}]+\}\}\n?')

ticktock = threading.Lock()
reptick = 10.0   # default


def setreptick(rt):
    """Set the pause, in seconds, taken under ticktock before each edit."""
    global reptick
    reptick = rt


def reptask(page = None, links = None, redirs = None, skips = None, remove = False):
    """Bring the iwiki links on one page into line with links, and save it.

    page   -- page to edit; the call is a no-op when this is empty
    links  -- dict by language code of the links wanted
    redirs -- subset of links that are redirects; added only if flw.redirs
    skips  -- codes that are neither added nor removed
    remove -- also drop existing links whose code is not in links

    Read and write failures are retried with a growing sleep (starting at 5
    seconds) until the sleep would reach 300 seconds. Returns None.
    """
    global reptick

    # now we have emerged from the thread magic, continue as before (:-)
    if not page:
        return   # (?)

    links = {} if links is None else links
    redirs = {} if redirs is None else redirs
    skips = [] if skips is None else skips

    # NOTE(review): subscript stripped from this copy; the wikt of `page` is assumed
    flw = flws[page.site().language()]

    # we may already have page text, so use page given to us
    # some retry logic:
    done = False
    nap = 5
    while not done and nap < 300:
        try:
            text = getwikitext(page, plock = plock)
        except wikipedia.NoPage:
            with plock:
                print(" ... no page %s now" % safe(page.aslink()))
            break
        except wikipedia.IsRedirectPage:
            with plock:
                print(" ... page %s is a redirect?" % safe(page.aslink()))
            break
        except Exception as e:
            with plock:
                print(" ... some exception reading %s %s"
                      % (safe(page.aslink()), repr(e)))
            sleep(nap)
            nap += nap // 2
            continue

        oldlinks = getiwlinks(text, flws)

        # small optimization:
        if not links and not oldlinks:
            break   # no links, none in entry

        # block edits to "main page" (!)
        title = page.title()
        if title == flw.mainpage:
            with plock:
                print(" ... not updating %s, wikt main page" % safe(page.aslink()))
            break

        # add/remove links
        act = "iwiki"

        # bad links, we seem to find a few, not infrequently
        # (page moves, people adding links)
        act += ' -'
        for code in list(oldlinks):   # snapshot: entries are deleted in the loop
            if oldlinks[code] != title or code == flw.lc:
                if len(act) < 70:
                    act += '[[%s:%s]], ' % (code, oldlinks[code])
                else:
                    act += code + ', '
                del oldlinks[code]   # will add valid link in next step if present
        act = act.rstrip(', -')

        act += " +"
        for code in sorted(links):
            # but not target page:
            if code == flw.lc:
                continue
            if code in flw.nolink:
                continue   # e.g. pl->ru
            if (code not in oldlinks and (flw.redirs or code not in redirs)
                    and code not in skips):
                if len(act) < 70:
                    act += '[[%s:%s]], ' % (code, title)
                else:
                    act += code + ', '
                oldlinks[code] = title
        act = act.rstrip(', +')

        if remove:
            act += ' -'
            for code in sorted(oldlinks):
                if code not in links and code not in skips:
                    act += code + ', '
                    del oldlinks[code]
            act = act.rstrip(', -')

        if act == "iwiki":
            break   # nothing was done

        newtext = replaceiwlinks(text, oldlinks, flw, flws)

        # special case for en.wikt, remove count page if we've added an iwiki:
        # leave odd variants to AF as before
        if flw.lc == 'en' and "+" in act and '{{count page|' in newtext:
            newtext, k = recountpage.subn('', newtext)
            if k:
                act += ", -{{count page}}"

        # pace to max rate, take lock and sleep
        with ticktock:
            sleep(reptick)

        try:
            if text != getedit(page, plock = plock):
                with plock:
                    print("page changed during edit? %s"
                          % srep(page.aslink(forceInterwiki = True)))
                continue   # try this again

            putedit(page, newtext, comment = act, plock = plock)
            done = True
            flw.edits += 1

            if flw.status == "test" or (" -" in act and "-{" not in act):
                addlog(page.aslink(), act)

            with plock:
                print(" ... %s %s" % (srep(page.aslink(forceInterwiki = True)),
                                      srep(rewpr.sub(r'\1', act))))
        except Exception as e:
            # e.g. not another box reset, do report on 3rd failure
            if nap > 9 or '10054' not in repr(e):
                with plock:
                    print(" ... some exception trying to update %s %s"
                          % (safe(page.aslink()), str(e)))
            sleep(nap)
            nap += nap // 2
            continue

    return


if __name__ == "__main__":

    # init all the flws, getiwlinks relies on this
    for code in flws['en'].site.family.langs:
        foo = flws[code]
    # production calls from mbwa init all of them

    # test
    with plock:
        print("test FL get status")
    # NOTE(review): the language code was stripped from this copy; 'mg' is
    # assumed because the next step edits the mg wikt
    valid = getflstatus(flws['mg'])

    with plock:
        print("test add en to chat on mg")
    page = wikipedia.Page(flws['mg'].site, "chat")
    addrci(page, flws['en'].site)

    """
    # other tests (disabled; the language codes were stripped from this copy and
    # are restored here only where the message names the wikt):

    # test add userpage
    with plock: print "test add user page"
    adduserpage(flws[...])

    # test add
    # flws[...].tbd = 17
    with plock: print "test add en to cat on sw"
    page = wikipedia.Page(flws['sw'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to Mwanzo (main page) on sw"
    page = wikipedia.Page(flws['sw'].site, "Mwanzo")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on pl"
    page = wikipedia.Page(flws['pl'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on vi"
    page = wikipedia.Page(flws['vi'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on sw, links fr, vi"
    page = wikipedia.Page(flws['sw'].site, "cat")
    links = { 'fr':wikipedia.Page(flws['fr'].site, "cat"),
              'vi':wikipedia.Page(flws['vi'].site, "cat") }
    redirs = { }
    addrci(page, flws['en'].site, links = links, redirs = redirs)
    # should not change any entry

    # now fix foo
    with plock: print "test fix foo on en"
    page = wikipedia.Page(flws['en'].site, "foo")
    addrci(page, flws['en'].site)

    # "locked" wikt:
    with plock: print "test add en to father on as"
    page = wikipedia.Page(flws['as'].site, "father")
    addrci(page, flws['en'].site)

    # rm bad link
    with plock: print "test add en to septendecim on ko"
    page = wikipedia.Page(flws['ko'].site, "septendecim")
    addrci(page, flws['en'].site)
    """

    replink(end = True)