This is "stabilized" and I don't use it; see User:Interwicket/code/mbwa. If mbwa is given exactly one langlinks file to work with, it will do what this code will do, albeit in a different order. Provided here only because it was here; I may delete this page presently. Robert Ullmann 17:01, 10 February 2009 (UTC)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot updates iwiki links between wiktionaries

2.2.9: read langlinks.sql.gz dump, compare to union-index, re-evaluate conflicts

Run with options:

-langlinks:(filename)  it is not necessary to specify the "-langlinks.sql.gz"
                       ending; overridden by -date if that is used

-home:(code)

-redir                 add links to redirects on this wikt; important to get right,
                       as it will otherwise remove desired links to redirects
                       (not so yet)

-date:(date)           reads file "langlinks/(home)wiktionary-(date)-langlinks.sql.gz"
"""

# (some of these imports are leftovers from the source this was adapted from)
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import choice
from mwapi import getwikitext, getedit
from reciprocal import addrci

# borrow global:
from config import usernames

def safe(s):
    return pickle.dumps(s)

import shelve

# use hunt routine in iwiktrc, should be able to maintain the same?
from iwiktrc import hunt, Exists, Lcode, site, naps
# things used by hunt(), use same copies!

def now(): return int(time.clock())

# read language links file, sort internally (not in the order we'd like ;-)
# compare to union index, yield titles that do not match, with language codes to hunt

import gzip
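# The langlinks dump is a series of SQL INSERT statements; each tuple is
# (ll_from, ll_lang, ll_title), for example a fragment like
#
#   INSERT INTO `langlinks` VALUES (12345,'fr','dictionnaire'),(67890,'de','W\xc3\xb6rterbuch');
#
# (illustrative values, not taken from a real dump). retuple in llfile()
# below pulls out ('12345', 'fr', 'dictionnaire') and so on; pageids are
# ignored, and titles containing ':' (other namespaces) are skipped.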
def llfile(home = '', filename = '', redirs = False):

    if not filename: return

    # dict of links, to sort out from file
    # entries are pageid, code, link title
    # pageid is mostly useless to us, link title is pagename presumably
    # so dict of sets

    links = { }
    retuple = re.compile(r"\((\d*?),'(.*?)','(.*?)'\)")

    print "reading file", filename
    f = gzip.open(filename, 'rb')

    leftover = ''
    while True:
        content = f.read(4096)
        if not content: break
        content = leftover + content

        # find a break not in UTF-8
        i = content.rfind("');")                # at end, must check first
        if i < 0: i = content.rfind("'),")      # usual case
        if i < 0:
            leftover = content
            continue # at end or need to read some more
        leftover = content[i+3:]   # [subscripts lost in the wiki text; reconstructed]
        content = content[:i+3]

        content = unicode(content, 'utf-8', 'ignore')

        for tuple in retuple.findall(content):
            # print repr(tuple)
            pid, lc, title = tuple
            if ':' in title: continue
            if not title: continue
            title = title.replace(r"\'", "'") # SQL escape, we've matched ' only before , or )
            if title not in links: links[title] = set()
            links[title].add(lc)

    f.close()
    print "read links for %d titles" % len(links)

    # now we have all the links, compare to union index

    Uix = shelve.open("union-index")
    # Uix = { } # testing w/o union index

    for title in sorted(links):
        if repr(title) in Uix:
            t, ul, ur = Uix[repr(title)]
        else: ul = ur = ''
        # print repr(title), "LL:", repr(links[title]), "UNION:", repr(ul), "UREDIR:", repr(ur)

        if redirs: ul += ur

        # compare links to ul, should match
        # first add home to ll, then it should be identical
        ll = links[title]
        ll.add(home)

        # if not redirs, but some present, is okay (at this point):
        if not redirs and ur:
            for lc in ur: ll.discard(lc)
            # (also no point in trying to read them in hunt ;-)

        if sorted(ll) != sorted(ul):
            # [the printed lists were lost in the wiki text; reconstructed:]
            print " in LL, not in UNION:", [lc for lc in sorted(ll) if lc not in ul]
            print " in UNION, not in LL:", [lc for lc in sorted(ul) if lc not in ll]
            lcs = set(ul)
            lcs.discard(home)
            yield title, lcs, ur
        else:
            print "(%s matches)" % repr(title)

    Uix.close()

def main():

    socket.setdefaulttimeout(40)

    home = 'en'
    langlinks = ''
    addredirs = False
    fdate = ''

    for arg in sys.argv[1:]:
        if arg.startswith('-langlinks:'):
            langlinks = arg[11:]
            if not langlinks.endswith("-langlinks.sql.gz") and '.' not in langlinks:
                langlinks += "-langlinks.sql.gz"
            print "reading langlinks file %s" % langlinks
        elif arg.startswith('-date:'):
            fdate = arg[6:]
        elif arg.startswith('-home:'):
            home = arg[6:]
            print "home wikt is %s" % home
        elif arg.startswith('-redir'):
            addredirs = True
            print "add links to redirects"
        else: print "unknown command line argument %s" % arg

    if fdate:
        langlinks = "langlinks/" + home + "wiktionary-" + fdate + "-langlinks.sql.gz"
        print "reading langlinks file %s" % langlinks
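    # Example invocations (the script name llcheck.py is hypothetical;
    # option forms are as documented in the module docstring):
    #   python llcheck.py -langlinks:enwiktionary-20090201
    #       reads enwiktionary-20090201-langlinks.sql.gz
    #   python llcheck.py -home:fr -date:20090201 -redir
    #       reads langlinks/frwiktionary-20090201-langlinks.sql.gz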
    mysite = wikipedia.getSite(home, 'wiktionary')

    # make sure we are logged in
    mysite.forceLogin()

    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get active wikt list
    # minus crap. Tokipona? what are they thinking? Klingon? ;-)
    Lstops = set(['tokipona', 'tlh'])   # [list lost in the wiki text; reconstructed from the comment above]

    page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    existtab = page.get()

    """ entry looks like:
| [[:xx:|Xxxxx]]
| [[...]]
| [http://xx.wiktionary.org]
    """

    # reextab = re.compile(r'^([a-z-]+):')
    # reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
    # [the bracketed parts of these regexes were lost in the wiki text; reconstructed approximately]
    reextab = re.compile(r'^\| \[\[.*\|(.*)\]\]\n'
                         r'^\| .*\n'
                         r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)

    for mo in reextab.finditer(existtab):
        if mo.group(2) in Lstops: continue
        Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)  # [subscript lost; presumably maps displayed name to code]

        # see if we have a login in user config, else pretend we do
        # has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"

    print "found %d active wikts" % len(Exists)
    if len(Exists) < 150: return  # sanity check: didn't parse the table

    for lc in Exists:
        site[lc] = wikipedia.getSite(lc, "wiktionary")
        naps[lc] = 0 # nil, might be referenced by hunt()

    # naps ... ;-)
    naptime = 0
    maxnap = 70

    # now look for iwikis needed

    entries = 0
    probs = 0
    fixed = 0

    for title, lcs, urs in llfile(home = home, filename = langlinks, redirs = addredirs):

        if ':' in title: continue # redundant, but eh?
        if title.lower() == 'main page': continue
        entries += 1

        print "%s:%s" % (home, safe(title))

        # structure of code here is leftover from source (;-)
        tag = True

        # now see if it is something that should be tagged/replaced:

        if tag:
            probs += 1
            naptime += 1

            # ... pick up current version from en.wikt
            # print '%s is possible update, getting current entry' % safe(title)
            try:
                page = wikipedia.Page(mysite, title)
                # text = page.get()
                text = getwikitext(page)
                oldtext = text
            except wikipedia.NoPage:
                print " ... %s not in %s.wikt" % (safe(page.title()), safe(home))
                text = ''
            except wikipedia.IsRedirectPage:
                print " ... redirect page"
                text = ''
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue
            if not text: continue

            act = ''

            linksites = wikipedia.getLanguageLinks(text)
            ls = [s.lang for s in linksites]  # [reconstructed] list of iwikis in entry

            # should match lcs, if not, we need to update

            if sorted(ls) == sorted(lcs):
                print " ... is okay"
                continue

            # if not always adding redirs to this wikt, but some present, is ok
            if not addredirs:
                ok = True
                # need to remove something
                for s in ls:
                    if s not in lcs and s not in urs: ok = False
                # need to add something
                for s in lcs:
                    if s not in ls: ok = False
                if ok:
                    print " ... is okay (may have redirects)"
                    continue

            # go hunt down some iwikis, add reciprocals when needed
            # always include en, pass all other lcs

            iwikis, missing = hunt(title, text, 'en', lcs = lcs, home = home,
                                   addredirs = addredirs)

            if iwikis:
                act = "iwiki +" + ", ".join(iwikis)
            else:
                print " ... no new iwikis found"

            # remove
            rms = [ ]
            for s in ls:
                if s in missing: rms.append(s)
            if home in ls: rms.append(home) # pre-existing self-link (!)

            if rms:
                if act: act += " -"
                else: act = "iwiki -"
                act += ", ".join(sorted(rms))

            if not act: continue

            # add links,
            for lc in iwikis:
                fpage = wikipedia.Page(site[lc], title)
                linksites[site[lc]] = fpage
            for lc in rms:
                del linksites[site[lc]]

            try:
                newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
            except ValueError:
                # throws this trying to "add to self", just effing continue
                print " ... replace error in", repr(page.aslink())
                continue
            newtext = newtext.replace('\r\n', '\n') # wikipedia brain-damage

            if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything

            # wikipedia.showDiff(text, newtext)

        else: continue

        # some change, write it

        if act:
            fixed += 1
            naptime /= 2
            print " ... updating %s: %s" % (safe(title), safe(act).strip("'"))

            # try to fix the entry
            try:
                utext = getedit(page)
                # utext = page.get()
                if utext != oldtext:
                    print "page changed during attempted update"
                    continue
                wikipedia.setAction(act)
                page.put(newtext)
                # no cache update
                # iwadd(title, links.keys())
            except wikipedia.EditConflict:
                print "Edit conflict?"
                continue
            except wikipedia.PageNotSaved:
                print "failed to save page"
                # other action?
                continue
            except wikipedia.NoPage:
                print "Can't get %s from en.wikt?" % safe(page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                print "Redirect page now?"
                continue
            except socket.timeout:
                print "socket timeout, maybe not saving page"
                continue
            except socket.error:
                print "socket error, maybe not saving page"
                continue
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue

            # limit number of fixes for testing
            # if fixed > 7: break

        # pace
        if naptime > maxnap: naptime = maxnap
        """
        if naptime > 4:
            print "sleeping %d seconds" % naptime
            time.sleep(naptime)
        """

        continue

    print "%d entries, %d possible, %d updated" % (entries, probs, fixed)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
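The fiddly part of llfile() is reading the gzipped dump in fixed-size chunks while only ever decoding complete tuples: a 4096-byte read can end in the middle of a multi-byte UTF-8 sequence, so decoding is deferred until a "')," (or final "');") tuple boundary has been found. A minimal standalone sketch of the same loop, run over a made-up in-memory SQL fragment rather than a real dump:

# -*- coding: utf-8 -*-
# standalone sketch of the llfile() chunking; the data is invented, not from a dump
import re

retuple = re.compile(r"\((\d*?),'(.*?)','(.*?)'\)")

data = "INSERT INTO langlinks VALUES (1,'fr','dictionnaire')," \
       "(1,'de','Wörterbuch'),(2,'fi','sanakirja');"

links = { }
leftover = ''
for pos in range(0, len(data), 10):       # read in (absurdly small) chunks
    content = leftover + data[pos:pos+10]
    i = content.rfind("');")              # at end, must check first
    if i < 0: i = content.rfind("'),")    # usual case
    if i < 0:
        leftover = content                # no complete tuple yet, read more
        continue
    leftover = content[i+3:]              # keep the partial tail for next round
    content = unicode(content[:i+3], 'utf-8', 'ignore')  # safe: ends on a boundary
    for pid, lc, title in retuple.findall(content):
        links.setdefault(title, set()).add(lc)

print links   # three titles, one language code each

Even with a 10-byte chunk size the two bytes of "ö" are never split across a decode, which is the point of splitting on tuple boundaries instead of decoding each raw chunk.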