User:Interwicket/code/iwiktll
This is "stabilized" and I don't use it; see User:Interwicket/code/mbwa. If mbwa is given exactly one langlinks file to work with, it will do what this code will do, albeit in a different order. Provided here only because it was here; I may delete this page presently. Robert Ullmann 17:01, 10 February 2009 (UTC)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot updates iwiki links between wiktionaries
2.2.9: read langlinks.sql.gz dump, compare to union-index, re-evaluate conflicts
Run with options:
-langlinks:(filename) the "-langlinks.sql.gz" ending need not be specified;
overridden by -date if that is used
-home:(code)
-redir add links to redirects on this wikt; important to get this right,
as otherwise desired links to redirects will be removed (not so yet)
-date:(date) reads file "langlinks/(home)wiktionary-(date)-langlinks.sql.gz"
"""
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import choice
from mwapi import getwikitext, getedit
from reciprocal import addrci
# borrow global:
from config import usernames
def safe(s):
return pickle.dumps(s)[1:-5]
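# Illustrative note (not in the original routine): safe() leans on pickle's text
# protocol, which writes a unicode string as 'V' + raw-unicode-escaped text + '\np0\n.';
# slicing [1:-5] keeps just the escaped text, so e.g. safe(u'chien') gives 'chien'
# and a non-ASCII title comes out as plain backslash escapes that print on any
# console without raising UnicodeEncodeError.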
import shelve
# use hunt routine in iwiktrc, should be able to maintain the same?
from iwiktrc import hunt, Exists, Lcode, site, naps
# things used by hunt(), use same copies!
def now(): return int(time.clock())
# read language links file, sort internally (not in the order we'd like ;-)
# compare to union index, yield titles that do not match, with language codes to hunt
import gzip
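# Rough sketch of the input (assumed dump format, for illustration): the
# langlinks.sql.gz dump is a series of INSERT statements whose value tuples
# look like (pageid,'lang code','title'), e.g.
#   INSERT INTO `langlinks` VALUES (1523,'fr','chien'),(1523,'de','Hund'),...;
# the retuple regex in llfile() below extracts those three fields per tuple.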
def llfile(home = '', filename = '', redirs = False):
if not filename: return
# dict of links, to sort out from file
# entries are pageid, code, link title
# pageid is mostly useless to us, link title is pagename presumably
# so dict of sets
links = { }
retuple = re.compile(r"\((\d*?),'(.*?)','(.*?)'\)")
print "reading file", filename
f = gzip.open(filename, 'rb')
leftover = ''
while True:
content = f.read(4096)
if not content: break
content = leftover + content
# find a record boundary, so we never split inside a multi-byte UTF-8 sequence
i = content.rfind("');") # at end, must check first
if i < 0: i = content.rfind("'),") # usual case
if i < 0:
leftover = content
continue # at end or need to read some more
leftover = content[i+3:]
content = content[:i+2]
content = unicode(content, 'utf-8', 'ignore')
for tuple in retuple.findall(content):
# print repr(tuple)
pid, lc, title = tuple
if ':' in title: continue
if not title: continue
title = title.replace(r"\'", "'") # SQL escape, we've matched ' only before , or )
if title not in links: links[title] = set()
links[title].add(lc)
f.close()
print "read links for %d titles" % len(links)
# now we have all the links, compare to union index
Uix = shelve.open("union-index")
# Uix = {} # testing w/o union index
for title in sorted(links):
if repr(title) in Uix: t, ul, ur = Uix[repr(title)]
else: ul = ur = ''
# print repr(title), "LL:", repr(links[title]), "UNION:", repr(ul), "UREDIR:", repr(ur)
if redirs: ul += ur
# compare links to ul, should match
# first add home to ll, then it should be identical
ll = links[title]
ll.add(home)
# if not adding redirects, links to redirects already present are okay (at this point):
if not redirs and ur:
for lc in ur: ll.discard(lc)
# (also no point in trying to read them in hunt ;-)
if sorted(ll) != sorted(ul):
print " in LL, not in UNION:", [x for x in ll if x not in ul]
print " in UNION, not in LL:", [x for x in ul if x not in ll]
lcs = set(ul)
lcs.discard(home)
yield title, lcs, ur
else: print "(%s matches)" % repr(title)
Uix.close()
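# Illustrative usage of the generator above (hypothetical title and codes):
#   for title, lcs, ur in llfile(home = 'en', filename = 'enwiktionary-20090210-langlinks.sql.gz'):
#       print safe(title), sorted(lcs), ur
# only titles whose dump links disagree with the union index are yielded,
# e.g. (u'chien', set(['de', 'fr']), '') for an entry still missing iwikis,
# where lcs is the set of codes to hunt and ur any codes where the target is a redirect.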
def main():
socket.setdefaulttimeout(40)
home = 'en'
langlinks = ''
addredirs = False
fdate = ''
for arg in sys.argv[1:]:
if arg.startswith('-langlinks:'):
langlinks = arg[11:]
if not langlinks.endswith("-langlinks.sql.gz") and '.' not in langlinks:
langlinks += "-langlinks.sql.gz"
print "reading langlinks file %s" % langlinks
elif arg.startswith('-date:'):
fdate = arg[6:]
elif arg.startswith('-home:'):
home = arg[6:]
print "home wikt is %s" % home
elif arg.startswith('-redir'):
addredirs = True
print "add links to redirects"
else: print "unknown command line argument %s" % arg
if fdate:
langlinks = "langlinks/" + home + "wiktionary-" + fdate + "-langlinks.sql.gz"
print "reading langlinks file %s" % langlinks
mysite = wikipedia.getSite(home, 'wiktionary')
# make sure we are logged in
mysite.forceLogin()
meta = wikipedia.getSite(code = "meta", fam = "meta")
# get active wikt list
# minus crap. Tokipona? what are they thinking? Klingon? ;-)
Lstops = ['tokipona', 'tlh']
page = wikipedia.Page(meta, "List of Wiktionaries/Table")
existtab = page.get()
""" entry looks like:
| [[w:Vietnamese language|Vietnamese]]
| [[w:Vietnamese language|Tiếng Việt]]
| [http://vi.wiktionary.org/wiki/ vi]
"""
# reextab = re.compile(r'^\[\[:([a-z-]+):')
# reextab = re.compile(r'\| \[http://([a-z-]+)\.wiktionary\.org')
reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
r'^\| .*\n'
r'^\| \[http://([a-z-]+)\.wiktionary\.org', re.M)
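# Illustrative: against the sample table entry quoted above, group(1) is the
# English language name ("Vietnamese") and group(2) the wikt code ("vi"),
# so Exists gains 'vi' and Lcode maps 'Vietnamese' -> 'vi'.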
for mo in reextab.finditer(existtab):
if mo.group(2) in Lstops: continue
Exists.add(mo.group(2))
Lcode[mo.group(1)] = mo.group(2)
# see if we have a login in user config, else pretend we do
# has to be done before any call, or login status gets confused!
if mo.group(2) not in usernames['wiktionary']:
usernames['wiktionary'][mo.group(2)] = "Interwicket"
print "found %d active wikts" % len(Exists)
if len(Exists) < 150: return
for lc in Exists:
site[lc] = wikipedia.getSite(lc, "wiktionary")
naps[lc] = 0 # nil, might be referenced by hunt()
# naps ... ;-)
naptime = 0
maxnap = 70
# now look for iwikis needed
entries = 0
probs = 0
fixed = 0
for title, lcs, urs in llfile(home = home, filename = langlinks, redirs = addredirs):
entries += 1
if ':' in title: continue # redundant, but eh?
if title.lower() == 'main page': continue
print "%s:%s" % (home, safe(title))
# structure of code here is leftover from source (;-)
tag = True
# now see if it is something that should be tagged/replaced:
if tag:
probs += 1
naptime += 1
# ... pick up current version from en.wikt
# print '%s is possible update, getting current entry' % safe(title)
try:
page = wikipedia.Page(mysite, title)
# text = page.get()
text = getwikitext(page)
oldtext = text
except wikipedia.NoPage:
print " ... %s not in %s.wikt" % (safe(page.title()), safe(home))
text = ''
except wikipedia.IsRedirectPage:
print " ... redirect page"
text = ''
except KeyError:
# annoying local error, from crappy framework code
print "KeyError"
time.sleep(200)
continue
if not text: continue
act = ''
linksites = wikipedia.getLanguageLinks(text)
ls = [s.lang for s in linksites]
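# (note, assuming the old pywikipedia behaviour: getLanguageLinks() returns a
# dict keyed by Site objects, so ls is simply the list of language codes
# already linked from this entry, e.g. ['de', 'fr', 'io'])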
# list of iwikis in entry should match lcs, if not, we need to update
if sorted(ls) == sorted(lcs):
print " ... is okay"
continue
# if not always adding redirects to this wikt, ones already present are ok
if not addredirs:
ok = True
# need to remove something
for s in ls:
if s not in lcs and s not in urs: ok = False
# need to add something
for s in lcs:
if s not in ls: ok = False
if ok:
print " ... is okay (may have redirects)"
continue
# go hunt down some iwikis, add reciprocals when needed
# always include en, pass all other lcs
iwikis, missing = hunt(title, text, 'en', lcs = lcs, home = home, addredirs = addredirs)
if iwikis:
act = "iwiki +" + ", ".join(iwikis)
else:
print " ... no new iwikis found"
# remove
rms = [ ]
for s in ls:
if s in missing: rms.append(s)
if home in ls: rms.append(home) # pre-existing self-link (!)
if rms:
if act: act += " -"
else: act = "iwiki -"
act += ", ".join(sorted(rms))
if not act: continue
# add links, [don't remove unwanted redirects yet]
for lc in iwikis:
fpage = wikipedia.Page(site[lc], title)
linksites[site[lc]] = fpage
for lc in rms:
del linksites[site[lc]]
try:
newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
except ValueError:
# throws this trying to "add to self", just effing continue
print " ... replace error in", repr(page.aslink())
continue
newtext = newtext.replace('\r\n', '\n') # wikipedia brain-damage
if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything
# wikipedia.showDiff(text, newtext)
else: continue
# some change, write it
if act:
fixed += 1
naptime /= 2
print " ... updating %s: %s" % (safe(title), safe(act).strip("'"))
# try to fix the entry
try:
utext = getedit(page)
# utext = page.get()
if utext != oldtext:
print "page changed during attempted update"
continue
wikipedia.setAction(act)
page.put(newtext)
# no cache update [and "links" not set up]
# iwadd(title, links.keys())
except wikipedia.EditConflict:
print "Edit conflict?"
continue
except wikipedia.PageNotSaved:
print "failed to save page"
# other action?
continue
except wikipedia.NoPage:
print "Can't get %s from en.wikt?" % safe(page.aslink())
continue
except wikipedia.IsRedirectPage:
print "Redirect page now?"
continue
except socket.timeout:
print "socket timeout, maybe not saving page"
continue
except socket.error:
print "socket error, maybe not saving page"
continue
except KeyError:
# annoying local error, from crappy framework code
print "KeyError"
time.sleep(200)
continue
# limit number of fixes for testing
# if fixed > 7: break
# pace [not used in the same way, reconsider]
if naptime > maxnap: naptime = maxnap
"""
if naptime > 4:
print "sleeping %d seconds" % naptime
time.sleep(naptime)
"""
continue
print "%d entries, %d possible, %d updated" % (entries, probs, fixed)
# done
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()