Note: to copy this code, edit this page and copy it from the edit window. Otherwise the HTML entities (&quot; and &amp;) will be rendered as plain characters instead of being copied as entities.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# modded RLU for iwikt: use MW API, remove a lot of cruft, add maxlag and some more reliability

import re, codecs, sys
import urllib
import time
import wikipedia

# redirects may be None, True, or False (all different ;-): None is all pages,
# True is just redirects, False is just non-redirects.

reapt = re.compile('title ?="(.*?)"')
relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')
reapfrom = re.compile(r' apfrom="(.*?)"')

def allpages(site = wikipedia.getSite(), start = '!', namespace = '0', redirects = None):

    while True:
        # encode non-ASCII characters in hexadecimal format (e.g. %F6)
        start = start.encode(site.encoding())
        start = urllib.quote(start)
        # note the ap- prefix on the namespace parameter: list=allpages only
        # recognizes apnamespace (a bare namespace= is silently ignored)
        path = "/w/api.php?action=query&list=allpages&apfrom=" + start + \
               "&aplimit=480&format=xml&maxlag=2&apnamespace=" + namespace

        # redirects may be None, False, or True
        if redirects == None:
            pass
        elif redirects == True:
            path += "&apfilterredir=redirects"
        elif redirects == False:
            path += "&apfilterredir=nonredirects"

        print '(getting pages in %s from %s)' % (site.lang, start)

        # add retry logic, Robert Ullmann 25 Sept 07
        done = False
        nap = 5
        while not done:
            atext = site.getUrl(path)
            mo = relagged.search(atext)
            if mo:
                # server refused the query because replication lag exceeded maxlag
                print "(server lagged %s seconds)" % mo.group(1)
                time.sleep(20)
                continue
            if '</api>' in atext:
                done = True
            else:
                # truncated reply: back off 1.5x each retry, capped at 5 minutes
                print "allpages: incomplete reply, sleeping %d seconds" % nap
                time.sleep(nap)
                nap = min(nap + nap/2, 300)

        for title in reapt.findall(atext):
            # &quot; is an HTML entity in this field! Robert Ullmann, 20 January 2008
            # &amp; too! Robert Ullmann, 8 May 2008
            title = title.replace('&quot;', '"')
            title = title.replace('&amp;', '&')
            # others, but not sure we need this at all? Page fixes things.
            # in the main namespace, suppress namespace-like prefixes here,
            # or Page will gen the "wrong" title
            if namespace == '0' and ':' in title: continue
            yield wikipedia.Page(site, title)

        # find continuation:
        mo = reapfrom.search(atext)
        if mo:
            start = mo.group(1)
            start = start.replace('&quot;', '"')
            start = start.replace('&amp;', '&')
            continue
        else:
            break # we are done, will raise StopIteration

# define a class so we can instantiate the iter method:

class allpagegen:
    def __init__(self, start = '!', namespace = '0', site = wikipedia.getSite(), redirects = None):
        self.start = start
        self.site = site
        self.redir = redirects
        self.namespace = namespace
    def __iter__(self):
        for page in allpages(site = self.site, start = self.start,
                             namespace = self.namespace, redirects = self.redir):
            yield page

# simple unit test:

if __name__ == "__main__":

    print "testing allpages, 1000 redirects from Ka in en.wikt, print every 20:"

    s = wikipedia.getSite('en', 'wiktionary')
    kagen = allpagegen(site = s, redirects = True, start = 'Ka')
    i = 0
    for page in kagen:
        i += 1
        if i % 20 == 0: print repr(page.title())
        if i > 1000: break
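For comparison with the built-in unit test, here is a minimal sketch of how the generator might be driven from another pywikipedia script. The module name allpages, the starting point 'a', and the 200-page cap are illustrative assumptions, not part of the original.

# Minimal usage sketch (assumptions: same Python 2 pywikipedia framework;
# the "allpages" module name and the 200-page cap are illustrative only).
import wikipedia
from allpages import allpagegen   # hypothetical file name for the script above

site = wikipedia.getSite('en', 'wiktionary')

# walk non-redirect pages in the main namespace, starting from 'a'
gen = allpagegen(site = site, start = 'a', namespace = '0', redirects = False)

count = 0
for page in gen:
    print repr(page.title())
    count += 1
    if count >= 200: break   # arbitrary cap for the example

Because allpagegen only stores its arguments and defers to allpages() in __iter__, the same instance can be iterated more than once, each time restarting from the stored start point.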