#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
fr-conj-ger.py

This bot goes over pages of the home wiki whose title is the conjugation
appendix prefix (see `heading`) followed by a verb ending in "ger" (see
`ending`).  For each such page it reads every {{fr-conj-1}} template and, when
the template parameters are not the ones expected for a "-ger" verb, writes
the verb to the output file opened at the bottom of this script
(./articlesToChange-<hour>-<min>-<sec>.txt, one verb per line).

Pages are only read; nothing is saved back to the wiki.

This script understands various command-line arguments:

    -start:        used as -start:page_name, specifies that the robot should
                   go alphabetically through all pages on the home wiki,
                   starting at the named page.

    -file:         used as -file:file_name, read a list of pages to treat
                   from the named textfile. Page titles should be enclosed
                   in [[double square brackets]].

    -ref:          used as -ref:page_name, specifies that the robot should
                   treat all pages referring to the named page.

    -links:        used as -links:page_name, specifies that the robot should
                   treat all pages linked from the named page.

    -cat:          the category names are read from ./cat.txt (the value
                   given after -cat: is not used).

All other parameters will be regarded as a page title; in this case, the bot
will only treat a single page.
"""

import wikipedia
import wiktionary
import pagegenerators
import catlib
import sys
import re
import time

# NOTE(review): this copy of the script had lost every "[...]" span
# (subscripts, slices and list literals).  Subscripts and slices were restored
# from context; the places where a guess was needed are marked "restored".

heading = u'Annexe:Conjugaison française:'
ending = u'ger'
templateName = u'fr-conj-1'

# NOTE(review): the contents of the character tables below were lost and could
# not be reconstructed; restore them from the upstream script before relying
# on them.  Nothing in this file reads them.
alphanum = []
aList = []
cList = []
eList = []
iList = []
nList = []
oList = []
uList = []
quoteList = []
dotList = [
    # leading entries missing (see note above); only this tail survived
    u'=', u'°', u'+', u'=', u'}', u'£', u'$', u'¤', u'%', u'µ', u'*', u'?',
    u',', u';', u':', u'§', u'!', u'<', u'>']

# <!-- ... --> comments, possibly spanning several lines.
commentCompiler = re.compile(r"\<\!\-\-(.*?)\-\-\>", re.DOTALL | re.MULTILINE)
# A whole {{fr-conj-1|...}} call that has at least one "|".
templateCompiler = re.compile(
    r'\{\{' + templateName + r' *\r?\n?\| *.*?\}\}', re.DOTALL)
# Same as templateCompiler, but group(1) is everything after the first "|".
flexionCompiler = re.compile(
    r'\{\{' + templateName + r' *\r?\n?\| *(.*?)\}\}', re.DOTALL)
# Any {{fr-conj-1...}} call, with or without parameters.
flexionParser = re.compile(r'\{\{' + templateName + r'.*?\}\}', re.DOTALL)
splitCompiler = re.compile(r'\|')
# "name = value" with optional spaces around "=".
parameterCompiler = re.compile(r'^(.*?) *\= *(.*?)$')
spacesCompiler = re.compile(r'\s+')


def _verbFromTitle(title):
    """Return the verb in `title`, or None when the title is out of scope.

    The title must be `heading` followed by a word that ends with `ending`;
    the returned verb is the title without the `heading` prefix.
    """
    if not title.startswith(heading):
        return None
    word = title[len(heading):]
    if not word.endswith(ending):
        return None
    return word


def _parseParameters(templateBody):
    """Return the named parameters of a template call as a dict.

    `templateBody` is the text after the first "|" of the call.  The first
    field is dropped, as is the field after it when that one is blank; every
    remaining "name = value" field is stored (a repeated name keeps its last
    value).  Fields without "=" are ignored.
    """
    fields = splitCompiler.split(templateBody)
    fields.pop(0)  # restored: the first field is not a "name = value" pair
    if fields and fields[0].strip(u'\r\n ') == u'':
        fields.pop(0)
    parameters = {}
    for field in fields:
        match = parameterCompiler.search(field.strip(u'\r\n '))
        if match:
            # restored: the parameter name is the dict key
            parameters[match.group(1)] = match.group(2)
    return parameters


def _hasBadParameters(parameters):
    """Tell whether `parameters` differ from what a "-ger" verb should use.

    Expected: c1 = g and c2 = ge, pc (when given) = ʒ, and none of the
    ill / j / e parameters.
    """
    # restored: c1 is compared with u'g' and c2 with u'ge'
    if parameters.get(u'c1') != u'g' or parameters.get(u'c2') != u'ge':
        return True
    if u'pc' in parameters and parameters[u'pc'] != u'ʒ':
        return True
    return u'ill' in parameters or u'j' in parameters or u'e' in parameters


class KeyBot:
    """Lists "-ger" conjugation pages whose template parameters look wrong."""

    def __init__(self, generator, acceptall = False):
        # generator: iterable of page objects to inspect.
        # acceptall: stored but not read by run().
        self.generator = generator
        self.acceptall = acceptall

    def run(self):
        """Check every page of the generator and log the suspicious verbs.

        Writes to the module-level `outputFile`, which is opened by the
        script entry point at the bottom of this file.
        """
        for page in self.generator:
            word = _verbFromTitle(page.title())
            if word is None:
                continue
            wikipedia.output(u'page: %s' % page.title())
            try:
                thePage = page.get()
            except wikipedia.NoPage:
                wikipedia.output(
                    u'Page %s does not exist?!?!' % page.aslink())
                continue
            except (wikipedia.IsRedirectPage, wikipedia.LockedPage):
                continue

            # Templates inside <!-- --> comments must not be checked.
            text = commentCompiler.sub(u'', thePage)
            hasBadParameters = any(
                _hasBadParameters(_parseParameters(flexion.group(1)))
                for flexion in flexionCompiler.finditer(text))

            if hasBadParameters:
                wikipedia.output(
                    u'################### %s ####################' % word)
                outputFile.write(word.encode('utf-8'))
                outputFile.write("\r\n")


def main():
    """Build the page generator from the command line and run the bot."""
    gen = None
    pageTitle = []
    site = wikipedia.getSite
    for arg in wikipedia.handleArgs():
        if not arg:
            continue
        # restored: each option value is the text after the "-option:" prefix
        if arg.startswith('-start:'):
            gen = pagegenerators.AllpagesPageGenerator(arg[len('-start:'):])
        elif arg.startswith('-ref:'):
            referredPage = wikipedia.Page(site(), arg[len('-ref:'):])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links:'):
            linkingPage = wikipedia.Page(site(), arg[len('-links:'):])
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[len('-file:'):])
        elif arg.startswith('-cat:'):
            catGen = pagegenerators.TextfilePageGenerator('./cat.txt')
            catPreloadingGen = pagegenerators.PreloadingGenerator(catGen)
            # NOTE(review): each category overwrites `gen`, so only the last
            # one listed in ./cat.txt is processed -- confirm this is wanted.
            for catPage in catPreloadingGen:
                cat = catlib.Category(site(), catPage.title())
                gen = pagegenerators.CategorizedPageGenerator(cat)
        else:
            pageTitle.append(arg)

    if pageTitle:
        page = wikipedia.Page(site(), ' '.join(pageTitle))
        gen = iter([page])  # restored: a single-page generator
    if not gen:
        # NOTE(review): shows the help of the 'touch' module, not this one.
        wikipedia.showHelp('touch')
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = KeyBot(preloadingGen)
        bot.run()


if __name__ == "__main__":
    now = time.localtime()
    filename = './articlesToChange-%d-%d-%d.txt' % (
        now.tm_hour, now.tm_min, now.tm_sec)
    outputFile = open(filename, 'w')
    try:
        main()
    finally:
        wikipedia.stopme()
        outputFile.close()