# tranInter_doubleTracker.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot goes over multiple pages of the home wiki, and edits them without
changing. This is for example used to get category links in templates
working.
Don't forget to set the ftout to your current list of words,
see below for a line that looks like:
ftout =open('/home/cmillet/wikitruks/wiktio/all/2005-12-14.txt', 'r')
This script understands various command-line arguments:
-start: used as -start:page_name, specifies that the robot should
go alphabetically through all pages on the home wiki,
starting at the named page.
-file: used as -file:file_name, read a list of pages to treat
from the named textfile. Page titles should be enclosed
in [[double brackets]].
-ref: used as -ref:page_name, specifies that the robot should
touch all pages referring to the named page.
-cat: used as -cat:category_name, specifies that the robot should
touch all pages in the named category.
All other parameters will be regarded as a page title; in this case, the bot
will only touch a single page.
"""
import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re
import datetime
# Marker of a translation section on fr.wiktionary.
tradMsg = "{{-trad-}}"
# Matches HTML comments <!-- ... -->, possibly spanning several lines.
commentCompiler = re.compile(u"\<\!\-\-(.*?)\-\-\>", re.DOTALL | re.MULTILINE)
# One translation line: (optionally one ":") one "*", spaces, one
# {{language code}}, then the links, up to and including the newline.
# group(1) = whole line, group(2) = language code, group(3) = the links part.
translntLineCompiler = re.compile("(^:?\* *\{\{(\w*?)\}\}(.*?\n))", re.MULTILINE)
# A bare wiki link [[word]]; the capture is the linked word.
# NOTE(review): pattern reconstructed — the original was garbled in transit,
# but the [[%s]] replacement strings later in the file imply this form.
oldLinkCompiler = re.compile("\[\[(.*?)\]\]")
# Separator (optional comma surrounded by spaces) at the end of a fragment...
beforeTranslntCompiler = re.compile("(\ *,?\ *$)")
# ...and at the beginning of a fragment.
afterTranslntCompiler = re.compile("(^\ *,?\ *)")
# listelng = [...]      # commented out in the original
# nowiktiolng = [...]   # "dog" removed: no wiktionary in that language
# Wiktionaries that make the distinction between this and This:
# (the list http://meta.wikimedia.org/wiki/Help:Page_name is not really up to date)
# Wiktionaries checked that still capitalize their entries: ln -- pt
# TODO(review): the content of this list was lost in transit; restore the
# language codes before relying on it.
nocaplng = []
# ftout MUST point at the current word-list extract: one "lang:word" line
# per known entry (see the module docstring).
# wordList maps each language code to the list of entry titles known to
# exist on that language's wiktionary; each stored title keeps its trailing
# newline, which is why lookups later append '\n' before testing membership.
wordList = {}
date = datetime.date.today()
filename = str(date.year) + '-' + str(date.month) + '-' + str(date.day) + '.txt'
ftout = open('./' + filename, 'r')
try:
    for line in ftout:
        # Split only on the first ':' so titles containing ':' survive.
        # NOTE(review): a line without any ':' raises ValueError, as in the
        # original; the extract is assumed to be well-formed.
        language, translation = line.split(":", 1)
        if language not in wordList:
            wordList[language] = []
        wordList[language].append(translation)
finally:
    # Close the handle even if a malformed line aborts the loop.
    ftout.close()
# si c'est {{-... ou [[... alors on entre dans une autre section, ou fin de liste
# if re.compile("^(\{\{-|\[\[)",re.M).match(newtext,curIdx):
class TranslationBot:
    """Normalize the translation templates of the pages fed by `generator`.

    For every {{-trad-}} section of a page the bot:
      * switches {{trad}} <-> {{trad-}} according to whether the target
        wiktionary actually has the entry (looked up in the global wordList),
      * converts bare [[links]] to {{trad}}/{{trad-}} templates,
      * shortens {{trad|xx|word}} to {{trad|xx}} when word == page title,
      * removes duplicated translations on a line.

    NOTE(review): reconstructed from a source whose brackets and indentation
    were stripped in transit; the duplicate-removal edge cases should be
    confirmed against an original copy.
    """

    def __init__(self, generator, acceptall=False):
        # generator yields wikipedia.Page objects to process;
        # acceptall=True saves every change without asking.
        self.generator = generator
        self.acceptall = acceptall

    def _isOnWiktionary(self, lang, word):
        """Return True when `word` is a known entry of lang's wiktionary.

        wordList stores the raw lines of the dump extract (utf-8 titles
        with their trailing newline), hence the '\\n' and the encoding.
        NOTE(review): the encode() comparison assumes Python 2 byte
        strings in wordList — confirm before running under Python 3.
        """
        tosearch = (u'%s\n' % word).encode('utf-8')
        return lang in wordList and tosearch in wordList[lang]

    def _removeDuplicates(self, line, lang, template, word=None):
        """Drop repeated {{template|lang...}} occurrences from one line.

        template is 'trad' or 'trad-'.  With word=None the two-argument
        form {{template|lang|word}} is de-duplicated on its word; with a
        word given (the page title), the short form {{template|lang}} is
        handled and every occurrence stands for that word.
        Returns (cleaned line, True-if-a-duplicate-was-removed).
        """
        if word is None:
            splitPattern = '(\{\{%s\|%s\|.*?\}\})' % (template, lang)
            contentPattern = u'\{\{%s\|%s\|(.*?)\}\}' % (template, lang)
        else:
            splitPattern = '(\{\{%s\|%s\}\})' % (template, lang)
            contentPattern = None
        # re.split keeps the captured templates at the odd indices;
        # isTranslation[i] tells whether pieces[i] was a matched template.
        pieces = re.split(splitPattern, line)
        isTranslation = []
        for i in range(len(pieces)):
            if re.match(splitPattern, pieces[i]):
                isTranslation.append(True)
                # Keep only the translated word itself for comparison.
                if word is None:
                    pieces[i] = re.findall(contentPattern, pieces[i])[0]
                else:
                    pieces[i] = word
            else:
                isTranslation.append(False)
        seen = {}
        foundDouble = False
        i = 0
        while i != len(pieces):
            if isTranslation[i]:
                if pieces[i] not in seen:
                    seen[pieces[i]] = i
                    i += 1
                else:
                    # Duplicate: drop it and tidy the separators around it.
                    # (The original also appended to the metadata list here,
                    # which would desynchronize the parallel lists — dropped.)
                    foundDouble = True
                    wikipedia.output(u'DOUBLON (%s)' % pieces[i])
                    pieces.pop(i)
                    isTranslation.pop(i)
                    if i != len(pieces) - 1 and not isTranslation[i]:
                        # Strip the comma that followed the removed template.
                        pieces[i] = afterTranslntCompiler.sub("", pieces[i])
                        if i != 0 and not isTranslation[i - 1]:
                            # Merge the two surrounding text fragments.
                            pieces[i - 1] += pieces[i]
                            pieces.pop(i)
                            isTranslation.pop(i)
                    elif i != 0 and not isTranslation[i - 1]:
                        # Removed the last template: strip the trailing comma.
                        pieces[i - 1] = beforeTranslntCompiler.sub("", pieces[i - 1])
            else:
                i += 1
        # Rebuild the line from the surviving fragments.
        rebuilt = ""
        for i in range(len(pieces)):
            if isTranslation[i]:
                if word is None:
                    rebuilt += u'{{%s|%s|%s}}' % (template, lang, pieces[i])
                else:
                    rebuilt += u'{{%s|%s}}' % (template, lang)
            else:
                rebuilt += pieces[i]
        return rebuilt, foundDouble

    def run(self):
        """Process every generated page and upload the fixed text."""
        for page in self.generator:
            try:
                hasShortcut = False
                hasInterwikification = False
                hasDouble = False
                wikipedia.output('page: %s' % page.title())
                thePage = page.get()
                # theChangedPage keeps the HTML comments; newText works on a
                # comment-stripped copy so comments never confuse the regexes.
                theChangedPage = thePage
                oldText = commentCompiler.sub(u"", thePage)
                newText = oldText
                curIdx = newText.find(tradMsg, 0)
                # Loop over every {{-trad-}} section of the page.
                while curIdx != -1:
                    curIdx += len(tradMsg)
                    result = translntLineCompiler.search(oldText, curIdx)
                    # Loop over every "* {{xx}} ..." translation line; a
                    # {{cf}} pseudo-language code ends the section.
                    while result and result.group(2) != "cf":
                        completeLine = result.group(1)
                        lang = result.group(2)
                        analyzedPart = result.group(3)
                        newLine = completeLine
                        title = page.title()
                        # -- {{trad|xx}}: demote when the entry is missing --
                        for translt in re.findall(u'\{\{trad\|%s\}\}' % lang, analyzedPart):
                            wikipedia.output(u'recherche de "%s:%s"' % (lang, title))
                            if not self._isOnWiktionary(lang, title):
                                hasInterwikification = True
                                wikipedia.output(u'DEWIKIFICATION')
                                newLine = newLine.replace(u'{{trad|%s}}' % lang,
                                                          u'{{trad-|%s}}' % lang)
                        # -- {{trad-|xx}}: promote when the entry exists --
                        for translt in re.findall(u'\{\{trad-\|%s\}\}' % lang, analyzedPart):
                            wikipedia.output(u'recherche de "%s:%s"' % (lang, title))
                            if self._isOnWiktionary(lang, title):
                                hasInterwikification = True
                                # BUG FIX: this branch adds a link, so it is an
                                # interwikification; it was wrongly reported as
                                # DEWIKIFICATION before.
                                wikipedia.output(u'INTERWIKIFICATION')
                                newLine = newLine.replace(u'{{trad-|%s}}' % lang,
                                                          u'{{trad|%s}}' % lang)
                        # -- {{trad|xx|word}}: demote or shorten --
                        for translt in re.findall(u'\{\{trad\|%s\|(.*?)\}\}' % lang, analyzedPart):
                            # We are unable to process anchored words (word#section).
                            if '#' in translt:
                                continue
                            wikipedia.output(u'recherche de "%s:%s"' % (lang, translt))
                            old = u'{{trad|%s|%s}}' % (lang, translt)
                            if not self._isOnWiktionary(lang, translt):
                                hasInterwikification = True
                                wikipedia.output(u'DEWIKIFICATION')
                                if translt == title:
                                    new = u'{{trad-|%s}}' % lang
                                else:
                                    new = u'{{trad-|%s|%s}}' % (lang, translt)
                                newLine = newLine.replace(old, new)
                            elif translt == title:
                                # Same word as the page: use the short form.
                                hasShortcut = True
                                wikipedia.output(u'RACCOURCIS')
                                newLine = newLine.replace(old, u'{{trad|%s}}' % lang)
                        # -- {{trad-|xx|word}}: promote or shorten --
                        for translt in re.findall(u'\{\{trad-\|%s\|(.*?)\}\}' % lang, analyzedPart):
                            if '#' in translt:
                                continue
                            wikipedia.output(u'recherche de "%s:%s"' % (lang, translt))
                            old = u'{{trad-|%s|%s}}' % (lang, translt)
                            if self._isOnWiktionary(lang, translt):
                                hasInterwikification = True
                                wikipedia.output(u'INTERWIKIFICATION')
                                if translt == title:
                                    new = u'{{trad|%s}}' % lang
                                else:
                                    new = u'{{trad|%s|%s}}' % (lang, translt)
                                newLine = newLine.replace(old, new)
                            elif translt == title:
                                hasShortcut = True
                                wikipedia.output(u'RACCOURCIS')
                                newLine = newLine.replace(old, u'{{trad-|%s}}' % lang)
                        # -- bare [[links]]: convert to templates --
                        for translt in oldLinkCompiler.findall(analyzedPart):
                            # Any bare link is rewritten, so this always
                            # counts as an interwikification (as originally).
                            hasInterwikification = True
                            wikipedia.output(u'recherche de "%s:%s"' % (lang, translt))
                            old = u'[[%s]]' % translt
                            if '#' not in translt and self._isOnWiktionary(lang, translt):
                                wikipedia.output(u'INTERWIKIFICATION')
                                if translt == title:
                                    new = u'{{trad|%s}}' % lang
                                else:
                                    new = u'{{trad|%s|%s}}' % (lang, translt)
                            else:
                                wikipedia.output(u'REDEWIKIFICATION')
                                if translt == title:
                                    new = u'{{trad-|%s}}' % lang
                                else:
                                    new = u'{{trad-|%s|%s}}' % (lang, translt)
                            newLine = newLine.replace(old, new)
                        # -- remove duplicated translations, all four forms --
                        for template, word in (('trad', None), ('trad-', None),
                                               ('trad', title), ('trad-', title)):
                            newLine, dup = self._removeDuplicates(newLine, lang,
                                                                  template, word)
                            hasDouble = hasDouble or dup
                        # Splice the fixed line back into both copies.
                        newText = newText.replace(completeLine, newLine)
                        theChangedPage = theChangedPage.replace(completeLine, newLine)
                        curIdx = result.end(3)
                        result = translntLineCompiler.search(oldText, curIdx)
                    # End of the translation section: look for the next one.
                    curIdx = newText.find(tradMsg, curIdx)
                # We upload the text.
                if newText == oldText or not (hasInterwikification or hasDouble or hasShortcut):
                    wikipedia.output('No changes were necessary in %s' % page.title())
                else:
                    if hasInterwikification and hasDouble:
                        wikipedia.output(u'interwikification et dédoublonnage')
                        wikipedia.setAction(u'interwikification et dédoublonnage des traductions (modèle trad)')
                    elif hasInterwikification:
                        wikipedia.output(u'interwikification')
                        wikipedia.setAction(u'interwikification des traductions (modèle trad)')
                    elif hasShortcut:
                        wikipedia.output(u'raccourcissement')
                        wikipedia.setAction(u'raccourcissement des traductions (modèle trad)')
                    else:
                        wikipedia.output(u'dédoublonnage')
                        wikipedia.setAction(u'dédoublonnage des traductions (modèle trad)')
                    wikipedia.output(u'>>> %s <<<' % page.title())
                    wikipedia.showDiff(thePage, theChangedPage)
                    # Predefine choice so the test below is always safe.
                    choice = ''
                    if not self.acceptall:
                        choice = wikipedia.inputChoice(
                            u'Do you want to accept these changes?',
                            ['Yes', 'No', 'Yes to all'], ['y', 'N', 'a'], 'N')
                        if choice in ['a', 'A']:
                            self.acceptall = True
                    if self.acceptall or choice in ['y', 'Y']:
                        wikipedia.output(u'put')
                        page.put(theChangedPage)
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s does not exist?!?!' % page.aslink())
            except wikipedia.IsRedirectPage:
                # Redirects carry no translation section of their own.
                pass
            except wikipedia.LockedPage:
                # We cannot save a protected page; skip it.
                pass
def main():
    """Build a page generator from the command line and run the bot.

    Recognized options: -start:, -ref:, -links:, -file:, -cat:; any other
    argument is collected as (part of) a single page title.
    NOTE(review): the option-value slices (arg[len(...):]) were lost in the
    transmitted source and have been restored — passing the whole
    "-opt:value" string to the generators cannot have been intended.
    """
    gen = None
    pageTitle = []
    for arg in wikipedia.handleArgs():
        if arg:
            if arg.startswith('-start:'):
                gen = pagegenerators.AllpagesPageGenerator(arg[len('-start:'):])
            elif arg.startswith('-ref:'):
                referredPage = wikipedia.Page(wikipedia.getSite(), arg[len('-ref:'):])
                gen = pagegenerators.ReferringPageGenerator(referredPage)
            elif arg.startswith('-links:'):
                linkingPage = wikipedia.Page(wikipedia.getSite(), arg[len('-links:'):])
                gen = pagegenerators.LinkedPageGenerator(linkingPage)
            elif arg.startswith('-file:'):
                gen = pagegenerators.TextfilePageGenerator(arg[len('-file:'):])
            elif arg.startswith('-cat:'):
                cat = catlib.Category(wikipedia.getSite(), arg[len('-cat:'):])
                gen = pagegenerators.CategorizedPageGenerator(cat)
            else:
                pageTitle.append(arg)
    if pageTitle:
        # Remaining words form one page title; wrap it as a one-page generator.
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp('touch')
    else:
        # Preload page texts in batches to cut down on API round trips.
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = TranslationBot(preloadingGen)
        bot.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release the bot's site lock, even after an exception.
        wikipedia.stopme()