#!/usr/bin/env python
#coding: utf-8
# Copyright CodeCat 2010 - 2013
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import wikipedia, re, string, sys
class GenericFormBot:
    """A generic class for Wiktionary form bots.

    This class is an abstract base class, and isn't meant to be instantiated
    directly. To use it, derive a new class from it, override the
    generateEntries method with a proper definition, and provide a call to
    the base class constructor.

    Once you're ready to let it run, just call run() and it's all sorted.

    The purpose of this script is to provide automated generation of
    Wiktionary entries for inflected forms. It does this by fetching a
    Wiktionary page, then checks for the existence of certain templates on
    that page. If found, it extracts the necessary information from the
    template parameters, and passes it on to the generateEntries method,
    which generates the forms (just as the templates themselves do). The
    results are then uploaded as new entries.

    It will either create a new page or append a new section to the existing
    page. It will skip the page if it already contains a section of the same
    type as the one being created, unless the bot was created with
    force = True.

    If the page already exists, it will add {{rfc-auto}} to it,
    so that the AutoFormat bot can automatically place the section in the
    proper place on the page.
    """

    def __init__(self, head, templates, langCode, langName,
                 cleanupCat = None, simulation = False, force = False, verbose = False):
        """Store the bot configuration; no network access happens here.

        head       -- title of the lemma page that run() fetches
        templates  -- names of the form templates to look for on that page
        langCode   -- language code (stored only; unused by this base class)
        langName   -- language name, used for the level 2 header and messages
        cleanupCat -- link target appended as [[...]] after a forced append,
                      or None to append nothing
        simulation -- if true, report what would be saved without saving
        force      -- if true, append even when a conflicting section exists
        verbose    -- if true, print the text of every saved entry
        """
        self._head = head
        self._templates = templates
        self._langCode = langCode
        self._langName = langName
        self._cleanupCat = cleanupCat
        self._simulation = simulation
        self._force = force
        self._verbose = verbose

    def run(self):
        """Fetch a wiktionary entry and create entries from information in all form template occurrences."""
        page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), self._head)

        if not page.exists():
            wikipedia.output(u"Can't find page [[{0}]].".format(self._head))
            return

        # Find all occurrences of form templates
        templates = getTemplates(page.get(), self._templates)

        if not templates:
            wikipedia.output(u"No form template on page [[{0}]].".format(self._head))
            return

        for temp in templates:
            wikipedia.output(u"Found: {0}".format(temp))
            name, params = parseTemplate(temp)
            self.makeEntries(name, params)

    def makeEntries(self, template, params):
        """Create entries from information in one form template.

        template -- name of the template that was found
        params   -- its parameters, as returned by parseTemplate
        """
        entries = self.generateEntries(template, params)

        if not entries:
            return

        # Never write a form entry onto the lemma page itself.
        # NOTE(review): the deleted key is a reconstruction (self._head);
        # confirm against the upstream script.
        entries.pop(self._head, None)

        result = False

        for title, entry in entries.items():
            # Always call saveEntry; only the "anything changed" flag is folded.
            if self.saveEntry(title, entry):
                result = True

        if not result:
            wikipedia.output(u"Note: Did not add any new entries from page [[{0}]].".format(self._head))

    def zipEntries(self, entries, header):
        """Return with each entry zipped together into one string.

        entries -- dictionary mapping a form to a list of definition lines
        header  -- text placed before the definitions of every form

        Returns a dictionary with the same keys, whose values are the header
        followed by the definition lines as a numbered ('# ') wikitext list.
        """
        return dict(
            (form, header + '# ' + '\n# '.join(lines))
            for form, lines in entries.items())

    def generateEntries(self, template, params):
        """Override this in a derived class.

        Must return a dictionary mapping page titles to entry text; each
        entry text is expected to start with a level 3 header. The base
        implementation returns None, which makes makeEntries do nothing.
        """
        pass

    def saveEntry(self, title, entry):
        """Save a new entry to Wiktionary.

        title -- title of the page to create or append to
        entry -- entry text, without the level 2 language header

        Returns True if the page was saved (or would have been, in simulation
        mode), False if it was skipped. Raises ValueError if a header check
        is needed and entry does not start with a level 3 header.
        """
        page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), title)
        # Unicode literal: a byte-string format would fail on non-ASCII
        # language names under Python 2.
        newContents = u'=={0}==\n'.format(self._langName) + entry

        if page.exists():
            oldContents = page.get()

            if entry in oldContents:
                wikipedia.output(u"Skipped page [[{0}]]. Already contains the new entry.".format(title))
                return False

            suffix = self._appendSuffix(title, entry, oldContents)

            if suffix is None:
                return False

            newContents = u'\n\n----\n' + newContents + suffix

            if self._simulation:
                wikipedia.output(u"Simulated update to page [[{0}]].".format(title))
            else:
                page.put(oldContents + newContents, comment = u'Auto-generated {0} verb forms - appended'.format(self._langName), minorEdit = False)
        else:
            if self._simulation:
                wikipedia.output(u"Simulated creating page [[{0}]].".format(title))
            else:
                page.put(newContents, comment = u'Auto-generated {0} verb forms'.format(self._langName), minorEdit = True)

        if self._verbose:
            wikipedia.output(u"Page [[{0}]] new contents:\n".format(title) + '-' * 60, toStdout = True)
            wikipedia.output(newContents, toStdout = True)
            wikipedia.output('*' * 60, toStdout = True)

        return True

    def _appendSuffix(self, title, entry, oldContents):
        """Return the text to add after a section appended to an existing page, or None to skip the page."""
        rfcAuto = u'\n{{rfc-auto}}'
        langSections = getSections(oldContents, self._langName, 2)

        if not langSections:
            return rfcAuto

        # There is more than one section for this language already.
        # The bot probably was here before!
        if len(langSections) > 1:
            return self._forceOrSkip(title, u"More than one {0} section on page.".format(self._langName))

        # There is exactly one lang section on the page
        start, end = langSections[0]
        langContents = oldContents[start:end]

        # Does the lang section have numbered etymologies?
        if re.search(u'=== *Etymology \\d+ *===', langContents, re.UNICODE):
            return self._forceOrSkip(title, u"{0} section has numbered etymology sections.".format(self._langName))

        # Accept multi-word headers such as "Proper noun" as well.
        posMatch = re.match(u'=== *([^=\\n]+?) *===', entry, re.UNICODE)

        if posMatch is None:
            raise ValueError(u"Entry for [[{0}]] does not start with a level 3 header.".format(title))

        pos = posMatch.group(1)
        # NOTE(review): initial list content is a reconstruction ([pos]).
        posHeaders = [pos]

        # Special case... this happened to me once, so I might as well code it in
        if pos == u'Verb':
            posHeaders.append(u'Participle')

        posRegex = u'=== *(?:{0}) *==='.format(u'|'.join(re.escape(h) for h in posHeaders))

        # Does the lang section have a section of this type already in it?
        if re.search(posRegex, langContents, re.UNICODE):
            return self._forceOrSkip(title, u"Already has {0} {1} section.".format(self._langName, pos))

        return rfcAuto

    def _forceOrSkip(self, title, reason):
        """Report a conflict on a page; return the cleanup suffix when forcing, or None when skipping."""
        if not self._force:
            wikipedia.output(u"Skipped page [[{0}]]. {1}".format(title, reason))
            return None

        wikipedia.output(u"WARNING: Forced append to [[{0}]]. {1}".format(title, reason))

        if self._cleanupCat:
            return u'\n[[' + self._cleanupCat + u']]'

        return u''
def getTemplates(contents, names):
    """Get all template calls to a specific set of templates from a page.

    contents -- wikitext of the page
    names    -- template names to look for; they are joined into the pattern
                as-is, so regex syntax in a name is interpreted, and a name
                also matches templates that merely start with it

    Returns a list with the inside of every matching {{...}} call (name and
    parameters, braces and surrounding whitespace removed), in page order.
    A call ends at the first '}}', so nested templates cut it short.
    """
    # u'' with doubled backslashes instead of ur'' so the module parses on
    # Python 3 as well as Python 2.
    pattern = u'\\{\\{\\s*((?:' + u'|'.join(names) + u').*?)\\s*\\}\\}'
    return [found.group(1)
            for found in re.finditer(pattern, contents, re.UNICODE | re.DOTALL)]
def parseTemplate(template):
    """Parse and convert parameters of a template into dictionaries.

    template -- inside of a template call: name and parameters separated by
                '|', without the surrounding braces

    Returns (templateName, params). params maps parameter names to their
    stripped values; names that are whole numbers become int keys, and
    unnamed parameters are numbered from 1. Parameters with an empty value
    are left out, but an empty unnamed parameter still uses up its number.
    HTML comments are removed first. The text is split on every '|', so
    pipes inside nested links or templates are not supported.
    """
    template = re.sub(u'<!--.*?-->', u'', template, flags = re.UNICODE | re.DOTALL)
    fields = template.split(u'|')
    # Strip so that multi-line calls ("name\n|a=b") give a clean name.
    templateName = fields[0].strip()

    params = {}
    paramIndex = 1

    for field in fields[1:]:
        paramName, separator, paramValue = field.partition(u'=')

        if separator:
            # Named parameter
            paramName = paramName.strip()

            # Is the name a number?
            try:
                paramName = int(paramName)
            except ValueError:
                pass

            paramValue = paramValue.strip()

            if paramValue:
                params[paramName] = paramValue
        else:
            # Unnamed (positional) parameter
            paramValue = field.strip()

            if paramValue:
                params[paramIndex] = paramValue

            paramIndex += 1

    return templateName, params
def makeTemplate(name, params):
    """Expand a template, given its name and parameters.

    name   -- template name
    params -- dictionary of parameter names (strings or numbers) to values

    Returns the wikitext of the call, '{{name|key=value|...}}', with every
    parameter written in named form, in the dictionary's iteration order.
    """
    # Unicode formatting instead of str(key): str() raises on non-ASCII
    # unicode keys under Python 2, and this also accepts non-string values.
    pieces = [u"{{", name]
    pieces.extend(u"|{0}={1}".format(key, val) for key, val in params.items())
    pieces.append(u"}}")
    return u"".join(pieces)
def getSections(contents, name, level, inclHeader = True):
    """Get the start and end index of every section of a given name.

    contents   -- wikitext to search
    name       -- header text of the section; inserted into the pattern
                  as-is, so regex syntax in it is interpreted
    level      -- header level, i.e. the number of '=' on each side
    inclHeader -- if true, each span starts at the header line, otherwise at
                  the first character after the header

    Returns a list of (start, end) index pairs into contents, one for each
    matching section, or an empty list if there is none. A section ends just
    before the next header of the same level, or at the end of the text.
    """
    # The end of a section is a lookahead, so the next header is not
    # consumed; a directly following section of the same name is then found
    # as well. u'' with doubled backslashes keeps the module parseable on
    # Python 3.
    sectionRegex = u'({0} *{1} *{0}\\s*)(.*?)(?=\\n{0} *[^\\n=]+ *{0}|$)'.format(u'=' * level, name)
    startGroup = 1 if inclHeader else 2

    return [(match.start(startGroup), match.end(2))
            for match in re.finditer(sectionRegex, contents, re.DOTALL | re.UNICODE)]