This is a list of pages which are suspected to contain incorrectly formatted translations and should probably be run through xte.
There are 14351 items on the list (generated by checking each translation line against a known-good pattern), so I have split it into subpages:
/0, /1, /2, /3, /4, /5, /6, /7, /8, /9, /10, /11, /12, /13, /14
The script has also been run against several other dumps; the number of results for each is shown below:
2012-11-04: 25815
2013-08-25: 23993
2013-09-07: 23582
2013-09-19: 22278
2013-10-02: 21946
2013-10-17: 21837
2013-11-17: 21590
2013-12-02: 21478
2013-12-08..15: first Buttermilch run
2013-12-17: 15595
2014-04-15: 14494
2014-05-22: 14426
2014-06-09: 14398
2014-07-02: 14351
2014-07-28: xte was updated to process even items containing {{t}}
The list was generated from the 2013-08-25 dump of Wiktionary by the following script:
#!/usr/bin/python3
# released under the WTFPL
# usage: curl http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 | ./scan > needs_wuv.mw
# processing the whole dump takes a couple of minutes, but little memory.
import re
import sys
import bz2
import xml.sax
# start and end of a translation table, and the column break inside it
_re_trans_top = re.compile(r'\s*\{\{trans-top(?:-also)?(\||}})')
_re_ttbc_top = re.compile(r'\s*\{\{(checktrans|ttbc)-top(\||}})')
_re_trans_bot = re.compile(r'\s*\{\{(checktrans|trans|ttbc)-bottom(\||}})\s*')
_re_trans_mid = re.compile(r'\s*\{\{(checktrans|trans|ttbc)-mid(\||}})\s*')
# a language line: "* Finnish:" or "* [[Finnish]]:" (but not a nested "*:" line)
_re_trans_line = re.compile(r'\*(?![:*])\s*(?:\[\[)?([^:|]+?)(?:\]\])?\s*:\s*')
# a language line wrapped in {{ttbc}}
_re_ttbc_line = re.compile(r'\*:?\s*(\{\{ttbc\s*\|(.*?)}})\s*:\s*')
# a nested sub-language or script line, e.g. "*: Cantonese:"
_re_trans_sub = re.compile(r'\*[:*]\s*(?:\[\[)?([^:|]+?)(?:\]\])?\s*:\s*')
# a translation request line
_re_trans_req = re.compile(r'\*:?\s*\{\{trreq\|(.*?)}}\s*?(?=\s|$)')
# an empty line, or one holding nothing but an HTML comment
_re_dummy = re.compile(r'(?:<!--.*?-->)?$')
# hieroglyphics come in <hiero> blocks and are left alone
_re_hiero = re.compile(r'\*:?\s*<hiero>.*?</hiero>')
# separator between items on a translation line
_re_split = re.compile(r'\s*[,;]\s*')
# a well-formed item: one t/t+/t-/tø/t-check/t+check template, optionally
# surrounded by {{qualifier}} or {{i}}, or a {{t-needed}} request
_re_item = re.compile(r'((\{\{(qualifier|i)\|[^}]*?}}\s*)?\{\{(t[-+ø]?|t[-+]check)\|[^}]*?}}(\s*\{\{(qualifier|i)\|[^}]*?}})?|\{\{t-needed\|[^}]+}})$')
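# For reference, the patterns above are meant to accept table lines shaped
# like these (made-up but typical entries):
#   * Finnish: {{t+|fi|talo}}, {{t|fi|rakennus}}
#   * French: {{t+|fr|maison|f}} {{qualifier|dwelling}}
#   *: Cantonese: {{t|yue|屋}}
#   * German: {{t-needed|de}}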
def splitdefs(defs):
    # split a translation line into its separator-delimited items, keeping
    # fragments together while parentheses, braces, square brackets or
    # angle brackets are still open
    if defs == '':
        return
    op = 0  # open parentheses
    oc = 0  # open curly braces
    os = 0  # open square brackets
    ot = 0  # open angle brackets
    cur = ''
    for item in re.split(_re_split, defs):
        op += item.count('(') - item.count(')')
        oc += item.count('{') - item.count('}')
        os += item.count('[') - item.count(']')
        ot += item.count('<') - item.count('>')
        cur += item
        if not (op or oc or os or ot):
            yield cur
            cur = ''
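# For example, splitdefs('{{t|fi|talo}}, {{t|fi|rakennus}}') yields the two
# templates as separate items, while a comma inside braces does not split:
# '{{t|cs|tak, ano}}' comes back as one rejoined item. The separator itself
# is dropped on rejoining, which is harmless here since the result is only
# ever tested against _re_item.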
def scanpage(title, text):
    mode = 0  # 1 while inside a translation table
    for line in text.splitlines():
        if re.match(_re_trans_top, line):
            mode = 1
            continue
        elif re.match(_re_ttbc_top, line):
            mode = 1
            continue
        if mode == 0:
            continue
        elif re.match(_re_trans_bot, line):
            mode = 0
            continue
        m = re.match(_re_trans_line, line)
        if m:
            # validate each item after the language name
            rest = line[m.end():]
            for item in splitdefs(rest):
                if not re.match(_re_item, item):
                    return 'definition not recognised: %s' % (repr(item))
            continue
        m = re.match(_re_ttbc_line, line)
        if m:
            continue
        m = re.match(_re_trans_sub, line)
        if m:
            continue
        m = re.match(_re_trans_mid, line)
        if m:
            continue
        m = re.match(_re_trans_req, line)
        if m:
            continue
        if re.match(_re_hiero, line):
            continue
        if re.match(_re_dummy, line):
            continue
        return 'line not recognised: %s' % (repr(line))
    return None
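# scanpage() returns None when every line inside the translation tables was
# recognised, and a diagnostic string otherwise; the handler below turns
# that string into a wiki list item, e.g. (hypothetical page and line):
#   * [[water]] because: <tt><nowiki>line not recognised: '* stray'</nowiki></tt>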
class handler(xml.sax.ContentHandler):
    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.buf = None   # accumulated <text> content
        self.tit = None   # accumulated <title> content
        self.current = None
    def startElement(self, name, attrs):
        self.current = name
        if name == 'text':
            self.buf = ''
        elif name == 'title':
            self.tit = ''
    def characters(self, data):
        if self.current == 'text':
            self.buf += data
        elif self.current == 'title':
            self.tit += data
    def endElement(self, name):
        self.current = None
        if name == 'text':
            reason = scanpage(self.tit, self.buf)
            if reason:
                sys.stdout.write('* [[%s]] because: <tt><nowiki>%s</nowiki></tt>\n' % (self.tit, reason))
        elif name == 'page':
            self.buf = None
            self.tit = None
xml.sax.parse(bz2.BZ2File(sys.stdin.buffer), handler())
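The last line streams the compressed dump from standard input, matching the curl usage in the header. To scan a dump already saved to disk, the same call can be pointed at the file instead (a minimal sketch; the filename is just whatever the dump was saved as):

xml.sax.parse(bz2.BZ2File('enwiktionary-latest-pages-articles.xml.bz2'), handler())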