def parselist(dump_path="D:\\Code\\wikt.bz2", output_path="parsed_titles.txt"):
    """Write the English definitions found in a wiki XML dump to a text file.

    Every entry yielded by ``xmlreader.XmlDump(dump_path).parse()`` whose
    text contains an English language heading (``==English==`` or
    ``== English ==``) is cut down to its English section.  That section is
    split on level 3-5 headings, and every chunk that contains definition
    lines (``\\n#``) produces one output line per definition::

        <entry title> TAB <heading, title-cased> TAB <cleaned definition>

    The number of distinct English titles is printed once the dump has been
    read.

    NOTE(review): the previous text of this function had lost its
    indentation and every ``[...]`` span (list subscripts, regex character
    classes, the argument of ``join``).  Those pieces are reconstructed
    here from what the surrounding code needs; confirm them against a
    known-good output file, in particular the three output columns.

    Args:
        dump_path: Path handed to ``xmlreader.XmlDump``.  Defaults to the
            location that used to be hard-coded.
        output_path: File to (over)write with the tab-separated result.
            Defaults to the name that used to be hard-coded.
    """
    import io
    import re

    import xmlreader

    # Language heading; the optional spaces cover both spellings that the
    # entry filter has always accepted.
    english_heading = re.compile(r"== ?English ?==")
    # A level-2 heading (exactly two '=') starts the next language section.
    next_language = re.compile(r"\n==[^=]")
    # Level 3-5 headings; the lookahead leaves the heading text in the chunk.
    sub_heading = re.compile(r"\n={3,5}(?=[^=])")
    # A top-level definition line: '#' not followed by '#', ':' or '*'
    # (sub-senses, example lines and quotation lines are skipped).
    definition_start = re.compile(r"\n#(?=[^#:*])")
    # '[[target|' prefix of a piped link, so only the display text survives.
    piped_link_target = re.compile(r"\[\[[^\]]*?\|")
    html_comment = re.compile(r"<!--.*?-->")

    def clean_definition(raw):
        """Reduce one definition chunk to a single line of plain-ish text."""
        d = raw.split("\n")[0]  # only the definition line itself
        d = piped_link_target.sub("", d)
        d = html_comment.sub("", d)
        d = d.replace("]", "").replace("[", "")
        d = d.replace("{{", "(").replace("}}", ")")
        d = d.replace(" of|", " of ")
        d = d.replace("from=", "from ")  # Surname / given name templates
        d = (d.replace("context|", "").replace("qualifier|", "")
              .replace("ib|", "").replace("italbrac|", ""))
        d = d.replace("'''", '"')  # Most common use of explicit boldface
        d = (d.replace("''(", "(").replace(")''", ")")
              .replace("(''", "(").replace("'')", ")"))
        d = d.replace("|_|", " ").replace("|", ", ")
        # Final cleaning: a literal tab would break the column layout.
        d = d.replace("\t", " <tab> ")
        d = d.replace('""', '"')
        return d

    english_titles = set()
    dump = xmlreader.XmlDump(dump_path)

    # One pass over the dump is enough: an entry is filtered and processed
    # in the same iteration instead of parsing the archive twice.
    # io.open + unicode literals keep this working on Python 2 and 3; the
    # "ignore" error policy mirrors the old encode("utf-8", "ignore").
    with io.open(output_path, "w", encoding="utf-8",
                 errors="ignore") as writefile:
        for entry in dump.parse():
            parts = english_heading.split(entry.text or "", maxsplit=1)
            if len(parts) < 2:
                continue  # no English section in this entry
            english_titles.add(entry.title)

            # Everything up to the next language heading.
            section = next_language.split(parts[1], maxsplit=1)[0]
            for chunk in sub_heading.split(section):
                if "\n#" not in chunk:
                    continue  # heading without definition lines
                # The chunk starts with the heading text, terminated by '='.
                # strip() handles spaced headings such as '=== Noun ==='.
                pos = (chunk.split("=")[0]
                       .replace("{", "").replace("}", "").strip().title())
                # Element 0 is the text before the first definition line.
                for raw in definition_start.split(chunk)[1:]:
                    line = u"\t".join(
                        [entry.title, pos, clean_definition(raw)])
                    writefile.write(line + u"\n")

    print(len(english_titles))  # Should be something sane.