#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Tbot/code/createflw

"""
Create a simple foreign word entry in the en.wikt

Append a section if not already present
"""

# NOTE(review): this file was recovered from a copy in which line breaks were
# collapsed and every "[...]" span (subscripts, slices, list literals, regex
# character classes, wiki-link brackets) had been stripped.  Spots where the
# missing text had to be inferred are marked NOTE(review) below; please
# compare them against the canonical bot source before deploying.

import wikipedia
import catlib
import sys
import re
import pickle
import socket
from time import time, sleep
import shelve

from mwapi import getwikitext, getedit
from __main__ import cache, logpage, plock


def safe(s):
    """Return `s` serialized with pickle.dumps (used by log() before printing)."""
    return pickle.dumps(s)


def log(s):
    """Print safe(s), with leading/trailing quote characters stripped, while holding plock."""
    with plock:
        print(safe(s).strip("'" + '"'))


# entries we've already seen that exist :
Exists = set()

# some regex

# these only catch default namespace names ...
# NOTE(review): namespace words ("image", "media") are inferred; only the
# trailing part of each pattern survived.
reimage = re.compile(r'\[\[image:(.*?)[\|\]]', re.I)
reaudio = re.compile(r'\[\[media:(.*?)[\|\]]', re.I)

# catch image by the |thumb| parameter ;-)
# NOTE(review): the capture group before the first "|" is inferred.
rethumb = re.compile(r'\[\[(.*?)\|[^\]]*thumb[\|\]]')

# image by .jpg or .png:
# NOTE(review): contents of the leading character class are inferred.
rejpg = re.compile(r'([^\|\[{=\n]*?\.(jpg|png))[\|\]}]')

# and perhaps an ogg file in a template, as in en.wikt?
# NOTE(review): contents of the leading character class are inferred.
reogg = re.compile(r'([^\|\[{=\n]*?\.ogg)')

# IPA string, delimited by /.../ or [...]
reIPA = re.compile(r'IPA.*?([/\[][^/\]]+?[/\]])')
# first argument of an {{IPA|...}} template
reIPAt = re.compile(r'\{\{IPA\|([^\|}]+?)[\|}]')

# fix glosses, context at start, (1) sense number at end should be removed,
# each should be subbed with space
regloss1 = re.compile(r"^''\(.*?\)''")
regloss2 = re.compile(r"^\(''.*?''\)")
regloss3 = re.compile(r"\(\d+\)$")
rejump = re.compile(r'\{\{jump\|[^}]*}}')

# need only do once on load
site = wikipedia.getSite("en", "wiktionary")
csite = wikipedia.getSite("commons", "commons")

# trans table prefixes, other than "*" at the start of the line
Tlist = dict(
    ru=r'\s*\|en=',
    uk=r'\s*\|en=',
    nl=r':?\*',
    sq=r'<br>\{\{en}}',
    ga=r'\{\{aistr\|en',  # careful here, next char is | which must match \W
    lt=r'\{\{env1}}',
    yi=r'\|EN=',
    tr=r':?\*\{\{en}}:',
    mn=r':\*\{\{en}}:'
)
# and:
Tlist['is'] = r'\|en='  # "is" is a keyword ;-)

# by lc here, various languages
Wlist = dict(
    cs="{{Wikipedie}}",
    de="{{W}}",
    fr="{{WP}}",
    ga=u"{{vicip\u00e9id}}",
    hu="{{wp1}",  # one arg?
    la="{{vicipaedia}}",
    lt="{{vikipedija}}",
    nl="{{-info-}}",
    pt=u"{{Wikip\u00e9dia}}",
    sl="{{W}}",
    vi="{{-info-}}"
)

# pronunciation templates for IPA (modded for regex, use . for diacritics etc):
Plist = dict(
    de="Lautschrift",
    es="ronunciaci.n",
    fr="pron"
)

# images that show up in page structure for various reasons,
# e.g. first two on pt.wikt
Istops = set([
    'LuisdeCamoes4.jpg',
    'Os Lusadas.jpg',
    'Wikipedia.png'
])

# number of cache entries written so far; the cache is synced every 20th
cis = 0


def createFLentry(flw, lang, lc, pos, title, gloss, mod):
    """Create (or append) an en.wikt entry for foreign word `flw`.

    The entry is only written when the `lc` Wiktionary has a page for `flw`
    that lists `title` on what looks like a translation line.

    Parameters:
        flw   -- the foreign-language word (page title to create/extend)
        lang  -- language name, used for the ==section== header
        lc    -- language code
        pos   -- part-of-speech header text, e.g. "Noun"
        title -- the English word whose translation table named `flw`
        gloss -- gloss text for the definition line
        mod   -- dict of optional attributes; keys read here are
                 'alt', 'tra', 'g', 'g2', 'g3', 'scr'

    Returns True when the FL wikt page exists (per the inline comments the
    caller may then convert to a t+ template) or the entry was saved; False
    when the word is still in the 110-day cache, the FL page could not be
    fetched, or saving failed.
    """
    global cis

    # for now, don't add to the same page (would cause edit conflict anyway?)
    if flw == title:
        # doesn't matter because not called with title == flw and return
        # value used (see tbot.py)
        return True

    # check cache
    # records last time we tried this word, don't try again for 110 days
    # may need to disable sometimes for debugging!
    ckey = lc + ':' + flw
    if ckey in cache:
        last = cache[ckey]
        if last > time() - (110 * 24 * 3600):
            return False
    cache[ckey] = time()  # assume we will complete check now ...
    cis += 1
    if cis % 20 == 0:
        cache.sync()

    # NOTE(review): the surviving format string had five placeholders for six
    # arguments; the bracketed "[%s]" for title is inferred.
    log("createFL %s: %s %s, %s [%s] (%s)" % (flw, lang, lc, pos, title, gloss))

    # get the FL.wikt page
    # fix codes WMF hasn't yet (or has, but we still don't have set correctly)
    # (no yue wikt as yet, hopefully will be created as yue, not zh-yue as in pedia)
    zlc = {'nb': 'no', 'cmn': 'zh', 'nan': 'zh-min-nan'}.get(lc, lc)

    try:
        flsite = wikipedia.getSite(zlc, "wiktionary")
        flpage = wikipedia.Page(flsite, flw)
        fltext = getwikitext(flpage)
        if fltext:
            print("FL page exists ...")
    except wikipedia.NoPage:
        with plock:
            print("page not in FL wikt")
        return False
    except wikipedia.IsRedirectPage:
        with plock:
            print("FL wikt entry is a redirect")
        return True  # can change to t+
    except KeyboardInterrupt:
        raise
    except Exception:
        with plock:
            print("some exception getting page from FL wikt")
        return False

    if not fltext:
        with plock:
            print("page not in FL wikt")
        return False

    # see if English word in FL page, presumably as a translation
    if title not in fltext:
        print("FL wikt page does not contain title")
        return True  # we want to insert t+ template, even though not adding entry

    # nl.wikt uses ":*", will be other variations,
    # ru.wikt uses |en= ... etc etc:
    tpre = Tlist[lc] if lc in Tlist else r'\*'
    retrans = re.compile(
        r'^' + tpre + r'.*\W' + re.escape(title) + r'(\W|$)', re.M)

    # look for a line that may be a trans line, with title surrounded by
    # non-word characters
    mo = retrans.search(fltext)
    if mo:
        # truncate fltext at that line, so we don't get extra stuff from
        # following sections
        fltextall = fltext
        # NOTE(review): the slice expression was lost; cutting at the end of
        # the match is an inferred reconstruction.
        fltext = fltext[:mo.end()]
    else:
        print("title not in translation line?")
        logpage.add("[[%s]] entry [[%s:%s]] exists, pattern not matched"
                    % (title, lc, flw))
        return True  # we want to insert t+ template, even though not adding entry

    # a short entry may be just the English translation, not very good;
    # a length check (80, arbitrary) used to live here and is disabled.

    # now reconfirm local existence and section absent, get text
    seealso = ''
    addc = 'created %s entry ' % lang
    try:
        log("getting local page %s" % flw)
        page = wikipedia.Page(site, flw)
        text = getedit(page)

        # check language section (header may or may not be wiki-linked) ...
        if re.search(r'^==\s*\[*' + re.escape(lang) + r'\]*\s*==', text, re.M):
            log("page %s and section %s already exists" % (flw, lang))
            return True  # page and section are there now, so convert to t+

        # special case until rationality w/r/t Norwegian and Nynorsk returns ...
        if lang == "Norwegian" and '==Norwegian ' in text:
            log("page %s and some Norwegian section already exists" % flw)
            return True  # page and section are there now, so convert to t+

        # another temporary special case, for Serbo-Croatian ...
        # NOTE(review): the list of language names was lost; these three are
        # inferred.
        if lang in ['Serbian', 'Croatian', 'Bosnian'] and '==Serbo-' in text:
            log("page %s and some Serbo- section already exists" % flw)
            return True  # page and section are there now, so convert to t+

        addc = 'added %s section ' % lang

    except wikipedia.NoPage:
        # usual case when entry is new
        text = ''
    except wikipedia.IsRedirectPage:
        # overwrite a redirect if present
        text = ''
        addc = 'replaced redirect with %s entry ' % lang
        seealso = page.getRedirectTarget()
        # limit to case redirects, simple case for now (so we don't "fix" Hebrew)
        if flw.lower() != seealso.lower():
            log("page %s is a redirect to %s, not replaced" % (flw, seealso))
            return True

    # see if we can "borrow" image or audio
    image = ''
    mo = reimage.search(fltext)
    if not mo and '|thumb|' in fltext:
        mo = rethumb.search(fltext)
    if not mo:
        mo = rejpg.search(fltext)
    if mo:
        img = mo.group(1)
        if ':' in img:
            # drop a namespace prefix
            # NOTE(review): index [1] is inferred, the subscript was lost.
            img = img.split(':')[1]
        if img and img not in Istops:
            log("found image: %s" % img)
            ipage = wikipedia.Page(csite, "Image:" + img)
            try:
                # only fetched to confirm the file page exists on commons
                getwikitext(ipage)
                image = '[[Image:%s|thumb|%s]]\n' % (img, flw)
                with plock:
                    print("found on commons")
            except wikipedia.NoPage:
                with plock:
                    print("not found on commons")
            except Exception:
                # best effort: a missing image never blocks the entry
                with plock:
                    print("other exception looking for commons image")

    audio = ''
    mo = reaudio.search(fltext)
    if not mo:
        mo = reogg.search(fltext)
    if mo:
        aud = mo.group(1)
        if ':' in aud:
            # NOTE(review): index [1] is inferred, the subscript was lost.
            aud = aud.split(':')[1]
        # NOTE(review): the slice was lost; comparing the leading len(lc)
        # characters of the file name with the language code is inferred.
        if aud[:len(lc)].lower() != lc:
            log("audio file name %s does not match language %s" % (aud, lc))
            aud = ''
        if aud:
            log("found audio: %s" % aud)
            apage = wikipedia.Page(csite, "Image:" + aud)
            try:
                # only fetched to confirm the file page exists on commons
                getwikitext(apage)
                audio = '* {{audio|%s|%s}}\n' % (aud, flw)
                with plock:
                    print("found on commons")
            except wikipedia.NoPage:
                with plock:
                    print("not found on commons")
            except Exception:
                # best effort: missing audio never blocks the entry
                with plock:
                    print("other exception looking for commons audio")

    ipa = ''
    ipas = set()  # so repeats don't bother us
    ipas.update(reIPA.findall(fltext))
    ipas.update(reIPAt.findall(fltext))
    if lc in Plist:
        rp = re.compile(r'\{\{' + Plist[lc] + r'\|(.*?)\}\}')
        ipas.update(rp.findall(fltext))

    if len(ipas) == 1:
        i = ipas.pop().strip()
        # normalise delimiters to /.../ or [...]
        if i.startswith('/'):
            i = '/' + i.strip(' /') + '/'
        elif i.startswith('['):
            i = '[' + i.strip(' [] /') + ']'
        elif i:
            i = '/' + i.strip(' /') + '/'
        # empty or placeholder transcriptions are useless
        if i in ('//', '[]', '/.../', '[...]'):
            i = ''
        if i:
            ipa = "* {{IPA|%s|lang=%s}}\n" % (i, lc)
            log("found IPA %s" % i)
    elif len(ipas) > 1:
        with plock:
            print("more than one IPA?")

    if audio or ipa:
        pron = '\n===Pronunciation===\n' + ipa + audio
    else:
        pron = ''

    # 'pedia link? look at all original text; often follow trans table
    wplink = ''
    if ("{{wikipedia}}" in fltextall
            or "{{wikipedia|" + flw + '}}' in fltextall
            or (lc in Wlist and Wlist[lc] in fltextall)
            # same template with the word as explicit argument
            # NOTE(review): the [:-2] slice (dropping "}}") is inferred.
            or (lc in Wlist
                and Wlist[lc][:-2] + '|' + flw + '}}' in fltextall)):
        wplink = '{{wikipedia|lang=%s}}\n' % lc
        print("added wikipedia link")

    # set up additional infl params from attribute dict:
    aip = ''
    for key, param in (('alt', 'head'), ('tra', 'tr'), ('g', 'g'),
                       ('g2', 'g2'), ('g3', 'g3'), ('scr', 'sc')):
        if key in mod and mod[key]:
            aip += '|' + param + '=' + mod[key]

    gwas = gloss
    gloss = gloss.strip()
    for rx in (regloss1, regloss2, regloss3, rejump):
        gloss = rx.sub(' ', gloss)
    gloss = gloss.strip()

    if not gloss:
        log("nothing left to gloss ...")
        return True  # as FL wikt page does exist

    # decap gloss (some people insist on capitalizing it, which is wrong) and
    # fix, this is almost always right:
    for cap in ('Of ', 'Country ', 'Person '):
        if gloss.startswith(cap):
            gloss = cap.lower() + gloss[len(cap):]
    gl = gloss.lower()
    if "translation" in gl:
        log("word 'translation' in gloss, skipped")
        return True  # FL wikt page exists
    if gl[1:] != gloss[1:]:
        gl = gloss  # caps in string after first, so probably okay
    if gloss.startswith(title):
        gl = gloss  # Proper noun, e.g. "French language"
    if gl != gwas:
        log("gloss changed %s -> %s" % (gwas, gl))

    # change Proper noun to Noun if lower case; usually the right answer
    if pos == "Proper noun" and flw.islower():
        log("changed Proper noun to Noun")
        pos = "Noun"

    # add to or create entry text:
    if text:
        text += '\n\n----\n{{rfc-auto|sort languages}}\n'

    # NOTE(review): line breaks inside this template were lost; the layout
    # below is inferred.
    text += (
        '==%s==\n'
        '%s%s%s\n'
        '===%s===\n'
        '{{infl|%s|%s%s}}\n'
        '\n'
        '# [[%s]] (%s)\n'
        '\n'
        '{{tbot entry|%s|%s|{{subst:CURRENTYEAR}}|{{subst:CURRENTMONTHNAME}}|%s}}\n'
    ) % (lang, wplink, image, pron, pos, lc, pos.lower(), aip,
         title, gl, lang, title, lc)

    # other special things (no reason not to ;-)
    if lc == 'fr' and pos == 'Verb':
        text = text.replace(
            "{{tbot", "{{rfinfl|type=conjugation|lang=fr}}\n{{tbot")

    # add interwiki, let AutoFormat and Interwicket sort things as needed
    iw = '[[%s:%s]]' % (lc, flw)
    if iw not in text:
        text += iw + '\n'

    # if overwriting redirect, add see
    if seealso:
        text = '{{also|' + seealso + '}}\n' + text

    try:
        with plock:
            page.put(
                text,
                comment=addc + "from translation at [[%s]] and [[%s:%s]]"
                % (title, lc, flw),
                minorEdit=False)
    except wikipedia.PageNotSaved:
        with plock:
            print("failed to save page")
        return False
    except socket.timeout:
        with plock:
            print("socket timeout, maybe not saving page")
        return False
    except socket.error:
        with plock:
            print("socket error, maybe not saving page")
        return False
    except Exception as e:
        with plock:
            print("some exception saving page " + repr(e))
        return False

    return True