from __future__ import absolute_import, unicode_literals import pywikibot from pywikibot import pagegenerators import os.path import re import os import io
class fileGenBot():
    """Generate text-file word lists from the members of a wiki category.

    Typical flow (as driven by ``main``): ``visitCat2`` collects the page
    names of a category, ``checkArticle`` optionally drops pages whose text
    already references an audio file, ``printListToFile`` writes the names
    to a new directory, and ``splitBigFile`` cuts that file into chunks of
    ``lines_per_file`` lines.
    """

    # Module-level settings. The ``global`` statements make these class-body
    # assignments bind module globals, which the methods below (and main())
    # read by bare name.
    global recurseGlobal
    recurseGlobal = 0  # category recursion depth passed to the generator; was 5
    global lines_per_file
    lines_per_file = 400  # lines per chunk written by splitBigFile; was 300
    global wikiCheck
    wikiCheck = False  # look up each article to see if audio already exists

    # Default locations; both can be overridden per call.
    _OUTPUT_ROOT = 'C:\\Users\\jim\\pywikitest\\'
    _SHORTCUTS_FILE = r'C:\Users\jim\pywikibot\Command shortcuts\Generated lists.txt'

    # Matches a reference to an Ogg audio file (".ogg" or ".oga").
    _AUDIO_PATTERN = re.compile(r'\.og[ga]')

    def __init__(self):
        print()
        print("fileGenBot Initialsed")
        print()

    def visitCat2(self, catNameOnly):
        """Return ``str(page)`` for every page in an English Wiktionary category.

        :param catNameOnly: category title handed to ``pywikibot.Category``.
        :return: list of strings, one per page yielded by
            ``CategorizedPageGenerator`` with ``recurse=recurseGlobal``.
        """
        site = pywikibot.Site("en", "wiktionary")
        print("Cat name only: " + str(catNameOnly))
        cat = pywikibot.Category(site, catNameOnly)
        generator = pagegenerators.CategorizedPageGenerator(
            cat, recurse=recurseGlobal)
        return [str(page) for page in generator]

    def printListToFile(self, listGenerated, baseDir=None):
        """Write one entry per line to a new ``<fileTitle>`` directory.

        The directory is created under ``baseDir``. If ``<fileTitle>``
        already exists, ``<fileTitle>2``, ``<fileTitle>3``, ... are tried
        until an unused name is found; the text file inside carries the
        same name plus ``.txt``.

        :param listGenerated: iterable of text lines (without newlines).
        :param baseDir: parent directory; defaults to the original
            hard-coded output folder.
        :return: path of the file that was written.

        Reads the module global ``fileTitle`` (set by ``main``).
        """
        if baseDir is None:
            baseDir = self._OUTPUT_ROOT
        title = str(fileTitle)

        # Find the first directory name that is not taken yet.
        suffix = ''
        dirNum = 1
        newDir = os.path.join(baseDir, title)
        while os.path.isdir(newDir):
            dirNum += 1
            suffix = str(dirNum)
            newDir = os.path.join(baseDir, title + suffix)
        os.mkdir(newDir)

        newFileName = os.path.join(newDir, title + suffix + '.txt')
        with io.open(newFileName, "a", encoding='utf8') as outFile:
            for line in listGenerated:
                # The file object expects text; decode once if bytes arrive
                # instead of round-tripping every line through encode/decode.
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                outFile.write(line + '\n')
        return newFileName

    def checkArticle(self, bigList, torF):
        """Optionally filter out pages whose text already mentions audio.

        :param bigList: list of page titles.
        :param torF: when falsy, ``bigList`` is returned unchanged (same
            object); when truthy, each title is fetched and kept only if
            its text contains neither ``.ogg`` nor ``.oga``.
        :return: the original list, or a new filtered list.
        """
        if not torF:
            return bigList

        # NOTE(review): this uses the default configured site while
        # visitCat2 uses en.wiktionary explicitly - confirm that is intended.
        site = pywikibot.Site()
        newBigList = []
        for line in bigList:
            page = pywikibot.Page(site, line)
            if self._AUDIO_PATTERN.search(page.text) is None:
                newBigList.append(line)
        return newBigList

    def petScan2simpleListFile(self):
        """Print lines of the PetScan file that equal ``"| [["``.

        NOTE(review): relies on a module global ``petScanFile`` that is not
        defined in the visible source, and the equality test cannot match a
        line that still carries its trailing newline - looks unfinished.
        """
        with open(petScanFile, "r") as longFile:
            for line in longFile:
                if str(line) == "| [[":
                    print(str(line))

    def splitBigFile(self, newFileName, shortcutsFile=None):
        """Split a text file into chunks of ``lines_per_file`` lines.

        Chunks are written next to the input as
        ``<newFileName>_sf_<N>.txt`` where ``N`` is the (1-based) line
        number the chunk would end on if full. The name of the last chunk
        is appended to the shortcuts file.

        :param newFileName: path of the UTF-8 file to split.
        :param shortcutsFile: file to append the last chunk's name to;
            defaults to the original hard-coded location.
        :return: list of chunk file names created, in order.
        """
        if shortcutsFile is None:
            shortcutsFile = self._SHORTCUTS_FILE

        created = []
        smallFile = None
        with io.open(newFileName, encoding='utf8') as bigFile, \
                open(shortcutsFile, "a") as folderNames:
            try:
                for lineNo, line in enumerate(bigFile):
                    if lineNo % lines_per_file == 0:
                        # Chunk boundary: finish the previous chunk first.
                        if smallFile is not None:
                            smallFile.close()
                        smallName = '{}_sf_{}.txt'.format(
                            newFileName, lineNo + lines_per_file)
                        smallFile = io.open(smallName, "w", encoding='utf8')
                        created.append(smallName)
                    smallFile.write(line)
            finally:
                # Close the current chunk even if reading/writing failed.
                if smallFile is not None:
                    smallFile.close()
            if created:
                # Only the final chunk is recorded, as in the original code.
                folderNames.write(created[-1] + ' \n')
        return created
def main(*args):
    """Generate and split a word list for the category named on the command line.

    :param args: command-line arguments; passed to ``pywikibot.handle_args``,
        which returns the arguments that remain for this script. The first
        remaining argument is used as the category name.

    Sets the module global ``fileTitle``, which ``printListToFile`` reads to
    name its output directory and file.
    """
    global fileTitle

    local_args = pywikibot.handle_args(args)
    if not local_args:
        # Nothing to look up; avoid failing later with an unusable title.
        print("No category name given; nothing to do.")
        return

    # Use the single category name rather than the whole argument list,
    # which cannot serve as a page title or a directory name.
    argCatName = local_args[0]
    print("arg cat name: " + str(argCatName))
    fileTitle = argCatName

    bot = fileGenBot()
    listFromCat = bot.visitCat2(argCatName)

    # wikiCheck False means: do not check whether audio already exists.
    list2 = bot.checkArticle(listFromCat, wikiCheck)
    fileName1 = bot.printListToFile(list2)
    bot.splitBigFile(fileName1)
# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()