Using Python to crawl mobi e-books
Recently, I made a public account for pushing kindle e-books: there are not many free book libraries for kindle, so crawlers are needed to obtain enough books. So I wrote the following crawler to crawl the kindle114 ebook. Note: When the number of crawlers is too large, a javascript rather than the original html will be returned because the other party enables the crawling, so I used PyV8 to execute this js file to get the real address. The current problem: Regular Expression writing is not good enough. After all, it is the first official write crawler: you cannot download the attachment crawler you need to purchase as a single thread, and it is slow to crawl the entire website. I have tried converting it to a multi-process, but it seems that most crawler processes cannot crawl @ copy Code #-*-coding: UTF-8-*-import urllib2import reimport requestsimport osimport hashlib def fuckJS (js): import PyV8 import re # Remove the <script> label js = js [31: -9] for st in ['window', 'location', "'assign'", "'href '", "'replace'"]: equal = re. findall ('[_ A-Za-z0-9 =] + % s;' % st, js) # Find the variable assignment equation if equal = []: # possible no continue else: equal = equal [0] var = equal. split ('=') [0]. strip () # locate the variable name # Killing equations js = js. replace (equal, '') # replace the variable with the actual meaning js = js. replace (var, st) # replace ['xx']. xx js = js. replace ("['% s']" % st. strip ("'"),'. % s' % st. strip ("'") # convert window. href = after the content is kicked off, because when PyV8 only outputs the value of the last equation if re. findall ('window \. href =. + ', js )! = []: Js = js. replace (re. findall ('window \. href =. + ', js) [0], '') # Delete location. xxx = js. replace ('location. href = ',''). replace ('location. replace ',''). replace ('location. 
assign ', '') # handed over to you-v-ctxt2 = PyV8.JSContext () ctxt2.enter () # print ctxt2.eval (js) trueAddr = ctxt2.eval (js) print trueAddr return trueAddr def downloadMobi (name, url): # Remove invalid file names in windows. unlawName = '<>/\ | :""*? 'For I in unlawName: name = name. replace (I, '') # problems caused by insufficient regular expression write @ if name. count ('& nbsp; img src1_templateyeei_dream1cssyeeidigest_1.gif class = vm alt = title =')> 0: name = name. split ('& nbsp') [0] + '. mobi' # avoid repeated download if OS. path. exists ('d: \ Kindle114SpiderDownload \ '+ name): print 'already have', name return url = url. split ('') [0] s = requests. session () username = 'your username' password = 'your passwordmd5. = Hashlib. md5 (password ). hexdigest () data = {'formhash': '23cd6c29', 'Referer': '', 'username': username, 'passwordmd5, 'questionid ': '0', 'ancer': ''} res = s. post ('HTTP: // www.kindle114.com/member.php? Mod = logging & action = login & loginsubmit = yes & loginhash = LYn7n & inajax = 1', data) # res = s. get ('HTTP: // www.kindle114.com/forum.php? Mod = attachment & aid = MTQ2NTB8ZjhkNjY3NmF8MTQxNjg5OTYxOXw0NDIxfDczNjI % 3D ') try: res = s. get (url, timeout = 200) expires T: print 'Time out for ', name # print 'content [: 50]' # print res. content [: 50] if res. content. count ('<! DOCTYPE html ')> 0: print '!!!!!!!!!!!!!!!!! Not a mobi, this file need gold coin !!!!!!!!!!!!!!! 'Return try: with open ('d: \ Kindle114SpiderDownload \ '+ name, "wb") as code: code. write (res. content) expires T: print '!!!!!!!!!!!!!!!!!!!!! Invalid file name !!!!!!!!!!!!!!!!!! ', Name def spiderThread (url, threadName): req = urllib2.urlopen (url, timeout = 10) text = req. read () if text. count (' <! DOCTYPE html ') = 0: js = text trueURL = 'HTTP: // www.kindle114.com/' + fuckJS (js) print 'trueurl', trueURL req = urllib2.urlopen (trueURL) text = req. read () # href = '<a href = "(. *?) "Onmouseover =" showMenu ({\ 'ctrlid \ ': this. 
id, \ 'pos \': \ '12 \ '}) "id = .*? Target = "_ blank"> (.*?) </A> 'href = '<a href = "(.*?) ".*? Target = "_ blank"> (.*?) </A> 'href_re = re. compile (href) href_info = href_re.findall (text) bookSum = 0 for I in href_info: if I [1]. count ('. mobi ')> 0: bookSum + = 1 if bookSum = 0: print '!!! BookSum = 0 !!!! ', Text [: 100] if bookSum = 1: print 'only one book in this thread 'bookfilename = threadName + '. mobi 'for I in href_info: if I [1]. count ('. mobi ')> 0: link = I [0]. replace ('amp; ', '') break print link, bookFileName downloadMobi (bookFileName, link) else: print str (bookSum), 'in this thread' for I in href_info: if I [1]. count ('. mobi ')> 0: link = I [0]. replace ('amp; ', '') bookFileName = I [1] print link, BookFileName downloadMobi (bookFileName, link) for pageNum in range (1,125): url = 'HTTP: // www.kindle114.com/forum.php? Mod = forumdisplay & fid = 2 & filter = sortid & sortid = 1 & searchsort = 1 & geshi = 1 & page = '+ str (pageNum) print '================ url', url,' ========================== 'try: req = urllib2.urlopen (url, timeout = 10) expires T: print 'page time out', url text = req. read () href = '