I recently set up a public account that pushes e-books to Kindle: "Kindle Free Library".
However, it does not have many e-books yet, so I need a crawler to collect enough books.
So I wrote the crawler below, which crawls e-books from kindle114.
Points worth noting:
When too many pages have been crawled, the site returns a piece of JavaScript instead of the original HTML, because it has anti-crawler protection enabled, so I use
PyV8 to execute this JS and obtain the real address.
Known problems:
The regular expressions are not written well enough; after all, this is the first time I have seriously written a crawler :)
Attachments that have to be purchased cannot be downloaded.
The crawler is single-threaded, so crawling the entire site is slow. I tried turning it into a multi-process version, but the processes seemingly cannot be logged in at the same time, so most of
the crawler processes were unable to crawl normally @@
# -*- coding: utf-8 -*-
"""Crawl the kindle114 forum and download the .mobi attachment(s) of each thread.

This script targets Python 2: it uses ``urllib2``, ``urlparse``,
``raw_input`` and PyV8, and handles HTTP bodies as byte strings.

Flow: every listing page is fetched, each thread on it is opened, every
link whose text contains ``.mobi`` is collected, and each attachment is
downloaded through a logged-in ``requests`` session into ``DOWNLOAD_DIR``.
"""
import urllib2
import urlparse
import re
import requests
import os
import hashlib

BASE_URL = 'http://www.kindle114.com/'
LOGIN_URL = (BASE_URL + 'member.php?mod=logging&action=login'
             '&loginsubmit=yes&loginhash=LYn7n&inajax=1')
LIST_URL = (BASE_URL + 'forum.php?mod=forumdisplay&fid=2&filter=sortid'
            '&sortid=1&searchsort=1&geshi=1&page=')
FIRST_PAGE = 1
LAST_PAGE = 124  # the original loop was range(1, 125)

DOWNLOAD_DIR = 'D:\\Kindle114SpiderDownload'

# Credentials may be supplied through the environment; the fallbacks are the
# placeholders that were hard-coded in the original script.
USERNAME = os.environ.get('KINDLE114_USERNAME', 'your user name')
PASSWORD = os.environ.get('KINDLE114_PASSWORD', 'your password.')

# Characters that are not allowed in Windows file names.
ILLEGAL_FILENAME_CHARS = '<>/\\|:"*?'

# What is left of the "digest" icon <img> tag in a captured thread title once
# the illegal characters above have been stripped from it.
DIGEST_ICON_RESIDUE = (' img src=templateyeei_dream1cssyeeidigest_1.gif'
                       ' class=vm alt= title=')

# Names the anti-crawler JavaScript hides behind throw-away variables.
JS_ALIASED_NAMES = ['window', 'location', "'assign'", "'href'", "'replace'"]

# (href, link text) of every target="_blank" anchor on a thread page.
ATTACHMENT_LINK_RE = re.compile(r'<a href="(.*?)".*?target="_blank">(.*?)</a>')

# NOTE(review): the thread-list pattern was an empty string in the text this
# script was recovered from (the HTML inside the literal was apparently
# stripped), which made the crawl loop fail with IndexError on the first page.
# The pattern below assumes stock Discuz!-style title links (class "xst");
# confirm it against a live listing page before relying on it.
THREAD_LINK_RE = re.compile(
    r'<a href="([^"]+)"[^>]*class="[^"]*\bxst\b[^"]*"[^>]*>(.*?)</a>')


def _fetch(url, timeout=10):
    """Return the body of ``url`` and always close the HTTP response."""
    response = urllib2.urlopen(url, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()


def _looks_like_html(body):
    """Return True when ``body`` starts like an HTML page (has a doctype)."""
    # Case-insensitive, and limited to the head so that HTML embedded deep
    # inside a downloaded book cannot trigger a false positive.
    return '<!doctype html' in body[:1024].lower()


def fuckjs(js):
    """Evaluate the anti-crawler JavaScript and return the address it yields.

    ``js`` is the complete response body: a single
    ``<script type="text/javascript">...</script>`` element whose code
    aliases ``window``/``location``/``'href'``/... to temporary variables
    and then redirects.  The aliases are substituted back, the redirect
    assignment/call is removed so that only the address expression is left,
    and that expression is evaluated with PyV8.

    Returns the value of the last evaluated expression (the redirect target,
    which the caller appends to the site's base URL).
    """
    # PyV8 is only needed once the site starts serving the JS challenge.
    import PyV8

    # Drop the opening <script type="text/javascript"> tag (31 characters)
    # and the closing </script> tag (9 characters).
    js = js[31:-9]

    for real_name in JS_ALIASED_NAMES:
        # Find the "alias = real_name;" assignment; it may be absent.
        assignments = re.findall(r'[_a-zA-Z0-9 =]+%s;' % real_name, js)
        if not assignments:
            continue
        assignment = assignments[0]
        alias = assignment.split('=')[0].strip()
        # Remove the assignment itself ...
        js = js.replace(assignment, '')
        # ... and put the real name back wherever the alias is used.  An
        # empty alias would make replace() insert text between every char.
        if alias:
            js = js.replace(alias, real_name)
        # Turn ['xx'] subscripts into .xx attribute access.
        bare = real_name.strip("'")
        js = js.replace("['%s']" % bare, '.%s' % bare)

    # Discard everything from "window.href=" on: PyV8 only reports the value
    # of the last expression, which must be the address.
    trailing = re.findall(r'window\.href=.+', js)
    if trailing:
        js = js.replace(trailing[0], '')

    # Strip the redirect itself so the bare address expression remains.
    js = (js.replace('location.href=', '')
            .replace('location.replace', '')
            .replace('location.assign', ''))

    # NOTE: this evaluates JavaScript received from the remote server.
    ctxt = PyV8.JSContext()
    ctxt.enter()
    try:
        trueaddr = ctxt.eval(js)
    finally:
        # The original never left the context it entered.
        ctxt.leave()
    print(trueaddr)
    return trueaddr


def downloadmobi(name, url):
    """Log in and save the attachment at ``url`` as ``name`` in DOWNLOAD_DIR.

    ``name`` is sanitised for use as a Windows file name.  Nothing is
    downloaded when the target file already exists, and nothing is written
    when the request fails or when the server answers with an HTML page
    (which is what it sends for attachments that must be purchased).
    """
    for char in ILLEGAL_FILENAME_CHARS:
        name = name.replace(char, '')

    # The thread title can drag the digest icon markup along with it; keep
    # only the part in front of it.
    # NOTE(review): the separator used by the original split() was garbled in
    # the recovered text; splitting on the residue itself is assumed here.
    if DIGEST_ICON_RESIDUE in name:
        name = name.split(DIGEST_ICON_RESIDUE)[0] + '.mobi'

    target = os.path.join(DOWNLOAD_DIR, name)
    # Avoid duplicate downloads.
    if os.path.exists(target):
        print('already has %s' % name)
        return

    # Keep only the URL itself and resolve site-relative links.
    url = urlparse.urljoin(BASE_URL, url.split(' ')[0])

    session = requests.session()
    data = {
        'formhash': '23cd6c29',
        'referer': '',
        'username': USERNAME,
        'password': hashlib.md5(PASSWORD).hexdigest(),
        'questionid': '0',
        'answer': '',
    }
    try:
        session.post(LOGIN_URL, data, timeout=200)
        res = session.get(url, timeout=200)
    except requests.RequestException as err:
        # The original fell through here and went on to save the login
        # response as if it were the book.
        print('Time out for %s (%s)' % (name, err))
        return

    if _looks_like_html(res.content):
        print('!!!!!!!!!!!!!!!!! Not a mobi, this file needs gold coin'
              ' !!!!!!!!!!!!!!!')
        return

    try:
        if not os.path.isdir(DOWNLOAD_DIR):
            os.makedirs(DOWNLOAD_DIR)
        with open(target, 'wb') as out:
            out.write(res.content)
    except (IOError, OSError):
        print('!!!!!!!!!!!!!!!!!!!!! An illegal file name was encountered'
              ' !!!!!!!!!!!!!!!!!! %s' % name)


def spiderthread(url, threadname):
    """Download every .mobi attachment linked from the thread page ``url``.

    A thread with exactly one book is saved as ``threadname + '.mobi'``;
    otherwise each book is saved under the text of its own link.  When the
    site answers with the anti-crawler JavaScript instead of HTML, the real
    address is computed with ``fuckjs`` and fetched instead.
    """
    text = _fetch(url)
    if not _looks_like_html(text):
        trueurl = BASE_URL + fuckjs(text)
        print('Trueurl %s' % trueurl)
        text = _fetch(trueurl)

    books = [(href, label)
             for href, label in ATTACHMENT_LINK_RE.findall(text)
             if '.mobi' in label]

    if not books:
        print('!!! booksum = 0 !!!! %s' % text[:100])
        return

    if len(books) == 1:
        print('Only one book in this thread')
        named_books = [(books[0][0], threadname + '.mobi')]
    else:
        print('%d in this thread' % len(books))
        named_books = books

    for href, bookfilename in named_books:
        # Turn the HTML-escaped "&amp;" separators back into "&".
        link = href.replace('amp;', '')
        print('%s %s' % (link, bookfilename))
        downloadmobi(bookfilename, link)


def main():
    """Walk the listing pages and crawl every thread found on them."""
    for pagenum in range(FIRST_PAGE, LAST_PAGE + 1):
        url = LIST_URL + str(pagenum)
        print('=============url %s ===============' % url)
        try:
            text = _fetch(url)
        except Exception as err:
            # Top-level boundary: skip this page instead of going on with a
            # stale or undefined response as the original did.
            print('page Time Out %s (%s)' % (url, err))
            continue

        for href, threadname in THREAD_LINK_RE.findall(text):
            print('%s %s' % (href, threadname))
            thread_url = urlparse.urljoin(BASE_URL, href.replace('amp;', ''))
            try:
                spiderthread(thread_url, threadname)
            except Exception as err:
                # One broken thread must not stop the whole crawl.
                print('!!!!!!!!!!!!! Error with %s %s !!!!!!!!!!!!!!!!'
                      % (threadname, thread_url))
                print(err)

    raw_input('Finish ALL!!!')


if __name__ == '__main__':
    main()
Using Python to crawl e-books in MOBI format