Crawling a Fiction Site with Python

A small Python 2 script that scrapes a novel site: it downloads the chapter index, follows every chapter link, strips the HTML from each chapter body, and appends the text to note.txt. The base URL (SX) and the markers such as chapterlist_hengfu_1 and yuedu_bottom are specific to the target site.

# coding=utf-8
import urllib2

SX = 'novel station web site'  # base URL of the target novel site
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
fo = open("note.txt", "wb")


def gethtml(url):
    # Fetch a page and transcode it from the site's GBK encoding to UTF-8.
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
        data = data.decode('gbk')
        data = data.encode('utf-8')
        print len(data)
        return data
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason


def dealindex(url):
    # Cut the chapter list out of the index page and visit every link in it.
    data = gethtml(url)
    bgnpos = data.index('chapterlist_hengfu_1') + 10
    endpos = data.index('chapterlist_hengfu_2') - 10
    print bgnpos
    print endpos
    achfx = data[bgnpos:endpos]
    pos = 0  # offsets below are relative to the achfx slice
    while 1:
        newpos = achfx.find('href=', pos)
        if newpos == -1:
            break
        # Fixed-width slices tuned to this site's markup: 13-character
        # relative URLs, anchor text running up to the closing </a>.
        indexurl = achfx[newpos + 6:newpos + 19]
        titlepos = achfx.find('</a>', newpos + 20)
        titlename = achfx[newpos + 21:titlepos]
        pos = titlepos + 5
        dealcontext(SX + indexurl, titlename)


def dealcontext(url, title):
    # Extract the chapter body between the content marker and the footer div.
    print url
    print title
    data = gethtml(url)
    bgnpos = data.find('name="content"', 10) + 15
    endpos = data.find('yuedu_bottom', bgnpos)
    endpos = data.find('</div>', endpos - 50)
    scontent = data[bgnpos:endpos]
    scontent = scontent.replace('&nbsp;', ' ')  # drop HTML hard spaces
    scontent = scontent.replace('<br/>', '\n')  # keep paragraph breaks
    scontent = title + '\n' + scontent
    fo.write(scontent)


dealindex(SX)
fo.close()
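Note that urllib2 exists only on Python 2. On Python 3 the same fetch-and-transcode step goes through urllib.request; here is a minimal sketch of a gethtml equivalent, assuming the target pages are still served as GBK:

import urllib.error
import urllib.request

HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

def gethtml(url):
    # Fetch a page and decode it from GBK; return None on a network error.
    request = urllib.request.Request(url, headers=HEADERS)
    try:
        with urllib.request.urlopen(request) as response:
            return response.read().decode('gbk', errors='replace')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None

On Python 3 the output file would also be opened in text mode, e.g. open('note.txt', 'w', encoding='utf-8'), because the decoded chapters are str rather than bytes.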
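One fragile point in the script is the fixed 13-character slice achfx[newpos + 6:newpos + 19], which breaks as soon as a chapter URL has a different length. A regular expression that captures each href/title pair would be more tolerant; a sketch, assuming the chapter links in the extracted fragment look like <a href="...">title</a> (iterchapters is a hypothetical helper, not part of the original script):

import re

# Capture (relative URL, anchor text) for every link in the fragment.
LINK_RE = re.compile(r'<a\s+[^>]*href="([^"]+)"[^>]*>([^<]+)</a>')

def iterchapters(achfx):
    for match in LINK_RE.finditer(achfx):
        yield match.group(1), match.group(2)

dealindex() could then loop over the pairs, e.g. for indexurl, titlename in iterchapters(achfx): dealcontext(SX + indexurl, titlename), regardless of how long each URL is.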