python 爬小說

來源:互聯網
上載者:User

標籤:apt   decode   list   text   ==   files   get   otto   gen   

# coding=utf-8
"""Download every chapter of a novel and append the text to ``note.txt``.

The script fetches an index page, collects the chapter links found between
the ``ChapterList_HengFu_1`` and ``ChapterList_HengFu_2`` markers, downloads
each chapter page and writes ``title + "  " + text`` to the output file.

Pages are decoded as GBK and handled internally as UTF-8 encoded bytes.
The code runs unchanged on Python 2.7 and Python 3.
"""
import contextlib
import datetime
import os
import re
import sys
import time
import urllib

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: Request/urlopen/URLError live in urllib.request
    import urllib.request as urllib2

# Base URL of the novel site. The value is the original placeholder
# ("novel site URL"); replace it with a real URL before running.
sx = '小說站網址'
# Renamed from ``type`` so the builtin is no longer shadowed (value is unused).
fs_encoding = sys.getfilesystemencoding()
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
# Default binary output stream used by dealContext(); opened by main().
fo = None

# Markers that bracket the chapter list in the index page.
_LIST_BEGIN = b'ChapterList_HengFu_1'
_LIST_END = b'ChapterList_HengFu_2'
# One chapter anchor: quoted href value, then the link text up to </a>.
_LINK_RE = re.compile(br'href=["\']([^"\']*)["\'][^>]*>(.*?)</a>', re.S)


def _to_native(value):
    """Return *value* as the interpreter's native ``str`` (for URLs/printing)."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):  # Python 3 bytes
        return value.decode('utf-8', 'replace')
    return value.encode('utf-8')  # Python 2 unicode


def _to_bytes(value):
    """Return *value* as UTF-8 encoded bytes."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def _extract_content(data):
    """Return the chapter text found in page bytes *data*, or None.

    The text starts 15 bytes after ``name="content"`` (the 14-byte marker plus
    one more byte, as in the original; presumably the tag's closing ``>``) and
    ends at the first ``</div>`` found from 50 bytes before ``yuedu_bottom``.
    """
    start = data.find(b'name="content"', 10)
    if start == -1:
        return None
    start += 15
    marker = data.find(b'yuedu_bottom', start)
    if marker == -1:
        return None
    # Never search before the content start: a negative offset would be
    # interpreted by find() as "from the end of the page".
    end = data.find(b'</div>', max(start, marker - 50))
    if end == -1:
        end = marker
    text = data[start:end]
    text = text.replace(b'&nbsp;', b' ')
    text = text.replace(b'<br />', b' ')
    return text


def getHtml(url):
    """Fetch *url* and return the page re-encoded from GBK to UTF-8 bytes.

    Returns None when the request fails with ``URLError`` (the HTTP code
    and/or reason are printed). Bytes that are not valid GBK are replaced
    instead of aborting the whole crawl.
    """
    request = urllib2.Request(url, headers=headers)
    try:
        # closing() guarantees the connection is released on every path.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            raw = response.read()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None
    data = raw.decode('gbk', 'replace').encode('utf-8')
    print(len(data))
    return data


def dealIndex(url, base=None, out=None):
    """Download every chapter linked from the index page at *url*.

    url:  address of the index page.
    base: prefix prepended to each chapter href; defaults to the module-level
          ``sx``.
    out:  binary stream passed on to dealContext(); defaults to ``fo``.

    Raises ValueError when the chapter-list markers are not in the page.
    """
    data = getHtml(url)
    if data is None:
        print("index page could not be fetched: %s" % _to_native(url))
        return
    begin = data.find(_LIST_BEGIN)
    end = -1 if begin == -1 else data.find(_LIST_END, begin + len(_LIST_BEGIN))
    if end == -1:
        raise ValueError("chapter list markers not found in %s" % _to_native(url))
    section = data[begin + len(_LIST_BEGIN):end]
    if base is None:
        base = sx
    # findall() scans the section itself from its first byte, so no chapter
    # is skipped and a malformed anchor cannot stall the loop.
    for href, title in _LINK_RE.findall(section):
        dealContext(base + _to_native(href), title, out)


def dealContext(url, title, out=None):
    """Fetch one chapter page and write ``title + "  " + text`` to *out*.

    url:   address of the chapter page.
    title: chapter title (bytes or text).
    out:   binary stream to write to; defaults to the module-level ``fo``.

    A chapter that cannot be fetched or parsed is reported and skipped.
    Raises RuntimeError when no output stream is available.
    """
    print(_to_native(url))
    print(_to_native(title))
    sink = fo if out is None else out
    if sink is None:
        raise RuntimeError("no output stream: pass out= or run main()")
    data = getHtml(url)
    if data is None:
        return
    text = _extract_content(data)
    if text is None:
        print("chapter content not found: %s" % _to_native(url))
        return
    sink.write(_to_bytes(title) + b"  " + text)


def main():
    """Crawl the site at ``sx`` into ``note.txt``."""
    global fo
    # The with-block closes the file even if the crawl raises.
    with open("note.txt", "wb") as handle:
        fo = handle
        dealIndex(sx)


if __name__ == "__main__":
    main()

 

python 爬小說

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.