Python - Scraping a Douban Doulist

Export a Douban doulist (豆列, a user-curated list) to a Markdown file.
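The script below targets Python 2.7 and relies on BeautifulSoup 4 (imported as `bs4`) for HTML parsing; everything else is in the standard library. Assuming pip is available, the one external dependency installs with:

```
pip install beautifulsoup4
```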

```python
#!/usr/bin/env python
# encoding=utf-8
#
# Description   : Export a Douban doulist to a Markdown file.
# Version       : 1.0.0.0
# Python Version: Python 2.7.3
#
import os
import datetime
import re
import urllib2
import timeit

from bs4 import BeautifulSoup

gHeader = {"User-Agent": "Mozilla-Firefox5.0"}


# A record for one book in the list.
class BookInfo:
    def __init__(self, name, url, icon, nums, people, comment):
        self.name = name            # book title
        self.url = url              # Douban page of the book
        self.icon = icon            # cover image url
        self.ratingNum = nums       # average rating
        self.ratingPeople = people  # number of raters
        self.comment = comment      # my comment on the book


# Fetch the content of a url; returns None on failure.
def getHtml(url):
    data = None
    try:
        request = urllib2.Request(url, None, gHeader)
        response = urllib2.urlopen(request)
        data = response.read().decode('utf-8')
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request: " + url
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach a server. Please check your url: " + url + ", and read the Reason."
            print "Reason: %s" % e.reason
    return data


# Export the book list to a Markdown file named after the doulist title.
def exportToMarkdown(doulistTitle, doulistAbout, bookInfos):
    path = "{0}.md".format(doulistTitle)
    if os.path.isfile(path):
        os.remove(path)
    today = datetime.datetime.now()
    todayStr = today.strftime('%Y-%m-%d %H:%M:%S %z')
    file = open(path, 'a')
    file.write('## {0}\n'.format(doulistTitle))
    file.write('{0}\n'.format(doulistAbout))
    file.write('## 圖書列表\n')
    file.write('### 收藏添加順序,非排名,總計 {0} 本,更新時間:{1}\n'.format(len(bookInfos), todayStr))
    i = 0
    for book in bookInfos:
        file.write('\n### No.{0:d} {1}\n'.format(i + 1, book.name))
        file.write(' > **圖書名稱**: [{0}]({1})  \n'.format(book.name, book.icon))
        file.write(' > **豆瓣連結**: [{0}]({1})  \n'.format(book.url, book.url))
        file.write(' > **豆瓣評分**: {0}  \n'.format(book.ratingNum))
        file.write(' > **評分人數**: {0} 人 \n'.format(book.ratingPeople))
        file.write(' > **我的評論**: {0}  \n'.format(book.comment))
        i = i + 1
    # Disabled block: would render the covers as a Hexo image stream.
    '''
    file.write('<style>a img {border: none;width: 127px;height:76px;overflow:hidden;}.article-entry img, .article-entry video {display: block;height: 110px;margin: auto;max-width: 100%;}</style>\n')
    file.write('{% stream %}\n')
    for book in bookInfos:
        file.write('{% figure ' + book.icon + ' [' + book.name + '](' + book.icon + ') %}\n')
        #file.write('{% figure ' + book.icon + ' [ No.' + str((i+1)) + '](' + book.icon + ') %}\n')
        i = i + 1
    file.write('{% endstream %}\n')
    file.write('<style>div.hexo-img-stream figure figcaption {font-size: .9rem;color: #444;line-height: 1.5;overflow: hidden;text-overflow: ellipsis;white-space: nowrap;max-width: 127px;}</style>\n')
    '''
    file.close()


# Parse the book entries on one page and append them to bookInfos.
def parseItemInfo(page, bookInfos):
    soup = BeautifulSoup(page, 'html.parser')
    items = soup.find_all("div", "doulist-item")
    for item in items:
        # get book name
        bookName = ''
        content = item.find("div", "title")
        if content != None:
            href = content.find("a")
            if href != None and href.string != None:
                bookName = href.string.strip().encode('utf-8')
        # get book url and cover image
        bookUrl = ''
        bookImage = ''
        content = item.find("div", "post")
        if content != None:
            tag = content.find('a')
            if tag != None:
                bookUrl = tag['href'].encode('utf-8')
            tag = content.find('img')
            if tag != None:
                bookImage = tag['src'].encode('utf-8')
        # get rating: the <span> carrying a class holds the average rating,
        # the plain "(N人评价)" text holds the number of raters
        ratingNum = 0.0
        ratingPeople = 0
        contents = item.find("div", "rating")
        if contents is None:
            continue
        for content in contents:
            if content.name != None and content.string != None:
                if content.get("class") != None:
                    ratingStr = content.string.strip().encode('utf-8')
                    if len(ratingStr) > 0:
                        ratingNum = float(ratingStr)
                else:
                    ratingStr = content.string.strip().encode('utf-8')
                    pattern = re.compile(r'(\()([0-9]*)(.*)(\))')
                    match = pattern.search(ratingStr)
                    if match:
                        ratingStr = match.group(2).strip()
                        if len(ratingStr) > 0:
                            ratingPeople = int(ratingStr)
        # get my comment
        comment = ''
        content = item.find("blockquote", "comment")
        if content != None:
            for child in content.contents:
                if child.name == None and child.string != None:
                    comment = child.string.strip().encode('utf-8')
        # add the book to the list
        bookInfo = BookInfo(bookName, bookUrl, bookImage, ratingNum, ratingPeople, comment)
        bookInfos.append(bookInfo)


# Parse a doulist url: collect every page, then export to Markdown.
def parse(url):
    start = timeit.default_timer()
    page = getHtml(url)
    if page is None:
        return
    soup = BeautifulSoup(page, 'html.parser')
    # get doulist title
    doulistTitle = soup.html.head.title.string.encode('utf-8')
    print " > 擷取豆列:" + doulistTitle
    # get doulist description
    doulistAbout = ''
    content = soup.find("div", "doulist-about")
    for child in content.children:
        if child.string != None:
            htmlContent = child.string.strip().encode('utf-8')
            doulistAbout = "{0}\n{1}".format(doulistAbout, htmlContent)
    # work out pagination from the paginator links: the smallest "start"
    # offset is the page size, the largest is the last page's offset
    nextPageStart = 100000
    lastPageStart = 0
    content = soup.find("div", "paginator")
    if content != None:
        for child in content.children:
            if child.name == 'a':
                pattern = re.compile(r'(start=)([0-9]*)(.*)(&sort=)')
                match = pattern.search(child['href'].encode('utf-8'))
                if match:
                    index = int(match.group(2))
                    if nextPageStart > index:
                        nextPageStart = index
                    if lastPageStart < index:
                        lastPageStart = index
    books = []
    # get books from the current page
    print ' > process page :  {0}'.format(url)
    parseItemInfo(page, books)
    # get books from the following pages
    for pageStart in range(nextPageStart, lastPageStart + nextPageStart, nextPageStart):
        pageUrl = "{0}?start={1:d}&sort=seq&sub_type=".format(url, pageStart)
        print ' > process page :  {0}'.format(pageUrl)
        page = getHtml(pageUrl)
        if page is not None:
            parseItemInfo(page, books)
    # export to a Markdown file
    exportToMarkdown(doulistTitle, doulistAbout, books)
    # summarize
    total = len(books)
    elapsed = timeit.default_timer() - start
    print " > 共擷取 {0} 本圖書資訊,耗時 {1} 秒".format(total, elapsed)


#=============================================================================
# Entry point: scrape the books of the specified doulist.
#=============================================================================
gDoulistUrl = "https://www.douban.com/doulist/1264675/"
if __name__ == '__main__':
    parse(gDoulistUrl)
```
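To try it, save the script as, say, `doulist.py` (the filename is arbitrary) and run `python doulist.py` under Python 2; it writes `<doulist title>.md` into the current directory. With placeholder data (the title, book name, URLs, and numbers below are invented for illustration), the generated Markdown looks roughly like this:

```markdown
## 書單標題
書單簡介

## 圖書列表
### 收藏添加順序,非排名,總計 1 本,更新時間:2015-06-01 10:00:00 

### No.1 某書名
 > **圖書名稱**: [某書名](https://img.doubanio.com/.../cover.jpg)  
 > **豆瓣連結**: [https://book.douban.com/subject/1234567/](https://book.douban.com/subject/1234567/)  
 > **豆瓣評分**: 8.8  
 > **評分人數**: 1234 人 
 > **我的評論**: 值得一讀  
```

One quirk carried over from the original code: the 圖書名稱 line links to the cover image (`book.icon`) rather than to the book's Douban page (`book.url`).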
