Today I integrated a BFS crawler with the HTML body-text extraction. The functionality is still fairly limited. For the text-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http URLs, and I have only tested it on the intranet, because the connection to the outside network is not very fast.
- There is one global url queue and one global url set. The queue makes the BFS easy to implement, and the set keeps the crawler from fetching the same page twice. The flow, like the underlying idea, is quite simple (a minimal sketch of this loop appears right after this list).
- It is single-threaded, so it is bound to be slow; later I will look at a multi-threaded version in which fetching pages, extracting URLs, and extracting the body text can run concurrently.
- The crawling approach comes from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ . Besides extracting the URLs from each page, I also extract its body text, so that Chinese word segmentation will be easier when I build an index later.
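Here is the minimal sketch of that queue-plus-set loop mentioned above. It is not the full crawler; fetch_links() is a hypothetical stand-in for "download the page and return the URLs it contains".

import Queue

def bfs_crawl(seed_url, max_pages=100):
    url_queue = Queue.Queue()        # frontier: URLs waiting to be crawled, in BFS order
    seen = set([seed_url])           # every URL ever enqueued, so nothing is crawled twice
    url_queue.put(seed_url)
    crawled = 0
    while not url_queue.empty() and crawled < max_pages:
        url = url_queue.get()
        for link in fetch_links(url):        # hypothetical: fetch the page, return its URLs
            if link not in seen:
                seen.add(link)
                url_queue.put(link)
        crawled += 1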
Pasting the code inline here causes problems, probably because it contains HTML tags; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl html pages breadth-first and extract the body
# text afterwards; a single thread is admittedly a bit slow.
# Feel free to reuse this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup  # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time

socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()                  # BFS frontier
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)                          # URLs already seen
max_deep = 1


# Takes a soup object and extracts the URLs it contains.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'
    res = html.findAll('a')                  # find all <a> tags
    for x in res:
        t = unicode(x)                       # x is a soup tag object
        m = re.findall(re_html, t)
        if m is None:
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


#######################################################
# Accepts either a URL or a local file name and extracts the body text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while g_url_queue.empty() is not True:
        l_url = g_url_queue.get()
        print l_url
        # catch timeouts -- some pages cannot be reached
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        fwrite = open(str(count + 1), 'w')   # save the raw page under a numeric file name
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)               # extract the body text from the saved file
        count += 1
        if count >= 100:
            return


# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
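One possible refinement, sketched here rather than taken from the code above: instead of regex-matching the rendered <a> tag, read the href attribute directly and resolve relative links against the page URL, which also catches links that the http:// regex misses. This assumes BeautifulSoup 3 and Python 2, as in the script above.

from urlparse import urljoin

def extract_links(soup, base_url):
    links = []
    for a in soup.findAll('a', href=True):        # only anchors that actually carry an href
        absolute = urljoin(base_url, a['href'])   # turn relative links into absolute ones
        if absolute.startswith('http://'):        # keep the http-only restriction for now
            links.append(absolute)
    return links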
Next I wanted a multi-threaded version, so that downloading pages and analysing the html (extracting the body text and the URLs) can run concurrently. After some simple changes to the code above it just about runs. The main changes are adding threading and guarding access to the global queues with locks. Since I had never written multi-threaded code before, I would welcome suggestions from anyone passing by.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to reuse this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup  # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading

queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()                  # URLs waiting to be fetched
g_url_queue.put('http://www.bupt.edu.cn/')
g_file_queue = Queue.Queue()                 # saved pages waiting to be parsed
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)                          # URLs already seen
max_deep = 1


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


# Downloader thread: takes URLs from g_url_queue, saves the raw pages to disk,
# and pushes the file names onto g_file_queue.
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue
        count = 0
        while g_url_queue.empty() is not True:
            print self.t_name
            # guard access to the shared queue with a lock
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # catch timeouts -- some pages cannot be reached
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()
            fwrite = open(str(count + 1), 'w')
            fwrite.write(html)
            fwrite.close()
            file_lock.acquire()
            g_file_queue.put(count + 1)
            file_lock.release()
            count += 1
            if count >= 100:
                return


# Parser thread: takes saved pages from g_file_queue, extracts the body text,
# and feeds newly found URLs back into g_url_queue.
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock
        while g_file_queue.empty() is not True:
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()
            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)
            get_context(soup, filename)
            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')              # find all <a> tags
            for x in res:
                t = unicode(x)                   # x is a soup tag object
                m = re.findall(re_html, t)
                if m is None:
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)


# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')
    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
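A note for a later revision (a sketch, not part of the code above): Queue.Queue already does its own internal locking, so the explicit RLocks around put/get are not strictly needed, and checking empty() before get() becomes racy once several threads share the queue. A worker loop can instead block on get() with a timeout; handle_url() below is a hypothetical stand-in for "fetch the page and hand it to the parser thread".

import Queue

def page_worker(url_queue, handle_url):
    # Rely on Queue.Queue's built-in locking: get(timeout=...) blocks until an
    # item arrives or the timeout expires, which also avoids the
    # empty()-then-get() race between threads.
    while True:
        try:
            url = url_queue.get(timeout=10)   # wait up to 10 seconds for work
        except Queue.Empty:
            break                             # nothing arrived, assume the crawl is done
        handle_url(url)                       # hypothetical: fetch the page, pass it on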