Writing a web crawler in Python

    Now that I'm working, I need to crawl some specific resources for analysis. There is no need for high-speed, massive-volume data, so my plan is to write a simple crawler directly in Python and then write templates for targeted parsing afterwards; that solves the problem, since our requirements on time and scale are not very high.
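    To make the "template" part concrete, here is a minimal sketch of what I have in mind (my own illustration, not code from any of the referenced material): a per-site template can be as simple as one regular expression per field, applied to the pages saved by the crawler.

# A sketch only (not from the article): one regular expression per field of
# interest; the <title> pattern below is just an illustration.
import re

TITLE_PATTERN = re.compile(r'<title>(.*?)</title>', re.S | re.I)

def parse_page(html):
    # Return the page title, or None if this template does not match.
    match = TITLE_PATTERN.search(html)
    if match:
        return match.group(1).strip()
    return None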

    I searched the web for some documents on writing crawlers in Python and downloaded them to try out and see how well they work. Although this article is marked as original, part of it also draws on other people's material; please bear with me. My intention is to take the best parts of others' work and recombine them into the system I need.

    First, I downloaded a crawler written by someone else to try it out. The source is: http://blog.csdn.net/cashey1991/article/details/6262704

Test.py

# -*- coding: utf-8 -*-
import WebCrawler

url = raw_input('Entry url (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Number of threads: '))    # missing int() conversion caused a bug earlier
Maxdepth = int(raw_input('Maximum search depth: '))

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)
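    As a side note, if the crawler has to run unattended, the prompts above can be replaced with command-line arguments. The following is only a sketch with example values, assuming the WebCrawler module below is importable:

# A sketch only (not from the article): drive the WebCrawler class below
# without interactive prompts; thread count and depth are example values.
import sys
import WebCrawler

url = sys.argv[1] if len(sys.argv) > 1 else 'http://www.baidu.com'
wc = WebCrawler.WebCrawler(threadNumber=4, Maxdepth=2)
wc.Craw(url)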

WebCrawler.py

# -*- coding: utf-8 -*-
import threading
import GetUrl
import urllib

g_mutex = threading.Lock()
g_pages = []      # threads append downloaded page content to this list
g_dledUrl = []    # all urls that have been downloaded
g_toDlUrl = []    # urls to download in the current round
g_failedUrl = []  # urls that failed to download
g_totalcount = 0  # number of pages downloaded so far

class WebCrawler:
    def __init__(self, threadNumber, Maxdepth):
        self.threadNumber = threadNumber
        self.threadPool = []
        self.Maxdepth = Maxdepth
        self.logfile = file('#log.txt', 'w')                                ##

    def download(self, url, fileName):
        Cth = CrawlerThread(url, fileName)
        self.threadPool.append(Cth)
        Cth.start()

    def downloadAll(self):
        global g_toDlUrl
        global g_totalcount
        i = 0
        while i < len(g_toDlUrl):
            j = 0
            while j < self.threadNumber and i + j < len(g_toDlUrl):
                g_totalcount += 1    # each loop iteration counts one more downloaded page
                self.download(g_toDlUrl[i + j], str(g_totalcount) + '.htm')
                print 'Thread started:', i + j, '--File number = ', g_totalcount
                j += 1
            i += j
            for th in self.threadPool:
                th.join(30)          # wait for the threads to finish, 30 s timeout
            self.threadPool = []     # empty the thread pool
        g_toDlUrl = []               # empty the list

    def updateToDl(self):
        global g_toDlUrl
        global g_dledUrl
        newUrlList = []
        for s in g_pages:
            newUrlList += GetUrl.GetUrl(s)    # GetUrl needs a concrete implementation
        g_toDlUrl = list(set(newUrlList) - set(g_dledUrl))    # this line once reported "unhashable"

    def Craw(self, entryUrl):    # depth-limited search; ends when g_toDlUrl is empty
        g_toDlUrl.append(entryUrl)
        self.logfile.write('>>>Entry:\n')                                   ##
        self.logfile.write(entryUrl)                                        ##
        depth = 0
        while len(g_toDlUrl) != 0 and depth <= self.Maxdepth:
            depth += 1
            print 'Searching depth ', depth, '...\n\n'
            self.downloadAll()
            self.updateToDl()
            content = '\n>>>Depth ' + str(depth) + ':\n'                    ##
            self.logfile.write(content)                                     ##
            i = 0                                                           ##
            while i < len(g_toDlUrl):                                       ##
                content = str(g_totalcount + i + 1) + '->' + g_toDlUrl[i] + '\n'    ##
                self.logfile.write(content)                                 ##
                i += 1                                                      ##

class CrawlerThread(threading.Thread):
    def __init__(self, url, fileName):
        threading.Thread.__init__(self)
        self.url = url              # url downloaded by this thread
        self.fileName = fileName

    def run(self):    # thread body --> download one html page
        global g_mutex
        global g_failedUrl
        global g_dledUrl
        try:
            f = urllib.urlopen(self.url)
            s = f.read()
            fout = file(self.fileName, 'w')
            fout.write(s)
            fout.close()
        except:
            g_mutex.acquire()    # acquire the lock
            g_dledUrl.append(self.url)
            g_failedUrl.append(self.url)
            g_mutex.release()    # release the lock
            print 'Failed downloading and saving', self.url
            return None          # remember to return!
        g_mutex.acquire()        # acquire the lock
        g_pages.append(s)
        g_dledUrl.append(self.url)
        g_mutex.release()        # release the lock
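    The class above shares its state through global lists guarded by a single lock. For comparison, the same producer/worker hand-off can be written with Python 2's Queue module, which handles the locking internally; this is only a sketch, not code from the article:

# A sketch only (not from the article): the same download hand-off expressed
# with Queue, which does its own locking.
import Queue
import threading
import urllib

task_queue = Queue.Queue()   # urls waiting to be downloaded
page_queue = Queue.Queue()   # downloaded page contents

def worker():
    while True:
        try:
            url = task_queue.get_nowait()
        except Queue.Empty:
            return                        # no work left, thread exits
        try:
            page_queue.put(urllib.urlopen(url).read())
        except IOError:
            print 'Failed downloading', url
        task_queue.task_done()

def download_all(urls, thread_number=4):
    for url in urls:
        task_queue.put(url)
    threads = [threading.Thread(target=worker) for _ in range(thread_number)]
    for th in threads:
        th.start()
    for th in threads:
        th.join()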

GetUrl.py

# -*- coding: cp936 -*-
urlSep = ['<', '>', '//', '(', ')', r'"', r"'", ' ', '\t', '\n']
urlTag = ['http://']

def is_sep(ch):
    for c in urlSep:
        if c == ch:
            return True
    return False

def find_first_sep(i, s):
    while i < len(s):
        if is_sep(s[i]):
            return i
        i += 1
    return len(s)

def GetUrl(strPage):
    rtList = []
    for tag in urlTag:
        i = 0
        i = strPage.find(tag, i, len(strPage))
        while i != -1:
            begin = i
            end = find_first_sep(begin + len(tag), strPage)
            rtList.append(strPage[begin:end])
            i = strPage.find(tag, end, len(strPage))
    return rtList
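    Note that GetUrl.py only scans for the literal text http:// and cuts at the next separator character, so it misses relative links and https links. As an alternative (my own sketch, not from the article), the href attributes could be pulled out with Python 2's HTMLParser:

# A sketch only (not from the article): extract href attributes with
# HTMLParser, which also picks up relative and https links that the
# string scan above misses.
from HTMLParser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Collect the href attribute of every <a> tag.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_links(page):
    parser = LinkParser()
    parser.feed(page)
    return parser.links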

 
