Recently I have been planning to write analyses of the architecture of open-source libraries written in Go. My first use of Go was for web crawling, with gocrawl, an open-source crawler library written in Go, so today let's look at how gocrawl is implemented.

## Example code

```
package gocrawl

import (
	"net/http"
	"regexp"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// Only enqueue the root and paths beginning with an "a"
var rxOk = regexp.MustCompile(`http://duckduckgo\.com(/a.*)?$`)

// DefaultExtender provides default implementations of all the callbacks.
// By embedding it and overriding selected methods we define our own Extender
// and customize the crawl.
type ExampleExtender struct {
	DefaultExtender // provides the default implementations
}

// Visit is called by gocrawl after it has fetched a page.
func (this *ExampleExtender) Visit(ctx *URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	// Use doc (the page parsed into a DOM by goquery) or read res.Body directly
	// to extract the data we are interested in and process it.
	return nil, true
}

// Filter is called before a page is fetched.
func (this *ExampleExtender) Filter(ctx *URLContext, isVisited bool) bool {
	// Filter the URLs to crawl according to our own rules, e.g. visit a page at
	// most N times, or only crawl pages matching a specific regexp, and so on.
	return !isVisited && rxOk.MatchString(ctx.NormalizedURL().String())
}

func ExampleCrawl() {
	// Options holds the crawl settings, e.g. the crawl delay, whether to stay on
	// the same host, which log messages to output, and so on.
	opts := NewOptions(new(ExampleExtender))
	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = LogAll
	opts.MaxVisits = 2

	// Create a Crawler, pass in the seed URL and run it.
	c := NewCrawlerWithOptions(opts)
	c.Run("https://duckduckgo.com/")
}
```

## Overall design

Let's first get a feel for gocrawl's overall architecture:

1. The crawler keeps track of all the hosts to be crawled and creates one worker (an independent goroutine) per host; that worker is responsible for crawling every page under its host.
2. The crawler inspects every URL to be crawled and, based on the URL's host, dispatches it to the matching worker through a channel (sketched below).
3. Each worker crawls its pages and reports the results back to the crawler through a channel.
4. At each stage of the crawl, the crawler and the workers call the Extender's callbacks, which is how the user hooks in custom behaviour.
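To make the first three points concrete, here is a minimal, self-contained sketch of that dispatch pattern. It is not gocrawl's actual code; the names (crawlResult, results, workers) are illustrative, and a real worker would of course fetch pages instead of echoing URLs.

```
package main

import (
	"fmt"
	"net/url"
	"sync"
)

// crawlResult is a made-up stand-in for gocrawl's workerResponse: whatever a
// worker reports back to the dispatcher after visiting a URL.
type crawlResult struct {
	host, url string
}

func main() {
	seeds := []string{
		"https://example.com/a",
		"https://example.com/b",
		"https://example.org/x",
	}

	results := make(chan crawlResult)   // workers -> dispatcher
	workers := map[string]chan string{} // one input channel per host
	var wg sync.WaitGroup

	go func() {
		// Dispatch each URL to the worker that owns its host, starting workers
		// lazily, the way gocrawl launches one worker goroutine per host.
		for _, raw := range seeds {
			u, err := url.Parse(raw)
			if err != nil {
				continue
			}
			in, ok := workers[u.Host]
			if !ok {
				in = make(chan string)
				workers[u.Host] = in
				wg.Add(1)
				go func(host string, in <-chan string) {
					defer wg.Done()
					for target := range in {
						// A real worker would fetch and parse the page here.
						results <- crawlResult{host: host, url: target}
					}
				}(u.Host, in)
			}
			in <- raw
		}
		for _, in := range workers {
			close(in)
		}
		wg.Wait()
		close(results)
	}()

	// The dispatcher drains the results; gocrawl's crawler would enqueue the
	// links harvested from each visited page at this point.
	for res := range results {
		fmt.Printf("worker for %s visited %s\n", res.host, res.url)
	}
}
```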
## crawler source implementation

> The code below does not match the actual source exactly; it has been trimmed to show the core of gocrawl.

```
// Let's first look at the definition of Crawler, with some unimportant fields removed.
type Crawler struct {
	Options *Options // configuration options

	push    chan *workerResponse // used by workers to return crawl results to the crawler
	enqueue chan interface{}     // channel holding the URLs waiting to be crawled

	// Mapping from host to its worker.
	workers map[string]*worker

	// visited holds the set of URLs already visited, hosts holds the set of all
	// hosts. Both are really sets, but Go has no built-in set type, so the map
	// keys act as the set; the value is an empty struct{}, which carries no data
	// and takes up no space.
	visited map[string]struct{}
	hosts   map[string]struct{}
}

type Options struct {
	MaxVisits         int           // maximum total number of pages to visit
	CrawlDelay        time.Duration // delay between two requests to the same host
	WorkerIdleTTL     time.Duration // how long a worker may stay idle before it is shut down
	SameHostOnly      bool          // only crawl pages on the same host
	Extender          Extender
	EnqueueChanBuffer int  // buffer size of the enqueue channel
	IsRedirect        bool // if a URL returns a 302, whether to crawl the redirect target
}

func (this *Crawler) Run(seeds interface{}) error {
	seeds = this.Options.Extender.Start(seeds)

	// Convert the seeds into URLContext values; a URLContext holds the URL to
	// crawl plus its context information.
	ctxs := this.toURLContexts(seeds, nil)
	this.init(ctxs)

	// enqueueUrls dispatches the URLs to be crawled to the workers.
	this.enqueueUrls(ctxs)

	// The crawler collects the results produced by the workers.
	err := this.collectUrls()

	this.Options.Extender.End(err)
	return err
}

// init performs some initialization.
func (this *Crawler) init(ctxs []*URLContext) {
	// hosts holds the set of all hosts.
	this.hosts = make(map[string]struct{}, len(ctxs))
	for _, ctx := range ctxs {
		if _, ok := this.hosts[ctx.normalizedURL.Host]; !ok {
			this.hosts[ctx.normalizedURL.Host] = struct{}{}
		}
	}
	hostCount := len(this.hosts)
	this.workers, this.push = make(map[string]*worker, hostCount), make(chan *workerResponse, hostCount)
	this.enqueue = make(chan interface{}, this.Options.EnqueueChanBuffer)
}

type URLContext struct {
	HeadBeforeGet bool

	// Context information: we can store our own data structure here in the
	// Extender's Filter and read it back in the other callbacks.
	State interface{}

	// The URL to be crawled.
	url           *url.URL
	normalizedURL *url.URL

	// The source URL, i.e. the page on which this URL was discovered.
	sourceURL           *url.URL
	normalizedSourceURL *url.URL
}

func (this *Crawler) enqueueUrls(ctxs []*URLContext) (cnt int) {
	for _, ctx := range ctxs {
		var isVisited, enqueue bool

		// Check whether this URL has already been crawled; every crawled URL is
		// recorded in the visited map.
		_, isVisited = this.visited[ctx.normalizedURL.String()]

		// Call the Extender's Filter; if Filter returns false the URL is not crawled.
		if enqueue = this.Options.Extender.Filter(ctx, isVisited); !enqueue {
			continue
		}

		// Check whether SameHostOnly is set (only crawl content on the seed hosts).
		if this.Options.SameHostOnly && !this.isSameHost(ctx) {
			// Only allow URLs coming from the same host.
			this.logFunc(LogIgnored, "ignore on same host policy: %s", ctx.normalizedURL)
		} else {
			// All is good, visit this URL (robots.txt verification is done by the worker).
			// Possible caveat: if normalization changes the host, the robots.txt fetched
			// for this host may differ from the one for the unnormalized host, but this
			// is rare and considered acceptable behaviour for gocrawl.

			// Launch a worker if required, based on the host of the normalized URL.
			w, ok := this.workers[ctx.normalizedURL.Host]
			if !ok {
				// No worker exists for this host, launch a new one.
				w = this.launchWorker(ctx)
				// Automatically enqueue the robots.txt URL as first in line.
				if robCtx, e := ctx.getRobotsURLCtx(); e != nil {
					this.Options.Extender.Error(newCrawlError(ctx, e, CekParseRobots))
					this.logFunc(LogError, "ERROR parsing robots.txt from %s: %s", ctx.normalizedURL, e)
				} else {
					this.logFunc(LogEnqueued, "enqueue: %s", robCtx.url)
					this.Options.Extender.Enqueued(robCtx)
					w.pop.stack(robCtx)
				}
			}

			cnt++
			this.logFunc(LogEnqueued, "enqueue: %s", ctx.url)
			this.Options.Extender.Enqueued(ctx)
			w.pop.stack(ctx)
			this.pushPopRefCount++

			// Once it is stacked, it WILL be visited eventually, so add it to the
			// visited set (unless denied by robots.txt, but that is out of our hands;
			// for all we care, it is visited).
			if !isVisited {
				// The visited map works with the normalized URL.
				this.visited[ctx.normalizedURL.String()] = struct{}{}
			}
		}
	}
	return
}

func (this *Crawler) collectUrls() error {
	defer func() {
		this.wg.Wait()
		this.logFunc(LogInfo, "crawler done.")
	}()

	for {
		// If there are no URLs left to crawl, the crawl is finished.
		if this.pushPopRefCount == 0 && len(this.enqueue) == 0 {
			close(this.stop)
			return nil
		}

		select {
		// A worker returns a crawl result to the crawler through the push channel.
		case res := <-this.push:
			// Received a response, check if it contains URLs to enqueue.
			if res.visited {
				this.visits++
				// Check whether the maximum total number of visits has been reached.
				if this.Options.MaxVisits > 0 && this.visits >= this.Options.MaxVisits {
					close(this.stop)
					return ErrMaxVisits
				}
			}
			if res.idleDeath {
				delete(this.workers, res.host)
				this.logFunc(LogInfo, "worker for host %s cleared on idle policy", res.host)
			} else {
				// Dispatch the links harvested from this page to the appropriate workers.
				this.enqueueUrls(this.toURLContexts(res.harvestedURLs, res.ctx.url))
				this.pushPopRefCount--
			}

		// A request to crawl a page was received. The EnqueueChan exposed to the
		// user-defined Extender and this.enqueue are actually the same channel.
		case enq := <-this.enqueue:
			ctxs := this.toURLContexts(enq, nil)
			this.enqueueUrls(ctxs)

		case <-this.stop:
			return ErrInterrupted
		}
	}
}
```
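As a quick aside, the visited and hosts fields above use the usual Go idiom for a set: a map whose values are the zero-size empty struct, so only the keys carry information. Here is a standalone sketch of the idiom (the names are illustrative, not gocrawl's):

```
package main

import "fmt"

func main() {
	// A set of visited URLs: membership lives in the keys,
	// struct{}{} is a zero-byte placeholder value.
	visited := make(map[string]struct{})

	// Add an element to the set.
	visited["https://example.com/"] = struct{}{}

	// Test membership with the comma-ok form.
	_, ok := visited["https://example.com/"]
	fmt.Println(ok) // true

	_, ok = visited["https://example.org/"]
	fmt.Println(ok) // false
}
```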
## worker source implementation

```
type worker struct {
	host string
	push chan<- *workerResponse
	pop  popChannel
}

func (this *worker) run() {
	defer func() {
		this.wg.Done()
	}()

	// Enter loop to process URLs until stop signal is received.
	for {
		// Set the maximum idle time; if the worker has had no work for too long, it exits.
		var idleChan <-chan time.Time
		if this.opts.WorkerIdleTTL > 0 {
			idleChan = time.After(this.opts.WorkerIdleTTL)
		}

		select {
		case <-this.stop:
			return

		// Tell the crawler that this worker is exiting because it has been idle.
		case <-idleChan:
			this.sendResponse(nil, false, nil, true)
			return

		// URLs pushed by the crawler for this worker to crawl.
		case batch := <-this.pop:
			for _, ctx := range batch {
				// Crawl the page.
				this.requestUrl(ctx, ctx.HeadBeforeGet)
			}
		}
	}
}

// Process the specified URL.
func (this *worker) requestUrl(ctx *URLContext, headRequest bool) {
	if res, ok := this.fetchUrl(ctx, this.opts.UserAgent, headRequest); ok {
		var harvested interface{}
		var visited bool

		// Close the body on function end.
		defer res.Body.Close()

		// Any 2xx status code is good to go.
		if res.StatusCode >= 200 && res.StatusCode < 300 {
			// visitUrl() automatically finds all the <a> tags on the page and
			// extracts the URLs they point to.
			harvested = this.visitUrl(ctx, res)
			visited = true
		} else {
			// On error, call the Extender's Error callback.
			this.opts.Extender.Error(newCrawlErrorMessage(ctx, res.Status, CekHttpStatusCode))
		}
		this.sendResponse(ctx, visited, harvested, false)
	}
}

// Request the specified URL and return the response.
func (this *worker) fetchUrl(ctx *URLContext, agent string, headRequest bool) (res *http.Response, ok bool) {
	var e error

	fetchStart := time.Now()

	// Request the URL.
	if res, e = this.opts.Extender.Fetch(ctx, agent, headRequest); e != nil {
		this.sendResponse(ctx, false, nil, false)
		return nil, false
	}

	// Get the fetch duration.
	fetchDuration := time.Since(fetchStart)

	// The crawl delay starts now; the worker waits on this channel before its
	// next fetch to the same host.
	this.wait = time.After(this.lastCrawlDelay)

	// Keep track of the last fetch info.
	this.lastFetch = &FetchInfo{
		ctx,
		fetchDuration,
		res.StatusCode,
		headRequest,
	}
	return res, true
}
```
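The wait and lastCrawlDelay fields are how the worker enforces the per-host crawl delay: after every fetch it arms a timer channel with time.After, and before the next request to the same host it blocks on that channel. Below is a minimal standalone sketch of that pattern, with made-up names rather than gocrawl's types:

```
package main

import (
	"fmt"
	"time"
)

// politeFetcher is a hypothetical stand-in for gocrawl's worker: it spaces
// successive requests to its host by at least delay.
type politeFetcher struct {
	delay time.Duration
	wait  <-chan time.Time // armed after each fetch, drained before the next one
}

func (f *politeFetcher) fetch(target string) {
	if f.wait != nil {
		<-f.wait // block until the crawl delay since the last fetch has elapsed
	}

	// A real worker would perform the HTTP request here (Extender.Fetch in gocrawl).
	fmt.Printf("%s fetching %s\n", time.Now().Format("15:04:05.000"), target)

	// The crawl delay starts now: arm the timer for the next call.
	f.wait = time.After(f.delay)
}

func main() {
	f := &politeFetcher{delay: 500 * time.Millisecond}
	for _, target := range []string{"/a", "/b", "/c"} {
		f.fetch(target) // after the first call, fetches are at least 500ms apart
	}
}
```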