廣度配合匿名爬取(改進)

來源:互聯網
上載者:User

package main

import (

    "fmt"

    "io/ioutil"

    "log"

    "net/http"

    "os"

    "regexp"

    "strings"

    "sync"

    "golang.org/x/net/html"

)

// ground is a package-level wait group. It is not referenced by the code
// visible in this file section.
var ground sync.WaitGroup

// str is the seed URL whose links main crawls.
var str = "https://docs.hacknode.org/gopl-zh/"

// init installs a deferred recover handler that logs the recovered value and
// exits via log.Fatal.
//
// NOTE(review): a deferred recover only intercepts panics raised while this
// function itself is running; it does not guard main or any goroutine started
// later, and init has no other statements that could panic.
func init() {
	defer func() {
		err := recover()
		if err == nil {
			return
		}
		log.Fatal("recover error is :", err)
	}()
}

// CreatFile appends bt to the file
// "F:/MyGo/src/waitground_user/url<i>.txt", creating the file if it does not
// exist yet. The directory itself must already exist.
//
// Any failure to open, write, or close the file terminates the program via
// log.Fatal.
func CreatFile(bt []byte, i string) {
	// O_WRONLY is required: without an explicit write access mode the file is
	// opened read-only (O_RDONLY is 0) and every Write fails.
	f, err := os.OpenFile("F:/MyGo/src/waitground_user/url"+i+".txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal(err)
	}

	// Close explicitly rather than with defer: log.Fatal exits the process
	// without running deferred calls, and a close error can signal lost data.
	if _, err = f.Write(bt); err != nil {
		f.Close()
		log.Fatal(err)
	}
	if err = f.Close(); err != nil {
		log.Fatal(err)
	}
}

// GetURLInfomation downloads URL with an HTTP GET and returns the complete
// response body. After the body has been read successfully it sends the value
// 1 on ch, then returns.
//
// A request error, a status other than 200 OK, or a read error terminates the
// program via log.Fatal. The send on ch blocks until a receiver is ready
// unless the channel has free buffer space.
func GetURLInfomation(URL string, ch chan int) (bt []byte) {
	response, getErr := http.Get(URL)
	if getErr != nil {
		log.Fatal(getErr)
	}
	defer response.Body.Close()

	if status := response.StatusCode; status != http.StatusOK {
		log.Fatal("Can't connect")
	}

	body, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		log.Fatal(readErr)
	}

	ch <- 1
	return body
}

//GetURLInfomationAdress is a func get URL infomation

func GetURLInfomationAdress(URL string) []string {

    resp, err := http.Get(URL)

    if err != nil {

        log.Fatal(err)

    }

    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {

        log.Fatal("Can't connect:", URL)

    }

    //開始節點處理

    doc, err := html.Parse(resp.Body)

    if err != nil {

        log.Fatal(err)

    }

    var links []string

    ForOneNode := func(n *html.Node) { //單次節點處理

        if n.Type == html.ElementNode && n.Data == "a" {

            for _, a := range n.Attr {

                if a.Key != "href" {

                    continue

                }

                link, err := resp.Request.URL.Parse(a.Val)

                if err != nil {

                    log.Fatal(err)

                }

                if CheckURL(link.String(), links) {

                    links = append(links, link.String()) //這條語句可以改成並行擷取URL地址內容

                }

            }

        }

    }

    ForEachNode(doc, ForOneNode, nil)

    return links

}

//ForEachNode is 廣度優先遍曆

func ForEachNode(n *html.Node, pre, post func(n *html.Node)) {

    if pre != nil {

        pre(n)

    }

    for c := n.FirstChild; c != nil; c = c.NextSibling {

        ForEachNode(c, pre, post)

    }

    if post != nil {

        post(n)

    }

}

// checkRegexp compiles reg and applies it to cont. The dynamic type of the
// result depends on style:
//
//	0     - string:   the leftmost match (FindString)
//	1     - []string: all matches (FindAllString)
//	other - [][]byte: all matches as byte slices (FindAll)
//
// It panics if reg is not a valid regular expression, because the pattern is
// compiled with regexp.MustCompile.
func checkRegexp(cont string, reg string, style int) (result interface{}) {
	pattern := regexp.MustCompile(reg)

	if style == 0 {
		return pattern.FindString(cont)
	}
	if style == 1 {
		return pattern.FindAllString(cont, -1)
	}
	return pattern.FindAll([]byte(cont), -1)
}

// CheckURL reports whether link is absent from links: it returns true when no
// element of links equals link, and false as soon as an equal element is
// found.
func CheckURL(link string, links []string) bool {
	for i := range links {
		if links[i] == link {
			return false
		}
	}
	return true
}

func main() {

    bt := GetURLInfomationAdress(str) //第一次運行,擷取所有的標籤連結地址

    fmt.Println("first finish")

    // fmt.Println(bt[1])

    // f := GetURLInfomation(bt[1])

    // CreatFile(f, strconv.Itoa(1))

    ch := make(chan int)

    for _, t := range bt {

        t := t

        go func() { //遍曆所有地址,擷取地址內容

            fname := strings.Split(t, "/")

            ff := fname[len(fname)-1]

            fmt.Println("地址:", t)

            fmt.Println(ff)

            ft := strings.Split(ff, ".")

            fft := ft[0]

            fmt.Println(fft) //進行地址處理,取頁面名當做檔案名稱

            p := GetURLInfomation(t, ch)

            CreatFile([]byte(p), fft)

        }()

    }

    for range ch {

        <-ch

    }

}

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.