Breadth Mate anonymous crawl (improved)

Source: Internet
Author: User

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"

	"golang.org/x/net/html"
)

// Package-level state for the crawler.
var (
	// ground is a WaitGroup declared at package scope; none of the functions
	// visible in this file reference it.
	ground sync.WaitGroup

	// str is the seed URL that main passes to geturlinfomationadress.
	str = "https://docs.hacknode.org/gopl-zh/"
)

// init defers a recover handler around its own (empty) body. If the handler
// ever observed a panic it would report it through log.Fatal.
//
// NOTE(review): a deferred recover only intercepts panics raised while the
// function that deferred it is still on the stack. init returns immediately,
// so this handler cannot catch a panic from main or from any goroutine. It is
// kept to preserve the original structure of the program.
func init() {
	defer func() {
		if err := recover(); err != nil {
			log.Fatal("Recover error is:", err)
		}
	}()
}

// Creatfile appends bt to the file
// "F:/mygo/src/waitground_user/url<i>.txt", creating the file if it does not
// exist yet.
//
// A failure to open, write or close the file is reported through log.Fatal,
// which terminates the process.
func Creatfile(bt []byte, i string) {
	// O_WRONLY is required: O_CREATE|O_APPEND alone leaves the access mode at
	// O_RDONLY, and writing to a read-only descriptor fails on POSIX systems.
	f, err := os.OpenFile("F:/mygo/src/waitground_user/url"+i+".txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal(err)
	}

	if _, err := f.Write(bt); err != nil {
		f.Close()
		log.Fatal(err)
	}

	// Close explicitly rather than via defer: log.Fatal exits without running
	// deferred calls, and a failed close can mean the data never reached disk.
	if err := f.Close(); err != nil {
		log.Fatal(err)
	}
}

// geturlinfomation fetches url with an HTTP GET and returns the response body.
//
// Once the body has been read completely it sends 1 on ch as a completion
// signal. The send blocks until a receiver or a free buffer slot is available;
// a nil ch is skipped so the call cannot block forever on it.
//
// A transport error, a status other than 200 OK, or a read error ends the
// process via log.Fatal.
func geturlinfomation(url string, ch chan int) (bt []byte) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Fatal("Can't Connect")
	}

	bt, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	if ch != nil {
		ch <- 1
	}
	return bt
}

Geturlinfomationadress is a Func get URL infomation

Func geturlinfomationadress (URL string) []string {

RESP, err: = http. Get (URL)

If err! = Nil {

Log. Fatal (ERR)

}

Defer resp. Body.close ()

If resp. StatusCode! = http. Statusok {

Log. Fatal ("Can ' t connect:", URL)

}

Start node processing

Doc, err: = HTML. Parse (resp. Body)

If err! = Nil {

Log. Fatal (ERR)

}

var links []string

Foronenode: = Func (n *html. node) {//Single nodes processing

If N.type = = html. Elementnode && N.data = = "a" {

For _, A: = Range n.attr {

If A.key! = "href" {

Continue

}

Link, err: = resp. Request.URL.Parse (A.val)

If err! = Nil {

Log. Fatal (ERR)

}

If Checkurl (link. String (), links) {

Links = Append (links, link. String ())//This statement can be changed to get the URL address content in parallel

}

}

}

}

Foreachnode (Doc, Foronenode, nil)

Return links

}

Foreachnode is breadth-first traversal

Func foreachnode (n *html. Node, Pre, post func (n *html. Node)) {

If pre! = Nil {

Pre (N)

}

For c: = N.firstchild; c! = Nil; c = c.nextsibling {

Foreachnode (c, pre, POST)

}

If post! = nil {

Post (N)

}

}

// CHECKREGEXP compiles reg and applies it to cont. The dynamic type of the
// result depends on style:
//
//	0       string   - the leftmost match (regexp.FindString)
//	1       []string - all matches (regexp.FindAllString with n = -1)
//	other   [][]byte - all matches as byte slices (regexp.FindAll with n = -1)
//
// reg is compiled with regexp.MustCompile, so an invalid pattern panics.
func CHECKREGEXP(cont string, reg string, style int) (result interface{}) {
	check := regexp.MustCompile(reg)

	switch style {
	case 0:
		return check.FindString(cont)
	case 1:
		return check.FindAllString(cont, -1)
	default:
		return check.FindAll([]byte(cont), -1)
	}
}

// checkurl reports whether link is new, i.e. not yet present in links.
// It returns false as soon as an equal entry is found and true otherwise,
// including when links is empty or nil.
func checkurl(link string, links []string) bool {
	// The loop variable is deliberately not called str, which would shadow
	// the package-level seed URL of the same name.
	for _, known := range links {
		if known == link {
			return false
		}
	}
	return true
}

Func Main () {

BT: = geturlinfomationadress (str)//First run, get all tag link addresses

Fmt. Println ("First Finish")

Fmt. Println (Bt[1])

F: = Geturlinfomation (bt[1])

Creatfile (f, StrConv. Itoa (1))

CH: = make (chan int)

For _, T: = Range BT {

T: = t

Go func () {//Traverse all addresses to get the contents of the address

FName: = Strings. Split (T, "/")

FF: = Fname[len (fname)-1]

Fmt. PRINTLN ("Address:", T)

Fmt. PRINTLN (FF)

FT: = strings. Split (FF, ".")

FFT: = ft[0]

Fmt. PRINTLN (FFT)//address processing, take page name as file name

P: = geturlinfomation (t, ch)

Creatfile ([]byte (P), FFT)

}()

}

For Range CH {

<-ch

}

}

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.