package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"

	"golang.org/x/net/html"
)
// Package-level state.
var (
	// Ground is a wait group available for coordinating goroutines; none of
	// the functions in this file reference it.
	Ground sync.WaitGroup

	// str is the start page whose links main collects and downloads.
	str = "https://docs.hacknode.org/gopl-zh/"
)
// init registers a deferred recover that reports a recovered panic through
// log.Fatal.
//
// NOTE: a deferred recover only intercepts panics raised while the function
// that deferred it is still running. init has no other statements, so this
// handler does not guard main or any goroutine started later.
func init() {
	defer func() {
		if err := recover(); err != nil {
			log.Fatal("Recover error is:", err)
		}
	}()
}
// Creatfile appends bt to the file
// "F:/mygo/src/waitground_user/url<i>.txt", creating the file when it does
// not exist yet.
//
// bt is the content to append and i is the suffix inserted into the file
// name. Any failure to open or write the file ends the program via log.Fatal.
func Creatfile(bt []byte, i string) {
	// O_WRONLY is required: O_CREATE|O_APPEND on their own leave the access
	// mode at O_RDONLY, and writing to such a descriptor fails on Unix-like
	// systems.
	f, err := os.OpenFile("F:/mygo/src/waitground_user/url"+i+".txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	if _, err = f.Write(bt); err != nil {
		log.Fatal(err)
	}
}
// geturlinfomation downloads url with an HTTP GET and returns the response
// body.
//
// After the body has been read completely, the value 1 is sent on ch. The
// send blocks until the value can be delivered, so ch must either be buffered
// or have a receiver ready.
//
// A request error, a status other than 200 OK, or a read error ends the
// program via log.Fatal.
func geturlinfomation(url string, ch chan int) (bt []byte) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Fatal("Can't Connect")
	}

	bt, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	ch <- 1 // signal that this download has finished
	return bt
}
Geturlinfomationadress is a Func get URL infomation
Func geturlinfomationadress (URL string) []string {
RESP, err: = http. Get (URL)
If err! = Nil {
Log. Fatal (ERR)
}
Defer resp. Body.close ()
If resp. StatusCode! = http. Statusok {
Log. Fatal ("Can ' t connect:", URL)
}
Start node processing
Doc, err: = HTML. Parse (resp. Body)
If err! = Nil {
Log. Fatal (ERR)
}
var links []string
Foronenode: = Func (n *html. node) {//Single nodes processing
If N.type = = html. Elementnode && N.data = = "a" {
For _, A: = Range n.attr {
If A.key! = "href" {
Continue
}
Link, err: = resp. Request.URL.Parse (A.val)
If err! = Nil {
Log. Fatal (ERR)
}
If Checkurl (link. String (), links) {
Links = Append (links, link. String ())//This statement can be changed to get the URL address content in parallel
}
}
}
}
Foreachnode (Doc, Foronenode, nil)
Return links
}
Foreachnode is breadth-first traversal
Func foreachnode (n *html. Node, Pre, post func (n *html. Node)) {
If pre! = Nil {
Pre (N)
}
For c: = N.firstchild; c! = Nil; c = c.nextsibling {
Foreachnode (c, pre, POST)
}
If post! = nil {
Post (N)
}
}
// CHECKREGEXP compiles reg and applies it to cont. The dynamic type of the
// result depends on style:
//
//	0        string   - the leftmost match (FindString)
//	1        []string - every match (FindAllString)
//	others   [][]byte - every match as byte slices (FindAll)
//
// reg is compiled with regexp.MustCompile, so an invalid pattern panics.
func CHECKREGEXP(cont string, reg string, style int) (result interface{}) {
	check := regexp.MustCompile(reg)
	switch style {
	case 0:
		return check.FindString(cont)
	case 1:
		return check.FindAllString(cont, -1)
	default:
		return check.FindAll([]byte(cont), -1)
	}
}
// checkurl reports whether link is absent from links: it returns true when
// no element of links equals link, and false as soon as a duplicate is found.
func checkurl(link string, links []string) bool {
	for _, seen := range links {
		if seen == link {
			return false
		}
	}
	return true
}
Func Main () {
BT: = geturlinfomationadress (str)//First run, get all tag link addresses
Fmt. Println ("First Finish")
Fmt. Println (Bt[1])
F: = Geturlinfomation (bt[1])
Creatfile (f, StrConv. Itoa (1))
CH: = make (chan int)
For _, T: = Range BT {
T: = t
Go func () {//Traverse all addresses to get the contents of the address
FName: = Strings. Split (T, "/")
FF: = Fname[len (fname)-1]
Fmt. PRINTLN ("Address:", T)
Fmt. PRINTLN (FF)
FT: = strings. Split (FF, ".")
FFT: = ft[0]
Fmt. PRINTLN (FFT)//address processing, take page name as file name
P: = geturlinfomation (t, ch)
Creatfile ([]byte (P), FFT)
}()
}
For Range CH {
<-ch
}
}