// I wrote this to help a friend collect data: opening the Korean website and
// saving the data one item at a time was too slow for him. I originally meant
// to write it in PHP, but I had just been reading a large Go collector written
// by ghost, so I shamelessly adapted that code instead.
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"
)
// notfounderror is the error value httpget returns when the server answers
// with status 404.
type notfounderror struct {
	// Message is the text returned by Error.
	Message string
}

// Error implements the error interface by returning e.Message unchanged.
func (e notfounderror) Error() string {
	return e.Message
}
// remoteerror pairs a failure with the host that was being contacted when it
// happened.
type remoteerror struct {
	// Host is the host portion of the requested URL.
	Host string
	// Err is the underlying failure.
	Err error
}

// Error implements the error interface by returning the message of the
// underlying error. Err must be non-nil.
func (e *remoteerror) Error() string {
	return e.Err.Error()
}

// Unwrap exposes the underlying error so that errors.Is and errors.As can see
// through a *remoteerror.
func (e *remoteerror) Unwrap() error {
	return e.Err
}
// useragent is the User-Agent header value that httpget sets on every request.
// The casing follows the canonical Chrome 29 string; the flattened lower-case
// form in the original listing was a transcription artifact.
var useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1541.0 Safari/537.36"
// Httpget gets the specified resource. errnotfound is returned if
// Server responds with status 404.
Func httpget (client * HTTP. Client, URL string, header HTTP. header) (Io. readcloser, error ){
Req, err: = http. newrequest ("get", URL, nil)
If Err! = Nil {
Return nil, err
}
Req. header. Set ("User-Agent", useragent)
For K, VS: = range header {
Req. header [k] =
}
Resp, err: = client. Do (req)
If Err! = Nil {
Return nil, & remoteerror {Req. url. Host, err}
}
If resp. statuscode == 200 {
Return resp. Body, Nil
}
Resp. Body. Close ()
If resp. statuscode = 404 {// 403 can be rate limit error. | resp. statuscode = 403 {
Err = notfounderror {"resource not found:" + URL}
} Else {
Err = & remoteerror {Req. url. Host, FMT. errorf ("Get % s-> % d", URL, resp. statuscode )}
}
Return nil, err
}
// Httpgetbytes gets the specified resource. errnotfound is returned if the server
// Responds with status 404.
Func httpgetbytes (client * HTTP. Client, URL string, header HTTP. header) ([] Byte, error ){
RC, err: = httpget (client, URL, header)
If Err! = Nil {
Return nil, err
}
Defer RC. Close ()
Return ioutil. readall (RC)
}
// Httpgettofile gets the specified resource and writes to file.
// Errnotfound is returned if the server responds with status 404.
Func httpgettofile (client * HTTP. Client, URL string, header HTTP. header, filename string) error {
RC, err: = httpget (client, URL, header)
If Err! = Nil {
Return err
}
Defer RC. Close ()
OS. mkdirall (path. dir (filename), OS. modeperm)
F, err: = OS. Create (filename)
If Err! = Nil {
Return err
}
Defer F. Close ()
_, Err = Io. Copy (F, RC)
Return err
}
VaR IMG = Regexp. mustcompile ('href = \ "javascript: goview \ (\ D + )')
VaR imgpattern = Regexp. mustcompile ('Id = "mainimage" src = \ "../upload (.*?). JPG ')
VaR totaltask int
Func download (URL string, num Chan bool ){
Url = strings. trimprefix (URL, 'href = "javascript: goview (')
Page: = "http://www.gdweb.co.kr/main/koreaWebView.asp? Idx = % S & url = koreaweb. asp"
T, err: = httpgetbytes (& HTTP. Client {}, FMT. sprintf (page, URL), nil)
If Err! = Nil {
Log. fatalf ("failed to get the page: % v", err)
}
Matches: = imgpattern. findall (T,-1)
For _, Match: = range matches {
Url = "http://www.gdweb.co.kr" + strings. trimprefix (string (MATCH), 'Id = "mainimage" src = "..')
Log. printf ("downloading: % s", URL)
Err: = httpgettofile (& HTTP. Client {}, URL, nil, "pics/" + path. Base (URL ))
If Err! = Nil {
Log. printf ("image download failed (% s): % v", URL, err)
}
}
Totaltask --
<-Num
}
Func main (){
// Control the number of simultaneous downloads
Num: = make (Chan bool, 5)
// The main thread crawls the page and the sub-thread downloads the image
// Baseurl: = "http://nvmingxing.net/hotness/%d"
// Abaseurl: = "http://www.gdweb.co.kr/main/koreaWebView.asp? Idx = 8200 & url = koreaweb. asp"
Baseurl: = "http://www.gdweb.co.kr/main/koreaWeb.asp? Idx = & url = index. asp> lpage = 124 & page = % d"
For I: = 2; I <124; I ++ {
Log. printf ("Capture page: % d", totaltask)
Data, err: = httpgetbytes (& HTTP. Client {}, FMT. sprintf (baseurl, I + 1), nil)
If Err! = Nil {
Log. fatalf ("failed to get the page (% d): % v", I, err)
}
Matches: = IMG. findall (data,-1)
For _, Match: = range matches {
Totaltask ++
Num <-true
Go download (string (MATCH), num)
}
}
}