A simple image spider


Otaku and seasoned "old drivers" will probably find this tool useful in some way. In fact, it was written precisely in response to those veterans' needs, but it is fairly simple, and it may not be improved or extended in the short term (for example, with anti-hotlinking bypass).

Full reference command line: miniimagecrawler -numcrawlgoroutine=5 -baseinterval=2 -randominterval=5 -tickerinterval=10 -savepath="" -imgwidthmin=500 -imgheightmin=500 http://eladies.sina.com.cn/

Or simply: miniimagecrawler http://eladies.sina.com.cn/

You can then sit back while the program downloads every (eligible) image from the specified site into the local save directory. The full source follows, starting with the main package:

package main

import (
	"flag"
	"fmt"
	"image"
	_ "image/jpeg" // register the JPEG decoder for image.Decode
	_ "image/png"  // register the PNG decoder for image.Decode
	"log"
	"math/rand"
	"net/url"
	"os"
	"runtime"
	"sync/atomic"
	"time"

	"helperutils"
	"imagecrawler"
)

var numCrawlGoroutine int
var baseInterval, randomInterval int
var tickerInterval int
var savePath string
var imgWidthMin, imgHeightMin int
var urlHost string

func init() {
	rand.Seed(time.Now().UnixNano())
}

func parseFlag() {
	flag.IntVar(&numCrawlGoroutine, "numcrawlgoroutine", 5, "maximum number of fetch goroutines")
	flag.IntVar(&baseInterval, "baseinterval", 2, "minimum crawl interval")
	flag.IntVar(&randomInterval, "randominterval", 5, "random crawl interval")
	flag.IntVar(&tickerInterval, "tickerinterval", 10, "goroutine-count reporting interval (unit: s)")
	flag.StringVar(&savePath, "savepath", "", "image save directory (defaults to the program directory)")
	flag.IntVar(&imgWidthMin, "imgwidthmin", 0, "minimum image width")
	flag.IntVar(&imgHeightMin, "imgheightmin", 0, "minimum image height")
	flag.Parse()

	if len(flag.Args()) == 0 {
		panic("Please specify the starting URL to crawl!")
	}
	u, err := url.Parse(flag.Args()[0])
	if err != nil {
		panic(err)
	}
	urlHost = u.Host

	if numCrawlGoroutine < 1 {
		panic("Please set a maximum number of fetch goroutines of no less than 1!")
	}
	if baseInterval < 1 {
		panic("Please set a minimum crawl interval of no less than 1!")
	}
	if randomInterval < 2 {
		panic("Please set a valid random crawl interval!")
	}
	if tickerInterval < 5 {
		panic("Please set a reporting interval of no less than 5!")
	}
	if savePath == "" {
		savePath = helperutils.GetAppPath() + urlHost + `\`
		if !helperutils.DirectoryExists(savePath) {
			if err := os.Mkdir(savePath, os.ModePerm); err != nil {
				panic(fmt.Sprintf("Can not make dir: %s", savePath))
			}
		}
	} else {
		if !helperutils.DirectoryExists(savePath) {
			panic("Illegal image save directory setting!")
		}
		savePath = helperutils.IncludeTrailingBackslash(savePath)
	}
	if imgWidthMin < 0 {
		panic("Please set a minimum image width of no less than 0!")
	}
	if imgHeightMin < 0 {
		panic("Please set a minimum image height of no less than 0!")
	}
}

// parsePage fetches one page, then sleeps a random interval to throttle the crawl.
func parsePage(url, homePage string, nameChan chan<- string) []string {
	ret, err := imagecrawler.ParsePage(url, homePage, savePath, nameChan)
	if err != nil {
		return nil
	}
	time.Sleep(time.Duration(rand.Intn(randomInterval)+baseInterval) * time.Second)
	return ret
}

// checkImageSize decodes a downloaded file and deletes it if it is not a
// decodable image or is smaller than the configured minimum dimensions.
func checkImageSize(fileName string, minWidth, minHeight int) bool {
	file, err := os.Open(fileName)
	if err != nil {
		return false
	}
	img, _, err := image.Decode(file)
	if err != nil {
		file.Close()
		os.Remove(fileName)
		return false
	}
	pt := img.Bounds().Size()
	if pt.X < minWidth || pt.Y < minHeight {
		file.Close()
		os.Remove(fileName)
		return false
	}
	file.Close()
	return true
}

func main() {
	parseFlag()

	// Count downloaded files and weed out the ones that are too small.
	var imgNum, smallNum int64
	nameChan := make(chan string)
	go func() {
		for s := range nameChan {
			imgNum++
			go func(imgName string) {
				if !checkImageSize(imgName, imgWidthMin, imgHeightMin) {
					atomic.AddInt64(&smallNum, 1)
				}
			}(s)
		}
	}()

	// Breadth-first crawl driven by a work list of link slices.
	workList := make(chan []string)
	pendingNum := 1
	go func() { workList <- []string{flag.Args()[0]} }()

	// Periodically report how many goroutines are alive.
	ticker := time.NewTicker(time.Duration(tickerInterval) * time.Second)
	go func() {
		for range ticker.C {
			log.Printf("Num of goroutines: %d\n", runtime.NumGoroutine())
		}
	}()

	// Counting semaphore that caps concurrent page fetches.
	tokens := make(chan struct{}, numCrawlGoroutine)
	seenURLs := make(map[string]bool)

	log.Println("Picture grab started...")
	timeBegin := time.Now()
	for ; pendingNum > 0; pendingNum-- {
		list := <-workList
		for _, link := range list {
			if !seenURLs[link] {
				seenURLs[link] = true
				pendingNum++
				go func(url string) {
					tokens <- struct{}{} // acquire a slot
					defer func() { <-tokens }()
					workList <- parsePage(url, urlHost, nameChan)
				}(link)
			}
		}
	}
	log.Printf("Picture fetch ends. Duration: %s\n", time.Since(timeBegin).String())
	log.Println("Finishing stats...")
	close(nameChan)
	ticker.Stop()
	time.Sleep(500 * time.Millisecond) // let in-flight size checks finish (the delay constant is an assumed value)
	invalidNum := atomic.LoadInt64(&smallNum)
	log.Printf("Fetch total: total images %d, small images %d, valid images %d\n", imgNum, invalidNum, imgNum-invalidNum)
	log.Println("The End.")
}
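
The helperutils package imported above is not included in the post. Based only on the names and call sites, a minimal sketch of the three helpers main relies on could look like the following; the Windows-style backslash handling is an assumption suggested by IncludeTrailingBackslash:

package helperutils

import (
	"os"
	"path/filepath"
	"strings"
)

// GetAppPath returns the directory containing the running executable,
// with a trailing path separator. (Sketch; the original is not shown.)
func GetAppPath() string {
	exe, err := os.Executable()
	if err != nil {
		return ""
	}
	return IncludeTrailingBackslash(filepath.Dir(exe))
}

// DirectoryExists reports whether path exists and is a directory.
func DirectoryExists(path string) bool {
	fi, err := os.Stat(path)
	return err == nil && fi.IsDir()
}

// IncludeTrailingBackslash appends a backslash unless one is already there.
func IncludeTrailingBackslash(path string) string {
	if strings.HasSuffix(path, `\`) {
		return path
	}
	return path + `\`
}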

package imagecrawler

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"

	"golang.org/x/net/html"
)

// ParsePage fetches url, downloads every linked .jpg/.jpeg/.png image,
// and returns the on-site links found on the page for further crawling.
func ParsePage(url, homePage, savePath string, nameChan chan<- string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
	}

	var links []string
	visitNode := func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key != "href" {
					continue
				}
				// Resolve the href against the request URL.
				link, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue
				}
				addr := link.String()
				if strings.HasSuffix(addr, ".jpg") || strings.HasSuffix(addr, ".jpeg") || strings.HasSuffix(addr, ".png") {
					downloadImage(addr, savePath, nameChan)
				} else if strings.Contains(addr, homePage) {
					// Only follow links that stay on the starting host.
					links = append(links, addr)
				}
			}
		} else if n.Type == html.ElementNode && n.Data == "img" {
			for _, a := range n.Attr {
				if a.Key != "src" {
					continue
				}
				link, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue
				}
				addr := link.String()
				if strings.HasSuffix(addr, ".jpg") || strings.HasSuffix(addr, ".jpeg") || strings.HasSuffix(addr, ".png") {
					downloadImage(addr, savePath, nameChan)
				}
			}
		}
	}
	forEachNode(doc, visitNode, nil)
	return links, nil
}

// downloadImage saves the image at addr into savePath and reports the
// resulting file name on nameChan.
func downloadImage(addr, savePath string, nameChan chan<- string) {
	resp, err := http.Get(addr)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return
	}
	fileName := savePath + filepath.Base(addr)
	dst, err := os.Create(fileName)
	if err != nil {
		return
	}
	io.Copy(dst, resp.Body)
	dst.Close()
	nameChan <- fileName
}

// forEachNode walks the HTML tree, calling pre before visiting a node's
// children and post afterwards (either may be nil).
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		forEachNode(c, pre, post)
	}
	if post != nil {
		post(n)
	}
}
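
The introduction mentions anti-hotlinking bypass as a possible future enhancement. As an illustration only, and not part of the original code: the simplest hotlink filters merely inspect the Referer header, so a hypothetical variant of downloadImage in the imagecrawler package (downloadImageWithReferer is an invented name) could set one, passing in the page the image was found on:

// Hypothetical sketch: like downloadImage, but sends a Referer header so
// that simple Referer-based hotlink filters treat the request as on-site.
func downloadImageWithReferer(addr, referer, savePath string, nameChan chan<- string) {
	req, err := http.NewRequest(http.MethodGet, addr, nil)
	if err != nil {
		return
	}
	req.Header.Set("Referer", referer) // e.g. the page the image appeared on
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return
	}
	fileName := savePath + filepath.Base(addr)
	dst, err := os.Create(fileName)
	if err != nil {
		return
	}
	io.Copy(dst, resp.Body)
	dst.Close()
	nameChan <- fileName
}

Stronger protections (signed URLs, cookie checks) would of course need more than this.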


The executable file download link is here.
