This article is an original creation; the information in it may have evolved or changed since it was written.
The Go language has an open-source crawler framework called Pholcus. I wrote a crawler rule with it that fetches the latest news from People's Daily Online (people.com.cn).
Pholcus is a very good open-source project, although personally I don't find the Go language all that fun to use.
The rule is published on GitHub:
github:https://github.com/nncode/pholcus-people.com.cn
The code is attached below:
PackageSpider_lib//Basic PackageImport("Log" //"Github.com/puerkitobio/goquery"//dom parsing "Github.com/henrylee2cn/pholcus/app/downloader/request" //Required //"github.com/henrylee2cn/pholcus/logs"//Information output."Github.com/henrylee2cn/pholcus/app/spider" //Required // . "Github.com/henrylee2cn/pholcus/app/spider/common"//Selection //NET package //"net/http"//Set HTTP. Header //"Net/url" //Code pack //"Encoding/xml" "Encoding/json" //String processing package //"RegExp" //"StrConv" //"strings" //Other packages //"FMT" //"math" //"Time")funcInit () {People.register ()}typeItemstruct{Idstring ' JSON: ' ID 'Titlestring ' JSON: ' title 'Urlstring ' json: ' URL 'Datestring ' JSON: ' Date 'NodeIdstring ' JSON: ' NodeId 'Imgcountstring ' JSON: ' Imgcount '}typeNewsstruct{Items []item' JSON: ' Items '}varNews NewsvarPeople = &spider{Name:"Net News crawl", Description:"The latest classified news",//pausetime:300, //Keyin:keyin, //Limit:limit,Enablecookie:false, Ruletree: &ruletree{Root:func(CTX *context) {ctx. Addqueue (&request. request{Method:"GET"Url:"Http://news.people.com.cn/210801/211150/index.js?cache=false", Rule:"News list",})}, Trunk:Map[string]*rule{"News list": {parsefunc:func(CTX *context) {//query: = ctx. GetDOM () //str: = query. Find ("Body"). Text () //str: = ' {' Items ': [{' id ': ' 282 ', ' title ': ' Social transformation upgrade " Tactics " manual ', ' url ': ' HT ', ' Date ': ' 201 ', ' NodeId ":" 1001 "," Imgcount ":" 4 "}]} 'str: = CTX. GetText () Err: = json. Unmarshal ([]byte(str), &news)ifErr! =Nil{log. Printf ("Parsing error:%v\n", err)return}/////////////////Newslength: =Len(News.) Items) forI: =0; i < newslength; i++ {ctx. Addqueue (&request. request{Url:news. Items[i]. URL, Rule:"Hot News", Temp:Map[string]Interface{}{"id": News. Items[i]. Id"title": News. Items[i]. Title,"Date": News. Items[i]. Date,"Newstype": News. Items[i]. 
NodeId,},})}/////////////////}, },"Hot News": {//Note: There is no word Qi and whether the output data must be consistentItemfields: []string{"ID","title","Content","category","Releasetime",}, Parsefunc:func(CTX *context) {query: = CTX. GetDOM ()//Get contentContent: = Query. Find ("#p_content"). Text ()//Re, _: = Regexp.compile ("\\<[\\s\\s]+?\\>") //content = Re. Replaceallstringfunc (content, strings. ToLower) //content = Re. Replaceallstring (Content, "") //Results deposited response relayCTx. Output (Map[int]Interface{}{0: CTX. Gettemp ("id",""),1: CTX. Gettemp ("title",""),2: Content,3: CTX. Gettemp ("Newstype",""),4: CTX. Gettemp ("Date",""), }) }, }, }, },}