Reference articles:
https://andyliwr.github.io/2017/12/05/nodejs_spider_ip/
https://segmentfault.com/q/1010000008196143
Code:
import request from 'request';
import userAgents from './common/useragent';

// This is only a demo, so the proxy list is kept in module-level variables;
// in practice it should live in a proper data cache.
// NOTE(review): the interval expression was garbled in the source; ten minutes
// is assumed here — confirm against the intended value.
const expiryTime = 10 * 60 * 1000; // expiration interval, milliseconds
let ips = null; // cached proxy addresses, each formatted as "host:port"
let time = null; // when the proxy list was stored; used to decide whether it has expired

// Matches "a.b.c.d:port". Ports can have up to five digits (max 65535), so the
// port part must allow five digits or longer ports get silently truncated.
const PROXY_PATTERN = /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/g;

// Upper bound on crawl attempts so a run of dead proxies cannot recurse forever.
const MAX_ATTEMPTS = 5;

/** Returns a uniformly random element of a non-empty array. */
const pickRandom = (items) => items[Math.floor(Math.random() * items.length)];

/**
 * Requests a list of free proxies and stores it in the module-level `ips`
 * cache together with the fetch time. A non-empty cached list younger than
 * `expiryTime` is reused without a new request.
 *
 * @returns {Promise<void>} Resolves once `ips` holds a usable list; rejects
 *   with an Error when the request fails, the body is not a Buffer, or the
 *   response contains no proxy address.
 */
const getProxyList = () =>
  new Promise((resolve, reject) => {
    const nowDate = Date.now();
    const hasCachedList = Array.isArray(ips) && ips.length > 0;
    if (hasCachedList && nowDate - time < expiryTime) {
      resolve();
      return;
    }

    const apiUrl =
      'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%c8%a1&textarea=http%3a%2f%2fwww.66ip.cn%2f%3fsxb%3d%26tqsl%3d100%26ports%255b%255d2%3d%26ktip%3d%26sxa%3d%26RADIO%3DRADIO%26SUBMIT%3D%25CC%25E1%2B%2B%25C8%25A1';
    const options = {
      method: 'GET',
      url: apiUrl,
      gzip: true,
      encoding: null, // ask for the raw Buffer instead of a decoded string
      headers: {
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'User-Agent':
          'Mozilla/8.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        Referer: 'http://www.66ip.cn/',
      },
    };

    request(options, (error, response, body) => {
      // Every path below settles the promise; otherwise callers would hang.
      if (error) {
        reject(error);
        return;
      }
      if (!Buffer.isBuffer(body)) {
        reject(new Error('Proxy list response did not contain a binary body'));
        return;
      }
      // String.prototype.match returns null when nothing matches.
      const found = body.toString().match(PROXY_PATTERN);
      if (found === null) {
        reject(new Error('Proxy list response contained no proxy addresses'));
        return;
      }
      ips = found;
      time = Date.now();
      resolve();
    });
  });

/**
 * Start method: fetches the proxy list, then requests the target page through
 * a randomly chosen proxy with a randomly chosen User-Agent. Some proxies are
 * unreachable, so a failed request is retried with another random proxy.
 *
 * @param {number} [attemptsLeft=MAX_ATTEMPTS] How many attempts (including
 *   this one) may still be made before giving up.
 */
const startFun = (attemptsLeft = MAX_ATTEMPTS) => {
  // Get proxy IPs first
  getProxyList()
    .then(() => {
      const userAgent = pickRandom(userAgents);
      const ip = pickRandom(ips);
      const useIp = `http://${ip}`;
      const options = {
        method: 'GET',
        url: 'http://www.xxx.com',
        gzip: true,
        encoding: null,
        headers: {
          'User-Agent': userAgent, // dynamically set browser header information
        },
        proxy: useIp, // dynamically set proxy IP
        timeout: 8000,
      };

      request(options, (error, response, body) => {
        if (error) {
          if (attemptsLeft <= 1) {
            console.log(`Crawl page failed, ${error}, no attempts left, giving up ×`);
            return;
          }
          console.log(`Crawl page failed, ${error}, re-looking for proxy ip... ×`);
          // The chosen proxy is not reachable, so try again with another one.
          startFun(attemptsLeft - 1);
          return;
        }
        console.log('Crawl page success, √');
        if (Buffer.isBuffer(body)) {
          // Parse the HTML here (body.toString() yields the page source).
        }
      });
    })
    .catch((e) => {
      console.log(e);
    });
};

// Start method
startFun();
useragent.js (the ./common/useragent module imported by the code above):
/**
 * Pool of browser User-Agent header values; one is picked at random for each
 * crawl request. The list intentionally keeps the duplicates present in the
 * original so the selection weights are unchanged.
 */
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
];

export default userAgents;
Node.js crawler with dynamic proxy IPs