larbin// sebastien ailleret// 29-11-99 -> 08-03-00 #include < unistd.h> #include <sys/socket.h> #include <netinet/in.h> #include <errno.h> # include <fcntl.h> #include <iostream.h> #include <string.h> #include < adns.h> #include <netdb.h> #include <sys/socket.h> #include "Types.h" #include "Global.h" #include "xutils/text.h" #include "xutils/fifo.h" #include "xutils/site.h" #include "Xutils/debug.h" #include "xutils/maxedsizedfifo.h" #include "xutils/persistentfifo.h" #include "Xutils/constantsizedfifo.h" #include "Xutils/constantsizedfifopriority.h"////////////////////////// struct global/////////////////////////////////////////////////// define all the static variables//static variable definition process, which includes static member variables in the class. Hashtable&nbsP;*global::seen; The type of the variable is the template class genericfifo<url> *global::urlsinput; genericfifo<url> *global::urlsinternal; site *global::sitelist; genericfifo<site> *global::oksites; Genericfifo<site> *global::d nssites; connexion **global::connexions; adns_state global::ads; constantsizedfifopriority<connexion> *global::freeconns; constantsizedfifo<connexion> *global::userconns; interval *global::inter; Uint global::d epthinsite; time_t global::waitduration; char *global::useragent; char *global::sender; char *global::headers; Sockaddr_in *global::p roxyaddr; bool global::isspecific; char *global::contenttype; Char *global::p rivilegedext; Vector<char> *global::d omains; vector<char> global::forbext; char *global::firsturl; uint global::nb_conn; Uint global::d nsconn; unsigned short int global::httpport; Unsigned short int global::inputport; /** constructor : initialize allmost everything * Everything is read from the config file (larbin.conf by default) */Global::global (int argc, char *argv[]) { char *configFile = "larbin.conf "; bool reload = false; //whether it's a reboot or a first start // Verification of arguments int pos = 1; while (POS&NBSP;<&NBSP;ARGC) { //argc is 
greater than 1, indicating that there are parameters if (!strcmp (argv[pos], "-C") & & argc > pos+1) { configfile = argv[pos+1]; //#通过参数设置配置文件的名字 pos += 2; } else if (!strcmp (argv[pos], "-reload")) //reboot, starting at the end of last crawl { reload = true; //# Set reload, the specific role is not clear. pos++; } else { break; &NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP}  } //program parameters have errors, prompt to use method if (POS&NBSP;!=&NBSP;ARGC) { //#显示使用方法. cerr << "usage : " << argv[0]; cerr << " [-c configfile] [-reload]/n"; exit (1); &NBSP;&NBSP//#初始话设置 // standard values waitDuration = 60; //access to the same server, the time interval, can not be less than 30s depthinsite = 5; // Maximum depth of access to the Web page userAgent = "Larbin"; //robot name sender = " larbin@somewhere.com "; //sender information to construct HTTP packets. firstUrl = "http://localhost/" //the URL first accessed nb_conn = 20; //maximum Parallel number connection number dnsConn = 3; //dns maximum number of parallel connections httpPort = 8081; // interface to see the statistics of Larbin crawls through the web inputPort = 1976; //Telnet interface that adds input information such as URLs to Larbin proxyaddr = null; //Proxy server Address isspecific = false; // is a specific search domains = null; //domain name limit // Read the Configuration file crash ("Read the configuration file"); //has a macro definition in Debug.h file #define crash (s) (cerr << s << "/n") parsefile (configfile); //#解析配置文件 // initialize everything crash ("Create global values"); // headers String strtmp; strtmp.addstring ("/r/nuser-agent: "); strtmp.addstring (useragent); strtmp.addstring (" "); strtmp.addstring (sender); strtmp.addstring ("/r/naccept: text/html/r/n/r/n"); headers = strtmp.givestring (); //define static char *headers; in header file // fifos //The constructor here requires further attention URLsInternal = new Persistentfifo<url> (Fifofile, reload, this); URLsInput = new 
Fifo<url>; inter = new interval (Ramurls); siteList = new Site[siteListSize]; okSites = new Fifo<Site>; dnsSites = new Fifo<Site>; seen = new hashtable (!reload); //Initialize hash table, static hashtable *seen; defined in header file userConns = new ConstantSizedFifo<Connexion> (Nb_conn); //defines queues for connections that are already in use freeConns = new ConstantSizedFifoPriority<Connexion> (nb_ conn); //defines a queue where free does not use a connection connexions = new connexion *[nb_ conn]; //defines the structure body array pointers for saving connection information for (uint i=0; i<nb_conn; i++) //cycles are the maximum number of connections { connexions[i] = new connexion; //Create a connection information structure freeconns->put (Connexions[i]); //the newly created connection information structure to the free connection queue // init non blocking dns calls crash ("Start adns"); Initializes the DNS call adns_initflags flags = adns_initflags (adns_if_nosigpipe | adns_if_noerrprint); // adns_initflags (Adns_if_nosigpipe); adns_init (&ads, flags, null); }/** destructor : never used because the program should never end ! 
*/global::~global () { cerr << why he hell do you want to delete global !/n ";////////////////////////////////////////////////////////////////////function function: Parse config file//parameter: char *file the name//return value of the profile: void//NOTE: Call//////////////////in constructor global /** parse configuration file */void Global::p arsefile (char *file) { int fds = open (file, o_rdonly); if (fds < 0) { cerr << "cannot open config file/n"; exit (1); &NBSP;&NBSP} char *tmp = readfile (FDS); //define text.h in, file The BUF close (FDS) that is equal to the file length is requested in the function; //closes the file // suppress Commentary bool eff = false; //Remove the comment line marked with "#" in the configuration file, the number of lines that begin with "#"It all changed to spaces for (int i=0; tmp[i] != 0; i++) { switch (Tmp[i]) { case ' n ': eff = false; Break case ' # ': eff = true; // no break !!! default: if (EFF) tmp[i] = ' '; &NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP} } String content; The //string class String.h file has a defined content.addstring (TMP); delete [] tmp; uint pos = 0; char *tok = nexttoken (Content, &pos); // Defined in the Text.h file, function function: Extracts the next word from a string, Nexttoken // Will reapply for a space to save the extracted words, so after use to pay attention to release the space while (tok != null) { if (!strcasecmp (tok, "useragent")) { //Read the user agent information in the configuration file delete [] tok; useragent = nexttoken (Content, &pos); } else if (!strcasecmp (tok, "from")) { //Read the spider owner's e-mail message in the configuration file delete [] tok; sender = nexttoken (Content, &pos); } else if (!strcasecmp (tok, "StartURL")) { //read the first crawled URL information in the configuration file delete [] Tok firstUrl = nexttoken (Content, &pos); } else if (!) 
( tok, "Waitduration") { //read in the configuration file, access the same Web server interval delete [] tok; tok = nexttoken (Content, &pos); waitduration = atoi (tok); //converts a string to an integer number delete  [] tok; } else if (!strcasecmp (tok, "proxy")) { //read the proxy information in the configuration file delete [] tok; // host name and dns call tok = nexttoken (content, &pos); //Proxy Server host name Content is a string read from the configuration file struct hostent* hp; proxyaddr = new sockaddr_in; bzero ((Char *) proxyaddr, sizeof (struct sockaddr_in))
The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion;
the products and services mentioned on this page have no relationship with Alibaba Cloud. If you
find the content of this page confusing, please send us an email and we will handle the problem
within 5 days of receiving your email.
If you find any instances of plagiarism from the community, please send an email to:
info-contact@alibabacloud.com
and provide relevant evidence. A staff member will contact you within 5 working days.