The first approach is to shell out to a Linux command-line tool, such as wget or curl, to fetch the page source. curl is the more flexible of the two and lets you set many options. C++ code:
// Quote `arg` so that a POSIX shell treats it as one literal word: wrap it in
// single quotes and spell every embedded ' as '\''.
static std::string shellquote(const std::string &arg)
{
    std::string quoted("'");
    for (std::string::size_type i = 0; i < arg.size(); ++i) {
        if (arg[i] == '\'') {
            quoted.append("'\\''");
        } else {
            quoted.push_back(arg[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

// Fetch a web page by running wget and reading back the file it writes.
//
// url: address of the page. The text after the last '/' names the temporary
//      file created in the current directory; when the URL contains no '/'
//      the whole URL is used as that name.
//
// Returns the page source with line breaks preserved (every line read is
// terminated with '\n'). Returns "" when the URL has no file-name component
// (it is empty or ends in '/'), when wget exits with a non-zero status, or
// when the downloaded file cannot be read.
//
// The temporary file is removed before returning.
std::string gethtmlbywget(std::string url)
{
    // find_last_of() yields npos when there is no '/'; npos + 1 wraps to 0,
    // so substr() then returns the whole URL.
    const std::string filename = url.substr(url.find_last_of('/') + 1);
    if (filename.empty()) {
        return "";
    }

    // The "./" prefix keeps a name such as "-" or "-rf" from being parsed as
    // an option by wget or rm.
    const std::string localpath = "./" + filename;
    const std::string quotedpath = shellquote(localpath);

    // -q suppresses progress output; -O fixes the output name so it always
    // matches the file read below. Both operands are quoted because the URL
    // is caller-supplied and goes through the shell.
    std::string command("wget -q -O ");
    command.append(quotedpath);
    command.append(" -- ");
    command.append(shellquote(url));
    const int status = std::system(command.c_str());

    std::string html;
    if (status == 0) {
        std::ifstream fin(localpath.c_str());
        std::string line;
        // std::getline on std::string has no fixed line-length limit, so very
        // long lines (common in HTML) do not cut the page short.
        while (std::getline(fin, line)) {
            html.append(line);
            html.push_back('\n');
        }
    }

    // wget -O creates the file even when the download fails, so always clean
    // up. Removal is best effort: a failure here does not affect the result.
    command = "rm -f ";
    command.append(quotedpath);
    if (std::system(command.c_str()) != 0) {
        // Nothing useful to do; the page (if any) has already been read.
    }
    return html;
}
The second approach is to fetch the page source directly over a socket. C++ code:
- // Get the webpage source code through get
- String gethtmlbyget (string URL)
- {
- String strhtmlcontent = "";
- Int sockfd;
- Struct sockaddr_in ADDR;
- Struct hostent * purl;
- Char text [recvbuf];
- // Analysis Link
- Urlinfo = parseurl (URL );
- String saccept = "accept: */* \ r \ naccept-language: ZH-CN \ r \ naccept-encoding: gzip, deflate ";
- // Different host useragent
- String suseragent = "Mozilla/5.0 (X11; U; Linux i686; en-US) applewebkit/534.10 (khtml, like gecko) Chrome/8.0.552.20.safari/534.10 ";
- // Convert a port to a string
- Char T [6];
- String strport;
- Sprintf (T, "% d", urlinfo. Port );
- Strport = T;
- // Construct the sending string
- String strrequest = "";
- Strrequest. append ("get ");
- Strrequest. append (urlinfo. File );
- Strrequest. append ("? ");
- Strrequest. append (urlinfo. Body );
- Strrequest. append ("HTTP/1.1 \ r \ n ");
- Strrequest. append (saccept );
- Strrequest. append ("\ r \ nuser-AGENT :");
- Strrequest. append (suseragent );
- Strrequest. append ("\ r \ nhost :");
- Strrequest. append (urlinfo. HOST );
- Strrequest. append (":");
- Strrequest. append (strport );
- Strrequest. append ("\ r \ nconnection: keep-alive \ r \ n ");
- Char * Host = const_cast <char *> (urlinfo. Host. c_str ());
- Sockfd = socket (af_inet, sock_stream, ipproto_tcp); // send in TCP Mode
- Purl = gethostbyname (host );
- ADDR. sin_family = af_inet;
- ADDR. sin_addr.s_addr = * (unsigned long *) purl-> h_addr );
- ADDR. sin_port = htons (80 );
- // Connection
- Connect (sockfd, (struct sockaddr *) & ADDR, sizeof (ADDR ));
- // Send
- Send (sockfd, const_cast <char *> (strrequest. c_str (), strrequest. Length (), 0 );
- // Accept
- While (Recv (sockfd, text, recvbuf, 0)> 0)
- {
- Strhtmlcontent. append (text );
- Bzero (text, recvbuf );
- }
- // Close the socket
- Close (sockfd );
- // Return the acceptance result
- Return strhtmlcontent;
- }
// Get the webpage source code string gethtmlbyget (string URL) {string strhtmlcontent = ""; int sockfd; struct sockaddr_in ADDR; struct hostent * purl; char text [recvbuf]; // analysis link urlinfo = parseurl (URL); string saccept = "accept: */* \ r \ naccept-language: ZH-CN \ r \ naccept-encoding: gzip, deflate "; // different host useragent different string suseragent =" Mozilla/5.0 (X11; U; Linux i686; en-US) applewebkit/534.10 (khtml, like Geck O) Chrome/8.0.552.20.safari/534.10 "; // converts a port to a string char T [6]; string strport; sprintf (T," % d ", urlinfo. port); strport = T; // construct the sending string strrequest = ""; strrequest. append ("get"); strrequest. append (urlinfo. file); strrequest. append ("? "); Strrequest. append (urlinfo. body); strrequest. append ("HTTP/1.1 \ r \ n"); strrequest. append (saccept); strrequest. append ("\ r \ nuser-AGENT:"); strrequest. append (suseragent); strrequest. append ("\ r \ nhost:"); strrequest. append (urlinfo. host); strrequest. append (":"); strrequest. append (strport); strrequest. append ("\ r \ nconnection: keep-alive \ r \ n"); char * Host = const_cast <char *> (urlinfo. host. c_str (); sockfd = socket (af_inet, sock_stream, ipproto_tcp); // send purl = gethostbyname (host) in TCP mode; ADDR. sin_family = af_inet; ADDR. sin_addr.s_addr = * (unsigned long *) purl-> h_addr); ADDR. sin_port = htons (80); // connect to connect (sockfd, (struct sockaddr *) & ADDR, sizeof (ADDR); // send (sockfd, const_cast <char *> (strrequest. c_str (), strrequest. length (), 0); // accept while (Recv (sockfd, text, recvbuf, 0)> 0) {strhtmlcontent. append (text); bzero (text, recvbuf);} // close socket close (sockfd); // return the acceptance result return strhtmlcontent ;}
The third approach is to use libcurl. C code:
- # Include <stdio. h>
- # Include <string. h>
- # Include <curl/curl. h>
- # Define max_buckets 65536
- Char wr_buf [max_buf + 1];
- Int wr_index;
- /*
- * Write data callback function (called within the context
- * Curl_easy_perform.
- */
- Size_t write_data (void * buffer, size_t size, size_t nmemb, void * userp)
- {
- Int segsize = size * nmemb;
- /* Check to see if this data exceeds the size of our buffer. If so,
- * Set the user-defined context value and return 0 to indicate
- * Problem to curl.
- */
- If (wr_index + segsize> max_buf ){
- * (Int *) userp = 1;
- Return 0;
- }
- /* Copy the data from the curl buffer into our buffer */
- Memcpy (void *) & wr_buf [wr_index], buffer, (size_t) segsize );
- /* Update the write Index */
- Wr_index + = segsize;
- /* Null terminate the buffer */
- Wr_buf [wr_index] = 0;
- /* Return the number of bytes encoded ed, indicating to curl that all is okay */
- Return segsize;
- }
- /*
- * Simple curl application to read the index.html file from a web site.
- */
- Int main (void)
- {
- Curl * curl;
- Curlcode ret;
- Int wr_error;
- Wr_error = 0;
- Wr_index = 0;
- /* First step, init curl */
- Curl = curl_easy_init ();
- If (! Curl ){
- Printf ("couldn't init curl \ n ");
- Return 0;
- }
- /* Tell curl the URL of the file we're re going to retrieve */
- Curl_easy_setopt (curl, curlopt_url, "www.exampledomain.com ");
- /* Tell curl that we'll receive data to the function write_data, and
- * Also provide it with a context pointer for our error return.
- */
- Curl_easy_setopt (curl, curlopt_writedata, (void *) & wr_error );
- Curl_easy_setopt (curl, curlopt_writefunction, write_data );
- /* Allow curl to perform the action */
- Ret = curl_easy_perform (curl );
- Printf ("ret = % d (write_error = % d) \ n", RET, wr_error );
- /* Emit the page if curl indicates that no errors occurred */
- If (ret = 0) printf ("% s \ n", wr_buf );
- Curl_easy_cleanup (curl );
- Return 0;
- }
Source: http://yang7229693.iteye.com/blog/855208