Linux: several methods for obtaining webpage source code: Linux Crawler

Source: Internet
Author: User

The first method is to call a Linux command-line tool to fetch the page source; wget and curl both work. curl is the more flexible of the two, and many of its options can also be set directly from C/C++ code (through libcurl).
 

  // Requires <cstdio>, <cstdlib>, <fstream> and <string>.

  // Wrap text in single quotes so /bin/sh passes it to the child process as
  // one literal argument. An embedded single quote is written as '\'' (close
  // the quote, emit an escaped quote, reopen the quote).
  static std::string shellquote(const std::string &text)
  {
      std::string quoted = "'";
      for (std::string::size_type i = 0; i < text.size(); ++i) {
          if (text[i] == '\'')
              quoted += "'\\''";
          else
              quoted += text[i];
      }
      quoted += "'";
      return quoted;
  }

  // Download a web page with wget and return its content.
  //
  // URL: address of the page. The text after the last '/' is used as the
  //      name of a temporary file in the current directory.
  //
  // Returns the downloaded text with line terminators removed (lines are
  // concatenated), or "" when the URL ends in '/', wget fails, or the
  // downloaded file cannot be opened. The temporary file is deleted before
  // returning.
  std::string gethtmlbywget(std::string URL)
  {
      // find_last_of() returns npos when there is no '/'; npos + 1 wraps to 0,
      // so in that case the whole URL is used as the file name.
      const std::string filename = URL.substr(URL.find_last_of("/") + 1);
      if (filename.empty())
          return "";

      // -q suppresses progress output. -O pins the output file to the name we
      // are about to read, instead of relying on wget's own naming rules.
      // "--" ends option parsing so a URL starting with '-' is not an option.
      std::string strcom = "wget -q -O ";
      strcom.append(shellquote(filename));
      strcom.append(" -- ");
      strcom.append(shellquote(URL));
      const int status = std::system(strcom.c_str());

      std::string strhtml;
      if (status == 0) {
          std::ifstream fin(filename.c_str());
          std::string line;
          // std::getline grows the string as needed, so long lines are not cut.
          while (std::getline(fin, line))
              strhtml.append(line);
      }

      // Delete the temporary file directly rather than spawning "rm -f".
      std::remove(filename.c_str());
      return strhtml;
  }

The second method is to open a socket and send the HTTP GET request yourself (C++ code).
 

  1. // Get the webpage source code through get
  2. String gethtmlbyget (string URL)
  3. {
  4. String strhtmlcontent = "";
  5. Int sockfd;
  6. Struct sockaddr_in ADDR;
  7. Struct hostent * purl;
  8. Char text [recvbuf];
  9. // Analysis Link
  10. Urlinfo = parseurl (URL );
  11. String saccept = "accept: */* \ r \ naccept-language: ZH-CN \ r \ naccept-encoding: gzip, deflate ";
  12. // Different host useragent
  13. String suseragent = "Mozilla/5.0 (X11; U; Linux i686; en-US) applewebkit/534.10 (khtml, like gecko) Chrome/8.0.552.20.safari/534.10 ";
  14. // Convert a port to a string
  15. Char T [6];
  16. String strport;
  17. Sprintf (T, "% d", urlinfo. Port );
  18. Strport = T;
  19. // Construct the sending string
  20. String strrequest = "";
  21. Strrequest. append ("get ");
  22. Strrequest. append (urlinfo. File );
  23. Strrequest. append ("? ");
  24. Strrequest. append (urlinfo. Body );
  25. Strrequest. append ("HTTP/1.1 \ r \ n ");
  26. Strrequest. append (saccept );
  27. Strrequest. append ("\ r \ nuser-AGENT :");
  28. Strrequest. append (suseragent );
  29. Strrequest. append ("\ r \ nhost :");
  30. Strrequest. append (urlinfo. HOST );
  31. Strrequest. append (":");
  32. Strrequest. append (strport );
  33. Strrequest. append ("\ r \ nconnection: keep-alive \ r \ n ");
  34. Char * Host = const_cast <char *> (urlinfo. Host. c_str ());
  35. Sockfd = socket (af_inet, sock_stream, ipproto_tcp); // send in TCP Mode
  36. Purl = gethostbyname (host );
  37. ADDR. sin_family = af_inet;
  38. ADDR. sin_addr.s_addr = * (unsigned long *) purl-> h_addr );
  39. ADDR. sin_port = htons (80 );
  40. // Connection
  41. Connect (sockfd, (struct sockaddr *) & ADDR, sizeof (ADDR ));
  42. // Send
  43. Send (sockfd, const_cast <char *> (strrequest. c_str (), strrequest. Length (), 0 );
  44. // Accept
  45. While (Recv (sockfd, text, recvbuf, 0)> 0)
  46. {
  47. Strhtmlcontent. append (text );
  48. Bzero (text, recvbuf );
  49. }
  50. // Close the socket
  51. Close (sockfd );
  52. // Return the acceptance result
  53. Return strhtmlcontent;
  54. }
// Get the webpage source code string gethtmlbyget (string URL) {string strhtmlcontent = ""; int sockfd; struct sockaddr_in ADDR; struct hostent * purl; char text [recvbuf]; // analysis link urlinfo = parseurl (URL); string saccept = "accept: */* \ r \ naccept-language: ZH-CN \ r \ naccept-encoding: gzip, deflate "; // different host useragent different string suseragent =" Mozilla/5.0 (X11; U; Linux i686; en-US) applewebkit/534.10 (khtml, like Geck O) Chrome/8.0.552.20.safari/534.10 "; // converts a port to a string char T [6]; string strport; sprintf (T," % d ", urlinfo. port); strport = T; // construct the sending string strrequest = ""; strrequest. append ("get"); strrequest. append (urlinfo. file); strrequest. append ("? "); Strrequest. append (urlinfo. body); strrequest. append ("HTTP/1.1 \ r \ n"); strrequest. append (saccept); strrequest. append ("\ r \ nuser-AGENT:"); strrequest. append (suseragent); strrequest. append ("\ r \ nhost:"); strrequest. append (urlinfo. host); strrequest. append (":"); strrequest. append (strport); strrequest. append ("\ r \ nconnection: keep-alive \ r \ n"); char * Host = const_cast <char *> (urlinfo. host. c_str (); sockfd = socket (af_inet, sock_stream, ipproto_tcp); // send purl = gethostbyname (host) in TCP mode; ADDR. sin_family = af_inet; ADDR. sin_addr.s_addr = * (unsigned long *) purl-> h_addr); ADDR. sin_port = htons (80); // connect to connect (sockfd, (struct sockaddr *) & ADDR, sizeof (ADDR); // send (sockfd, const_cast <char *> (strrequest. c_str (), strrequest. length (), 0); // accept while (Recv (sockfd, text, recvbuf, 0)> 0) {strhtmlcontent. append (text); bzero (text, recvbuf);} // close socket close (sockfd); // return the acceptance result return strhtmlcontent ;}

The third method is to use libcurl (C code).
 

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  #include <curl/curl.h>
  /* Capacity of the receive buffer, not counting the terminating NUL. */
  #define MAX_BUF 65536

  /* Downloaded data; always NUL-terminated after a successful write_data(). */
  char wr_buf[MAX_BUF + 1];
  /* Number of bytes currently stored in wr_buf. */
  int wr_index;

  /*
   * Write callback for libcurl (called within the context of
   * curl_easy_perform): appends one chunk of received data to wr_buf.
   *
   * buffer: the received bytes (size * nmemb of them).
   * userp:  points to an int error flag; set to 1 when the chunk does not
   *         fit. May be NULL, in which case no flag is written.
   *
   * Returns the number of bytes consumed (size * nmemb) on success, or 0 when
   * the chunk would overflow wr_buf, which tells curl to abort the transfer.
   */
  size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp)
  {
      int fits = wr_index >= 0 && wr_index <= MAX_BUF;

      if (fits && size != 0) {
          /* Divide instead of multiplying so size * nmemb cannot wrap
           * around and slip past the bounds check. */
          size_t room = (size_t)(MAX_BUF - wr_index);
          fits = nmemb <= room / size;
      }

      if (!fits) {
          if (userp != NULL)
              *(int *)userp = 1;
          return 0;
      }

      size_t segsize = size * nmemb;

      /* Copy the data from the curl buffer into our buffer. */
      if (segsize > 0)
          memcpy(&wr_buf[wr_index], buffer, segsize);

      /* Update the write index and keep the buffer NUL-terminated. */
      wr_index += (int)segsize;
      wr_buf[wr_index] = 0;

      /* Reporting every byte as consumed tells curl that all is okay. */
      return segsize;
  }
  31. /*
  32. * Simple curl application to read the index.html file from a web site.
  33. */
  34. Int main (void)
  35. {
  36. Curl * curl;
  37. Curlcode ret;
  38. Int wr_error;
  39. Wr_error = 0;
  40. Wr_index = 0;
  41. /* First step, init curl */
  42. Curl = curl_easy_init ();
  43. If (! Curl ){
  44. Printf ("couldn't init curl \ n ");
  45. Return 0;
  46. }
  47. /* Tell curl the URL of the file we're re going to retrieve */
  48. Curl_easy_setopt (curl, curlopt_url, "www.exampledomain.com ");
  49. /* Tell curl that we'll receive data to the function write_data, and
  50. * Also provide it with a context pointer for our error return.
  51. */
  52. Curl_easy_setopt (curl, curlopt_writedata, (void *) & wr_error );
  53. Curl_easy_setopt (curl, curlopt_writefunction, write_data );
  54. /* Allow curl to perform the action */
  55. Ret = curl_easy_perform (curl );
  56. Printf ("ret = % d (write_error = % d) \ n", RET, wr_error );
  57. /* Emit the page if curl indicates that no errors occurred */
  58. If (ret = 0) printf ("% s \ n", wr_buf );
  59. Curl_easy_cleanup (curl );
  60. Return 0;
  61. }
  Original post: http://yang7229693.iteye.com/blog/855208

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.