Capturing webpage data using curl and regular expressions

Source: Internet
Author: User
Capturing webpage data using curl and regular expressions
This example uses curl and regular expressions to capture the non-VIP chapters of novels from a Chinese online-novel website; enter a novel's ID to download that novel.
Dependency: curl
The example makes simple use of curl, regular expressions, AJAX, and related techniques, so it is suitable for beginners. When testing locally, make sure the network is connected and the curl extension is enabled in PHP.

  1. Session_start ();
  2. // Encapsulate the content into a class to enable automatic article capturing
  3. # Header ("Refresh: 30; http://www.test.com: 8080 ");
  4. Class SpiderTools {
  5. //////////////////////////////////////// //////////////////////////////////////// //////////////////////////
  6. /* Input the article ID to parse the article title */
  7. //////////////////////////////////////// //////////////////////////////////////// //////////////////////////
  8. Public function getBookNameById ($ aid ){
  9. // Initialize curl
  10. $ Ch = curl_init ();
  11. // Url
  12. $ Url = 'http: // www.motie.com/book/'.w.aid;
  13. If (is_numeric ($ aid )){
  14. // Regular expression matching
  15. $ Ru = "/\ s * (. *) \ s * <\/a> \ s * <\/h1> /";
  16. }
  17. Else {
  18. //The Family Survival path of the zombie outbreak _ Chapter 1 The zombie outbreak is updated for my friendly music ~ _ Iron grinding
  19. $ Ru = "/(. *) <\/Title>/"; <li >}< li> // Set options, including URL <li> curl_setopt ($ ch, CURLOPT_URL, $ url ); <li> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <li> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <li> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <li> // run the curl <li> $ output = curl_exec ($ ch ); <li> // error message <li> if (curl_exec ($ ch) === false) {<li> die (curl_error ($ ch )); <li >}< li> // check for errors <li> if (curl_errno ($ ch) {<li> echo 'curl error :'. curl_error ($ ch); <li >}< li> // release the curl handle <li> curl_close ($ ch); <li> $ arr = array (); <li> preg_match_all ($ ru, $ output, $ arr); <li> return $ arr [1] [0]; <li >}< li> ///////////////////////////////// //////////////////////////////////////// /// // <li> /* ID parsing article content */<li> /////////////////////////////// //////////////////////////////////////// /// // <li> public function getBookContextById ($ aid) {<li> // start parsing the article <li> $ ids = array (); <li> $ ids = explode ("_", $ aid ); <li> $ titleId = trim ($ ids [0]); <li> $ aticleId = trim ($ ids [1]); <li> $ ch = curl_init (); <li> $ ru = "/<p class = \" page-content \ "> [\ s \ S] * <pre ondragstart = \" return false \ "oncopy = \ "return false; \ "oncut = \" return false; \ "oncontextmenu = \" return false \ "class = \" note \ "id = \" html_content _ \ d * \ "> [\ s \ S] * (. *) <\/pre>/ui "; <li> $ url =' http://www.motie.com/book/ '. 
$ Aid; <li> // regular expression match <li> // Set options, including URL <li> curl_setopt ($ ch, CURLOPT_URL, $ url ); <li> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <li> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <li> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <li> // run the curl <li> $ output = curl_exec ($ ch ); <li> // error message <li> if (curl_exec ($ ch) === false) {<li> die (curl_error ($ ch )); <li >}< li> // check for errors <li> if (curl_errno ($ ch) {<li> echo 'Curl error :'. curl_error ($ ch); <li >}< li >$ arr = array (); <li >$ arr2 = array (); <li> preg_match_all ($ ru, $ output, $ arr); <li> curl_close ($ ch); <li ># var_dump ($ arr ); <li> $ s = $ arr [0] [0]; <li> $ s = substr ($ s, 180 ); <li> $ arr2 = explode ("return trim ($ arr2 [0]); <li >}< li> <li> ////////////////////////////// //////////////////////////////////////// /// // <li>/ * static method @ generate novel file can be called directly */<li> ////////////////// //////////////////////////////////////// //////////////////////////////////////// ///// <Li> public static function createBookById ($ id) {<li> if (! Is_numeric ($ id) {<li> echo "<br/> init begin start write! "; <Li> $ st = new self (); <li> $ cons = $ st-> getBookContextById ($ id ); <li> $ title = $ st-> getBookNameById ($ id); <li> $ cons = trim ($ cons); <li> $ t = explode ("", $ title); <li> // Construct the directory <li> $ dir = array (); <li> $ dir = explode ("_", $ t [0]); <li> $ wzdir = $ dir [0]; // The name of the book as the directory name <li> $ wzchapter = $ dir [1]; // Chapter <li> // create a directory <li> $ wzdir2 = iconv ("UTF-8", "GBK", $ wzdir ); // Directory encoding note that the reference to the $ wzdir string is retained here to construct the file name. it cannot be used here to prevent secondary encoding <li> if (! 
File_exists ($ wzdir2) {<li> mkdir ($ wzdir2); // create a directory <li >}< li> // Construct a file name <li> $ wztitle = ". /". $ wzdir. "/". "$ t [0]". ". txt "; <li> // ensure that the name of the saved file is not garbled <li> $ wztitle = iconv (" UTF-8 "," GBK ", $ wztitle ); <li> $ f = fopen ($ wztitle, "w +"); <li> fwrite ($ f, $ cons); <li> echo "$ wzdir ". $ wzchapter. "Write successful"; <li> fclose ($ f); <li> <li >}< li> else {<li> $ ids = self :: getBookIdsById ($ id); <li> // The server may be offline, so it is best to use session record loop <li> # for ($ I =$ _ SESSION ["$ id ". "_ fid"]; $ I <= count ($ ids); $ _ SESSION ["$ id ". "_ fid"] ++, $ I ++) {<li> # self: createBookById ($ id. "_". $ ids [$ _ SESSION ["$ id ". "_ fid"] + +]); // Construct the id <li >#}< li> <li> for ($ I =$ _ SESSION ["$ id ". "_ fid"]; $ I <= count ($ ids); $ _ SESSION ["$ id ". "_ fid"] ++, $ I ++) {<li> self: createBookById ($ id. "_". $ ids [$ I]); // Construct the id <li >}< li> <li> # echo "<pr/> <br/> the write operation is complete "; <li> # echo $ id. "_". $ ids [0]. "<br/>"; <li> # var_dump ($ ids ); <li> <li >}< li>/* <li> obtain all novel IDs <li> @ param $ ID article id <li> @ return array; <li> */<li> public static function getBookIdsById ($ aid) {<li> $ ch = curl_init (); <li> $ url =' http://www.motie.com/book/ '. $ Aid. "/chapter"; <li> // pay attention to this? You can obtain the minimum matching item <li> $ ru = '/[\ s \ S] *? <Li class = \ "\" createdate = \ "\ d {4} \-\ d {2} \-\ d {2} \ d {2 }: \ d {2 }:\ d {2} \ "> [\ s \ S] *?. *? <\/A> .*? 
/U'; // regular expression match <li> // Set options, including URL <li> curl_setopt ($ ch, CURLOPT_URL, $ url ); <li> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <li> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <li> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <li> // run the curl <li> $ output = curl_exec ($ ch ); <li> // check for errors <li> if (curl_errno ($ ch) {<li> echo 'curl error :'. curl_error ($ ch); <li >}< li> // release the curl handle <li> curl_close ($ ch); <li> $ arr = arr Ay (); <li> preg_match_all ($ ru, $ output, $ arr, PREG_PATTERN_ORDER); <li> return $ arr [1]; <li >}< li> <li>?> </Ol> </p> <em onclick = "copycode ($ ('code _ sfk ')); "> </em> </p> <p class =" blockcode "> <p id =" code_Zt6 "> <ol> <li> <? Php <li> session_start (); <li> require_once ("SpiderTools. class. php "); <li> if ($ _ REQUEST [" bid "]) {<li> if (is_numeric ($ _ REQUEST [" bid "]) {<li> SpiderTools: createBookById (trim ($ _ REQUEST ["bid"]); <li >}< li> else {<li> echo "<br/> enter the correct article ID <br/> "; <li >}< li >?> <Li> </ol> </p> <em onclick = "copycode ($ ('code _ Zt6 ')); "> </em> </p> <p class =" blockcode "> <p id =" code_ievaluate "> <ol> <li> <ptml> <li> <pead> <meta charset = "UTF-8"/> </pead> <li> <title> download novels
  20. Enter the ID number of the novel you want to see to download the novel.

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.