This article shows how to use PHP with curl and regular expressions to capture web page data. The example captures a novel from a website; if you need to, you can adapt it to capture other data. It uses curl and regular expressions to fetch the non-VIP chapters of a novel from a Chinese online novel site, and you enter the novel's ID to download the novel.
Dependency: curl
The example makes simple use of curl, regular expressions, ajax and other techniques, so it is suitable for beginners. When testing locally, make sure the network is connected and the curl extension is enabled in PHP.
SpiderTools.class.php
The code is as follows:
session_start();
// Encapsulate the capture logic in a class to enable automatic article capturing
# header("Refresh: 30; http://www.test.com:8080");
class SpiderTools {
//////////////////////////////////////// //////////////////////////////////////// //////////////////////////
/* Input the article ID to parse the article title */
//////////////////////////////////////// //////////////////////////////////////// //////////////////////////
public function getBookNameById($aid) {
// Initialize curl
$ch = curl_init();
// URL
$url = 'http://www.motie.com/book/' . $aid;
if (is_numeric($aid)) {
// Regular expression matching
$ru = "/\s*(.*)\s*<\/a>\s*<\/h1>/";
}
else {
//The Family Survival path of the zombie outbreak _ Chapter 1 The zombie outbreak is updated for my friendly music ~ _ Iron grinding
$ Ru = "/(. *) <\/Title>/"; <BR >}< BR> // Set options, including URL <BR> curl_setopt ($ ch, CURLOPT_URL, $ url ); <BR> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <BR> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <BR> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <BR> // execute curl <BR> $ output = curl_exec ($ ch ); <BR> // error message <BR> if (curl_exec ($ ch) === false) {<BR> die (curl_error ($ ch )); <BR >}< BR> // check for errors <BR> if (curl_errno ($ ch) {<BR> echo 'curl error :'. curl_error ($ ch); <BR >}< BR> // release the curl handle <BR> curl_close ($ ch); <BR> $ arr = array (); <BR> preg_match_all ($ ru, $ output, $ arr); <BR> return $ arr [1] [0]; <BR >}< BR> ///////////////////////////////// //////////////////////////////////////// /// // <BR> /* ID parsing article content */<BR> /////////////////////////////// //////////////////////////////////////// /// // <BR> public function getBookContextById ($ aid) {<BR> // start parsing the article <BR> $ ids = array (); <BR> $ ids = explode ("_", $ aid ); <BR> $ titleId = trim ($ ids [0]); <BR> $ aticleId = trim ($ ids [1]); <BR> $ ch = curl_init (); <BR> $ ru = "/<p class = \" page-content \ "> [\ s \ S] * <pre ondragstart = \" return false \ "oncopy = \ "return false; \ "oncut = \" return false; \ "oncontextmenu = \" return false \ "class = \" note \ "id = \" html_content _ \ d * \ "> [\ s \ S] * (. *) <\/pre>/ui "; <BR> $ url =' http://www.motie.com/book/ '. 
$ Aid; <BR> // regular expression matching </P> <P> // Set options, including URL <BR> curl_setopt ($ ch, CURLOPT_URL, $ url ); <BR> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <BR> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <BR> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <BR> // execute curl <BR> $ output = curl_exec ($ ch ); <BR> // error message <BR> if (curl_exec ($ ch) === false) {<BR> die (curl_error ($ ch )); <BR >}< BR> // check for errors <BR> if (curl_errno ($ c H) {<BR> echo 'curl error :'. curl_error ($ ch); <BR >}< BR> $ arr = array (); <BR> $ arr2 = array (); <BR> preg_match_all ($ ru, $ output, $ arr); <BR> curl_close ($ ch); <BR> # var_dump ($ arr ); <BR> $ s = $ arr [0] [0]; <BR> $ s = substr ($ s, 180 ); <BR> $ arr2 = explode ("return trim ($ arr2 [0]); <BR >}</P> <P> ///////////////////////////// //////////////////////////////////////// /// // <BR> /* static method @ generate novel file can be directly Call */<BR> ////////////////////////////////// //////////////////////////////////////// //// // <BR> public static function createBookById ($ id) {<br> if (! Is_numeric ($ id) {</P> <P> echo "<br/> init begin start write! "; <BR> $ st = new self (); <BR> $ cons = $ st-> getBookContextById ($ id ); <BR> $ title = $ st-> getBookNameById ($ id); <BR> $ cons = trim ($ cons); <BR> $ t = explode ("", $ title); <BR> // Construct a directory <BR> $ dir = array (); <BR> $ dir = explode ("_", $ t [0]); <BR> $ wzdir = $ dir [0]; // The name of the book as the directory name <BR> $ wzchapter = $ dir [1]; // Chapter <BR> // create a directory <BR> $ wzdir2 = iconv ("UTF-8", "GBK", $ wzdir ); // Directory encoding note that the reference to the $ wzdir string is retained here to construct the file name, which cannot be used here to prevent secondary encoding <BR> if (! 
File_exists ($ wzdir2) {<BR> mkdir ($ wzdir2); // create a directory <BR >}< BR> // Construct a file name <BR> $ wztitle = ". /". $ wzdir. "/". "$ t [0]". ". txt "; <BR> // ensure that the name of the saved file is not garbled <BR> $ wztitle = iconv (" UTF-8 "," GBK ", $ wztitle ); <BR> $ f = fopen ($ wztitle, "w +"); <BR> fwrite ($ f, $ cons); <BR> echo "$ wzdir ". $ wzchapter. "Write successful"; <BR> fclose ($ f); <br> <br >}< BR> else {<BR> $ ids = self :: getBookIdsById ($ id); <br> // The server may be offline, so it is best to use session record loop <BR> # for ($ I = $ _ SESSION ["$ id ". "_ fid"]; $ I <= count ($ ids); $ _ SESSION ["$ id ". "_ fid"] ++, $ I ++) {<br> # self: createBookById ($ id. "_". $ ids [$ _ SESSION ["$ id ". "_ fid"] + +]); // Construct the id <BR >#}< br> <br> for ($ I = $ _ SESSION ["$ id ". "_ fid"]; $ I <= count ($ ids); $ _ SESSION ["$ id ". "_ fid"] ++, $ I ++) {<br> self: createBookById ($ id. "_". $ ids [$ I]); // Construct the id <BR >}< br> <br> # echo "<pr/> <br/> the write operation is complete "; <BR> # echo $ id. "_". $ ids [0]. "<br/>"; <BR> # var_dump ($ ids ); <br> <br >}</P> <BR >}< BR>/* <BR> obtain all novel IDs <BR> @ param $ ID article id <BR> @ return array; <BR> */<BR> public static function getBookIdsById ($ aid) {<BR> $ ch = curl_init (); <BR> $ url =' http://www.motie.com/book/ '. $ Aid. "/chapter"; <BR> // pay attention to this? You can obtain the minimum matching item <BR> $ ru = '/[\ s \ S] *? <Li class = \ "\" createdate = \ "\ d {4} \-\ d {2} \-\ d {2} \ d {2 }: \ d {2 }:\ d {2} \ "> [\ s \ S] *?. *? <\/A> .*? 
/U'; // regular expression match <BR> // Set options, including URL <BR> curl_setopt ($ ch, CURLOPT_URL, $ url ); <BR> curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); // The content is not automatically output <BR> curl_setopt ($ ch, CURLOPT_HEADER, 0 ); // no header information is returned <BR> curl_setopt ($ ch, CURLOPT_CONNECTTIMEOUT_MS, 0); <BR> // execute curl <BR> $ output = curl_exec ($ ch ); <BR> // check for errors <BR> if (curl_errno ($ ch) {<BR> echo 'curl error :'. curl_error ($ ch); <BR >}< BR> // release the curl handle <BR> curl_close ($ ch); <BR> $ arr = array (); <BR> preg_match_all ($ ru, $ output, $ arr, PREG_PATTERN_ORDER); <BR> return $ arr [1]; <BR >}< BR >?> <BR> </p> </P> <P> getinfo. php <BR> <p class = "codetitle"> <U> </U> code is as follows: </p> <p class = "codebody" id = "code93004"> <BR> <? Php <BR> session_start (); <BR> require_once ("SpiderTools. class. php "); <BR> if ($ _ REQUEST [" bid "]) {<BR> if (is_numeric ($ _ REQUEST [" bid "]) {<BR> SpiderTools: createBookById (trim ($ _ REQUEST ["bid"]); <BR >}< BR> else {<BR> echo "<br/> enter the correct article ID <br/> "; <BR >}< BR >?> <BR> </p> </P> <P> index.html <BR> <p class = "codetitle"> <U> </U> the code is as follows: </p> <p class = "codebody" id = "code70978"> <BR> <ptml> <BR> <pead> <meta charset = "UTF-8"/> </ head> <BR> <title> download novels
Enter the ID number of the novel you want to see to download the novel.