This article builds a novel crawler with cURL and regular expressions. It downloads the non-VIP chapters of a novel from the Chinese site Motie ("grinding iron", www.motie.com); you enter the novel's ID and the novel is downloaded.
Dependencies: Curl
The code is simple to follow: it uses cURL, regular expressions, Ajax and a few other techniques, so it is suitable for beginners. When testing locally, make sure the machine is connected to the network and that PHP's cURL extension is enabled.
SpiderTools.class.php
The code is as follows:
<?php
// Start the session only once: getinfo.php also calls session_start() before
// including this file, and a second call would raise a notice.
if (session_status() === PHP_SESSION_NONE) {
    session_start();
}
// Left disabled, as in the original listing.
// NOTE(review): presumably an auto-refresh to resume after a dropped connection - confirm before enabling.
#header("refresh:30;http://www.test.com:8080");

/**
 * Crawler helpers for novels hosted on www.motie.com.
 *
 * Pages are fetched with cURL and parsed with regular expressions. Chapters
 * are identified by an ID of the form "<bookId>_<chapterId>"; a purely
 * numeric ID identifies a whole book.
 */
class SpiderTools
{
    /** Common prefix of every book and chapter URL. */
    const BOOK_URL = 'http://www.motie.com/book/';

    /**
     * Download one page.
     *
     * The request is executed exactly once (the original listing called
     * curl_exec() twice per page, fetching everything two times).
     *
     * @param string $url Absolute URL to fetch.
     * @return array Two elements: the response body (string) or false on
     *               failure, and the cURL error message ('' on success).
     */
    private static function fetch($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);    // return the body instead of printing it
        curl_setopt($ch, CURLOPT_HEADER, 0);            // keep response headers out of the body
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0); // 0 = no explicit connect timeout, as in the original
        $body = curl_exec($ch);
        // Strict comparison: an empty body is a valid response, not an error.
        $error = ($body === false) ? curl_error($ch) : '';
        curl_close($ch);

        return array($body, $error);
    }

    /**
     * Convert a UTF-8 path to GBK so that the file name is not garbled.
     *
     * NOTE(review): GBK names presumably target a Chinese-locale Windows
     * file system - confirm for other platforms.
     *
     * @param string $text UTF-8 text.
     * @return string The GBK bytes, or $text unchanged when conversion fails.
     */
    private static function toGbk($text)
    {
        $converted = iconv("UTF-8", "GBK", $text);

        return ($converted === false) ? $text : $converted;
    }

    /**
     * Resolve the page title for a book or chapter ID.
     *
     * For a chapter page the title looks like
     * "<book name>_<chapter number> <chapter name> ..._<site name>".
     *
     * NOTE(review): the original listing used a different pattern for purely
     * numeric (whole book) IDs, but that pattern was lost from the listing.
     * The <title> pattern is used for both cases here - confirm against the
     * live book page if the numeric case is needed.
     *
     * Terminates the script with the cURL error message when the download
     * fails, as the original did.
     *
     * @param string $aid Book ID, or chapter ID "<bookId>_<chapterId>".
     * @return string|null Contents of the <title> element, or null if absent.
     */
    public function getBookNameById($aid)
    {
        $pattern = '/<title>(.*)<\/title>/';

        list($output, $error) = self::fetch(self::BOOK_URL . $aid);
        if ($output === false) {
            die($error);
        }

        if (preg_match($pattern, $output, $match) !== 1) {
            return null;
        }

        return $match[1];
    }

    /**
     * Fetch a chapter page and return the chapter text.
     *
     * The text is the content of the <pre id="html_content_N"> element inside
     * <div class="page-content">.
     *
     * NOTE(review): the original matched the <pre> tag with its exact
     * attribute list, dropped the first 180 bytes of the match and split on a
     * delimiter that was lost from the listing. A capture group replaces that
     * fixed offset here - confirm the result against a live chapter page.
     *
     * Terminates the script with the cURL error message when the download
     * fails, as the original did.
     *
     * @param string $aid Chapter ID "<bookId>_<chapterId>".
     * @return string Trimmed chapter text, or '' when the page does not match.
     */
    public function getBookContextById($aid)
    {
        $pattern = '/<div class="page-content">[\s\S]*?<pre\b[^>]*\bid="html_content_\d*"[^>]*>([\s\S]*?)<\/pre>/ui';

        list($output, $error) = self::fetch(self::BOOK_URL . $aid);
        if ($output === false) {
            die($error);
        }

        // preg_match() returns 0 on no match and false on error (for example
        // invalid UTF-8 under the /u modifier); both yield an empty chapter.
        if (preg_match($pattern, $output, $match) !== 1) {
            return '';
        }

        return trim($match[1]);
    }

    /**
     * Generate the novel files; can be called directly.
     *
     * - Non-numeric $id ("<bookId>_<chapterId>"): download that chapter and
     *   write it to "./<book name>/<book name>_<chapter number>.txt".
     * - Numeric $id: fetch the chapter list of the book and download every
     *   chapter. Progress is stored in $_SESSION["<id>_fid"] so that a later
     *   request resumes where the previous one stopped.
     *
     * Progress and error messages are echoed as HTML.
     *
     * @param string $id Book ID or chapter ID.
     * @return void
     */
    public static function createBookById($id)
    {
        if (!is_numeric($id)) {
            echo "<br/>init BEGIN START write!";
            $st = new self();
            $cons = trim($st->getBookContextById($id));
            $title = $st->getBookNameById($id);
            if ($title === null || $title === '') {
                // Without a title there is no directory or file name to write to.
                echo "<br/><font color='red'>Could not read the title of "
                    . htmlspecialchars($id) . ", chapter skipped</font>";
                return;
            }

            // "<book>_<chapter number> <chapter name> ..." -> keep the part before the first space.
            $t = explode(" ", $title);
            $dir = explode("_", $t[0]);
            $wzdir = $dir[0];                             // book name, used as directory name
            $wzchapter = isset($dir[1]) ? $dir[1] : '';   // chapter number

            // Convert into a separate variable: $wzdir is reused below to build
            // the file name and must not be encoded twice.
            $wzdir2 = self::toGbk($wzdir);
            if (!file_exists($wzdir2) && !mkdir($wzdir2)) {
                echo "<br/><font color='red'>Could not create the directory for $wzdir</font>";
                return;
            }

            // Build the file name and make sure it is not garbled on disk.
            $wztitle = self::toGbk("./" . $wzdir . "/" . $t[0] . ".txt");
            if (file_put_contents($wztitle, $cons) === false) {
                echo "<br/><font color='red'>Could not write $wzdir $wzchapter</font>";
                return;
            }

            echo "<font color='green'>$wzdir</font>" . $wzchapter . "<font color='red'> Write success </font>";
            return;
        }

        $ids = self::getBookIdsById($id);

        // The connection may drop part-way through, so the index of the next
        // chapter to download is kept in the session.
        $key = $id . "_fid";
        if (!isset($_SESSION[$key])) {
            // The original never initialised this and so skipped chapter 0.
            $_SESSION[$key] = 0;
        }

        $total = count($ids);
        // "<" rather than "<=": the original read one element past the end.
        for ($i = $_SESSION[$key]; $i < $total; $i++) {
            self::createBookById($id . "_" . $ids[$i]);
            $_SESSION[$key] = $i + 1;
        }
    }

    /**
     * Get the IDs of all chapters of a novel.
     *
     * Reads the book's "/chapter" page and collects the chapter part of every
     * link of the form /book/<aid>_<chapterId> found after an
     * <li class='' createdate='YYYY-MM-DD HH:MM:SS'> element.
     *
     * A cURL failure is reported with an echoed message and yields an empty
     * list (the original also continued after echoing the error).
     *
     * @param string $aid Book ID.
     * @return array Chapter IDs (digit strings) in page order.
     */
    public static function getBookIdsById($aid)
    {
        // Lazy quantifiers keep each match as short as possible.
        $pattern = '/<li class=\'\' createdate=\'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\'>[\s\S]*?<a href=\'\/book\/'
            . preg_quote((string) $aid, '/')
            . '_(\d*)\'\s>.*?<\/a>/u';

        list($output, $error) = self::fetch(self::BOOK_URL . $aid . "/chapter");
        if ($output === false) {
            echo 'Curl error: ' . $error;
            return array();
        }

        $matches = array();
        if (!preg_match_all($pattern, $output, $matches, PREG_PATTERN_ORDER)) {
            return array();
        }

        return $matches[1];
    }
}
?>
getinfo.php
The code is as follows:
<?php
// Ajax entry point: getinfo.php?bid=<novel id>
// Downloads every chapter of the requested novel and echoes progress as HTML.

// Start the session only if it is not active yet, so including the class file
// (which may start it too) does not raise a notice.
if (session_status() === PHP_SESSION_NONE) {
    session_start();
}
require_once("SpiderTools.class.php");

// Read the parameter defensively: it may be missing altogether.
$raw = isset($_REQUEST["bid"]) ? $_REQUEST["bid"] : "";
if ($raw) {
    // Trim before validating so surrounding whitespace does not decide the
    // outcome; a non-string value (e.g. bid[]=1) is rejected as invalid.
    $bid = is_string($raw) ? trim($raw) : "";
    if (is_numeric($bid)) {
        SpiderTools::createBookById($bid);
    } else {
        echo "<br/> Please enter the correct article id<br/>";
    }
}
?>
index.html
The code is as follows:
<!DOCTYPE html>
<html>
<head>
<title> Download Novel </title>
</head>
<body>
<!-- Pressing Enter would submit the form with "myID" instead of "bid", so route it through createBook() too. -->
<form method="get" action="getinfo.php" onsubmit="createBook(); return false;">
<input type="text" id="myID" name="myID" value=""/>
<input type="button" value="Generate novel" onclick="createBook();" />
</form>
<div id="Info" style="background:black;height:500px;width:1067px;overflow:scroll;color:white">
</div>
<!-- AJAX -->
<script>
// Request object of the download currently in progress (shared with stateChanged).
var xmlHttp = null;

/**
 * Read the novel ID from the text box and ask getinfo.php to download it.
 * Progress and the final result are shown in the "Info" box by stateChanged().
 */
function createBook()
{
    xmlHttp = getXmlHttpObject();
    if (xmlHttp === null) {
        alert("Browser does not support Ajax");
        return;
    }
    var bookId = document.getElementById("myID").value;
    var url = "getinfo.php";
    // "?" starts the query string; without it the request goes to "getinfo.phpbid=...".
    url = url + "?bid=" + encodeURIComponent(bookId);
    // Random value so the browser does not answer from its cache.
    url = url + "&sid=" + Math.random();
    xmlHttp.onreadystatechange = stateChanged;
    xmlHttp.open("GET", url, true);
    xmlHttp.send(null);
}

/**
 * readystatechange handler: show a status message for each phase of the
 * request and the server's HTML output once the request has completed.
 */
function stateChanged()
{
    var info = document.getElementById("Info");
    if (xmlHttp.readyState == 1) {
        info.innerHTML = "is preparing to work, please be patient Oh ~ ^_^ ~<br/>";
    }
    if (xmlHttp.readyState == 2) {
        info.innerHTML = "is contacting the server, which may take a little time ^&gt;&lt;^<br/>";
    }
    if (xmlHttp.readyState == 3) {
        info.innerHTML = "Parsing data <br/>";
    }
    if (xmlHttp.readyState == 4 || xmlHttp.readyState == "complete") {
        // The response is HTML produced by getinfo.php, so it is inserted as markup.
        info.innerHTML = xmlHttp.responseText;
        xmlHttp.abort();
    }
}

/**
 * Create a request object, falling back to ActiveX for old Internet Explorer.
 * Returns null when the browser offers none of them.
 */
function getXmlHttpObject()
{
    var request = null;
    try {
        // Firefox, Opera 8.0+, Safari and every current browser
        request = new XMLHttpRequest();
    } catch (e) {
        // Internet Explorer
        try {
            request = new ActiveXObject("Msxml2.XMLHTTP");
        } catch (e2) {
            try {
                request = new ActiveXObject("Microsoft.XMLHTTP");
            } catch (e3) {
                // Nothing available: the caller reports the missing Ajax support.
                request = null;
            }
        }
    }
    // Return the local object (the original returned the global variable).
    return request;
}
</script>
</body>
</html>
PS: On the subject of regular expressions, here are two of this site's online regular-expression tools for your reference (they cover generating, matching, validating and other functions):
JavaScript regular expression online testing tool: http://tools.jb51.net/regex/javascript
Regular expression online generation tool: http://tools.jb51.net/regex/create_reg