This is one of my scripts for scraping web pages; I am keeping a copy of the code here so I can reuse it later. The formatting was lost when the code was pasted, so it is hard to read. Comments and suggestions are welcome.
/*
 * snatch: scrape news articles from news.2sche.cn and store them.
 *
 * Flow, as far as the visible code shows:
 *   1. Removes the execution time limit and records a benchmark start mark.
 *   2. Loops over category numbers 1..4 (the "Stype" query parameter).
 *   3. Fetches the category list page through curl_snatch() and captures the
 *      text after "d/" inside a <strong> element into $page_news, which is
 *      then used as the upper bound of the page loop.
 *   4. For every page, fetches the list page (page 1 uses the URL without a
 *      "Page" parameter) and captures the article links into $url_newslist.
 *   5. For every article link, fetches the article and runs patterns for the
 *      title, the source and the content cell.
 *   6. For every image URL captured from the content, skips the ones for
 *      which the strpos() check fails, downloads the rest with
 *      file_get_contents(), writes them under ./static/uploads/news/ using
 *      the last path segment as file name, and replaces the original URL in
 *      the content with /static/uploads/news/<name>. The file names are
 *      accumulated in $picture, each followed by ':'.
 *   7. Builds the $data array (title, source, contents, picture, category,
 *      creation time) and passes it to News_model->add(); a false return just
 *      moves on to the next article.
 *   8. Records the benchmark end mark and echoes the elapsed time.
 *
 * Takes no parameters and has no return statement; its results are the rows
 * handed to News_model->add(), the image files written to disk and the
 * echoed output.
 *
 * NOTE(review): this listing appears to have been damaged when it was
 * published (HTML-like text stripped, spaces inserted inside "$ var" and
 * operators, identifiers capitalised). In particular:
 *   - the pattern of the "Obtain the title" Preg_match_all call is cut off
 *     after the opening delimiter, so $title is never visibly populated;
 *   - the pattern of the image Preg_match_all call is empty;
 *   - the article URL is built with ".w.url_newslists", presumably meant to
 *     append the loop variable $url_newslists -- TODO confirm;
 *   - the Echo statement after the article loop is an unterminated string.
 *   These parts cannot be reconstructed from this text alone.
 * NOTE(review): "1 = $ j" and "strpos (...) = false" are written with a
 *   single "="; a comparison ("==" / "===") was presumably intended -- verify
 *   against the original source.
 * NOTE(review): file_get_contents() and File_put_contents() results are not
 *   checked, and $title / $source / $content / $img indexes are read without
 *   verifying that the patterns matched.
 */
Public function snatch ()
{
Set_time_limit (0 );
$ This-> benchmark-> mark ('Code _ start ');
/* Get second-hand car news of different categories */
For ($ I = 1; $ I <= 4; $ I ++)
{
$ Url = 'HTTP: // news.2sche.cn/list.asp? Stype = '. $ I;
$ Result = $ this-> curl_snatch ($ url );
Preg_match_all ('/<strong> \ d \/(.*?) <\/Strong>/', $ result, $ page_news );
// Print_r ($ page_news );
// Echo '/* Obtain the news list of all pages in a single category */
/* NOTE(review): $page_news[1] is the capture-group array filled by Preg_match_all, yet it is used directly as the loop bound below; presumably its first element (the page count) was intended -- TODO confirm. */
For ($ j = 1; $ j <= $ page_news [1]; $ j ++)
{
If (1 = $ j)
{
$ Url_news = 'HTTP: // news.2sche.cn/list.asp? Stype = '. $ I;
}
Else
{
$ Url_news = 'HTTP: // news.2sche.cn/list.asp? Page = '. $ j.' & stype = '. $ I;
}
$ Result_news = $ this-> curl_snatch ($ url_news );
Preg_match_all ('/<td width = "516" height = "28" class = "z14"> <a href = "(.*?) "Target =" _ blank "> .*? <\/A> <\/td>/sim ', $ result_news, $ url_newslist );
// Print_r ($ url_newslist );
/* Traverse each url on the list page */
Foreach ($ url_newslist [1] as $ url_newslists)
{
$ Url_newsinfo = 'HTTP: // news.2sche.cn/'.w.url_newslists;
$ Result_newsinfo = $ this-> curl_snatch ($ url_newsinfo );
/* Obtain the title */
/* NOTE(review): the statement below is truncated in this listing -- the title pattern, subject and output variable are missing. */
Preg_match_all ('/
// Print_r ($ title [1]);
/* Obtain the source */
Preg_match_all ('/<td style = "BORDER-BOTTOM: #666666 1PX DASHED" width = "155"> <span class = "right"> [Source :(.*?) & Nbsp;] <\/span> <\/td>/sim ', $ result_newsinfo, $ source );
// Print_r ($ source [1]);
/* Get content */
Preg_match_all ('/<td colspan = "2" class = "z14" style = "padding-top: 20px; padding-left: 1px; padding-bottom: 20px; line-height: 25px "> (. *?) <\/Td>/sim ', $ result_newsinfo, $ content );
// Print_r ($ content [1] [0]);
/* Get all image URLs in the content */
// Preg_match_all ('//Sim ', $ content [1] [0], $ img );
Preg_match_all ('/ /Sim ', $ content [1] [0], $ img );
// Echo 'ddddd ';
// Print_r ($ img [1]);
// Echo 'dddd <br> ';
// Exit;
$ Picture = '';
Foreach ($ img [1] as $ imgs)
{
// Echo $ imgs;
// Echo '<br> ';
If (strpos ($ imgs, 'HTTP: // ') = false)
{
Continue;
}
$ Img_source = file_get_contents ($ imgs );
/* Obtain the name of a single image */
$ Img_names = trim (strrchr ($ imgs ,'/'),'/');
// Print_r ($ img_name );
// Echo $ img_names;
// Exit;
$ Picture. = $ img_names .':';
File_put_contents ("./static/uploads/news/". $ img_names, $ img_source );
// Image path replacement
$ Img_path = '/static/uploads/news/'. $ img_names;
$ Content [1] [0] = str_replace ($ imgs, $ img_path, $ content [1] [0]);
}
// Print_r ($ picture );
// Echo 'hhhh <br> ';
// Print_r ($ content [1] [0]);
// Echo '<br> ';
$ Data = array (
'Title' => $ title [1] [0],
'Source' => $ source [1] [0],
'Contents' => trim ($ content [1] [0]),
'Picture' => $ picture,
'Style' => $ I,
'Create _ time' => time (),
);
If (! $ This-> News_model-> add ($ data ))
{
Continue;
}
// Print_r ($ data); exit;
}
Echo '
}
}
$ This-> benchmark-> mark ('Code _ end ');
Echo $ this-> benchmark-> elapsed_time ('Code _ start', 'Code _ end ');
}
/**
 * Download a page and return its body as trimmed text.
 *
 * Uses cURL when the extension is loaded and falls back to
 * file_get_contents() otherwise. On both paths the body is converted from
 * GBK to UTF-8, so callers get the same encoding whichever transport was
 * used (previously only the cURL path converted).
 *
 * @param string $url Address to fetch; surrounding whitespace is stripped.
 * @return string Trimmed page body, or '' when the download failed.
 */
function curl_snatch($url = 'http://www.2sche.cn/buy.asp')
{
    $url = trim($url);

    if (extension_loaded('curl'))
    {
        // 1. Open a handle and configure the request.
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
        curl_setopt($ch, CURLOPT_HEADER, 0);         // body only, no response headers

        // 2. Execute. The result must be compared with ===, because an
        //    empty body ('') is a valid response and is not a failure.
        $output = curl_exec($ch);
        if ($output === FALSE)
        {
            echo "cURL Error:" . curl_error($ch);
        }

        // 3. Always release the handle, also after a failed transfer.
        curl_close($ch);
    }
    else
    {
        $output = file_get_contents($url);
    }

    // Nothing was downloaded: do not feed FALSE into iconv().
    if ($output === FALSE)
    {
        return '';
    }

    // The code assumes the remote pages are GBK encoded -- TODO confirm.
    // iconv() returns FALSE when the input cannot be converted; in that
    // case keep the raw bytes rather than discarding the whole page.
    $content = iconv("GBK", "UTF-8", $output);
    if ($content === FALSE)
    {
        $content = $output;
    }

    return trim($content);
}
Author: Li jiashun