PHP crawls website images remotely and saves them
In the past, whenever I saw someone on the Internet mention that they had written a program to capture web images, I thought it was amazing, and I wondered when I would write an image-capturing method of my own!
In the past couple of days I studied a piece of PHP image-capture code I found online, focusing on the regular expressions that match the img tag and its src attribute, and encapsulated it into a PHP remote image-capturing class. In testing, the speed was quite good: in about two minutes it captured more than 110 images from ini.iteye.com.
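Before the full listing, here is the core idea in isolation: one ungreedy regular expression pulls the src attribute out of every img tag on a fetched page. This is only a minimal sketch, and http://example.com/ is a placeholder URL:

<?php
// Fetch a page and print every image address found in its HTML.
$content = @file_get_contents('http://example.com/'); // @ suppresses fetch warnings

// Ungreedy (U modifier) pattern: match an <img ...> tag and capture its src value.
$img_pattern = "|<img[^>]+src=['\"]?([^'\"?]+)['\">]|U";
preg_match_all($img_pattern, $content, $img_out, PREG_SET_ORDER);

foreach ($img_out as $match) {
    echo $match[1] . "\n"; // the captured image URL
}
?>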
The code is as follows:
<?php
/**
 * PHP class for capturing remote images and saving them locally
 */
class download_image
{
    public $save_path;                  // directory where images are saved
    public $img_size = 0;               // only save images larger than this many bytes
    public static $a_url_arr = array(); // static array of page URLs already crawled

    public function __construct($save_path, $img_size)
    {
        $this->save_path = $save_path;
        $this->img_size  = $img_size;
    }

    /**
     * Recursively download the images on a page and on its sub-pages
     *
     * @param String $capture_url the URL of the page to capture images from
     */
    public function recursive_download_images($capture_url)
    {
        if (!in_array($capture_url, self::$a_url_arr)) { // not crawled yet
            self::$a_url_arr[] = $capture_url;           // record it in the static array
        } else {                                         // already crawled, exit the function
            return;
        }

        $this->download_current_page_images($capture_url); // download all images on the current page

        // use @ to suppress the warning raised when the capture address is unreadable
        $content = @file_get_contents($capture_url);

        // regular expression matching the href attribute of <a> tags
        $a_pattern = "|<a[^>]+href=['\"]?([^'\"?]+)['\">]|U";
        preg_match_all($a_pattern, $content, $a_out, PREG_SET_ORDER);

        $tmp_arr = array(); // hyperlink addresses to capture in this pass
        foreach ($a_out as $k => $v) {
            /**
             * Skip empty links, '#', '/' and duplicates:
             * 1. a link equal to the URL of the page currently being crawled would
             *    cause an endless loop;
             * 2. '', '#' and '/' also point to the current page and would loop forever;
             * 3. the same link can appear several times on one page; without
             *    deduplication the same sub-page would be downloaded repeatedly.
             */
            if ($v[1] && !in_array($v[1], self::$a_url_arr)
                      && !in_array($v[1], array('#', '/', $capture_url))) {
                $tmp_arr[] = $v[1];
            }
        }

        foreach ($tmp_arr as $k => $v) {
            // strict !== so that a match at position 0 is not mistaken for "not found"
            if (strpos($v, 'http://') !== false) { // absolute URL, access it directly
                $a_url = $v;
            } else { // otherwise it is a relative address: prepend the domain
                $domain_url = substr($capture_url, 0, strpos($capture_url, '/', 8) + 1);
                $a_url = $domain_url . $v;
            }
            $this->recursive_download_images($a_url);
        }
    }

    /**
     * Download all images on the current page
     *
     * @param String $capture_url the URL of the page to capture images from
     */
    public function download_current_page_images($capture_url)
    {
        $content = @file_get_contents($capture_url); // @ suppresses the warning

        // regular expression matching the src attribute of <img> tags
        $img_pattern = "|<img[^>]+src=['\"]?([^'\"?]+)['\">]|U";
        preg_match_all($img_pattern, $content, $img_out, PREG_SET_ORDER);

        $photo_num = count($img_out); // number of images matched
        echo '<br/>' . $capture_url . ' found ' . $photo_num . ' images';
        foreach ($img_out as $k => $v) {
            $this->save_one_img($capture_url, $v[1]);
        }
    }

    /**
     * Save a single image
     *
     * @param String $capture_url the URL of the page the image was found on
     * @param String $img_url     the URL of the image to save
     */
    public function save_one_img($capture_url, $img_url)
    {
        // resolve the image address
        if (strpos($img_url, 'http://') !== false) {
            // already an absolute URL, use it as-is
        } else { // relative address: prepend the domain
            $domain_url = substr($capture_url, 0, strpos($capture_url, '/', 8) + 1);
            $img_url = $domain_url . $img_url;
        }

        $pathinfo = pathinfo($img_url);    // path information of the image
        $pic_name = $pathinfo['basename']; // file name of the image
        if (file_exists($this->save_path . $pic_name)) { // already saved, so already crawled
            echo $img_url . ' image already crawled!<br/>';
            return;
        }

        // read the image into a string; @ suppresses the warning when the address is unreadable
        $img_data = @file_get_contents($img_url);
        if (strlen($img_data) > $this->img_size) { // only save images above the size threshold
            $img_size = file_put_contents($this->save_path . $pic_name, $img_data);
            if ($img_size) {
                echo $img_url . ' image saved successfully!<br/>';
            } else {
                echo $img_url . ' error while saving the image!<br/>';
            }
        } else {
            echo $img_url . ' error while reading the image!<br/>';
        }
    }
} // END

set_time_limit(120); // maximum execution time of the script; adjust as needed
$download_img = new download_image('E:/images/', 0); // instantiate the download object
$download_img->recursive_download_images('http://ini.iteye.com/'); // capture images recursively
// $download_img->download_current_page_images($_POST['capture_url']); // capture only the current page
?>
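Two details in the listing are worth noting. First, the version of this code floating around online compares strpos() with != false, which misclassifies an absolute URL because a match at position 0 is loosely equal to false; the listing above uses the strict !== comparison instead. Second, the relative-to-absolute conversion finds the first '/' at or after offset 8, which skips past 'http://' plus at least one host character. A small standalone sketch of both points (the URLs are placeholders):

<?php
// strpos() returns 0 when the needle starts the haystack; 0 == false under
// loose comparison, so "!= false" would wrongly report "not found".
var_dump(strpos('http://example.com/a', 'http://') != false);  // bool(false) -- the bug
var_dump(strpos('http://example.com/a', 'http://') !== false); // bool(true)  -- correct

// Domain extraction as used in the class: keep everything up to and
// including the first '/' found at or after offset 8.
$capture_url = 'http://ini.iteye.com/some/page.html';
$domain_url  = substr($capture_url, 0, strpos($capture_url, '/', 8) + 1);
echo $domain_url . "\n";       // http://ini.iteye.com/
echo $domain_url . 'blog/123'; // a relative link "blog/123" resolved
?>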