Here is the general idea. Google's search page cannot be fetched with file_get_contents; you have to imitate a browser completely, so this script uses PHP's cURL functions and stores the cookies Google sends. Baidu is different: its result pages can be fetched directly with file_get_contents and then processed with regular expressions, so the Baidu version is not listed here.
<?php
// Declare the response as UTF-8 HTML. This must run before any output is
// produced, otherwise PHP can no longer send headers.
header('Content-Type: text/html; charset=UTF-8');
/**
 * Download one Google result page and return its HTML.
 *
 * The request imitates a desktop browser: fixed Firefox user agent, redirects
 * followed, and a cookie file that is both read and written so cookies set by
 * one request are sent with the next one.
 *
 * @param string $url Fully built search URL.
 * @return string Response body, or '' when the transfer failed.
 */
function ggsearch_fetch($url)
{
    // Outgoing address is picked at random from the aliases eth0:1 .. eth0:4.
    // NOTE(review): this presumably spreads requests over several IPs to avoid
    // being blocked; it only works on hosts where those aliases exist.
    $interface = 'eth0:' . rand(1, 4);
    $cookieFile = dirname(__FILE__) . '/temp/google.txt';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt(
        $ch,
        CURLOPT_USERAGENT,
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5'
    );
    curl_setopt($ch, CURLOPT_INTERFACE, $interface);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // COOKIEJAR alone only writes cookies when the handle is closed;
    // COOKIEFILE is what makes cURL send the stored cookies back.
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
    $contents = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; callers treat '' as "no results".
    return is_string($contents) ? $contents : '';
}

/**
 * Find the Google ranking of a site for one keyword and print it as HTML.
 *
 * Result pages are scanned from $page up to page 10 (10 results per page).
 * On the first result whose markup contains $url_s, the overall rank, the
 * page number and the position on that page are echoed and the search stops.
 * If nothing matches by page 10, a "not in the top 10 pages" line is echoed.
 *
 * @param string $url_s   Text to look for inside a result link (the site's domain).
 * @param string $keyword Search keyword.
 * @param int    $page    First result page to inspect (1-based).
 * @return void
 */
function ggsearch($url_s, $keyword, $page = 1)
{
    $maxPage = 10;
    // The keyword comes from a form field, so escape it before echoing it.
    $safeKeyword = htmlspecialchars((string) $keyword, ENT_QUOTES, 'UTF-8');
    $needle = (string) $url_s;

    for ($page = (int) $page; $page <= $maxPage; $page++) {
        $offset = ($page - 1) * 10;
        $url = 'http://www.google.com/search?q=' . urlencode((string) $keyword)
            . '&hl=en&prmd=imvns&ei=jpnjtvlfi8hlggexwdpm&start=' . $offset . '&sa=N';

        $contents = ggsearch_fetch($url);

        // Isolate the result container, then every result title link in it.
        // The patterns are case-insensitive because the original casing of the
        // expressions could not be recovered from the source listing.
        if (!preg_match_all('!<div\s*id="search">(.*)</div>\s+<\!--z-->!i', $contents, $blocks)) {
            continue; // nothing recognisable on this page (or download failed)
        }

        foreach ($blocks[0] as $block) {
            preg_match_all('!<h3\s+class="r"><a[^>]+>(.*?)</a>!i', $block, $title);

            foreach ($title[0] as $i => $hit) {
                if ($needle === '' || strpos($hit, $needle) === false) {
                    continue;
                }

                $position = $i + 1;           // rank within this page
                $rank = $position + $offset;  // rank across all pages

                // NOTE(review): $hit is raw markup taken from the downloaded
                // page and is echoed unescaped so the result link stays clickable.
                echo 'keyword ' . $safeKeyword . '<br>'
                    . 'ranking :' . '<font color="red" size="20">' . $rank . '</font>'
                    . '####'
                    . 'th ' . '<font color="#00FFFF" size="18">' . $page . '</font>' . 'page '
                    . 'th ' . '<font color="#8000FF" size="15">' . $position . '</font>' . 'name '
                    . $hit . '<br>';
                echo "<a href='" . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "'>"
                    . 'Click search result' . '</a>' . '<br>';
                // The original statement here was truncated in the listing;
                // a horizontal rule between results is assumed.
                echo '<hr>';

                return;
            }
        }
    }

    echo 'keyword ' . $safeKeyword . ' The Top 10 pages of the website are not ' . '<br>';
    // Truncated in the original listing as well; a separator is assumed.
    echo '<hr>';
}
/**
 * Split the raw textarea value into individual keywords.
 *
 * Keywords may be separated by "|" or by line breaks (LF or CRLF), in any
 * mix. Surrounding whitespace is removed and empty entries are dropped.
 *
 * @param string $raw Raw text as submitted by the form.
 * @return array List of non-empty, trimmed keywords (possibly empty).
 */
function ggsearch_split_keywords($raw)
{
    $keywords = array();
    foreach (preg_split('/[|\r\n]+/', (string) $raw) as $piece) {
        $piece = trim($piece);
        if ($piece !== '') {
            $keywords[] = $piece;
        }
    }

    return $keywords;
}

// Handle the form submission: look up the ranking of every keyword, then
// report how long the whole run took.
if (!empty($_POST['submit'])) {
    $start = microtime(true);

    // Guard against missing or non-string fields instead of reading them blindly.
    $more_key = (isset($_POST['textarea']) && is_string($_POST['textarea']))
        ? trim($_POST['textarea'])
        : '';
    $url_s = (isset($_POST['url']) && is_string($_POST['url']))
        ? trim($_POST['url'])
        : '';

    $exkey = ggsearch_split_keywords($more_key);

    // The original also built a "www."-prefixed copy of the URL here, but that
    // value was never used; ggsearch() has always received $url_s unchanged.
    if (count($exkey) > 0 && $url_s !== '') {
        foreach ($exkey as $keyword) {
            ggsearch($url_s, $keyword);
        }

        // The original statement here was truncated in the listing;
        // a horizontal rule before the timing line is assumed.
        echo '<hr>';
        echo 'program running time :';
        echo microtime(true) - $start;
    }
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>capture rankings</title>
</head>
<body>
<!-- Posts back to this same script; the PHP above reads "textarea", "url" and "submit". -->
<form action="" method="post">
<span>Keyword: </span><textarea name="textarea" rows="20" cols="40" wrap="off">
Format: keyword1 | keyword2 | keyword3
Or: keyword1
keyword2
keyword3
</textarea>
<span>url: </span><input type="text" name="url" />
<input type="submit" name="submit" value="Search" />
<!-- Listing source: www.2cto.com -->
</form>
</body>
</html>
From Shine's holy heaven-Min Chen 〃