This article shows how to use PHP and cURL to scrape the search results of www.icbase.com (an ASP.NET site whose paging relies on posted form parameters) for a given keyword. For more information, see the code below.
The code is as follows:
/**
* HOST: www.icbase.com
*/
// Set_time_limit (0 );
// Base function
/**
 * Perform an HTTP GET request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Query parameters; an array is encoded with http_build_query().
 * @param array        $header  Extra request header lines.
 * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Remote port to connect to.
 * @param string       $reffer  Referer header value; not sent when empty.
 * @param string       $proxy   Proxy host; when set, proxy port 1723 and fixed credentials are used.
 * @return array 'result' holds the return value of curl_exec(); 'error' is set only when cURL reports an error.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    $ch = curl_init();
    if (!empty($data)) {
        $data = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns an offset (possibly 0) or false, so compare against false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $data;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and return the final page
    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }
    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        // FIXME: credentials are hard-coded; move them to configuration.
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }
    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);
    return $result;
}
The companion POST helper is as follows:
/**
 * Perform an HTTP POST request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Request body, passed straight to CURLOPT_POSTFIELDS.
 * @param array        $header  Extra request header lines; only applied when non-empty.
 * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Currently unused (the CURLOPT_PORT call is disabled); kept for callers.
 * @return array 'result' holds the return value of curl_exec(); 'error' is set only when cURL reports an error.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // curl_setopt($ch, CURLOPT_PORT, $port);
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);
    return $result;
}
/**
 * Fetch the HTML source of one search-result list page.
 *
 * Page 1 is fetched with GET, and the hidden ASP.NET form fields found in it
 * are stored in the constants __VIEWSTATE, __PREVIOUSPAGE and
 * __EVENTVALIDATION. Any other page is fetched by POSTing those values back
 * as a pager postback, so page 1 must have been fetched first.
 *
 * @param string $keywords Search keyword
 * @param int    $page     Page number (0 is treated as 1)
 * @return boolean|string HTML source, or false on failure
 */
function getListHtml($keywords, $page = 1)
{
    $page = intval($page);
    if ($page < 0) {
        return false;
    }
    if ($page === 0) {
        $page = 1;
    }

    if ($page === 1) {
        $result = curl_get('http://www.icbase.com/ProResult.aspx', array('prokey' => $keywords));
        if (isset($result['error'])) {
            return false;
            // exit($result['error']);
        }
        $result = $result['result'];

        // ASP.NET postback state that has to be sent back when paging.
        // NOTE(review): the original patterns were lost in extraction; these
        // assume the usual <input type="hidden" name="..." ... value="..."> markup.
        // NOTE(review): as in the original, an already-defined constant makes this
        // branch return false, so page 1 can only be fetched once per request.
        $fields = array('__VIEWSTATE', '__PREVIOUSPAGE', '__EVENTVALIDATION');
        foreach ($fields as $field) {
            $pattern = '/<input[^>]*\bname="' . preg_quote($field, '/') . '"[^>]*\bvalue="([^"]*)"/i';
            if (!defined($field) && preg_match($pattern, $result, $matches)) {
                define($field, $matches[1]);
            } else {
                return false;
            }
        }
        return $result;
    }

    // Paging needs the state captured from page 1; without it the postback cannot be built.
    if (!defined('__VIEWSTATE') || !defined('__PREVIOUSPAGE') || !defined('__EVENTVALIDATION')) {
        return false;
    }
    $data = array(
        '__EVENTTARGET' => 'pager',
        '__EVENTARGUMENT' => $page,
        '__VIEWSTATE' => constant('__VIEWSTATE'),
        '__PREVIOUSPAGE' => constant('__PREVIOUSPAGE'),
        '__EVENTVALIDATION' => constant('__EVENTVALIDATION'),
    );
    // The keyword goes into the query string, so it must be URL-encoded.
    $result = curl_post('http://www.icbase.com/ProResult.aspx?ProKey=' . urlencode($keywords), $data);
    if (isset($result['error'])) {
        return false;
        // exit($result['error']);
    }
    $result = $result['result'];
    return $result;
}
/**
 * Collect the link URLs (href of the <a> elements) found on a list page.
 *
 * Runs one regular expression over the page and returns whatever its first
 * capture group matched, once per match.
 *
 * @param string $html HTML source of a list page
 * @return array Captured values, or an empty array when nothing matches
 */
Function getListHref ($ html)
{
// NOTE(review): this pattern looks damaged (its HTML-tag portion appears to have
// been stripped) and contains no capture group, although $matches[1] is read
// below. Restore it from the original source before use.
$ Pattern = '/[\ s \ n] *] \/>/isu ';
If (preg_match_all ($ pattern, $ html, $ matches ))
{
// Index 1 holds the first capture group of every match.
Return $ matches [1];
} Else {
// No matching items
Return array ();
}
}
/**
 * Get the number of the next page from a list page.
 *
 * @param string $html HTML source of a list page
 * @return number Integer value of the first capture group, or -1 when the pattern does not match
 */
Function getListNextPage ($ html)
{
// NOTE(review): this pattern looks damaged (its HTML-tag portion appears to have
// been stripped) and contains no capture group, although $matches[1] is read
// below. Restore it from the original source before use.
$ Pattern = '/ ]>. +> <\/A>/isU ';
If (preg_match ($ pattern, $ html, $ matches ))
{
Return intval ($ matches [1]);
} Else {
// -1 tells the caller that there is no further page.
Return-1;
}
}
/**
 * Collect the detail-page links from every result page of a keyword search.
 *
 * Fetches page 1, then keeps following the "next page" number reported by
 * each page until there is none.
 *
 * @param string $keywords Search keyword
 * @return boolean|array false when $keywords is empty, otherwise the list of links (possibly empty)
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);
    if ($html === false) {
        // The first page could not be fetched, so there is nothing to collect.
        return array();
    }
    $hrefList = getListHref($html);
    if (empty($hrefList)) {
        // No results
        return array();
    }

    $currentPage = 1;
    $nextPage = getListNextPage($html);
    // Requiring the page number to advance prevents an endless loop if the
    // site keeps reporting the same "next" page.
    while ($nextPage > $currentPage) {
        $html = getListHtml($keywords, $nextPage);
        if ($html === false) {
            // Fetch failed: return what has been collected so far.
            break;
        }
        $hrefList = array_merge($hrefList, getListHref($html));
        $currentPage = $nextPage;
        $nextPage = getListNextPage($html);
    }
    return $hrefList;
}
/**
 * Get the information shown on a product detail page.
 *
 * @ Param string $url Either a page path to fetch or already-downloaded HTML, selected by $is_url
 * @ Param int $is_url 1: treat $url as an address and download it; 0: treat $url as HTML source
 * @ Return boolean | array false when $url is empty, an empty array when no product
 *          model is found, otherwise the populated field array built below.
 *          Note that a cURL error ends the script via Exit instead of returning.
 */
Function getDetail ($ url, $ is_url = 1)
{
If (empty ($ url ))
{
Return false;
}
// NOTE(review): the host is used without a URL scheme, and the literal contains
// spaces as shown here (probably an extraction artefact) - verify.
$ Host = 'www .icbase.com ';
// Default to treating the argument as HTML; overwritten below when it is a URL.
$ Html = $ url;
If ($ is_url ){
// Normalise the path so that it starts with exactly one slash.
$ Url = '/'. ltrim ($ url ,'/');
$ Result = curl_get ($ host. $ url );
If (isset ($ result ['error'])
{
// Aborts the whole script with the cURL error message.
Exit ($ result ['error']);
}
$ Html = $ result ['result'];
}
// Result template: every field starts out as an empty string.
// NOTE(review): the key casing here differs from the keys written further down
// (e.g. 'Mfg _ part' vs 'mfg _ part'); presumably an extraction artefact - verify.
// NOTE(review): 'Para' and 'Price' start as '' but are later written with array
// syntax; PHP 7.1+ no longer converts an empty string to an array - verify.
$ Result = array (
'Sup _ part' => '', // supplier model
'Sup _ id' => '', // supplier id
'Mfg _ part' => '', // manufacturer model
'Mfg _ name' => '', // manufacturer name
'Cat _ name' => '', // category name
'Para' => '', // attribute
'Desc' => '', // description
'PDF _ url' => '', // pdf address
'Sup _ stock' => '', // stock
'Min _ purch' => '', // minimum order quantity
'Price' => '', // price
'IMG _ url' => '', // Image address
'Createtime' => '', // creation time
'Datacode' => '', // batch number
'Package' => '', // package type
'Page _ url' => '', // page address
);
// Mfg_part
// NOTE(review): the pattern on the next line is truncated and the following
// "If (" statement has been fused onto it; restore from the original source.
$ Pattern = '/Product Model <\/td>(. [^ <] +)If (preg_match ($ pattern, $ html, $ matches ))
{
$ Result ['mfg _ part'] = trim ($ matches [1]);
} Else {
// No product model found: return an empty array instead of partial data.
Return array ();
}
// Mfg_name
$ Pattern = '/Vendor <\/td> [\ s \ n] *(. +) <\/Td>/isU ';
If (preg_match ($ pattern, $ html, $ matches ))
{
$ Result ['mfg _ name'] = trim ($ matches [1]);
}
// Para
// NOTE(review): the start of this pattern appears to have been stripped.
$ Pattern = '/ (. +) <\/Tr> <\/table>/isU ';
If (preg_match ($ pattern, $ html, $ matches ))
{
// Re-scan the captured fragment for its cells; $matches is reused for the result.
If (preg_match_all ('/(. +) <\/Td>/isU ', $ matches [1], $ matches ))
{
// The first half of the captured cells is read as names and the second half
// as the corresponding values (value index = $count + name index).
$ Count = count ($ matches [1]);
$ Count = intval ($ count/2 );
Foreach ($ matches [1] as $ k => $ v)
{
If ($ k> = $ count)
{
// Past the name half: the remaining cells are values, already consumed.
Break;
}
// NOTE(review): "=" is shown here where a comparison is presumably intended.
If (trim ($ v) = 'Description ')
{
// Desc
$ Result ['desc'] = trim ($ matches [1] [$ count + $ k]);
Continue;
}
$ V = trim ($ v );
$ Result ['para'] [$ v] = trim ($ matches [1] [$ count + $ k]);
}
}
}
// PDF_url
// NOTE(review): the pattern on the next line is truncated and the following
// "If (" statement has been fused onto it; restore from the original source.
$ Pattern = '/Details <\/td>If (preg_match ($ pattern, $ html, $ matches ))
{
$ Result ['PDF _ url'] = trim ($ matches [1]);
}
// Sup_stock
$ Pattern = '/Inventory quantity <\/td> [\ s \ n] *(\ D +) <\/td>/isU ';
If (preg_match ($ pattern, $ html, $ matches ))
{
$ Result ['Sup _ stock'] = trim ($ matches [1]);
}
// Price
// NOTE(review): parts of this pattern appear to have been stripped.
$ Pattern = '/ ] +> (\ D +) \ + <\/td> ] +>. [^ \ D] * ([\ d.] +) <\/td> <\/tr>/isU ';
If (preg_match_all ($ pattern, $ html, $ matches ))
{
// Key: first capture of each match; value: second capture prefixed with the yen sign.
Foreach ($ matches [1] as $ k => $ v)
{
$ Result ['price'] [$ v] = '¥'. $ matches [2] [$ k];
}
}
// Img_url
// NOTE(review): the pattern on the next line is truncated and the following
// "If (" statement has been fused onto it; restore from the original source.
$ Pattern = '/Image <\/td>If (preg_match ($ pattern, $ html, $ matches ))
{
$ Result ['IMG _ url'] = trim ($ matches [1]);
}
// Page_url
// Only recorded when the page was downloaded by this function.
If ($ is_url)
{
$ Result ['page _ url'] = $ host. $ url;
}
Return $ result;
}
/**
 * Entry point: fetch the detail data of every product found for a keyword.
 *
 * @param string $keywords Search keyword
 * @return array One getDetail() result per link found; empty when there are no links
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);
    // getListHrefAll() returns false for an empty keyword; foreach needs an array.
    if (!is_array($hrefList)) {
        return array();
    }

    $result = array();
    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }
    return $result;
}
// Test script: reads the search keyword from the "keyword" query parameter
// and dumps everything that was scraped for it.
$keywords = isset($_GET['keyword']) ? trim($_GET['keyword']) : '';
// Skip the scrape entirely when no keyword was supplied.
$result = ($keywords === '') ? array() : getData($keywords);
print_r($result);