This site provides official and user-contributed code examples for reference; you are welcome to discuss and learn from them. This example shows how to collect (scrape) data efficiently. Note, however, that DOMDocument does not seem to be able to collect content that includes markup tags, although it is very fast at collecting plain text.
Most of the features were abandoned before they could be implemented, so the code is offered for reference only.
You can see the CURL proxy access + ZEND_DOM collection usage.
The main logic lives in CollectGoodsController.class.php.
// Obtain the store Column
/**
 * Collect the category links of a shop page and store them in the
 * goods_category table.
 *
 * The page is currently read from the local snapshot 'offline/shops.htm';
 * the shop and mall ids are hard-coded to 1.
 *
 * NOTE(review): this block was recovered from a whitespace-mangled paste.
 * Table names, column keys and the 'CollectGoods' model name were
 * reconstructed by removing injected spaces and normalising case —
 * confirm them against the real schema.
 *
 * @param string $shop_url Currently unused (kept for interface compatibility).
 * @param int    $continue Currently unused (kept for interface compatibility).
 *
 * @return bool true when categories were collected and handed to addAll();
 *              false when the snapshot cannot be read, categories for the
 *              shop already exist, or no category link matched the rule.
 */
public function getShopCate($shop_url = "", $continue = 0)
{
    $source = file_get_contents('offline/shops.htm');
    if ($source === false) {
        return false; // Snapshot could not be read.
    }

    $shops_id = 1;
    $mall_id = 1;

    $cate = M('goods_category')->where(array('shops_id' => $shops_id))->find();
    if (!empty($cate)) {
        return false; // Categories for this shop were already collected.
    }

    import('@.Tao.TaoHttp', '', '.php');
    $Http = new \TaoHttp();
    $shop_html = $Http->encoding($source);

    $shop_category_rule = D('CollectGoods')->getRule($mall_id, 'shop_category');

    import('@.Tao.Dom.query', '', '.php');
    $Dom = new \Zend_Dom_Query($shop_html);
    $shop_category = $Dom->query($shop_category_rule);
    if (count($shop_category) == 0) {
        return false; // No category link was found.
    }

    $result = array();
    foreach ($shop_category as $key => $value) {
        $result[$key]['url'] = $value->getAttribute('href');
        $result[$key]['name'] = trim($value->nodeValue);
    }
    // The first matched link is deliberately discarded, as in the original.
    // NOTE(review): presumably a non-category entry such as "all items" — confirm.
    unset($result[0]);

    $time = time();
    $data = array();
    foreach ($result as $value) {
        $data[] = array(
            'shops_id'     => $shops_id,
            'cate_name'    => $value['name'],
            'cate_url'     => $value['url'],
            'collect_time' => $time,
        );
    }

    M('goods_category')->addAll($data);

    return true;
}
// Obtain a topic item in the store
// Http: // localhost/TaoGoods/index. php? M = Taogoods & c = CollectGoods & a = getShopGoods & cate_id = 3
/**
 * Collect the goods listed under one shop category and store them in the
 * goods table.
 *
 * Example request:
 * index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
 *
 * NOTE(review): this block was recovered from a whitespace-mangled paste.
 * Table names, column keys, rule keys and the 'CollectGoods' model name were
 * reconstructed by removing injected spaces and normalising case —
 * confirm them against the real schema.
 *
 * @param int $cate_id Primary key of the goods_category row to collect.
 *
 * @return bool|null false when $cate_id is 0 or the category row is missing;
 *                   otherwise the outcome is reported through
 *                   $this->error() / $this->success().
 */
public function getShopGoods($cate_id = 0)
{
    $cate_id = (int) $cate_id;
    if ($cate_id === 0) {
        return false;
    }

    // Refuse to collect again when goods for this category already exist.
    // NOTE(review): $this->error() presumably ends the request (framework
    // behaviour) — confirm; otherwise execution continues past these calls.
    $goods_time = M('goods')->where(array('cate_id' => $cate_id))->getField('collect_time');
    if ($goods_time) {
        if ($goods_time + 86400 * $this->day > time()) {
            $this->error('Do not collect records repeatedly within 15 days', U('index'));
        }
        $this->error('items under the collection shop column already exists', U('index'));
    }

    $cate_data = M('goods_category')->find($cate_id);
    if (empty($cate_data)) {
        return false; // Unknown category id: nothing to fetch.
    }
    $shops_id = $cate_data['shops_id'];
    $cate_id = $cate_data['id'];
    $mall_id = $cate_data['mall_id'];

    // Fetch the category page and normalise its encoding.
    import('@.Tao.TaoHttp', '', '.php');
    $Http = new \TaoHttp();
    $source = $Http->get($cate_data['cate_url']);
    $shop_html = $Http->encoding($source);

    $cate_rule = D('CollectGoods')->getRule($mall_id);

    import('@.Tao.Dom.query', '', '.php');
    $Dom = new \Zend_Dom_Query($shop_html);
    $cate_imgs = $Dom->query($cate_rule['shop_category_goods_img']);
    $cate_names = $Dom->query($cate_rule['shop_category_goods_name']);
    $cate_sales = $Dom->query($cate_rule['shop_category_goods_sale']);
    $cate_cprices = $Dom->query($cate_rule['shop_category_goods_cprice']);

    // The name result drives the loop; the other result sets are read at the
    // same index. NOTE(review): assumes all four queries return the same
    // number of nodes in the same order — confirm.
    $num = count($cate_names);
    $time = time();
    $result = array();
    for ($i = 0; $i < $num; $i++) {
        $result[$i]['goods_thumb'] = $cate_imgs->bykey($i)->getAttribute('src');
        $result[$i]['goods_name'] = $cate_names->bykey($i)->nodeValue;
        $result[$i]['goods_url'] = $cate_names->bykey($i)->getAttribute('href');
        $result[$i]['goods_cprice'] = $cate_cprices->bykey($i)->nodeValue;
        $result[$i]['goods_sale'] = $cate_sales->bykey($i)->nodeValue;
        $result[$i]['mall_id'] = $mall_id;
        $result[$i]['shops_id'] = $shops_id;
        $result[$i]['cate_id'] = $cate_id;
        $result[$i]['collect_time'] = $time;
    }

    if (M('goods')->addAll($result)) {
        $this->success('Success in collecting goods under the shop topic', U('index'));
    }
}
/**
 * Normalise fetched page source to UTF-8.
 *
 * Input that is already valid UTF-8 is returned unchanged. UTF-8 is tested
 * first because multi-byte UTF-8 sequences are frequently also valid GBK
 * byte pairs, so probing GBK first would misclassify UTF-8 pages.
 *
 * Input that is not valid UTF-8 but is valid GBK (CP936) is converted to
 * UTF-8 and prefixed with a charset <meta> tag, so that a DOM parser loading
 * the string reads it as UTF-8. Anything else is returned unchanged.
 *
 * NOTE(review): the <meta> literal was blank in the text this method was
 * recovered from; it has been restored from the adjacent comment's stated
 * purpose — confirm against the original file.
 *
 * @param string $source Raw page source.
 *
 * @return string UTF-8 page source when the input was UTF-8 or GBK,
 *                otherwise the input as given.
 */
public function encoding($source)
{
    if (mb_check_encoding($source, 'UTF-8')) {
        return $source;
    }

    if (mb_check_encoding($source, 'CP936')) {
        $converted = iconv("GBK", "UTF-8//IGNORE", $source);
        if ($converted === false) {
            return $source; // Conversion failed: keep the bytes as given.
        }
        // $meta tells the DOM parser which encoding to use.
        $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';
        $source = $meta . $converted;
    }

    return $source;
}
Testing: only two of the buttons work; none of the other buttons are functional.
For testing, you can clear the goods table and click to collect items.
SQL file in the compressed package
BY: youyoushan Yu
Taogoods.zip (2.2 MB download: 72 times)
AD: truly free, domain name + VM + enterprise mailbox = 0 RMB