The following is an HTML parsing class together with a usage example. The example collects the Baidu data for www.opendir.cn; you can test it yourself if you need it.
The code is as follows:
// Buffer libxml parse errors internally instead of emitting a PHP warning for
// every malformed tag. The previous mode is kept in $oldSetting; nothing in
// this snippet restores it.
$oldSetting = libxml_use_internal_errors(true);
// Drop any errors that earlier libxml calls may have left in the buffer.
libxml_clear_errors();
/**
 *
 * -+-----------------------------------
 * | PHP5 Framework - 2011
 * | Web Site: www.iblue.cc
 * | E-mail: mejinke@gmail.com
 * | Date: 2012-10-12
 * -+-----------------------------------
 *
 * @desc   HTML parser. An instance is either a loaded document (empty node
 *         path) or a single node inside it, identified by the absolute XPath
 *         of that node. All instances created from one document share the
 *         same DOMXPath object.
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator of the loaded document, null when nothing is loaded. */
    private $_xpath = null;

    /** @var string Absolute XPath of the node this instance represents; '' for the document root. */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    XPath evaluator of an already loaded document.
     * @param string        $nodePath Absolute XPath of the node to wrap; '' for the root.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Load an HTML document from an http(s) URL (via cURL) or from any other
     * path that file_get_contents() can read.
     *
     * On failure the instance holds no document, so find() yields empty
     * results and text()/getAttribute() return false.
     *
     * @param string $url http(s) URL or local path of the document.
     * @return bool true when the document was fetched and parsed, false otherwise.
     */
    public function loadHtml($url)
    {
        // Never keep a stale document around when the new one cannot be loaded.
        $this->_xpath = null;

        $content = $this->_fetchContent($url);
        if ($content === false || $content === '') {
            return false;
        }

        // Real-world HTML is rarely well-formed: buffer libxml errors instead of
        // emitting warnings, then restore whatever mode the caller had set.
        $previousMode = libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $loaded = $html->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        if ($loaded === false) {
            return false;
        }

        $this->_xpath = new DOMXPath($html);
        return true;
    }

    /**
     * Find nodes below this node ("anywhere in the document" for the root).
     *
     * The instance itself is not modified, so find() can be called repeatedly.
     *
     * @param string   $query XPath step(s) relative to this node, e.g. "td[@id='x']/a".
     * @param int|null $index Zero-based position of the wanted match; when it is
     *                        not numeric (e.g. null) every match is returned.
     * @return XF_HtmlDom|XF_HtmlDom[] A list of wrappers when $index is not numeric
     *                        (empty if nothing matched); otherwise a single wrapper,
     *                        which is an empty one (text() === false) if that
     *                        match does not exist.
     */
    public function find($query, $index = null)
    {
        $wantList = !is_numeric($index);

        if ($this->_xpath === null) {
            return $wantList ? array() : new self();
        }

        // Root: search all descendants. Node: search its direct children.
        $basePath = ((string) $this->_nodePath === '') ? '//' : $this->_nodePath . '/';
        $nodes = $this->_xpath->query($basePath . $query);

        // query() returns false for a malformed expression.
        if ($nodes === false) {
            return $wantList ? array() : new self();
        }

        if ($wantList) {
            $found = array();
            foreach ($nodes as $node) {
                $found[] = new self($this->_xpath, $node->getNodePath());
            }
            return $found;
        }

        $node = $nodes->item((int) $index);
        if ($node === null) {
            return new self();
        }

        return new self($this->_xpath, $node->getNodePath());
    }

    /**
     * Get content
     *
     * @return string|false Text content of this node, or false when this
     *                      instance does not point at an existing node.
     */
    public function text()
    {
        $node = $this->_node();

        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Get attribute values
     *
     * @param string $name Attribute name.
     * @return string|false Attribute value ('' when the element lacks it), or
     *                      false when this instance does not point at an element.
     */
    public function getAttribute($name)
    {
        $node = $this->_node();

        // Only elements carry attributes (text/comment nodes do not).
        return ($node instanceof DOMElement) ? $node->getAttribute($name) : false;
    }

    /**
     * Property-style access: "innertext" maps to text(), any other name is
     * read as an attribute of the node.
     *
     * @param string $name Property name.
     * @return string|false
     */
    public function __get($name)
    {
        if ($name === 'innertext') {
            return $this->text();
        }

        return $this->getAttribute($name);
    }

    /**
     * Fetch the raw HTML for loadHtml().
     *
     * @param string $url http(s) URL or local path.
     * @return string|false Document source, or false when it cannot be read.
     */
    private function _fetchContent($url)
    {
        // User agent used by PHP's stream wrappers (file_get_contents()).
        ini_set('user_agent', 'mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) version/4.0 Mobile Safari/530.17-Nexus ');

        // Only a leading scheme marks a URL; a local path may contain "http" too.
        if (preg_match('#^https?://#i', $url) !== 1) {
            return file_get_contents($url);
        }

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        $userAgent = 'mozilla/5.0 (Windows NT 5.1; rv: 6.0) Gecko/20100101 Firefox/123456 ';
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        $content = curl_exec($ch);
        curl_close($ch);

        return $content;
    }

    /**
     * Resolve the node this instance points at.
     *
     * @return DOMNode|null The first node matching the stored path, or null
     *                      when there is no document, no path, or no match.
     */
    private function _node()
    {
        if ($this->_xpath === null || (string) $this->_nodePath === '') {
            return null;
        }

        $nodes = $this->_xpath->query($this->_nodePath);
        if ($nodes === false) {
            return null;
        }

        return $nodes->item(0);
    }
}
// Usage example: fetch the aizhan.com report page for www.opendir.cn and
// print the text of the first link inside the cell with id "baidu".
$xp = new XF_HtmlDom();
$xp->loadHtml('http://www.aizhan.com/siteall/www.opendir.cn/');
$rows = $xp->find("td[@id='baidu']/a", 0)->innertext;
print_r($rows);