Here is a simple and practical HTML parsing class that I recently collected and wrote up.
// Usage example: load a remote page, take the first <dd> found under any <dl>,
// and print its text content.
$xp = new XF_HtmlDom();
$xp->loadHtml('http://dealer.bitauto.com/100040078/cars.html');
// The query must not end with "/" - "//dl/dd/" is not a valid XPath expression.
$rows = $xp->find('dl/dd', 0)->innertext;
print_r($rows);
// Collect libxml parse errors internally instead of emitting a PHP warning for
// each one. The call returns the previous setting, which is kept in $oldSetting
// (nothing in the visible code restores it).
$oldSetting = libxml_use_internal_errors(true);
// Discard anything already sitting in the libxml error buffer.
libxml_clear_errors();
/**
 * -+-----------------------------------
 *  | PHP5 Framework - 2011
 *  | Web Site: www.iblue.cc
 *  | E-mail: mejinke@gmail.com
 *  | Date: 2012-10-12
 * -+-----------------------------------
 *
 * @desc HTML parser: loads an HTML document and resolves XPath queries on it.
 *
 * Every instance holds a DOMXPath plus the absolute path of the node it
 * stands for. An empty node path means "no specific node" (the document as a
 * whole); instances returned by find() carry the path of the matched node.
 *
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator for the loaded document, null until loaded. */
    private $_xpath = null;

    /** @var string Absolute path of the node this instance represents ('' = none). */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    Evaluator shared with the parent instance.
     * @param string        $nodePath Absolute path of the node to wrap.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Load an HTML document from an http(s) URL (via cURL) or from any other
     * location readable by file_get_contents(), e.g. a local file path.
     *
     * If nothing could be fetched, the instance ends up with an empty document
     * so that later find() calls simply match nothing.
     *
     * @param string $url URL or file path of the document.
     * @return XF_HtmlDom $this, to allow chaining.
     */
    public function loadHtml($url)
    {
        $content = $this->_fetch($url);

        $html = new DOMDocument();
        // loadHTML() rejects an empty source, so only parse real content.
        if (is_string($content) && $content !== '') {
            // Markup problems are collected by libxml rather than raised as PHP
            // warnings; the caller's libxml setting is put back afterwards.
            $previous = libxml_use_internal_errors(true);
            $html->loadHTML($content);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->_xpath = new DOMXPath($html);
        return $this;
    }

    /**
     * Find nodes matching an XPath fragment.
     *
     * On an instance without a node path the fragment is searched anywhere in
     * the document ("//" . $query); on an instance returned by a previous
     * find() it is resolved against that node (nodePath . "/" . $query).
     *
     * @param string   $query XPath fragment, without a leading or trailing slash.
     * @param int|null $index Position of the single match to return; omit (or
     *                        pass a non-numeric value) to get every match.
     * @return XF_HtmlDom[]|XF_HtmlDom|null Array of wrappers when no index is
     *         given (empty if nothing matched); otherwise the wrapper at
     *         $index, or null when there is no such match.
     */
    public function find($query, $index = null)
    {
        $wantAll = !is_numeric($index);
        $nothing = $wantAll ? array() : null;

        if ($this->_xpath === null) {
            // No document has been loaded yet.
            return $nothing;
        }

        // Build the prefix locally: storing it back on the object would make
        // every further find() on the same instance search a corrupted path.
        $base = ((string) $this->_nodePath === '') ? '//' : $this->_nodePath . '/';

        $nodes = $this->_xpath->query($base . $query);
        if ($nodes === false) {
            // Malformed XPath expression.
            return $nothing;
        }

        if ($wantAll) {
            $found = array();
            foreach ($nodes as $node) {
                $found[] = new self($this->_xpath, $node->getNodePath());
            }
            return $found;
        }

        $node = $nodes->item((int) $index);
        if ($node === null) {
            return null;
        }
        return new self($this->_xpath, $node->getNodePath());
    }

    /**
     * Get content
     *
     * @return string|false Text content of the wrapped node, or false when this
     *                      instance does not point at an existing node.
     */
    public function text()
    {
        $node = $this->_currentNode();
        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Get attribute values
     *
     * @param string $name Attribute name.
     * @return string|false Attribute value as reported by DOMElement, or false
     *                      when this instance does not point at an element.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();
        // Only elements carry attributes (a match may be a text node, etc.).
        if (!($node instanceof DOMElement)) {
            return false;
        }
        return $node->getAttribute($name);
    }

    /**
     * Property-style access: "innertext" maps to text(), any other name is
     * looked up as an attribute of the wrapped node.
     *
     * @param string $name Property name.
     * @return string|false
     */
    public function __get($name)
    {
        if ($name === 'innertext') {
            return $this->text();
        }
        return $this->getAttribute($name);
    }

    /**
     * Download or read the raw document source.
     *
     * @param string $url URL or file path.
     * @return string|false Document source, or false on failure.
     */
    private function _fetch($url)
    {
        // Default user agent for PHP stream wrappers (used by file_get_contents).
        ini_set('user_agent', 'mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) version/4.0 Mobile Safari/530.17-Nexus ');

        // Only genuine http:// and https:// URLs go through cURL; a local path
        // that merely contains "http" must still be read as a file.
        $isHttp = stripos($url, 'http://') === 0 || stripos($url, 'https://') === 0;
        if (!$isHttp) {
            return file_get_contents($url);
        }

        $userAgent = 'mozilla/5.0 (Windows NT 5.1; rv: 6.0) Gecko/20100101 Firefox/123456 ';

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        $content = curl_exec($ch);
        if ($content === false) {
            trigger_error('XF_HtmlDom: failed to fetch ' . $url . ': ' . curl_error($ch), E_USER_WARNING);
        }
        curl_close($ch);

        return $content;
    }

    /**
     * Resolve the node this instance points at.
     *
     * @return DOMNode|null The node, or null when there is no node path, no
     *                      loaded document, or the path no longer matches.
     */
    private function _currentNode()
    {
        if ((string) $this->_nodePath === '' || $this->_xpath === null) {
            return null;
        }
        $nodes = $this->_xpath->query($this->_nodePath);
        if ($nodes === false) {
            return null;
        }
        return $nodes->item(0);
    }
}
|