Just look at the code.
<?php
// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles, stored per attribute under HDOM_INFO_QUOTE.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);
// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN',    0);
define('HDOM_INFO_END',      1);
define('HDOM_INFO_QUOTE',    2);
define('HDOM_INFO_SPACE',    3);
define('HDOM_INFO_TEXT',     4);
define('HDOM_INFO_INNER',    5);
define('HDOM_INFO_OUTER',    6);
define('HDOM_INFO_ENDSPACE', 7);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// Builds a DOM from a file or URL.
// All arguments are forwarded unchanged to file_get_contents(); the fetched
// text is parsed with tag/attribute names lowercased.
// Returns the loaded simple_html_dom object.
function file_get_html() {
    $dom = new simple_html_dom;
    $args = func_get_args();
    $dom->load(call_user_func_array('file_get_contents', $args), true);
    return $dom;
}
// get html dom from string
// Builds a DOM from an HTML string.
// $str       - markup to parse
// $lowercase - passed through to simple_html_dom::load()
// Returns the loaded simple_html_dom object.
function str_get_html($str, $lowercase=true) {
    $dom = new simple_html_dom;
    $dom->load($str, $lowercase);
    return $dom;
}
// dump html dom tree
// Echoes an indented outline of $node and everything below it.
// $node      - object exposing ->tag, ->attr (array) and ->nodes (array)
// $show_attr - when true, attributes are printed after the tag name
// $deep      - current nesting depth; each level indents by four spaces
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $lead = str_repeat('    ', $deep);
    echo $lead.$node->tag;
    if ($show_attr && count($node->attr)>0) {
        echo '(';
        // the value is read through the property so a node's __get is honoured
        foreach ($node->attr as $k=>$v)
            echo "[$k]=>\"".$node->$k.'", ';
        echo ')';
    }
    echo "\n";

    foreach ($node->nodes as $c)
        dump_html_tree($c, $show_attr, $deep+1);
}
// get dom from file (deprecated)
// Deprecated spelling of the file loader, kept for older callers.
// Arguments are forwarded unchanged to file_get_contents(); the result is
// parsed with lowercasing enabled and the simple_html_dom object is returned.
function file_get_dom() {
    $dom = new simple_html_dom;
    $args = func_get_args();
    $dom->load(call_user_func_array('file_get_contents', $args), true);
    return $dom;
}
// get dom from string (deprecated)
// Deprecated spelling of the string loader, kept for older callers.
// Parses $str (passing $lowercase to simple_html_dom::load()) and returns
// the simple_html_dom object.
function str_get_dom($str, $lowercase=true) {
    $dom = new simple_html_dom;
    $dom->load($str, $lowercase);
    return $dom;
}
// simple html dom node
// -----------------------------------------------------------------------------
/**
 * One node of the parsed tree: an element, a text run, a comment or the root.
 *
 * Every node registers itself in the owning parser's $nodes list on
 * construction. The $_ array holds parser bookkeeping keyed by the
 * HDOM_INFO_* constants (begin/end index, original text, attribute spacing
 * and quoting, inner/outer text overrides).
 */
class simple_html_dom_node {
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    public $_ = array();
    private $dom = null;

    // $dom is the owning simple_html_dom; the node appends itself to $dom->nodes
    function __construct($dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct() {
        $this->clear();
    }

    function __toString() {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    function clear() {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    // dump node's tree
    function dump($show_attr=true) {
        dump_html_tree($this, $show_attr);
    }

    // returns the parent of node
    function parent() {
        return $this->parent;
    }

    // returns children of node: all of them for $idx === -1, otherwise the
    // child at $idx or null when there is none
    function children($idx=-1) {
        if ($idx===-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node
    function first_child() {
        if (count($this->children)>0) return $this->children[0];
        return null;
    }

    // returns the last child of node
    function last_child() {
        if (($count=count($this->children))>0) return $this->children[$count-1];
        return null;
    }

    // returns the next sibling of node
    function next_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (++$idx>=$count) return null;
        return $this->parent->children[$idx];
    }

    // returns the previous sibling of node
    function prev_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (--$idx<0) return null;
        return $this->parent->children[$idx];
    }

    // get dom node's inner html
    function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        foreach ($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    // get dom node's outer text (with tag)
    function outertext() {
        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom->callback!==null)
            call_user_func_array($this->dom->callback, array($this));

        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
            $ret .= $this->_[HDOM_INFO_INNER];
        else {
            foreach ($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag (an end index of 0 marks "no closing tag")
        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    // get dom node's plain text
    function text() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        foreach ($this->nodes as $n)
            $ret .= $n->text();
        return $ret;
    }

    // inner text with the CDATA wrappers removed
    function xmltext() {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    // build node's text with tag
    function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach ($this->attr as $key=>$val) {
            // $i counts every attribute, removed ones included, so it stays
            // aligned with the HDOM_INFO_SPACE / HDOM_INFO_QUOTE lists
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
            // no value attr: nowrap, checked selected...
            if ($val===true)
                $ret .= $key;
            else {
                switch ($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }
        $ret = $this->dom->restore_noise($ret);
        return $ret.$this->_[HDOM_INFO_ENDSPACE].'>';
    }

    // find elements by css selector
    // $idx null     -> array of all matches
    // $idx >= 0     -> the match at that position, or null
    // $idx negative -> counted from the end of the match list
    function find($selector, $idx=null) {
        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // depth of THIS comma-separated group (was hard-wired to group 0)
            if (($level=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach ($head as $k=>$v) {
                    // index -1 is the root's begin marker
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach ($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if (is_null($idx)) return $found;
        else if ($idx<0) $idx = count($found) + $idx;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    // seek for given conditions
    // $selector is one array(tag, key, val, exp, no_key) from parse_selector();
    // indexes of matching nodes are added as keys of $ret
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp, $no_key) = $selector;

        // xpath index
        if ($tag && $key && is_numeric($key)) {
            $count = 0;
            foreach ($this->children as $c) {
                if ($tag==='*' || $tag===$c->tag) {
                    if (++$count==$key) {
                        $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                        return;
                    }
                }
            }
            return;
        }

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // no own end index: borrow the nearest ancestor's
            $parent = $this->parent;
            // null check comes first so a missing ancestor is never dereferenced
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent!==null)
                $end += $parent->_[HDOM_INFO_END];
        }

        for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];
            $pass = true;

            if ($tag==='*' && !$key) {
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
            // compare key
            if ($pass && $key) {
                if ($no_key) {
                    if (isset($node->attr[$key])) $pass=false;
                }
                else if (!isset($node->attr[$key])) $pass=false;
            }
            // compare value
            if ($pass && $key && $val && $val!=='*') {
                // an absent attribute compares as '' instead of raising a notice
                $attr_val = isset($node->attr[$key]) ? $node->attr[$key] : '';
                $check = $this->match($exp, $val, $attr_val);
                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach (explode(' ', $attr_val) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }
                if (!$check) $pass = false;
            }
            if ($pass) $ret[$i] = 1;
            unset($node);
        }
    }

    // tests an attribute value against a selector operator
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return preg_match("/^".preg_quote($pattern, '/')."/", $value);
            case '$=':
                return preg_match("/".preg_quote($pattern, '/')."$/", $value);
            case '*=':
                // a pattern starting with '/' is used as a complete regex
                if ($pattern[0]=='/')
                    return preg_match($pattern, $value);
                return preg_match("/".$pattern."/i", $value);
        }
        return false;
    }

    // splits a selector string into groups (separated by ',') of
    // array(tag, key, val, exp, no_key) steps
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        // hyphens inside the character classes are escaped: PCRE2 (PHP 7.3+)
        // rejects "[\w-:]" as an invalid range
        $pattern = '/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
        // the trailing space guarantees the final step ends with a separator
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);
            if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
            // for browser generated xpath
            if ($m[1]==='tbody') continue;

            list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
            if (!empty($m[2])) {$key='id'; $val=$m[2];}
            if (!empty($m[3])) {$key='class'; $val=$m[3];}
            if (!empty($m[4])) {$key=$m[4];}
            if (!empty($m[5])) {$exp=$m[5];}
            if (!empty($m[6])) {$val=$m[6];}

            // convert to lowercase
            if ($this->dom->lowercase) {
                $tag = strtolower($tag);
                if ($key!==null) $key = strtolower($key);
            }
            // elements that do NOT have the specified attribute
            if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

            $result[] = array($tag, $key, $val, $exp, $no_key);
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;
        return $selectors;
    }

    // attribute read; also serves the virtual outertext/innertext/plaintext/
    // xmltext properties. For any other name the result is whether the
    // attribute key exists.
    function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    // attribute write; outertext/innertext set the rendering overrides
    function __set($name, $value) {
        switch ($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        if (!isset($this->attr[$name])) {
            // new attribute: one leading space, no padding around '=', double quotes
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    function __isset($name) {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        // no value attr: nowrap, checked selected...
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    function __unset($name) {
        if (isset($this->attr[$name]))
            unset($this->attr[$name]);
    }

    // camel naming conventions
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
}
// simple html dom parser
// -----------------------------------------------------------------------------
/**
 * The parser and document object.
 *
 * load() first swaps comments, CDATA, style/script/code blocks and
 * server-side tags for "___noise___NNNNN" placeholders, then scans the
 * remaining text character by character into simple_html_dom_node objects.
 * Nodes are stored flat in $nodes in document order; $root is the tree entry.
 */
class simple_html_dom {
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    protected $pos;
    protected $doc;
    protected $char;
    protected $size;
    protected $cursor;
    protected $parent;
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // use isset instead of in_array, performance boost about 30%...
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
    protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
    // tag => set of open parent tags that the tag implicitly closes
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
        'nobr'=>array('nobr'=>1),
    );

    // $str may be markup, a file path or an http(s) URL; paths and URLs are
    // loaded through load_file(), anything else is parsed directly
    function __construct($str=null) {
        if ($str) {
            if (preg_match("/^https?:\/\//i", $str) || is_file($str))
                $this->load_file($str);
            else
                $this->load($str);
        }
    }

    function __destruct() {
        $this->clear();
    }

    // load html from string
    function load($str, $lowercase=true) {
        // prepare
        $this->prepare($str, $lowercase);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
        // strip smarty scripts
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

        // parsing
        while ($this->parse());
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
    }

    // load html from file; arguments are forwarded to file_get_contents()
    function load_file() {
        $args = func_get_args();
        $this->load(call_user_func_array('file_get_contents', $args), true);
    }

    // set callback function (invoked with each node as it is rendered)
    function set_callback($function_name) {
        $this->callback = $function_name;
    }

    // remove callback function
    function remove_callback() {
        $this->callback = null;
    }

    // save dom as string; also written to $filepath when one is given
    function save($filepath='') {
        $ret = $this->root->innertext();
        if ($filepath!=='') file_put_contents($filepath, $ret);
        return $ret;
    }

    // find dom node by css selector
    function find($selector, $idx=null) {
        return $this->root->find($selector, $idx);
    }

    // clean up memory due to php5 circular references memory leak...
    function clear() {
        foreach ($this->nodes as $n) {$n->clear(); $n = null;}
        if (isset($this->parent)) {$this->parent->clear(); $this->parent = null;}
        if (isset($this->root)) {$this->root->clear(); $this->root = null;}
        $this->doc = null;
        $this->noise = array();
    }

    function dump($show_attr=true) {
        $this->root->dump($show_attr);
    }

    // prepare HTML data and init everything
    protected function prepare($str, $lowercase=true) {
        $this->clear();
        // cast so a failed file_get_contents() (false) parses as an empty document
        $this->doc = (string)$str;
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        // set the length of content
        $this->size = strlen($this->doc);
        // null for an empty document so no stale character survives a reload
        $this->char = ($this->size>0) ? $this->doc[0] : null;
    }

    // parse html content; returns false once the input is exhausted
    protected function parse() {
        if (($s = $this->copy_until_char('<'))==='')
            return $this->read_tag();

        // text
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = $s;
        $this->link_nodes($node, false);
        return true;
    }

    // advance one character; $char becomes null past the end of the document
    protected function next_char() {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null;
    }

    // read tag info
    protected function read_tag() {
        if ($this->char!=='<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }
        $begin_tag_pos = $this->pos;
        $this->next_char();

        // end tag
        if ($this->char==='/') {
            $this->next_char();
            // was $this->token_blank_t, a property that does not exist
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // skip attributes in end tag
            if (($pos = strpos($tag, ' '))!==false)
                $tag = substr($tag, 0, $pos);

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            if ($parent_lower!==$tag_lower) {
                if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                        $this->parent = $this->parent->parent;

                    if (strtolower($this->parent->tag)!==$tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        if ($this->parent->parent) $this->parent = $this->parent->parent;
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                }
                else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                        $this->parent = $this->parent->parent;

                    if (strtolower($this->parent->tag)!==$tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                }
                else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $this->parent = $this->parent->parent;
                }
                else
                    return $this->as_text_node($tag);
            }

            $this->parent->_[HDOM_INFO_END] = $this->cursor;
            if ($this->parent->parent) $this->parent = $this->parent->parent;

            $this->next_char();
            return true;
        }

        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash);

        // doctype, cdata & comments...
        if (isset($tag[0]) && $tag[0]==='!') {
            $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else {
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char==='>') $node->_[HDOM_INFO_TEXT] .= '>';
            $this->link_nodes($node, true);
            $this->next_char();
            return true;
        }

        // text: a second '<' turned up inside what looked like a tag name
        if (strpos($tag, '<')!==false) {
            $tag = '<'.substr($tag, 0, -1);
            $node->_[HDOM_INFO_TEXT] = $tag;
            $this->link_nodes($node, false);
            $this->char = $this->doc[--$this->pos]; // prev
            return true;
        }

        // hyphen escaped: PCRE2 (PHP 7.3+) rejects "[\w-:]" as an invalid range
        if (!preg_match("/^[\w\-:]+$/", $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until('<>');
            if ($this->char==='<') {
                $this->link_nodes($node, false);
                return true;
            }

            if ($this->char==='>') $node->_[HDOM_INFO_TEXT] .= '>';
            $this->link_nodes($node, false);
            $this->next_char();
            return true;
        }

        // begin tag
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower])) {
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }

        $guard = 0; // prevent infinite loop
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            if ($this->char!==null && $space[0]==='') break;
            $name = $this->copy_until($this->token_equal);
            if ($guard===$this->pos) {
                // no progress since the last pass: step over the character
                $this->next_char();
                continue;
            }
            $guard = $this->pos;

            // handle endless '<'
            if ($this->pos>=$this->size-1 && $this->char!=='>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<'.$tag.$space[0].$name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            // handle mismatch '<'
            if ($this->doc[$this->pos-1]=='<') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->tag = 'text';
                $node->attr = array();
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
                // step back onto the stray '<' so it is parsed as a new tag
                $this->pos -= 2;
                $this->next_char();
                $this->link_nodes($node, false);
                return true;
            }

            if ($name!=='/' && $name!=='') {
                $space[1] = $this->copy_skip($this->token_blank);
                $name = $this->restore_noise($name);
                if ($this->lowercase) $name = strtolower($name);
                if ($this->char==='=') {
                    $this->next_char();
                    $this->parse_attr($node, $name, $space);
                }
                else {
                    // no value attr: nowrap, checked selected...
                    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
                }
                $node->_[HDOM_INFO_SPACE][] = $space;
                $space = array($this->copy_skip($this->token_blank), '', '');
            }
            else
                break;
        } while ($this->char!=='>' && $this->char!=='/');

        $this->link_nodes($node, true);
        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // check self closing
        if ($this->copy_until_char_escape('>')==='/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        }
        else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
        }
        $this->next_char();
        return true;
    }

    // parse attributes
    // $space is the attribute's spacing triple; slot 2 (blank after '=') is filled here
    protected function parse_attr($node, $name, &$space) {
        $space[2] = $this->copy_skip($this->token_blank);
        switch ($this->char) {
            case '"':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
                $this->next_char();
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
                $this->next_char();
                break;
            case '\'':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
                $this->next_char();
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
                $this->next_char();
                break;
            default:
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
        }
    }

    // link node's parent
    // every node goes into the parent's $nodes; only $is_child nodes also go
    // into $children
    protected function link_nodes(&$node, $is_child) {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child)
            $this->parent->children[] = $node;
    }

    // as a text node: keep an unmatched end tag as literal text
    protected function as_text_node($tag) {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</'.$tag.'>';
        $this->link_nodes($node, false);
        $this->next_char();
        return true;
    }

    // move past any run of characters from $chars
    protected function skip($chars) {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }

    // like skip(), but returns the characters that were skipped
    protected function copy_skip($chars) {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        if ($len===0) return '';
        return substr($this->doc, $pos, $len);
    }

    // return the text up to (not including) the first character from $chars
    protected function copy_until($chars) {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    // return the text up to (not including) the next $char; when $char never
    // occurs the rest of the document is consumed
    protected function copy_until_char($char) {
        if ($this->char===null) return '';

        if (($pos = strpos($this->doc, $char, $this->pos))===false) {
            $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos===$this->pos) return '';
        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos-$pos_old);
    }

    // same as copy_until_char(), but an occurrence preceded by a backslash
    // does not end the scan
    protected function copy_until_char_escape($char) {
        if ($this->char===null) return '';

        $start = $this->pos;
        while (1) {
            if (($pos = strpos($this->doc, $char, $start))===false) {
                $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
                $this->char = null;
                $this->pos = $this->size;
                return $ret;
            }

            if ($pos===$this->pos) return '';

            if ($this->doc[$pos-1]==='\\') {
                $start = $pos+1;
                continue;
            }

            $pos_old = $this->pos;
            $this->char = $this->doc[$pos];
            $this->pos = $pos;
            return substr($this->doc, $pos_old, $pos-$pos_old);
        }
    }

    // remove noise from html content
    // each match of $pattern is replaced in $doc by a placeholder key;
    // $remove_tag selects the whole match (true) or capture group 1 (false)
    protected function remove_noise($pattern, $remove_tag=false) {
        $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

        // back to front, so earlier offsets stay valid while the text changes
        for ($i=$count-1; $i>-1; --$i) {
            // fixed-width 5-digit suffix: the earlier 3-digit form produced
            // keys restore_noise() could not read back after 900 fragments
            $key = '___noise___'.sprintf('%05d', count($this->noise));
            $idx = ($remove_tag) ? 0 : 1;
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);
        $this->char = ($this->size>0) ? $this->doc[0] : null;
    }

    // restore noise to html content
    function restore_noise($text) {
        $offset = 0;
        while (($pos = strpos($text, '___noise___', $offset))!==false) {
            // 11 marker characters + 5 digits, as written by remove_noise()
            $key = substr($text, $pos, 16);
            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                // rescan from the same spot: restored text may hold placeholders
                $offset = $pos;
            }
            else {
                // unknown marker: step past it (the old loop never terminated here)
                $offset = $pos + 11;
            }
        }
        return $text;
    }

    function __toString() {
        return $this->root->innertext();
    }

    // virtual outertext / innertext / plaintext properties of the document
    function __get($name) {
        switch ($name) {
            case 'outertext': return $this->root->innertext();
            case 'innertext': return $this->root->innertext();
            case 'plaintext': return $this->root->text();
        }
    }

    // camel naming conventions
    function childNodes($idx=-1) {return $this->root->childNodes($idx);}
    function firstChild() {return $this->root->first_child();}
    function lastChild() {return $this->root->last_child();}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    // NOTE(review): the default of -1 returns only the LAST match, unlike the
    // node class where the default is null (all matches); kept as-is so
    // existing callers see no change
    function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    function loadFile() {$args = func_get_args(); $this->load(call_user_func_array('file_get_contents', $args), true);}
}
?>
tqq.php
<?php
/*******************************************************************************
version:1.11 ($Rev: 175 $)
Website:http://www.115.co
AUTHOR:S.C Chen <admin@185.cm>
Acknowledge:jose Solorzano (http://www.115.co/)
Contributions BY:QQ Exchange Group: 89097023
Yousuke Kumakura (Attribute filters)
Vadim Voituk (Negative indexes supports of "find" method)
Antcs (constructor with automatically load contents either text or File/url)
Licensed under the MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/
// Cache lifetime of index.html, in seconds.
$t = 360;
if (!is_file('index.html') || (time() - filemtime('index.html')) > $t) {
    // Micro-blog account.
    // NOTE(review): the account and password hash are hard-coded; move them
    // out of the source before deploying.
    $qq = 'Kuaisubeian';
    // Password after Tencent's md5_3() encryption.
    $pwd = '624d3274815f2237817a7c62f42dd26a';
    $verifyURL = 'http://ptlogin2.qq.com/check?uin=@'.$qq.'&appid=46000101';
    $loginURL = 'http://ptlogin2.qq.com/login?';

    // Every request shares one cookie jar so the session carries over.
    $cookie_jar = tempnam('.', 'cookie');
    $fetch = function ($url) use ($cookie_jar) {
        $curl = curl_init($url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_jar);
        curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_jar);
        $body = curl_exec($curl);
        curl_close($curl);
        return $body;
    };

    // Get the verification code and the first cookie.
    $verifyCode = strtoupper(substr($fetch($verifyURL), 18, 4));
    echo 'Verification code: '.$verifyCode;
    echo '<hr/>';
    echo 'Cookies: '.$cookie_jar;
    echo '<hr/>';

    // Send the login request and get the second cookie.
    // NOTE(review): the echo below prints the password digest to the page.
    $loginURL .= 'u=@'.$qq.'&p='.md5($pwd.$verifyCode).'&verifycode='.$verifyCode.'&aid=46000101&u1=http%3a%2f%2ft.qq.com&h=1&from_ui=1&fp=loginerroralert';
    echo 'Login address: '.$loginURL;
    echo '<hr/>';
    $loginResult = $fetch($loginURL);
    echo 'Login verification Result: '.$loginResult;
    echo '<hr/>';

    // Third request: pick up the cookie set by the home page.
    $loginResult = $fetch('http://t.qq.com');

    // Fourth request: the account's own timeline.
    $loginResult = $fetch('http://t.qq.com/'.$qq.'/mine');

    unlink($cookie_jar);
    // Keep the previous cache when the download failed instead of replacing
    // it with an empty file.
    if (is_string($loginResult) && $loginResult !== '') {
        file_put_contents('index.html', $loginResult);
    }
}

include('cnz.php');
$html = file_get_html('index.html');
// NOTE(review): the class names in the selectors below are reconstructed
// from the variable names (the paste had them re-cased and padded with
// spaces) - confirm against the live page markup.
$talkList = $html->find('#talkList');
$lastTalk = $talkList[0];
$userName = $lastTalk->children(0)->children(1)->find('.userName');
$msgCnt = $lastTalk->children(0)->children(1)->find('.msgCnt');
$pubInfo = $lastTalk->children(0)->children(1)->find('.pubInfo');
$userName = $userName[0]->plaintext;
$result = '';
// More than two message blocks means the post is a rebroadcast.
// NOTE(review): the threshold of 10 does not agree with that description;
// left unchanged - confirm the intended value.
if (count($msgCnt) < 10) {
    $pi = $pubInfo[0]->find('.left');
    // NOTE(review): the paste had unrelated page markup (<p>, <title>, <meta>)
    // injected into this statement; it is dropped here and the string is
    // rebuilt to mirror the else branch - confirm against the original script.
    $result = $userName.$msgCnt[0]->plaintext.'<span style="font-style:italic; color:rgb(238, 29, 36);">'.$pi[0]->children(0)->plaintext.' '.$pi[0]->children(1)->plaintext.'</span>';
} else {
    $pi = $pubInfo[1]->find('.left');
    $result = $userName.$msgCnt[0]->plaintext.'['.$msgCnt[1]->plaintext.']<span style="font-style:italic; color:rgb(149, 158, 135);">'.$pi[0]->plaintext.'</span>';
}
echo $result;
?>