<?php

/**
 * Extracts the main readable content of an HTML page.
 *
 * The page is parsed with DOMDocument, every <p> element contributes a score
 * to its parent element, and the best-scoring parent is returned as the page
 * content after junk tags and attributes have been stripped from it.
 *
 * Scoring algorithm from: http://code.google.com/p/arc90labs-readability/
 */
class Readability
{
    // Name of the attribute that stores the score on candidate elements.
    const ATTR_CONTENT_SCORE = "Contentscore";

    // The DOM parsing class currently supports only UTF-8 encoding.
    const DOM_DEFAULT_CHARSET = "Utf-8";

    // Message used when no content element could be determined.
    const MESSAGE_CAN_NOT_GET = "Readability is unable to parse this page for content.";

    // DOM parsing class (built into PHP 5 and later).
    protected $DOM = null;

    // The original, unprocessed source that was handed to the constructor.
    protected $source = "";

    // Parent elements of the paragraphs seen during the last scoring pass.
    private $parentNodes = array();

    // Tags that are removed from the extracted content.
    // Note: extended with extra tags (credited in the source to http://www.111cn.net).
    // Names must be lower case: the HTML parser lower-cases element names and
    // getElementsByTagName() compares case-sensitively.
    private $junkTags = array(
        "style", "form", "iframe", "script", "button", "input", "textarea",
        "noscript", "select", "option", "object", "applet", "basefont",
        "bgsound", "blink", "canvas", "command", "menu", "nav", "datalist",
        "embed", "frame", "frameset", "keygen", "label", "marquee", "link",
    );

    // Attributes that are removed from every element of the extracted content
    // (lower case for the same reason as the tag names above).
    private $junkAttrs = array("style", "class", "onclick", "onmouseover", "align", "border", "margin");

    /**
     * Parses $source into a DOM tree.
     *
     * @param string $source     HTML source of the page.
     * @param string $input_char Encoding of $source. Defaults to UTF-8 and may be omitted.
     */
    public function __construct($source, $input_char = "Utf-8")
    {
        $this->source = $source;

        // The DOM parser only copes reliably with UTF-8, so every non-ASCII
        // character is turned into an HTML entity first.
        // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated
        // since PHP 8.2 - it still works but emits a deprecation notice.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);

        // Pre-process the HTML: drop redundant markup that confuses the parser.
        $source = $this->preparSource($source);

        $this->DOM = new DOMDocument('1.0', $input_char);

        try {
            // Real-world HTML produces plenty of harmless parser warnings;
            // collect them internally instead of emitting them.
            $previous = libxml_use_internal_errors(true);
            // The XML declaration is a hack that forces the parser to treat
            // the input as UTF-8; it is removed again below.
            $loaded = $this->DOM->loadHTML(
                '<?xml encoding="' . self::DOM_DEFAULT_CHARSET . '">' . $source
            );
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            if (!$loaded) {
                throw new Exception("Parse HTML error!");
            }

            // Iterate over a snapshot: removing nodes from the live list
            // while walking it can skip entries.
            foreach (iterator_to_array($this->DOM->childNodes) as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $this->DOM->removeChild($item); // remove the encoding hack
                }
            }

            // Record the proper encoding on the document.
            $this->DOM->encoding = self::DOM_DEFAULT_CHARSET;
        } catch (Exception $e) {
            // Deliberately best-effort: on a parse failure the document stays
            // empty and getContent() reports the failure to the caller.
        }
    }

    /**
     * Pre-processes the HTML so that it can be handled accurately by the DOM parser.
     *
     * @param string $string Raw HTML.
     *
     * @return string The cleaned-up HTML.
     */
    private function preparSource($string)
    {
        // Remove the first charset declaration (with or without a trailing
        // semicolon) so it cannot contradict the encoding forced on the parser.
        $string = preg_replace('/charset=([\w|\-]+);?/', "", $string, 1);

        // Replace all doubled-up <br> tags with paragraph breaks, and remove fonts.
        $string = preg_replace('/<br\/?>\s*<br\/?>/i', "</p><p>", $string);
        $string = preg_replace('/<\/?font[^>]*>/i', "", $string);

        // @see https://github.com/feelinglucky/php-readability/issues/7
        //   - from http://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content
        $string = preg_replace('#<script(.*?)>(.*?)</script>#is', "", $string);

        return trim($string);
    }

    /**
     * Deletes all $TagName elements below $RootNode.
     *
     * @param DOMDocument $RootNode Document (or element) to clean.
     * @param string      $TagName  Lower-case tag name to remove.
     *
     * @return DOMDocument The same node that was passed in.
     */
    private function removeJunkTag($RootNode, $TagName)
    {
        $Tags = $RootNode->getElementsByTagName($TagName);

        // Note: always index 0, because removing a tag removes it from the
        // (live) result list as well.
        while ($Tag = $Tags->item(0)) {
            $parentNode = $Tag->parentNode;
            $parentNode->removeChild($Tag);
        }

        return $RootNode;
    }

    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @param DOMDocument $RootNode Document (or element) to clean.
     * @param string      $Attr     Lower-case attribute name to remove.
     *
     * @return DOMDocument The same node that was passed in.
     */
    private function removeJunkAttr($RootNode, $Attr)
    {
        $Tags = $RootNode->getElementsByTagName("*");

        $i = 0;
        while ($Tag = $Tags->item($i++)) {
            $Tag->removeAttribute($Attr);
        }

        return $RootNode;
    }

    /**
     * Determines the element that holds the main content of the page.
     *
     * Every paragraph adds to the score of its parent element; the parent
     * with the highest positive score wins.
     * Scoring algorithm from: http://code.google.com/p/arc90labs-readability/
     *
     * @return DOMNode|null The best-scoring element, or null when none qualifies.
     */
    private function getTopBox()
    {
        // Start from a clean candidate list so repeated calls do not keep
        // growing it.
        $this->parentNodes = array();

        // Get all the paragraphs of the page.
        $allParagraphs = $this->DOM->getElementsByTagName("p");

        // Study the paragraphs and find the chunk that has the best score.
        // A score is determined by the parent's class name and id and by the
        // amount of text in the paragraph.
        $i = 0;
        while ($paragraph = $allParagraphs->item($i++)) {
            $parentNode = $paragraph->parentNode;
            if (!$parentNode instanceof DOMElement) {
                // Only elements can carry the score attribute.
                continue;
            }

            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));
            $className    = $parentNode->getAttribute("class");
            $id           = $parentNode->getAttribute("id");

            // Look for a special class name.
            if (preg_match('/(comment|meta|footer|footnote)/i', $className)) {
                $contentScore -= 50;
            } elseif (preg_match(
                '/((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i',
                $className
            )) {
                $contentScore += 25;
            }

            // Look for a special id.
            if (preg_match('/(comment|meta|footer|footnote)/i', $id)) {
                $contentScore -= 50;
            } elseif (preg_match(
                '/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i',
                $id
            )) {
                $contentScore += 25;
            }

            // Paragraphs with more than 10 bytes of text add their byte
            // length to the score.
            $textLength = strlen($paragraph->nodeValue);
            if ($textLength > 10) {
                $contentScore += $textLength;
            }

            // Save the score on the parent element.
            $parentNode->setAttribute(self::ATTR_CONTENT_SCORE, (string) $contentScore);

            // Remember the parent so the winner can be picked quickly below.
            $this->parentNodes[] = $parentNode;
        }

        $topBox = null;

        for ($i = 0, $len = count($this->parentNodes); $i < $len; $i++) {
            $parentNode      = $this->parentNodes[$i];
            $contentScore    = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));
            $orgContentScore = intval($topBox ? $topBox->getAttribute(self::ATTR_CONTENT_SCORE) : 0);

            // A candidate needs a non-zero score that beats the current best.
            if ($contentScore && $contentScore > $orgContentScore) {
                $topBox = $parentNode;
            }
        }

        // At this point $topBox is the element judged to hold the main content.
        return $topBox;
    }

    /**
     * Gets the title of the HTML page.
     *
     * When the <title> contains " - " separators (e.g. "Article - Site"),
     * only the part before the first separator is returned.
     *
     * @return string|null The title, or null when the page has no <title>.
     */
    public function getTitle()
    {
        // Spaces around the dash keep hyphenated words intact.
        $split_point = ' - ';
        $titleNodes  = $this->DOM->getElementsByTagName("title");

        if ($titleNodes->length && $titleNode = $titleNodes->item(0)) {
            // @see http://stackoverflow.com/questions/717328/how-to-explode-string-right-to-left
            $title  = trim($titleNode->nodeValue);
            $result = array_map('strrev', explode($split_point, strrev($title)));

            return count($result) > 1 ? array_pop($result) : $title;
        }

        return null;
    }

    /**
     * Gets the URL of the leading (first) image below $node.
     *
     * @param DOMDocument|DOMElement $node Node to search for <img> elements.
     *
     * @return string|null The "src" of the first image, or null when there is none.
     */
    public function getLeadImageUrl($node)
    {
        $images = $node->getElementsByTagName("img");

        if ($images->length && $leadImage = $images->item(0)) {
            return $leadImage->getAttribute("src");
        }

        return null;
    }

    /**
     * Gets the main content of the page.
     *
     * @return array|false Array with the keys "lead_image_url", "word_count",
     *                     "title" and "content"; false when no DOM is available.
     *
     * @throws RuntimeException When no suitable content element was found.
     */
    public function getContent()
    {
        if (!$this->DOM) {
            return false;
        }

        // Get the page title.
        $ContentTitle = $this->getTitle();

        // Get the main content element of the page.
        $ContentBox = $this->getTopBox();

        // Check whether a suitable top box was found.
        if ($ContentBox === null) {
            throw new RuntimeException(self::MESSAGE_CAN_NOT_GET);
        }

        // Copy the content into a new DOMDocument.
        $Target = new DOMDocument;
        $Target->appendChild($Target->importNode($ContentBox, true));

        // Delete unwanted tags.
        foreach ($this->junkTags as $tag) {
            $Target = $this->removeJunkTag($Target, $tag);
        }

        // Delete unwanted attributes.
        foreach ($this->junkAttrs as $attr) {
            $Target = $this->removeJunkAttr($Target, $attr);
        }

        // Turn the entities written by saveHTML() back into real characters.
        // NOTE(review): 'HTML-ENTITIES' is deprecated since PHP 8.2 (see constructor).
        $content = mb_convert_encoding($Target->saveHTML(), self::DOM_DEFAULT_CHARSET, 'HTML-ENTITIES');

        // Several pieces of data, returned as an array.
        return array(
            'lead_image_url' => $this->getLeadImageUrl($Target),
            'word_count'     => mb_strlen(strip_tags($content), self::DOM_DEFAULT_CHARSET),
            'title'          => $ContentTitle ? $ContentTitle : null,
            'content'        => $content,
        );
    }

    public function __destruct()
    {
    }
}