Shiyanlou PHP contest challenge: web page data extraction.
Challenge URL: https://www.shiyanlou.com/contests/lou5/challenges
The following code is a solution to the challenge:
<?php
header("Content-type:text/html;charset=utf-8");

/**
 * Extracts course names, descriptions and types from a saved HTML page and
 * stores one row per course in the MySQL table `course_data`.
 *
 * Typical use: loadFile() followed by parseContent(), which runs every
 * parsing step and then saveData().
 *
 * PHP class and method names are case-insensitive, so existing callers that
 * use spellings such as loadfile() or parsecontent() keep working.
 */
class Crawler
{
    /** Course names longer than this many characters are flagged as long. */
    const MAX_TITLE_LENGTH = 16;

    /** @var string HTML currently being parsed (narrowed to <body> by parseCourseBody()). */
    private $content = '';

    /**
     * Parsed columns, keyed by 'cnames', 'cdescs', 'ctypes' and 'nlongs'.
     * Each entry is a list indexed by course position on the page.
     *
     * @var array
     */
    private $data = [];

    /** @var PDO|null Connection that is open only while saveData() runs. */
    private static $mysql;

    public function __construct()
    {
        echo "starts crawling content ....";
    }

    /**
     * Reads the HTML document to parse.
     *
     * @param string $file_path Path (or URL wrapper) of the page to load.
     *
     * @throws RuntimeException When the file cannot be read.
     */
    public function loadFile($file_path)
    {
        echo "Loading file";

        // Check first so a missing file yields one clear exception rather
        // than a PHP warning followed by parsing of `false`.
        $html = is_readable($file_path) ? file_get_contents($file_path) : false;
        if ($html === false) {
            throw new RuntimeException("Unable to read file: {$file_path}");
        }

        $this->content = $html;
    }

    /**
     * Narrows the loaded document to its <body> element.
     *
     * The content is left untouched when no <body> element is found.
     */
    public function parseCourseBody()
    {
        $regex = '/<body[^>]*?>(.*\s*?)<\/body>/is';

        // preg_match (not preg_match_all) so that $matches[0] is a string;
        // storing an array here would break every later regex on the content.
        if (preg_match($regex, $this->content, $matches)) {
            $this->content = $matches[0];
        }
    }

    /**
     * Runs the full pipeline: parse every column, then write to the database.
     *
     * @throws PDOException When the database cannot be reached or written to.
     */
    public function parseContent()
    {
        echo "begins parsing content ...<br/>";
        $this->parseCourseBody();
        $this->parseTitle();
        $this->parseDesc();
        $this->parseType();
        $this->titleIsLong();
        $this->saveData();
        echo "Parsing content end!<br/>";
    }

    /**
     * Inserts one row per parsed course into `course_data`.
     *
     * Values come from scraped HTML, so they are bound as statement
     * parameters instead of being concatenated into the SQL text.
     *
     * @throws PDOException When connecting or inserting fails.
     */
    public function saveData()
    {
        echo "into database ...<br/>";

        $cnames = $this->column('cnames');
        if ($cnames === []) {
            return; // Nothing was parsed, so there is nothing to store.
        }
        $cdescs = $this->column('cdescs');
        $ctypes = $this->column('ctypes');
        $nlongs = $this->column('nlongs');

        self::$mysql = new PDO(
            'mysql:host=localhost;dbname=Databases;charset=utf8',
            'root',
            'root',
            [
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                PDO::ATTR_EMULATE_PREPARES => false,
            ]
        );

        try {
            $insert = self::$mysql->prepare(
                'INSERT INTO `course_data` (`cname`, `cdesc`, `ctype`, `nlong`) VALUES (?, ?, ?, ?)'
            );

            foreach ($cnames as $key => $cname) {
                $insert->execute([
                    $cname,
                    isset($cdescs[$key]) ? $cdescs[$key] : null,
                    isset($ctypes[$key]) ? $ctypes[$key] : null,
                    isset($nlongs[$key]) ? $nlongs[$key] : null,
                ]);
            }
        } finally {
            // Dropping the last reference closes the PDO connection.
            self::$mysql = null;
        }
    }

    /**
     * Collects the content of every <div class="course-name"> into 'cnames'.
     */
    public function parseTitle()
    {
        echo "resolves the course title ...<br/>";
        $this->data['cnames'] = $this->extractDivs('course-name');
    }

    /**
     * Collects the content of every <div class="course-desc"> into 'cdescs'.
     */
    public function parseDesc()
    {
        echo "Parsing course introduction ...<br/>";
        $this->data['cdescs'] = $this->extractDivs('course-desc');
    }

    /**
     * Derives the course type from every <div class="course-footer">.
     *
     * The type is all CJK characters found in the footer, concatenated;
     * a footer without any CJK character is recorded as "Free".
     */
    public function parseType()
    {
        echo "parses the course type ...<br/>";

        $ctypes = [];
        foreach ($this->extractDivs('course-footer') as $footer) {
            if (preg_match_all('/[\x{4e00}-\x{9fa5}]+/u', $footer, $match)) {
                $ctypes[] = implode('', $match[0]);
            } else {
                $ctypes[] = "Free";
            }
        }

        $this->data['ctypes'] = $ctypes;
    }

    /**
     * Flags each parsed course name as 'true' or 'false' in 'nlongs',
     * depending on whether it exceeds MAX_TITLE_LENGTH characters.
     */
    public function titleIsLong()
    {
        echo "Determines whether the course name is ...<br/>";

        $nlongs = [];
        foreach ($this->column('cnames') as $cname) {
            // Count characters, not bytes: course names may be multi-byte.
            $nlongs[] = mb_strlen($cname, 'UTF-8') > self::MAX_TITLE_LENGTH ? "true" : "false";
        }

        $this->data['nlongs'] = $nlongs;
    }

    /**
     * Returns everything parsed so far.
     *
     * @return array Lists keyed by 'cnames', 'cdescs', 'ctypes', 'nlongs'
     *               (only the keys whose parsing step has already run).
     */
    public function getData()
    {
        return $this->data;
    }

    /**
     * Returns the inner HTML of every <div> whose class attribute is exactly
     * $class, in document order.
     *
     * Matching stops at the first closing </div>, so a div nested inside the
     * target element truncates the captured content.
     *
     * @param string $class Value of the class attribute to look for.
     *
     * @return string[] Empty when nothing matches.
     */
    private function extractDivs($class)
    {
        $regex = '/<div class="' . preg_quote($class, '/') . '"[^>]*>(.*?)<\/div>/is';

        if (!preg_match_all($regex, $this->content, $matches)) {
            return [];
        }

        return $matches[1];
    }

    /**
     * Returns one parsed column, or an empty list when it has not been parsed.
     *
     * @param string $name Key inside $this->data.
     *
     * @return string[]
     */
    private function column($name)
    {
        return isset($this->data[$name]) ? $this->data[$name] : [];
    }
}

try {
    $Crawler = new Crawler();
    $Crawler->loadFile("test.html");
    $Crawler->parseContent();
} catch (Exception $e) {
    // Report the failure on the page instead of dying with an uncaught exception.
    echo "Crawl failed: " . htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8') . "<br/>";
}

/**
 * Table structure
 *
 * cname (varchar): full course name
 * cdesc (varchar): course description
 * ctype (varchar): course type; values are free, member, training camp
 * nlong (enum('true', 'false')): whether the course name is too long;
 *       'true' if the course name exceeds 16 characters, otherwise 'false'
 *
 * CREATE TABLE `course_data` (
 *   `id` int(11) NOT NULL AUTO_INCREMENT,
 *   `cname` varchar(255) DEFAULT NULL,
 *   `cdesc` varchar(255) DEFAULT NULL,
 *   `ctype` varchar(255) DEFAULT NULL,
 *   `nlong` enum('true', 'false') DEFAULT NULL,
 *   PRIMARY KEY (`id`)
 * ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 */
Shiyanlou PHP contest challenge: web page data extraction.