This article presents a small program that extracts the text from a batch of news pages and saves each news item as a text file, using the news title as the filename. If you know a better way to do this, please contact me:
Lwx3069@sina.com
The example below takes its news from the "Today's News" section of the daily.
<?php
// Batch news extractor.
//
// Reads a headline index page, prints up to eight headlines, follows the link
// in each headline, cuts the title, date line and body text out of the article
// markup, and writes them to "<title>.txt" inside Contents_dir.
//
// The delimiters below mirror the markup the article pages used when this
// script was written; they are plain string markers, not an HTML parser.

// Index page to scrape ("Today's News"). A script that includes this one may
// preset $url; otherwise the default below is used.
if (empty($url)) {
    $url = "http://www.unn.com.cn/GB/channel2/3/11/index.html";
}

// Output directory. Defined once, outside the loop, with a quoted name: the
// bare-word form inside the loop redefined the constant on every iteration.
if (!defined('Contents_dir')) {
    define('Contents_dir', "./contents/");
}
if (!is_dir(Contents_dir)) {
    mkdir(Contents_dir, 0777, true);
}

// Marker that begins the unwanted lower part of an article page.
// NOTE(review): the delimiter was missing from the published listing, so it
// cannot be reconstructed here. While it is empty the lower part is left in
// place (tags are still stripped from the body); set it to the markup that
// starts the page footer to cut that part off.
$articleEndMarker = '';

// Split the index page into headline items: everything after the first <ul>,
// broken on <li>.
$headlines = array();
$indexHtml = file_get_contents($url);
if ($indexHtml !== false) {
    $indexParts = explode("<ul>", $indexHtml);
    if (isset($indexParts[1])) {
        $headlines = explode("<li>", trim($indexParts[1]));
    }
}

// At most eight items are processed, but never more than the page provides.
$limit = min(8, count($headlines));

for ($i = 0; $i < $limit; $i++) {
    $item = $headlines[$i];

    // Fragments shorter than three bytes are leftovers of the split, not headlines.
    if (strlen($item) < 3) {
        continue;
    }
    echo "News", $i . " : " . $item;

    // Site-relative link of the headline: the text between <a href="/ and the
    // closing quote. Items without a usable link are skipped.
    if (!preg_match('~<a href="/([^"]*)"~', $item, $linkMatch) || $linkMatch[1] === '') {
        continue;
    }
    $articleUrl = "http://www.unn.com.cn/" . $linkMatch[1];

    $articleHtml = file_get_contents($articleUrl);
    if ($articleHtml === false) {
        continue;
    }

    // Drop the upper part of the page: keep what follows the right-aligned div.
    $upperSplit = explode('<div align="right">', $articleHtml);
    if (!isset($upperSplit[1])) {
        continue;
    }
    $useful = $upperSplit[1];

    // Drop the lower part of the page when an end marker is configured.
    if ($articleEndMarker !== '') {
        $lowerSplit = explode($articleEndMarker, $useful);
        $useful = $lowerSplit[0];
    }

    // Date and time: the text in front of the first </div>.
    $dateSplit = explode('</div>', $useful);
    $dateLine = $dateSplit[0];

    // Title: between the heading's opening and closing font tags.
    $headingSplit = explode('</font><font size="+2"><b><font size="3">', $useful);
    if (!isset($headingSplit[1])) {
        continue;
    }
    $titleSplit = explode('</font></b></font><font size="2">', $headingSplit[1]);
    $title = str_replace("<br>", "", $titleSplit[0]);

    // Body: everything after the first body paragraph marker.
    $bodySplit = explode('<p><font size="2">', $useful);
    if (!isset($bodySplit[1])) {
        continue;
    }
    $body = str_replace('<br><br>', "\n", $bodySplit[1]);
    // NOTE(review): the search string of this replacement was lost in the
    // published listing; '&nbsp;' is presumed - confirm against the pages.
    $body = str_replace('&nbsp;', ' ', $body);
    $body = strip_tags($body);

    // The title comes from a remote page, so it must not be trusted as a path:
    // remove markup and replace characters that are unsafe in file names.
    $fileBase = trim(preg_replace('~[\\\\/:*?"<>|\x00-\x1F]+~', '_', strip_tags($title)));
    if ($fileBase === '') {
        continue;
    }
    $fileName = $fileBase . ".txt";

    $handle = fopen(Contents_dir . $fileName, 'w');
    if ($handle === false) {
        continue;
    }
    fputs($handle, $title);
    fputs($handle, $dateLine);
    fputs($handle, $body);
    fclose($handle);
}
?> Reposted from: Dynamic Network Production Guide (www.knowsky.com)