This is a small program that extracts text from a batch of news pages. It can save the news content as a text file named after the news title. If there is a better way to do this, please contact me: lwx3069@sina.com. The example here uses the news under "today's news" on People's Network.
This is a small program that extracts text from a batch of news pages. It can save the news content as a text file named after the news title. If you have a better solution, please contact me:
lwx3069@sina.com
Here we take the news under "today's news" on People's Network as an example.
// NOTE(review): this listing appears to be PHP whose text was damaged before it
// reached this file: keywords are capitalized, the dollar sign is separated from
// variable names, variable case is inconsistent, and several explode() and
// str_replace() calls have no first argument. It will not parse as written; the
// comments below describe what each statement appears to do and flag what has
// to be restored.
//
// Appears to fall back to the index page of the news channel when $url is not
// already set -- the left-hand side of this ternary is garbled; TODO confirm.
($ Url )? ": $ Url =" http://www.unn.com.cn/GB/channel2/3/11/index.html "; // today's news
// Continue only when $url is set and is not the empty string.
// NOTE(review): a single & is used here rather than && -- confirm intent.
If (isset ($ url) & $ url! = ""){
// Read the index page with file() and join its lines into one string.
$ Str = implode ("", file ($ url ));
// Split the page on the delimiter below.
// NOTE(review): as written the delimiter is a bare line break; presumably an
// HTML marker was lost here -- verify against the original page.
$ Str_ary = explode ("
", $ Str );
// Split the trimmed second piece again; as written this delimiter is a line
// break followed by a dash and a space -- TODO confirm the original marker.
$ Str_ary = explode ("
- ", Trim ($ str_ary [1]);
// Walk the first eight entries of the split list.
For ($ I = 0; $ I <8; $ I ++ ){
// Skip entries that are too short to be a news item.
// NOTE(review): as parenthesized, the <3 comparison sits inside the argument
// of strlen(); presumably strlen(trim(...)) < 3 was intended -- confirm.
If (strlen (trim ($ str_ary [$ I]) <3 ){
Continue;
}
// Print the raw entry together with its index.
Echo "news". $ I. ":". $ str_ary [$ I];
// The next statements (down to the line that prefixes the site root) appear to
// cut a relative link out of the entry: two strstr() results are taken, their
// lengths are subtracted, and substr() keeps the text from offset 10 up to that
// length difference minus 10. A non-empty result is then prefixed with the site
// root to form the article URL.
// NOTE(review): the first strstr() call has lost its needle argument and the
// quote opened before target is never closed, so the markers that delimit the
// link cannot be read from this listing -- restore from the original.
$ Str1 = strstr ("$ str_ary [$ I]", $ str2 = strstr ("$ str_ary [$ I]", "target );
$ Len1 = strlen ("$ str1 ");
$ Len2 = strlen ("$ str2 ");
$ Len = $ len1-$ len2;
$ Url = substr ("$ str1", 10, $ len-10 );
If (strlen (trim ($ url ))! = 0 ){
$ Url = "http://www.unn.com.cn/". $ url;
// Directory prefix used below when opening the output file.
// NOTE(review): the constant name is an unquoted bareword and the path has a
// space before the closing slash -- both look like text damage; confirm.
Define (CONTENTS_DIR, "./contents /");
// Same set / non-empty guard as above, now applied to the article URL.
If (isset ($ url) & $ url! = ""){
// Fetch the article page and join its lines into one string.
$ Str = implode ("", file ($ url ));
// NOTE(review): the explode() calls below have lost their delimiter arguments;
// the existing trailing comments record what each split was meant to isolate.
$ Str1 = explode (, $ str); // remove the useless upper part of the file.
$ Str2 = explode (, $ str1 [1]);
// Retrieve the lower part of the file and remove the useless lower part. all the results are useful.
$ Str3 = explode (, $ Str2 [0]); // extracts the file title and body from the entire useful part.
$ Str4 = explode (, $ str2 [0]); // Retrieve the date and time
$ Str5 = explode (, $ Str3 [1]); // extracts the title from the title and body
// Replace the delimiter below with a space inside the title piece; as written
// the search string is a bare line break -- TODO confirm.
$ Title = str_replace ("
"," ", $ Str5 [0]);
// Re-split the useful part to isolate the body, then run two str_replace()
// passes over it. NOTE(review): the search arguments of both passes are missing.
$ Str3 = explode (
, $ Str2 [0]); // extracts the object body from the entire useful part
$ Str3 [1] = str_replace (
, "". "", $ Str3 [1]);
$ Str3 [1] = str_replace (, "", $ str3 [1]);
// Strip markup tags from the body; from here on the variable holds a plain
// string instead of the array produced by explode().
$ Str3 = strip_tags ($ str3 [1]);
// Output file name: the trimmed title followed by the suffix exactly as written.
$ Pf = trim ($ title). ". txt ";
// Open the output file under CONTENTS_DIR.
// NOTE(review): the mode is the bareword w, the returned handle is not checked,
// and no fclose() call appears in the visible code.
$ Ppf = fopen (CONTENTS_DIR. "$ pf", w );
// Write the title, then the date/time piece, then the body text.
Fputs ($ ppf, $ title );
Fputs ($ ppf, "$ str4 [0]");
Fputs ($ ppf, $ str3 );
}
}
}
}
?>