// The code is as follows:
// Collect the plain text of every node matched by body_filter: each matched
// node is re-parsed from its own HTML and walked with a TextExtractingVisitor,
// and the extracted text is appended to `body`.
NodeList body_nodes = this.getParser().parse(body_filter);
for (int i = 0; i < body_nodes.size(); i++)
{
    Node node = body_nodes.elementAt(i);
    // Parse just this node's HTML so the visitor only sees this fragment.
    Parser body_parser = new Parser(node.toHtml());
    TextExtractingVisitor visitor = new TextExtractingVisitor();
    body_parser.visitAllNodesWith(visitor);
    body.append(visitor.getExtractedText());
}
TextExtractingVisitor, visitAllNodesWith, and the related classes and methods are important but seldom-seen parts of the visitor API.
// The full source code is attached below:
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.extractor.Extractor;
Public class extractorhangdian extends extractor {
Public void extract ()
{
Bufferedwriter BW = NULL;
String indextime;
String title;
Stringbuffer body = new stringbuffer ();;
Nodefilter time_filter = new andfilter (New tagnamefilter ("font"), new hasattributefilter ("color", "#808080 "));
Nodefilter title_filter1 = new andfilter (New tagnamefilter ("TD"), new haschildfilter (New tagnamefilter ("B ")));
Nodefilter body_filter = new andfilter (New tagnamefilter ("TD"), new haschildfilter (New tagnamefilter ("p ")));
Try
{
Nodelist title_nodes = This. getparser (). parse (title_filter1 );
Node node = title_nodes.elementat (0 );
Nodelist node2 = node. getchildren ();
// Title = node2.elementat (0). tohtml ();/* '\ r \ n '*/
// Title = node2.elementat (1). tohtml ();/* Font color = "#000080" style = "font-size: 14.4px */
// Title = node2.elementat (2). tohtml ();/* B */
Title = node2.elementat (3). tohtml ();/* notice of textbook subscription and instructor registration */
BW = new bufferedwriter (New filewriter (new file (this. getoutputpath () + title + ". txt ")));
String url_seg1 = getinputfilepath (). substring (3,30 );
Int end = getinputfilepath (). lastindexof (".");
String url_seg2 = getinputfilepath (). substring (30, end );
String url_seg = url_seg1 + ". asp? "+ Url_seg2;
Url_seg = url_seg.replaceall ("\\\\","/");
String url = "http: //" + url_seg;
Bw. Write (URL + newline );
Bw. Write (Title + newline );
}
Catch (exception E)
{
E. printstacktrace ();
}
This. getparser (). Reset ();
Try
{
Nodelist time_nodes = This. getparser (). parse (time_filter );
Node time_node = time_nodes.elementat (1); // here "1" indicates the second element that conforms to time_filter
Indextime = time_node.getnextsibling (). tohtml ();
Bw. Write (indextime + newline );
}
Catch (exception E)
{
E. printstacktrace ();
}
This. getparser (). Reset (); // obtain all TXT text except the tag.
Try
{
Nodelist body_nodes = This. getparser (). parse (body_filter );
For (INT I = 0; I <body_nodes.size (); I ++)
{
Node node = body_nodes.elementat (I );
Parser body_parser = new Parser (node. tohtml ());
Textextractingvisitor visitor = new textractingvisitor ();
Body_parser.visitallnodeswith (visitor );
Body. append (visitor. getextractedtext ());
}
Bw. Write (body + newline );
}
Catch (exception E)
{
E. printstacktrace ();
}
Try
{
If (BW! = NULL)
Bw. Close ();
} Catch (ioexception E)
{
E. printstacktrace ();
}
}
}
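By the way: I originally forgot to close bw, and could not work out why nothing could be read from the output file (a BufferedWriter holds its output in a buffer until it is flushed or closed). It frustrated me for several days, and it still annoys me to think about it. Pay attention to this!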