package xstreamtest;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.DomDriver;
Public class extractorxpathconfig {
Public static void main (string [] ARGs ){
Extractorxpathconfig. Write ();
Extractorxpathconfig. Read ();
}
Public static void write (){
Xstream Sm = new xstream (New domdriver ());
Filterconfig fc = new filterconfig ();
List <seedconfig> seedconfig = new arraylist <seedconfig> ();
List <extractorconfig> extratorconfig = new arraylist <extractorconfig> ();
List <writerconfig> writerconfig = new arraylist <writerconfig> ();
Seedconfig SC = new seedconfig ("http://www.qq.com /");
Seedconfig SC1 = new seedconfig ("http://www.sina.com /");
Seedconfig. Add (SC );
Seedconfig. Add (SC1 );
Extractorconfig EC = new extractorconfig (
"Only capture and configure DIV/A/@ href | DIV/H1/A/@ href", "Only parse and configure DIV/A/@ href | ",
"That is, configure DIV/A/@ href by capturing and parsing ");
Extratorconfig. Add (EC );
Writerconfig WC = new writerconfig ("singername ",
"DIV/A/@ href | DIV/H1//");
Writerconfig wc1 = new writerconfig ("singergender ",
"DIV/A/@ href | DIV/H1/Gender ");
Writerconfig wc2 = new writerconfig ("singerage ",
"DIV/A/@ href | DIV/H1/age ");
Writerconfig wc3 = new writerconfig ("singercountry ",
"DIV/A/@ href | DIV/H1/age ");
Wc3.setotherconfig ("Fujian Provincial third-party ");
Writerconfig. Add (WC );
Writerconfig. Add (wc1 );
Writerconfig. Add (wc2 );
Writerconfig. Add (wc3 );
FC. setextratorconfig (extratorconfig );
FC. setseedconfig (seedconfig );
FC. setwriterconfig (writerconfig );
Try {
Outputstream out = new fileoutputstream (new file ("xpathconfig. xml "));
Outputstreamwriter writer = new outputstreamwriter (Out, charset
. Forname ("UTF-8 "));
Writer. Write ("/N ");
SM. toxml (FC, writer );
Out. Close ();
} Catch (exception e ){
E. printstacktrace ();
}
}
Public static void read (){
Xstream Sm = new xstream (New domdriver ());
Try {
Inputstream in = new fileinputstream (new file ("xpathconfig. xml "));
Inputstreamreader reader = new inputstreamreader (in, charset
. Forname ("UTF-8 "));
Filterconfig fc = (filterconfig) Sm. fromxml (in );
System. Out. println (FC. getcharset ());
List <extractorconfig> extratorconfig = FC. getextratorconfig ();
For (extractorconfig EC: extratorconfig ){
System. Out. println (EC. getbothurls ());
System. Out. println (EC. getfetchurchill ());
System. Out. println (EC. getwriteurls ());
}
In. Close ();
} Catch (exception e ){
E. printstacktrace ();
}
}
}
/**
* Configure the template class
*
* @ Author ffshi
*
*/
Class filterconfig {
Private list <seedconfig> seedconfig;
Private list <extractorconfig> extratorconfig;
Private list <writerconfig> writerconfig;
Private string charset = "UTF-8 ";
Public String getcharset (){
Return charset;
}
Public void setcharset (string charset ){
This. charset = charset;
}
Public filterconfig (){
Seedconfig = new arraylist <seedconfig> ();
Extratorconfig = new arraylist <extractorconfig> ();
Writerconfig = new arraylist <writerconfig> ();
}
Public list <seedconfig> getseedconfig (){
Return seedconfig;
}
Public void setseedconfig (list <seedconfig> seedconfig ){
This. seedconfig = seedconfig;
}
Public list <extractorconfig> getextratorconfig (){
Return extratorconfig;
}
Public void setextratorconfig (list <extractorconfig> extratorconfig ){
This. extratorconfig = extratorconfig;
}
Public list <writerconfig> getwriterconfig (){
Return writerconfig;
}
Public void setwriterconfig (list <writerconfig> writerconfig ){
This. writerconfig = writerconfig;
}
}
/**
 * Bean holding a single seed page entry.
 *
 * @author ffshi
 */
class seedconfig {
    private String seed;

    /** Creates an entry with no seed set ({@code null}). */
    public seedconfig() {
    }

    /**
     * Creates an entry for the given seed.
     *
     * @param seed the seed value to store
     */
    public seedconfig(String seed) {
        this.seed = seed;
    }

    /** @return the stored seed, or {@code null} if none was set */
    public String getseed() {
        return seed;
    }

    /** @param seed the seed value to store */
    public void setseed(String seed) {
        this.seed = seed;
    }
}
/**
 * URL filtering configuration: three rule strings, one for URLs that are
 * only fetched, one for URLs that are only written, and one for both.
 *
 * <p>NOTE(review): "fetchills" is presumably a garbled "fetchurls" (compare
 * the sibling names writeurls / bothurls); the spelling is kept because the
 * accessors and the serialized element use it - confirm before renaming.
 *
 * @author ffshi
 */
class extractorconfig {
    private String fetchills;
    private String writeurls;
    private String bothurls;

    /** Creates a configuration with all three rules unset ({@code null}). */
    public extractorconfig() {
    }

    /**
     * Creates a configuration with all three rules set.
     *
     * @param fetchills rule for the fetch-only case
     * @param writeurls rule for the write-only case
     * @param bothurls  rule for the fetch-and-write case
     */
    public extractorconfig(String fetchills, String writeurls, String bothurls) {
        this.fetchills = fetchills;
        this.writeurls = writeurls;
        this.bothurls = bothurls;
    }

    /** @return the fetch-only rule */
    public String getfetchills() {
        return fetchills;
    }

    /** @param fetchills the fetch-only rule to store */
    public void setfetchills(String fetchills) {
        this.fetchills = fetchills;
    }

    /** @return the write-only rule */
    public String getwriteurls() {
        return writeurls;
    }

    /** @param writeurls the write-only rule to store */
    public void setwriteurls(String writeurls) {
        this.writeurls = writeurls;
    }

    /** @return the fetch-and-write rule */
    public String getbothurls() {
        return bothurls;
    }

    /** @param bothurls the fetch-and-write rule to store */
    public void setbothurls(String bothurls) {
        this.bothurls = bothurls;
    }
}
/**
 * Configuration for one structured-extraction field: a field name, the XPath
 * used for it, an optional free-form extra setting and a boolean flag.
 *
 * @author ffshi
 */
class writerconfig {
    private String fieldname;
    private String fieldxpath;
    private String otherconfig;
    // Primitive on purpose: an unset flag is false rather than null.
    private boolean bool;

    /** Creates a configuration with no values set and the flag false. */
    public writerconfig() {
    }

    /**
     * Creates a configuration for the given field.
     *
     * @param fieldname  name of the field
     * @param fieldxpath XPath expression associated with the field
     */
    public writerconfig(String fieldname, String fieldxpath) {
        this.fieldname = fieldname;
        this.fieldxpath = fieldxpath;
    }

    /** @return the current value of the flag */
    public boolean isbool() {
        return bool;
    }

    /** @param bool the new value of the flag */
    public void setbool(boolean bool) {
        this.bool = bool;
    }

    /** @return the field name */
    public String getfieldname() {
        return fieldname;
    }

    /** @param fieldname the field name to store */
    public void setfieldname(String fieldname) {
        this.fieldname = fieldname;
    }

    /** @return the XPath expression for the field */
    public String getfieldxpath() {
        return fieldxpath;
    }

    /** @param fieldxpath the XPath expression to store */
    public void setfieldxpath(String fieldxpath) {
        this.fieldxpath = fieldxpath;
    }

    /** @return the extra setting, or {@code null} if none was set */
    public String getotherconfig() {
        return otherconfig;
    }

    /** @param otherconfig the extra setting to store */
    public void setotherconfig(String otherconfig) {
        this.otherconfig = otherconfig;
    }
}
The generated XML format is as follows:
<xstreamtest.filterconfig>
  <seedconfig>
    <xstreamtest.seedconfig>
      <seed>http://www.qq.com/</seed>
    </xstreamtest.seedconfig>
    <xstreamtest.seedconfig>
      <seed>http://www.sina.com/</seed>
    </xstreamtest.seedconfig>
  </seedconfig>
  <extratorconfig>
    <xstreamtest.extractorconfig>
      <fetchills>only configure DIV/A/@ href | DIV/H1/A/@ href</fetchills>
      <writeurls>only parse and configure DIV/A/@ href |</writeurls>
      <bothurls>Configure DIV/A/@ href by capturing and parsing functions</bothurls>
    </xstreamtest.extractorconfig>
  </extratorconfig>
  <writerconfig>
    <xstreamtest.writerconfig>
      <fieldname>singername</fieldname>
      <fieldxpath>DIV/A/@ href | DIV/H1/A/</fieldxpath>
      <bool>false</bool>
    </xstreamtest.writerconfig>
    <xstreamtest.writerconfig>
      <fieldname>singergender</fieldname>
      <fieldxpath>DIV/A/@ href | DIV/H1/gender</fieldxpath>
      <bool>false</bool>
    </xstreamtest.writerconfig>
    <xstreamtest.writerconfig>
      <fieldname>singerage</fieldname>
      <fieldxpath>DIV/A/@ href | DIV/H1/age</fieldxpath>
      <bool>false</bool>
    </xstreamtest.writerconfig>
    <xstreamtest.writerconfig>
      <fieldname>singercountry</fieldname>
      <fieldxpath>DIV/A/@ href | DIV/H1/age</fieldxpath>
      <otherconfig>Fujian Provincial third-party</otherconfig>
      <bool>false</bool>
    </xstreamtest.writerconfig>
  </writerconfig>
  <charset>UTF-8</charset>
</xstreamtest.filterconfig>