Droids is a project that is still in the Apache Incubator, but it is already usable.
1. SVN must be installed
2. Install Maven (see the README for installation instructions).
3. The original tutorial seems to be unavailable; using a combination of NetBeans and Eclipse works fine.
4. Check out the source with SVN, run `mvn clean install`, and then open NetBeans.
5. In NetBeans, open the project. Some modules will report missing dependencies: select the Libraries folder, right-click, and choose to download the missing dependencies. NetBeans fetches them automatically, which is really useful and, in my experience, more convenient than Eclipse.
6. You can configure the spelling and encoding options to get code-completion prompts, which is nice.
7. An entity represents the content of a web page. In the HttpProtocol class, the load method returns it:
Public managedcontententity load (URI) throws ioexception {
Httpget = new httpget (URI );
Httpresponse response = httpclient.exe cute (httpget );
Statusline = response. getstatusline ();
If (statusline. getstatuscode ()> = httpstatus. SC _bad_request ){
Httpget. Abort ();
Throw new httpresponseexception (
Statusline. getstatuscode (), statusline. getreasonphrase ());
}
Httpentity entity = response. getentity ();
If (entity = NULL ){
// Shold _ almost _ never happen with http get requests.
Throw new clientprotocolexception ("Empty entity ");
}
Long maxlen = httpclient. getparams (). getlongparameter (droidshttpclient. max_body_length, 0 );
Return new httpcontententity (entity, maxlen );
}
8. The org.apache.droids.robot.crawler package contains two types: CrawlingDroid and CrawlingWorker.
CrawlingDroid definition:
Public abstract class crawlingdroid extends actdroid <link>
{
Private collection <string> initiallocations;
Protocolfactory;
Parserfactory;
Urlfiltersfactory filtersfactory;
Public crawlingdroid (taskqueue <link> queue, taskmaster <link> taskmaster)
{
Super (queue, taskmaster );
}
Public void setinitiallocations (collection <string> initiallocations ){
This. initiallocations = initiallocations;
}
Public void Init () throws invalidtaskexception {
If (initiallocations = NULL | initiallocations. isempty ()){
Throw new illegalstateexception ("webcrawlerdroid requires at least one starting file ");
}
For (string location: initiallocations ){
Uri URI;
Try {
Uri = new uri (location );
} Catch (urisyntaxexception ex ){
Throw new invalidtaskexception ("invalid lication:" + location );
}
Queue. Merge (New linktask (null, Uri, 0 ));
}
}
Public void start ()
{
Taskmaster. processalltasks (queue, this );
}
Public void finished ()
{
Log.info ("Finished !!! ");
}
Public abstract worker <link> getnewworker ();
Public protocolfactory getprotocolfactory (){
Return protocolfactory;
}
Public void setprotocolfactory (protocolfactory ){
This. protocolfactory = protocolfactory;
}
Public parserfactory getparserfactory (){
Return parserfactory;
}
Public void setparserfactory (parserfactory ){
This. parserfactory = parserfactory;
}
Public urlfiltersfactory getfiltersfactory (){
Return filtersfactory;
}
Public void setfiltersfactory (urlfiltersfactory filtersfactory ){
This. filtersfactory = filtersfactory;
}
}
CrawlingWorker definition:
Public class crawlingworker extends loggable implements worker <link>
{
Private Final crawlingdroid droid;
Handlerfactory;
Public crawlingworker (crawlingdroid droid)
{
This. Droid = droid;
}
Public void execute (link) throws droidsexception, ioexception
{
Final string useragent = This. getclass (). getcanonicalname ();
If (log. isdebugenabled ()){
Log. debug ("starting" + useragent );
}
Uri uri = link. geturi ();
Final Protocol protocol = droid. getprotocolfactory (). getprotocol (URI );
If (Protocol = NULL ){
If (log. iswarnenabled ()){
Log. Warn ("unsupported Protocol scheme" + URI. getscheme () + "'");
}
Return;
}
If (protocol. isallowed (URI )){
If (log. isinfoenabled ()){
Log.info ("loading" + URI );
}
Managedcontententity entity = protocol. Load (URI );
Try {
String contenttype = entity. getmimetype ();
If (log. isdebugenabled ()){
Log. debug ("content type" + contenttype );
}
If (contenttype = NULL ){
Log.info ("missing content type... Can't parse ...");
}
Else {
Parser = droid. getparserfactory (). getparser (contenttype );
If (parser = NULL ){
If (log. isdebugenabled ()){
Log. debug ("cocould not find Parser for" + contenttype );
}
}
Else {
Parse parse = parser. parse (entity, link );
If (PARSE. getoutlinks ()! = NULL ){
Collection <link> outlinks = getfilteredoutlinks (PARSE );
Droid. getqueue (). Merge (outlinks );
}
Entity. setparse (PARSE );
Handle (entity, link );
}
}
} Finally {
Entity. Finish ();
}
}
Else {
If (log. isinfoenabled ()){
Log.info ("Stopping processing since"
+ "Bots are not allowed for" + URI );
}
}
}
Protected void handle (contententity entity, link)
Throws droidsexception, ioexception
{
Gethandlerfactory (). Handle (link. geturi (), entity );
}
Protected collection <link> getfilteredoutlinks (PARSE)
{
Urlfiltersfactory filters = droid. getfiltersfactory ();
// Todo -- make the hashvalue for outlink...
Map <string, link> filtered = new linkedhashmap <string, link> ();
For (link outlink: parse. getoutlinks ()){
String id = outlink. GETID ();
If (filters. Accept (outlink. GETID ())&&! Filtered. containskey (ID )){
Filtered. Put (ID, outlink );
}
}
Return filtered. Values ();
}
Public handlerfactory gethandlerfactory (){
Return handlerfactory;
}
Public void sethandlerfactory (handlerfactory ){
This. handlerfactory = handlerfactory;
}
}
Reprinted by wussearch Network Technology