Finally, complete droids.

Source: Internet
Author: User
Tags: netbeans

Droids is an Apache incubator project, but it is already available for use.

1. SVN must be installed

2. Install Maven (see the project README for build instructions).

3. The original tutorial seems to be unavailable, but either NetBeans or Eclipse will work as the IDE.

4. Check out the source with SVN and run `mvn clean install`; then continue in NetBeans.

5. Open NetBeans and open the project. You will find that some modules have missing dependencies. Select the Libraries folder, right-click, and choose to download the missing dependencies. NetBeans then downloads them by itself, which is really useful and more convenient than Eclipse.

6. You can also configure the spelling and encoding options and enable code completion, which works well.

7. An entity represents the content of a web page. It is what the load method of the HttpProtocol class returns:

Public managedcontententity load (URI) throws ioexception {
Httpget = new httpget (URI );
Httpresponse response = httpclient.exe cute (httpget );
Statusline = response. getstatusline ();
If (statusline. getstatuscode ()> = httpstatus. SC _bad_request ){
Httpget. Abort ();
Throw new httpresponseexception (
Statusline. getstatuscode (), statusline. getreasonphrase ());
}
Httpentity entity = response. getentity ();
If (entity = NULL ){
// Shold _ almost _ never happen with http get requests.
Throw new clientprotocolexception ("Empty entity ");
}
Long maxlen = httpclient. getparams (). getlongparameter (droidshttpclient. max_body_length, 0 );
Return new httpcontententity (entity, maxlen );

}

8. The org.apache.droids.robot.crawler package contains two classes: CrawlingDroid and CrawlingWorker.

CrawlingDroid definition:

Public abstract class crawlingdroid extends actdroid <link>
{
Private collection <string> initiallocations;

Protocolfactory;
Parserfactory;
Urlfiltersfactory filtersfactory;
Public crawlingdroid (taskqueue <link> queue, taskmaster <link> taskmaster)
{
Super (queue, taskmaster );
}

Public void setinitiallocations (collection <string> initiallocations ){
This. initiallocations = initiallocations;
}
Public void Init () throws invalidtaskexception {
If (initiallocations = NULL | initiallocations. isempty ()){
Throw new illegalstateexception ("webcrawlerdroid requires at least one starting file ");
}
For (string location: initiallocations ){
Uri URI;
Try {
Uri = new uri (location );
} Catch (urisyntaxexception ex ){
Throw new invalidtaskexception ("invalid lication:" + location );
}
Queue. Merge (New linktask (null, Uri, 0 ));
}
}
Public void start ()
{
Taskmaster. processalltasks (queue, this );
}
Public void finished ()
{
Log.info ("Finished !!! ");
}

Public abstract worker <link> getnewworker ();

Public protocolfactory getprotocolfactory (){
Return protocolfactory;
}

Public void setprotocolfactory (protocolfactory ){
This. protocolfactory = protocolfactory;
}

Public parserfactory getparserfactory (){
Return parserfactory;
}

Public void setparserfactory (parserfactory ){
This. parserfactory = parserfactory;
}

Public urlfiltersfactory getfiltersfactory (){
Return filtersfactory;
}

Public void setfiltersfactory (urlfiltersfactory filtersfactory ){
This. filtersfactory = filtersfactory;
}
}

CrawlingWorker definition:

Public class crawlingworker extends loggable implements worker <link>
{
Private Final crawlingdroid droid;
Handlerfactory;
Public crawlingworker (crawlingdroid droid)
{
This. Droid = droid;
}

Public void execute (link) throws droidsexception, ioexception
{
Final string useragent = This. getclass (). getcanonicalname ();
If (log. isdebugenabled ()){
Log. debug ("starting" + useragent );
}
Uri uri = link. geturi ();
Final Protocol protocol = droid. getprotocolfactory (). getprotocol (URI );
If (Protocol = NULL ){
If (log. iswarnenabled ()){
Log. Warn ("unsupported Protocol scheme" + URI. getscheme () + "'");
}
Return;
}
If (protocol. isallowed (URI )){
If (log. isinfoenabled ()){
Log.info ("loading" + URI );
}
Managedcontententity entity = protocol. Load (URI );
Try {
String contenttype = entity. getmimetype ();
If (log. isdebugenabled ()){
Log. debug ("content type" + contenttype );
}
If (contenttype = NULL ){
Log.info ("missing content type... Can't parse ...");
}
Else {
Parser = droid. getparserfactory (). getparser (contenttype );
If (parser = NULL ){
If (log. isdebugenabled ()){
Log. debug ("cocould not find Parser for" + contenttype );
}
}
Else {
Parse parse = parser. parse (entity, link );
If (PARSE. getoutlinks ()! = NULL ){
Collection <link> outlinks = getfilteredoutlinks (PARSE );
Droid. getqueue (). Merge (outlinks );
}
Entity. setparse (PARSE );
Handle (entity, link );
}
}
} Finally {
Entity. Finish ();
}
}
Else {
If (log. isinfoenabled ()){
Log.info ("Stopping processing since"
+ "Bots are not allowed for" + URI );
}
}
}
Protected void handle (contententity entity, link)
Throws droidsexception, ioexception
{
Gethandlerfactory (). Handle (link. geturi (), entity );
}
Protected collection <link> getfilteredoutlinks (PARSE)
{
Urlfiltersfactory filters = droid. getfiltersfactory ();
// Todo -- make the hashvalue for outlink...
Map <string, link> filtered = new linkedhashmap <string, link> ();
For (link outlink: parse. getoutlinks ()){
String id = outlink. GETID ();
If (filters. Accept (outlink. GETID ())&&! Filtered. containskey (ID )){
Filtered. Put (ID, outlink );
}
}
Return filtered. Values ();
}
Public handlerfactory gethandlerfactory (){
Return handlerfactory;
}

Public void sethandlerfactory (handlerfactory ){
This. handlerfactory = handlerfactory;
}
}

Reprinted by wussearch Network Technology

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.