終於搞定droids

來源:互聯網
上載者:User

droids是apache正在孵化的項目,不過現在已經可用。

1、必須安裝SVN

2  必須安裝 Maven,按照 readme 的說明安裝。

3  原來的教程似乎不可用 ,NETBEANS和eclipse的結合起來尚可。

4 svn 下載 ,mvn clean install ;  netbeans 開啟。

5 開啟 NetBeans,開啟項目,發現有些項目沒有依賴關係;選擇庫檔案夾,右擊選擇「下載缺少的依賴關係」。NetBeans 真的蠻好用的,會自己去下載,這點比 Eclipse 要好。

6 工具 -》選項  拼字 編碼設定,產生代碼提示。不錯。

7 entity 實體,指網頁的內容。HttpProtocol 類中的 load 方法返回的就是這個實體。

public ManagedContentEntity load(URI uri) throws IOException {
   HttpGet httpget = new HttpGet(uri);
   HttpResponse response = httpclient.execute(httpget);
   StatusLine statusline = response.getStatusLine();
   if (statusline.getStatusCode() >= HttpStatus.SC_BAD_REQUEST) {
     httpget.abort();
     throw new HttpResponseException(
         statusline.getStatusCode(), statusline.getReasonPhrase());
   }
   HttpEntity entity = response.getEntity();
   if (entity == null) {
     // Should _almost_ never happen with HTTP GET requests.
     throw new ClientProtocolException("Empty entity");
   }
   long maxlen = httpclient.getParams().getLongParameter(DroidsHttpClient.MAX_BODY_LENGTH, 0);
   return new HttpContentEntity(entity, maxlen);

8 org.apache.droids.robot.crawler 包,有兩個類:CrawlingDroid 和 CrawlingWorker

CrawlingDroid 定義:

public abstract class CrawlingDroid extends AbstractDroid<Link>
{
  private Collection<String> initialLocations;

  ProtocolFactory protocolFactory;
  ParserFactory parserFactory;
  URLFiltersFactory filtersFactory;
  public CrawlingDroid( TaskQueue<Link> queue, TaskMaster<Link> taskMaster )
  {
    super( queue, taskMaster );
  }

  public void setInitialLocations(Collection<String> initialLocations) {
    this.initialLocations = initialLocations;
  }
  public void init() throws InvalidTaskException {
    if( initialLocations == null || initialLocations.isEmpty() ) {
      throw new IllegalStateException( "WebCrawlerDroid requires at least one starting file" );
    }
    for( String location : initialLocations ) {
      URI uri;
      try {
        uri = new URI(location);
      } catch (URISyntaxException ex) {
        throw new InvalidTaskException("Invalid lication: " + location);
      }
      queue.merge( new LinkTask( null, uri, 0 ) );
    }
  }
  public void start()
  {
    taskMaster.processAllTasks(queue, this);
  }
  public void finished()
  {
    log.info( "FINISHED!!!" );
  }

  public abstract Worker<Link> getNewWorker();

  public ProtocolFactory getProtocolFactory() {
    return protocolFactory;
  }

  public void setProtocolFactory(ProtocolFactory protocolFactory) {
    this.protocolFactory = protocolFactory;
  }

  public ParserFactory getParserFactory() {
    return parserFactory;
  }

  public void setParserFactory(ParserFactory parserFactory) {
    this.parserFactory = parserFactory;
  }

  public URLFiltersFactory getFiltersFactory() {
    return filtersFactory;
  }

  public void setFiltersFactory(URLFiltersFactory filtersFactory) {
    this.filtersFactory = filtersFactory;
  }
}

CrawlingWorker 定義:

public class CrawlingWorker extends Loggable implements Worker<Link>
{
  private final CrawlingDroid droid;
  HandlerFactory handlerFactory;
  public CrawlingWorker( CrawlingDroid droid )
  {
    this.droid = droid;
  }

  public void execute(Link link) throws DroidsException, IOException
  {
    final String userAgent = this.getClass().getCanonicalName();
    if (log.isDebugEnabled()) {
      log.debug("Starting " + userAgent);
    }
    URI uri = link.getURI();
    final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
    if (protocol == null) {
      if (log.isWarnEnabled()) {
        log.warn("Unsupported protocol scheme '" + uri.getScheme() + "'");
      }
      return;
    }
    if (protocol.isAllowed(uri)) {
      if (log.isInfoEnabled()) {
        log.info("Loading " + uri);
      }
      ManagedContentEntity entity = protocol.load(uri);
      try {
        String contentType = entity.getMimeType();
        if (log.isDebugEnabled()) {
          log.debug("Content type " + contentType);
        }
        if (contentType == null){
          log.info("Missing content type... can't parse...");
        }
        else {
          Parser parser = droid.getParserFactory().getParser(contentType);
          if( parser == null ) {
            if (log.isDebugEnabled()) {
              log.debug("Could not find parser for " + contentType);
            }
          }
          else {
            Parse parse = parser.parse(entity, link);
            if( parse.getOutlinks() != null ) {
              Collection<Link> outlinks = getFilteredOutlinks( parse );
              droid.getQueue().merge( outlinks );
            }
            entity.setParse(parse);
            handle(entity, link);
          }
        }
      } finally {
        entity.finish();
      }
    }
    else {
      if (log.isInfoEnabled()) {
        log.info("Stopping processing since"
            + " bots are not allowed for " + uri );
      }
    }
  }
  protected void handle(ContentEntity entity, Link link)
      throws DroidsException, IOException
  {
    getHandlerFactory().handle(link.getURI(), entity);
  }
  protected Collection<Link> getFilteredOutlinks( Parse parse )
  {
    URLFiltersFactory filters = droid.getFiltersFactory();
    // TODO -- make the hashvalue for Outlink...
    Map<String,Link> filtered = new LinkedHashMap<String,Link>();
    for( Link outlink : parse.getOutlinks() ) {
      String id = outlink.getId();
      if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
        filtered.put(id,outlink);
      }
    }
    return filtered.values();
  }
  public HandlerFactory getHandlerFactory() {
    return handlerFactory;
  }

  public void setHandlerFactory(HandlerFactory handlerFactory) {
    this.handlerFactory = handlerFactory;
  }
}

轉載 吾搜網路科技

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.