droids是apache正在孵化的項目,不過現在已經可用。
1、必須安裝SVN
2、必須安裝 Maven,並按照 README 的說明進行安裝。
3、原來的教程似乎不可用,NetBeans 和 Eclipse 結合起來使用尚可。
4、用 SVN 下載原始碼,執行 mvn clean install,然後用 NetBeans 開啟。
5、開啟 NetBeans 並開啟項目,會發現有些項目缺少依賴關係;選擇「庫」檔案夾,右擊並選擇「下載缺少的依賴關係」,NetBeans 就會自行下載。NetBeans 真的蠻好用的,這點比 Eclipse 要好。
6 工具 -》選項 拼字 編碼設定,產生代碼提示。不錯。
7、entity(實體)指網頁的內容,由 HttpProtocol 類中的 load 方法返回:
/**
 * Fetches the given URI with an HTTP GET and wraps the response body.
 *
 * @param uri the location to request
 * @return the response body wrapped in an {@code HttpContentEntity}, together with the
 *         {@code DroidsHttpClient.MAX_BODY_LENGTH} client parameter (0 when the parameter is unset)
 * @throws HttpResponseException if the response status code is
 *         {@code HttpStatus.SC_BAD_REQUEST} or higher; the request is aborted first
 * @throws ClientProtocolException if the response carries no entity
 * @throws IOException if executing the request fails
 */
public ManagedContentEntity load(URI uri) throws IOException {
  final HttpGet request = new HttpGet(uri);
  final HttpResponse response = httpclient.execute(request);

  final StatusLine status = response.getStatusLine();
  final int code = status.getStatusCode();
  if (code >= HttpStatus.SC_BAD_REQUEST) {
    // Abort the request before reporting the error status to the caller.
    request.abort();
    throw new HttpResponseException(code, status.getReasonPhrase());
  }

  final HttpEntity body = response.getEntity();
  if (body == null) {
    // Should _almost_ never happen with HTTP GET requests.
    throw new ClientProtocolException("Empty entity");
  }

  final long bodyLimit =
      httpclient.getParams().getLongParameter(DroidsHttpClient.MAX_BODY_LENGTH, 0);
  return new HttpContentEntity(body, bodyLimit);
}
8、org.apache.droids.robot.crawler 包中有兩個類:CrawlingDroid 和 CrawlingWorker。
CrawlingDroid 定義:
/**
 * Base class for droids that crawl links starting from a configured set of locations.
 *
 * <p>{@link #init()} turns every initial location into a {@code LinkTask} and merges it into
 * the task queue; {@link #start()} then hands the queue to the task master. Subclasses supply
 * the worker via {@link #getNewWorker()}.
 */
public abstract class CrawlingDroid extends AbstractDroid<Link>
{
  /** URI strings used to seed the queue in {@link #init()}. */
  private Collection<String> initialLocations;

  ProtocolFactory protocolFactory;
  ParserFactory parserFactory;
  URLFiltersFactory filtersFactory;

  /**
   * @param queue the task queue passed on to the superclass
   * @param taskMaster the task master passed on to the superclass
   */
  public CrawlingDroid( TaskQueue<Link> queue, TaskMaster<Link> taskMaster )
  {
    super( queue, taskMaster );
  }

  /**
   * @param initialLocations URI strings to start crawling from; must be non-empty by the time
   *        {@link #init()} is called
   */
  public void setInitialLocations(Collection<String> initialLocations) {
    this.initialLocations = initialLocations;
  }

  /**
   * Seeds the queue with one {@code LinkTask} (no parent, depth 0) per initial location.
   *
   * @throws IllegalStateException if no initial locations have been set
   * @throws InvalidTaskException if a location is not a syntactically valid URI; the
   *         underlying {@link URISyntaxException} is attached as the cause
   */
  public void init() throws InvalidTaskException {
    if( initialLocations == null || initialLocations.isEmpty() ) {
      throw new IllegalStateException( "WebCrawlerDroid requires at least one starting file" );
    }
    for( String location : initialLocations ) {
      URI uri;
      try {
        uri = new URI(location);
      } catch (URISyntaxException ex) {
        // Keep the parse error as the cause so the offending position/reason is not lost.
        InvalidTaskException invalid = new InvalidTaskException("Invalid location: " + location);
        invalid.initCause(ex);
        throw invalid;
      }
      queue.merge( new LinkTask( null, uri, 0 ) );
    }
  }

  /** Asks the task master to process every task in the queue on behalf of this droid. */
  public void start()
  {
    taskMaster.processAllTasks(queue, this);
  }

  /** Logs that the crawl has finished. */
  public void finished()
  {
    log.info( "FINISHED!!!" );
  }

  /** @return a worker that processes {@code Link} tasks for this droid */
  public abstract Worker<Link> getNewWorker();

  /** @return the factory used to look up a protocol for a URI */
  public ProtocolFactory getProtocolFactory() {
    return protocolFactory;
  }

  /** @param protocolFactory the factory used to look up a protocol for a URI */
  public void setProtocolFactory(ProtocolFactory protocolFactory) {
    this.protocolFactory = protocolFactory;
  }

  /** @return the factory used to look up a parser for a content type */
  public ParserFactory getParserFactory() {
    return parserFactory;
  }

  /** @param parserFactory the factory used to look up a parser for a content type */
  public void setParserFactory(ParserFactory parserFactory) {
    this.parserFactory = parserFactory;
  }

  /** @return the URL filters applied to outgoing links */
  public URLFiltersFactory getFiltersFactory() {
    return filtersFactory;
  }

  /** @param filtersFactory the URL filters applied to outgoing links */
  public void setFiltersFactory(URLFiltersFactory filtersFactory) {
    this.filtersFactory = filtersFactory;
  }
}
CrawlingWorker 定義:
/**
 * Worker that loads a single link, parses its content, queues the filtered outlinks on the
 * owning droid and passes the loaded entity to the handler factory.
 */
public class CrawlingWorker extends Loggable implements Worker<Link>
{
  /** Droid that supplies the protocol, parser and filter factories and the task queue. */
  private final CrawlingDroid droid;

  HandlerFactory handlerFactory;

  /** @param droid the droid this worker operates for */
  public CrawlingWorker( CrawlingDroid droid )
  {
    this.droid = droid;
  }

  /**
   * Processes one link: resolves a protocol for its URI, checks that loading is allowed,
   * loads and parses the content, merges the filtered outlinks into the droid's queue and
   * hands the entity to {@link #handle(ContentEntity, Link)}.
   *
   * <p>Returns without loading when no protocol is available for the URI or when the protocol
   * does not allow it. Once an entity has been loaded, {@code finish()} is always called on it.
   *
   * @param link the link to process
   * @throws DroidsException propagated from parsing or handling
   * @throws IOException propagated from loading, parsing or handling
   */
  public void execute(Link link) throws DroidsException, IOException
  {
    final String workerName = this.getClass().getCanonicalName();
    if (log.isDebugEnabled()) {
      log.debug("Starting " + workerName);
    }

    final URI target = link.getURI();
    final Protocol protocol = droid.getProtocolFactory().getProtocol(target);
    if (protocol == null) {
      if (log.isWarnEnabled()) {
        log.warn("Unsupported protocol scheme '" + target.getScheme() + "'");
      }
      return;
    }

    if (!protocol.isAllowed(target)) {
      if (log.isInfoEnabled()) {
        log.info("Stopping processing since"
            + " bots are not allowed for " + target );
      }
      return;
    }

    if (log.isInfoEnabled()) {
      log.info("Loading " + target);
    }
    final ManagedContentEntity loaded = protocol.load(target);
    try {
      final String mimeType = loaded.getMimeType();
      if (log.isDebugEnabled()) {
        log.debug("Content type " + mimeType);
      }
      if (mimeType == null) {
        log.info("Missing content type... can't parse...");
        return;
      }

      final Parser parser = droid.getParserFactory().getParser(mimeType);
      if (parser == null) {
        if (log.isDebugEnabled()) {
          log.debug("Could not find parser for " + mimeType);
        }
        return;
      }

      final Parse parsed = parser.parse(loaded, link);
      if (parsed.getOutlinks() != null) {
        final Collection<Link> accepted = getFilteredOutlinks(parsed);
        droid.getQueue().merge(accepted);
      }
      loaded.setParse(parsed);
      handle(loaded, link);
    } finally {
      // Runs on every exit path above, including the early returns.
      loaded.finish();
    }
  }

  /**
   * Passes the entity and the link's URI to the configured handler factory.
   *
   * @param entity the loaded (and parsed) content
   * @param link the link the content was loaded from
   * @throws DroidsException propagated from the handler factory
   * @throws IOException propagated from the handler factory
   */
  protected void handle(ContentEntity entity, Link link)
    throws DroidsException, IOException
  {
    final HandlerFactory handlers = getHandlerFactory();
    handlers.handle(link.getURI(), entity);
  }

  /**
   * Collects the outlinks of a parse that the droid's URL filters accept, keeping only the
   * first link seen for each id and preserving encounter order.
   *
   * @param parse the parse whose outlinks are filtered; its outlinks must not be null
   * @return the accepted, de-duplicated outlinks
   */
  protected Collection<Link> getFilteredOutlinks( Parse parse )
  {
    final URLFiltersFactory filters = droid.getFiltersFactory();

    // TODO -- make the hashvalue for Outlink...
    final Map<String,Link> uniqueById = new LinkedHashMap<String,Link>();
    for (Link candidate : parse.getOutlinks()) {
      final String id = candidate.getId();
      if (!filters.accept(id)) {
        continue;
      }
      if (uniqueById.containsKey(id)) {
        continue;
      }
      uniqueById.put(id, candidate);
    }
    return uniqueById.values();
  }

  /** @return the handler factory used by {@link #handle(ContentEntity, Link)} */
  public HandlerFactory getHandlerFactory() {
    return handlerFactory;
  }

  /** @param handlerFactory the handler factory used by {@link #handle(ContentEntity, Link)} */
  public void setHandlerFactory(HandlerFactory handlerFactory) {
    this.handlerFactory = handlerFactory;
  }
}
轉載 吾搜網路科技