由於工作需要,需要使用PHP實現對網站內大量資料進行全文檢索索引,而且目前最流行的全文檢索索引的搜尋引擎庫就是Lucene了,它是Apache Jakarta的一個子項目,並且提供了簡單實用的API,用這些API,就可以對任何基於文本的資料(包括資料庫)進行全文檢索索引。
因為PHP本身就支援調用外部Java類,所以先用Java寫了一個類,這個類通過調用Lucene的API,實現了兩個方法:
* public String createIndex(String indexDir_path,String dataDir_path)
* public String searchword(String ss,String index_path)
其中createIndex是建立索引方法,傳入了兩個參數,分別是indexDir_path(索引檔案的目錄)和dataDir_path(被索引的檔案目錄),返回被索引的檔案清單字串;另一個是searchword,通過傳入的關鍵字參數(ss)對索引進行檢索,index_path就是索引檔案的目錄,返回所有檢索到的檔案。
這裡是原始碼,很簡單,大家可以參考一下:TxtFileIndexer.java
package TestLucene;
import java.io.File;import java.io.FileReader;
import java.io.Reader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
public class TxtFileIndexer ...{
public String test() ...{
return "test is ok hohoho";
}
/**//**
* @param args
*/
public String createIndex(String indexDir_path,String dataDir_path) throws Exception ...{
String result = "";
File indexDir = new File(indexDir_path);
File dataDir = new File (dataDir_path);
Analyzer luceneAnalyzer = new StandardAnalyzer();
File[] dataFiles = dataDir.listFiles();
IndexWriter indexWriter = new IndexWriter(indexDir,luceneAnalyzer,true);
long startTime = new Date().getTime ();
for(int i=0; i < dataFiles.length; i++) ...{
if(dataFiles [i].isFile() && dataFiles[i].getName().endsWith(".html")) ...{
result += "Indexing file" + dataFiles[i].getCanonicalPath()+"<br />";
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[i]);
document.add(Field.Text("path",dataFiles [i].getCanonicalPath()));
document.add(Field.Text ("contents",txtReader));
indexWriter.addDocument(document);
}
}
indexWriter.optimize();
indexWriter.close();
long endTime = new Date().getTime();
result += "It takes"+(endTime- startTime)
+ " milliseconds to create index for the files in directory "
+ dataDir.getPath();
return result;
}
public String searchword(String ss,String index_path) throws Exception ...{
String queryStr = ss;
String result = "Result:<br />";
//This is the directory that hosts the Lucene index
File indexDir = new File (index_path);
FSDirectory directory = FSDirectory.getDirectory (indexDir,false);
IndexSearcher searcher = new IndexSearcher(directory);
if(!indexDir.exists())...{
result = "The Lucene index is not exist";
return result;
}
Term term = new Term ("contents",queryStr.toLowerCase());
TermQuery luceneQuery = new TermQuery (term);
Hits hits = searcher.search(luceneQuery);
for(int i = 0; i < hits.length(); i++)...{
Document document = hits.doc(i);
result += "<br /><a href='getfile.php?w="+ss+"&f="+document.get("path") +"'>File: " + document.get("path")+"</a>n";
}
return result;
}
}