Lucene synonym search is built on the per-token information recorded by PositionIncrementAttribute and CharTermAttribute: each synonym is emitted as an extra token with a position increment of 0, which stacks it at the same position as the original term. The Lucene version used here is 4.8.0. First, implement the synonym filter:
package lucene_index;

import java.io.IOException;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

public class MySameFiter extends TokenFilter {

    private Stack<String> stack = null;                 // synonyms waiting to be emitted
    private CharTermAttribute cta = null;               // term text of the current token
    private PositionIncrementAttribute position = null; // position increment of the current token
    private AttributeSource.State current;              // captured state of the current token
    private Map<String, String[]> map;                  // synonym table: term -> synonyms

    protected MySameFiter(TokenStream input, Map<String, String[]> map) {
        super(input);
        stack = new Stack<>();
        cta = input.addAttribute(CharTermAttribute.class);
        position = input.addAttribute(PositionIncrementAttribute.class);
        this.map = map;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Emit pending synonyms first, one per call, each with a position
        // increment of 0 so it stacks on the original token's position.
        if (!stack.isEmpty()) {
            String word = stack.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(word);
            position.setPositionIncrement(0);
            return true;
        }
        // Advance to the next token of the underlying stream.
        if (!input.incrementToken()) {
            return false;
        }
        // If this token has synonyms, capture its state so each synonym can
        // be restored onto it in the following calls.
        if (getSameWrds(cta.toString())) {
            current = captureState();
        }
        return true;
    }

    // Push all synonyms of the given term onto the stack; returns true if any exist.
    private boolean getSameWrds(String words) {
        String[] arr = map.get(words);
        if (arr != null) {
            for (String word : arr) {
                stack.push(word);
            }
            return true;
        }
        return false;
    }
}

Custom analyzer:
package lucene_index;

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKTokenizer;

public class StopWrodsAnalyse extends Analyzer {

    private Map<String, String[]> map = new HashMap<String, String[]>(); // synonym table
    // private CharArraySet set = null;

    public StopWrodsAnalyse(Map<String, String[]> map) {
        // Optional stop-word setup, kept commented out:
        // for (Map.Entry<String, String[]> entry : map.entrySet()) {
        //     set = StopFilter.makeStopSet(Version.LUCENE_48, entry.getValue(), true);
        // }
        // set.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        this.map = map;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new IKTokenizer(reader, false);  // IK tokenizer supplies the base tokens
        TokenStream stream = new MySameFiter(source, map);  // chain in the custom synonym filter
        // stream = new StopFilter(Version.LUCENE_48, stream, set);
        return new TokenStreamComponents(source, stream);
    }
}
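To confirm that synonyms really are stacked, a small driver that prints each token together with its position increment helps. This is a minimal sketch, not part of the original code: the class name TokenDebug and the synonym entry are made up for illustration, and whether the lookup fires for a multi-character term depends on the IK dictionary emitting it as a single token.

package lucene_index;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class TokenDebug {
    public static void main(String[] args) throws IOException {
        // Hypothetical synonym entry for testing.
        Map<String, String[]> map = new HashMap<String, String[]>();
        map.put("建設銀行", new String[] { "建行", "中國建設銀行" });
        Analyzer analyzer = new StopWrodsAnalyse(map);
        TokenStream ts = analyzer.tokenStream("name", new StringReader("建設銀行"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // A synonym prints with posIncr=0: it occupies the same
            // position as the token emitted just before it.
            System.out.println(term.toString() + " (posIncr=" + posIncr.getPositionIncrement() + ")");
        }
        ts.end();
        ts.close();
    }
}

Tokens printed with posIncr=0 share a position with the preceding token, which is exactly what makes Lucene treat them as interchangeable at that position.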
package lucene_index;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class MainTest {

    public static void main(String[] args) throws IOException, ParseException {
        // Load the synonym table: the first column of each CSV line is the
        // head word and the whole line is its synonym list.
        LineIterator it = FileUtils.lineIterator(
                new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"), "gbk");
        Map<String, String[]> map = new HashMap<String, String[]>();
        while (it.hasNext()) {
            String word = it.nextLine();
            String[] wordArr = word.replace("-,", "").trim().split("\\,");
            if (map.containsKey(wordArr[0])) {
                continue;
            }
            map.put(wordArr[0], wordArr);
        }

        // Index one document per head word with the synonym-aware analyzer.
        Analyzer analyzer = new StopWrodsAnalyse(map);
        Directory directory = FSDirectory.open(new File("E:\\luceneindex"));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        IndexWriter writer = new IndexWriter(directory, config);
        Collection<Document> coll = new ArrayList<Document>();
        for (Map.Entry<String, String[]> entry : map.entrySet()) {
            Document docss = new Document();
            docss.add(new Field("name", entry.getKey(), Store.YES, Index.ANALYZED));
            coll.add(docss);
        }
        writer.addDocuments(coll);
        writer.commit();
        writer.close();

        // Lucene 4.x opens index readers through DirectoryReader.
        IndexSearcher searcher = new IndexSearcher(
                DirectoryReader.open(FSDirectory.open(new File("E:\\luceneindex"))));
        // QueryParser parser = new QueryParser(Version.LUCENE_48, "name", analyzer);
        search(searcher);
        // WordInfo.getWordInfo(word, analyzer);
    }

    public static void search(IndexSearcher searcher) throws IOException {
        Query q = new TermQuery(new Term("name", "中國建設銀行"));
        System.out.println(q);
        TopDocs doc = searcher.search(q, 10);
        ScoreDoc[] docs = doc.scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            Document d = searcher.doc(docs[i].doc);
            System.out.println(d.get("name"));
        }
    }
}
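The loader assumes each line of ExpansionWord.csv begins with a head word followed by its synonyms, comma-separated and GBK-encoded. The real file isn't shown; hypothetical contents would look like:

中國建設銀行,建行,建設銀行
中國工商銀行,工行,工商銀行

Because map.put(wordArr[0], wordArr) stores the whole line, the head word is also pushed onto the stack as its own synonym; the duplicate token lands at the same position and is harmless.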
3. Testing

Searching for 建行, 建設銀行, or 中國建設銀行 now returns the same documents, because the three terms are indexed as synonyms of one another.
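One caveat: TermQuery bypasses analysis, so the search above relies entirely on the synonyms stacked into the index at indexing time. To expand synonyms at query time as well, the query string can be run through the same analyzer via the classic QueryParser, which MainTest already hints at with its commented-out parser line. A minimal sketch under that assumption; ParserSearch and searchWithParser are hypothetical names:

package lucene_index;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;

public class ParserSearch {

    // Unlike TermQuery, QueryParser runs the analyzer on the query string,
    // so the synonym filter expands the query terms as well.
    public static void searchWithParser(IndexSearcher searcher, Analyzer analyzer)
            throws IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_48, "name", analyzer);
        Query q = parser.parse("建設銀行");
        TopDocs docs = searcher.search(q, 10);
        for (ScoreDoc sd : docs.scoreDocs) {
            System.out.println(searcher.doc(sd.doc).get("name"));
        }
    }
}

Because the stacked query tokens carry a position increment of 0, the parser treats them as alternatives, so a query for one synonym also matches documents indexed under another.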
Custom synonym implementation in Lucene