lucene自訂同義字實現

來源:互聯網
上載者:User

標籤:rar   rem   wro   string   empty   tac   apach   stop   ssi   

 

 lucene同義字搜尋的原理,其實是根據 PositionIncrementAttribute 和 CharTermAttribute 記錄的詞元位置資訊來實現的。當前使用的 lucene 版本為 4.8.0。首先要實現自訂的同義字過濾器(TokenFilter):
package lucene_index;

import java.io.IOException;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * TokenFilter that injects the synonyms of each token into the stream at the
 * same position (position increment 0), so that searching for any synonym of a
 * word will match documents containing that word.
 */
public class MySameFiter extends TokenFilter {

    /** Synonyms still pending emission for the most recent source token. */
    private final Stack<String> stack = new Stack<String>();
    /** Term text of the current token. */
    private final CharTermAttribute cta;
    /** Position increment of the current token. */
    private final PositionIncrementAttribute position;
    /** Captured state of the original token, restored for each synonym. */
    private AttributeSource.State current;
    /** Synonym table: word -> synonyms (the array may include the word itself). */
    private final Map<String, String[]> map;

    /**
     * @param input upstream token stream
     * @param map   synonym table; keys are the words to expand
     */
    protected MySameFiter(TokenStream input, Map<String, String[]> map) {
        super(input);
        // A TokenFilter shares its AttributeSource with the wrapped stream,
        // so registering on `this` accesses the same attribute instances.
        this.cta = addAttribute(CharTermAttribute.class);
        this.position = addAttribute(PositionIncrementAttribute.class);
        this.map = map;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // First drain any pending synonyms: each is emitted as a copy of the
        // original token's state with only the term text replaced.
        // (The original used `while (...) { ... return true; }`, which is an `if`.)
        if (!stack.isEmpty()) {
            String word = stack.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(word);
            // Position increment 0 places the synonym at the same position
            // as the original token, which is how Lucene models synonymy.
            position.setPositionIncrement(0);
            return true;
        }
        // No pending synonyms: advance the underlying stream.
        if (!input.incrementToken()) {
            return false; // stream exhausted
        }
        // If this token has synonyms, capture its state now so each synonym
        // can be emitted with identical offsets on subsequent calls.
        if (queueSynonyms(cta.toString())) {
            current = captureState();
        }
        return true;
    }

    /**
     * Queues the synonyms of {@code words} for emission. The word itself is
     * skipped: the source table stores each word inside its own synonym array,
     * and the original filter re-emitted it as a duplicate token at the same
     * position, inflating term frequency.
     *
     * @param words current term text
     * @return true if at least one synonym was queued
     */
    private boolean queueSynonyms(String words) {
        String[] arr = map.get(words);
        if (arr == null) {
            return false;
        }
        boolean queued = false;
        for (String word : arr) {
            if (!word.equals(words)) { // the original token is already emitted
                stack.push(word);
                queued = true;
            }
        }
        return queued;
    }
}
package lucene_index;

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer that tokenizes text with the IK Chinese tokenizer and expands each
 * token with its synonyms via {@link MySameFiter}.
 *
 * <p>(The commented-out StopFilter/CharArraySet experiments and their unused
 * imports from the original have been removed; they were dead code.)
 */
public class StopWrodsAnalyse extends Analyzer {

    /** Synonym table: word -> synonyms. */
    private final Map<String, String[]> map;

    /**
     * @param map synonym table; keys are the words to expand. A null map is
     *            replaced by an empty table so analysis never NPEs.
     */
    public StopWrodsAnalyse(Map<String, String[]> map) {
        this.map = map != null ? map : new HashMap<String, String[]>();
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // IK tokenizer in fine-grained mode (useSmart = false), wrapped by the
        // custom synonym filter.
        Tokenizer source = new IKTokenizer(reader, false);
        TokenStream stream = new MySameFiter(source, map);
        return new TokenStreamComponents(source, stream);
    }
}
package lucene_index;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Demo driver: loads a synonym table from a GBK-encoded CSV, indexes each
 * head word with the synonym-expanding analyzer, then runs a term search.
 */
public class MainTest {

    public static void main(String[] args) throws IOException, ParseException {
        Map<String, String[]> map = loadSynonyms(
                new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"));
        Analyzer analyzer = new StopWrodsAnalyse(map);
        File indexDir = new File("E:\\luceneindex");
        buildIndex(indexDir, analyzer, map);

        // The original never closed the reader; release the index files here.
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
        try {
            search(new IndexSearcher(reader));
        } finally {
            reader.close();
        }
    }

    /**
     * Reads the synonym CSV: each line is "word,syn1,syn2,...". The first
     * occurrence of a head word wins; later duplicates are skipped.
     *
     * @param csv GBK-encoded synonym file
     */
    private static Map<String, String[]> loadSynonyms(File csv) throws IOException {
        Map<String, String[]> map = new HashMap<String, String[]>();
        LineIterator it = FileUtils.lineIterator(csv, "gbk");
        try {
            while (it.hasNext()) {
                String line = it.nextLine();
                // Strip the "-," marker, then split on commas (no escape needed).
                String[] wordArr = line.replace("-,", "").trim().split(",");
                if (wordArr.length == 0 || map.containsKey(wordArr[0])) {
                    continue;
                }
                map.put(wordArr[0], wordArr);
            }
        } finally {
            LineIterator.closeQuietly(it); // the original leaked the iterator
        }
        return map;
    }

    /** Indexes every synonym head word as an analyzed, stored "name" field. */
    private static void buildIndex(File indexDir, Analyzer analyzer, Map<String, String[]> map)
            throws IOException {
        Directory directory = FSDirectory.open(indexDir);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        IndexWriter writer = new IndexWriter(directory, config);
        try {
            Collection<Document> docs = new ArrayList<Document>();
            for (Map.Entry<String, String[]> entry : map.entrySet()) {
                Document doc = new Document();
                doc.add(new Field("name", entry.getKey(), Store.YES, Index.ANALYZED));
                docs.add(doc);
            }
            writer.addDocuments(docs);
            writer.commit();
        } finally {
            writer.close(); // the original closed only on the happy path
        }
    }

    /**
     * Runs a TermQuery for the literal term "中國建設銀行" against the "name"
     * field and prints the top 10 stored names.
     */
    public static void search(IndexSearcher searcher) throws IOException {
        Query q = new TermQuery(new Term("name", "中國建設銀行"));
        System.out.println(q);
        TopDocs doc = searcher.search(q, 10);
        ScoreDoc[] docs = doc.scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            Document d = searcher.doc(docs[i].doc);
            System.out.println(d.get("name"));
        }
    }
}
3. 測試:「建行」、「建設銀行」、「中國建設銀行」互為同義字。搜尋其中任何一個詞(例如「建行」或「建設銀行」)時,都能命中包含其他同義字的文件。
 

lucene自訂同義字實現

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.