----------------------------------------------------------
Lucene tokenization: introduction to Chinese word segmentation
----------------------------------------------------------
Paoding: the Paoding tokenizer. It is no longer being updated.
MMSEG: uses Sogou's dictionary.
1. Import the package (there are two variants: 1. with the dictionary bundled, 2. without the dictionary).
If you use the variant without the dictionary, you have to specify the dictionary location yourself.
2. Create the MMSegAnalyzer (telling it where the dictionary is located).
----------------------------------------------------------
Lucene tokenization: implementing a custom synonym analyzer, design analysis (Analyzer ----> TokenStream ----> TokenFilter ----> Tokenizer). In code, the pieces are constructed in the reverse of this order.
----------------------------------------------------------
/* * Implement custom Chinese synonym word breaker (mmseg thesaurus) */public class Mysameanalyzer extends Analyzer {@Overridepublic Tokenstream tokenstream ( String fieldName, Reader Read) {Dictionary dic = dictionary.getinstance ("f:\\baiduyundownload\\cache\\lucune\\ Chinesedic "); return new Mysametokenfilter (new Mmsegtokenizer (DIC), read);}}
---------------------------------------------------------
Lucene tokenization: implementing a custom synonym analyzer, implementing the token filter
---------------------------------------------------------
Reader ----> MMSegTokenizer (tokenizes the text) ----> Mysametokenfilter (custom filter that adds synonyms) ----> get synonyms (synonyms are stored at the same position as the original word)
---> find synonyms ---> save the current state ---> move on to the next element ---> emit elements from the synonym list ---> restore the saved state ---> store the elements at the same position
/* * Custom Synonyms Word filter */public class Mysametokenfilter extends Tokenfilter {//store Word data private chartermattribute CTA = null;//storage The location information of the Lexicon unit private Positionincrementattribute pia = null;//Adds whether there is a synonym for the variable attribute, saving the current element's state information private attributesource.state current;//Stack storage private stack<string> sames = null;protected mysametokenfilter (Tokenstream input) {super (input); CTA = This.addattribute (chartermattribute.class);p ia = This.addattribute (positionincrementattribute.class); sames = New stack<string> ();} @Overridepublic Boolean Incrementtoken () throws IOException {//save synonym for Last word while (sames.size () > 0) {//out of stack and get synonyms Strin G str = Sames.pop ();//Restore the state of the previous vocabulary restorestate (current);//Save the Element Cta.setempty () on the previous vocabulary; cta.append (str);// Set synonym location to 0pia.setpositionincrement (0); return true;} Skip to Next Ctaif (!this.input.incrementtoken ())//No element returns Falsereturn False;if (Getsamewords (cta.tostring ())) {//If there are synonyms, Changes the current state information of the vocabulary, saves (captures the current state) of present state (capturestate);} return true;} /* * * get synonyms */private BooleAn getsamewords (String name) {map<string, string[]> maps = new hashmap<string, string[]> (); Maps.put ("I", new String[] {"I", "I"}), Maps.put ("Hunan", new string[] {"Land of Plenty", "Xiang"}); string[] SwS = maps.get (name), if (SWS! = null) {//Add in stack for (String Str:sws) {sames.push (str);} return true;} return false;}}
----------------------------------------------------
Lucene tokenization: implementing a custom synonym analyzer, implementing the token filter (better design)
----------------------------------------------------
Idea: programming to an interface is the right way to go
1. Create an interface to manage synonyms
/*
 * Interface for a store of synonyms.
 */
public interface Mysamecontxt {

    /**
     * Gets the synonyms of a word.
     *
     * @param name the word to look up
     * @return the synonyms of name as a String[]
     */
    public String[] getsamewords(String name);
}
2. Implement the interface and add the thesaurus
public class Mysimplesamecontxt implements Mysamecontxt {/* * implements synonym interface */map<string, string[]> maps = new hashmap< String, string[]> ();p ublic mysimplesamecontxt () {maps.put ("I", new string[] {"I", "we"}); Maps.put ("Hunan", new string[] { "Land of Plenty", "Xiang"}); Public string[] Getsamewords (String name) {return maps.get (name);}}
3. Add a synonym-source field to the custom token filter
It holds the object dedicated to managing synonyms
Private mysamecontxt samecontxt;
Full code
/* * Custom Synonyms Word filter */public class Mysametokenfilter extends Tokenfilter {//store Word data private chartermattribute CTA = null;//storage The location information of the Lexicon unit private Positionincrementattribute pia = null;//Adds whether there is a synonym for the variable attribute, saving the current element's state information private attributesource.state current;//Stack Store private stack<string> sames = null;//Get a library dedicated to managing synonyms private Mysamecontxt samecontxt;protected Mysametokenfilter (Tokenstream input, Mysamecontxt samecontxt) {super (input); CTA = This.addattribute ( Chartermattribute.class);p ia = This.addattribute (positionincrementattribute.class); sames = new Stack<string> ( ); this.samecontxt = Samecontxt;} @Overridepublic Boolean Incrementtoken () throws IOException {//save synonym for Last word while (sames.size () > 0) {//out of stack and get synonyms Strin G str = Sames.pop ();//Restore the state of the previous vocabulary restorestate (current);//Save the Element Cta.setempty () on the previous vocabulary; cta.append (str);// Set synonym location to 0pia.setpositionincrement (0); return true;} Skip to Next Ctaif (!this.input.incrementtoken ())//No element returns Falsereturn False;if (Getsamewords (cta.tostring ())) {//If there are synonyms, Change the current state information of a vocabulary, the current state is saved (capturing the current state), present = Capturestate ();} return true;} /* * Get synonyms */private Boolean getsamewords (String name) {//all strings that get synonyms via interface Samecontxt []string[] SwS = samecontxt.getsamewords (name), if (SWS! = null) {//Add in stack for (String Str:sws) {sames.push (str);} return true;} return false;}} 4. 
Implement tokenstream
/*
 * Custom Chinese synonym analyzer (mmseg dictionary).
 *
 * Builds the token stream as: Reader -> MMSegTokenizer -> Mysametokenfilter,
 * handing the filter the synonym source given to the constructor.
 */
public class Mysameanalyzer extends Analyzer {

    // Synonym source passed on to the token filter.
    private Mysamecontxt samecontxt;

    /**
     * @param msc the synonym source the filter should query
     */
    public Mysameanalyzer(Mysamecontxt msc) {
        this.samecontxt = msc;
    }

    /**
     * Creates the token stream for a field.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param read      reader supplying the text to tokenize
     * @return the mmseg tokenizer wrapped in the synonym filter
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader read) {
        // Location of the mmseg dictionary directory on disk.
        Dictionary dic = Dictionary.getInstance("F:\\baiduyundownload\\cache\\lucune\\chinesedic");
        // The custom synonym source is passed in last, to the filter.
        return new Mysametokenfilter(new MMSegTokenizer(new MaxWordSeg(dic), read), samecontxt);
    }
}
5. Write a test that builds an index and searches it
public void test05 () {try {//The thesaurus as the word Breaker Analyzer property gets Tokenstreamanalyzer a1 = new Mysameanalyzer (New Mysimplesamecontxt ()); String txt = "I come from Hunan Shaoyang";//CREATE index directory dir = new ramdirectory (); IndexWriter writer = new IndexWriter (dir, new IndexWriter Config (version.lucene_35, A1));D ocument doc = new Document ();d Oc.add (New Field ("content", TXT, Field.Store.YES, Field.Index.ANALYZED)); Writer.adddocument (doc); Writer.close ();//Create search Indexreader reader = Indexreader.open (dir); Ndexsearcher search = new Indexsearcher (reader); Topdocs TDS = Search.search (New Termquery ("content", "Land of Plenty")), for (Scoredoc sdc:tds.scoreDocs) {Document do CC = Search.doc (Sdc.doc); System.out.println (Docc.get ("content"));} New Analyzerutils (). Displaytoken (TXT, a1);} catch (Corruptindexexception e) {e.printstacktrace ();} catch (Lockobtainfailedexception e) {e.printstacktrace ();} catch (IOException e) {e.printstacktrace ();}}
Implementing a custom Chinese synonym analyzer with Lucene