1
、
2. Structure of a token (lexical unit)
3. Design approach for synonym support
4. Comparing and testing the analyzers
package org.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MySameAnalyzer;
import org.lucene.util.MyStopAnalyzer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Side-by-side experiments with the stock Lucene 3.5 analyzers, the MMSeg
 * dictionary-based analyzer and the custom stop-word / synonym analyzers.
 *
 * <p>Every test prints the produced tokens to standard output; none of them
 * asserts anything.
 *
 * <p>NOTE(review): the "recorded output" comments below come from the original
 * write-up. Several of them do not match the string literals used here (for
 * example, WhitespaceAnalyzer does not lower-case, and the "Chinese" tests
 * feed English text), so the literals were probably altered after the output
 * was recorded - confirm against the original sample text.
 */
public class TestAnalyzer {

    /**
     * Compares the four stock analyzers on an English sentence.
     */
    @Test
    public void test01() {
        // Standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // Stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // Simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // Whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);

        String txt = "This is my house,i am come from Yunnang zhaotong,"
                + "My e-mail is ynkonghao@gmail.com,my QQ is 707807876";

        AnalyzerUtils.displayToken(txt, a1);
        // recorded output:
        // [my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail.com][my][qq][707807876]
        AnalyzerUtils.displayToken(txt, a2);
        // recorded output:
        // [my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail][com][my][qq]
        AnalyzerUtils.displayToken(txt, a3);
        // recorded output:
        // [this][is][my][house][i][am][come][from][yunnang][zhaotong][my][email][is][ynkonghao][gmail][com][my][qq][is]
        AnalyzerUtils.displayToken(txt, a4);
        // recorded output:
        // [this][is][my][house,i][am][come][from][yunnang][zhaotong,my][email][is][ynkonghao@gmail.com,my][qq][is][707807876]
    }

    /**
     * Compares the four stock analyzers on the sentence used for the Chinese
     * segmentation experiments.
     */
    @Test
    public void test02() {
        // Standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // Stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // Simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // Whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);

        String txt = "I come from Yunnan zhaotong Zhaoyang District teachers ' College";

        AnalyzerUtils.displayToken(txt, a1);
        // recorded output: one token per character of the original sentence
        AnalyzerUtils.displayToken(txt, a2);
        // recorded output: the whole sentence as a single token
        AnalyzerUtils.displayToken(txt, a3);
        // recorded output: the whole sentence as a single token
        AnalyzerUtils.displayToken(txt, a4);
        // recorded output: the whole sentence as a single token
    }

    /**
     * Prints position increment, offsets and type of every token produced by
     * each of the four stock analyzers.
     */
    @Test
    public void test03() {
        // Standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // Stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // Simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // Whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);

        String txt = "How is thank you";

        AnalyzerUtils.displayAllToken(txt, a1);
        AnalyzerUtils.displayAllToken(txt, a2);
        AnalyzerUtils.displayAllToken(txt, a3);
        AnalyzerUtils.displayAllToken(txt, a4);
    }

    /**
     * Stop-word test: the custom stop-word analyzer next to the stock one.
     */
    @Test
    public void test04() {
        Analyzer a1 = new MyStopAnalyzer(new String[] {"I", "You", "hate"});
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);

        String txt = "How is you thAnk's hate you";

        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
    }

    /**
     * Chinese word segmentation test using the dictionary-based MMSeg
     * analyzer, whose dictionary can be extended by the user.
     */
    @Test
    public void test05() {
        // Without an explicit dictionary directory the analyzer falls back to
        // its bundled dictionary; recorded output: one token per character.
        // Analyzer a1 = new MMSegAnalyzer();

        // With the dictionary directory supplied, known words are kept together
        // (recorded output: the place name is still split into single characters).
        // Custom words can be added to words-my.dic inside the data directory;
        // after adding the place name, the recorded output keeps it as one token.
        Analyzer a1 = new MMSegAnalyzer(
                new File("d:\\workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data"));

        String txt = "I come from the Yunnan zhaotong Zhaoyang District teachers ' College";

        AnalyzerUtils.displayToken(txt, a1);
    }

    /**
     * Synonym test: indexes one document with the synonym analyzer into an
     * in-memory directory, then runs a term query and prints the stored
     * content of every hit.
     *
     * @throws IOException if indexing or searching fails
     * @throws CorruptIndexException if the index is corrupt
     */
    @Test
    public void test06() throws CorruptIndexException, IOException {
        Analyzer a1 = new MySameAnalyzer();
        String txt = "I come from the Chinese Yunnan zhaotong Zhaoyang District teachers ' College";
        AnalyzerUtils.displayAllToken(txt, a1);

        String keyword = "i";
        Directory dire = new RAMDirectory();

        IndexWriter indexWriter = new IndexWriter(dire, new IndexWriterConfig(Version.LUCENE_35, a1));
        try {
            Document doc = new Document();
            doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
            indexWriter.addDocument(doc);
        } finally {
            // Closing also commits, which makes the document visible to the reader below.
            indexWriter.close();
        }

        IndexReader reader = IndexReader.open(dire);
        try {
            IndexSearcher search = new IndexSearcher(reader);
            try {
                // TermQuery takes a Term (field + text), not two bare strings.
                TopDocs topDoc = search.search(new TermQuery(new Term("content", keyword)), 10);
                ScoreDoc[] scoreDocs = topDoc.scoreDocs;
                for (ScoreDoc score : scoreDocs) {
                    Document hit = search.doc(score.doc);
                    System.out.println(hit.get("content"));
                }
            } finally {
                search.close();
            }
        } finally {
            // A searcher built from a reader does not close that reader itself.
            reader.close();
        }
    }
}
5. Extending the analyzer with your own stop words
package org.lucene.util;

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
 * Stop-word analyzer that filters caller-supplied stop words in addition to
 * the entries of {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
 *
 * @author User
 */
public class MyStopAnalyzer extends Analyzer {

    /** Custom stop words merged with the stock English stop words. */
    private final Set<Object> stops;

    /**
     * Creates an analyzer that removes the given words as well as the stock
     * English stop words.
     *
     * @param sws additional stop words; {@code null} is treated as "none"
     */
    public MyStopAnalyzer(String[] sws) {
        String[] extra = (sws == null) ? new String[0] : sws;
        // makeStopSet converts the array into a set; 'true' requests case-insensitive matching.
        stops = StopFilter.makeStopSet(Version.LUCENE_35, extra, true);
        // Add the stock stop words on top of the custom ones.
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * Creates an analyzer that removes only the stock English stop words.
     */
    public MyStopAnalyzer() {
        // Delegate so that 'stops' is always initialised before addAll is called on it.
        this(new String[0]);
    }

    /**
     * Builds the filter chain: letter tokenizer, then lower-casing, then
     * stop-word removal.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader source of the text to tokenize
     * @return the token stream with stop words removed
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // The reader must reach the returned chain unread: draining it here
        // (for example to print tokens while debugging) would leave the
        // returned stream empty.
        return new StopFilter(
                Version.LUCENE_35,
                new LowerCaseFilter(
                        Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                stops);
    }
}
6. Extending the analyzer: a synonym analyzer
package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Synonym analyzer: segments text with the MMSeg tokenizer and passes the
 * tokens through {@link MySameTokenFilter}.
 *
 * @author User
 */
public class MySameAnalyzer extends Analyzer {

    /**
     * Builds the chain: MMSeg tokenizer (max-word segmentation), then the
     * synonym filter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader source of the text to tokenize
     * @return the token stream produced by the synonym filter
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        Dictionary dic = Dictionary.getInstance(
                "D:\\workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data");
        // The reader belongs to the tokenizer; the filter only wraps the tokenizer.
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
7. Implementing the synonym token filter
package org.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that emits, after each token that has synonyms, one extra
 * token per synonym with a position increment of 0.
 *
 * <p>Design: simply appending a synonym to the term attribute would glue the
 * original word and the synonym together into a single token, and overwriting
 * the term would replace the original word instead of adding to it. So when a
 * token with synonyms is seen, its synonyms are pushed onto a stack and the
 * current attribute state is captured. On the following calls, while the
 * stack is not empty, the captured state is restored, the term text is
 * replaced by the next synonym and the position increment is set to 0.
 *
 * @author user
 */
public class MySameTokenFilter extends TokenFilter {

    /**
     * Hard-coded synonym table, built once instead of once per token.
     * NOTE(review): the second entry lists "I" as a synonym of itself - confirm
     * the intended entries.
     */
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();

    static {
        SYNONYMS.put("China", new String[] {"mainland", "Celestial"});
        SYNONYMS.put("I", new String[] {"We", "I"});
    }

    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;

    /** Synonyms of the most recent original token that are still to be emitted. */
    private final Stack<String> sames;

    /** Attribute state captured for the token whose synonyms are being emitted. */
    private AttributeSource.State current;

    /**
     * @param input the token stream to wrap
     */
    protected MySameTokenFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * Emits a pending synonym if there is one, otherwise advances the wrapped
     * stream.
     *
     * @return {@code false} once the wrapped stream is exhausted and no
     *         synonym is pending
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            // Pop the next synonym and emit it in place of the original term.
            String str = sames.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 places the synonym at the same position as the original token.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        if (getSameWords(cta.toString())) {
            // Synonyms are queued; remember this token's state for the following calls.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards any synonyms still pending.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Pushes the synonyms of {@code name}, if any, onto the pending stack.
     *
     * @param name term text to look up
     * @return {@code true} if at least one table entry exists for {@code name}
     */
    private boolean getSameWords(String name) {
        String[] sws = SYNONYMS.get(name);
        if (sws == null) {
            return false;
        }
        for (String s : sws) {
            sames.push(s);
        }
        return true;
    }
}
8. Printing token information
package org.lucene.util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Helpers that print the tokens an analyzer produces for a piece of text.
 *
 * @author User
 */
public class AnalyzerUtils {

    /**
     * Prints every token of {@code str} on one line as {@code [term]},
     * followed by a line break.
     *
     * @param str text to analyze
     * @param a analyzer used to tokenize the text
     */
    public static void displayToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("Content", new StringReader(str));
        // An attribute registered on the stream is refreshed on every call to
        // incrementToken(): the same cta object shows the current token's text
        // each time round the loop. Printing the stream itself would instead
        // print all attributes of the current token (term, offsets, position
        // increment, type).
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(stream);
        }
    }

    /**
     * Prints one line per token with its position increment, term text,
     * start/end offsets and type, followed by an empty line.
     *
     * @param str text to analyze
     * @param a analyzer used to tokenize the text
     */
    public static void displayAllToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("Content", new StringReader(str));
        // Position increment
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        // Character offsets
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
        // Term text
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        // Token type
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-" + oa.endOffset() + "-" + ta.type());
                System.out.println();
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(stream);
        }
    }

    /** Closes the stream, reporting (not propagating) a failure to close. */
    private static void closeQuietly(TokenStream stream) {
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Project Download path: http://download.csdn.net/detail/wxwzy738/5284705