Supports mixed word segmentation of English, numbers, and Simplified Chinese
Matches common quantities and personal names
Ships with a curated dictionary of more than 220,000 words
Implements the forward maximum matching algorithm
// API summary for jeasy.analysis.MMAnalyzer (illustrative snippets, not a single compilable unit).

// Use the forward maximum matching Chinese word segmentation algorithm;
// this is equivalent to a segmentation granularity of 0.
MMAnalyzer analyzer = new MMAnalyzer();
// Alternative constructor: the parameter is the segmentation granularity. When the number of
// characters equals or exceeds this value and the characters form a word, that word is split out.
MMAnalyzer analyzer = new MMAnalyzer(2);
// Add a new dictionary, read as one word per line.
MMAnalyzer.addDictionary(reader);
// Add a new word.
MMAnalyzer.addWord(newWord);
// Delete all words in the dictionary.
// (Note: until a new dictionary is loaded, all word segmentation will fail.)
MMAnalyzer.clear();
// Whether the dictionary contains the word.
MMAnalyzer.contains(String word);
// Remove the word from the dictionary.
MMAnalyzer.removeWord(String word);
// Total number of words contained in the current dictionary.
MMAnalyzer.size();
View plaincopy to clipboardprint?
package demo.analysis;

import java.io.IOException;

import jeasy.analysis.MMAnalyzer;

/**
 * Minimal segmentation demo: runs a sample sentence through {@code MMAnalyzer}
 * and prints the resulting tokens joined by a separator string.
 */
public class Segment
{
    /**
     * Segments the sample text and writes the result to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
                + "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
                + "More than 20000 people were injured and nearly 0.2 million were left homeless. ";

        // Default constructor: forward maximum matching segmentation.
        MMAnalyzer analyzer = new MMAnalyzer();
        try
        {
            // The second argument is the separator placed between the tokens.
            System.out.println(analyzer.segment(text, "| "));
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }
}
package demo.analysis;

import java.io.IOException;

import jeasy.analysis.MMAnalyzer;

/**
 * Minimal segmentation demo: runs a sample sentence through {@code MMAnalyzer}
 * and prints the resulting tokens joined by a separator string.
 */
public class Segment
{
    /**
     * Segments the sample text and writes the result to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
                + "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
                + "More than 20000 people were injured and nearly 0.2 million were left homeless. ";

        // Default constructor: forward maximum matching segmentation.
        MMAnalyzer analyzer = new MMAnalyzer();
        try
        {
            // The second argument is the separator placed between the tokens.
            System.out.println(analyzer.segment(text, "| "));
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }
}
Output:
| Reuters | report | Indonesia | social | affairs | Ministry | officials | Tuesday | 29th | dianchao | city | nearby | Local time | 27th
| Morning | 53 | occurrence | Rishi | magnitude 6.2 | earthquake | caused | at least | 5427 | deaths | 20000 | more people | injured | nearly | 0.2 million | homeless |
View plaincopy to clipboardprint?
Package demo. analysis;
Import jeasy. analysis. mmanalyzer;
Import org. Apache. Lucene. analysis. analyzer;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;
Public class segment
{
Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content
// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();
Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );
Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes, field. Index. tokenized ));
Iwriter. adddocument (DOC );
Iwriter. Close ();
Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());
For (INT I = 0; I {
Document hitdoc = hits.doc (I );
System. Out. println ("content:" + hitdoc. Get (fieldname ));
}
Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}
}
Package demo. analysis;
Import jeasy. analysis. mmanalyzer;
Import org. Apache. Lucene. analysis. analyzer;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;
Public class segment
{
Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content
// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();
Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );
Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes, field. Index. tokenized ));
Iwriter. adddocument (DOC );
Iwriter. Close ();
Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());
For (INT I = 0; I {
Document hitdoc = hits.doc (I );
System. Out. println ("content:" + hitdoc. Get (fieldname ));
}
Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}
}
Output:
Hit: 1
Content: according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th) that the occurrence of 05:53 on the 27 th local time near the city of rianchao
At least 5427 people were killed, more than 20000 people were injured, and nearly 0.2 million people were left homeless.
View plaincopy to clipboardprint?
Package demo. analysis;
Import jeasy. analysis. mmanalyzer;
Import org. Apache. Lucene. analysis. analyzer;
Import org. Apache. Lucene. analysis. tokenstream;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexreader;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. Index. termpositionvector;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Search. Highlight. highlighter;
Import org. Apache. Lucene. Search. Highlight. queryscorer;
Import org. Apache. Lucene. Search. Highlight. tokensources;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;
Public class segment
{
Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content
// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();
Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );
Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes,
Field. Index. tokenized,
Field. termvector. with_positions_offsets ));
Iwriter. adddocument (DOC );
Iwriter. Close ();
Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());
Highlighter = new highlighter (New queryscorer (query ));
For (INT I = 0; I {
TEXT = hits.doc (I). Get (fieldname );
Termpositionvector TPV = (termpositionvector) indexreader. Open (
Directory). gettermfreqvector (hits. ID (I), fieldname );
Tokenstream = tokensources. gettokenstream (TPV );
String result = highlighter. getbestfragments (tokenstream, text, 3 ,"...");
System. Out. println ("content:" + result );
}
Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}
}
Package demo. analysis;
Import jeasy. analysis. mmanalyzer;
Import org. Apache. Lucene. analysis. analyzer;
Import org. Apache. Lucene. analysis. tokenstream;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexreader;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. Index. termpositionvector;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Search. Highlight. highlighter;
Import org. Apache. Lucene. Search. Highlight. queryscorer;
Import org. Apache. Lucene. Search. Highlight. tokensources;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;
Public class segment
{
Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content
// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();
Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );
Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes,
Field. Index. tokenized,
Field. termvector. with_positions_offsets ));
Iwriter. adddocument (DOC );
Iwriter. Close ();
Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());
Highlighter = new highlighter (New queryscorer (query ));
For (INT I = 0; I {
TEXT = hits.doc (I). Get (fieldname );
Termpositionvector TPV = (termpositionvector) indexreader. Open (
Directory). gettermfreqvector (hits. ID (I), fieldname );
Tokenstream = tokensources. gettokenstream (TPV );
String result = highlighter. getbestfragments (tokenstream, text, 3 ,"...");
System. Out. println ("content:" + result );
}
Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}
}
Output:
Hit: 1
Content: according to Reuters, a Ministry of Social Affairs Officer in <B> Indonesia </B> said on Tuesday (29th) that the occurrence of 05:53 on the 27 th day of the local time near the city of Japan
The <B> magnitude 6.2 </B> <B> earthquake </B> has killed at least 5427 people, injured more than 20000 people, and left nearly 0.2 million homeless.
This article is from a CSDN blog; if you repost it, please cite the source: http://blog.csdn.net/Java2King/archive/2010/01/08/5155878.aspx