MMAnalyzer: an easy-to-use Chinese word segmentation component

Source: Internet
Author: User

Supports word segmentation of English, numbers, and Chinese (Simplified)
Matches common quantity expressions and personal names
Curated dictionary of more than 220,000 words
Implements the forward maximum matching algorithm

// Uses the forward maximum matching Chinese word segmentation algorithm;
// equivalent to a segmentation granularity of 0
MMAnalyzer analyzer = new MMAnalyzer();

// The parameter is the segmentation granularity: when the character count equals
// or exceeds this parameter and the characters form a word, that word is split out.
MMAnalyzer analyzer = new MMAnalyzer(2);

// Add a new dictionary, read one word per line
MMAnalyzer.addDictionary(reader);

// Add a new word
MMAnalyzer.addWord(newWord);


// Delete all words in the dictionary.
// (Note: all word segmentation will fail until a new dictionary is loaded)
MMAnalyzer.clear();

// Whether the dictionary contains the word
MMAnalyzer.contains(String word);

// Remove the word from the dictionary
MMAnalyzer.removeWord(String word);

// Total number of words contained in the current dictionary
MMAnalyzer.size();

 

View plaincopy to clipboardprint?
Package demo. analysis;

Import java. Io. ioexception;

Import jeasy. analysis. mmanalyzer;

Public class segment
{

Public static void main (string [] ARGs)
{
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. ";

Mmanalyzer analyzer = new mmanalyzer ();
Try
{
System. Out. println (analyzer. segment (text, "| "));
}
Catch (ioexception E)
{
E. printstacktrace ();
}
}
}
Package demo. analysis;

Import java. Io. ioexception;

Import jeasy. analysis. mmanalyzer;

Public class segment
{

Public static void main (string [] ARGs)
{
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. ";

Mmanalyzer analyzer = new mmanalyzer ();
Try
{
System. Out. println (analyzer. segment (text, "| "));
}
Catch (ioexception E)
{
E. printstacktrace ();
}
}
}

Effect:

| Reuters | report | Indonesia | social | affairs | Ministry | officials | Tuesday | 29th | dianchao | city | nearby | Local time | 27th

| Morning | 53 | occurrence | Rishi | magnitude 6.2 | earthquake | caused | at least | 5427 | deaths | 20000 | more people | injured | nearly | 0.2 million | homeless |

 

View plaincopy to clipboardprint?
Package demo. analysis;

Import jeasy. analysis. mmanalyzer;

Import org. Apache. Lucene. analysis. analyzer;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;

Public class segment
{

Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content

// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();

Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );

Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes, field. Index. tokenized ));
Iwriter. adddocument (DOC );
Iwriter. Close ();

Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());

For (INT I = 0; I {
Document hitdoc = hits.doc (I );
System. Out. println ("content:" + hitdoc. Get (fieldname ));
}

Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}

}
Package demo. analysis;

Import jeasy. analysis. mmanalyzer;

Import org. Apache. Lucene. analysis. analyzer;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;

Public class segment
{

Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content

// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();

Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );

Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes, field. Index. tokenized ));
Iwriter. adddocument (DOC );
Iwriter. Close ();

Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());

For (INT I = 0; I {
Document hitdoc = hits.doc (I );
System. Out. println ("content:" + hitdoc. Get (fieldname ));
}

Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}

}

Effect:

Hit: 1
Content: according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th) that, occurring at 05:53 local time on the 27th near the city of Riya,

the magnitude 6.2 earthquake has killed at least 5427 people, injured more than 20000 people, and left nearly 0.2 million homeless.

 

View plaincopy to clipboardprint?
Package demo. analysis;

Import jeasy. analysis. mmanalyzer;

Import org. Apache. Lucene. analysis. analyzer;
Import org. Apache. Lucene. analysis. tokenstream;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexreader;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. Index. termpositionvector;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Search. Highlight. highlighter;
Import org. Apache. Lucene. Search. Highlight. queryscorer;
Import org. Apache. Lucene. Search. Highlight. tokensources;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;

Public class segment
{

Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content

// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();

Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );

Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes,
Field. Index. tokenized,
Field. termvector. with_positions_offsets ));
Iwriter. adddocument (DOC );
Iwriter. Close ();

Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());

Highlighter = new highlighter (New queryscorer (query ));
For (INT I = 0; I {
TEXT = hits.doc (I). Get (fieldname );
Termpositionvector TPV = (termpositionvector) indexreader. Open (
Directory). gettermfreqvector (hits. ID (I), fieldname );
Tokenstream = tokensources. gettokenstream (TPV );
String result = highlighter. getbestfragments (tokenstream, text, 3 ,"...");
System. Out. println ("content:" + result );
}

Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}

}
Package demo. analysis;

Import jeasy. analysis. mmanalyzer;

Import org. Apache. Lucene. analysis. analyzer;
Import org. Apache. Lucene. analysis. tokenstream;
Import org.apache.e.doc ument. Document;
Import org.apache.e.doc ument. field;
Import org. Apache. Lucene. Index. indexreader;
Import org. Apache. Lucene. Index. indexwriter;
Import org. Apache. Lucene. Index. termpositionvector;
Import org. Apache. Lucene. queryparser. queryparser;
Import org. Apache. Lucene. Search. Hits;
Import org. Apache. Lucene. Search. indexsearcher;
Import org. Apache. Lucene. Search. query;
Import org. Apache. Lucene. Search. Highlight. highlighter;
Import org. Apache. Lucene. Search. Highlight. queryscorer;
Import org. Apache. Lucene. Search. Highlight. tokensources;
Import org. Apache. Lucene. Store. Directory;
Import org. Apache. Lucene. Store. ramdirectory;

Public class segment
{

Public static void main (string [] ARGs)
{
String fieldname = "text ";
String text = "according to Reuters, an Indonesian Ministry of Social Affairs Officer said on Tuesday (29th ,"
+ "At 05:53, the first magnitude of the 6.2-magnitude earthquake occurred at local time on the 27 th day near the city of Riya, killing at least 5427 people ,"
+ "More than 20000 people were injured and nearly 0.2 million were left homeless. "; // Retrieve content

// Use the forward maximum matching Chinese Word Segmentation Algorithm
Analyzer analyzer = new mmanalyzer ();

Directory directory = new ramdirectory ();
// Directory = fsdirectory. getdirectory ("/tmp/testindex", true );

Try
{
Indexwriter iwriter = new indexwriter (directory, analyzer, true );
Iwriter. setmaxfieldlength (25000 );
Document Doc = new document ();
Doc. Add (new field (fieldname, text, field. Store. Yes,
Field. Index. tokenized,
Field. termvector. with_positions_offsets ));
Iwriter. adddocument (DOC );
Iwriter. Close ();

Indexsearcher isearcher = new indexsearcher (directory );
Queryparser parser = new queryparser (fieldname, analyzer );
Query query = parser. parse ("Indonesia earthquake magnitude 6.2"); // query term
Hits hits = isearcher. Search (query );
System. Out. println ("Hit:" + hits. Length ());

Highlighter = new highlighter (New queryscorer (query ));
For (INT I = 0; I {
TEXT = hits.doc (I). Get (fieldname );
Termpositionvector TPV = (termpositionvector) indexreader. Open (
Directory). gettermfreqvector (hits. ID (I), fieldname );
Tokenstream = tokensources. gettokenstream (TPV );
String result = highlighter. getbestfragments (tokenstream, text, 3 ,"...");
System. Out. println ("content:" + result );
}

Isearcher. Close ();
Directory. Close ();
}
Catch (exception E)
{
E. printstacktrace ();
}
}

}
 
Effect:

Hit: 1
Content: according to Reuters, a Ministry of Social Affairs Officer in <B> Indonesia </B> said on Tuesday (29th) that, occurring at 05:53 local time on the 27th near the city of Riya,

the <B> magnitude 6.2 </B> <B> earthquake </B> has killed at least 5427 people, injured more than 20000 people, and left nearly 0.2 million homeless.

This article is from the CSDN blog; please credit the source when reposting: http://blog.csdn.net/Java2King/archive/2010/01/08/5155878.aspx

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.