Chinese word segmentation in C# with Lucene.Net. Author: compiled from the forum. Editor in charge: ghost
The following is a reference clip:

using System;
using System.Collections.Generic;
using System.Text;

using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace mydeleetest
{
    /// <summary>
    /// Console demo that runs a sample sentence through
    /// Lucene.Fanswo.ChineseAnalyzer, prints every token produced and then
    /// reports simple timing figures for the tokenization pass.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: tokenizes one hard-coded sample sentence in verbose mode
        /// and reports any exception on the console instead of crashing.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        public static void Main(System.String[] args)
        {
            try
            {
                Test("the People's Republic of China was established in 1949 and has since begun a great chapter in the New China. Speech by Mayor Changchun during the Spring Festival ", true);
            }
            catch (System.Exception e)
            {
                System.Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message + e.ToString());
            }
        }

        /// <summary>
        /// Tokenizes an in-memory string.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="verbose">When true, every token is printed.</param>
        internal static void Test(System.String text, bool verbose)
        {
            System.Console.Out.WriteLine(" Tokenizing string: " + text);

            // Dispose the reader deterministically once tokenization is done.
            using (System.IO.StringReader reader = new System.IO.StringReader(text))
            {
                // NOTE: text.Length is a character count, so the "megabytes/hour"
                // figure printed by the overload below is only an approximation.
                Test(reader, verbose, text.Length);
            }
        }

        /// <summary>
        /// Pulls every token from <paramref name="reader"/> through the Chinese
        /// analyzer, optionally printing each one, then prints the elapsed time
        /// and the derived per-token and per-hour rates.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <param name="verbose">When true, every token is printed.</param>
        /// <param name="bytes">Size of the input, used only for the throughput figure.</param>
        internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            // Analyzer analyzer = new StandardAnalyzer();
            Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
            TokenStream stream = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;

            int count = 0;
            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Token=" + t.ToString());
                }
                count++;
            }

            System.DateTime end = System.DateTime.Now;

            // DateTime.Ticks counts 100-nanosecond intervals, not milliseconds.
            // Convert before reporting, otherwise every figure below is off by
            // a factor of 10,000.
            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;

            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");

            // Short inputs can finish in under a millisecond or yield no tokens;
            // skip the rates rather than printing NaN / Infinity.
            if (count > 0)
            {
                System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            }
            if (time > 0)
            {
                System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
            }
        }
    }
}
Test results:
Finished!
The segmentation algorithm still needs work. Chinese punctuation marks are not handled yet, and I will improve this further.
I am not good at writing long explanations, so I let the code speak for me. Comments are welcome. Thank you.