Java Chinese word segmentation tool (4)

Source: Internet
Author: User

Java Chinese word segmentation tool (4)

 

 

 

Import java. io. bufferedReader; import java. io. file; import java. io. fileInputStream; import java. io. IOException; import java. io. inputStreamReader; import java. io. randomAccessFile; import java. io. serializable; import java. util. arrayList; import java. util. stringTokenizer;/** File Format: Chinese text that has been segmented. Each word is separated by a space and each paragraph in each line. * This class is suitable for reading a small number of texts in each row, such as text with good paragraphs and storing one paragraph in one row. * Reads a row. If the step is 1, the return phrase is returned. It does not generate phrases across paragraphs. * Three modes: * 1 reading the end of the file, ending * 2 reading the end of the file, starting from the beginning * 3 rows of loops multiple times, after browsing to the end of the text, it will be finished */public class ParaWordReader implements Reader {static final int normalMode = 0; // static final int againMode = 1 After browsing to the end of the text; // browse to the end of the text, start from the beginning and then start with static final int paraAgainMode = 2; // One Line repeats multiple times. After browsing to the end of the text, private int currentMode = 0; private RandomAccessFile raf = null; private File file; private ArrayList
 
  
ParaWords = null; private StringTokenizer tokenizer; private int currentPara =-1; private int paraPos = 0; private int paraIter = 0; private int paraIters = 1; public ParaWordReader (String fileName) throws IOException {file = new File (fileName); raf = new RandomAccessFile (file, r); paraWords = new ArrayList
  
   
();} Public void setMode (int m) {currentMode = m;} public void setParaIters (int iters) {paraIters = iters; setMode (paraAgainMode);} public int paraIndex () {return currentPara;} private boolean readPara () throws IOException {String line = raf. readLine (); if (line = null) // {if (currentMode = normalMode | currentMode = paraAgainMode) {return false;} else {System. out. println (the file is too large and may not be supported); raf. seek (0); currentPara =-1; return readPara () ;}} paraWords. clear (); line = new String (line. getBytes (iso8859-1), UTF-8); tokenizer = new StringTokenizer (line,); while (tokenizer. hasMoreTokens () {paraWords. add (tokenizer. nextToken ();} currentPara ++; paraPos = 0; return true;} public String [] getNextWords (int count) throws IOException {if (paraPos + count> = paraWords. size () // to the end of the section {if (currentMode = paraAgainMode & paraIter <paraIters) // start the section {paraPos = 0; paraIter ++; return getNextWords (count);} else {paraIter = 0; if (readPara () // read the new paragraph return getNextWords (count); else return null ;}} string [] words = new String [count]; for (int I = 0; I
   
    

 

 

 

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months of usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.