Java Chinese word segmentation tool (4)
Import java. io. bufferedReader; import java. io. file; import java. io. fileInputStream; import java. io. IOException; import java. io. inputStreamReader; import java. io. randomAccessFile; import java. io. serializable; import java. util. arrayList; import java. util. stringTokenizer;/** File Format: Chinese text that has been segmented. Each word is separated by a space and each paragraph in each line. * This class is suitable for reading a small number of texts in each row, such as text with good paragraphs and storing one paragraph in one row. * Reads a row. If the step is 1, the return phrase is returned. It does not generate phrases across paragraphs. * Three modes: * 1 reading the end of the file, ending * 2 reading the end of the file, starting from the beginning * 3 rows of loops multiple times, after browsing to the end of the text, it will be finished */public class ParaWordReader implements Reader {static final int normalMode = 0; // static final int againMode = 1 After browsing to the end of the text; // browse to the end of the text, start from the beginning and then start with static final int paraAgainMode = 2; // One Line repeats multiple times. After browsing to the end of the text, private int currentMode = 0; private RandomAccessFile raf = null; private File file; private ArrayList
ParaWords = null; private StringTokenizer tokenizer; private int currentPara =-1; private int paraPos = 0; private int paraIter = 0; private int paraIters = 1; public ParaWordReader (String fileName) throws IOException {file = new File (fileName); raf = new RandomAccessFile (file, r); paraWords = new ArrayList
();} Public void setMode (int m) {currentMode = m;} public void setParaIters (int iters) {paraIters = iters; setMode (paraAgainMode);} public int paraIndex () {return currentPara;} private boolean readPara () throws IOException {String line = raf. readLine (); if (line = null) // {if (currentMode = normalMode | currentMode = paraAgainMode) {return false;} else {System. out. println (the file is too large and may not be supported); raf. seek (0); currentPara =-1; return readPara () ;}} paraWords. clear (); line = new String (line. getBytes (iso8859-1), UTF-8); tokenizer = new StringTokenizer (line,); while (tokenizer. hasMoreTokens () {paraWords. add (tokenizer. nextToken ();} currentPara ++; paraPos = 0; return true;} public String [] getNextWords (int count) throws IOException {if (paraPos + count> = paraWords. size () // to the end of the section {if (currentMode = paraAgainMode & paraIter <paraIters) // start the section {paraPos = 0; paraIter ++; return getNextWords (count);} else {paraIter = 0; if (readPara () // read the new paragraph return getNextWords (count); else return null ;}} string [] words = new String [count]; for (int I = 0; I