Chinese Word Segmentation source code-Java

Source: Internet
Author: User
package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseTokenizer
 * Description: Extract tokens from the Stream using Character.getType()
 *              Rule: A Chinese character as a single token
 * Copyright:   Copyright (c) 2001
 * Company:
 *
 * The difference between the ChineseTokenizer and the
 * CJKTokenizer (id=23545) is that they have different
 * token parsing logic.
 *
 * Let me use an example. If having a Chinese text
 * "C1C2C3C4" to be indexed, the tokens returned from the
 * ChineseTokenizer are C1, C2, C3, C4. And the tokens
 * returned from the CJKTokenizer are C1C2, C2C3, C3C4.
 *
 * Therefore the index the CJKTokenizer created is much
 * larger.
 *
 * The problem is that when searching for C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
 * CJKTokenizer will not work.
 *
 * @author Yiyi Sun
 * @version 1.0
 *
 */
public final class ChineseTokenizer extends Tokenizer {

    /**
     * Creates a tokenizer that reads characters from the given reader.
     *
     * @param in the character source to tokenize
     */
    public ChineseTokenizer(Reader in) {
        input = in;
    }

    /** Count of characters consumed so far; used to compute token offsets. */
    private int offset = 0;
    /** Index of the next unread character in {@link #ioBuffer}. */
    private int bufferIndex = 0;
    /** Number of valid characters in {@link #ioBuffer}, or -1 once the reader is exhausted. */
    private int dataLen = 0;

    /** Maximum number of characters accumulated into a single token. */
    private final static int MAX_WORD_LEN = 255;
    /** Size of the chunk requested from the reader on each refill. */
    private final static int IO_BUFFER_SIZE = 1024;

    /** Characters of the token currently being assembled. */
    private final char[] buffer = new char[MAX_WORD_LEN];
    /** Raw characters most recently read from the input. */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** Number of characters currently held in {@link #buffer}. */
    private int length;
    /** Start offset of the token currently being assembled. */
    private int start;

    /**
     * Appends the lower-cased form of a character to the current token,
     * recording the token's start offset when it is the first character.
     *
     * @param c the character to append
     */
    private final void push(char c) {
        if (length == 0) {
            // offset was already advanced past c by next(), hence the -1.
            start = offset - 1;
        }
        buffer[length++] = Character.toLowerCase(c);
    }

    /**
     * Builds a token from the buffered characters.
     *
     * @return the buffered token, or null when nothing has been buffered
     */
    private final Token flush() {
        if (length > 0) {
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        return null;
    }

    /**
     * Returns the next token from the input.
     *
     * Consecutive decimal digits and upper/lower-case letters are grouped
     * into one lower-cased token of at most MAX_WORD_LEN characters. Each
     * character of type OTHER_LETTER becomes a token of its own. Any other
     * character ends a pending token and is otherwise skipped.
     *
     * @return the next token, or null when the input is exhausted and no
     *         characters are pending
     * @throws java.io.IOException if reading from the input fails
     */
    public final Token next() throws java.io.IOException {
        length = 0;
        start = offset;

        while (true) {
            final char c;
            offset++;

            // Refill the I/O buffer once every buffered character has been consumed.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
                // End of input: emit whatever is pending (possibly nothing).
                return flush();
            } else {
                c = ioBuffer[bufferIndex++];
            }

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);
                if (length == MAX_WORD_LEN) {
                    return flush();
                }
                break;

            case Character.OTHER_LETTER:
                if (length > 0) {
                    // A word is pending: un-read this character so that the
                    // next call sees it again, and emit the word first.
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();

            default:
                // Any other character acts as a delimiter.
                if (length > 0) {
                    return flush();
                }
                break;
            }
        }
    }
}

/////////////////////////////////////////

package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Hashtable;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseFilter
 * Description: Filter with a stop word table
 *              Rule: No digits are allowed.
 *                    An English word/token should be longer than 1 character.
 *                    One Chinese character as one Chinese word.
 * TO DO:
 *   1. Add Chinese stop words, such as /ue400
 *   2. Dictionary based Chinese word extraction
 *   3. Intelligent Chinese word extraction
 *
 * Copyright:    Copyright (c) 2001
 * Company:
 * @author Yiyi Sun
 * @version 1.0
 *
 */
public final class ChineseFilter extends TokenFilter {

    // Only English now, Chinese to be added later.
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /** Lookup table of stop words; each word is stored as both key and value. */
    private Hashtable stopTable;

    /**
     * Wraps the given stream and loads the stop word table from
     * {@link #STOP_WORDS}.
     *
     * @param in the token stream to filter
     */
    public ChineseFilter(TokenStream in) {
        super(in);

        stopTable = new Hashtable(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++) {
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
        }
    }

    /**
     * Returns the next token of the wrapped stream that passes the filter.
     *
     * A token is dropped when its text is empty, is a stop word, starts with
     * an upper/lower-case letter but is only one character long, or starts
     * with a character that is neither such a letter nor of type
     * OTHER_LETTER (for example a digit).
     *
     * @return the next accepted token, or null when the wrapped stream is
     *         exhausted
     * @throws java.io.IOException if the wrapped stream fails
     */
    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            // why not key off token type here assuming ChineseTokenizer comes first?

            // An empty term has no first character to classify; drop it
            // instead of letting charAt(0) throw.
            if (text.length() == 0) {
                continue;
            }

            if (stopTable.get(text) == null) {
                switch (Character.getType(text.charAt(0))) {

                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // English word/token should be longer than 1 character.
                    if (text.length() > 1) {
                        return token;
                    }
                    break;

                case Character.OTHER_LETTER:
                    // One Chinese character as one Chinese word.
                    // Chinese word extraction to be added later here.
                    return token;
                }
            }
        }

        return null;
    }
}
/////////////////////////
package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Title: ChineseAnalyzer
 * Description:
 *   Subclass of org.apache.lucene.analysis.Analyzer
 *   built from a ChineseTokenizer, filtered with ChineseFilter.
 * Copyright:   Copyright (c) 2001
 * Company:
 * @author Yiyi Sun
 * @version 1.0
 *
 */
public class ChineseAnalyzer extends Analyzer {

    /** Creates an analyzer; there is nothing to configure. */
    public ChineseAnalyzer() {
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param fieldName name of the field being analyzed; not used by this method
     * @param reader    source of the text to tokenize
     * @return  A TokenStream built from a ChineseTokenizer filtered with ChineseFilter.
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        // The tokenizer splits the text; the filter then drops unwanted tokens.
        return new ChineseFilter(new ChineseTokenizer(reader));
    }
}
Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.