Chinese Word Segmentation source code-Java

Last Update:2018-12-05 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

package org.apache.lucene.analysis.cn;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.Reader;import org.apache.lucene.analysis.*;/** * Title: ChineseTokenizer * Description: Extract tokens from the Stream using Character.getType() *              Rule: A Chinese character as a single token * Copyright:   Copyright (c) 2001 * Company: * * The difference between thr ChineseTokenizer and the * CJKTokenizer (id=23545) is that they have different * token parsing logic. *  * Let me use an example. If having a Chinese text * "C1C2C3C4" to be indexed, the tokens returned from the * ChineseTokenizer are C1, C2, C3, C4. And the tokens * returned from the CJKTokenizer are C1C2, C2C3, C3C4. * * Therefore the index the CJKTokenizer created is much * larger. * * The problem is that when searching for C1, C1C2, C1C3, * C4C2, C1C2C3 ... the ChineseTokenizer works, but the * CJKTokenizer will not work. 
* * @author Yiyi Sun * @version 1.0 * */public final class ChineseTokenizer extends Tokenizer {    public ChineseTokenizer(Reader in) {        input = in;    }    private int offset = 0, bufferIndex=0, dataLen=0;    private final static int MAX_WORD_LEN = 255;    private final static int IO_BUFFER_SIZE = 1024;    private final char[] buffer = new char[MAX_WORD_LEN];    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];    private int length;    private int start;    private final void push(char c) {        if (length == 0) start = offset-1;            // start of token        buffer[length++] = Character.toLowerCase(c);  // buffer it    }    private final Token flush() {        if (length>0) {            //System.out.println(new String(buffer, 0, length));            return new Token(new String(buffer, 0, length), start, start+length);        }        else            return null;    }    public final Token next() throws java.io.IOException {        length = 0;        start = offset;        while (true) {            final char c;            offset++;            if (bufferIndex >= dataLen) {                dataLen = input.read(ioBuffer);                bufferIndex = 0;            }            if (dataLen == -1) return flush();            else                c = ioBuffer[bufferIndex++];            switch(Character.getType(c)) {            case Character.DECIMAL_DIGIT_NUMBER:            case Character.LOWERCASE_LETTER:            case Character.UPPERCASE_LETTER:                push(c);                if (length == MAX_WORD_LEN) return flush();                break;            case Character.OTHER_LETTER:                if (length>0) {                    bufferIndex--;                    offset--;                    return flush();                }                push(c);                return flush();            default:                if (length>0) return flush();                break;            }        }    }}

/////////////////////////////////////////

package org.apache.lucene.analysis.cn;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.util.Hashtable;import org.apache.lucene.analysis.*;/** * Title: ChineseFilter * Description: Filter with a stop word table *              Rule: No digital is allowed. *                    English word/token should larger than 1 character. *                    One Chinese character as one Chinese word. * TO DO: *   1. Add Chinese stop words, such as /ue400 *   2. Dictionary based Chinese word extraction *   3. Intelligent Chinese word extraction * * Copyright:    Copyright (c) 2001 * Company: * @author Yiyi Sun * @version 1.0 * */public final class ChineseFilter extends TokenFilter {    // Only English now, Chinese to be added later.    
public static final String[] STOP_WORDS = {    "and", "are", "as", "at", "be", "but", "by",    "for", "if", "in", "into", "is", "it",    "no", "not", "of", "on", "or", "such",    "that", "the", "their", "then", "there", "these",    "they", "this", "to", "was", "will", "with"    };    private Hashtable stopTable;    public ChineseFilter(TokenStream in) {        super(in);        stopTable = new Hashtable(STOP_WORDS.length);        for (int i = 0; i < STOP_WORDS.length; i++)            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);    }    public final Token next() throws java.io.IOException {        for (Token token = input.next(); token != null; token = input.next()) {            String text = token.termText();          // why not key off token type here assuming ChineseTokenizer comes first?            if (stopTable.get(text) == null) {                switch (Character.getType(text.charAt(0))) {                case Character.LOWERCASE_LETTER:                case Character.UPPERCASE_LETTER:                    // English word/token should larger than 1 character.                    if (text.length()>1) {                        return token;                    }                    break;                case Character.OTHER_LETTER:                    // One Chinese character as one Chinese word.                    // Chinese word extraction to be added later here.                    return token;                }            }        }        return null;    }}

/////////////////////////

package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Title: ChineseAnalyzer
 * Description:
 *   Subclass of org.apache.lucene.analysis.Analyzer
 *   built from a ChineseTokenizer, filtered with ChineseFilter.
 * Copyright:   Copyright (c) 2001
 * Company:
 * @author Yiyi Sun
 * @version 1.0
 *
 */
public class ChineseAnalyzer extends Analyzer {

    /** Creates an analyzer; there is nothing to configure. */
    public ChineseAnalyzer() {
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param fieldName name of the field being analyzed; not used by this analyzer
     * @param reader    source of the text to tokenize
     * @return a TokenStream built from a ChineseTokenizer filtered with ChineseFilter
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new ChineseFilter(new ChineseTokenizer(reader));
    }
}

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Chinese Word Segmentation source code-Java

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Chinese Word Segmentation source code-Java

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support