[推薦]jsp中文分詞程式

來源:互聯網
上載者:User
代碼如下:
publicclass MM2
{
 privatestaticfinal Log log = LogFactory.getLog(MM2.class);
 
 privatestatic HashMap<String, Integer> dictionary =null;
 privatestaticfinalint WORD_MAX_LENGTH =9;
 private Reader reader;
 
 static
 {
 loadDictionary();
 }
 
 public MM2(Reader reader)
 {
 this.reader = reader;
 }
 
 //切分出由中文、字母、數字組成的句子
public ArrayList<Sentence> getSentence() throws IOException
 {
 ArrayList<Sentence> list=new ArrayList<Sentence>();
 StringBuffer cb=new StringBuffer();
 int d=reader.read();
 int offset=0;
 boolean b=false;
 while(d>-1)
 {
 int type=Character.getType(d);
 if(type==2|| type==9|| type==5)
 {
 d=toAscii(d);
 cb.append((char)d);
 }
 else
 {
 b=true;
 }
 d=reader.read();
 if(d==-1|| b)
 {
 if(d==-1) offset++;
 b=false;
 char[] ioBuffer =newchar[cb.length()];
 cb.getChars(0, cb.length(), ioBuffer, 0);
 Sentence sen=new Sentence(ioBuffer,offset-cb.length());
 list.add(sen);
 cb.setLength(0);
 }
 offset++;
 }
 return list;
 }
 
 //將句子切分出詞
public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
 {
 ArrayList<Token> tokenlist=new ArrayList<Token>();
 for(Sentence sen:list)
 {
 StringBuffer word =new StringBuffer();
 int offset=sen.getStartOffset();
 int bufferIndex =0;
 char c;
 boolean b=false;
 while(bufferIndex<sen.getText().length)
 {
 offset++;
 c=sen.getText()[bufferIndex++];
 if(word.length()==0)
 word.append(c);
 else
 {
 String temp = (word.toString() + c).intern();
 if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
 word.append(c);
 elseif(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
 word.append(c);
 else
 {
 bufferIndex--;
 offset--;
 while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2)
 {
 word.deleteCharAt(word.length()-1);
 bufferIndex--;
 offset--;
 }
 b=true;
 }
 }
 if(b || bufferIndex==sen.getText().length)
 {
 Token token =new Token(word.toString(),offset-word.length(),offset,"word");
 word.setLength(0);
 tokenlist.add(token);
 b=false;
 }
 }
 }
 return tokenlist;
 }
 
 //將相連的單個英文或數字組合成詞
public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
 {
 ArrayList<Token> tokenlist=new ArrayList<Token>();
 Token word=null;
 for(int i=0;i<list.size();i++)
 {
 Token t=list.get(i);
 if(t.getWord().length()==1&& Character.getType((int)t.getWord().charAt(0))!=5)
 {
 if(word==null)
 word=t;
 elseif(word.getEnd()==t.getStart())
 {
 word.setEnd(t.getEnd());
 word.setWord(word.getWord()+t.getWord());
 }
 else
 {
 tokenlist.add(word);
 word=t;
 }
 }
 elseif(word!=null)
 {
 tokenlist.add(word);
 word=null;
 tokenlist.add(t);
 }
 else
 tokenlist.add(t);
 }
 if(word!=null)
 tokenlist.add(word);
 return tokenlist;
 }
 
 //雙角轉單角
publicstaticint toAscii(int codePoint)
 {
 if((codePoint>=65296&& codePoint<=65305) //0-9
|| (codePoint>=65313&& codePoint<=65338) //A-Z
|| (codePoint>=65345&& codePoint<=65370) //a-z
 )
 {
 codePoint -=65248;
 }
 return codePoint;
 }
 
 //載入詞典
publicstaticvoid loadDictionary()
 {
 if (dictionary ==null)
 {
 dictionary =new HashMap<String, Integer>();
 InputStream is =null;
 BufferedReader br =null;
 try
 {
 is =new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
 br =new BufferedReader(new InputStreamReader(is, "UTF-8"));
 String word =null;
 while ((word = br.readLine()) !=null)
 {
 word=word.toLowerCase();
 if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH))
 {
 dictionary.put(word.intern(), 1);
 int i = word.length()-1;
 while(i >=2)
 {
 String temp = word.substring(0, i).intern();
 if (!dictionary.containsKey(temp))
 dictionary.put(temp,2);
 i--;
 }
 }
 }
 }
 catch (Exception e)
 {
 log.info(e);
 }
 finally
 {
 try
 {
 if(br!=null)
 br.close();
 if(is!=null)
 is.close();
 }
 catch (IOException e)
 {
 log.info(e);
 }
 }
 }
 }
 
 publicstatic String[] segWords(Reader input)
 {
 ArrayList<String> list=new ArrayList<String>();
 try
 {
 MM2 f=new MM2(input);
 ArrayList<Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
 for(Token t:tlist)
 {
 list.add(t.getWord());
 }
 }
 catch(IOException e)
 {
 log.info(e);
 }
 return (String[])list.toArray(new String[0]);
 }
 
 publicstaticvoid main(String[] args)
 {
 String[] cc=MM2.segWords(new StringReader("ibm商務機t60p".toLowerCase()));
 for(String c:cc)
 {
 System.out.println(c);
 }
 }
}
相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.