Statement
The following code is only a basic implementation of the TF-IDF algorithm, so many parts still need to be refined. They are summarized as follows:
1. Implementation logic: words in special positions, such as the first sentence of a paragraph, and nouns (as opposed to verbs) should be given a greater weight;
2. Basic text preprocessing should be done before word segmentation: remove punctuation, and call the word-segmentation interface in an appropriate way, so that the same text content is not submitted twice when the results would be identical;
3. Speed needs to be improved: the total number of documents only needs to be refreshed about once a week, and the way the number of documents containing a keyword is measured still needs work;
package demo.utils;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;

import java.util.*;
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Import java.util.stream.Collectors; /** * @author du kui @date 2017/12/1 * * @Component public class Ltputils {@Value ("${demo.ltp-url}") Private St
Ring Ltpurl;
@Value ("${demo.api-key}") Private String Apikey;
Private Executorservice pool;
Private final pattern sum_pattern= pattern.compile ("\\d+ (, \\d{3}) *\\s strip result"); @Autowired public ltputils () {threadfactory namedthreadfactory = new Threadfactorybuilder (). Setnameformat ("Th read-pool-%d "). BuilD (); Executorservice ex = new Threadpoolexecutor (5, 0L, Timeunit.milliseconds, New linkedblockingdeque<runnable> (
1024), Namedthreadfactory, New Threadpoolexecutor.abortpolicy ());
Pool = ex; The implementation of the/** * TF-IDF algorithm * @param content needs participle, calculation frequency, inverse text frequencies, TF-IDF value of the contents * @return keywords and corresponding TF-IDF values, descending according to value values Column */public map<string,double> TFIDF (String content) {try {string[] Strarrs = GetInfo
FORLTP (content, "ws", "plain");
map<string, double> tf = Counttf (STRARRS);
Gets the TF-IDF value map<string, double> result = new hashmap<> (); For (map.entry<string,double> Ele:tf.entrySet ()) {Result.put (Ele.getkey (), Ele.getvalue () * GETIDF (
Ele.getkey ())); ///According to value values Result=result.entryset (). Stream (). Sorted (Map.Entry.comparingByValue COLLECTIONS.R Everseorder ())). Collect (Collectors.tomap (//rEsult=result.entryset (). Stream (). Sorted (Map.Entry.comparingByValue (/* Remove this in ascending order)). Collect (Collectors.tomap (
Map.entry::getkey, Map.entry::getvalue, (e1,e2)->e1,
Linkedhashmap::new));
return result;
}catch (Exception e) {//todo subdivision throw new RuntimeException (E.getmessage ()); }/** * calls the hit word and returns the result * * @param text to be processed @param pattern matching mode * @param format Returns the data format * @return */public string[] GETINFOFORLTP (string text, string pattern, string format) throws Classno Tfoundexception, Executionexception, interruptedexception {String url = ltpurl + "? api_key=" + Apikey + "&tex
t= "+ text +" &pattern= "+ pattern +" &format= "+ format;
Resttemplate resttemplate = new Resttemplate (); Future<responseentity> resp = Pool.submit (()-> Resttemplate.getforentiTy (URL, string.class, "participle"));
Responseentity<string> Respbody=resp.get ();
String [] Resparrs=respbody.getbody (). Split ("");
return resparrs; /** * Statistics frequency, normalized frequencies, that occurs more than the total number of times * @param strarrs ' * @return/public map<string,double> C Ounttf (String [] strarrs) {map<string,long> map=arrays.stream (strarrs). Collect (Collectors.groupingby
N.identity (), collectors.counting ()); Map=map.entryset (). Stream (). Sorted (Map.Entry.comparingByValue (Collections.reverseorder ())). Collect ( Collectors.tomap (//result=result.entryset). Stream (). Sorted (* Remove this in ascending order). Collect
(Collectors.tomap (Map.entry::getkey, Map.entry::getvalue, (e1,e2)->e1,
Linkedhashmap::new));
Map<string,double> result=new hashmap<> (); Map.entryset (). Stream (). ForEach (x-> result.put x.getkey (), X.GetValue ()/(double) strarrs.length);
return result; /** * Get the IDF value of the word str * @param str * @return/public double GETIDF (String str) {Rest
Template resttemplate=new resttemplate ();
String respsum= "", resparr= "";
try {respsum = Resttemplate.getforobject ("https://cn.bing.com/search?q=", String.class, "total");
Resparr = Resttemplate.getforobject ("https://cn.bing.com/search?q=" + str, string.class, "Occurrence of a Word document number");
}catch (Exception e) {e.printstacktrace ();
return 0;
Long sumresp=666l;
Long arrresp=666l;
Matcher m= Sum_pattern.matcher (respsum);
if (M.find ()) {String patternstr=m.group ();
sumresp= Long.parselong (patternstr.substring ("The result of the strip"). Replace (",", ""));
} m= Sum_pattern.matcher (Resparr);
if (M.find ()) {String patternstr=m.group (); Arrresp= Long.parselong (patternstr.substring ("The result of the strip"). Replace (",", ""));
} if (sumresp!=666l&&arrresp!=666l) {//If there are return results returned Math.log (SUMRESP/ARRRESP);
}else{//throw new RuntimeException ("Incorrect return result");
SYSTEM.OUT.PRINTLN ("Return result is incorrect:" +STR);
return 0;
}
}
}