A basic implementation of the TF-IDF algorithm in Java

Source: Internet
Author: User
Statement

The following code is just the basic implementation of the TF-IDF algorithm idea, so many places need to be perfected, summarized as follows:
1. Logic: words in special positions (such as the beginning of a paragraph) and nouns (as opposed to verbs) should be given a greater weight;
2. Pre-processing: before word segmentation the text should receive some basic processing — remove punctuation and call the word-segmentation interface in an appropriate way, so that calling it twice with the same text content gives the same result;
3. Speed needs to be improved: the total number of texts only needs to be updated about once a week; how the number of texts containing a keyword is measured also needs attention;

package demo.utils;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;

import java.util.*;
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

Import java.util.stream.Collectors; /** * @author du kui @date 2017/12/1 * * @Component public class Ltputils {@Value ("${demo.ltp-url}") Private St

    Ring Ltpurl;


    @Value ("${demo.api-key}") Private String Apikey;

    Private Executorservice pool;

    Private final pattern sum_pattern= pattern.compile ("\\d+ (, \\d{3}) *\\s strip result"); @Autowired public ltputils () {threadfactory namedthreadfactory = new Threadfactorybuilder (). Setnameformat ("Th read-pool-%d "). BuilD (); Executorservice ex = new Threadpoolexecutor (5, 0L, Timeunit.milliseconds, New linkedblockingdeque<runnable> (
        1024), Namedthreadfactory, New Threadpoolexecutor.abortpolicy ());
    Pool = ex; The implementation of the/** * TF-IDF algorithm * @param content needs participle, calculation frequency, inverse text frequencies, TF-IDF value of the contents * @return keywords and corresponding TF-IDF values, descending according to value values Column */public map<string,double> TFIDF (String content) {try {string[] Strarrs = GetInfo
            FORLTP (content, "ws", "plain");
            map<string, double> tf = Counttf (STRARRS);
            Gets the TF-IDF value map<string, double> result = new hashmap<> (); For (map.entry<string,double> Ele:tf.entrySet ()) {Result.put (Ele.getkey (), Ele.getvalue () * GETIDF (
            Ele.getkey ())); ///According to value values Result=result.entryset (). Stream (). Sorted (Map.Entry.comparingByValue COLLECTIONS.R Everseorder ())). Collect (Collectors.tomap (//rEsult=result.entryset (). Stream (). Sorted (Map.Entry.comparingByValue (/* Remove this in ascending order)). Collect (Collectors.tomap (
                    Map.entry::getkey, Map.entry::getvalue, (e1,e2)->e1,
            Linkedhashmap::new));
        return result;
        }catch (Exception e) {//todo subdivision throw new RuntimeException (E.getmessage ()); }/** * calls the hit word and returns the result * * @param text to be processed @param pattern matching mode * @param format Returns the data format * @return */public string[] GETINFOFORLTP (string text, string pattern, string format) throws Classno Tfoundexception, Executionexception, interruptedexception {String url = ltpurl + "? api_key=" + Apikey + "&tex

        t= "+ text +" &pattern= "+ pattern +" &format= "+ format;
        Resttemplate resttemplate = new Resttemplate (); Future<responseentity> resp = Pool.submit (()-> Resttemplate.getforentiTy (URL, string.class, "participle"));

        Responseentity<string> Respbody=resp.get ();
        String [] Resparrs=respbody.getbody (). Split ("");
    return resparrs; /** * Statistics frequency, normalized frequencies, that occurs more than the total number of times * @param strarrs ' * @return/public map<string,double> C Ounttf (String [] strarrs) {map<string,long> map=arrays.stream (strarrs). Collect (Collectors.groupingby
        N.identity (), collectors.counting ()); Map=map.entryset (). Stream (). Sorted (Map.Entry.comparingByValue (Collections.reverseorder ())). Collect ( Collectors.tomap (//result=result.entryset). Stream (). Sorted (* Remove this in ascending order). Collect 
                (Collectors.tomap (Map.entry::getkey, Map.entry::getvalue, (e1,e2)->e1,
        Linkedhashmap::new));
        Map<string,double> result=new hashmap<> (); Map.entryset (). Stream (). ForEach (x-> result.put x.getkey (), X.GetValue ()/(double) strarrs.length);
    return result; /** * Get the IDF value of the word str * @param str * @return/public double GETIDF (String str) {Rest
        Template resttemplate=new resttemplate ();
        String respsum= "", resparr= "";
            try {respsum = Resttemplate.getforobject ("https://cn.bing.com/search?q=", String.class, "total");
        Resparr = Resttemplate.getforobject ("https://cn.bing.com/search?q=" + str, string.class, "Occurrence of a Word document number");
            }catch (Exception e) {e.printstacktrace ();
        return 0;
        Long sumresp=666l;
        Long arrresp=666l;
        Matcher m= Sum_pattern.matcher (respsum);
            if (M.find ()) {String patternstr=m.group ();
        sumresp= Long.parselong (patternstr.substring ("The result of the strip"). Replace (",", ""));
        } m= Sum_pattern.matcher (Resparr);
            if (M.find ()) {String patternstr=m.group (); Arrresp= Long.parselong (patternstr.substring ("The result of the strip"). Replace (",", ""));
        } if (sumresp!=666l&&arrresp!=666l) {//If there are return results returned Math.log (SUMRESP/ARRRESP);
            }else{//throw new RuntimeException ("Incorrect return result");
            SYSTEM.OUT.PRINTLN ("Return result is incorrect:" +STR);
        return 0;
 }
    }
}

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.