Python 3.5 Data Processing: Chinese Word Segmentation and TF-IDF with jieba and scikit-learn
1. Install pip3:
#sudo apt-get install python3-pip
2. Install jieba:
#sudo pip3 install jieba
3. Install sklearn:
#sudo pip3 install scikit-learn
4. Install the sklearn dependency (numpy, scipy ):
#sudo pip3 install numpy
#sudo pip3 install scipy
Note: a time-out error may occur during installation (for example, when installing from within China). The solution is to use a PyPI mirror:
#sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple numpy
#sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple scipy
5. Simple word segmentation and calculation of TF-IDF values:
#!/usr/bin/env python3
# coding=utf-8
"""Segment Chinese text with jieba and compute per-document TF-IDF weights.

Reads a raw text file, keeps only alphanumeric and CJK characters, segments
it with jieba (full mode), writes the space-joined tokens to ``res/<file>``,
then computes TF-IDF over all segmented files and writes one
``word weight`` line per term to ``fes/<file>``.
"""
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


class Tfi(object):
    def __init__(self):
        # Words to drop during segmentation; empty by default — populate
        # with a real stop-word list before calling fenci() if needed.
        self.stop_list = []

    def fenci(self, file):
        """Segment *file* and write the space-separated tokens to res/<file>.

        NOTE: output goes to 'res/' + file, so the 'res' directory must
        already exist (presumably created by the caller — verify).
        """
        # 'r' is the correct read mode (the original 'R' is invalid and
        # raises ValueError); be explicit about UTF-8 for Chinese text.
        with open(file, 'r', encoding='utf-8') as fin:
            raw = fin.read()
        # Keep only ASCII alphanumerics and CJK ideographs (U+4E00–U+9FA5);
        # this strips punctuation and whitespace before segmentation.
        read_res = ''.join(re.findall(u'[a-zA-Z0-9\u4e00-\u9fa5]+', raw))
        cut_res = jieba.cut(read_res, cut_all=True)
        # Join surviving tokens with spaces so CountVectorizer can split them.
        tokens = [tok for tok in cut_res if tok not in self.stop_list]
        line_res = ''.join(tok + ' ' for tok in tokens)
        with open('res/' + file, 'w', encoding='utf-8') as fout:
            fout.write(line_res)

    def cipin(self, fil_list):
        """Compute TF-IDF over the segmented files and write word weights.

        For each file name in *fil_list* (previously processed by fenci),
        writes one 'word weight' line per vocabulary term to fes/<name>.
        NOTE: the 'fes' directory must already exist — verify with caller.
        """
        corpus = []
        for fil in fil_list:
            with open('res/' + fil, 'r', encoding='utf-8') as fin:
                corpus.append(fin.read())
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        # Term counts -> TF-IDF matrix (documents x vocabulary).
        tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
        # get_feature_names() is deprecated/removed in sklearn >= 1.2;
        # fall back to get_feature_names_out() when available.
        if hasattr(vectorizer, 'get_feature_names_out'):
            word = list(vectorizer.get_feature_names_out())
        else:
            word = vectorizer.get_feature_names()
        weight = tfidf.toarray()
        for j in range(len(weight)):
            with open('fes/' + fil_list[j], 'w', encoding='utf-8') as f:
                for i in range(len(word)):
                    f.write(word[i] + ' ' + str(weight[j][i]) + '\n')


if __name__ == '__main__':
    first = Tfi()
    fil_list = ['inputtext']
    first.fenci('inputtext')
    first.cipin(fil_list)