# Xlsx reading and word cloud output
# coding=utf-8
import codecs
import csv
import logging
import re
import sys

import jieba
import jieba.posseg
import numpy as np
import xlrd
from wordcloud import WordCloud

# Python 2 only: make UTF-8 the implicit str/unicode codec (the name
# `reload` and `sys.setdefaultencoding` do not exist on Python 3).
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def read_xlsx(filename):
    """Read every cell of the sheet named 'sheet1' from an xlsx workbook.

    Args:
        filename: path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A list with one entry per sheet row; each entry is a list of cell
        values. Float cells are converted with ``int()``; every other cell
        is converted with ``str()`` after all whitespace has been removed
        from text values.
    """
    workbook = xlrd.open_workbook(filename)
    booksheet = workbook.sheet_by_name('sheet1')
    p = []
    for row in range(booksheet.nrows):
        row_data = []
        for col in range(booksheet.ncols):
            val = booksheet.cell(row, col).value
            if isinstance(val, float):
                # NOTE: int() drops any fractional part of the number.
                val = int(val)
            else:
                try:
                    val = re.sub(r'\s+', '', val)
                except TypeError:
                    # Not a text value; keep it unchanged and stringify below.
                    pass
                val = str(val)
            row_data.append(val)
        p.append(row_data)
    print('the size of p is ' + str(len(p)))
    return p
def seperate(p):
    """Count how often each noun occurs across all cells of ``p``.

    Every cell is segmented with ``jieba.posseg.cut``; tokens whose
    part-of-speech flag starts with ``'n'`` are tallied.

    Args:
        p: iterable of rows, each row an iterable of cell values.

    Returns:
        A dict mapping each noun to its number of occurrences.
    """
    result = {}
    for count, row in enumerate(p, 1):
        # Progress report every 100 rows.
        if count % 100 == 0:
            print('have seperate: # ' + str(count) + ' # word')
        for cell in row:
            try:
                for pair in jieba.posseg.cut(cell):
                    if pair.flag.startswith('n'):
                        result[pair.word] = result.get(pair.word, 0) + 1
            except Exception as e:
                # Best effort: a cell that cannot be segmented is reported
                # and skipped so the remaining cells are still processed.
                print('%s : %s' % (type(e).__name__, e))
    return result
def is_chinese(s):
    """Return True if ``s`` is non-empty and every character is in U+4E00..U+9FA6.

    The check is done per character. Comparing the whole string against the
    range bounds is a lexicographic comparison, which in practice only
    examines the first character.

    Args:
        s: a text (unicode) string.

    Returns:
        bool: True only when all characters fall inside the range.
    """
    if not s:
        return False
    return all(u"\u4e00" <= ch <= u"\u9fa6" for ch in s)
def cutdict(p, top):
    """Keep only the most frequent multi-character Chinese words.

    Candidates are the entries of ``p`` whose key is longer than one
    character and passes ``is_chinese``. Of those, the entries whose value
    is at least the ``top``-th percentile of the candidate values are kept.

    Args:
        p: dict mapping word -> frequency.
        top: percentile (0-100) passed to ``np.percentile``.

    Returns:
        A new dict with the retained word -> frequency pairs; empty when
        there are no candidates.
    """
    biggerone = {k: v for k, v in p.items() if len(k) > 1 and is_chinese(k)}
    if not biggerone:
        # np.percentile is not meaningful on an empty sequence.
        return {}
    top_v = np.percentile(list(biggerone.values()), top)
    return {k: v for k, v in biggerone.items() if v >= top_v}
def initfile(filename):
    """Create a UTF-8 CSV file with a BOM and a header row.

    Args:
        filename: path of the CSV file to create (overwritten if present).

    Returns:
        ``(csvfile, writer)``: the open file object and a ``csv.writer``
        bound to it. The caller is responsible for closing ``csvfile``.
    """
    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text-mode file opened with
        # newline=''; the 'utf-8-sig' codec emits the BOM on first write.
        csvfile = open(filename, 'w', encoding='utf-8-sig', newline='')
    else:
        # Python 2's csv module writes bytes, so emit the BOM by hand.
        csvfile = open(filename, 'wb')
        csvfile.write(codecs.BOM_UTF8)
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'word frequency '])
    return csvfile, writer
def ci2file(csvfile, writer, result):
    """Write word frequencies to the CSV file, then close it.

    Only entries whose key is longer than one character are written, one
    ``[word, frequency]`` row each.

    Args:
        csvfile: open file object that ``writer`` writes to; it is always
            closed before this function returns, even if a write fails.
        writer: ``csv.writer`` bound to ``csvfile``.
        result: dict mapping word -> frequency.
    """
    try:
        writer.writerows([k, v] for k, v in result.items() if len(k) > 1)
        csvfile.flush()
    finally:
        csvfile.close()
# Pipeline: read the tweets workbook, count nouns, dump the full frequency
# table to CSV, then keep only the words at or above the 90th percentile
# for the word cloud.
p = read_xlsx('user_tweets_2.xlsx')
logging.info('done read tweets ')
result = seperate(p)
csvfile, writer = initfile('user_tweets_2.csv')
# ci2file() closes csvfile once all rows are written.
ci2file(csvfile, writer, result)
result = cutdict(result, 90)
logging.info('done Cut result ')
print('the size of final result is ' + str(len(result)))
# Generate the word cloud image. WordCloud().generate(text) would build it
# from raw text; this script uses the precomputed word frequencies instead.
# wordcloud = WordCloud().generate(text)
import random


def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random grey as an HSL colour string for one word.

    Hue and saturation are fixed at 0; lightness is drawn uniformly from
    60..100 (inclusive) with the module-level ``random`` generator. All
    arguments are accepted for use as a ``color_func`` callback and are
    otherwise ignored, including ``random_state``.

    Returns:
        str: a colour of the form ``"hsl(0, 0%, <lightness>%)"``.
    """
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)
from PIL import Image
import matplotlib.pyplot as plt

# Render the word cloud inside the shape given by the mask image.
mask = np.array(Image.open('timg2.png'))
logging.info('done Read image ')
# NOTE(review): font_path is machine-specific; it presumably needs to point
# at a font that contains CJK glyphs -- confirm the file exists locally.
wordcloud = WordCloud(max_words=1000, mask=mask,
                      margin=10, font_path='/Library/Fonts/文.ttf')
wordcloud.fit_words(result)

# Snapshot taken before recolor() is called, shown later as "Default colors".
default_colors = wordcloud.to_array()

plt.title("Custom colors ")
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3))
wordcloud.to_file("a_new_hope.png")
plt.axis("off")

plt.figure()
plt.title("Default colors ")
plt.imshow(default_colors)
plt.axis("off")
plt.show()

# Display the generated image, the matplotlib way:
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()

# from operator import itemgetter
# item1 = itemgetter(1)
# frequencies = sorted(result.items(), key=item1, reverse=True)
print('done')