"""Count Chinese characters, English words and punctuation in an RSS feed."""

import re  # regular expressions used to tokenise the feed text
from collections import Counter

# Markup tags such as <p> or </title>.
_TAG_RE = re.compile(r'<[^>]+>')

# A single Chinese character; the range matches the one this script has
# always used (\u4e00-\u9ffa).
_ZH_RE = re.compile(r'[\u4e00-\u9ffa]')

# A run of ASCII letters, upper or lower case.
_EN_RE = re.compile(r'[a-zA-Z]+')

# ASCII punctuation recognised by the counter.
_ASCII_PUNC = '.!/_,$%^*()+"\'?~@#&'

# Non-ASCII punctuation, written as escapes so the file stays pure ASCII:
# em dash, full-width ! and comma, ideographic full stop, full-width ?,
# ideographic comma, full-width ~ @ #, full-width yen, yen sign,
# full-width %, horizontal ellipsis, full-width & * ( ).
_CJK_PUNC = (
    '\u2014\uff01\uff0c\u3002\uff1f\u3001\uff5e\uff20\uff03'
    '\uffe5\u00a5\uff05\u2026\uff06\uff0a\uff08\uff09'
)

# A run of ASCII punctuation OR a run of non-ASCII punctuation.  The two
# alternatives are kept separate so a mixed run yields one token per script.
# Every character is escaped, so none of them can form an accidental range.
_PUNC_RE = re.compile(
    '[%s]+|[%s]+' % (re.escape(_ASCII_PUNC), re.escape(_CJK_PUNC))
)


def getwords(html):
    """Split feed text into Chinese characters, English words and punctuation.

    Markup tags are replaced by a space first, so words that were only
    separated by a tag are not glued together.  Whitespace is never reported
    as punctuation.

    Args:
        html: Text of a feed entry, possibly containing markup tags.

    Returns:
        A tuple ``(words_zh, words_en, words_punc)`` of lists, each in order
        of appearance: single Chinese characters, runs of ASCII letters, and
        runs of punctuation marks.
    """
    txt = _TAG_RE.sub(' ', html)
    words_zh = _ZH_RE.findall(txt)
    words_en = _EN_RE.findall(txt)
    words_punc = _PUNC_RE.findall(txt)
    return words_zh, words_en, words_punc


def getwordscount(url):
    """Fetch the feed at *url* and print its word statistics.

    For every entry the title and the summary (or, failing that, the
    description) are tokenised with :func:`getwords`.  The feed title, the
    per-category totals with their frequency tables (most frequent first)
    and the grand total are printed.

    Args:
        url: Address of the RSS/Atom feed to analyse.
    """
    # Imported here so that getwords() stays usable without the third-party
    # feedparser package installed.
    import feedparser

    d = feedparser.parse(url)

    count_zh = Counter()
    count_en = Counter()
    count_punc = Counter()

    for e in d['entries']:
        # An entry normally has a 'summary' or a 'description'; fall back to
        # an empty string rather than failing when it has neither.
        summary = e.get('summary') or e.get('description') or ''
        words_zh, words_en, words_punc = getwords(
            e.get('title', '') + ' ' + summary
        )
        count_zh.update(words_zh)
        count_en.update(words_en)
        count_punc.update(words_punc)

    sum_zh = sum(count_zh.values())
    sum_en = sum(count_en.values())
    sum_punc = sum(count_punc.values())
    total = sum_zh + sum_en + sum_punc

    print(d['feed'].get('title', ''))
    # most_common() lists (token, frequency) pairs from largest to smallest.
    print('Chinese: %d\n%s' % (sum_zh, count_zh.most_common()))
    print('English: %d\n%s' % (sum_en, count_en.most_common()))
    print('punctuation: %d\n%s' % (sum_punc, count_punc.most_common()))
    print('There is %d words in total' % total)


# The capitalised spelling is kept as an alias for callers that use it.
Getwordscount = getwordscount


if __name__ == '__main__':
    getwordscount('http://feed.cnblogs.com/blog/u/426928/rss')
The example uses the feed of my own cnblogs (Blog Park) blog, so the figures will change as the blog is updated.
utf-8
The code point range used for Chinese characters is \u4E00-\u9FFA.
Count the Chinese characters, English words, and punctuation marks in a feed.