# coding: utf-8
"""Naive Bayes text classification utilities.

Adapted from "Machine Learning in Action" (ch. 4): tokenizing documents,
building a vocabulary, set-of-words / bag-of-words vectorization,
training and applying a two-class multinomial naive Bayes classifier,
and an RSS-feed comparison demo.
"""
from numpy import *


def textParse(bigString):
    """Split a document string into lowercase tokens longer than 2 chars.

    Uses r'\\W+' (not r'\\W*') so the text is split on *runs* of
    non-word characters instead of between every single character.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def createVocabList(dataSet):
    """Return a list of the unique words across all documents in dataSet."""
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union with this document's words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Return a 0/1 word-presence vector for inputSet over vocabList.

    Words not found in the vocabulary are reported and ignored.
    """
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList


# Another model: counts instead of presence flags
def bagOfWords2VecMN(vocabList, inputSet):
    """Return a word-count (bag-of-words) vector for inputSet over vocabList."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def trainNB0(trainMatrix, trainCatergory):
    """Train a two-class naive Bayes model.

    Parameters
    ----------
    trainMatrix : 2-D array of word-count (or presence) vectors, one row
        per document.
    trainCatergory : 1-D array of 0/1 class labels, one per row.

    Returns
    -------
    (p0Vect, p1Vect, pAbusive) : log conditional word probabilities for
        class 0 and class 1, and the prior probability of class 1.
    """
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCatergory) / float(numTrainDoc)
    # Laplace smoothing: start counts at 1 (denominators at 2) so that
    # no conditional probability is ever exactly 0.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCatergory[i] == 1:  # '==' comparison, not '=' assignment
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space: products of many small probabilities would
    # otherwise underflow toward 0.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: return 1 for class 1, else 0."""
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


def stopWords():
    """Load stop words from 'stopwords.txt', one word per line."""
    stopW = []
    with open('stopwords.txt') as f:  # 'with' closes the file even on error
        for eachLine in f:
            stopW.append(eachLine.rstrip('\n'))
    return stopW


def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words as (word, count) pairs."""
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]


def localWords(rss1, rss0):
    """Train and test a naive Bayes classifier on two RSS feeds.

    Entries from rss1 are labelled class 1, entries from rss0 class 0.
    The 30 most frequent words and all stop words are removed from the
    vocabulary before training.  Prints the held-out error rate and
    returns (vocabList, p0V, p1V).
    """
    import feedparser
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    for word in stopWords():
        # compare the whole stop word, not just its first character
        if word in vocabList:
            vocabList.remove(word)
    # need a real list (not a range object) so entries can be deleted
    trainingSet = list(range(2 * minLen)); testSet = []  # create test set
    for i in range(min(20, len(trainingSet))):  # never sample more docs than exist
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V


def getTopWords(ny, sf):
    """Print the most indicative words (log probability > -6.0) per feed."""
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        # append a single (word, log-prob) tuple; list.append takes one argument
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF ** SF ** SF ** ")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY ** NY ** NY ** ")
    for item in sortedNY:
        print(item[0])


def main():
    """Demo entry point: supply two real RSS feed URLs to run the classifier."""
    # print(stopWords())
    # NOTE(review): the original called localWords with a single broken URL,
    # which would raise TypeError; provide two real feed URLs to run it, e.g.:
    # localWords('https://example.com/feed1.rss', 'https://example.com/feed0.rss')
    pass


if __name__ == '__main__':
    main()