This walkthrough follows the beginner-level "Bag of Words Meets Bags of Popcorn" Kaggle tutorial and reproduces its code. The main tool is CountVectorizer, which builds a word-frequency vector for each review; a random forest model is then trained on those vectors to predict the sentiment of new reviews. After submission, the score is about 0.84.
import logging
import re

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

logging.basicConfig(level=logging.ERROR)


def review_to_words(raw_review):
    """Clean one raw IMDB review into a space-joined string of keep-words.

    Steps: strip HTML, drop non-letters, lowercase, tokenize on whitespace,
    remove English stop words, and re-join with single spaces.

    :param raw_review: one review as raw HTML text
    :return: cleaned review as a single space-separated string
    """
    # 1. Remove HTML markup.
    review_text = BeautifulSoup(raw_review).get_text()
    # 2. Remove non-letters.  The pattern must be [^a-zA-Z]; the garbled
    #    original's [^a-za-z] never matched the uppercase range.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()
    # 4. Searching a set is much faster than searching a list.
    stops = set(stopwords.words("english"))
    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one string separated by spaces.
    return " ".join(meaningful_words)


if __name__ == '__main__':
    # Load the labeled training data.  quoting=3 (QUOTE_NONE) because the
    # reviews contain embedded double quotes.
    train = pd.read_csv('/Users/meitu/Downloads/labeledTrainData.tsv',
                        header=0, delimiter="\t", quoting=3)
    print(train['sentiment'].head(10))

    # Clean every training review.
    num_reviews = train['review'].size
    clean_train_reviews = []
    for i in range(num_reviews):
        clean_train_reviews.append(review_to_words(train['review'][i]))
        if (i + 1) % 1000 == 0:
            print("Review %d of %d\n" % (i + 1, num_reviews))

    print("Creating the bag of words...\n")
    # Bag-of-words features: keep the 5000 most frequent words.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 stop_words=None,
                                 max_features=5000)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()
    print(vectorizer.get_feature_names())

    print("Training the random forest ...")
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train['sentiment'])

    # Load and clean the test set, then predict with the fitted model.
    test = pd.read_csv('/Users/meitu/Downloads/testData.tsv',
                       header=0, delimiter="\t", quoting=3)
    print(test.shape)
    num_reviews = len(test['review'])
    clean_test_reviews = []
    for i in range(num_reviews):
        if (i + 1) % 1000 == 0:
            print("Review %d of %d\n" % (i + 1, num_reviews))
        clean_test_reviews.append(review_to_words(test['review'][i]))

    # Only transform (not fit) so test features use the training vocabulary.
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    result = forest.predict(test_data_features)

    # Write the Kaggle submission file.
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv('Bag_of_word_model.csv', index=False, quoting=3)
NLP for Beginners: Natural Language Processing