# coding: utf-8
"""Log in to QQ with Selenium and save a friend's Qzone posts to a text file.

The script drives Firefox through the i.qq.com login form, opens the friend's
``/311`` page, scrolls every page so that lazily loaded posts are rendered,
extracts the text of each post with lxml and appends it to ``qq_word.txt``.
It keeps clicking the "next page" button until that button no longer exists.

NOTE(review): this listing was reconstructed from a copy whose letter case and
spacing had been mangled.  The case-sensitive page identifiers used below
(``login_frame``, ``switcher_plogin``, ``u``, ``p``, ``login_button``,
``app_canvas_frame``, ``msgList``, ``pager_next_<n>``) should be verified
against the live page before relying on them.
"""
import sys
import time

from lxml import etree
from selenium import webdriver

# Placeholders -- replace with real values before running.  The friend's space
# must allow the logged-in account to view it.  Do not commit real credentials.
friend = "The QQ number of friends"
user = "***"  # your QQ number
pw = "***"  # your QQ password

# Get the browser driver.  If geckodriver is not at this path, download it and
# either adjust the path or add its directory to the system PATH.
driver = webdriver.Firefox(executable_path="d:/app/firefox/geckodriver.exe")

try:
    driver.maximize_window()

    # Point the browser at the QQ login page.
    driver.get("http://i.qq.com")

    # The login form lives inside an <iframe>; WebDriver starts in the
    # outermost document, so the frame has to be selected first or the
    # elements below cannot be found.
    driver.switch_to.frame("login_frame")
    # Switch from QR-code login to account/password login.
    driver.find_element_by_id("switcher_plogin").click()
    driver.find_element_by_id("u").send_keys(user)
    driver.find_element_by_id("p").send_keys(pw)
    driver.find_element_by_id("login_button").click()

    # Return to the top-level document before navigating away.
    driver.switch_to.default_content()

    # Change `friend` to visit a different space.
    driver.get("http://user.qzone.qq.com/" + friend + "/311")

    next_num = 0  # numeric suffix of the current "next page" button id
    while True:
        # Scroll down in five steps so the browser loads the dynamically
        # added content of the current page.
        for i in range(1, 6):
            height = 20000 * i  # scroll distance in pixels for this step
            driver.execute_script("window.scrollBy(0," + str(height) + ")")
            time.sleep(4)  # give the page time to load the new content

        # The post list is rendered inside its own frame.
        driver.switch_to.frame("app_canvas_frame")
        selector = etree.HTML(driver.page_source)
        divs = selector.xpath('//*[@id="msgList"]/li/div[3]')

        # Mode "a" appends, so earlier pages are kept; re-opening per page
        # means finished pages are already on disk if a later page fails.
        # The encoding is explicit so the result does not depend on the
        # platform default (this replaces the Python-2-only
        # reload(sys)/sys.setdefaultencoding hack).
        with open(
            "D:/app/python3.6.4/project/qqzeon/qq_word.txt",
            "a",
            encoding="utf-8",
        ) as f:
            for div in divs:
                # Only the post body is stored.  The author name
                # (./div[2]/a/text()) and the timestamp
                # (./div[4]/div[1]/span/a/text()) sit next to it if needed.
                content_nodes = div.xpath("./div[2]/pre/text()")
                qq_content = content_nodes[0] if content_nodes else ""
                f.write(qq_content + "\n")

        # On the last page the "next" button with the expected id is absent.
        next_id = "pager_next_" + str(next_num)
        if next_id not in driver.page_source:
            break

        driver.find_element_by_id(next_id).click()
        # The button id changes on every page, so advance the suffix.
        next_num += 1

        # The next iteration scrolls the outer page first, so leave the
        # post-list frame again.
        driver.switch_to.parent_frame()
finally:
    # Always close the browser and the geckodriver process.
    driver.quit()
Generate Word Cloud
# coding: utf-8
"""Generate a word-cloud image from a text file using jieba segmentation.

Notes:
1. A font must be configured for the word cloud, otherwise Chinese characters
   are rendered garbled or not shown at all.
2. On the author's machine Chinese text was only displayed after the text had
   been segmented with jieba.
"""
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def create_word_cloud(filename):
    """Build, display and save a word cloud for ``<filename>.txt``.

    The text is read (as UTF-8) from
    ``D:/app/python3.6.4/project/qqzeon/<filename>.txt``, segmented with
    jieba, rendered with ``WordCloud``, shown in a matplotlib window and
    finally written to ``py_book.png`` in the current working directory.

    Args:
        filename: Base name of the input text file, without the ``.txt``
            extension.
    """
    path = "D:/app/python3.6.4/project/qqzeon/{}.txt".format(filename)
    # The context manager closes the file; the explicit encoding avoids
    # depending on the platform default.
    with open(path, encoding="utf-8") as source:
        text = source.read()

    # jieba segmentation in full mode.  The tokens are joined with spaces so
    # that WordCloud can tell them apart.
    # NOTE(review): the separator was reconstructed as " "; an empty separator
    # would fuse all tokens back into one string.
    wordlist = jieba.cut(text, cut_all=True)
    wl = " ".join(wordlist)

    wc = WordCloud(
        # Background colour of the image.
        background_color="white",
        # Maximum number of words shown in the cloud.
        max_words=2000,
        # Font file used for rendering; this is the usual Windows font
        # directory.  Raw string so the backslashes are not read as escapes.
        font_path=r"C:\Windows\Fonts\simfang.ttf",
        height=1200,
        width=1600,
        # Largest font size used for a word.
        max_font_size=100,
        # Fixed random state for the colour/layout choices.
        random_state=30,
    )

    myword = wc.generate(wl)  # generate the word cloud

    # Display the word cloud without axes.
    plt.imshow(myword)
    plt.axis("off")
    plt.show()

    # Save the word cloud image.
    wc.to_file("py_book.png")


if __name__ == "__main__":
    create_word_cloud("***")  # base name of the text file
The latest browser version may fail to open; it is recommended to use an older version of the Firefox browser together with its matching driver. Download link: https://pan.baidu.com/s/1c31OqQk Password: 6PFP