Using Selenium with multithreading to crawl iQIYI movie information
If you repost this article, please credit the source.
Crawl targets: the rating, name, duration, starring cast, and type of each movie.
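Crawling approach: open the iQIYI movie listing page, collect the link of every movie type, then crawl the result pages of each type in its own thread (at most three threads at a time) and save each type's movies to its own Excel file.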
Source code (with comments):
# Crawl iQIYI movie listings with Selenium: one worker thread per movie type,
# at most MAX_THREADS types at a time, one "<type>.xlsx" workbook per type.
import threading
import time
from threading import Thread

import openpyxl  # writes the Excel workbooks
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

# iQIYI's movie listing page (this is not the site's home page).
url = 'http://list.iqiyi.com/www/1/-8------------11-1-1-iqiyi--.html'

# Upper bound on the number of movie types crawled at the same time.
MAX_THREADS = 3

# Slot counter shared by the main loop and the workers: the main loop acquires
# a slot before starting a worker, and the worker hands it back when it ends.
lock_thread = threading.Semaphore(MAX_THREADS)

# Header row written to every workbook, in column order.
HEADERS = ["movie name", "duration", "Good", "Type", "actor"]

# Link labels in the type list that are not real movie types.
# NOTE(review): "All" is the literal found in the listing; the live page is
# presumably Chinese and labels this link "全部" -- confirm against the site.
SKIPPED_KINDS = ("", "All", "全部")


class M_thread(Thread):
    """Worker thread that crawls a single movie type.

    The creator must acquire one slot of ``lock_thread`` before calling
    ``start()``; ``run()`` always gives that slot back.
    """

    def __init__(self, name1, url):
        """Remember the type label (``name1``) and its listing URL (``url``)."""
        super().__init__()
        self.url = url
        self.name1 = name1

    def run(self):
        """Crawl the type, then release the semaphore slot held for this thread."""
        try:
            self.kind_movie = page(self.name1, self.url)
        finally:
            # Released even when page() raises; otherwise every failed crawl
            # would leak a slot and the main loop would eventually block
            # forever in acquire().
            lock_thread.release()


def _new_headless_firefox():
    """Start and return a Firefox session that runs without a visible window."""
    # NOTE(review): set_headless() and the firefox_options= keyword are the
    # Selenium 3 spellings this script was written against -- confirm the
    # installed Selenium version before upgrading.
    firefoxoptions = webdriver.FirefoxOptions()
    firefoxoptions.set_headless()
    return webdriver.Firefox(firefox_options=firefoxoptions)


def _movie_row(things, name):
    """Map the text lines of one movie tile to a worksheet row.

    Args:
        things: the tile's text split into lines.
        name: the movie type being crawled (written to the "Type" column).

    Returns:
        A 5-tuple ordered like HEADERS, or None when the tile has neither
        three nor four lines of text.
    """
    # The original author's note: iQIYI does not show every field for every
    # movie, so tiles come with either four or three lines of text.
    if len(things) == 4:
        return (things[2], things[0], things[1], name, things[3])
    if len(things) == 3:
        # NOTE(review): the author's note says the missing field is the score,
        # yet the "*" placeholder lands under the "duration" header and
        # things[0] under "Good" -- confirm which field is really absent.
        return (things[1], "*", things[0], name, things[2])
    return None


def init():
    """Collect the link of every movie type shown on the listing page.

    Returns:
        dict mapping the link text of each type to the link's href; the
        labels in SKIPPED_KINDS are left out.
    """
    brower = _new_headless_firefox()
    try:
        brower.get(url)
        # The type list is located by absolute XPath; the original author
        # noted that locating it without XPath was error-prone.
        kind = brower.find_element_by_xpath("/html/body/div[3]/div/div/div[1]/div[4]/ul")
        movie_kind_link = {}
        # Each <a> inside the list is the link to one type.
        for a in kind.find_elements_by_tag_name("a"):
            try:
                label = a.text
                if label in SKIPPED_KINDS:
                    continue
                movie_kind_link[label] = a.get_attribute("href")
            except WebDriverException:
                # Best effort: report the unreadable link and keep going.
                print("error!")
                continue
        return movie_kind_link
    finally:
        # quit() ends the whole driver session, also when the crawl failed.
        brower.quit()


def page(name, link):
    """Crawl every result page of one movie type and save it as "<name>.xlsx".

    Args:
        name: label of the movie type; used in progress messages, in the
            "Type" column and as the workbook file name.
        link: URL of the first result page of that type.
    """
    # Each type gets its own workbook. The original author first wrote plain
    # text files (fast, but the layout was ugly) and also tried a database,
    # which kept failing on large simultaneous writes.
    wordbook = openpyxl.Workbook()
    sheet1 = wordbook.active
    for column, title in enumerate(HEADERS, start=1):
        sheet1.cell(row=1, column=column, value=title)
    num = 2  # next free worksheet row (row 1 holds the headers)

    br = _new_headless_firefox()
    try:
        br.get(link)
        print("is open%spage " % name)
        # NOTE(review): the listing showed these class names capitalised
        # ("Mod-page", "Wrapper-piclist"); lowercase is assumed to be what the
        # page really uses -- confirm against the live markup.
        pager = br.find_element_by_class_name("mod-page")
        # Read all hrefs up front: the link elements belong to the page that
        # is loaded now and cannot be used after navigating away.
        page_href = [aa.get_attribute("href")
                     for aa in pager.find_elements_by_tag_name("a")]
        for index, cc in enumerate(page_href):
            print("* * * * * * * * * Crawling {} page {} * * * * * * * * * *".format(name, index + 1))
            # The first result page is already open, so it is not reloaded.
            if index != 0:
                br.get(cc)
            # One <li> per movie on the current result page.
            movie = br.find_element_by_class_name("wrapper-piclist").find_elements_by_tag_name("li")
            for bb in movie:
                row = _movie_row(bb.text.split("\n"), name)
                if row is None:
                    print("error (moive)")
                    continue
                for column, value in enumerate(row, start=1):
                    sheet1.cell(row=num, column=column, value=value)
                num += 1
    finally:
        # Always end the browser session so a failed crawl leaves no stray
        # Firefox/driver process behind.
        br.quit()
    wordbook.save(name + ".xlsx")


if __name__ == "__main__":
    # type label -> link
    kind_links = init()
    # Crawl the types in parallel, at most MAX_THREADS at a time.
    for name1, link in kind_links.items():
        # Blocks while MAX_THREADS workers are busy; the slot is given back
        # by M_thread.run() when that worker ends.
        lock_thread.acquire()
        thread_live = M_thread(name1, link)
        print(name1, "Begin")
        thread_live.start()
        # Pause between launches (presumably to stagger browser start-up).
        time.sleep(3)
Using Selenium with multithreading to crawl iQIYI movie information