1 piaofang.py2 #-*-coding:utf-8-*-3 " "4 the script can capture the box office data on the cat's Eye box office site5 use the data for the watercress crawl on the movie, see File: Doubanmovies_imdbscore.csv6 " "7 ImportRequests8 Importlxml.html9 Import TimeTen fromPandasImportDataFrame One ImportPandas as PD A -headers={'user-agent':'mozilla/5.0 (Windows NT 10.0; WOW64) applewebkit/537.36 (khtml, like Gecko) chrome/45.0.2454.101 safari/537.36'} - defgetdoc (URL): theResp=requests.get (Url,headers=headers)#Get web Response -Time.sleep (0.1)#pause for 0.1 seconds to prevent crawling too frequently to be blocked by IP -Content=resp.text#get the appropriate content -Doc =lxml.html.fromstring (content) + returnDoc - + #function: Enter the box for movie name, output for the movie on the cat's eye online A #Description: If there is no information about the movie on the cat's eye, mark: NotFound at #If you can find the movie on the cat's eye, but there is no box-office data, mark: Withoutdata - defGetpiaofang (title): - #form a cat's eye on the movie's search results page based on the movie name -URL ='http://pf.maoyan.com/search?_v_=yes&key='+title - #because the encoding format is confusing, here are two coding formats to try - Try: inUrl=url.decode ('GBK'). 
Encode ('Utf-8') - except: toUrl=url.encode ('Utf-8') + finally: -Templist=[]#A list that is temporarily used in the initialization function theDoc=getdoc (URL)#Parsing Web pages * #crawled suffix name, may be ' million box office ', ' people want to see ', ' No box office data ' $Temp_back=doc.xpath ('//*[@id = "Search-list"]/article/em/text ()')Panax Notoginseng #a movie search results page, because there will be similar names of the movie will be searched out, so to be judged -Temp_name=doc.xpath ('//*[@id = "Search-list"]/article/div/text ()') the ifTemp_name!=[]:#first the results page catches the movie list is either empty, that can search for the movie + #if empty, mark as ' NotFound ' A forIinchRange (len (temp_name)):#Determine the name of the movie you are searching for, and take out the exact same name as the movie you are searching for. theTemp1= (Temp_name[i]). Encode ('Utf-8') + ifTemp1==title:#If you loop to the first name and find exactly the same item as the name of the movie you are searching for, then the suffix of the number is judged - #If you do not find exactly the same name as the movie you are searching for, Mark ' Withoutdata ' $Temp2=unicode (Temp_back[i]). Encode ('Utf-8') $ iftemp2=='million box office':#If the suffix is called ' million box office ', then the data may be the data we're looking for. 
-Temp_num = Doc.xpath ('//*[@id = "Search-list"]/article['+str (i+1) +']/em/span/text ()') - ifTemp_num!=[]:#if data can be crawled, the data converted to type int is the box office data you are looking for the #mark as ' Withoutdata ' if not captured - PrintInt (temp_num[0])Wuyi templist.append (int (temp_num[0])) the Else: -Templist.append ('Withoutdata') Wu Else: -Templist.append ('Withoutdata') About Else: $Templist.append ('Withoutdata') - Else: -Templist.append ('NotFound') - returnTemplist[0] A +Df=pd.read_csv ('Doubanmovies_imdbscore.csv')#Open the Movie list file crawled to the watercress thePiaofanglist=[]#Initialize the box office list -Errornum=0#number of initialization errors $ forIinchRange (0,len (DF)): the Try: theTemp=df.ix[i,'title'] theTemp=temp.decode ('GBK'). Encode ('Utf-8')#converting to encoded format thePiaofanglist.append (Getpiaofang (temp))#Call the Getpiaofang function to get the box office data - except: inErrornum+=1#Error , the number of errors plus 1 thePiaofanglist.append ('Error')#Mark box office numbers as ' error ' the Print 'error No.', ErrorNum About finally: theDf1=dataframe ({'title':d F.ix[:i,'title'],'Piaofang':p iaofanglist}) theDf1.to_csv ('Test.csv', index=False) the PrintI+1#Print marks
# 1-2: Crawl movie box-office information from the Maoyan (Cat's Eye) box-office site