import time

# Pipe-delimited output file: one record per movie, appended by
# getonemoviesinfo() and truncated once at script start-up.
OUTPUT_PATH = r'C:\Users\yuqiao\Desktop\testSpider.txt'


def getonemoviesinfo(Mid, url):
    """Scrape one movie page and append its record to OUTPUT_PATH.

    Downloads ``url``, extracts poster, title, year, release date, overview,
    director, up to three writers and up to five stars with XPath, and
    appends them as one ``|``-separated line.  Missing director, writers,
    stars or poster are written as the literal string ``NULL``.

    Args:
        Mid: Movie identifier; written as the first field and echoed to stdout.
        url: Address of the movie page to download.

    Returns:
        None.  If the page has no title node, a failure message is printed
        and nothing is written.

    Raises:
        IndexError: If the year, release date or overview node is absent
            from a page that does have a title.
        requests.RequestException: If the download fails or times out.
    """
    # Third-party imports stay local to the function, as in the original.
    import requests
    from lxml import etree

    # A timeout keeps one stalled connection from hanging the whole crawl.
    page_source = requests.get(url, timeout=30).text
    s = etree.HTML(page_source)

    # Common prefix of the header section that holds title, year, overview
    # and the main-creator list.
    header = '//*[@id="main"]/section/div[1]/div/div/section/div[2]/section'

    poster_nodes = s.xpath(
        '//*[@id="main"]/section/div[1]/div/div/section/div[1]/div[1]/img/@src')
    # Store the first URL itself rather than the list, so the field has the
    # same shape (a plain string) whether or not a poster was found.
    picture = poster_nodes[0] if poster_nodes else 'NULL'

    title_nodes = s.xpath(header + '/div[1]/span/a/h2/text()')
    if not title_nodes:
        print("Mid =%s, failed for a lack of TMDB ID" % Mid)
        return
    name = title_nodes[0]

    year = s.xpath(header + '/div[1]/span/span/text()')[0] \
        .strip("(").strip().strip(")")
    date = s.xpath(
        '//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div'
        '/section[1]/ul/li[1]/text()')[1].strip()
    # Escape real newlines so every record stays on a single line.
    brief = s.xpath(header + '/div[2]/div/p/text()')[0].replace("\n", "\\n")

    writers = []
    director = "NULL"
    for creator in s.xpath(header + '/div[2]/ol/li'):
        creator_names = creator.xpath('./p[1]/a/text()')
        if not creator_names:
            # An entry without a name link resets what was collected so far.
            director = "NULL"
            writers = ["NULL", "NULL", "NULL"]
            continue
        creator_name = creator_names[0]
        creator_profession = creator.xpath('./p[2]/text()')[0]
        if 'Director' in creator_profession:
            director = creator_name
        elif 'Screenplay' in creator_profession or 'Writer' in creator_profession:
            writers.append(creator_name)

    stars = []
    for cast_entry in s.xpath(
            '//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li'):
        star_names = cast_entry.xpath('./p[1]/a/text()')
        if not star_names:
            # The original had a no-op comparison (`stars == [...]`) here;
            # skipping the entry keeps the effective behaviour, and the
            # padding below fills any missing slots.
            continue
        stars.append(star_names[0])

    # Pad to the fixed column counts (3 writers, 5 stars); longer lists are
    # truncated by the slices below.
    writers.extend(["NULL"] * (3 - len(writers)))
    stars.extend(["NULL"] * (5 - len(stars)))

    fields = [Mid, name, brief, year, date, director]
    fields.extend(writers[:3])
    fields.extend(stars[:5])
    fields.append(picture)
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
        f.write("|".join(str(field) for field in fields) + "\n")

    print(Mid)
    print(name)


# ______________________________ Main Function ______________________________

# Start every run with an empty output file.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    f.write("")

language = '?language=zh-CN'

with open(r'D:\git\ZiyeMovie\MidURL.txt', "rt", encoding='utf-8') as in_file:
    content = in_file.read()
lines = content.split("\n")

# for i in range(51, 61):  # 51~60
for i in range(9124, 9125):
    line = lines[i]
    # NOTE(review): only the selected line is printed here; splitting it into
    # an id and a URL and calling getonemoviesinfo() is not visible in this
    # copy of the script -- confirm against the original before relying on it.
    print(line)
print('finished')
# Python crawler example, kept for future reference.