Python Web crawler Introduction:
Sometimes we need to save the pictures from a web page. Doing this by hand means right-clicking each picture and choosing "Save picture as ...".
A Python web crawler can download all the pictures at once.
The steps are as follows:
1. Fetch the page's HTML with the crawler.
2. Store and process the crawled HTML to extract the picture links.
3. Save the pictures to local disk using the extracted links.
The main difficulty:
The code is as follows:
import os
import re
import urllib.request
def gethtml(URL):
    """Download the resource at ``URL`` and return its raw body.

    Args:
        URL: Address to fetch; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes`` (not decoded).

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(URL) as page:
        return page.read()


def write(html, htmlfile):
    """Write page content to the text file ``htmlfile``.

    Args:
        html: Page content. ``bytes`` are decoded as UTF-8 (undecodable
            bytes are replaced) so the file holds the real markup rather
            than a ``b'...'`` repr; any other value goes through ``str()``.
        htmlfile: Path of the file to create or overwrite.
    """
    try:
        if isinstance(html, bytes):
            text = html.decode("utf-8", errors="replace")
        else:
            text = str(html)
        # Explicit encoding keeps the output independent of the platform
        # default code page.
        with open(htmlfile, mode="w", encoding="utf-8") as f:
            f.write(text)
    except TypeError:
        print("write html file Failed")
def getimg2(html, initialfile, finalfile):
    """Extract candidate JPG links from a page and write them to a file.

    The page is split on double quotes, every fragment is written to
    ``initialfile`` (one fragment start per line), and each line of that
    file that starts with ``https`` and contains ``jpg`` later on the same
    line is copied to ``finalfile``.

    Args:
        html: Page content. ``bytes`` are decoded as UTF-8 (undecodable
            bytes are replaced); any other value goes through ``str()``.
        initialfile: Path of the scratch file that receives all fragments.
        finalfile: Path of the file that receives the matching lines.
    """
    if isinstance(html, bytes):
        # str(bytes) would yield the "b'...'" repr instead of the markup.
        text = html.decode("utf-8", errors="replace")
    else:
        text = str(html)

    # Attribute values sit between double quotes, so splitting on runs of
    # quotes isolates each URL as its own fragment.
    quote_re = re.compile(r'"+')
    fragments = quote_re.split(text)

    # The with-block guarantees the scratch file is flushed and closed
    # before it is read back below.
    with open(initialfile, mode="w", encoding="utf-8") as f1:
        for fragment in fragments:
            f1.write("\n")
            f1.write(fragment)

    # re.match anchors at the start of the line only: the line must begin
    # with "https" and contain "jpg" somewhere after it.
    link_re = re.compile(r"^https.*jpg")
    with open(initialfile, mode="r", encoding="utf-8") as f2, \
            open(finalfile, mode="w", encoding="utf-8") as f3:
        for line in f2:
            if link_re.match(line) is not None:
                f3.write(line)
def saveImg2(imagefile):
    """Download every image URL listed in ``imagefile``.

    Each non-blank line of ``imagefile`` is treated as one URL. Images are
    saved in the current working directory as ``0.jpg``, ``1.jpg``, ... in
    the order the URLs appear.

    Args:
        imagefile: Path of a text file with one image URL per line.

    Raises:
        urllib.error.URLError: If a URL cannot be retrieved.
    """
    with open(imagefile, mode="r", encoding="utf-8") as f_imglist2:
        # Strip the line terminator: urllib rejects URLs that contain
        # control characters such as "\n". Blank lines carry no URL.
        links = [s for s in (line.strip() for line in f_imglist2) if s]

    for x, link in enumerate(links):
        urllib.request.urlretrieve(link, "%s.jpg" % x)
# Script entry: fetch a Baidu image-search result page, extract the JPG
# links it contains, and download each of them.

# Search-result page to crawl (the query string is kept percent-encoded).
url = ("https://image.baidu.com/search/index?tn=baiduimage&ct=201326592"
       "&lm=-1&cl=2&ie=gbk&word=%ba%fb%b5%fb&fr=ala&ala=1&alatpl=adress"
       "&pos=0&hs=2&xthttps=111111")

# Working files; the D:\new directory must already exist.
htmlfile = "D:\\new\\html.txt"      # target for write(); not used below
splitfile = "D:\\new\\re.txt"       # scratch file: page split on quotes
imgefile = "D:\\new\\imglist.txt"   # extracted image links, one per line

html = gethtml(url)
print("Get HTML complete!")

getimg2(html, splitfile, imgefile)
print("Get Image link list complete!")

saveImg2(imgefile)
print("Save Image complete!")
A brief analysis of Python web crawler