The most basic code for fetching the content of a web page is implemented as follows:
#!/usr/bin/env python
"""Download a web page and print its first and last non-blank lines."""
from urllib.request import urlretrieve


def firstnonblank(lines):
    """Return the first element of *lines* that is not blank.

    Args:
        lines: Iterable of strings, e.g. the result of ``readlines()``.

    Returns:
        The first string that is non-empty after ``strip()``, unchanged
        (including its line ending), or ``None`` when *lines* is empty or
        contains only blank strings.
    """
    for eachline in lines:
        if eachline.strip():
            return eachline
    return None


def firstlast(webpage):
    """Print the first and the last non-blank line of a local file.

    Args:
        webpage: Path of the file to read (as returned by ``urlretrieve``).
    """
    # errors="replace" keeps pages that are not in the default encoding
    # from aborting the read with UnicodeDecodeError.
    with open(webpage, errors="replace") as f:
        lines = f.readlines()

    first = firstnonblank(lines)
    last = firstnonblank(reversed(lines))
    for line in (first, last):
        # A file with no non-blank lines yields None; print nothing then.
        if line is not None:
            # The line already carries its own newline, so suppress print's.
            print(line, end="")


def download(url='http://www', process=firstlast):
    """Retrieve *url* into a temporary file and hand that file to *process*.

    Args:
        url: Address of the page to fetch.
        process: Callable that receives the local filename of the
            downloaded page. Not called when the download fails.
    """
    try:
        retval = urlretrieve(url)[0]
    except OSError:
        # URLError (unreachable host, missing file, ...) derives from OSError.
        retval = None
    if retval:
        process(retval)


if __name__ == '__main__':
    download()
Use the urllib module to capture the images found on a web page:
import urllib.request
import socket
import re
import sys
import os

# Directory in which captured pictures are stored.
targetDir = r"C:\Users\elqstux\Desktop\pic"

# Group 1 is the whole image URL, group 2 is its extension.
IMAGE_PATTERN = re.compile(r'(http:[^\s]*?(jpg|png|gif))')


def destFile(path):
    """Return the local path under ``targetDir`` for the image URL *path*.

    The directory ``targetDir`` is created first when it does not exist.

    Args:
        path: URL (or '/'-separated path) of the image.

    Returns:
        ``targetDir`` joined with the part of *path* after its last '/'.
    """
    # makedirs also creates missing parent directories and tolerates the
    # directory appearing between the check and the creation.
    os.makedirs(targetDir, exist_ok=True)
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, filename)


def findImageLinks(html):
    """Return the distinct image URLs found in *html*, in order of appearance.

    Args:
        html: Page source as text.

    Returns:
        List of URL strings matched by ``IMAGE_PATTERN``, without duplicates.
    """
    links = (link for link, _ext in IMAGE_PATTERN.findall(html))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))


def main():
    """Fetch the page at ``hostname`` and print every image link on it."""
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode the payload; str(bytes) would search the b'...' repr instead.
    html = contentBytes.decode("utf-8", errors="replace")
    for link in findImageLinks(html):
        print(link)


if __name__ == "__main__":
    main()
import urllib.request
import socket
import re
import sys
import os

# Directory in which captured pictures are stored.
targetDir = r"H:\pic"


def destFile(path):
    """Return the local path under ``targetDir`` for the image URL *path*.

    The directory ``targetDir`` is created first when it does not exist.

    Args:
        path: URL (or '/'-separated path) of the image.

    Returns:
        ``targetDir`` joined with the part of *path* after its last '/'.
    """
    # makedirs also creates missing parent directories and tolerates the
    # directory appearing between the check and the creation.
    os.makedirs(targetDir, exist_ok=True)
    # The URL is split on '/'; only the final component is kept.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, filename)


def findImages(html):
    """Return every ``(url, extension)`` pair found in *html*.

    Args:
        html: Page source as text.

    Returns:
        List of 2-tuples in order of appearance; duplicates are kept.
    """
    # r'(http:[^\s]*?(jpg|png|gif))' contains two layers of parentheses, so
    # there are two groups. findall returns a list, and the text matched by
    # each pair of parentheses appears in every list entry.
    return re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)


def main():
    """Fetch the page at ``hostname`` and print each image URL and its type."""
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode the payload; str(bytes) would search the b'...' repr instead.
    match = findImages(contentBytes.decode("utf-8", errors="replace"))
    for picname, pictype in match:
        print(picname)
        print(pictype)


if __name__ == "__main__":
    main()

# Output:
# http://img3.douban.com/pics/blank.gif
# gif
# http://img3.douban.com/icon/g111328-1.jpg
# jpg
# http://img3.douban.com/pics/blank.gif
# gif
# http://img3.douban.com/icon/g197523-19.jpg
# jpg
# http://img3.douban.com/pics/blank.gif
# gif
# ...