I wrote a small image spider for fun and tried it on a few web pages. It doesn't feel great yet. The core code is probably less than 20 lines, which keeps it concise and clear. Enough chatter - on to the code.
The code is as follows:
# coding=utf-8
import os
import re
import sys
import urllib

try:  # Python 3 layout
    from urllib.parse import urljoin
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2 layout
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin
# Matches an absolute HTTP(S) URL; group 1 is the scheme-plus-host prefix
# (everything up to the first '/', '\\', '?' or '#').
URL_REG = re.compile(r'(https?://[^/\\?#]+)', re.I)

# Matches the src attribute of an <img> tag. Group 1 is the opening quote
# character, group 2 is the attribute value. The value is matched lazily up to
# the same quote again; a backreference cannot be expressed inside a character
# class, so ``(.*?)\1`` is used together with re.S.
# NOTE(review): the leading ``<img[^>`` was missing from the pasted source
# (apparently stripped as an HTML tag) and has been reconstructed - confirm.
IMG_REG = re.compile(r'<img[^>]*?\ssrc\s*=\s*([\'"])(.*?)\1', re.I | re.S)
def _collect_image_urls(html, base_url):
    """Return absolute, de-duplicated image URLs found in *html*, in page order."""
    seen = set()
    found = []
    for _quote, src in IMG_REG.findall(html):
        src = src.strip()
        if not src:
            continue
        # urljoin handles absolute, host-relative ('/x'), protocol-relative
        # ('//cdn/x') and document-relative ('x', '../x') references.
        # The URL is NOT lower-cased: paths are case-sensitive on most servers.
        absolute = urljoin(base_url, src)
        if absolute not in seen:
            seen.add(absolute)
            found.append(absolute)
    return found


def _local_file_name(img_url, index):
    """Derive a local file name from an image URL, ignoring query and fragment."""
    bare = img_url.split('#', 1)[0].split('?', 1)[0]
    name = bare.rstrip('/').rsplit('/', 1)[-1]
    # Fall back to a generated name when the URL has no usable last segment.
    return name or 'image_%d' % index


def download(dir, url):
    """Download the images referenced by <img> tags on a web page.

    Args:
        dir: Local directory to save the images into; created if missing.
        url: Absolute URL of the web page to scan.

    Returns:
        None. Progress and errors are printed to stdout. An invalid URL is
        reported and nothing is fetched; an image that fails to download is
        reported and skipped so the remaining images are still attempted.
    """
    if not URL_REG.match(url):
        print('[Error] Invalid URL: %s' % url)
        return
    if not os.path.isdir(dir):
        # makedirs (not mkdir) so nested target paths also work.
        os.makedirs(dir)

    # Obtain html and extract image URLs.
    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()
    # The body arrives as bytes; decode so the str regex can be applied.
    # NOTE(review): assumes UTF-8; undecodable bytes are replaced, not fatal.
    html = raw.decode('utf-8', 'replace')
    imgs = _collect_image_urls(html, url)
    print('[Info] Find %d images.' % len(imgs))

    # Download images.
    for idx, img in enumerate(imgs):
        number = idx + 1
        path = os.path.join(dir, _local_file_name(img, number))
        print('[Info] Download(%d): %s' % (number, img))
        try:
            urlretrieve(img, path)
        except Exception as err:
            # Best effort per image: report the failure and carry on.
            print("[Error] Can't download(%d): %s (%s)" % (number, img, err))
def main():
    """Command-line entry point: ``<script> <dir> <url>``.

    Expects exactly two arguments after the script name: the local target
    directory and the web page URL. With any other argument count it prints
    an error plus a usage hint and returns without downloading anything.
    """
    if len(sys.argv) != 3:
        print('invalid argument count.')
        print('usage: %s <dir> <url>' % sys.argv[0])
        return
    target_dir, url = sys.argv[1:]
    download(target_dir, url)
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    # Example of a direct call: download('d:\\Imgs', 'http://www.163.com')
    main()