This article describes how to use Python to write a crawler that downloads comics. It parses the comic resources of the Youxia site; the code for downloading all chapters of a comic is as follows:
#!/usr/bin/python3.2
import os
import re
import socket
import sys
import threading
import time
import urllib
import urllib.request
# Module-level settings shared by the download functions in this file.
# The original `global ...` statement at module scope had no effect and
# has been dropped; the names are plain module globals.
weburl = ''            # URL of the comic's chapter index page (argv[1])
floder = ''            # download folder (argv[2]); spelling kept, other code uses this name
chapterbegin = 0       # index of the first chapter to download (argv[3])
currentthreadnum = 0   # number of download threads currently counted as running
threadcount = 6        # upper bound on concurrently running download threads (argv[4])


def parseargs(argv):
    """Extract the script settings from a command-line argument list.

    Args:
        argv: argument list laid out like ``sys.argv``:
            ``[prog, weburl, floder, chapterbegin?, threadcount?]``.

    Returns:
        ``(weburl, floder, chapterbegin, threadcount)``, or ``None`` when
        the two mandatory arguments are missing. The optional values
        default to 0 and 6.

    Raises:
        ValueError: if an optional argument is present but not an integer.
    """
    if len(argv) < 3:
        return None
    begin = int(argv[3]) if len(argv) >= 4 else 0
    threads = int(argv[4]) if len(argv) >= 5 else 6
    return argv[1], argv[2], begin, threads


# Only read the command line when run as a script, so that importing this
# module no longer terminates the importing process.
if __name__ == '__main__':
    _cli = parseargs(sys.argv)
    if _cli is None:
        print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
        sys.exit(0)
    weburl, floder, chapterbegin, threadcount = _cli
def jin(i, jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are written as decimal digits; values from 10 upward
    are written as consecutive letters starting at 'a'.

    Args:
        i: non-negative integer to convert.
        jinzhi: the base (radix), at least 2.

    Returns:
        The digit string, most significant digit first ('0' for zero).

    Raises:
        ValueError: if ``i`` is negative or ``jinzhi`` is below 2.
    """
    if i < 0 or jinzhi < 2:
        raise ValueError('jin() needs i >= 0 and jinzhi >= 2')
    digits = []
    while True:
        # Integer divmod instead of int(i / jinzhi): float division loses
        # precision once i no longer fits in a double.
        i, answer = divmod(i, jinzhi)
        # NOTE(review): the pasted source shows 'A' here, but its letter
        # case is unreliable throughout; lowercase is assumed.
        digits.append(chr(ord('a') + answer - 10) if answer > 9 else str(answer))
        if i == 0:
            break
    digits.reverse()
    return ''.join(digits)
def urlparse(p, a, c, k):
    """Expand single-character placeholders in ``p`` using the word list ``k``.

    A lookup table is built that maps the base-36 form of every index
    below ``c`` to ``k[index]``, or to the base-36 form itself when that
    entry of ``k`` is empty. Every character of ``p`` that is a decimal
    digit or one of 'a'..'f' is then replaced through that table; all
    other characters are copied unchanged.

    NOTE(review): the (p, a, c, k) signature suggests this undoes a
    JavaScript "packer"-style encoding - presumably; confirm against the
    pages being parsed.

    Args:
        p: the encoded text.
        a: accepted for signature compatibility; never read (36 is
            hard-coded as the base).
        c: number of leading entries of ``k`` to use.
        k: sequence of replacement words; empty entries mean "keep the
            placeholder itself".

    Returns:
        The expanded text.

    Raises:
        IndexError: if ``c`` is larger than ``len(k)``.
    """
    d = {}
    for index in range(c):
        key = jin(index, 36)
        d[key] = k[index] if k[index] else key

    replaceable = '0123456789abcdef'
    # d.get(ch, ch): a placeholder with no table entry is kept as-is
    # instead of raising KeyError as the original lookup did.
    return ''.join(d.get(ch, ch) if ch in replaceable else ch for ch in p)
def meispower(s):
    """Extract the image path from a packed ``eval(...)`` script string.

    The text from the first ``}(`` onward is taken, its last 19 characters
    are dropped, and the remainder is split into four comma-separated
    fields which are passed to ``urlparse``. The result of that call is
    searched for ``imgpath=...`` and the value is returned without its
    first 10 and last 2 characters.

    Args:
        s: script text containing the packed call.

    Returns:
        The image path fragment found after ``imgpath=``.

    Raises:
        ValueError: if ``s`` lacks the ``}(`` marker, the four expected
            fields, or an ``imgpath=`` assignment after expansion.
    """
    found = re.search(r"(?=\}\().*", s, re.IGNORECASE)
    if found is None:
        raise ValueError("no '}(' payload found in script")
    payload = found.group(0)
    # Fixed-length tail removed exactly as the original did.
    payload = payload[0:(len(payload) - 19)]

    # Split from the right so commas inside the packed text (field 0)
    # cannot shift the radix / count / word-list fields.
    par = payload.rsplit(',', 3)
    if len(par) < 4:
        raise ValueError('packed payload does not have four fields')
    answer = par[3][1:].split('|')  # [1:] drops the leading character of the field

    chapterpath = urlparse(par[0], int(par[1]), int(par[2]), answer)
    hits = re.findall('imgpath=[^;]*', chapterpath)
    if not hits:
        raise ValueError('imgpath not found in unpacked script')
    allurl = hits[0]
    return allurl[10:(len(allurl) - 2)]
def pictofile(weburl, filename, loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists. A response
    shorter than 2048 bytes is treated as a failed attempt. Up to
    ``loop + 1`` attempts are made; when all fail (or ``loop`` is
    negative) a message is printed and the function returns.

    Args:
        weburl: URL of the picture.
        filename: path of the file to write.
        loop: number of retries allowed after the first attempt.

    Returns:
        None in every case; failures are reported with ``print`` only.
    """
    # Iterative retry replaces the original self-recursion, which nested
    # one call per retry inside the active exception handler.
    for _ in range(loop + 1):
        # Checked on every attempt, as the original did on every call.
        if os.path.exists(filename):
            return
        try:
            # The context manager closes the response on error paths too.
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
            if len(data) < 2048:
                continue
            print('Download from %s name is %s\n' % (weburl, filename))
            with open('%s' % filename, 'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Deliberately broad: any failure just triggers another attempt.
            print('error', e)
    print('Can\'t download the picture %s' % weburl)
def downloadpic(url, loadpicdir, num):
    """Thread target: download one picture ``url`` into ``loadpicdir``.

    The folder is created when missing. The file is named
    ``<num>-manhua<tail>`` where ``tail`` is the trailing run of
    ``[0-9a-z.]`` characters of ``url``. On exit - success or failure -
    the module-level ``currentthreadnum`` counter is decremented under
    ``mutex``.

    Args:
        url: address of the picture.
        loadpicdir: folder that receives the file.
        num: number used as the file-name prefix.
    """
    global currentthreadnum
    try:
        # mutex2 serialises the existence check and folder creation between
        # threads. The original used os.chdir() as the existence test, which
        # changes the working directory of the whole process.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create" % loadpicdir)
                try:
                    os.makedirs(loadpicdir, exist_ok=True)
                except OSError:
                    print("can't create floder %s" % loadpicdir)
                    return
                print('create floder succeed')

        # \Z anchors at the end of the string; the pasted source shows an
        # invalid lowercase '\z'.
        name = re.search(r'[0-9a-z.]*\Z', url).group(0)
        filename = 'manhua' + name
        pictofile(url, loadpicdir + '//' + str(num) + '-' + filename)
    finally:
        # Always give the slot back; otherwise a failed download would
        # permanently reduce the number of threads the dispatcher may start.
        with mutex:
            currentthreadnum = currentthreadnum - 1
def downloadchapter(url, loadpicdir, num, begin=0):
    """Start download threads for pictures ``begin`` .. ``num - 1`` of a chapter.

    The chapter page ``manhuaweb + url`` is fetched, the chapter name and
    the packed script are extracted from it, and one ``downloadpic``
    thread is started per picture. At most ``threadcount`` threads are
    counted as running at once; the function returns after the last
    thread has been started, without joining them.

    Args:
        url: chapter path, appended to the module-level ``manhuaweb``.
        loadpicdir: folder prefix; the chapter name is appended directly.
        num: exclusive upper bound of the picture index.
        begin: first picture index.

    Raises:
        IndexError: if the page has no matching title or ``eval`` script.
    """
    global currentthreadnum
    print(manhuaweb + url)
    with urllib.request.urlopen(manhuaweb + url) as response:
        webdata = response.read().decode('utf-8')

    # NOTE(review): the pasted source shows only r'[^_]*' here; a leading
    # '<title>' is assumed to have been stripped from the paste, because the
    # next line drops exactly 7 characters. Confirm against the live page.
    chaptername = re.findall(r'<title>[^_]*', webdata)[0]
    chaptername = chaptername[7:]

    webscrip = re.findall(r'eval.*[^<>]', webdata)
    chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])

    for i in range(begin, num):
        # Wait for a free slot (polled, as in the original).
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum = currentthreadnum + 1
        try:
            threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), loadpicdir + chaptername, num),
            ).start()
        except socket.error:
            # The original also did `i = i - 1` here, which has no effect
            # inside a for loop: this picture is skipped, not retried.
            with mutex:
                currentthreadnum = currentthreadnum - 1
        except Exception as error:
            # Release the slot reserved above; the original leaked it.
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print(error, 'break')
            print('Download chapter %d of picture make a error' % i)
            break
if __name__ == '__main__':
    # Script entry point: read the chapter index page at `weburl` and
    # download every chapter from `chapterbegin` onward, walking the list
    # of chapter links from its last entry to its first.
    manhuaweb = r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex = threading.Lock()    # guards currentthreadnum
    mutex2 = threading.Lock()   # serialises folder creation in downloadpic

    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read().decode('utf-8')

    # NOTE(review): the original narrowed the page with a container regex
    # whose delimiters are missing from the pasted source (only '.*'
    # survives). Until that pattern is recovered the whole page is scanned,
    # so links or counts outside the chapter list could also match.
    meshdata = webdata

    # NOTE(review): the pasted source shows the literal word 'page'; this is
    # presumed to be a translation of the CJK page-count suffix, so both
    # spellings are accepted. Confirm against the live page.
    indexdata = re.findall(r'([0-9]*(?:页|page))', meshdata)
    picurldata = re.findall(r'/comic/[0-9/]*.html', meshdata)
    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')

    # Both lists are indexed with the same position below, so they must
    # line up one-to-one.
    if len(indexdata) != chapterlength:
        sys.exit('found %d chapter links but %d page counts'
                 % (chapterlength, len(indexdata)))

    for i in range(chapterbegin, chapterlength):
        position = chapterlength - i - 1
        manhuachapter = picurldata[position]
        pagecount = int(nummode.findall(indexdata[position])[0])
        downloadchapter(manhuachapter, floder, pagecount)