First, download the file SWFToImage.dll.
Then create a .bat file with the following contents and run it:
The code is as follows:
copy SWFToImage.dll %windir%\system32
regsvr32 %windir%\system32\SWFToImage.dll
The code is as follows:
# Use Python to download documents from Baidu Library (Baidu Wenku). Modify the code as required; some hints follow:
# http://www.cnblogs.com/dearplain/
# Code by plain
import os
import sys
import urllib2

import win32com.client
# Folder the downloads are saved to; one sub-folder per document is created
# inside it.  Edit this to suit your machine.
SAVE_DIR = r'd:\my project\pywenku'

# Class id handed to win32com.client.Dispatch to obtain the SWFToImage object.
SWFTOIMAGE_CLSID = "{479A1AAC-C148-40BB-9868-A9773DA66AF9}"

WENKU_HOST = 'wenku.baidu.com'
PLAY_PATH = '/play/'

# Output image size assigned to the SWFToImage object.
IMAGE_WIDTH = 1048
IMAGE_HEIGHT = 1478

# The page count is looked up in the first HEAD_LEN bytes of the first page
# response only.
HEAD_LEN = 45

# Number of leading bytes dropped from every page response before it is
# written to disk.
# NOTE(review): presumably a fixed-size header in front of the SWF data --
# confirm against a real response.
SWF_OFFSET = 106

# Sanity bound on the parsed page count.
MAX_PAGES = 2000


def _parse_view_url(url):
    """Return ``(referer, doc_id)`` extracted from a document URL.

    ``referer`` is the part of *url* from ``http`` up to the first ``/v``;
    ``doc_id`` is the text between ``ew/`` and ``.htm``.

    Raises:
        ValueError: if any of those markers is missing from *url*.
    """
    referer = url[url.index('http'):url.index('/v')]
    doc_id = url[url.index('ew/') + 3:url.index('.htm')]
    return referer, doc_id


def _parse_title(html):
    """Return the text of ``<title>`` in *html*, cut at its last underscore.

    Raises:
        ValueError: if the title tags or the underscore are missing.
    """
    start = html.index('<title>') + len('<title>')
    end = html.index('</title>', start)
    title = html[start:end]
    return title[:title.rindex('_')]


def _parse_page_count(head):
    """Return the integer between the first ``":"`` in *head* and the next ``"``.

    Raises:
        ValueError: if the markers are missing or the text is not an integer.
    """
    marker = '":"'
    start = head.index(marker) + len(marker)
    end = head.index('"', start)
    return int(head[start:end])


def _page_url(doc_id, page):
    """Build the URL that serves page number *page* of document *doc_id*."""
    return 'http://%s%s%s?pn=%d&rn=1' % (WENKU_HOST, PLAY_PATH, doc_id, page)


def _fetch(url, referer=None):
    """Download *url* and return the response body.

    The response is closed even when reading fails.  When *referer* is given
    it is sent as the ``Referer`` header.
    """
    request = urllib2.Request(url)
    if referer is not None:
        request.add_header("Referer", referer)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def _save_page(converter, page, swf_data):
    """Write one page to disk and render its first frame to ``<page>.jpg``.

    The data is first written to ``<page>.pywenku``, converted through the
    SWFToImage object *converter*, and only then renamed to ``<page>.swf``.
    A ``.swf`` file therefore only exists for pages whose conversion ran to
    the end, which is what the resume check in ``main`` relies on.
    """
    tmp_name = "%d.pywenku" % page
    swf_name = "%d.swf" % page

    with open(tmp_name, 'wb') as swf_file:
        swf_file.write(swf_data)

    converter.InputSWFFileName = tmp_name
    # NOTE(review): 1 presumably selects JPEG output, given the .jpg file
    # name below -- confirm against the SWFToImage documentation.
    converter.ImageOutputType = 1
    # Assigned for every page so that all images get the size that was
    # previously requested explicitly for page 1 only.
    converter.ImageWidth = IMAGE_WIDTH
    converter.ImageHeight = IMAGE_HEIGHT
    converter.Execute_Begin()
    converter.FrameIndex = 1
    converter.Execute_GetImage()
    converter.SaveToFile("%d.jpg" % page)
    converter.Execute_End()

    # os.rename refuses to overwrite an existing file on Windows; page 1 is
    # always fetched again on a rerun, so clear any previous copy first.
    if os.path.exists(swf_name):
        os.remove(swf_name)
    os.rename(tmp_name, swf_name)


def main(argv):
    """Download every page of a Baidu Wenku document and convert it to images.

    Written for Python 2 (uses ``urllib2``).

    Args:
        argv: command-line arguments; ``argv[1]`` is the address of the
            document to download, for example
            ``http://wenku.baidu.com/view/f2fe7a3987c24028915fc37a.html``.

    Returns:
        0 on success, 1 when the arguments, the URL, the page title or the
        page count cannot be used.  Network, file-system and COM errors are
        not caught and propagate to the caller.
    """
    if len(argv) < 2:
        print("usage: %s <document url>" % argv[0])
        return 1

    url = argv[1]
    if not url.startswith("http://"):
        print("error! The url is not correct")
        return 1
    print("downloading %s" % url)

    try:
        referer, doc_id = _parse_view_url(url)
    except ValueError:
        print("parse url error")
        return 1
    print(referer)

    os.chdir(SAVE_DIR)
    converter = win32com.client.Dispatch(SWFTOIMAGE_CLSID)

    # The page title names the folder the document is stored in.
    try:
        title = _parse_title(_fetch(url))
    except ValueError:
        print("get title error")
        return 1
    print("downloading %s" % title)
    if not os.path.isdir(title):
        os.mkdir(title)
    os.chdir(title)

    # The first page is always fetched: its response carries the page count.
    data = _fetch(_page_url(doc_id, 1), referer)
    try:
        pagenum = _parse_page_count(data[:HEAD_LEN])
    except ValueError:
        print("get pagenum error")
        return 1
    print("pagenum: %d" % pagenum)
    if pagenum <= 0 or pagenum > MAX_PAGES:
        print("error! pagenum <= 0 or pagenum > %d" % MAX_PAGES)
        return 1
    _save_page(converter, 1, data[SWF_OFFSET:])

    # Remaining pages; those already present as .swf are skipped so an
    # interrupted download can be resumed.
    existing = set(os.listdir("."))
    for page in range(2, pagenum + 1):
        if "%d.swf" % page in existing:
            continue
        data = _fetch(_page_url(doc_id, page), referer)
        _save_page(converter, page, data[SWF_OFFSET:])

    print("Task complete")
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))