Downloading videos with Python: streaming files with the requests library
It was raining outside the window. As a single programmer, I stumbled on something good: the question "What do you use Python for?" — and its top-voted answer gave me the idea for this post.
Visit the site mentioned above and you will find that the video addresses are all available in plain text.
To download the file as a stream, pass stream=True to the requests library's get() call.
First, try to find a video address:
# -*- coding: utf-8 -*-
import requests


def download_file(url, path):
    """Stream *url* and write it to *path* in 1 KiB chunks.

    NOTE(review): on old versions of requests the Response object was
    not a context manager, so this ``with`` raised
    ``AttributeError: __exit__`` (the error shown below); recent
    versions of requests work fine.
    """
    with requests.get(url, stream=True) as r:
        chunk_size = 1024
        # Total length in bytes, as reported by the server.
        content_size = int(r.headers['content-length'])
        print('Download start')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)


if __name__ == '__main__':
    url = '...'   # the video address found in the original post
    path = '...'  # the file path where you want to store the download
    download_file(url, path)
The best experience:
AttributeError: __exit__
Could the documentation be lying to me too?!
It turns out the response object (in that version of requests) does not implement the __exit__ method required by the context-manager protocol. Since we only want to ensure that r is finally closed, releasing its connection back to the pool, we can use the closing helper from contextlib:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing


def download_file(url, path):
    """Download *url* to *path* in 1 KiB chunks.

    ``closing`` guarantees ``r.close()`` runs on exit even when the
    Response object itself has no ``__exit__`` (old requests versions),
    so the pooled connection is always released.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('Download start')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
The program runs normally, but as I stare at the output file, why does its size never change? How much has actually been downloaded? We should flush the received content to the hard disk promptly, which also saves some memory:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import os


def download_file(url, path):
    """Download *url* to *path*, forcing every chunk straight to disk."""
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('Download start')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                f.flush()
                # fsync forces the OS to push the buffered bytes to the
                # physical disk, so the growing file size is visible
                # immediately (at the cost of slower writes).
                os.fsync(f.fileno())
Now the file grows at a visible speed — I really love my hard disk. Next I would like to know how much has been written so far; just keep a counter in the program:
def download_file(url, path):
    """Download *url* to *path* and print the running completion percentage."""
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('Download start')
        with open(path, "wb") as f:
            n = 1
            for chunk in r.iter_content(chunk_size=chunk_size):
                # Fraction completed after this chunk (chunks are 1024 bytes).
                loaded = n * 1024.0 / content_size
                f.write(chunk)
                print('downloaded: {0:.6%}'.format(loaded))
                n += 1
The results are intuitive:
Downloaded: 2.579129% downloaded: 2.581255% downloaded: 2.583382% downloaded: 2.585508% downloaded:
But how could such crude output satisfy a programmer with lofty ideals? Let's write a progress class to use alongside the download:
# -*- coding: utf-8 -*-
from contextlib import closing
import time


def download_file(url, path):
    """Download *url* to *path*, printing progress via ProgressData."""
    # Imported lazily so the module can be imported (and ProgressData
    # tested) on machines without requests installed.
    import requests
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        print('Download start')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Counts finished chunks and prints progress / speed lines.

    ``block`` (chunk size) and ``size`` (total) are given in bytes and
    stored in KB (divided by 1000.0 — NOTE(review): chunks are
    1024-byte multiples, so the KB figures are slightly approximate).
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0   # chunk size in KB
        self.size = size / 1000.0     # total size in KB
        self.unit = unit
        self.count = 0                # chunks written so far
        self.start = time.time()      # timestamp of the previous chunk

    def output(self):
        """Record one finished chunk and print the current progress."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # KB transferred since the previous call divided by elapsed time.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print('%s downloaded\r\n' % self.file_name)
        else:
            print('{0} download progress {1:.2f}{2}/{3:.2f}{4} {5:.2%} '
                  'download speed {6:.2f}{7}/s'.format(
                      self.file_name, loaded, self.unit,
                      self.size, self.unit, progress, speed, self.unit))
            # Crude bar: slashes for the REMAINING portion, right-aligned
            # in a 50-character field (matches the sample output below).
            print('%50s' % ('/' * int((1 - progress) * 50)))
Run:
Download start download progress: 10.24Kb/12.164.05kb 0.01% download speed: 4.75Kb/s /////////////////////////// /////////////////// download progress: 20.48Kb/120174.05Kb 0.02% download speed: 32.93Kb/s /////// //////////////////////////////////////// //
It seems much more comfortable.
The following code downloads multiple threads at the same time. The main thread produces the url and puts it in the queue. The download thread obtains the url:
# -*- coding: utf-8 -*-
from contextlib import closing
import hashlib
import os
import queue
import threading
import time

# Number of downloader threads (and of None sentinels pushed by get_url).
NUM_WORKERS = 4


def download_file(url, path):
    """Download *url* to *path*, skipping files already fully downloaded."""
    # Imported lazily so the module can be imported (and the queue logic
    # tested) on machines without requests installed.
    import requests
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        # Resume guard: a file at *path* of at least the full size means
        # this URL was already downloaded — do not fetch it again.
        if os.path.exists(path) and os.path.getsize(path) >= content_size:
            print('downloaded')
            return
        print('download start')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb',
                             block=chunk_size, file_name=path)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Counts finished chunks and prints progress / speed lines.

    ``block`` (chunk size) and ``size`` (total) are given in bytes and
    stored in KB (divided by 1000.0 — NOTE(review): chunks are
    1024-byte multiples, so the KB figures are slightly approximate).
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0   # chunk size in KB
        self.size = size / 1000.0     # total size in KB
        self.unit = unit
        self.count = 0                # chunks written so far
        self.start = time.time()      # timestamp of the previous chunk

    def output(self):
        """Record one finished chunk and print the current progress."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # KB transferred since the previous call divided by elapsed time.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print('%s downloaded\r\n' % self.file_name)
        else:
            print('{0} download progress {1:.2f}{2}/{3:.2f}{4} {5:.2%} '
                  'download speed {6:.2f}{7}/s'.format(
                      self.file_name, loaded, self.unit,
                      self.size, self.unit, progress, speed, self.unit))
            # Crude bar: slashes for the REMAINING portion, right-aligned.
            print('%50s' % ('/' * int((1 - progress) * 50)))


# Work queue: the main thread produces URLs, worker threads consume them.
url_queue = queue.Queue()


def run():
    """Worker loop: take URLs off the queue until a None sentinel arrives."""
    while True:
        url = url_queue.get(timeout=100)
        if url is None:
            print('all finished.')
            break
        # Hash the URL into a stable, filesystem-safe file name so the
        # resume guard in download_file can recognise repeat downloads.
        name = hashlib.md5(url.encode('utf-8')).hexdigest()
        path = 'e:/download/' + name + '.mp4'
        download_file(url, path)


def get_url():
    """Producer: enqueue URLs to fetch, then one None sentinel per worker.

    (The original pushed a single None for four workers, which left three
    of them blocked until the queue timeout fired — one sentinel each
    lets every worker shut down cleanly.)
    """
    # ... put real video URLs here with url_queue.put(url) ...
    for _ in range(NUM_WORKERS):
        url_queue.put(None)


if __name__ == '__main__':
    get_url()
    workers = []
    for _ in range(NUM_WORKERS):
        t = threading.Thread(target=run)
        t.daemon = True
        t.start()
        workers.append(t)
    # Join the workers; otherwise the daemon threads are killed as soon
    # as the main thread falls off the end of the script.
    for t in workers:
        t.join()
With the duplicate-download check in place, how to keep producing URLs to feed the queue is left entirely up to you!