This resource is a paper-reading course page in the computer vision field at the University of Texas at Austin; the lab behind it is well known in the CV community. The page links to a large number of papers. Since batch-download tools such as Thunder (Xunlei) handle pages like this poorly, I wrote a multi-threaded batch download program in Python that downloads each paper into a separate folder, naming each folder after its category. My download speed reached about 6 MB/s; you may need a VPN or proxy before downloading (otherwise some foreign links may be restricted and the site's resources unreachable). It is best to start the program before going to bed at night and simply let it run overnight.
The total download size of all the papers is over 600 MB, so make sure your hard disk has enough free space.
I am sharing the following resources with you!
Resource URL: http://www.cs.utexas.edu/~cv-fall2012/schedule.html. The Python source code is pasted below; before trying it, please make sure the BeautifulSoup library is installed.
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 09 10:33:29 2013

@author: lanbing510 lxs

Multi-threaded batch downloader for the UT Austin computer-vision
fall-2012 reading-course schedule page.  Each category row on the page
becomes a local folder, and every paper linked under it is fetched by
its own worker thread.
"""
import os
import re
import sys
import threading
import urllib.request

from bs4 import BeautifulSoup  # third-party: pip install beautifulsoup4

BASE_URL = 'http://www.cs.utexas.edu/~cv-fall2012/'
FULL_URL = BASE_URL + 'schedule.html'

down_count = 0                        # number of files finished so far
_down_count_lock = threading.Lock()   # protects down_count across threads


def download(url, path):
    """Download *url* to local file *path* and report overall progress.

    Runs in a worker thread; the shared completion counter is guarded by
    a lock so the progress report stays consistent.
    """
    global down_count

    def cbk(blocks, block_size, total_size):
        """urlretrieve progress hook.

        blocks     -- number of blocks transferred so far
        block_size -- size of each block in bytes
        total_size -- total size of the remote file in bytes
        """
        percent = min(100.0, 100.0 * blocks * block_size / total_size)
        # print('%.2f%%' % percent)  # uncomment for per-chunk progress

    urllib.request.urlretrieve(url, path, cbk)
    with _down_count_lock:
        down_count += 1
        print(os.path.basename(path), 'has downloaded')
        print('have finished %d files ^_^' % down_count)


def main():
    """Crawl the schedule page and spawn one download thread per link.

    Category header rows (highlighted with background rgb(204,204,255))
    define the level-1 directory; the first anchor of each following row
    names the level-2 directory; the remaining anchors are the papers.
    """
    response = urllib.request.urlopen(FULL_URL)
    soup = BeautifulSoup(response.read(), 'html.parser')

    threads = []                            # worker-thread pool
    local_path = os.path.join('.', 'lan')   # root download directory
    if not os.path.exists(local_path):
        os.mkdir(local_path)
    c1_path = ''                            # current level-1 (category) dir
    count = 0                               # links seen (for debugging)

    for sibling in soup.tr.next_siblings:
        if sibling == '\n':
            continue
        # Category header rows carry this background colour in their markup.
        if re.search(r'rgb\(204,204,255\)', repr(sibling)) is not None:
            c1_path = os.path.join(local_path, sibling.a['name'])
            if not os.path.exists(c1_path):
                os.mkdir(c1_path)
        slist = sibling.find_all('a')
        if not slist:
            continue
        try:
            # First anchor names the level-2 directory; rows whose first
            # anchor has no 'name' attribute are not paper rows.
            c2_path = os.path.join(c1_path, slist[0]['name'])
        except KeyError:
            continue
        if not os.path.exists(c2_path):
            os.mkdir(c2_path)
        for li in slist[1:]:
            temp_url = li.get('href')
            count += 1
            if temp_url is None:
                continue
            # Make relative links absolute.
            if re.match(r'http.+|ftp.+|www.+', temp_url) is None:
                temp_url = BASE_URL + temp_url
            # Derive the local file name from the last URL component;
            # a trailing '/' means a directory index, so synthesize a name.
            parts = temp_url.split('/')
            url_sp = parts[-1] if parts[-1] != '' else parts[-2] + '.html'
            known_ext = (r'\.pdf$|\.html$|\.ppt$|\.doc$|\.docx$'
                         r'|\.pptx$|\.rar$|\.htm$|\.gz$|\.xml$')
            if re.search(known_ext, url_sp) is None:
                url_sp += '.html'
            temp_path = os.path.join(c2_path, url_sp)
            t = threading.Thread(target=download, args=(temp_url, temp_path))
            threads.append(t)
            t.start()

    # Wait for every worker so the process does not exit mid-download.
    for t in threads:
        t.join()


if __name__ == '__main__':
    # Redirect stderr to a log file so thread tracebacks are captured.
    fe = open('error.txt', 'w')
    sys.stderr = fe
    try:
        main()
    finally:
        fe.close()
        sys.stderr = sys.__stderr__