I suddenly wanted a tool like this. While writing it I kept losing track of what I was doing; it was originally two files, which are now merged into one.
The code is messy and I am too lazy to tidy it up, but it works, and that is good enough.
The download part uses multiple threads, but each individual file is downloaded by a single thread; that is, it does not use HTTP Range headers to split one file across threads. The speed does not seem to be very good.
At first I ran into a problem: downloading directly with urllib's urlretrieve, or calling urlopen and then read(), always produced wrong or incomplete files. I then considered bundling wget.exe to do the downloading, but decided it was not worth the trouble, so the file is now read in several blocks, which should work.
In addition, Baidu albums load their pages dynamically with Ajax, so the URLs shown in the browser that contain # or #! cannot be opened directly with urllib: whatever follows the # is never sent to the server.
Later, using the browser's Inspect Element tool and switching to the network tab, I found the URLs that are actually requested; they simply pass the values as parameters, which makes the problem much easier to solve.
Never mind, here is the code.
PS: I recently discovered that the American version of "Fast forward" is really good; I have been watching it lately.
# -*- coding: cp936 -*-
"""Download the pictures of every photo album of a Baidu Tieba bar.

The script creates one directory per bar, per album category and per album
below the current working directory, and logs what it found to
"<bar> bar info.txt" inside the bar directory.

NOTE(review): this block was reconstructed from a whitespace-collapsed,
machine-translated paste.  Identifier casing, the regular expressions and
the user-facing messages are best-effort restorations -- confirm them
against the original script where it is available.
"""
import io
import os
import os.path
import re
import sys
import threading
import time

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote
    from urllib2 import urlopen

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

IS_DEBUG = False
MULTI_THREAD = False
MAX_THREAD = 20
# Log file object while read_root() is running, "" otherwise.
TIEBA_INFO_FP = ""
# Name of the bar currently being processed (set by main()).
tbName = ""
TIEBA_URL_BASE = "http://tieba.baidu.com"
PGURL_BASE = "http://tieba.baidu.com/photo/g?kw="
PHOTO_URL_BASE = "http://imgsrc.baidu.com/forum/pic/item/"
BLOCK_SIZE = 4096
threadLock = threading.Lock()

# Seconds before a stalled connection is abandoned, so that a dead socket
# cannot block a worker thread (and therefore join()) forever.
URL_TIMEOUT = 30
# NOTE(review): the original declared its source as cp936 and matched
# Chinese literals against the raw page bytes, which suggests the pages
# are GBK encoded -- confirm against a live response.
PAGE_ENCODING = "gbk"

_PIC_AMOUNT_RE = re.compile(r'"pic_amount":(\d+),')
_PIC_ID_RE = re.compile(r'"original":\{"id":"(\S+?)"')
_ALBUM_RE = re.compile(
    r'<div class="grbm_ele_title.+?href="(\S+?)".+?title="(.+?)"', re.S)
# The label text around the number was machine translated in the paste, so
# the pattern is anchored on the div class only and accepts any label.
_TOTAL_RE = re.compile(
    r'<div class="picture_amount_total">[^<\d]*(\d+)[^<]*</div>', re.S)
_CATALOG_RE = re.compile(
    r'<li class="catalog_li_normal.+?href="(\S+?)".+?catalog_a_inner">(.+?)'
    r'<span class="catalog_a_amount">\((\d+?)\)</span>', re.S)


def _to_text(value):
    """Return *value* as text, decoding byte strings with PAGE_ENCODING."""
    if isinstance(value, bytes):
        return value.decode(PAGE_ENCODING, "replace")
    return value


def _quote_name(name):
    """Percent-encode a bar name so it can be embedded in a URL."""
    if isinstance(name, bytes):
        return quote(name)
    return quote(name.encode(PAGE_ENCODING))


def _fetch_text(url):
    """Fetch *url* and return its body decoded with PAGE_ENCODING."""
    response = urlopen(url, timeout=URL_TIMEOUT)
    try:
        return response.read().decode(PAGE_ENCODING, "replace")
    finally:
        response.close()


def print_f(msg):
    """Print *msg* to the screen and append it to the open log file.

    When no log file is open (TIEBA_INFO_FP is not a file object) the
    message is only printed.
    """
    text = _to_text(msg)
    print(text)
    if hasattr(TIEBA_INFO_FP, "write"):
        TIEBA_INFO_FP.write(text + u"\n")
        TIEBA_INFO_FP.flush()


def download_file(url, path):
    """Download *url* to *path* in BLOCK_SIZE chunks.

    Nothing is downloaded when the target file already exists.  When
    *path* is "" the file name comes from the Content-Disposition header
    or, failing that, from the last segment of the URL.

    Raises:
        Exception: if the connection fails or the stream ends before
            Content-Length bytes were received; the partial file is
            removed before the exception propagates.
    """
    if os.path.isfile(path):
        return
    response = urlopen(url, timeout=URL_TIMEOUT)
    try:
        headers = response.info()
        if path != "":
            file_name = path
        elif 'content-disposition' in headers:
            file_name = headers['Content-Disposition'].split('filename=')[1]
            file_name = file_name.replace('"', '').replace("'", "")
        else:
            file_name = url[url.rfind('/') + 1:]
        if os.path.isfile(file_name):
            return

        length = headers.get('Content-Length')
        # Without a Content-Length the only end marker is end-of-stream.
        expected = int(length) if length is not None else None
        received = 0
        try:
            with open(file_name, 'wb') as out:
                while expected is None or received < expected:
                    chunk = response.read(BLOCK_SIZE)
                    if not chunk:
                        if expected is None:
                            break
                        raise Exception("Time Out")
                    out.write(chunk)
                    received += len(chunk)
        except Exception:
            # The file is closed by now, so it can be removed exactly once
            # without masking the original error.
            if os.path.isfile(file_name):
                os.remove(file_name)
            raise
    finally:
        response.close()


class MultiDownload(threading.Thread):
    """Worker thread that downloads items from a shared DData queue."""

    def __init__(self, dat):
        threading.Thread.__init__(self)
        self.dat = dat

    def run(self):
        """Claim and download items until none are pending."""
        while 1:
            pos, url, path = self.dat.start_one()
            if pos is None:
                break
            try:
                download_file(url, path)
                self.dat.end_one(pos)
            except Exception as e:
                # Put the item back so it is retried (up to the limit).
                self.dat.renew_one(pos)
                print("%s %s" % (url, e))


class DData(object):
    """Thread-safe list of download jobs and their states.

    Each job has a status: '0' pending, '1' downloading, '2' done and
    '3' given up after *max_attempts* failed attempts.
    """

    def __init__(self, max_attempts=3):
        self.pos = 0
        self.url = []
        self.path = []
        self.status = []
        self.attempts = []
        self.max_attempts = max_attempts

    def add(self, url, path):
        """Append a pending job for *url* to be stored at *path*."""
        self.url.append(url)
        self.path.append(path)
        self.status.append('0')
        self.attempts.append(0)
        self.pos += 1

    def start_one(self):
        """Claim the first pending job and mark it as downloading.

        Returns:
            (pos, url, path) of the claimed job, or (None, None, None)
            when no job is pending.
        """
        # Search and update under one lock so two workers can never claim
        # the same job.
        with threadLock:
            try:
                pos = self.status.index('0')
            except ValueError:
                return None, None, None
            self.status[pos] = '1'
            return pos, self.url[pos], self.path[pos]

    def end_one(self, pos):
        """Mark job *pos* as done."""
        with threadLock:
            self.status[pos] = '2'

    def renew_one(self, pos):
        """Record a failed attempt for job *pos*.

        The job becomes pending again, or is marked as given up once it
        has failed *max_attempts* times, so a permanently broken URL
        cannot keep the workers busy forever.
        """
        with threadLock:
            self.attempts[pos] += 1
            if self.attempts[pos] >= self.max_attempts:
                self.status[pos] = '3'
            else:
                self.status[pos] = '0'


def multi_download_run(url_list, path_list=None, max_thread=10):
    """Download every URL in *url_list* using *max_thread* worker threads.

    Args:
        url_list: URLs to download.
        path_list: target path for each URL, in the same order.  When
            empty or None each file is stored in the current directory
            under the last segment of its URL.
        max_thread: number of worker threads to start.
    """
    dat = DData()
    for index, url in enumerate(url_list):
        if not path_list:
            fn = url[url.rfind('/') + 1:]
            path = os.path.join(os.getcwd(), fn)
        else:
            path = path_list[index]
        dat.add(url, path)
    threads = [MultiDownload(dat) for _ in range(max_thread)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


def multi_download(pic_list):
    """Download the pictures with the given ids into the current directory."""
    url_list = []
    path_list = []
    for pic_id in pic_list:
        fn = pic_id + ".jpg"
        url_list.append(PHOTO_URL_BASE + fn)
        path_list.append(os.path.join(os.getcwd(), fn))
    multi_download_run(url_list, path_list, max_thread=10)


def chsubdir(dirname):
    """Enter sub-directory *dirname* of the cwd, creating it if needed."""
    subdir = os.path.join(os.getcwd(), dirname)
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    os.chdir(subdir)


def read_album(tid, name):
    """Download all pictures of album *tid* into sub-directory *name*.

    The current directory is changed to the album directory and left
    there; callers restore their own directory.
    """
    chsubdir(name)
    if IS_DEBUG:
        return
    url = ('http://tieba.baidu.com/photo/bw/picture/guide?kw=%s&tid=%s'
           '&see_lz=1&from_page=0&alt=jview&next=15'
           % (_quote_name(tbName), tid))
    pagedata = _fetch_text(url)
    found = _PIC_AMOUNT_RE.search(pagedata)
    if found is None:
        print_f(u"\u2517\u2501\u2501" + _to_text(name)
                + ' (picture count not found, skipped)')
        return
    pic_amount = int(found.group(1))
    print_f(u"\u2517\u2501\u2501" + _to_text(name) + ' '
            + found.group(1) + ' zhang')

    pic_list = _PIC_ID_RE.findall(pagedata)
    while pic_list and len(pic_list) < pic_amount:
        more = _PIC_ID_RE.findall(
            _fetch_text(url + "&prev=0&pic_id=" + pic_list[-1]))
        # Presumably the first id repeats the one the page was requested
        # from, which is why it is dropped.
        new_ids = more[1:]
        if not new_ids:
            # No progress: stop instead of requesting the same page forever.
            break
        pic_list.extend(new_ids)
    multi_download(pic_list)


def read_catalog(url, name):
    """Download every album listed under the catalog page *url*.

    When *name* is not empty the albums are stored in a sub-directory of
    that name.  Pages are requested with an increasing "pn" parameter
    until one lists no albums.
    """
    if name != '':
        chsubdir(name)
        print_f(name)
    # Fixed once: read_album() leaves the cwd inside the album directory,
    # so re-reading the cwd on each page would nest later albums there.
    root_dir = os.getcwd()
    page = 1
    while 1:
        url_page = "%s&pn=%d" % (url, page)
        result = _ALBUM_RE.findall(_fetch_text(url_page))
        if not result:
            break
        for href, cname in result:
            tid = href[3:]
            os.chdir(root_dir)
            read_album(tid, cname)
        page += 1


def read_root(url, name):
    """Download all albums of the bar whose album page is *url*.

    Creates and enters directory *name*, opens the log file there and
    walks every album category (or all albums when the bar has no
    categories).  Errors are printed rather than raised.
    """
    global TIEBA_INFO_FP
    chsubdir(name)
    info_file = None
    try:
        info_file = io.open('%s bar info.txt' % _to_text(name), "w",
                            encoding="utf-8")
        TIEBA_INFO_FP = info_file
        print_f('"%s"' % _to_text(name))
        pagedata = _fetch_text(url)

        # 1. read the total number of pictures
        result = _TOTAL_RE.findall(pagedata)
        if not result:
            print_f('Maybe this bar does not exist, '
                    'or the program cannot be used')
            return
        picture_amount_total = int(result[0])
        print_f('total picture %d' % picture_amount_total)

        # 2. try the case where the albums are split into categories
        result = _CATALOG_RE.findall(pagedata)
        root_dir = os.getcwd()
        if result:
            for a in result:
                cat_id = a[0][10:]
                cat_name = a[1]
                os.chdir(root_dir)
                read_catalog(url + "&cat_id=" + cat_id, cat_name)
        # 3. no categories: go straight to the list of all albums
        else:
            read_catalog(url + "&cat_id=all", '')
    except Exception as e:
        print(e)
    finally:
        if info_file is not None:
            info_file.close()
        TIEBA_INFO_FP = ""


def main():
    """Process each bar named on the command line, or prompt for one."""
    global tbName
    start_dir = os.getcwd()
    if len(sys.argv) > 1:
        for name in sys.argv[1:]:
            tbName = name
            print(name)
            # read_root() changes directory; start each bar from the top.
            os.chdir(start_dir)
            read_root(PGURL_BASE + _quote_name(tbName), tbName)
    else:
        tbName = _read_line("Enter bar Name:")
        read_root(PGURL_BASE + _quote_name(tbName), tbName)


if __name__ == '__main__':
    main()
Writing a Baidu Tieba album downloader in Python