Writing a Baidu Tieba album downloader in Python

Last Update:2014-10-29 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more >

I suddenly wanted a tool like this, so I wrote one bit by bit without much of a plan. It was originally two files, which are now merged into one.

The code is rather messy and I am too lazy to tidy it up, but it works, and that is good enough.

The download part uses multiple threads, but each individual file is still downloaded by a single thread; that is, it does not use the HTTP Range header to split one file across threads. The speed does not seem to be very good.

The first problem I ran into was that using urllib's urlretrieve directly, or calling urlopen and then read(), always produced wrong or truncated data. I then considered bundling wget.exe to do the downloading, but decided it was not worth it, so each file is now read in several blocks, which should work.

In addition, Baidu albums load their pages dynamically with Ajax, so the URL shown in the browser, which contains # or #!, cannot be opened directly with urllib: whatever follows the # is never sent to the server.

Later, using the browser's Inspect Element tool and switching to its network panel, I found the URL that is actually requested. It simply passes ordinary parameters, which is much easier to deal with.

Never mind, here is the code.

PS: I recently discovered that the American version of "Fast forward" is very good, and I have been watching it lately.

"""Download every picture from the photo albums of a Baidu Tieba bar.

This is a reconstruction of a listing that was collapsed onto a few lines and
mangled by machine translation.  It is written for Python 3 (the original was
Python 2 and used ``urllib2``, ``raw_input`` and ``print`` statements).

The original ``# -*- coding: cp936 -*-`` cookie is intentionally omitted: the
Chinese string literals it covered were lost in translation, and the one
non-ASCII literal below is meant to be read with Python 3's default UTF-8
source encoding.

Usage::

    python tieba_album.py BAR_NAME [BAR_NAME ...]

Without arguments the bar name is read from standard input.  A directory per
bar, per album category and per album is created under the current directory.
"""
import os
import os.path
import re
import sys
import threading
import time  # kept from the original import list (currently unused)
import urllib
from urllib.parse import quote
from urllib.request import urlopen

IS_DEBUG = False        # when True, albums are created as directories only
MULTI_THREAD = False    # kept from the original listing (unused)
MAX_THREAD = 20         # kept from the original listing (unused)
tieba_info_fp = None    # open info/log file while read_root() is running
tbName = ""             # bar currently being processed (set by main())
tieba_url_base = "http://tieba.baidu.com"
pgUrl_base = "http://tieba.baidu.com/photo/g?kw="
photo_url_base = "http://imgsrc.baidu.com/forum/pic/item/"
BLOCK_SIZE = 4096       # bytes read from the network per iteration
HTTP_TIMEOUT = 30       # seconds; keeps a stalled server from hanging a worker
threadLock = threading.Lock()

# NOTE(review): the original sent the bar name as raw cp936 bytes and used the
# raw page bytes as directory names, so GBK is presumably what the site
# expected at the time -- confirm against the live service.
PAGE_ENCODING = "gbk"

_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
_PIC_AMOUNT_RE = re.compile(r'"pic_amount":(\d+),')
_PIC_ID_RE = re.compile(r'"original":{"id":"(\S+?)"')
_ALBUM_RE = re.compile(
    r'<div class="grbm_ele_title.+?href="(\S+?)".+?title="(.+?)"', re.S)
# The original pattern matched literal Chinese text on both sides of the
# number; that text did not survive translation, so any non-digit text is
# accepted around the count instead.
_TOTAL_RE = re.compile(
    r'<div class="picture_amount_total">\D*?(\d+)\D*?</div>', re.S)
_CATALOG_RE = re.compile(
    r'<li class="catalog_li_normal.+?href="(\S+?)".+?catalog_a_inner">(.+?)'
    r'<span class="catalog_a_amount">\((\d+?)\)</span>', re.S)


def _quote_kw(name):
    """Percent-encode a bar name for use as the ``kw`` query parameter."""
    return quote(name, safe="", encoding=PAGE_ENCODING, errors="replace")


def _safe_dirname(name):
    """Return *name* with characters that are invalid in file names replaced."""
    cleaned = _INVALID_NAME_CHARS.sub("_", name).strip().rstrip(".")
    return cleaned or "_"


def _fetch_text(url):
    """Fetch *url* and return the body decoded to ``str``.

    The charset announced by the server is used when present; otherwise
    ``PAGE_ENCODING`` is assumed.  Undecodable bytes are replaced.
    """
    with urlopen(url, timeout=HTTP_TIMEOUT) as resp:
        raw = resp.read()
        charset = resp.headers.get_content_charset() or PAGE_ENCODING
    try:
        return raw.decode(charset, "replace")
    except LookupError:  # server announced a charset Python does not know
        return raw.decode(PAGE_ENCODING, "replace")


def print_f(msg):
    """Print *msg* to the screen and, if the info file is open, append it."""
    print(msg)
    if tieba_info_fp is not None:
        tieba_info_fp.write(msg + '\n')
        tieba_info_fp.flush()


def download_file(url, path):
    """Download *url* to *path* in ``BLOCK_SIZE`` chunks.

    Nothing is done when the target file already exists.  When *path* is
    empty the name is taken from the ``Content-Disposition`` header or, as a
    last resort, from the last URL segment.

    Raises:
        IOError: the server closed the connection before ``Content-Length``
            bytes were received.  Any partially written file is removed
            before an exception propagates.
    """
    if os.path.isfile(path):
        return
    with urlopen(url, timeout=HTTP_TIMEOUT) as resp:
        headers = resp.info()
        if path != "":
            file_name = path
        else:
            disposition = headers.get('content-disposition')
            if disposition and 'filename=' in disposition:
                file_name = disposition.split('filename=')[1]
                file_name = file_name.replace('"', '').replace("'", "")
                # The header is server-controlled: never let it pick a
                # directory.
                file_name = os.path.basename(file_name)
            else:
                file_name = url[url.rfind('/') + 1:]
        if os.path.isfile(file_name):
            return

        length = headers.get('content-length')
        expected = int(length) if length is not None else None
        received = 0
        try:
            with open(file_name, 'wb') as out:
                while True:
                    chunk = resp.read(BLOCK_SIZE)
                    if not chunk:
                        break
                    out.write(chunk)
                    received += len(chunk)
            if expected is not None and received < expected:
                raise IOError("time out: received %d of %d bytes"
                              % (received, expected))
        except Exception:
            # Do not leave a truncated picture behind; it would be treated
            # as "already downloaded" on the next run.
            if os.path.exists(file_name):
                os.remove(file_name)
            raise


class MultiDownload(threading.Thread):
    """Worker thread that downloads items from a shared ``DData`` queue."""

    def __init__(self, dat):
        threading.Thread.__init__(self)
        self.dat = dat

    def run(self):
        """Claim and download items until none is pending."""
        while True:
            pos, url, path = self.dat.start_one()
            if pos is None:
                break
            try:
                download_file(url, path)
            except Exception as e:
                # Hand the item back; DData gives up after max_retries.
                self.dat.renew_one(pos)
                print(url, e)
            else:
                self.dat.end_one(pos)


class DData:
    """Thread-shared download list with a per-item status.

    Status values: ``'0'`` pending, ``'1'`` downloading, ``'2'`` finished,
    ``'3'`` abandoned after ``max_retries`` failed attempts.
    """

    def __init__(self, max_retries=3):
        self.pos = 0            # number of items added
        self.url = []
        self.path = []
        self.status = []
        self.failures = []      # failed attempts per item
        self.max_retries = max_retries

    def add(self, url, path):
        """Append a pending download of *url* to *path*."""
        self.url.append(url)
        self.path.append(path)
        self.status.append('0')
        self.failures.append(0)
        self.pos += 1

    def start_one(self):
        """Claim one pending item and mark it as downloading.

        Returns:
            ``(pos, url, path)`` for the claimed item, or
            ``(None, None, None)`` when nothing is pending.
        """
        # Lookup and update happen under one lock so that two workers can
        # never claim the same item.
        with threadLock:
            try:
                pos = self.status.index('0')
            except ValueError:
                return None, None, None
            self.status[pos] = '1'
            return pos, self.url[pos], self.path[pos]

    def end_one(self, pos):
        """Mark item *pos* as finished."""
        with threadLock:
            self.status[pos] = '2'

    def renew_one(self, pos):
        """Record a failed attempt for item *pos*.

        The item becomes pending again unless it has now failed
        ``max_retries`` times, in which case it is abandoned so that a
        permanently broken URL cannot keep the workers looping forever.
        """
        with threadLock:
            self.failures[pos] += 1
            if self.failures[pos] >= self.max_retries:
                self.status[pos] = '3'
            else:
                self.status[pos] = '0'


def multi_download_run(url_list, path_list=None, max_thread=10):
    """Download every URL in *url_list* using up to *max_thread* threads.

    Args:
        url_list: URLs to fetch.
        path_list: target paths, parallel to *url_list*.  When omitted or
            empty each file is stored in the current directory under the
            last segment of its URL.
        max_thread: upper bound on the number of worker threads.
    """
    dat = DData()
    for i, url in enumerate(url_list):
        if not path_list:
            fn = url[url.rfind('/') + 1:]
            path = os.path.join(os.getcwd(), fn)
        else:
            path = path_list[i]
        dat.add(url, path)

    # No point in starting more workers than there are files.
    threads = [MultiDownload(dat)
               for _ in range(min(max_thread, len(url_list)))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


def multi_download(pic_list):
    """Download the pictures whose ids are in *pic_list* as ``<id>.jpg``."""
    url_list = []
    path_list = []
    for pic_id in pic_list:
        fn = pic_id + ".jpg"
        url_list.append(photo_url_base + fn)
        path_list.append(os.path.join(os.getcwd(), fn))
    multi_download_run(url_list, path_list, max_thread=10)


def chsubdir(dirname):
    """Change into sub-directory *dirname*, creating it if it does not exist.

    Characters that cannot appear in a file name are replaced first, because
    the names come from page titles.
    """
    subdir = os.path.join(os.getcwd(), _safe_dirname(dirname))
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    os.chdir(subdir)


def read_album(tid, name):
    """Download all pictures of album *tid* into sub-directory *name*."""
    chsubdir(name)
    if IS_DEBUG:
        return
    url = ('http://tieba.baidu.com/photo/bw/picture/guide?kw=%s&tid=%s'
           '&see_lz=1&from_page=0&alt=jview&next=15'
           % (_quote_kw(tbName), tid))
    page_data = _fetch_text(url)
    found = _PIC_AMOUNT_RE.search(page_data)
    if found is None:
        print_f("┗━━" + name + ' (picture count not found, skipped)')
        return
    pic_amount = int(found.group(1))
    print_f("┗━━" + name + ' ' + found.group(1) + ' pictures')

    pic_list = _PIC_ID_RE.findall(page_data)
    while pic_list and len(pic_list) < pic_amount:
        # Ask for the pictures following the last one already known.  The
        # first id of the answer is skipped, as in the original; presumably
        # it repeats the anchor picture.
        next_url = url + "&prev=0&pic_id=" + pic_list[-1]
        more = _PIC_ID_RE.findall(_fetch_text(next_url))[1:]
        if not more:
            break  # no progress: stop instead of requesting forever
        pic_list = pic_list + more
    multi_download(pic_list)


def read_catalog(url, name):
    """Walk every page of the album list at *url* and download each album.

    When *name* is non-empty the albums are stored in a sub-directory of
    that name.
    """
    if name != '':
        chsubdir(name)
        print_f(name)
    # Remember the catalog directory once; read_album() changes directory.
    root_dir = os.getcwd()
    seen = set()
    page = 1
    while True:
        page_data = _fetch_text("%s&pn=%d" % (url, page))
        new_albums = 0
        for href, title in _ALBUM_RE.findall(page_data):
            if href in seen:
                continue
            seen.add(href)
            new_albums += 1
            os.chdir(root_dir)
            # NOTE(review): href[3:] drops a three-character prefix to get
            # the thread id, as the original did -- confirm against the live
            # markup.
            read_album(href[3:], title)
        if new_albums == 0:
            # An empty page, or one that only repeats known albums, ends
            # the walk.
            break
        page += 1


def read_root(url, name):
    """Process bar *name*, whose photo index page is *url*.

    Creates the bar directory, writes an info file into it and downloads all
    albums, by category when the bar has categories.  Errors are printed and
    not propagated.
    """
    global tieba_info_fp
    chsubdir(name)
    try:
        info_name = '%s bar info.txt' % _safe_dirname(name)
        with open(info_name, "w", encoding="utf-8") as fp:
            tieba_info_fp = fp
            print_f('"%s"' % name)
            page_data = _fetch_text(url)

            # 1. total number of pictures
            result = _TOTAL_RE.findall(page_data)
            if not result:
                print_f('Maybe this bar does not exist, '
                        'or the program cannot be used')
                return
            print_f('Total pictures: %d' % int(result[0]))

            # 2. bars whose albums are grouped into categories
            catalogs = _CATALOG_RE.findall(page_data)
            root_dir = os.getcwd()
            if catalogs:
                for entry in catalogs:
                    # NOTE(review): [10:] drops a fixed-length href prefix,
                    # as the original did -- confirm against the live markup.
                    cat_id = entry[0][10:]
                    cat_name = entry[1]
                    os.chdir(root_dir)
                    read_catalog(url + "&cat_id=" + cat_id, cat_name)
            # 3. no categories: list all albums directly
            else:
                read_catalog(url + "&cat_id=all", '')
    except Exception as e:
        # Best effort, as in the original: report and move on to the next
        # bar.
        print(e)
    finally:
        tieba_info_fp = None


def main():
    """Process the bars named on the command line, or prompt for one."""
    global tbName
    from_argv = len(sys.argv) > 1
    names = sys.argv[1:] if from_argv else [input("Enter bar name: ")]
    start_dir = os.getcwd()
    for bar in names:
        tbName = bar
        if from_argv:
            print(bar)
        # read_root() leaves the process inside the bar directory; go back
        # so that the next bar is not nested inside the previous one.
        os.chdir(start_dir)
        read_root(pgUrl_base + _quote_kw(bar), bar)


if __name__ == '__main__':
    main()

Python write Baidu paste album download

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Python write Baidu paste album download

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Python write Baidu paste album download

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support