Python crawler: fetch a file site's resources in full (based on Python 3.6)

Source: Internet
Author: User

<--------------------------------download function----------------------------->
import requests
import threading


# the URL of the file to download is passed in as an argument
# url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'


def handler(start, end, url, filename, address):
    # request only the byte range this thread is responsible for
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    r = requests.get(url, headers=headers, stream=True)

    # write the chunk to the corresponding position in the file
    with open(address + filename, "r+b") as fp:
        fp.seek(start)
        fp.write(r.content)


def download_file(address, url, num_thread=500):
    r = requests.head(url)
    try:
        file_name = url.split('/')[-1]
        # Content-Length gives the size of the response body; it may be missing when the
        # server answers a Connection: keep-alive request with chunked encoding
        file_size = int(r.headers['Content-Length'])
    except Exception:
        print("Check the URL, or the server does not support threaded download")
        return

    # create a file of the same size as the file to be downloaded
    fp = open(address + file_name, "wb")
    fp.truncate(file_size)
    fp.close()

    # start the threads that write the file
    part = file_size // num_thread  # if the size is not evenly divisible, the last piece gets a few extra bytes
    for i in range(num_thread):
        start = part * i
        if i == num_thread - 1:  # last piece
            end = file_size
        else:
            end = start + part

        t = threading.Thread(target=handler,
                             kwargs={'start': start, 'end': end, 'url': url,
                                     'filename': file_name, 'address': address})
        t.setDaemon(True)
        t.start()

    # wait for every download thread to finish
    main_thread = threading.current_thread()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        t.join()
    print('%s download complete' % file_name)


# if __name__ == '__main__':
#     start = datetime.datetime.now().replace(microsecond=0)
#     download_file(address, url)
#     end = datetime.datetime.now().replace(microsecond=0)
#     print("spent:", end=' ')
#     print(end - start)
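
Before splitting a download across threads, it can be worth checking whether the server actually honors byte-range requests. A minimal sketch, assuming requests is available; the helper name supports_range is not part of the original code:

import requests

def supports_range(url):
    # probe the headers only; a server that advertises "Accept-Ranges: bytes"
    # and a Content-Length can be downloaded in pieces
    r = requests.head(url, allow_redirects=True)
    accepts = r.headers.get('Accept-Ranges', '').lower() == 'bytes'
    has_length = 'Content-Length' in r.headers
    return accepts and has_length

# usage (illustrative path and URL):
# if supports_range(url):
#     download_file('d:\\info\\index/', url)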





<-------------------link function-------------------------->
def get_link(page):  # collect the href of every link in the table
    linkData = []
    for td in page.find_all('td'):
        links = td.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter condition
            data = each.get('href')
            linkData.append(data)
    return linkData
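
For reference, get_link can be exercised on its own; the sample HTML below is made up purely for illustration:

from bs4 import BeautifulSoup

html = """
<table>
  <tr><td><a href="fix/">fix/</a></td></tr>
  <tr><td><a href="nos.cbofs.romsgrid.nc">nos.cbofs.romsgrid.nc</a></td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
print(get_link(soup))  # ['fix/', 'nos.cbofs.romsgrid.nc']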





<---------------------various functions----------------->
import urllib.request

from bs4 import BeautifulSoup

from findlinks import get_link

from download import download_file

import os
import datetime
import time
import errno


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (for Python <2.5 use: except OSError, exc:)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
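# Note: on Python 3.2 and later the same effect is available directly, as an
# alternative to the helper above (not used by the original code):
# os.makedirs(path, exist_ok=True)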

# def file_down(connet, file):  # small-file download helper
#     urllib.request.urlretrieve(connet, file, schedule)


def decice(data):  # tell files and folders apart by checking for a slash in the name
    a = '/'
    if a in data:
        return 1
    else:
        return 0


def gain(url):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the page content with BeautifulSoup
    links = get_link(soup)  # collect the <a href=...> targets
    return links

def take(links, file, file_cre, connet):
    if decice(links):
        mkdir_p(file)
    else:
        start = datetime.datetime.now().replace(microsecond=0)
        download_file(file_cre, connet)
        end = datetime.datetime.now().replace(microsecond=0)
        # handler(start, end, connet, links[childlink], file_cre1)
        print("spent:", end=' ')
        print(end - start)
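
Put together, one level of the crawl looks roughly like this; the base URL matches the one used later, while the local path is just a placeholder:

from urllib.parse import urljoin
import os

base = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
local = 'd:\\info\\index' + '/'

for entry in gain(base):           # every href found on the listing page
    target = urljoin(base, entry)  # absolute URL of the entry
    # directories get a local folder, plain files are downloaded into `local`
    take(entry, os.path.join(local, entry), local, target)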












<-----------main function------------->


from urllib.parse import urljoin

from carriage import decice
from carriage import gain
from carriage import take

import os
import time




def findAll():  # main crawl function
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    links = gain(url)
    print('Fetch URL: ' + url)

    for childlink in range(len(links) - 1):
        childlink = childlink + 1  # skip the first entry of the listing
        connet = urljoin(url, links[childlink])  # build the absolute URL

        file = os.path.join('d:\\info\\index' + "/" + links[childlink])  # build the local path
        file_cre1 = os.path.join('d:\\info\\index' + "/")

        print(connet)
        take(links[childlink], file, file_cre1, connet)

        if decice(links[childlink]):
            link_next = gain(connet)  # links inside the 2nd-level page
        else:
            continue

        print("Start: %s" % time.ctime())
        time.sleep(5)
        print("End: %s" % time.ctime())

        for child_next in range(len(link_next) - 1):
            child_next = child_next + 1
            connet_next = urljoin(connet, link_next[child_next])  # build the absolute URL

            filef = os.path.join(file, link_next[child_next])  # build the local path
            file_cre2 = file

            print(connet_next)
            take(link_next[child_next], filef, file_cre2, connet_next)

            if decice(link_next[child_next]):
                link_nextf = gain(connet_next)  # links inside the 3rd-level page
            else:
                continue

            print("Start: %s" % time.ctime())
            time.sleep(5)
            print("End: %s" % time.ctime())

            for child_nextt in range(len(link_nextf) - 1):
                child_nextt = child_nextt + 1
                connet_nextt = urljoin(connet_next, link_nextf[child_nextt])

                filet = os.path.join(filef, link_nextf[child_nextt])
                file_cre3 = filef

                print(connet_nextt)
                take(link_nextf[child_nextt], filet, file_cre3, connet_nextt)
                if decice(link_nextf[child_nextt]):
                    link_nextt = gain(connet_nextt)
                else:
                    continue

                for child_nextth in range(len(link_nextt) - 1):
                    child_nextth = child_nextth + 1
                    connet_nextth = urljoin(connet_nextt, link_nextt[child_nextth])

                    fileth = os.path.join(filef, link_nextt[child_nextth])
                    file_cre4 = filet

                    print(connet_nextth)
                    take(link_nextt[child_nextth], fileth, file_cre4, connet_nextth)
                    if decice(link_nextt[child_nextth]):
                        link_nextth = gain(connet_nextth)
                    else:
                        continue

                    for child_nextfo in range(len(link_nextth) - 1):
                        child_nextfo = child_nextfo + 1
                        connet_nextfo = urljoin(connet_nextth, link_nextth[child_nextfo])

                        filefo = os.path.join(filef, link_nextth[child_nextfo])
                        file_cre5 = fileth

                        print(connet_nextfo)
                        take(link_nextth[child_nextfo], filefo, file_cre5, connet_nextfo)
                        if decice(link_nextth[child_nextfo]):
                            link_nextfo = gain(connet_nextfo)
                        else:
                            continue
                        for child_nextfi in range(len(link_nextfo) - 1):
                            child_nextfi = child_nextfi + 1
                            connet_nextfi = urljoin(connet_nextfo, link_nextfo[child_nextfi])

                            filefi = os.path.join(filefo, link_nextfo[child_nextfi])
                            file_cre6 = filefo

                            print(connet_nextfi)
                            take(link_nextfo[child_nextfi], filefi, file_cre6, connet_nextfi)
                            if decice(link_nextfo[child_nextfi]):
                                link_nextfi = gain(connet_nextfi)
                            else:
                                continue
                            for child_nextsi in range(len(link_nextfi) - 1):
                                child_nextsi = child_nextsi + 1
                                connet_nextsi = urljoin(connet_nextfi, link_nextfi[child_nextsi])

                                filesi = os.path.join(filefi, link_nextfi[child_nextsi])
                                file_cre7 = filefi

                                print(connet_nextsi)
                                take(link_nextfi[child_nextsi], filesi, file_cre7, connet_nextsi)
                                if decice(link_nextfi[child_nextsi]):
                                    link_nextsi = gain(connet_nextsi)
                                else:
                                    continue
                                for child_nextse in range(len(link_nextsi) - 1):
                                    child_nextse = child_nextse + 1
                                    connet_nextse = urljoin(connet_nextsi, link_nextsi[child_nextse])

                                    filese = os.path.join(filesi, link_nextsi[child_nextse])
                                    file_cre8 = filesi

                                    print(connet_nextse)
                                    take(link_nextsi[child_nextse], filese, file_cre8, connet_nextse)
                                    if decice(link_nextsi[child_nextse]):
                                        link_nextse = gain(connet_nextse)
                                    else:
                                        continue
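
The eight nested loops above repeat the same step at every level: fetch a listing page, create a folder for each directory entry, download each file entry, then descend. Purely as an illustration, the same traversal can be sketched recursively; this variant is not part of the original code and assumes the carriage module shown above:

from urllib.parse import urljoin
import os
import time

from carriage import decice, gain, take


def crawl(url, local, depth=8):
    # mirror `url` into the folder `local`, descending at most `depth` levels
    if depth == 0:
        return
    links = gain(url)
    for entry in links[1:]:              # skip the first href of the listing
        target = urljoin(url, entry)
        path = os.path.join(local, entry)
        print(target)
        take(entry, path, local, target)
        if decice(entry):                # a directory: pause briefly, then recurse
            time.sleep(5)
            crawl(target, path, depth - 1)

# usage (same start URL and local path as findAll):
# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'd:\\info\\index' + '/')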

<---------------------entry point----------------------->
from findall import findAll  # assumed module name; import findAll from whichever file holds the section above


if __name__ == '__main__':
    findAll()
