<--------------------------------download function----------------------------->
import requests
import threading

# Incoming command-line argument: the URL of the file to download
# url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'


def handler(start, end, url, filename, address):
    # Ask the server for just the byte range [start, end]; Range is inclusive,
    # so adjacent chunks overlap by one byte, which is harmlessly overwritten.
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    r = requests.get(url, headers=headers, stream=True)
    # Write the chunk at its corresponding offset in the file
    with open(address + filename, "r+b") as fp:
        fp.seek(start)
        fp.write(r.content)
def download_file(address, url, num_thread=500):
    r = requests.head(url)
    try:
        file_name = url.split('/')[-1]
        # Content-Length gives the size of the response body; it is missing when the
        # HTTP server streams the response (e.g. chunked transfer with Connection: keep-alive)
        file_size = int(r.headers['Content-Length'])
    except (KeyError, ValueError):
        print("Check the URL, or the server does not support ranged (multi-threaded) download")
        return
    # Create a placeholder file of the same size as the file to be downloaded
    fp = open(address + file_name, "wb")
    fp.truncate(file_size)
    fp.close()
    # Start the multi-threaded writers
    part = file_size // num_thread  # if not evenly divisible, the last chunk takes the extra bytes
    # e.g. file_size = 1003 and num_thread = 4 -> part = 250; chunks are 0-250, 250-500, 500-750, 750-1003
    for i in range(num_thread):
        start = part * i
        if i == num_thread - 1:  # last chunk
            end = file_size
        else:
            end = start + part
        t = threading.Thread(target=handler,
                             kwargs={'start': start, 'end': end, 'url': url,
                                     'filename': file_name, 'address': address})
        t.setDaemon(True)
        t.start()
    # Wait for all download threads to finish
    main_thread = threading.current_thread()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        t.join()
    print('%s download complete' % file_name)
# if __name__ == '__main__':
#     start = datetime.datetime.now().replace(microsecond=0)
#     download_file(url)
#     end = datetime.datetime.now().replace(microsecond=0)
#     print("spent:", end=' ')
#     print(end - start)
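A minimal usage sketch for download_file; the local directory, trailing separator, and thread count below are illustrative assumptions (the directory must already exist, since handler() opens address + filename directly):

url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'
download_file('d:\\info\\index/', url, num_thread=8)  # assumed path and thread count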
<-------------------link function-------------------------->
def get_link(page):  # collect the href target of every link on the page
    linkdata = []
    for td in page.find_all('td'):
        links = td.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter for links starting with '/'
            data = each.get('href')
            linkdata.append(data)
    return linkdata
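A small self-contained check of get_link, using an inline HTML snippet in place of a real directory listing (the markup here is invented for illustration):

from bs4 import BeautifulSoup

html = ('<table><tr>'
        '<td><a href="fix/">fix/</a></td>'
        '<td><a href="nos.cbofs.romsgrid.nc">nos.cbofs.romsgrid.nc</a></td>'
        '</tr></table>')
soup = BeautifulSoup(html, 'lxml')
print(get_link(soup))  # -> ['fix/', 'nos.cbofs.romsgrid.nc']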
<---------------------various functions----------------->
import urllib.request
from bs4 import BeautifulSoup
from findlinks import get_link
from download import download_file
import os
import datetime
import time
import errno
def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" for Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
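On Python 3.2 and later the same effect is available directly from the standard library, so a simpler equivalent would be:

def mkdir_p(path):
    os.makedirs(path, exist_ok=True)  # no error if the directory already exists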
# def file_down(connet, file):  # small-file download helper
#     urllib.request.urlretrieve(connet, file, schedule)


def decice(data):  # tell folders from files by checking for a slash
    a = '/'
    if a in data:
        return 1
    else:
        return 0
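Because decice only tests whether the link text contains a slash, directory entries from the listing return 1 and bare file names return 0:

print(decice('cbofs/'))                 # 1 -> treated as a folder, so take() creates a directory
print(decice('nos.cbofs.romsgrid.nc'))  # 0 -> treated as a file, so take() downloads it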
def gain(url):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the page content with BeautifulSoup
    links = get_link(soup)  # collect the <a href=...> targets
    return links
def take(links, file, file_cre, connet):
    if decice(links):
        mkdir_p(file)
    else:
        start = datetime.datetime.now().replace(microsecond=0)
        download_file(file_cre, connet)
        end = datetime.datetime.now().replace(microsecond=0)
        # handler(start, end, connet, links[childlink], file_cre1)
        print("spent:", end=' ')
        print(end - start)
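A sketch of how gain and take cooperate for a single directory level, using the same base URL and local root as the main function below (illustrative only):

from urllib.parse import urljoin

base_url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
base_dir = 'd:\\info\\index' + "/"

for link in gain(base_url)[1:]:        # skip the first entry (the parent-directory link)
    connet = urljoin(base_url, link)   # absolute URL of the child entry
    take(link, os.path.join(base_dir, link), base_dir, connet)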
<-----------main function------------->
from urllib.parse import urljoin
from carriage import decice
from carriage import gain
from carriage import take
import os
import time
def findAll():  # main function
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    links = gain(url)
    print('Fetch url: ' + url)
    for childlink in range(len(links) - 1):
        childlink = childlink + 1
        connet = urljoin(url, links[childlink])  # join the base URL with the relative link
        file = os.path.join('d:\\info\\index' + "/" + links[childlink])  # build the absolute local path
        file_cre1 = os.path.join('d:\\info\\index' + "/")
        print(connet)
        take(links[childlink], file, file_cre1, connet)
        if decice(links[childlink]):
            link_next = gain(connet)  # links inside the 2nd-level listing
        else:
            continue
        print("Start: %s" % time.ctime())
        time.sleep(5)
        print("End: %s" % time.ctime())
        for child_next in range(len(link_next) - 1):
            child_next = child_next + 1
            connet_next = urljoin(connet, link_next[child_next])  # join the URL path
            filef = os.path.join(file, link_next[child_next])  # join the local path
            file_cre2 = file
            print(connet_next)
            take(link_next[child_next], filef, file_cre2, connet_next)
            if decice(link_next[child_next]):
                link_nextf = gain(connet_next)  # links inside the 3rd-level listing
            else:
                continue
            print("Start: %s" % time.ctime())
            time.sleep(5)
            print("End: %s" % time.ctime())
            for child_nextt in range(len(link_nextf) - 1):
                child_nextt = child_nextt + 1
                connet_nextt = urljoin(connet_next, link_nextf[child_nextt])
                filet = os.path.join(filef, link_nextf[child_nextt])
                file_cre3 = filef
                print(connet_nextt)
                take(link_nextf[child_nextt], filet, file_cre3, connet_nextt)
                if decice(link_nextf[child_nextt]):
                    link_nextt = gain(connet_nextt)
                else:
                    continue
                for child_nextth in range(len(link_nextt) - 1):
                    child_nextth = child_nextth + 1
                    connet_nextth = urljoin(connet_nextt, link_nextt[child_nextth])
                    fileth = os.path.join(filef, link_nextt[child_nextth])
                    file_cre4 = filet
                    print(connet_nextth)
                    take(link_nextt[child_nextth], fileth, file_cre4, connet_nextth)
                    if decice(link_nextt[child_nextth]):
                        link_nextth = gain(connet_nextth)
                    else:
                        continue
                    for child_nextfo in range(len(link_nextth) - 1):
                        child_nextfo = child_nextfo + 1
                        connet_nextfo = urljoin(connet_nextth, link_nextth[child_nextfo])
                        filefo = os.path.join(filef, link_nextth[child_nextfo])
                        file_cre5 = fileth
                        print(connet_nextfo)
                        take(link_nextth[child_nextfo], filefo, file_cre5, connet_nextfo)
                        if decice(link_nextth[child_nextfo]):
                            link_nextfo = gain(connet_nextfo)
                        else:
                            continue
                        for child_nextfi in range(len(link_nextfo) - 1):
                            child_nextfi = child_nextfi + 1
                            connet_nextfi = urljoin(connet_nextfo, link_nextfo[child_nextfi])
                            filefi = os.path.join(filefo, link_nextfo[child_nextfi])
                            file_cre6 = filefo
                            print(connet_nextfi)
                            take(link_nextfo[child_nextfi], filefi, file_cre6, connet_nextfi)
                            if decice(link_nextfo[child_nextfi]):
                                link_nextfi = gain(connet_nextfi)
                            else:
                                continue
                            for child_nextsi in range(len(link_nextfi) - 1):
                                child_nextsi = child_nextsi + 1
                                connet_nextsi = urljoin(connet_nextfi, link_nextfi[child_nextsi])
                                filesi = os.path.join(filefi, link_nextfi[child_nextsi])
                                file_cre7 = filefi
                                print(connet_nextsi)
                                take(link_nextfi[child_nextsi], filesi, file_cre7, connet_nextsi)
                                if decice(link_nextfi[child_nextsi]):
                                    link_nextsi = gain(connet_nextsi)
                                else:
                                    continue
                                for child_nextse in range(len(link_nextsi) - 1):
                                    child_nextse = child_nextse + 1
                                    connet_nextse = urljoin(connet_nextsi, link_nextsi[child_nextse])
                                    filese = os.path.join(filesi, link_nextsi[child_nextse])
                                    file_cre8 = filesi
                                    print(connet_nextse)
                                    take(link_nextsi[child_nextse], filese, file_cre8, connet_nextse)
                                    if decice(link_nextsi[child_nextse]):
                                        link_nextse = gain(connet_nextse)
                                    else:
                                        continue
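The eight nested loops above repeat the same per-level pattern (join the URL, join the local path, call take, then descend if the entry is a folder). As a design note, the same traversal can be written recursively; a minimal sketch assuming the gain, take, and decice helpers and the imports above:

def crawl(url, local_dir, depth=0, max_depth=8):
    # Walk one directory listing and recurse into sub-folders, mirroring the loops above.
    for link in gain(url)[1:]:                  # skip the parent-directory entry
        connet = urljoin(url, link)             # absolute URL of the child entry
        target = os.path.join(local_dir, link)  # local folder or file path
        take(link, target, local_dir, connet)   # create the folder or download the file
        if decice(link) and depth < max_depth:
            crawl(connet, target, depth + 1, max_depth)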
<---------------------program entry----------------------->
from findall import findAll  # assumed module name; import from whichever file defines findAll()
if __name__ == '__main__':
    findAll()
Python crawler that fetches all file resources from a website, full version (based on Python 3.6).