Python crawler: fetch a file site's resources in full (based on Python 3.6)

Source: Internet
Author: User

<--------------------------------download function----------------------------->
import requests
import threading


# the URL of the file to download is passed in as an argument
# url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'


def handler(start, end, url, filename, address):
    # request only the byte range this thread is responsible for
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    r = requests.get(url, headers=headers, stream=True)

    # write the chunk to the corresponding position in the file
    with open(address + filename, "r+b") as fp:
        fp.seek(start)
        fp.write(r.content)


def download_file(address, url, num_thread=500):
    r = requests.head(url)
    try:
        file_name = url.split('/')[-1]
        # Content-Length gives the size of the response body; it may be missing when the
        # server answers a Connection: keep-alive request with chunked encoding
        file_size = int(r.headers['Content-Length'])
    except Exception:
        print("Check the URL, or the server does not support threaded download")
        return

    # create a file of the same size as the file to be downloaded
    fp = open(address + file_name, "wb")
    fp.truncate(file_size)
    fp.close()

    # start the threads that write the file
    part = file_size // num_thread  # if the size is not evenly divisible, the last piece gets a few extra bytes
    for i in range(num_thread):
        start = part * i
        if i == num_thread - 1:  # last piece
            end = file_size
        else:
            end = start + part

        t = threading.Thread(target=handler,
                             kwargs={'start': start, 'end': end, 'url': url,
                                     'filename': file_name, 'address': address})
        t.setDaemon(True)
        t.start()

    # wait for every download thread to finish
    main_thread = threading.current_thread()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        t.join()
    print('%s download complete' % file_name)


# if __name__ == '__main__':
#     start = datetime.datetime.now().replace(microsecond=0)
#     download_file(address, url)
#     end = datetime.datetime.now().replace(microsecond=0)
#     print("spent:", end=' ')
#     print(end - start)
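
Before splitting a download across threads, it can be worth checking whether the server actually honors byte-range requests. A minimal sketch, assuming requests is available; the helper name supports_range is not part of the original code:

import requests

def supports_range(url):
    # probe the headers only; a server that advertises "Accept-Ranges: bytes"
    # and a Content-Length can be downloaded in pieces
    r = requests.head(url, allow_redirects=True)
    accepts = r.headers.get('Accept-Ranges', '').lower() == 'bytes'
    has_length = 'Content-Length' in r.headers
    return accepts and has_length

# usage (illustrative path and URL):
# if supports_range(url):
#     download_file('d:\\info\\index/', url)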





<-------------------link function-------------------------->
def get_link(page):  # collect the href of every link in the table
    linkData = []
    for td in page.find_all('td'):
        links = td.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter condition
            data = each.get('href')
            linkData.append(data)
    return linkData
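
For reference, get_link can be exercised on its own; the sample HTML below is made up purely for illustration:

from bs4 import BeautifulSoup

html = """
<table>
  <tr><td><a href="fix/">fix/</a></td></tr>
  <tr><td><a href="nos.cbofs.romsgrid.nc">nos.cbofs.romsgrid.nc</a></td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
print(get_link(soup))  # ['fix/', 'nos.cbofs.romsgrid.nc']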





<---------------------various functions----------------->
import urllib.request

from bs4 import BeautifulSoup

from findlinks import get_link

from download import download_file

import os
import datetime
import time
import errno


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (for Python <2.5 use: except OSError, exc:)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
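# Note: on Python 3.2 and later the same effect is available directly, as an
# alternative to the helper above (not used by the original code):
# os.makedirs(path, exist_ok=True)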

# def file_down(connet, file):  # small-file download helper
#     urllib.request.urlretrieve(connet, file, schedule)


def decice(data):  # tell files and folders apart by checking for a slash in the name
    a = '/'
    if a in data:
        return 1
    else:
        return 0


def gain(url):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the page content with BeautifulSoup
    links = get_link(soup)  # collect the <a href=...> targets
    return links

def take(links, file, file_cre, connet):
    if decice(links):
        mkdir_p(file)
    else:
        start = datetime.datetime.now().replace(microsecond=0)
        download_file(file_cre, connet)
        end = datetime.datetime.now().replace(microsecond=0)
        # handler(start, end, connet, links[childlink], file_cre1)
        print("spent:", end=' ')
        print(end - start)
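
Put together, one level of the crawl looks roughly like this; the base URL matches the one used later, while the local path is just a placeholder:

from urllib.parse import urljoin
import os

base = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
local = 'd:\\info\\index' + '/'

for entry in gain(base):           # every href found on the listing page
    target = urljoin(base, entry)  # absolute URL of the entry
    # directories get a local folder, plain files are downloaded into `local`
    take(entry, os.path.join(local, entry), local, target)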












<-----------main function------------->


from urllib.parse import urljoin

from carriage import decice
from carriage import gain
from carriage import take

import os
import time




def findAll():  # main crawl function
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    links = gain(url)
    print('Fetch URL: ' + url)

    for childlink in range(len(links) - 1):
        childlink = childlink + 1  # skip the first entry of the listing
        connet = urljoin(url, links[childlink])  # build the absolute URL

        file = os.path.join('d:\\info\\index' + "/" + links[childlink])  # build the local path
        file_cre1 = os.path.join('d:\\info\\index' + "/")

        print(connet)
        take(links[childlink], file, file_cre1, connet)

        if decice(links[childlink]):
            link_next = gain(connet)  # links inside the 2nd-level page
        else:
            continue

        print("Start: %s" % time.ctime())
        time.sleep(5)
        print("End: %s" % time.ctime())

        for child_next in range(len(link_next) - 1):
            child_next = child_next + 1
            connet_next = urljoin(connet, link_next[child_next])  # build the absolute URL

            filef = os.path.join(file, link_next[child_next])  # build the local path
            file_cre2 = file

            print(connet_next)
            take(link_next[child_next], filef, file_cre2, connet_next)

            if decice(link_next[child_next]):
                link_nextf = gain(connet_next)  # links inside the 3rd-level page
            else:
                continue

            print("Start: %s" % time.ctime())
            time.sleep(5)
            print("End: %s" % time.ctime())

            for child_nextt in range(len(link_nextf) - 1):
                child_nextt = child_nextt + 1
                connet_nextt = urljoin(connet_next, link_nextf[child_nextt])

                filet = os.path.join(filef, link_nextf[child_nextt])
                file_cre3 = filef

                print(connet_nextt)
                take(link_nextf[child_nextt], filet, file_cre3, connet_nextt)
                if decice(link_nextf[child_nextt]):
                    link_nextt = gain(connet_nextt)
                else:
                    continue

                for child_nextth in range(len(link_nextt) - 1):
                    child_nextth = child_nextth + 1
                    connet_nextth = urljoin(connet_nextt, link_nextt[child_nextth])

                    fileth = os.path.join(filef, link_nextt[child_nextth])
                    file_cre4 = filet

                    print(connet_nextth)
                    take(link_nextt[child_nextth], fileth, file_cre4, connet_nextth)
                    if decice(link_nextt[child_nextth]):
                        link_nextth = gain(connet_nextth)
                    else:
                        continue

                    for child_nextfo in range(len(link_nextth) - 1):
                        child_nextfo = child_nextfo + 1
                        connet_nextfo = urljoin(connet_nextth, link_nextth[child_nextfo])

                        filefo = os.path.join(filef, link_nextth[child_nextfo])
                        file_cre5 = fileth

                        print(connet_nextfo)
                        take(link_nextth[child_nextfo], filefo, file_cre5, connet_nextfo)
                        if decice(link_nextth[child_nextfo]):
                            link_nextfo = gain(connet_nextfo)
                        else:
                            continue
                        for child_nextfi in range(len(link_nextfo) - 1):
                            child_nextfi = child_nextfi + 1
                            connet_nextfi = urljoin(connet_nextfo, link_nextfo[child_nextfi])

                            filefi = os.path.join(filefo, link_nextfo[child_nextfi])
                            file_cre6 = filefo

                            print(connet_nextfi)
                            take(link_nextfo[child_nextfi], filefi, file_cre6, connet_nextfi)
                            if decice(link_nextfo[child_nextfi]):
                                link_nextfi = gain(connet_nextfi)
                            else:
                                continue
                            for child_nextsi in range(len(link_nextfi) - 1):
                                child_nextsi = child_nextsi + 1
                                connet_nextsi = urljoin(connet_nextfi, link_nextfi[child_nextsi])

                                filesi = os.path.join(filefi, link_nextfi[child_nextsi])
                                file_cre7 = filefi

                                print(connet_nextsi)
                                take(link_nextfi[child_nextsi], filesi, file_cre7, connet_nextsi)
                                if decice(link_nextfi[child_nextsi]):
                                    link_nextsi = gain(connet_nextsi)
                                else:
                                    continue
                                for child_nextse in range(len(link_nextsi) - 1):
                                    child_nextse = child_nextse + 1
                                    connet_nextse = urljoin(connet_nextsi, link_nextsi[child_nextse])

                                    filese = os.path.join(filesi, link_nextsi[child_nextse])
                                    file_cre8 = filesi

                                    print(connet_nextse)
                                    take(link_nextsi[child_nextse], filese, file_cre8, connet_nextse)
                                    if decice(link_nextsi[child_nextse]):
                                        link_nextse = gain(connet_nextse)
                                    else:
                                        continue
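
The eight nested loops above repeat the same step at every level: fetch a listing page, create a folder for each directory entry, download each file entry, then descend. Purely as an illustration, the same traversal can be sketched recursively; this variant is not part of the original code and assumes the carriage module shown above:

from urllib.parse import urljoin
import os
import time

from carriage import decice, gain, take


def crawl(url, local, depth=8):
    # mirror `url` into the folder `local`, descending at most `depth` levels
    if depth == 0:
        return
    links = gain(url)
    for entry in links[1:]:              # skip the first href of the listing
        target = urljoin(url, entry)
        path = os.path.join(local, entry)
        print(target)
        take(entry, path, local, target)
        if decice(entry):                # a directory: pause briefly, then recurse
            time.sleep(5)
            crawl(target, path, depth - 1)

# usage (same start URL and local path as findAll):
# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'd:\\info\\index' + '/')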

<---------------------entry point----------------------->
from findall import findAll  # assumed module name; import findAll from whichever file holds the section above


if __name__ == '__main__':
    findAll()
