import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from cat.findlinks import get_link
from cat.load import Schedule
import os
import time
import errno

# -------- the package code imported above (get_link and Schedule), shown here for reference --------
def get_link(page):  # collect the href of every link inside the table cells
    link_data = []
    for td in page.find_all('td'):
        links = td.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter
            data = each.get('href')
            link_data.append(data)
    return link_data
def Schedule(a, b, c):  # progress display while downloading large files
    '''
    a: number of data blocks already downloaded
    b: size of each data block
    c: size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
# -------------------- End --------------------
def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" for Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def file_down(connet, file):  # download one remote file, reporting progress via Schedule
    urllib.request.urlretrieve(connet, file, Schedule)

def decice(data):  # return 1 if the link contains '/', i.e. points at a directory
    a = '/'
    if a in data:
        return 1
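decice is the directory-versus-file test used at every level of findAll below: a link whose href contains '/' is treated as a sub-directory to create locally, anything else as a file to download. A quick illustration (the hrefs are made up for the example):

# illustrative hrefs only
decice('fix/')        # -> 1: contains '/', treated as a directory
decice('README.txt')  # -> None: no '/', treated as a downloadable file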
def findAll():  # main function
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the page source with BeautifulSoup
    links = get_link(soup)
    # print(links)
    for childlink in range(len(links) - 1):
        childlink = childlink + 1  # skip links[0], typically the parent-directory link
        connet = urljoin(url, links[childlink])  # join the base URL and the relative path
        page_next = urllib.request.urlopen(connet).read()
        soup_next = BeautifulSoup(page_next, 'lxml')
        link_next = get_link(soup_next)  # <a href=...> entries inside the second-level page
        file = os.path.join('d:\\test\\index' + "\\" + links[childlink])
        # decice(links[childlink])
        # file_cre = os.path.join('D:\\test\\index', links[childlink])
        if decice(links[childlink]):  # directory link: create it locally
            mkdir_p(file)
        else:  # file link: download it
            file_down(connet, file)
        print(connet)
        for child_next in range(len(link_next) - 1):
            child_next = child_next + 1
            connet_next = urljoin(connet, link_next[child_next])
            page_next = urllib.request.urlopen(connet_next).read()
            soup_nextf = BeautifulSoup(page_next, 'lxml')
            link_nextf = get_link(soup_nextf)  # <a href=...> entries inside the third-level page
            filef = os.path.join('d:/test/index' + "/", links[childlink] + link_next[child_next])
            if decice(link_next[child_next]):  # directory link: create it locally
                mkdir_p(filef)
            else:  # file link: download it
                file_down(connet_next, filef)
            print("Start: %s" % time.ctime())
            time.sleep(4)  # brief pause between requests
            print("End: %s" % time.ctime())
            print(connet_next)
            for child_nextt in range(len(link_nextf) - 1):
                child_nextt = child_nextt + 1
                connet_nextt = urljoin(connet_next, link_nextf[child_nextt])
                filet = os.path.join('d:/test/index' + "/",
                                     links[childlink] + link_next[child_next] + link_nextf[child_nextt])
                if decice(link_nextf[child_nextt]) == 1:  # directory link: create it locally
                    mkdir_p(filet)
                else:  # file link: download it
                    file_down(connet_nextt, filet)
                print(connet_nextt)

if __name__ == '__main__':
    findAll()
A Python crawler that fetches resources from a file-listing site (based on Python 3.6).
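findAll above walks the index pages with three hand-written nesting levels. For comparison, here is a minimal recursive sketch of the same idea, under stated assumptions: the crawl function, its depth parameter, and the link-filtering rules are illustrative choices, not part of the original script; the commented-out call reuses the same NOAA URL and d:/test/index root as above.

import os
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def crawl(url, local_dir, depth=3):
    """Walk an Apache-style index page: recurse into directory links, download file links."""
    os.makedirs(local_dir, exist_ok=True)  # Python 3.2+ equivalent of mkdir_p
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
    for a in soup.select('td a'):
        href = a.get('href')
        if not href or href.startswith('?') or href.startswith('/'):
            continue  # skip sort links and the parent-directory link
        target = urljoin(url, href)
        if href.endswith('/'):  # directory link: descend one level
            if depth > 0:
                crawl(target, os.path.join(local_dir, href.rstrip('/')), depth - 1)
        else:  # file link: download it next to its local index directory
            urllib.request.urlretrieve(target, os.path.join(local_dir, href))

# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'd:/test/index')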