Installation: pip install beautifulsoup4
from bs4 import BeautifulSoup

# Open "yoyo.html" in read mode.
yoyo = open('yoyo.html', 'r')
# print(yoyo.read())  # .read() would return one flat string with no hierarchy
# yoyo.close()

# Parse the file object with the built-in HTML parser.
soup = BeautifulSoup(yoyo, 'html.parser')
# print(soup.prettify())  # prints the document as an indented HTML tree

# Navigate to a tag through attribute access on the soup object.
head = soup.head
print(head)
# When several tags share the same name, attribute access searches
# top-to-bottom, returns the FIRST match, and stops there.

# Navigate to the first <p> tag.
p = soup.p
print(p)  # <p class="title"><b>yoyoketang</b></p>

# Get the tag's inner string via the .string property.
s = p.string
print(s)  # yoyoketang

# An HTML comment inside a tag comes back as a Comment object
# (here the comment sits inside the <b> tag).
b_str = soup.b.string
print(b_str)        # Hey, this in comment!
print(type(b_str))  # <class 'bs4.element.Comment'>
# --- Tag attributes ---
from bs4 import BeautifulSoup

# Open "yoyo.html" in read mode.
yoyo = open('yoyo.html', 'r')
# print(yoyo.read())  # .read() would return one flat string with no hierarchy
# yoyo.close()
soup = BeautifulSoup(yoyo, 'html.parser')

p = soup.p  # first <p> tag
print(p)  # <p class="title"><b>yoyoketang</b></p>

# A Tag's attributes can be read like a dictionary via .attrs.
value = p.attrs['class']
print(value)  # ['title'] -- a list, because class is multi-valued
# A multi-valued class attribute returns every token, e.g.
# class="clearfix sdk" gives:
# value = p.attrs['class']
# print(value)  # ['clearfix', 'sdk']
# --- Get all text ---
from bs4 import BeautifulSoup

# Open "yoyo.html" in read mode.
yoyo = open('yoyo.html', 'r')
# print(yoyo.read())  # .read() would return one flat string with no hierarchy
# yoyo.close()
soup = BeautifulSoup(yoyo, 'html.parser')

# Get the <body> tag object.
body = soup.body
print(body)

# get_text() concatenates the strings of every descendant node
# under the current tag, i.e. only the text inside <body>.
text = body.get_text()
print(text)
# --- Find all matching tag objects ---
from bs4 import BeautifulSoup

# Open "yoyo.html" in read mode.
yoyo = open('yoyo.html', 'r')
# print(yoyo.read())  # .read() would return one flat string with no hierarchy
# yoyo.close()
soup = BeautifulSoup(yoyo, 'html.parser')

# Find every <a> tag; find_all returns a list.
all_a = soup.find_all('a')
print(all_a)
# [<a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>, ...]

# Print each <a> tag on its own line.
for tag in all_a:
    print(tag)
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/python/" id="link2">python</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/selenium/" id="link3">python</a>

# Find every tag with a given CSS class.
# "class" is a Python keyword, so bs4 uses the "class_" parameter instead.
all_sister = soup.find_all(class_="sister")
print(all_sister)  # same list as above

# Print each class="sister" tag on its own line.
for tag in all_sister:
    print(tag)
# --- Crawl pictures ---
# Target site: http://699pic.com/sousuo-218808-13-1.html
import requests
from bs4 import BeautifulSoup
import os

r = requests.get('http://699pic.com/sousuo-218808-13-1.html')
# r.content is the raw byte stream of the response body.
soup = BeautifulSoup(r.content, 'html.parser')
# tu = soup.find_all('img')  # would match every <img> tag
# Lazily-loaded images on this site carry class="lazy" and keep the
# real image URL in the "data-original" attribute.
tu = soup.find_all(class_="lazy")
for img in tu:
    # print(img)
    print(img['data-original'])  # the actual URL of each picture

# Download a single picture.
url = 'http://img95.699pic.com/photo/50061/5608.jpg_wh300.jpg'
r = requests.get(url)
# Open 123.jpg in binary-write mode (the suffix can be anything);
# "with" guarantees the file handle is closed.
with open('123.jpg', 'wb') as f:
    f.write(r.content)  # write the response bytes to the file

# --- Batch download ---
# Build a folder named "tupian" two levels above this script.
curpath = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
tupian = os.path.join(curpath, 'tupian')
if not os.path.exists(tupian):  # create the folder only if it is missing
    os.mkdir(tupian)

# Fetch every picture and save it under the "tupian" folder.
for img in tu:
    try:
        jpg_url = img['data-original']  # URL of this picture
        name = img['alt']               # alt text doubles as the file name
        r = requests.get(jpg_url)
        with open(os.path.join(tupian, '%s.jpg' % name), 'wb') as f:
            f.write(r.content)
    except (KeyError, requests.RequestException, OSError):
        # Best-effort crawl: skip tags missing the attributes we need,
        # failed downloads, and names that are not valid file names.
        pass
Python Interface Automation Testing, Part 5: Web Crawling