Python Interface Automation Testing (Part 5): Crawler


Installation: beautifulsoup4 (install it with: pip install beautifulsoup4)
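All of the snippets below parse a local file named yoyo.html, which the original post never reproduces. A minimal stand-in, reconstructed from the expected outputs quoted in the comments below (the exact markup is an assumption), could look like this:

<html>
<head><title>yoyoketang</title></head>
<body>
<b><!--Hey, this in comment!--></b>
<p class="title"><b>yoyoketang</b></p>
<p class="story">
<a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>
<a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/python/" id="link2">python</a>
<a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/selenium/" id="link3">python</a>
</p>
</body>
</html>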


from bs4 import BeautifulSoup

yoyo = open('yoyo.html', 'r')   # open the "yoyo.html" file in read mode
# print(yoyo.read())   # this prints the raw file content as one string, with no hierarchy
# yoyo.close()

soup = BeautifulSoup(yoyo, 'html.parser')   # printing soup gives the same output as .read()
# print(soup.prettify())   # prints the HTML with hierarchical indentation

# navigate to a tag through the soup object
head = soup.head
print(head)
# when several tags share the same name, the search runs from top to bottom and stops at the first match
# navigate to the p tag through the soup object
p = soup.p
print(p)    # <p class="title"><b>yoyoketang</b></p>


# get the string inside a tag via its .string attribute
s = p.string
print(s)    # yoyoketang

# get a Comment object (note: the comment here sits inside a b tag)
b_str = soup.b.string
print(b_str)    # Hey, this in comment!
print(type(b_str))  # <class 'bs4.element.Comment'>
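A Comment prints exactly like ordinary text, so telling the two apart needs an explicit type check. A short sketch, using the Comment class that bs4 exposes:

from bs4.element import Comment

if isinstance(b_str, Comment):
    print('b_str is a comment, not visible page text')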

# tag attributes
from bs4 import BeautifulSoup

yoyo = open('yoyo.html', 'r')   # open the "yoyo.html" file in read mode
soup = BeautifulSoup(yoyo, 'html.parser')

p = soup.p   # the p tag
print(p)    # <p class="title"><b>yoyoketang</b></p>
# get a tag attribute: a Tag object can be indexed like a dictionary
value = p.attrs['class']
print(value)        # ['title'] -- class values always come back as a list

# when the class attribute holds several values, the returned list has several entries
# multi-valued classes such as class="clearfix sdk" are very common
# value = p.attrs['class']
# print(value)   # ['clearfix', 'sdk']
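Besides p.attrs['class'], a Tag also supports direct dictionary-style indexing and a .get() method that returns None instead of raising for a missing attribute; a short sketch:

print(p['class'])       # ['title'] -- same as p.attrs['class']
print(p.get('class'))   # ['title']
print(p.get('id'))      # None -- no KeyError for an absent attribute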

# get all the text
from bs4 import BeautifulSoup

yoyo = open('yoyo.html', 'r')   # open the "yoyo.html" file in read mode
soup = BeautifulSoup(yoyo, 'html.parser')

# get the body tag object
body = soup.body
print(body)

# get only the text inside the body
text = body.get_text()   # returns the text of all descendant nodes under the current tag
print(text)
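get_text() also takes a separator string and a strip flag, which help when the concatenated text runs together; a short sketch:

# put each text fragment on its own line and trim surrounding whitespace
print(body.get_text(separator='\n', strip=True))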

# find all matching tag objects
from bs4 import BeautifulSoup

yoyo = open('yoyo.html', 'r')   # open the "yoyo.html" file in read mode
soup = BeautifulSoup(yoyo, 'html.parser')

# find all a tags
all_a = soup.find_all('a')
print(all_a)   # returns a list
# [<a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>, <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/python/" id="link2">python</a>, <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/selenium/" id="link3">python</a>]

# print each a tag on its own line
for i in all_a:
    print(i)
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/python/" id="link2">python</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/selenium/" id="link3">python</a>


# find all tags with a given class
all_sister = soup.find_all(class_="sister")   # class is a Python keyword, so the argument is spelled "class_"
print(all_sister)   # returns the same list as above

# print every class_="sister" tag
for i in all_sister:
    print(i)
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" id="link1">fiddler</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/python/" id="link2">python</a>
# <a class="sister" href="http://www.cnblogs.com/yoyoketang/tag/selenium/" id="link3">python</a>
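find_all() criteria can also be combined, so a tag name and attribute filters narrow the search in one call; a short sketch using the ids from the list above:

# tag name plus keyword filters in a single query
links = soup.find_all('a', class_="sister", id='link1')
for link in links:
    print(link['href'])   # http://www.cnblogs.com/yoyoketang/tag/fiddler/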

# crawl pictures
# target site: http://699pic.com/sousuo-218808-13-1.html
import os
import requests
from bs4 import BeautifulSoup

r = requests.get('http://699pic.com/sousuo-218808-13-1.html')

# r.content returns the response body as bytes

soup = BeautifulSoup(r.content, 'html.parser')   # parse r.content with the html parser

# tu = soup.find_all('img')   # would find every tag named "img"
tu = soup.find_all(class_="lazy")   # find all tags whose class is "lazy" (the lazily loaded images)

for i in tu:
    # print(i)
    print(i['data-original'])   # print all the image URLs, stored in the data-original attribute

# download a single picture
url = 'http://img95.699pic.com/photo/50061/5608.jpg_wh300.jpg'
r = requests.get(url)
f = open('123.jpg', 'wb')   # open a file named 123.jpg in binary write mode (the name and suffix are arbitrary)
f.write(r.content)   # write the response bytes into the file
f.close()   # close the file
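The same write is a little safer with a with block, which closes the file even if the write raises; a minimal variant:

r = requests.get(url)
with open('123.jpg', 'wb') as f:   # the file is closed automatically on exit
    f.write(r.content)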


# batch download:

# build a path: create a folder named "tupian"
curpath = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
tupian = os.path.join(curpath, 'tupian')
if not os.path.exists(tupian):   # check whether the "tupian" folder already exists
    os.mkdir(tupian)             # if not, create it
# download and save the pictures in batch
for i in tu:
    try:
        jpg_url = i['data-original']    # the image URL
        name = i['alt']
        r = requests.get(jpg_url)
        # write the content into the tupian folder
        f = open(os.path.join(tupian, '%s.jpg' % name), 'wb')
        f.write(r.content)
        f.close()
    except:
        pass
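The bare except: pass above swallows every failure silently, which makes missing images hard to debug. A slightly more defensive sketch (the 10-second timeout and the filename sanitizing are assumptions, not part of the original):

import re

for i in tu:
    jpg_url = i.get('data-original')
    name = i.get('alt')
    if not jpg_url or not name:
        continue   # skip tags that lack either attribute instead of crashing
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)   # assumed policy: replace characters illegal in file names
    try:
        r = requests.get(jpg_url, timeout=10)   # assumed timeout so one dead link cannot hang the loop
        r.raise_for_status()                    # surface HTTP errors instead of saving an error page
    except requests.RequestException as e:
        print('skipped %s: %s' % (jpg_url, e))
        continue
    with open(os.path.join(tupian, '%s.jpg' % safe_name), 'wb') as f:
        f.write(r.content)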
