This program touches on the following areas of knowledge:
1. Connecting to a MySQL database from Python: http://www.cnblogs.com/miranda-tang/p/5523431.html
2. Crawling Chinese websites and handling the various encoding problems: http://www.cnblogs.com/miranda-tang/p/5566358.html
3. Using BeautifulSoup
4. Not every product page contains every field, so the scraped name/value pairs go into a dictionary and missing fields are recorded as empty strings (see the short sketch below)
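A minimal sketch of point 4, with made-up field names and values; the full script below does the same thing with a try/except around each dictionary lookup, which is equivalent to dict.get() with an empty-string default:

# Minimal sketch of point 4 (field names and values here are invented for illustration):
# keep a fixed list of expected columns and fall back to an empty string for
# anything the product page does not provide.
scraped = {'Brand': 'ExampleBrand', 'Model': 'BCD-301'}          # what one page happened to contain
expected = ['Brand', 'Model', 'Color', 'Energy Efficiency Class']

row = [scraped.get(col, '') for col in expected]
print(row)   # ['ExampleBrand', 'BCD-301', '', '']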
Detailed code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Ideas:
1. Crawl refrigerator data from the Yixun site: brand, model, price, volume, energy efficiency class,
   refrigeration mode, door style, display mode, fixed/variable frequency, defrost mode, operation mode.
2. Store the data in a MySQL database.
Only refrigerators of 300 L and above are crawled.
'''
from bs4 import BeautifulSoup
import requests
import MySQLdb
import datetime

# encoding (Python 2): make utf-8 the default so Chinese text does not raise errors
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# connect to the database and insert the crawled data
def insert_db(page_list):
    try:
        # note: pass charset='utf8' when connecting to avoid encoding problems
        conn = MySQLdb.connect(user='root', passwd='112233AA', host='192.168.1.14',
                               db='miranda.tang', charset='utf8')
        cursor = conn.cursor()
        # delete rows already inserted today, to avoid duplicates when re-running
        cursor.execute('DELETE FROM yixun_price_refrigerator WHERE update_day=CURRENT_DATE()')
        conn.commit()
        # committing all crawled rows at once with executemany is faster than calling execute row by row
        sql = 'INSERT INTO yixun_price_refrigerator VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.executemany(sql, page_list)
        conn.commit()
        cursor.close()   # close the cursor
        conn.close()     # close the connection
    except Exception as e:
        print e
        conn.rollback()
# fetch a URL and return a BeautifulSoup object
def urlbs(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    return soup
# get the total number of result pages
def get_pagenumber(url):
    soup = urlbs(url)
    page = soup.select('.sort_page_num span')[0]
    page_contents = page.contents[1]                 # e.g. '/18'
    pagenumber = int(page_contents.replace('/', ''))
    return pagenumber
# get the details of one product page
def get_info(product_url):
    soup = urlbs(product_url)
    # print soup

    # get the title, stripping the site suffix from the page title
    title = unicode(soup.title.text.strip()
                    .strip(u'"Price_Quote_Image_Market"-Yixun Net')
                    .replace(u'"', '')).encode('utf-8').decode('utf-8')
    # print title

    # get the original price
    try:
        soup_origin = soup.find("dl", {"class": "xbase_item xprice xprice_origin"})
        price_origin = soup_origin.find("span", {"class": "mod_price xprice_val"}).\
            contents[1].text.encode('utf-8').decode('utf-8')
        # print u'original price: ' + price_origin
    except:
        price_origin = 0

    # get the sale (pay) price
    try:
        soup_sale = soup.find('dl', {'class': 'xbase_item xprice'})
        price_sale = soup_sale.find("span", {"class": "mod_price xprice_val"}).\
            contents[1].encode('utf-8').decode('utf-8')
        # print u'pay: ' + price_sale
    except:
        price_sale = 0

    # get the column names from the spec table
    soup_info_name = soup.find_all('td', {'class': 'name'})
    name_list = [each.contents[0].encode('utf-8').decode('utf-8') for each in soup_info_name]

    # get the column contents; a list comprehension puts the data from the spec table into a list
    soup_info_desc = soup.find_all('td', {'class': 'desc'})
    prod_list = [each.contents[0].encode('utf-8').decode('utf-8') for each in soup_info_desc]

    # not every column name appears on every page, so go through a dictionary
    # and record missing fields as empty strings
    pro_dic = {}
    pro_list = [today, product_url, title, price_origin, price_sale]
    for i in range(len(name_list)):
        pro_dic[name_list[i]] = prod_list[i]
    name = ['Brand', 'Model', 'Color', 'Energy Efficiency Class', 'Refrigerator Volume', 'Refrigeration Mode',
            'Door Style', 'Weight', 'Dimensions', 'Refrigeration Type', 'Display Mode',
            'Fixed Frequency/Frequency Conversion', 'Defrost Mode', 'Freezer Temperature Range',
            'Refrigerator Temperature Range', 'Refrigerator Freezer Model', 'Operation Mode']
    for each in name:
        try:
            each = each.encode('utf-8').decode('utf-8')
            pro_list.append(pro_dic[each])
        except:
            pro_list.append('')
    # print pro_list
    page_list.append(pro_list)
# get the product page links from a listing page
def get_product_href(url):
    soup = urlbs(url)
    product_list = soup.select('#itemList .mod_goods_img a')
    # print product_list
    for i in range(len(product_list)):
        pro = product_list[i]
        pro_href = pro['href']
        # print pro_href
        get_info(pro_href)
if __name__ == '__main__':
    beseurl = 'http://searchex.yixun.com/html?path=705882t705892&attr=42515e1o2o3o4o5o6o7'
    max_number = get_pagenumber(beseurl)
    page_list = []                      # collected rows, appended to by get_info()
    today = datetime.date.today()       # current date, inserted as the update date
    for i in range(1, max_number + 1):
        newurl = beseurl + '&page=' + str(i)
        # print newurl
        get_product_href(newurl)
    insert_db(page_list)
    print("It's all done")
# Build the table (the VARCHAR lengths below are indicative):
# DROP TABLE yixun_price_refrigerator;
# CREATE TABLE yixun_price_refrigerator(
#   update_day                  DATE            -- update date
# , product_url                 VARCHAR(200)    -- product link
# , title                       VARCHAR(200)    -- name
# , price_origin                VARCHAR(100)    -- original price
# , price_sale                  VARCHAR(100)    -- pay price
# , brands                      VARCHAR(100)    -- brand
# , goods_sn                    VARCHAR(100)    -- model
# , colour                      VARCHAR(100)    -- color
# , energy_efficiency_rating    VARCHAR(100)    -- energy efficiency class
# , refrigerator_volume         VARCHAR(100)    -- refrigerator volume
# , refrigeration               VARCHAR(100)    -- refrigeration mode
# , door_style                  VARCHAR(100)    -- door style
# , weight                      VARCHAR(100)    -- weight
# , size                        VARCHAR(100)    -- dimensions
# , cooling_type                VARCHAR(100)    -- refrigeration type
# , display_method              VARCHAR(100)    -- display mode
# , frequency                   VARCHAR(100)    -- fixed/variable frequency
# , defrost_mode                VARCHAR(100)    -- defrost mode
# , freezer_temperature_range   VARCHAR(100)    -- freezer compartment temperature range
# , save_temperature_range      VARCHAR(100)    -- refrigerator compartment temperature range
# , fridge_freezer_models       VARCHAR(100)    -- refrigerator/freezer model
# , operation_method            VARCHAR(100)    -- operation mode
# );
Result: the Yixun refrigerator price information is crawled and written to the MySQL database.
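To sanity-check a run, one option is to count the rows inserted for the current date. A small sketch, reusing the connection parameters and table from insert_db above:

# Count today's rows after a run (connection details as in insert_db above).
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='112233AA', host='192.168.1.14',
                       db='miranda.tang', charset='utf8')
cursor = conn.cursor()
cursor.execute('SELECT COUNT(*) FROM yixun_price_refrigerator WHERE update_day=CURRENT_DATE()')
print(cursor.fetchone()[0])   # number of refrigerators crawled today
cursor.close()
conn.close()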