# coding:utf-8
"""Selenium crawler for the lot listing of one salamoyua.com auction.

The script drives Chrome through the paginated listing at ``AUCTION_URL``,
parses every lot ``<figure>`` on each page and collects its detail URL,
image URL, hammer price and lot number.

``webdriver``, ``BeautifulSoup`` and the ``replace`` helper are not imported
here by name; they are expected to come from ``common.contest``'s star import.

NOTE(review): this file was rebuilt from a whitespace/case-mangled paste.
Literals whose exact spelling could not be recovered are flagged below with
NOTE(review) and must be checked against the live page before relying on them.
"""
import os
import re
import time

from common.contest import *

AUCTION_URL = "http://www.salamoyua.com/es/subasta.aspx?origen=subastas&subasta=79"
SITE_ROOT = "http://www.salamoyua.com/es/"
CHROMEDRIVER_PATH = "C:/users/xuchunlin/appdata/local/google/chrome/application/chromedriver.exe"

# NOTE(review): inner capitalisation of this ASP.NET control id was lost in
# the paste (it appeared as "Ctl00_phcontenidos_lbsiguiente"); confirm it.
NEXT_PAGE_ID = "ctl00_PHContenidos_lbSiguiente"

# NOTE(review): class-name casing was lost in the paste; lower case assumed.
LOT_CLASS = "lotes fade"

# Pages 1..99 are attempted, matching the original range(1, 100).
MAX_PAGES = 100
PAGE_LOAD_WAIT_SECONDS = 3


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, or None if absent.

    Matching is case-insensitive because the original casing of the
    patterns could not be recovered from the mangled source.
    """
    found = re.findall(pattern, text, re.IGNORECASE)
    return found[0] if found else None


def _fetch_page(driver, page):
    """Return the HTML of listing page *page*, or "" if it cannot be loaded.

    Page 1 is requested directly; later pages are reached by scrolling to the
    bottom and clicking the "next" link of the page currently displayed.
    """
    if page == 1:
        driver.get(AUCTION_URL)
        return driver.page_source
    try:
        # Drag the page scroll bar to the bottom so the pager link is in view.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.find_element_by_id(NEXT_PAGE_ID).click()
        # Wait for the next page to render *before* reading the source; the
        # original read page_source first and slept afterwards.
        time.sleep(PAGE_LOAD_WAIT_SECONDS)
        return driver.page_source
    except Exception as exc:
        # Best effort, as in the original: a failed page yields no lots.
        print("failed to load page " + str(page) + ": " + str(exc))
        return ""


def _parse_lot(markup):
    """Extract the fields of one lot from its cleaned ``<figure>`` markup.

    Returns a dict with keys ``item_url``, ``item_imgurl``, ``sold_price``
    and ``item_lotnum``; a field that is not found is None.
    """
    item_url = _first_match(
        r'<figure class="lotes fade"><a href="(.*?)" id="', markup)
    if item_url is not None:
        # NOTE(review): the replace() arguments were lost in the paste;
        # un-escaping "&amp;" in the href is an assumption.
        item_url = SITE_ROOT + item_url.replace("&amp;", "&")

    # NOTE(review): the original image-URL pattern was lost in the paste;
    # taking the first <img src> of the figure is an assumption.
    item_imgurl = _first_match(r'<img[^>]*?src="(.*?)"', markup)

    sold_price = _first_match(
        r'<strong>remate:(.*?)</strong></p></figcaption>', markup)
    if sold_price is not None:
        # NOTE(review): the replace() arguments were lost in the paste;
        # normalising "&nbsp;" and trimming whitespace is an assumption.
        sold_price = sold_price.replace("&nbsp;", " ").strip()

    # Sold lots carry title="Lote vendido"; otherwise fall back to the first
    # <span id=...>, mirroring the original try/except.
    item_lotnum = _first_match(
        r'title="Lote vendido"><span id=".*?">(.*?)</span>', markup)
    if item_lotnum is None:
        item_lotnum = _first_match(r'<span id=".*?">(.*?)</span>', markup)
    if item_lotnum is not None:
        # NOTE(review): the fallback branch was truncated in the paste; the
        # same clean-up is applied to both branches as an assumption.
        item_lotnum = item_lotnum.replace("Lote", "").replace(" ", "")

    return {
        "item_url": item_url,
        "item_imgurl": item_imgurl,
        "sold_price": sold_price,
        "item_lotnum": item_lotnum,
    }


def spider():
    """Crawl the auction listing and return the extracted lots.

    Starts Chrome via the chromedriver at ``CHROMEDRIVER_PATH``, walks the
    listing pages and parses every lot figure found. The browser is always
    shut down, even if crawling fails part-way.

    Returns:
        list of dict: one entry per lot, as produced by ``_parse_lot``.
    """
    chrome_options = webdriver.ChromeOptions()
    # Proxy support was disabled (commented out) in the original:
    #   chrome_options.add_argument('--proxy-server=http://' + proxies)
    os.environ["webdriver.chrome.driver"] = CHROMEDRIVER_PATH
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=chrome_options)

    lots = []
    try:
        for page in range(1, MAX_PAGES):
            print("crawling" + str(page) + "page data")
            html = _fetch_page(driver, page)
            soup = BeautifulSoup(html, "html.parser")
            for figure in soup.find_all("figure", attrs={"class": LOT_CLASS}):
                # `replace` is the project helper from common.contest; it is
                # presumably a markup normaliser -- TODO confirm.
                markup = replace(figure)
                print(markup)
                lots.append(_parse_lot(markup))
    finally:
        # Always release the browser process.
        driver.quit()
    return lots


spider()
# Python crawler example: Selenium crawler