1. Create the project
scrapy startproject ppd
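This generates the standard Scrapy project skeleton (the paths below assume the project is named ppd):

ppd/
    scrapy.cfg            # deploy/config file
    ppd/                  # the project's Python module
        __init__.py
        items.py          # item definitions (step 2 below)
        pipelines.py
        settings.py       # project settings (step 3 below)
        spiders/          # spiders and the custom exporter go here
            __init__.py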
2. Crawl a single page (mainly with XPath)
Source code of the spider:
from scrapy.spiders import Spider
from scrapy.selector import Selector
from ppd.items import BlackItem

class PpdSpider(Spider):
    name = "ppd"
    allowed_domains = ["dailianmeng.com"]
    start_urls = ["http://www.dailianmeng.com/p2pblacklist/index.html"]

    def parse(self, response):
        # one selector per row of the blacklist table
        sites = response.xpath('//*[@id="yw0"]/table/tbody/tr')
        items = []
        for site in sites:
            item = BlackItem()
            item['name'] = site.xpath('td[1]/text()').extract()
            item['idcard'] = site.xpath('td[2]/text()').extract()
            item['mobile'] = site.xpath('td[3]/text()').extract()
            item['email'] = site.xpath('td[4]/text()').extract()
            item['total'] = site.xpath('td[5]/text()').extract()
            item['bepaid'] = site.xpath('td[6]/text()').extract()
            item['notpaid'] = site.xpath('td[7]/text()').extract()
            item['time'] = site.xpath('td[8]/text()').extract()
            item['loanamount'] = site.xpath('td[9]/text()').extract()
            items.append(item)
        return items
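To sanity-check the XPath and then run the spider, the commands below can be used (blacklist.csv is just an output file name of my choosing):

# interactive check of the row XPath before crawling
scrapy shell "http://www.dailianmeng.com/p2pblacklist/index.html"
>>> response.xpath('//*[@id="yw0"]/table/tbody/tr[1]/td[1]/text()').extract()

# run the spider and export the items to CSV
scrapy crawl ppd -o blacklist.csv -t csv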
Add the following to items.py:
from scrapy.item import Item, Field

class BlackItem(Item):
    name = Field()
    idcard = Field()
    mobile = Field()
    email = Field()
    total = Field()
    bepaid = Field()
    notpaid = Field()
    time = Field()
    loanamount = Field()
This produces the data for a single page, but the exported columns are ordered alphabetically by field name rather than in the order defined above; the next step changes the output to an order I define.
3. Output the fields in a specified order
By default Scrapy exports fields and their values in alphabetical order (bepaid, email, idcard, ...), and I want to change this to the order I specify:
First, create a file named csv_item_exporter.py in the spiders directory:
from scrapy.conf import settings
from scrapy.contrib.exporter import CsvItemExporter

class MyProjectCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        delimiter = settings.get('CSV_DELIMITER', ',')
        kwargs['delimiter'] = delimiter
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export
        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
Then add the following to settings.py. The field order below is whatever you decide, and the project name at the start of the exporter path should be replaced with your own:
FEED_EXPORTERS = {
    'csv': 'ppd.spiders.csv_item_exporter.MyProjectCsvItemExporter',  # ppd is the project name
}

FIELDS_TO_EXPORT = [
    'name',
    'idcard',
    'mobile',
    'email',
    'total',
    'bepaid',
    'notpaid',
    'time',
    'loanamount',
]
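Re-running the export now picks up the custom exporter: the 'csv' key in FEED_EXPORTERS matches the feed format requested with -t on the command line, so the same command as before produces columns in the order given in FIELDS_TO_EXPORT:

scrapy crawl ppd -o blacklist.csv -t csv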
Results:
name,idcard,mobile,email,total,bepaid,notpaid,time,loanamount
Yu Liang,61250119890307****,13055099***,233424611@qq.com,3000.00,1063.01,999.89,2013-10-11,3
Zhang Xu,44152219890923****,15767638***,466780713@qq.com,3000.00,2319.84,819.56,2013-09-09,3 months
Sun Fudong,37150219890919****,15194000***,1275972787@qq.com,3000.00,2075.14,1018.55,2013-09-25,3 months
Li Chiyin,45012119870211****,13481120***,993412914@qq.com,3050.00,2127.64,167.99,2013-04-08,1
Wu will embrace,45232819810201****,13977369***,13977369850@139.com,3000.00,2670.40,524.01,2013-06-07,6
a single Yangtze river,32072319820512****,18094220***,13851220018@136.com,8900.00,6302.04,1521.78,2013-07-22,6 months
Zheng,35042619890215****,15959783***,271320236@qq.com,5000.00,3278.60,425.51,2013-04-08,1 year
Wu Wenhao,44190019890929****,13267561***,234663601@qq.com,6000.00,579.79,463.40,2013-10-09,1
Zhong Hua,45060319870526****,18277072***,48959434@qq.com,5700.00,3141.24,957.50,2013-08-07,6 months
soup double Jay,34082119620804****,13329062***,332416587@qq.com,100000.00,105293.45,9111.54,2012-11-19,1
Yellow River,43240219791103****,13786520***,zhuanghe8589@126.com,6700.00,4795.24,2307.54,2013-06-21,6 months
Sun Jingchang,13092119850717****,15127714***,492101828@qq.com,3000.00,,455.71,2013-10-18,1
,42050319740831****,15337410***,10855219@qq.com,3000.00,,965.51,2013-10-17,6 months
Cao Cheng,41088119720221****,18639192***,1404816232@qq.com,3300.00,1781.64,838.18,2013-06-17,8
silver ball,33032519761109****,13806800***,1838599723@qq.com,60000.00,19407.50,2013-10-16,6 months
4. Crawl the data from all pages
The main points: (1) the total number of pages is obtained automatically from the site; (2) a loop generates the URL for every page; (3) this is faster than doing the same with Selenium.
import requests

from scrapy.spiders import Spider
from scrapy.selector import Selector
from ppd.items import BlackItem

class PpdSpider(Spider):
    name = "ppd"
    allowed_domains = ["dailianmeng.com"]
    start_urls = []
    # start_urls.append("http://www.dailianmeng.com/p2pblacklist/index.html")
    # total_page = 164

    # Read the pager summary on the first page (text like "第2446-2448条, 共2448条",
    # i.e. "entries 2446-2448, 2448 in total") to work out the page count.
    page_re = requests.get('http://www.dailianmeng.com/p2pblacklist/index.html')
    page_info = Selector(text=page_re.text).css('#yw0 > div.summary::text').extract()[0]
    pages = page_info.split(',')[1]      # keep the "total" part of the summary
    pages = int(int(pages[3:6]) / 15)    # crude slice of the total count digits; 15 rows per page
    size_page = pages
    size_page = 165                      # override with the known page count
    start_page = 1
    for pge in range(start_page, start_page + size_page):
        start_urls.append('http://www.dailianmeng.com/p2pblacklist/index.html?P2pblacklist_page=' + str(pge))

    def parse(self, response):
        sites = response.xpath('//*[@id="yw0"]/table/tbody/tr')
        items = []
        for site in sites:
            item = BlackItem()
            item['name'] = site.xpath('td[1]/text()').extract()
            item['idcard'] = site.xpath('td[2]/text()').extract()
            item['mobile'] = site.xpath('td[3]/text()').extract()
            item['email'] = site.xpath('td[4]/text()').extract()
            item['total'] = site.xpath('td[5]/text()').extract()
            item['bepaid'] = site.xpath('td[6]/text()').extract()
            item['notpaid'] = site.xpath('td[7]/text()').extract()
            item['time'] = site.xpath('td[8]/text()').extract()
            item['loanamount'] = site.xpath('td[9]/text()').extract()
            items.append(item)
        return items
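For reference, a more idiomatic Scrapy pattern is to follow the pager from inside parse() instead of pre-computing the page count with a separate HTTP request. A minimal sketch, assuming the pager exposes a "next" link (the selector below is a guess, not verified against the site):

from scrapy.spiders import Spider
from scrapy.http import Request

class PpdFollowSpider(Spider):
    name = "ppd_follow"
    allowed_domains = ["dailianmeng.com"]
    start_urls = ["http://www.dailianmeng.com/p2pblacklist/index.html"]

    def parse(self, response):
        for site in response.xpath('//*[@id="yw0"]/table/tbody/tr'):
            # build and yield a BlackItem here, exactly as in the spider above
            pass
        # follow the pager's "next" link if one exists (hypothetical selector)
        next_href = response.xpath('//li[@class="next"]/a/@href').extract()
        if next_href:
            yield Request(response.urljoin(next_href[0]), callback=self.parse)

This keeps every request inside Scrapy's scheduler, so duplicate filtering and download delays apply to all pages.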