This post records how to save Scrapy-crawled data both locally and to a database. I find myself writing the same pipeline code every time, so I'm noting it down here for direct reuse later. ^o^
1. Local storage
Set up pipelines.py
import json

class Ak17Pipeline(object):
    def __init__(self):
        # File that the scraped items are written to
        self.file = open('ak17.json', 'w')

    def process_item(self, item, spider):
        result = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(result)
        return item

    def close_spider(self, spider):
        self.file.close()
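For the pipeline to actually run, it has to be enabled in settings.py. A minimal sketch, assuming the Scrapy project module is named ak17 (adjust the dotted path to your own project):

# settings.py: register the JSON pipeline (the "ak17" module name is an assumption)
ITEM_PIPELINES = {
    'ak17.pipelines.Ak17Pipeline': 300,
}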
2. Store to a MongoDB database
Set up settings.py
# MongoDB settings
MONGO_HOST = "127.0.0.1"  # database host
MONGO_PORT = 27017        # port
MONGO_DBNAME = "ak17"     # database name
MONGO_COLNAME = "ak"      # collection name
Set up pipelines.py
from pymongo import MongoClient
from scrapy.utils.project import get_project_settings


class MongoPipeline(object):
    """
    Save items into MongoDB.
    """
    def __init__(self):
        # Read the connection parameters from settings.py
        settings = get_project_settings()
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        dbname = settings['MONGO_DBNAME']
        colname = settings['MONGO_COLNAME']
        # Connect to the database server
        self.client = MongoClient(host=host, port=port)
        # Select the database
        self.database = self.client[dbname]
        # Select the collection
        self.col = self.database[colname]

    def process_item(self, item, spider):
        # Insert the item as a document
        data = dict(item)
        self.col.insert_one(data)
        return item

    def close_spider(self, spider):
        # Close the connection
        self.client.close()
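After a crawl, the stored documents can be checked with a short pymongo script. A minimal sketch, assuming the settings above (host 127.0.0.1, port 27017, database ak17, collection ak):

from pymongo import MongoClient

# Connect with the same parameters configured in settings.py
client = MongoClient(host='127.0.0.1', port=27017)
col = client['ak17']['ak']

print(col.count_documents({}))  # how many items were stored
print(col.find_one())           # peek at one stored document

client.close()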
3. MySQL database storage
Set up settings.py
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
MYSQL_PORT = 3306
MYSQL_DBNAME = 'xiciip'
CHARSET = 'utf8'
Set up pipelines.py
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class WebcrawlerScrapyPipeline(object):
    '''Pipeline class that saves items into MySQL.
    1. Enable it in settings.py.
    2. yield item in your spider; the pipeline is then called automatically.'''

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        '''1. @classmethod declares a class method, as opposed to the usual instance method.
        2. A class method's first argument is cls (the class itself), while an instance method's first argument is self (an instance of the class).
        3. It can be called on the class directly, like C.f(), similar to a static method in Java.'''
        # Read the database parameters configured in settings.py
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        # ** expands the dict into keyword arguments, i.e. host=xxx, db=yyy, ...
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
        # Pass the pool to the class; it becomes available as self.dbpool
        return cls(dbpool)

    # Called by Scrapy for every item
    def process_item(self, item, spider):
        # Run the insert asynchronously on the connection pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        # Attach the error handler
        query.addErrback(self._handle_error, item, spider)
        return item

    # Write the item into the database; the SQL statement lives here
    def _conditional_insert(self, tx, item):
        sql = "insert into jsbooks(author,title,url,pubday,comments,likes,rewards,views) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (item['author'], item['title'], item['url'], item['pubday'],
                  item['comments'], item['likes'], item['rewards'], item['reads'])
        tx.execute(sql, params)

    # Error handler
    def _handle_error(self, failure, item, spider):
        print(failure)
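The insert statement above assumes a jsbooks table already exists in the xiciip database. A possible way to create it, where the column types are only an assumption and should be adjusted to your own data:

import MySQLdb

# Create the jsbooks table that the pipeline inserts into (column types are assumptions)
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='root', db='xiciip', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS jsbooks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        author VARCHAR(255),
        title VARCHAR(255),
        url VARCHAR(512),
        pubday VARCHAR(64),
        comments INT,
        likes INT,
        rewards INT,
        views INT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()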