Previously we operated Elasticsearch (the search engine) directly with its own query commands for adding, deleting, updating and searching, much like writing SQL. Elastic also officially provides a Python interface package for Elasticsearch, elasticsearch-dsl-py, which works like the SQLAlchemy ORM framework for databases: instead of writing raw commands, we operate Elasticsearch through Python classes.
Elasticsearch-dsl-py download: https://github.com/elastic/elasticsearch-dsl-py
Documentation: http://elasticsearch-dsl.readthedocs.io/en/latest/
First, install the elasticsearch-dsl-py module (for example with pip install elasticsearch-dsl).
1. elasticsearch-dsl module usage notes
create_connection(hosts=['127.0.0.1']): connect to one or more Elasticsearch (search engine) servers
class Meta: set the index name and doc type (table) name
IndexClass.init(): create the index, doc type and field mappings
IndexClass().save(): write an instance's data to Elasticsearch (search engine)
elasticsearch_orm.py — the file that operates Elasticsearch (search engine)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
# For more field types, see section 364, elasticsearch (search engine) mapping management
from elasticsearch_dsl.connections import connections  # import the elasticsearch (search engine) connection method

connections.create_connection(hosts=['127.0.0.1'])  # connect to the elasticsearch server


class lagouType(DocType):  # custom class that extends DocType
    # Text fields are analyzed (word-segmented), so an analyzer is needed; ik_max_word is a Chinese analyzer
    title = Text(analyzer="ik_max_word")        # field name = field type; Text is an analyzed string type
    description = Text(analyzer="ik_max_word")
    keywords = Text(analyzer="ik_max_word")
    url = Keyword()                              # Keyword is a plain string type, not analyzed
    riqi = Date()                                # Date type

    class Meta:                                  # Meta is fixed
        index = "lagou"                          # index name (equivalent to the database name)
        doc_type = 'biao'                        # doc type (table) name


if __name__ == "__main__":  # only runs when this file is executed directly, not when it is imported by other modules
    lagouType.init()  # create the elasticsearch (search engine) index, doc type and field mappings

# Instructions for use:
# In the module where you want to operate elasticsearch (search engine), import this file, then:
# lagou = lagouType()          # instantiate the class
# lagou.title = 'value'        # field = value
# lagou.description = 'value'
# lagou.keywords = 'value'
# lagou.url = 'value'
# lagou.riqi = 'value'
# lagou.save()                 # write the data to elasticsearch (search engine)
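Once the index contains data, the same class can also be used to query it. A minimal sketch, assuming the lagouType class above has been initialised and the lagou index already holds documents:

from adc.models.elasticsearch_orm import lagouType  # the module defined above

# DocType.search() returns an elasticsearch-dsl Search object bound to the class's index
s = lagouType.search().query("match", title="python")  # full-text match on the analyzed title field
for hit in s.execute():
    print(hit.title, hit.url)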
2. Writing data to Elasticsearch from Scrapy
Crawler files
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from adc.items import LagouItem, LagouItemLoader  # import the items container class and the ItemLoader class
import time


class LagouSpider(CrawlSpider):  # crawler class
    name = 'lagou'                            # crawler name
    allowed_domains = ['www.luyin.org']       # allowed domains
    start_urls = ['http://www.luyin.org/']    # start url
    custom_settings = {
        "AUTOTHROTTLE_ENABLED": True,         # per-spider settings that override the same settings in settings.py
        "DOWNLOAD_DELAY": 5
    }

    rules = (
        # rule for crawling list pages
        Rule(LinkExtractor(allow=('ggwa/.*')), follow=True),
        # rule for crawling content pages
        Rule(LinkExtractor(allow=('post/\d+.html.*')), callback='parse_job', follow=True),
    )

    def parse_job(self, response):
        # callback function; note: the CrawlSpider template already defines its own parse callback,
        # so we must not name our callback parse
        atime = time.localtime(time.time())   # get the current system time
        dqatime = "{0}-{1}-{2} {3}:{4}:{5}".format(
            atime.tm_year,
            atime.tm_mon,
            atime.tm_mday,
            atime.tm_hour,
            atime.tm_min,
            atime.tm_sec
        )  # take the date/time parts out separately and join them into a complete timestamp
        url = response.url

        item_loader = LagouItemLoader(LagouItem(), response=response)  # fill the data into LagouItem from items.py
        item_loader.add_xpath('title', '/html/head/title/text()')
        item_loader.add_xpath('description', '/html/head/meta[@name="Description"]/@content')
        item_loader.add_xpath('keywords', '/html/head/meta[@name="keywords"]/@content')
        item_loader.add_value('url', url)
        item_loader.add_value('riqi', dqatime)
        article_item = item_loader.load_item()
        yield article_item
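As an aside, the same timestamp string can be built in a single call with time.strftime; a minimal alternative sketch (the only difference is that the values come out zero-padded, e.g. 2018-03-05 09:04:05):

import time

dqatime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # e.g. '2018-03-05 09:04:05'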
items.py file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# items.py receives the data collected by the crawler; it acts as a container file

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader  # import the ItemLoader class used to fill the item container with data
from adc.models.elasticsearch_orm import lagouType  # import the elasticsearch operation module


class LagouItemLoader(ItemLoader):
    # custom loader that inherits from ItemLoader; the crawler calls it to fill data into the Item class.
    # ItemLoader returns each field as a list by default; TakeFirst() takes the first element of that list.
    default_output_processor = TakeFirst()


def tianjia(value):  # custom data-preprocessing function
    return value     # return the processed data to the Item


class LagouItem(scrapy.Item):  # container class for the data collected by the crawler
    title = scrapy.Field(
        # pass the preprocessing function to MapCompose; its value parameter
        # automatically receives the extracted title field
        input_processor=MapCompose(tianjia),
    )
    description = scrapy.Field()
    keywords = scrapy.Field()
    url = scrapy.Field()
    riqi = scrapy.Field()

    def save_to_es(self):
        lagou = lagouType()                     # instantiate the elasticsearch (search engine) document
        lagou.title = self['title']             # field name = value
        lagou.description = self['description']
        lagou.keywords = self['keywords']
        lagou.url = self['url']
        lagou.riqi = self['riqi']
        lagou.save()                            # write the data to elasticsearch (search engine)
        return
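For reference, a tiny standalone sketch (hypothetical values, not part of the project) of what MapCompose does: it applies each listed function to every value extracted for the field before the output processor runs.

from scrapy.loader.processors import MapCompose

def tianjia(value):           # same idea as the preprocessing function above
    return value.strip()      # e.g. strip surrounding whitespace

proc = MapCompose(tianjia)
print(proc(["  Python Engineer  ", " Scrapy "]))   # ['Python Engineer', 'Scrapy']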
pipelines.py file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from adc.models.elasticsearch_orm import lagouType  # import the elasticsearch operation module


class AdcPipeline(object):
    def process_item(self, item, spider):
        # The data could also be written to the elasticsearch search engine here,
        # which keeps the processing for all items in one place:
        # lagou = lagouType()
        # lagou.title = item['title']
        # lagou.description = item['description']
        # lagou.keywords = item['keywords']
        # lagou.url = item['url']
        # lagou.riqi = item['riqi']
        # lagou.save()
        item.save_to_es()  # call the save_to_es method from items.py to write the data to the elasticsearch search engine
        return item
settings.py file, register pipelines
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'adc.pipelines.AdcPipeline': 300,
}
main.py — crawler startup file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.cmdline import execute  # import the method that executes scrapy commands
import sys
import os

sys.path.append(os.path.join(os.getcwd()))  # add the directory containing main.py to the Python interpreter's path
execute(['scrapy', 'crawl', 'lagou', '--nolog'])  # execute the scrapy command without logging
# execute(['scrapy', 'crawl', 'lagou'])           # execute the scrapy command with logging
Run the crawler and the data is written to Elasticsearch (search engine).
Supplement: adding, deleting and updating data with elasticsearch-dsl
The elasticsearch_orm.py module configured above is reused for all of the following operations.
1. Add data
from adc.models.elasticsearch_orm import lagouType  # import the elasticsearch operation module configured above

lagou = lagouType()          # instantiate the elasticsearch (search engine) document
lagou._id = 1                # custom id; this is important, later operations look the document up by id
lagou.title = 'value'        # field name = value
lagou.description = 'value'
lagou.keywords = 'value'
lagou.url = 'value'
lagou.riqi = 'value'
lagou.save()                 # write the data to elasticsearch (search engine)
2. Delete the specified data
from adc.models.elasticsearch_orm import lagouType  # import the elasticsearch operation module configured above

sousuo_orm = lagouType()           # instantiate
sousuo_orm.get(id=1).delete()      # delete the document whose id equals 1
3. Update the specified data
from adc.models.elasticsearch_orm import lagouType  # import the elasticsearch operation module configured above

sousuo_orm = lagouType()                          # instantiate
sousuo_orm.get(id=1).update(title='123456789')    # update the document whose id equals 1
All of the above uses the elasticsearch-dsl module.
Note that the examples below use the native elasticsearch module instead.
Deleting a specified index is equivalent to deleting a specified database.
Delete the specified index using the native elasticsearch module:
from elasticsearch import Elasticsearch  # import the native elasticsearch (search engine) interface

client = Elasticsearch(hosts=settings.Elasticsearch_hosts)  # connect to native elasticsearch

# Delete the specified index using the native elasticsearch module.
# Wrap the call in try/except for fault tolerance: an error is raised if the index does not exist.
try:
    client.indices.delete(index='jxiou_zuopin')
except Exception as e:
    pass
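Alternatively, the native client can be told to ignore specific HTTP status codes instead of wrapping the call in try/except; a minimal sketch assuming the same client and index name as above and an elasticsearch-py version that supports the ignore parameter:

client.indices.delete(index='jxiou_zuopin', ignore=[400, 404])  # 404: index does not exist; 400: bad request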
Native query
from elasticsearch import Elasticsearch  # import the native elasticsearch (search engine) interface

client = Elasticsearch(hosts=Elasticsearch_hosts)  # connect to native elasticsearch

response = client.search(  # the native elasticsearch interface's search() method supports native elasticsearch queries
    index="jxiou_zuopin",   # index name
    doc_type="zuopin",      # doc type (table) name
    body={                  # the elasticsearch query statement
        "query": {
            "multi_match": {            # multi_match query
                "query": sousuoci,      # query keyword
                "fields": ["title"]     # fields to query
            }
        },
        "from": (page - 1) * tiaoshu,   # offset to start from
        "size": tiaoshu,                # number of results to return
        "highlight": {                  # highlight the query keyword
            "pre_tags": ['<span class="gaoliang">'],   # highlight start tag
            "post_tags": ['</span>'],                  # highlight end tag
            "fields": {                 # highlight settings
                "title": {}             # field to highlight
            }
        }
    }
)
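For comparison, roughly the same query can be written with the elasticsearch-dsl Search API instead of a raw body; a sketch assuming the same client, sousuoci, page and tiaoshu variables as above:

from elasticsearch_dsl import Search

s = Search(using=client, index="jxiou_zuopin") \
    .query("multi_match", query=sousuoci, fields=["title"]) \
    .highlight_options(pre_tags=['<span class="gaoliang">'], post_tags=['</span>']) \
    .highlight("title")
s = s[(page - 1) * tiaoshu:page * tiaoshu]   # from/size expressed as a slice
response = s.execute()
# note: execute() returns a Response object, so fields are read as attributes
# (hit.title) rather than dictionary keys as in the native-client loop below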
# Get the data
total_nums = response["hits"]["total"]  # total number of query results
hit_list = []                           # list that stores the search results to return to the html page

for hit in response["hits"]["hits"]:    # loop over the query results
    hit_dict = {}                       # dictionary to store one result
    if "title" in hit["highlight"]:     # if the title field is present in the highlight section
        hit_dict["title"] = "".join(hit["highlight"]["title"])  # get the highlighted title
    else:
        hit_dict["title"] = hit["_source"]["title"]             # otherwise get the non-highlighted title
    hit_dict["id"] = hit["_source"]["nid"]                      # get the returned nid
    # encrypted sample address
    hit_dict["yangsrc"] = jia_mi(str(hit["_source"]["yangsrc"]))  # get the returned yangsrc
    hit_list.append(hit_dict)
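If the results feed a paginated HTML page, the total number of pages can be derived from total_nums; a small sketch assuming tiaoshu is the page size used in the query above (in Elasticsearch 5.x/6.x "total" is an integer; in 7.x it becomes a dictionary and needs total["value"]):

import math

page_nums = int(math.ceil(total_nums / float(tiaoshu)))  # total number of result pages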