I wrote a small utility module for web crawlers; its contents (utils.py) are shown below.
# -*- coding: utf-8 -*-
# @Time : 2018/8/7 16:29
# @Author : cxa
# @File : utils.py
# @Software: PyCharm
"""Crawler helpers: HTTP requests, response parsing, hashing and date conversion.

Every public function is wrapped by ``decorator`` (from
``decorators.decorators``); callers should be prepared to receive ``None``
where a function could not produce a result.
"""
import datetime
import hashlib

from glom import glom
from retrying import retry

from config import headers
from decorators.decorators import decorator


# NOTE: ``@decorator`` must be the OUTER wrapper. It catches every exception,
# so if ``@retry`` were applied outside it, retry would never see a failure
# and would never retry.
@decorator
@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
def post_html(session, post_url: str, post_data: dict, headers=headers, timeout=30):
    """Send a POST request and return the response when the status is 200.

    :param session: session object exposing ``post(url=, headers=, data=, timeout=)``
    :param post_url: URL to post to
    :param post_data: form data to send
    :param headers: request headers; defaults to ``config.headers``
    :param timeout: timeout passed through to ``session.post``
    :return: the response with ``encoding`` set to its ``apparent_encoding``
        when ``status_code == 200``, otherwise ``None``
    """
    post_req = session.post(url=post_url, headers=headers, data=post_data, timeout=timeout)
    if post_req.status_code == 200:
        post_req.encoding = post_req.apparent_encoding
        return post_req
    return None


@decorator
@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
def get_response(session, url: str, headers=headers, timeout=30):
    """Send a GET request and return the response when the status is 200.

    :param session: session object exposing ``get(url=, headers=, timeout=)``
    :param url: URL to fetch
    :param headers: request headers; defaults to ``config.headers``
    :param timeout: timeout passed through to ``session.get``
    :return: the response with ``encoding`` set to its ``apparent_encoding``
        when ``status_code == 200``, otherwise ``None``
    """
    req = session.get(url=url, headers=headers, timeout=timeout)
    if req.status_code == 200:
        req.encoding = req.apparent_encoding
        return req
    return None


@decorator
def get_html(req):
    """Return the body text (``req.text``) of a response."""
    return req.text


@decorator
def get_json(req):
    """Return the decoded JSON payload (``req.json()``) of a response."""
    return req.json()


@decorator
def get_xpath(req, xpathstr: str):
    """Evaluate an XPath expression against ``req.html``.

    :param req: response exposing an ``html`` attribute with an ``xpath``
        method (presumably a requests-html response; confirm against callers)
    :param xpathstr: XPath expression
    :return: whatever ``req.html.xpath(xpathstr)`` returns
    """
    return req.html.xpath(xpathstr)


@decorator
def get_json_data(jsonstr: str, pat: str):
    """Extract a value from ``jsonstr`` using the glom spec ``pat``.

    :param jsonstr: target handed to ``glom``
        (NOTE(review): annotated ``str``, but callers likely pass the parsed
        JSON object returned by ``get_json``; confirm)
    :param pat: glom spec / path
    :return: result of ``glom(jsonstr, pat)``
    """
    return glom(jsonstr, pat)


@decorator
def get_hash_code(key):
    """Return the hexadecimal MD5 digest of ``key`` encoded as UTF-8."""
    return hashlib.md5(key.encode('utf-8')).hexdigest()


@decorator
def get_datetime_from_unix(unix_time):
    """Convert a Unix timestamp in seconds to a ``datetime``.

    :param unix_time: an ``int``, or any value ``int()`` accepts
    :return: result of ``datetime.datetime.fromtimestamp`` (naive, local time)
    """
    unix_time_value = unix_time
    if not isinstance(unix_time_value, int):
        unix_time_value = int(unix_time)
    return datetime.datetime.fromtimestamp(unix_time_value)
The following are the contents of the decorators file (decorators.py):
# -*- coding: utf-8 -*-
# @Time : 2018/03/28 15:35
# @Author : cxa
# @File : decorators.py
# @Software: PyCharm
"""Logging decorator that turns exceptions into log records."""
from functools import wraps
import traceback

from logger.log import get_logger


def decorator(func):
    """Wrap ``func`` so that any ``Exception`` it raises is logged, not propagated.

    :param func: callable to wrap
    :return: a wrapper that returns ``func``'s result, or ``None`` when
        ``func`` raised (the function name and full traceback are written
        through ``get_logger().error``)
    """

    @wraps(func)
    def log(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberate catch-all boundary: record the failure and fall
            # through so the caller receives None instead of an exception.
            get_logger().error("{} is error,here are details:{}".format(func.__name__, traceback.format_exc()))
            return None

    return log
The following is the contents of the headers file
"""Default request headers with a randomly generated Chrome User-Agent."""
import random

# Random pieces of the Chrome version string; drawn once at import time.
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)


class FakeChromeUA:
    """Builds a Chrome-style User-Agent string with a random OS token."""

    # Candidate platform tokens; one is picked per get_ua() call.
    os_type = [
        '(Windows NT 6.1; WOW64)',
        '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    ]

    # Fixed for the lifetime of the process (built from the module-level numbers).
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a space-joined User-Agent string using a random ``os_type`` entry."""
        return ' '.join([
            'Mozilla/5.0',
            random.choice(cls.os_type),
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            cls.chrome_version,
            'Safari/537.36',
        ])


# The User-Agent is computed once here, so it stays the same for every request
# that uses this dict within one process.
headers = {
    'User-Agent': FakeChromeUA.get_ua(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}
The following is the contents of the logger file
# -*- coding: utf-8 -*-
"""Shared logger that writes to a dated log file and to stdout."""
import logging
import os
import sys
import time

# Log directory: <parent of this file's directory>/Logs/<date>/.
# abspath() guards against a relative __file__, for which dirname() could
# otherwise return an empty string.
log_dir1 = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "Logs")
today = time.strftime('%y%m%d', time.localtime(time.time()))
full_path = os.path.join(log_dir1, today)
# exist_ok avoids the check-then-create race when several processes start together.
os.makedirs(full_path, exist_ok=True)
log_path = os.path.join(full_path, "T.log")


def get_logger():
    """Return the logger named "T", configuring it on first use.

    On the first call (when the logger has no handlers yet) a file handler
    for ``log_path`` and a stdout handler are attached, both using the same
    format, and the level is set to INFO. Later calls return the same logger
    without adding handlers again.

    :return: the configured ``logging.Logger``
    """
    logger = logging.getLogger("T")
    if not logger.handlers:
        # Output format shared by both handlers.
        formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')

        # File log.
        file_handler = logging.FileHandler(log_path, encoding="UTF8")
        file_handler.setFormatter(formatter)

        # Console log.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)

        logger.addHandler(file_handler)
        logger.addHandler(console_handler)

        # Lowest level that is emitted; an unconfigured logger would
        # otherwise fall back to WARNING.
        logger.setLevel(logging.INFO)
    return logger
A Python crawler tool class