A Python crawler tool class

Source: Internet
Author: User

I wrote a set of crawler utility functions; the code for each file follows.
# -*- coding: utf-8 -*-
# @Time    : 2018/8/7 16:29
# @Author  : cxa
# @File    : utils.py
# @Software: PyCharm
from retrying import retry
from decorators.decorators import decorator
from glom import glom
from config import headers
import datetime
import hashlib


@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
@decorator
def post_html(session, post_url: str, post_data: dict, headers=headers, timeout=30):
    """
    :param session: the incoming Session object
    :param post_url: the URL the POST request is sent to
    :param headers: header information, provided by the config module by default
    :param post_data: the POST payload, a dict
    :param timeout:
    :return: the response object
    """
    post_req = session.post(url=post_url, headers=headers, data=post_data, timeout=timeout)
    if post_req.status_code == 200:
        post_req.encoding = post_req.apparent_encoding
        return post_req


@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
@decorator
def get_response(session, url: str, headers=headers, timeout=30):
    """
    :param url:
    :return: the response object
    """
    req = session.get(url=url, headers=headers, timeout=timeout)
    if req.status_code == 200:
        req.encoding = req.apparent_encoding
        return req


@decorator
def get_html(req):
    source = req.text
    return source


@decorator
def get_json(req):
    jsonstr = req.json()
    return jsonstr


@decorator
def get_xpath(req, xpathstr: str):
    """
    :param req:
    :param xpathstr:
    :return:
    """
    node = req.html.xpath(xpathstr)
    return node


@decorator
def get_json_data(jsonstr, pat: str):
    """
    Extract data with the glom module.
    :param jsonstr:
    :param pat:
    :return:
    """
    item = glom(jsonstr, pat)
    return item


@decorator
def get_hash_code(key):
    value = hashlib.md5(key.encode('utf-8')).hexdigest()
    return value


@decorator
def get_datetime_from_unix(unix_time):
    unix_time_value = unix_time
    if not isinstance(unix_time_value, int):
        unix_time_value = int(unix_time)
    new_datetime = datetime.datetime.fromtimestamp(unix_time_value)
    return new_datetime
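A minimal usage sketch, not part of the original post: it assumes the file above is importable as utils, that the requests library is installed, and that the URL below is only a placeholder. Because the decorator swallows exceptions and returns None on failure, the result is checked before use.

# usage sketch (assumptions: utils.py is on the path, requests is installed, placeholder URL)
import requests
import utils

session = requests.Session()
resp = utils.get_response(session, 'https://httpbin.org/get')  # retried up to 3 times on failure
if resp is not None:
    print(utils.get_json(resp))                      # parse the response body as JSON
    print(utils.get_hash_code('some-key'))           # MD5 hex digest of the key
    print(utils.get_datetime_from_unix(1533630540))  # unix timestamp -> datetime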
The following is the contents of the decorators file.
# -*- coding: utf-8 -*-
# @Time    : 2018/03/28 15:35
# @Author  : cxa
# @File    : decorators.py
# @Software: PyCharm
from functools import wraps
from logger.log import get_logger
import traceback


def decorator(func):
    @wraps(func)
    def log(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            get_logger().error("{} is error,here are details:{}".format(func.__name__, traceback.format_exc()))
    return log
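A small illustration of what the decorator does, assuming decorators.py and the logger package below are on the import path: a wrapped call that raises is not re-raised; the traceback is logged and None is returned instead.

# hedged sketch: divide() is a made-up example function, not from the original code
from decorators.decorators import decorator

@decorator
def divide(a, b):
    return a / b

print(divide(10, 2))  # 5.0
print(divide(10, 0))  # logs the ZeroDivisionError traceback, prints None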
The following is the contents of the headers file
import random

first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)


class FakeChromeUA:
    os_type = [
        '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)'
    ]
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        return ' '.join(['Mozilla/5.0', random.choice(cls.os_type), 'AppleWebKit/537.36',
                         '(KHTML, like Gecko)', cls.chrome_version, 'Safari/537.36'])


headers = {
    'User-Agent': FakeChromeUA.get_ua(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive'
}
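A quick check of the generated User-Agent; since utils.py imports it as "from config import headers", this file is presumably saved as config.py (an assumption, the original post does not state the filename).

# usage sketch (assumption: the headers file above is saved as config.py)
from config import FakeChromeUA, headers

print(FakeChromeUA.get_ua())
# e.g. Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.1234.56 Safari/537.36
print(headers['User-Agent'])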
The following is the contents of the logger file
# -*- coding: utf-8 -*-
import os
import time
import logging
import sys

log_dir1 = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
today = time.strftime('%Y%m%d', time.localtime(time.time()))
full_path = os.path.join(log_dir1, today)
if not os.path.exists(full_path):
    os.makedirs(full_path)
log_path = os.path.join(full_path, "t.log")


def get_logger():
    # Get the logger instance; the root logger is returned if the name is empty
    logger = logging.getLogger("t")
    if not logger.handlers:
        # Specify the logger output format
        formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
        # File log
        file_handler = logging.FileHandler(log_path, encoding="utf8")
        file_handler.setFormatter(formatter)  # the output format can be set via setFormatter
        # Console log
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.formatter = formatter  # formatter can also be assigned directly
        # Add both handlers to the logger
        logger.addHandler(file_handler)
        logger.addHandler(console_handler)
        # Specify the lowest level that will be emitted (the default is WARNING)
        logger.setLevel(logging.INFO)
    # Remove the handlers here after logging if needed
    return logger
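A minimal sketch of using the logger, assuming the file above is saved as logger/log.py to match the import in decorators.py; the message strings are placeholders.

# usage sketch (assumption: this module lives at logger/log.py)
from logger.log import get_logger

log = get_logger()
log.info("crawler started")        # written to both the console and logs/<date>/t.log
log.error("something went wrong")  # same handlers, higher level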
