Python web-page crawler example code

Source: Internet
Author: User
Tags: datetime

One: Web page structure analysis

Two: Hands-on code implementation

#!/usr/bin/env python2
# encoding=utf-8
"""Crawler that walks video listing pages and extracts .m3u8 stream URLs.

Reconstructed from a badly garbled copy/paste of a flat Python 2 script.
Third-party requirements: httplib2, lxml, MySQLdb, BeautifulSoup.
"""
import os
import re
import sys
import time
import hashlib
import urllib
import urllib2

import httplib2
import MySQLdb
from lxml import etree
from BeautifulSoup import BeautifulSoup
from datetime import datetime as dt, timedelta

# Py2 idiom; normally followed by sys.setdefaultencoding('utf-8') to avoid
# UnicodeDecodeError when mixing str/unicode.
reload(sys)

# Shared HTTP client with a 10-second request timeout.
h = httplib2.Http(timeout=10)

# Spoof a browser User-Agent so the target site serves normal pages.
headers = {
    'user-agent': 'mozilla/4.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0)',
}

# Regex matching an <a> tag, capturing its href value and link text.
pattern = '<a.*?href="(.+)".*?>(.*?)</a>'

# Daily log file under ./sporttery, e.g. ./sporttery/2015-01-01.log
log_path = './sporttery'
log_file = '%s.log' % dt.now().strftime('%Y-%m-%d')
if not os.path.exists(log_path):
    os.makedirs(log_path)
log = open('%s/%s' % (log_path, log_file), 'w+')

# MySQL connection; this throwaway cursor only forces utf8 for the session.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='test',
)
conn.set_character_set('utf8')
cur = conn.cursor()
cur.execute('set names utf8;')
cur.execute('set character set utf8;')
cur.execute('set character_set_connection=utf8;')
cur.close()


def download(url):
    """GET *url* and return the response body decoded as utf-8.

    Retries up to 5 times on any request error; returns None when all
    attempts fail.
    """
    fails = 0
    while True:
        if fails > 5:
            return None
        try:
            res, content = h.request(url, 'GET', headers=headers)
            return content.decode('utf-8', 'ignore')
        # Deliberately broad: any network/SSL/decode error triggers a retry.
        except Exception:
            print(u'Open link failed ' + url)
            fails += 1


def getmiddlestr(content, startstr, endstr):
    """Return the substring of *content* between *startstr* and *endstr*.

    Raises ValueError (from str.index) when either marker is missing.
    """
    startindex = content.index(startstr)
    # Always true: index() raises ValueError rather than returning -1;
    # kept to preserve the original control flow.
    if startindex >= 0:
        startindex += len(startstr)
    endindex = content.index(endstr)
    return content[startindex:endindex]


def get_ul(data):
    """Extract the listing <ul> section from a page's HTML."""
    # NOTE(review): marker strings reconstructed from a garbled source —
    # verify against the live page markup.
    mystring = getmiddlestr(data, '<ul class="Cenicon">', '<div class="clear hed"></div>')
    return mystring


def test_sporttery(i):
    """Crawl listing page *i*, following each item link to find its video URL."""
    url = ('http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_'
           + str(i) + '.html')
    print(url)
    # e.g. http://www.xxx.com/video/video_%e8%b6%b3%e7%90%83%e9%ad%94%e6%96%b9_2.html
    source = download(url)
    data = get_ul(source)
    datas = data.split('<li>')
    for each in datas:
        # Pull every href value (single- or double-quoted) out of the fragment.
        ret = re.findall(r'(?<=href=").+?(?=")|(?<=href=\').+?(?=\')', each)
        for urls in ret:
            detial = download(urls)
            if detial:
                # Isolate the flash-video bootstrap snippet embedding the stream URL.
                # NOTE(review): marker 'createFlashVideo' reconstructed from
                # garbled text ('CreateFlasHvideo') — confirm against the page JS.
                detial_content = getmiddlestr(detial, 'createFlashVideo', 'm3u8').replace(' ', '')
                if detial_content:
                    # Final stream URL, e.g. http://.../xxx.m3u8
                    end_url_rex = getmiddlestr(detial_content + '.m3u8', 'http://', '.m3u8') + '.m3u8'
                    # Title: the <h2> under div#playVideo.
                    sstree = etree.HTML(detial)
                    ssnodes = sstree.xpath('//*[@id="playVideo"]/div[1]/h2')
                    for ssn in ssnodes:
                        # NOTE(review): replacement target reconstructed
                        # ('/h2>' in the garbled source) — confirm.
                        name = ssn.text.strip().replace('</h2>', '')

Related Article

Contact Us

The content on this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of the page confuses you, please write us an email; we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.