Part One: Analyzing the web page structure
Part Two: Putting the code into practice
#!/usr/bin/env python2
# encoding=utf-8
# Scraper setup. Third-party deps: httplib2, MySQLdb, lxml, BeautifulSoup.
# NOTE(review): this is Python 2 code (reload(sys), urllib2, print statement
# elsewhere in the file) — do not run under Python 3 without porting.

import hashlib
import os
import re
import sys
import time
import urllib
import urllib2
from datetime import datetime as dt, timedelta

import httplib2
import MySQLdb
from BeautifulSoup import BeautifulSoup
from lxml import etree

# Python 2 idiom to allow sys.setdefaultencoding for UTF-8 pages.
reload(sys)
sys.setdefaultencoding('utf-8')

# HTTP client with a 10-second timeout; spoof a browser User-Agent so the
# site does not reject the crawler.
h = httplib2.Http(timeout=10)
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}

# Regex matching <a> tags: group 1 = href value, group 2 = link text.
pattern = r'<a.*?href="(.+?)".*?>(.*?)</a>'

# One log file per day under ./sporttery/, e.g. ./sporttery/2016-01-01.log.
log_path = './sporttery'
log_file = '%s.log' % dt.now().strftime('%Y-%m-%d')
if not os.path.exists(log_path):
    os.makedirs(log_path)
log = open('%s/%s' % (log_path, log_file), 'w+')

# MySQL connection, forced to UTF-8 end to end.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='test',
)
conn.set_character_set('utf8')
cur = conn.cursor()
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
cur.close()


def download(url):
    """Fetch *url* with a GET request and return its body decoded as UTF-8.

    Retries on any request error; gives up and returns None after more
    than five consecutive failures.
    """
    fails = 0
    while True:
        if fails > 5:
            return None
        try:
            res, content = h.request(url, 'GET', headers=headers)
            return content.decode('utf-8', 'ignore')
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed by the retry loop.
        except Exception:
            print(u'Open link failed: ' + url)
            fails += 1


def getmiddlestr(content, startstr, endstr):
    """Return the substring of *content* between *startstr* and *endstr*.

    Raises ValueError (from str.index) when either marker is missing.
    """
    startindex = content.index(startstr) + len(startstr)
    # Fix: search for endstr AFTER the start marker. The original searched
    # from position 0, which produced an inverted/empty slice whenever
    # endstr also occurred earlier in the page. (The original's
    # `if startindex >= 0` guard was dead code: str.index raises on a miss
    # and never returns a negative value.)
    endindex = content.index(endstr, startindex)
    return content[startindex:endindex]


def get_ul(data):
    """Extract the <ul class="cenicon"> video-list section from a page."""
    return getmiddlestr(data, '<ul class="cenicon">', '<div class="clear hed"></div>')
url= ' http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_ ' +str (i) + '. HTML ' print url #http://www.xxx.com/video/video_%e8%b6%b3% e7%90%83%e9%ad%94%e6%96%b9_2.html source=download (URL) data=get_ UL (source) datas=data.split (' <li> ') for each in datas: ret=re.findall (R "(? <=href=\"). +? (? =\")| (? <=href=\ '). +? (? =\ ') ,each for urls in ret: detial=download (URLs) if detial: detial_content=getmiddlestr (detial, ' CreateFlasHvideo ', ' m3u8 '). Replace (' ', ') if detial_content: end_url_rex=getmiddlestr (detial_content+). m3u8 ", ' http://', '. m3u8 ') +" m3u8 " #最终的url #title sstree = etree. HTML (detial) ssnodes = sstree.xpath ('//*[@id = ' playvideo ']/div[1]/h2 ') for ssn in Ssnodes: name= ssn.text.strip (), replace ('/h2> ', ') # Title=getmiddlestr (detial, '