Crawling course videos from an online school (Jikexueyuan) with Python
How the videos are captured: collect all course category IDs from the directory page -> parse each category's sub-item links to count the lessons in that category -> loop over the lesson pages and download the video each one points to.
Required Python library: requests (install it with `pip install requests`).
The HTML/XML parsing in Python uses code found online.
The script could be optimized, but I was too lazy to do it!
# coding: UTF-8
"""Download course videos from jikexueyuan.com.

Workflow:
1. ``getIdList`` scans a directory page for course links and records, for
   every course ID, how many lesson pages the course has (``getCourseNum``).
2. The main section builds each lesson URL and calls ``getHtml``, which looks
   for ``<source src="...mp4">`` tags and hands them to ``downloadFile``.

Requests for HTML pages are sent with the cookies stored in the module-level
``cookies`` dict, which ``getCookies`` fills from a browser "Cookie" string.
"""
import io
import os
import sys
import urllib.request
from html.parser import HTMLParser

import requests

# global variables

id_list = set()  # IDs of the courses that have already been recorded
id_dict = {}  # course ID -> number of lesson pages found for that course
cookies = {}  # cookies sent with every HTML page request

# Every course / lesson page URL contains this prefix.
COURSE_URL_PREFIX = "http://www.jikexueyuan.com/course/"
# Lesson page URLs contain this suffix.
LESSON_URL_SUFFIX = ".html?ss=1"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Number of bytes written to disk per chunk while downloading a video.
DOWNLOAD_CHUNK_SIZE = 64 * 1024


class MyHTMLParser(HTMLParser):
    """Collect the values of one attribute on every occurrence of one tag.

    After ``feed()``, ``links`` holds the collected values in document order.
    """

    def __init__(self, key, attr):
        """Remember the tag name (``key``) and attribute name (``attr``)."""
        super().__init__()
        self.links = []
        self.keys = key
        self.attr = attr

    def handle_starttag(self, tag, attrs):
        """Record ``attr`` values of matching tags; called by HTMLParser."""
        if tag != self.keys:
            return
        for name, value in attrs:
            # Attributes written without a value are reported as None;
            # they can never be a usable link, so they are skipped.
            if name == self.attr and value is not None:
                self.links.append(value)


def getCookies(cookies_str):
    """Parse a ``"name=value; name2=value2"`` string into ``cookies``.

    Segments that are empty or contain no ``=`` are ignored, so a trailing
    ``;`` or a placeholder string does not raise.
    """
    for segment in cookies_str.split(";"):
        # Split on the first '=' only: cookie values may contain '='.
        name, separator, value = segment.strip().partition("=")
        if separator and name:
            cookies[name] = value


def _fetch_links(url, tag, attr):
    """Fetch ``url`` with the stored cookies; return ``attr`` of each ``tag``."""
    response = requests.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    parser = MyHTMLParser(tag, attr)
    # Undecodable bytes are replaced instead of aborting the whole crawl.
    parser.feed(response.content.decode("utf-8", errors="replace"))
    parser.close()
    return parser.links


def _course_id(link):
    """Return the text between the course URL prefix and ``.html``, or None.

    Exact prefix/suffix matching is used; ``str.lstrip``/``str.rstrip`` strip
    *character sets* and would corrupt IDs that contain letters.
    """
    _, found_prefix, tail = link.partition(COURSE_URL_PREFIX)
    if not found_prefix:
        return None
    course_id, found_suffix, _ = tail.partition(".html")
    if not found_suffix or not course_id:
        return None
    return course_id


def getHtml(url, key, value):
    """Download every ``.mp4`` referenced by a ``<source>`` tag on ``url``.

    ``key`` and ``value`` are passed through to ``downloadFile`` and become
    the saved file's name.
    """
    links = _fetch_links(url, "source", "src")
    print(links)
    for link in links:
        if ".mp4" in str(link):
            downloadFile(link, key, value)
        else:
            print("corresponding video not found")


# Get the number of courses
def getCourseNum(url):
    """Return the number of distinct lesson links found on the page ``url``."""
    lesson_urls = {
        link
        for link in map(str, _fetch_links(url, "a", "href"))
        if COURSE_URL_PREFIX in link and LESSON_URL_SUFFIX in link
    }
    return len(lesson_urls)


# Obtain all video IDs, according to the directory webpage
def getIdList(root):
    """Fill ``id_list`` and ``id_dict`` from the course links on ``root``."""
    for link in map(str, _fetch_links(root, "a", "href")):
        c_id = _course_id(link)
        if c_id is None or c_id in id_list:
            continue
        id_dict[c_id] = getCourseNum(link)
        print(c_id, id_dict[c_id])
        id_list.add(c_id)
    print(id_dict)


def downloadFile(url, key, value):
    """Save the file at ``url`` as ``<key>_<value>.mp4`` in the current dir.

    The body is streamed to disk in chunks so a large video is never held in
    memory as a whole. Nothing is written when the server answers with an
    HTTP error status.
    """
    file_name = str(key) + "_" + str(value) + ".mp4"
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        if not response.ok:
            print("download failed:", response.status_code, url)
            return
        with open(file_name, "wb") as video_file:
            for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                video_file.write(chunk)


def main():
    """Crawl the directory page and download every lesson video found."""
    count = 0
    # Cookies are needed to download the videos (e.g. during a free-access
    # period). Replace this placeholder with the "Cookie" header value
    # copied from a logged-in browser session.
    cookiesStr = "you can get through Google's browser"
    getCookies(cookiesStr)

    root = "http://ke.jikexueyuan.com/xilie/331?huodong=shequn_0307"
    getIdList(root)

    for key in id_dict:
        if id_dict[key] <= 0:
            print(id_dict[key], "No data")
            # Skip only this course; the remaining ones may still have data.
            continue
        for i in range(1, id_dict[key] + 1):
            url = COURSE_URL_PREFIX + key + "_" + str(i) + LESSON_URL_SUFFIX
            print("Download:")
            print(url)
            count += 1
            getHtml(url, key, i)
    print("Total number of videos:")
    print(count)


if __name__ == "__main__":
    main()
Possible improvement: the output is awkward to browse because the name of each video is not retrieved. You could fetch each video's name and create a folder per category, which would make the downloaded videos much easier to find and watch.
Cookies can be copied out of the browser and used directly. This means that if a user's browser login information is intercepted, an attacker can log in as that user and obtain useful information. So is this how hackers steal user information, by obtaining cookies? Interesting.