# Targeted crawler for Geek Academy (jikexueyuan.com) videos. Originally only
# annual-fee VIP accounts could download courses; after analysis it turns out
# a free trial VIP is enough to crawl every video.
# Techniques involved: Python, XPath, regular expressions, COM automation.
# Thunder (Xunlei) is driven from Python through its COM component to create
# folders and batch-add download tasks automatically; this requires Thunder
# and its COM component to be installed.
# Approach: crawl all tags from the /path/ page -> the search page lists every
# course under a tag -> each course page yields the course details -> a regex
# extracts the video address.
# Geek Academy keeps changing its site, so this script may need maintenance.
import requests
from lxml import etree
import re
import sys, os, glob, time
import scrapy

# Python 2 only: re-expose sys.setdefaultencoding (hidden by site.py) and
# switch the default codec so the unicode/str mixing below does not raise
# UnicodeDecodeError. This has no Python 3 equivalent.
reload(sys)
sys.setdefaultencoding("utf-8")
# base_url = "http://www.jikexueyuan.com/search/s/q_"
# base_path = "f:/jike/"
# NOTE: the Cookie below must be replaced with your own logged-in session,
# otherwise only the free courses can be crawled.
headers = {
    "Host": "www.jikexueyuan.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    # Session cookie reconstructed from a garbled paste — stale and almost
    # certainly expired; supply your own (see NOTE above).
    "Cookie": "_ga=GA1.2.1700377703.1438173034; Hm_lvt_f3c68d41bda15331608595c98e9c3915=1438173034; mechatLvTime=1438179151498; mechatCkId=cookieval=006600143817303272961295; stat_ssid=1438985023415; stat_uuid=1438173038588973692017; connect.sid=s%3AWt8iWWxKVz6ZLhOP7HPBG-VTXqTwias.qc1tyy4qv1bhomdn0utufsclkfncl4ny5zak1ss17kw; QINGCLOUDELB=37e16e60f0cd051b754b0acf9bdfd4b5d562b81daa2a899c46d3a1e304c7eb2b; Hm_lpvt_f3c68d41bda15331608595c98e9c3915=1438179151; stat_isnew=0; stat_fromWebUrl=; _gat=1; uname=jike76; uid=2992598; code=SMAPFI; authcode=d572tzivhfxnivnxcnf4vi5lv1tqlyeknag4m0mdqmvmrpa4vhdotjxosfo%2bevfvpzra8m1sekezxqlx9qrgs6nwhd5vmobbdpeqvj726i54tqmodo81p4olhq",
    "Connection": "keep-alive",
}
class jike_auto_down:
    """Crawl jikexueyuan.com course videos and queue them for download
    in Thunder (Xunlei) through its ThunderAgent COM component.

    Flow: get_tags -> get_total_page -> get_course_pages -> get_down_urls
    -> addTasktoXunlei (which creates the target folder via get_file_lists).
    """

    # Class-level defaults; overwritten per instance in __init__ / while crawling.
    base_path = ""
    base_url = ""
    course_tag = ""
    course_id = ""

    def __init__(self, base_path, base_url):
        """Store configuration only.

        Fix: the original also called self.get_tags() here, which started the
        crawl a second time when __main__ then invoked run(). Construction is
        now side-effect free; call run() to start crawling.
        """
        if base_path and base_url:
            self.base_path = base_path
            self.base_url = base_url
        else:
            print("base_path and base_url is all must needed!")

    def run(self):
        """Entry point: kick off the crawl over every tag."""
        self.get_tags()

    def get_tags(self):
        """Fetch the /path/ page and iterate every course tag found there."""
        url = "http://www.jikexueyuan.com/path/"
        # NOTE(review): decode/encode round-trip to GB18030 is Python 2 str
        # handling inherited from the original; the regex/xpath work on the
        # re-encoded bytes.
        tag_html = requests.get(url).text.decode("utf-8").encode("GB18030")
        tag_etree = etree.HTML(tag_html)
        # Each href ends in ".../<tag>/"; keep only the last path segment.
        tag_lists = [str(tag).rstrip("/")[str(tag).rstrip("/").rindex("/") + 1:]
                     for tag in tag_etree.xpath('/html/body/div[1]/div[4]/div/div[3]/div/a/@href')
                     if tag]
        for tag in tag_lists:
            print(tag)
            self.course_tag = tag
            self.get_total_page(tag)

    def get_total_page(self, tag):
        """Walk a tag's result pages.

        Pagination is generated by JS and cannot be scraped directly, so this
        brute-forces pageNum=1..49 and stops at the first "no-search" page.
        """
        if not tag:
            return
        for page in range(1, 50):
            page_url = self.base_url + tag + "?pageNum=%d" % page
            page_html = requests.get(page_url, headers=headers).text.decode("utf-8").encode("GB18030")
            # "userMenu" only appears when logged in; without it the cookie is bad.
            if re.search(r"userMenu", page_html, re.S) is None:
                print("please check the cookies")
                return
            if re.search(r"no-search", page_html, re.S):
                # Previous page was the last real one.
                print("the tag ;%s,%d is biggest page" % (tag, page - 1))
                break
            self.get_course_pages(page_url)

    def get_course_pages(self, tag_url):
        """Collect every course-detail URL from one search-result page."""
        if not tag_url:
            return
        print("the tag_url:%s " % tag_url)
        course_page_lists = self.get_xpath_lists(
            tag_url, headers, '//*[@id="changeid"]/ul/li/div/div[2]/h5/a/@href')
        if course_page_lists:
            for course_page_url in course_page_lists:
                self.get_down_urls(course_page_url)

    def get_down_urls(self, course_page_url):
        """Extract video URLs from a course page via regex and queue them.

        The course id is the last path segment of the course URL (without the
        extension); it becomes the per-course download folder name.
        """
        if not course_page_url:
            return
        self.course_id = course_page_url[
            course_page_url.rindex("/") + 1:course_page_url.rindex(".")]
        print("course_id:%s %s" % (self.course_id, course_page_url))
        course_down_lists = self.get_xpath_lists(
            course_page_url, headers,
            '//*[@class="video-list"]/div[2]/ul/li/div/h2/a/@href')
        if not course_down_lists:
            return
        for course_down_url in course_down_lists:
            course_down_html = requests.get(
                course_down_url, headers=headers).text.decode("utf-8").encode("GB18030")
            # The mp4 address sits in a <source src="..."> attribute.
            course_down = re.findall(r'source src="(.*?)"', course_down_html, re.S)
            if course_down:
                print("%s" % course_down[0])
                if self.addTasktoXunlei(course_down[0]):
                    print("is add success!")
                # Throttle so Thunder (and the site) can keep up.
                time.sleep(5)

    def get_file_lists(self, course_tag, course_id):
        """Create base_path/course_tag/course_id directories as needed.

        Returns the course directory path, or "" when creation failed.
        Fix: the original read Exception.message on the Exception *class*
        (broken even in Python 2); errors are now bound with `as e`.
        """
        course_path = ""
        if self.base_path and not os.path.exists(self.base_path):
            try:
                os.mkdir(self.base_path)
            except Exception as e:
                print("error:%s" % e)
                return course_path
        if course_tag and not os.path.exists(self.base_path + course_tag):
            try:
                os.mkdir(self.base_path + course_tag)
            except Exception as e:
                print("dir is create error,the error is %s" % e)
        tmp = self.base_path + course_tag + "\\" + str(course_id)
        if course_id and not os.path.exists(tmp):
            try:
                os.mkdir(tmp)
                course_path = tmp
            except Exception as e:
                print("dir is create error,the error is %s" % e)
                return course_path
        else:
            course_path = tmp
        return course_path

    def get_xpath_lists(self, url, headers, xpath):
        """Fetch url and return the xpath matches as a list of str.

        Shared helper so each caller does not repeat the fetch/parse dance.
        Returns None when the request or the parse fails.
        """
        try:
            html = requests.get(url, headers=headers).text.decode("utf-8").encode("GB18030")
            tree = etree.HTML(html)
            lists = [str(plist) for plist in tree.xpath(xpath) if plist]
        except Exception as e:
            print("get xpath list is error is :%s" % e)
            return
        return lists

    def addTasktoXunlei(self, down_url):
        """Add one download task to Thunder via COM; return True on success.

        Requires Thunder plus its ThunderAgent COM component installed, and
        Thunder configured not to prompt per task (otherwise each task needs
        a manual OK click).
        """
        flag = False
        # Imported lazily: win32com only exists on Windows with pywin32.
        from win32com.client import Dispatch
        o = Dispatch("ThunderAgent.Agent.1")
        if down_url:
            # e.g. http://cv3.jikexueyuan.com/.../c776b_01_h264_sd_960_540.mp4
            course_infos = str(down_url).replace(" ", "").replace("http://", "").split("/")
            course_path = self.get_file_lists(self.course_tag, self.course_id)
            try:
                # AddTask(url, filename, save_path, comment, referer, start_mode, only_from_origin, origin_thread_count)
                o.AddTask(down_url, course_infos[-1], course_path, "",
                          "http://cv3.jikexueyuan.com", 1, 0, 5)
                o.CommitTasks()
                flag = True
            except Exception as e:
                print(e)
                print(" AddTask is fail!")
        return flag


if __name__ == "__main__":
    myjike = jike_auto_down("f:\\jike\\", "http://www.jikexueyuan.com/search/s/q_")
    myjike.run()
# Python Geek Academy (jikexueyuan) crawler, v1