Python Geek Academy (Jikexueyuan) Crawler V1

Source: Internet
Author: User
Tags: xpath

    1. A targeted crawler for Geek Academy (Jikexueyuan) videos. Downloads were originally restricted to annual-fee VIP members, but after a little analysis it turns out a free trial VIP account is enough to crawl every video.
    2. Basic techniques involved: Python, XPath, regular expressions, and COM.
    3. Python drives the Thunder (Xunlei) download manager through its COM component to create folders and add batch download tasks automatically; this requires Thunder and its COM component to be installed (a minimal standalone sketch of the COM call follows this list).
    4. Idea: crawl all tags from the learning-path page -> walk every search page of each course category -> open each course page to get the course details -> parse the video address out of the lesson page with a regular expression.
    5. Geek Academy keeps changing its site, so the script may need adjustments over time.
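
Because driving Thunder over COM is the least familiar part of item 3, here is a minimal standalone sketch of just that call, assuming Thunder and its COM component are installed and register the "ThunderAgent.Agent.1" class. The download URL, file name, and folder are placeholders; the trailing numeric arguments simply copy the ones used in addTasktoXunlei further down, and the parameter names in the comment follow the commonly circulated ThunderAgent interface description.

from win32com.client import Dispatch

# COM class registered by Thunder's ThunderAgent component
thunder = Dispatch("ThunderAgent.Agent.1")

# AddTask(url, file_name, save_path, comment, referer, start_mode, only_from_origin, origin_thread_count)
thunder.AddTask("http://example.com/demo.mp4",   # placeholder download address
                "demo.mp4",                      # file name to save as
                "f:\\jike\\demo\\",              # target folder (create it first)
                "", "", 1, 0, 5)                 # same trailing arguments as the crawler uses
thunder.CommitTasks()                            # hand the queued task over to Thunder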

import requests
from lxml import etree
import re
import sys, os, glob, time
import scrapy

# Python 2: force utf-8 as the default string encoding
reload(sys)
sys.setdefaultencoding("utf-8")

# base_url = "http://www.jikexueyuan.com/search/s/q_"
# base_path = "f:/jike/"
# The headers/cookies below have to be captured yourself (log in and copy them from the browser);
# otherwise only the free courses can be crawled.

headers = {
    "Host": "www.jikexueyuan.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Cookie": "ga=ga1.2.1700377703.1438173034; HmLVTf3c68d41bda15331608595c98e9c3915=1438173034; Mechatlvtime=1438179151498; Mechatckid=cookieval=006600143817303272961295; Stat.ssid=1438985023415; Statuuid=1438173038588973692017; connect.sid=s%3awt8iwwxkvz6zlhop7hpbg-vtxqtwias.qc1tyy4qv1bhomdn0utufsclkfncl4ny5zak1ss17kw; qingcloudelb=37e16e60f0cd051b754b0acf9bdfd4b5d562b81daa2a899c46d3a1e304c7eb2b|Vbjft|vbjft; Hmlpvtf3c68d41bda15331608595c98e9c3915=1438179151; Statisnew=0; Stat.fromweburl=; gat=1; Uname=jike76; uid=2992598; CODE=SMAPFI; authcode=d572tzivhfxnivnxcnf4vi5lv1tqlyeknag4m0mdqmvmrpa4vhdotjxosfo%2bevfvpzra8m1sekezxqlx9qrgs6nwhd5vmobbdpeqvj726i54tqmodo81p4olhq",
    "Connection": "keep-alive"
}
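
Before launching a full crawl it is worth confirming that the pasted cookie really logs you in. get_total_page below treats the presence of the string "userMenu" in a search page as the logged-in marker, so a quick manual check can reuse the same idea (the "python" tag in the URL is just an example):

# Quick cookie sanity check, reusing the "userMenu" marker the crawler looks for.
test_url = "http://www.jikexueyuan.com/search/s/q_python?pageNum=1"
test_html = requests.get(test_url, headers=headers).text
if "userMenu" in test_html:
    print("cookie looks valid, logged-in page returned")
else:
    print("please check the cookies")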

class jike_auto_down:
    base_path = ""
    base_url = ""
    course_tag = ""
    course_id = ""

    def __init__(self, base_path, base_url):
        if base_path and base_url:
            self.base_path = base_path
            self.base_url = base_url
            self.get_tags()
        else:
            print("base_path and base_url are both required!")
            return

    def run(self):
        self.get_tags()
get_tags: fetch all course tags (categories) from the learning-path page.
    def get_tags(self):
        url = "http://www.jikexueyuan.com/path/"
        tag_html = requests.get(url).text.decode("utf-8").encode("GB18030")
        tag_etree = etree.HTML(tag_html)
        tag_lists = [str(tag).rstrip("/")[str(tag).rstrip("/").rindex("/") + 1:]
                     for tag in tag_etree.xpath('/html/body/div[1]/div[4]/div/div[3]/div/a/@href') if tag]
        if tag_lists:
            for tag in tag_lists:
                print(tag)
                self.course_tag = tag
                self.get_total_page(tag)
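
The list comprehension above just drops a trailing slash and keeps whatever follows the last remaining "/", turning a category link into the short tag that later goes into the search URL. A small illustration with a made-up href of the same shape:

# Hypothetical href shaped like the links on the /path/ page; only the last segment is kept.
href = "http://www.jikexueyuan.com/course/python/"
tag = href.rstrip("/")[href.rstrip("/").rindex("/") + 1:]
print(tag)  # -> python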
get_total_page: walk all search pages for a tag. The course pagination is generated by JavaScript and awkward to crawl directly, so the script brute-forces page numbers 1-49 and stops at the first page that comes back with the "no-search" marker.
    def get_total_page(self, tag):
        if tag:
            for page in range(1, 50):
                page_url = self.base_url + tag + "?pageNum=%d" % page
                # print(page_url)
                page_html = requests.get(page_url, headers=headers).text.decode("utf-8").encode("GB18030")
                # print(page_html)
                no_userMenu = re.search(r"userMenu", page_html, re.S)
                if no_userMenu is None:
                    print("please check the cookies")
                    return
                no_search = re.search(r"no-search", page_html, re.S)
                if no_search:
                    print("the tag :%s,%d is biggest page" % (tag, page - 1))
                    # return page_url_lists
                    break
                else:
                    # page_url_lists.append(page_url)
                    self.get_course_pages(page_url)
                    # print(page_url)
get_course_pages: collect the course detail page links from one search result page.
    def get_course_pages(self, tag_url):
        if tag_url:
            print("the tag_url:%s " % tag_url)
            course_page_lists = self.get_xpath_lists(tag_url, headers,
                                                     '//*[@id="changeid"]/ul/li/div/div[2]/h5/a/@href')
            if course_page_lists:
                for course_page_url in course_page_lists:
                    self.get_down_urls(course_page_url)
get_down_urls: pull the real video addresses out of each lesson page with a regular expression and hand them to Thunder (see the small regex illustration after the method).
    def get_down_urls(self, course_page_url):
        if course_page_url:
            self.course_id = course_page_url[course_page_url.rindex("/") + 1:course_page_url.rindex(".")]
            # print(course_page_url)
            print("course_id:%s %s" % (self.course_id, course_page_url))
            course_down_lists = self.get_xpath_lists(course_page_url, headers,
                                                     '//*[@class="video-list"]/div[2]/ul/li/div/h2/a/@href')
            if course_down_lists:
                for course_down_url in course_down_lists:
                    course_down_html = requests.get(course_down_url,
                                                    headers=headers).text.decode("utf-8").encode("GB18030")
                    course_down = re.findall(r'source src="(.*?)"', course_down_html, re.S)
                    if course_down:
                        print("%s" % course_down[0])
                        if self.addTasktoXunlei(course_down[0]):
                            # print("%s is add success!" % course_down[0])
                            print("is add success!")
                            time.sleep(5)
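
The regular expression in get_down_urls simply grabs whatever sits in the src attribute of the lesson page's <source> tag. A tiny self-contained illustration, using the sample mp4 address quoted in the comment inside addTasktoXunlei (the surrounding HTML here is made up):

# Made-up fragment of a lesson page; the real pages embed the video address the same way.
sample_html = ('<video><source src="http://cv3.jikexueyuan.com/201508011650/'
               'a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/'
               'c776b_01_h264_sd_960_540.mp4" type="video/mp4"></video>')
matches = re.findall(r'source src="(.*?)"', sample_html, re.S)
print(matches[0])  # -> the .mp4 download address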
get_file_lists: create the folder hierarchy (base path / course tag / course id) that a course's videos are downloaded into, and return that path.
    def get_file_lists(self, course_tag, course_id):
        course_path = ""
        if self.base_path and os.path.exists(self.base_path) == False:
            try:
                os.mkdir(self.base_path)
            except Exception as e:
                print("error:%s" % e.message)
                return
        if course_tag and os.path.exists(self.base_path + course_tag) == False:
            try:
                os.mkdir(self.base_path + course_tag)
                # print("%s dir is create success!" % (self.base_path + course_tag))
            except Exception as e:
                print("dir is create error,the error is %s" % e.message)
        tmp = self.base_path + course_tag + "\\" + str(course_id)
        if course_id and os.path.exists(tmp) == False:
            try:
                os.mkdir(tmp)
                course_path = tmp
                # print("%s dir is create success!" % tmp)
            except Exception as e:
                print("dir is create error,the error is %s" % e.message)
                return
        else:
            course_path = tmp
        return course_path
get_xpath_lists: a small helper that fetches a URL and evaluates an XPath against it, so the request/parse boilerplate does not have to be repeated in every method.
    def get_xpath_lists(self, url, headers, xpath):
        try:
            html = requests.get(url, headers=headers).text.decode("utf-8").encode("GB18030")
            tree = etree.HTML(html)
            lists = [str(plist) for plist in tree.xpath(xpath) if plist]
        except Exception as e:
            print("get xpath list is error is :%s" % e.message)
            return
        return lists
addTasktoXunlei: push a download task into Thunder (Xunlei). Thunder must be installed, and its "new task" confirmation dialog should be turned off in Thunder's settings; otherwise every queued task has to be confirmed by hand.
    def addTasktoXunlei(self, down_url):
        flag = False
        from win32com.client import Dispatch
        o = Dispatch("ThunderAgent.Agent.1")
        # example video address:
        # http://cv3.jikexueyuan.com/201508011650/a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4
        if down_url:
            course_infos = str(down_url).replace(" ", "").replace("http://", "").split("/")
            course_path = self.get_file_lists(self.course_tag, self.course_id)
            try:
                o.AddTask(down_url, course_infos[len(course_infos) - 1], course_path, "",
                          "http://cv3.jikexueyuan.com", 1, 0, 5)
                o.CommitTasks()
                flag = True
            except Exception as e:
                print(e.message)
                print("AddTask is fail!")
        return flag


if __name__ == "__main__":
    myjike = jike_auto_down("f:\\jike\\", "http://www.jikexueyuan.com/search/s/q_")
    myjike.run()
