Python Jikexueyuan (Geek College) crawler V1

Source: Internet
Author: User


import requests
from lxml import etree
import re
import sys, os, glob, time
import scrapy  # never actually used below

# Python 2 only: reload(sys) re-exposes setdefaultencoding so the
# .decode/.encode chains below work on plain strings
reload(sys)
sys.setdefaultencoding("utf-8")

# base_url = "http://www.jikexueyuan.com/search/s/q_"
# base_path = "f:/jike/"
# The cookie in headers must be captured by yourself; otherwise only free courses can be crawled.

headers = {
    "Host": "www.jikexueyuan.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    # Paste your own logged-in cookie here; the value in the original post was session-specific.
    "Cookie": "<your cookie here>",
    "Connection": "keep-alive",
}
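Before starting a full crawl it is worth checking that the cookie is still valid. A minimal sketch (the helper name cookie_ok is mine, not from the original; it reuses the "userMenu" marker that get_total_page below also checks for):

def cookie_ok():
    # "userMenu" only appears in the page when the cookie represents a logged-in session
    html = requests.get("http://www.jikexueyuan.com/search/s/q_python?pageNum=1", headers=headers).text
    return "userMenu" in html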

class jike_auto_down:
    base_path = ""
    base_url = ""
    course_tag = ""
    course_id = ""

    def __init__(self, base_path, base_url):
        if base_path and base_url:
            self.base_path = base_path
            self.base_url = base_url
        else:
            print("base_path and base_url are both required!")
            return

    def run(self):
        self.get_tags()
get_tags collects all course tags.
    def get_tags(self):
        url = "http://www.jikexueyuan.com/path/"
        tag_html = requests.get(url).text.decode("utf-8").encode("GB18030")
        tag_etree = etree.HTML(tag_html)
        # keep the last path segment of every tag link, e.g. ".../path/python/" -> "python"
        tag_lists = [str(tag).rstrip("/")[str(tag).rstrip("/").rindex("/") + 1:]
                     for tag in tag_etree.xpath('/html/body/div[1]/div[4]/div/div[3]/div/a/@href')
                     if tag]
        if tag_lists:
            for tag in tag_lists:
                print(tag)
                self.course_tag = tag
                self.get_total_page(tag)
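To see what the slicing in get_tags extracts, here is a tiny standalone example (the tag URL is illustrative, shaped like the links on the /path/ page):

tag = "http://www.jikexueyuan.com/path/python/"
trimmed = tag.rstrip("/")                  # "http://www.jikexueyuan.com/path/python"
print(trimmed[trimmed.rindex("/") + 1:])   # prints "python"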
get_total_page finds how many listing pages a tag has. The page count is rendered by JavaScript and cannot be scraped directly, so brute force is used: request pageNum=1, 2, ... until the page shows the "no-search" marker.
    def get_total_page(self, tag):
        if tag:
            for page in range(1, 50):
                page_url = self.base_url + tag + "?pageNum=%d" % page
                page_html = requests.get(page_url, headers=headers).text.decode("utf-8").encode("GB18030")
                # "userMenu" only appears when logged in; without it the cookie is bad
                no_userMenu = re.search(r"userMenu", page_html, re.S)
                if no_userMenu is None:
                    print("please check the cookies")
                    return
                no_search = re.search(r"no-search", page_html, re.S)
                if no_search:
                    print("the tag %s: %d is the biggest page" % (tag, page - 1))
                    break
                else:
                    self.get_course_pages(page_url)
get_course_pages collects the course detail pages from one listing page.
    def get_course_pages(self, tag_url):
        if tag_url:
            print("the tag_url:%s " % tag_url)
            course_page_lists = self.get_xpath_lists(
                tag_url, headers,
                '//*[@id="changeid"]/ul/li/div/div[2]/h5/a/@href')
            if course_page_lists:
                for course_page_url in course_page_lists:
                    self.get_down_urls(course_page_url)
get_down_urls extracts the actual video URLs from a course page with a regular expression.
    def get_down_urls(self, course_page_url):
        if course_page_url:
            # e.g. ".../course/776.html" -> "776"
            self.course_id = course_page_url[course_page_url.rindex("/") + 1:course_page_url.rindex(".")]
            print("             course_id:%s %s" % (self.course_id, course_page_url))
            course_down_lists = self.get_xpath_lists(
                course_page_url, headers,
                '//*[@class="video-list"]/div[2]/ul/li/div/h2/a/@href')
            if course_down_lists:
                for course_down_url in course_down_lists:
                    course_down_html = requests.get(course_down_url, headers=headers).text.decode("utf-8").encode("GB18030")
                    course_down = re.findall(r'source src="(.*?)"', course_down_html, re.S)
                    if course_down:
                        print("                     %s" % course_down[0])
                        if self.addTasktoXunlei(course_down[0]):
                            print("                     is add success!")
                            time.sleep(5)
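The regular expression simply grabs the src attribute of the HTML5 <source> tag on the play page. A quick illustration, using the sample video URL quoted in addTasktoXunlei below:

import re
sample = '<source src="http://cv3.jikexueyuan.com/201508011650/a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4" type="video/mp4">'
print(re.findall(r'source src="(.*?)"', sample, re.S)[0])
# prints the .mp4 URL captured between the quotes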
get_file_lists creates the folder hierarchy (base path / tag / course id) for a course and returns the course path.
    def get_file_lists(self, course_tag, course_id):
        course_path = ""
        if self.base_path and not os.path.exists(self.base_path):
            try:
                os.mkdir(self.base_path)
            except Exception as e:
                print("error :%s" % e)
                return
        if course_tag and not os.path.exists(self.base_path + course_tag):
            try:
                os.mkdir(self.base_path + course_tag)
            except Exception as e:
                print("failed to create dir: %s" % e)
        tmp = self.base_path + course_tag + "\\" + str(course_id)
        if course_id and not os.path.exists(tmp):
            try:
                os.mkdir(tmp)
                course_path = tmp
            except Exception as e:
                print("failed to create dir: %s" % e)
                return
        else:
            course_path = tmp
        return course_path
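The three mkdir steps could also be collapsed into one call. A minimal sketch, assuming the same base_path/tag/course_id layout (ensure_course_path is a hypothetical helper, not part of the original):

import os

def ensure_course_path(base_path, course_tag, course_id):
    # os.makedirs creates any missing intermediate directories in one call
    course_path = os.path.join(base_path, course_tag, str(course_id))
    if not os.path.exists(course_path):
        os.makedirs(course_path)
    return course_path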
get_xpath_lists wraps the fetch-and-XPath boilerplate so it does not have to be rewritten for every page.
    def get_xpath_lists(self, url, headers, xpath):
        try:
            html = requests.get(url, headers=headers).text.decode("utf-8").encode("GB18030")
            tree = etree.HTML(html)
            lists = [str(plist) for plist in tree.xpath(xpath) if plist]
        except Exception as e:
            print("get xpath list error: %s" % e)
            return
        return lists
addTasktoXunlei adds a download task to Thunder (Xunlei) through its COM interface. Thunder must be installed, and its task-confirmation prompt should be disabled in the settings; otherwise you will have to click OK for every single file.
    def addTasktoXunlei(self, down_url):
        flag = False
        from win32com.client import Dispatch
        o = Dispatch("ThunderAgent.Agent.1")
        # example video URL:
        # http://cv3.jikexueyuan.com/201508011650/a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4
        if down_url:
            course_infos = str(down_url).replace(" ", "").replace("http://", "").split("/")
            course_path = self.get_file_lists(self.course_tag, self.course_id)
            try:
                # file name = last URL segment; save into the per-course folder
                o.AddTask(down_url, course_infos[len(course_infos) - 1], course_path, "",
                          "http://cv3.jikexueyuan.com", 1, 0, 5)
                o.CommitTasks()
                flag = True
            except Exception as e:
                print(e)
                print("                     AddTask failed!")
        return flag

if __name__ == "__main__":
    myjike = jike_auto_down("f:\\jike\\", "http://www.jikexueyuan.com/search/s/q_")
    myjike.run()
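If Thunder is not installed, a plain requests download could stand in for addTasktoXunlei. A minimal sketch (download_with_requests is a hypothetical helper; it streams the response in chunks so large videos are not held in memory):

import os
import requests

def download_with_requests(down_url, course_path, headers):
    # file name = last segment of the video URL, as in addTasktoXunlei
    file_name = down_url.rstrip("/").split("/")[-1]
    resp = requests.get(down_url, headers=headers, stream=True)
    with open(os.path.join(course_path, file_name), "wb") as f:
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            if chunk:
                f.write(chunk)
    return True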

 
