Python crawlers: a Jikexueyuan (Geek Academy) course crawler, V1
import requests
from lxml import etree
import re
import sys
import os
import time

# Python 2 only: force the default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")
# base_url = "http://www.jikexueyuan.com/search/s/q_"
# base_path = "f:/jike/"
# The cookie in headers must be captured from your own logged-in browser session;
# otherwise only free courses can be crawled.
headers = {"Host": "www.jikexueyuan.com",
           "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
           "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
           "Accept-Encoding": "gzip, deflate",
           # Replace with the cookie captured from your own logged-in session;
           # the author's original session cookie is expired and omitted here.
           "Cookie": "<your captured cookie string>",
           "Connection": "keep-alive"}
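Before launching a full crawl, it is worth verifying that the cookie actually authenticates. A minimal sketch, reusing the "userMenu" marker that get_total_page checks for below (the test URL is just an assumed course-search page following the base_url pattern):

# Quick cookie sanity check (sketch): a logged-in page should contain the
# "userMenu" marker that the crawler itself looks for in get_total_page.
test_html = requests.get("http://www.jikexueyuan.com/search/s/q_python?pageNum=1",
                         headers=headers).text
if re.search(r"userMenu", test_html):
    print("cookie looks valid")
else:
    print("please re-capture the cookie from a logged-in browser session")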
class jike_auto_down:
    base_path = ""
    base_url = ""
    course_tag = ""
    course_id = ""

    def __init__(self, base_path, base_url):
        if base_path and base_url:
            self.base_path = base_path
            self.base_url = base_url
        else:
            print("base_path and base_url are both required!")

    def run(self):
        # the crawl is kicked off here rather than in the constructor
        self.get_tags()
get_tags fetches all the course category tags.
    def get_tags(self):
        url = "http://www.jikexueyuan.com/path/"
        tag_html = requests.get(url).text.decode("utf-8").encode("GB18030")
        tag_etree = etree.HTML(tag_html)
        # keep the last path segment of each category link as the tag slug
        tag_lists = [str(tag).rstrip("/")[str(tag).rstrip("/").rindex("/") + 1:]
                     for tag in tag_etree.xpath('/html/body/div[1]/div[4]/div/div[3]/div/a/@href')
                     if tag]
        if tag_lists:
            for tag in tag_lists:
                print(tag)
                self.course_tag = tag
                self.get_total_page(tag)
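The slicing in the list comprehension above just strips the trailing slash and keeps the last path segment of each category link. A minimal illustration (the href value is an assumed example of what the /path/ page links look like):

# Illustration of the slug extraction; the href value is a hypothetical example.
tag = "http://www.jikexueyuan.com/path/android/"
print(tag.rstrip("/")[tag.rstrip("/").rindex("/") + 1:])  # prints: android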
get_total_page fetches the course list pages. The total page count is generated by JavaScript and cannot be scraped directly, so we brute-force page numbers until the site reports no results.
    def get_total_page(self, tag):
        if tag:
            for page in range(1, 50):
                page_url = self.base_url + tag + "?pageNum=%d" % page
                page_html = requests.get(page_url, headers=headers).text.decode("utf-8").encode("GB18030")
                # a logged-in page contains "userMenu"; if it is missing, the cookie is bad
                no_userMenu = re.search(r"userMenu", page_html, re.S)
                if no_userMenu is None:
                    print("please check the cookies")
                    return
                # "no-search" marks the first empty page, i.e. we passed the last one
                no_search = re.search(r"no-search", page_html, re.S)
                if no_search:
                    print("the tag %s: %d is the last page" % (tag, page - 1))
                    break
                else:
                    self.get_course_pages(page_url)
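The loop therefore probes URLs of the shape http://www.jikexueyuan.com/search/s/q_android?pageNum=1, ?pageNum=2, and so on (the "android" slug is assumed here for illustration) until a page contains the "no-search" marker.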
get_course_pages collects the course detail-page links from a list page.
    def get_course_pages(self, tag_url):
        if tag_url:
            print("the tag_url: %s" % tag_url)
            course_page_lists = self.get_xpath_lists(tag_url, headers,
                                                     '//*[@id="changeid"]/ul/li/div/div[2]/h5/a/@href')
            if course_page_lists:
                for course_page_url in course_page_lists:
                    self.get_down_urls(course_page_url)
get_down_urls extracts the video URLs from each lesson page with a regular expression.
    def get_down_urls(self, course_page_url):
        if course_page_url:
            # the course id is the file-name part of the detail-page URL
            self.course_id = course_page_url[course_page_url.rindex("/") + 1:course_page_url.rindex(".")]
            print(" course_id:%s %s" % (self.course_id, course_page_url))
            course_down_lists = self.get_xpath_lists(course_page_url, headers,
                                                     '//*[@class="video-list"]/div[2]/ul/li/div/h2/a/@href')
            if course_down_lists:
                for course_down_url in course_down_lists:
                    course_down_html = requests.get(course_down_url, headers=headers).text.decode("utf-8").encode("GB18030")
                    course_down = re.findall(r'source src="(.*?)"', course_down_html, re.S)
                    if course_down:
                        print(" %s" % course_down[0])
                        if self.addTasktoXunlei(course_down[0]):
                            print(" task added successfully!")
                            time.sleep(5)
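The regular expression simply captures the src attribute of the HTML5 source tag embedded in the lesson page. A minimal sketch (the sample markup is assumed; the mp4 URL is the example from the comment in addTasktoXunlei below):

# Sketch of the regex extraction; sample_html imitates a lesson page's <source> tag.
sample_html = '<source src="http://cv3.jikexueyuan.com/201508011650/a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4" type="video/mp4">'
print(re.findall(r'source src="(.*?)"', sample_html, re.S)[0])  # prints the mp4 URL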
get_file_lists creates the folder hierarchy for a course (base path / tag / course id) and returns the course folder.
    def get_file_lists(self, course_tag, course_id):
        course_path = ""
        if self.base_path and not os.path.exists(self.base_path):
            try:
                os.mkdir(self.base_path)
            except Exception as e:
                print("error: %s" % e)
                return
        if course_tag and not os.path.exists(self.base_path + course_tag):
            try:
                os.mkdir(self.base_path + course_tag)
            except Exception as e:
                print("dir create error: %s" % e)
        tmp = self.base_path + course_tag + "\\" + str(course_id)
        if course_id and not os.path.exists(tmp):
            try:
                os.mkdir(tmp)
                course_path = tmp
            except Exception as e:
                print("dir create error: %s" % e)
                return
        else:
            course_path = tmp
        return course_path
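The three nested mkdir steps can be collapsed with os.makedirs, which creates every missing directory level in one call. A shorter equivalent, as a sketch (not the author's original code):

    # Alternative sketch: os.makedirs builds base_path/course_tag/course_id in one call.
    def get_file_lists(self, course_tag, course_id):
        course_path = os.path.join(self.base_path, course_tag, str(course_id))
        try:
            if not os.path.exists(course_path):
                os.makedirs(course_path)
            return course_path
        except OSError as e:
            print("dir create error: %s" % e)
            return ""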
get_xpath_lists is a small helper that fetches a URL and applies an XPath expression, so the parsing boilerplate is not repeated in every method.
    def get_xpath_lists(self, url, headers, xpath):
        try:
            html = requests.get(url, headers=headers).text.decode("utf-8").encode("GB18030")
            tree = etree.HTML(html)
            lists = [str(plist) for plist in tree.xpath(xpath) if plist]
        except Exception as e:
            print("get xpath list error: %s" % e)
            return
        return lists
addTasktoXunlei adds a download task to Thunder (Xunlei). Thunder must be installed, and you should disable its "prompt before creating a new task" option in its settings; otherwise you will have to click OK for every single task.
    def addTasktoXunlei(self, down_url):
        flag = False
        from win32com.client import Dispatch
        o = Dispatch("ThunderAgent.Agent.1")
        # example video URL:
        # http://cv3.jikexueyuan.com/201508011650/a396d5f2b9a19e8438da3ea888e4cc73/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4
        if down_url:
            course_infos = str(down_url).replace(" ", "").replace("http://", "").split("/")
            course_path = self.get_file_lists(self.course_tag, self.course_id)
            try:
                # AddTask(url, file name, save path, comment, referer, start mode,
                #         only-from-origin, origin thread count)
                o.AddTask(down_url, course_infos[len(course_infos) - 1], course_path, "",
                          "http://cv3.jikexueyuan.com", 1, 0, 5)
                o.CommitTasks()
                flag = True
            except Exception as e:
                print(e)
                print(" AddTask failed!")
        return flag


if __name__ == "__main__":
    myjike = jike_auto_down("f:\\jike\\", "http://www.jikexueyuan.com/search/s/q_")
    myjike.run()
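Thunder is driven through its Windows COM interface, so the script as written requires Thunder on Windows. If that is not available, a possible fallback (a sketch, not part of the original script) is to stream the video directly with requests into the same folder layout:

# Fallback sketch without Thunder: stream the video with requests.
# Assumes course_path comes from get_file_lists, as in addTasktoXunlei.
def download_with_requests(down_url, course_path):
    filename = down_url.rstrip("/").split("/")[-1]
    resp = requests.get(down_url, headers=headers, stream=True)
    with open(os.path.join(course_path, filename), "wb") as f:
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            if chunk:
                f.write(chunk)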