After seeing the script that a student from this term of the Oldboy Linux training course had written, I became very interested as well, and since I have just started learning Python, I reimplemented it in Python below. Because I am still new to Python, there are certainly plenty of places that could be optimized.
#!/bin/bash# oldboy linux training# 2015-06-01# happy children ' s Day# Description: This script is from the old boy linux21 period student Zhang Yao development! Edufile=/tmp/edu.htmledufile2=/tmp/edu2.htmlurl= "$*" # check for given parameters [ $# -eq 0 ] && { echo "usage: /bin/sh $0 http://. " exit 1} # Judge url is ok?curl -i $URL &>/dev/null [ $? -ne 0 ] &&{ echo "Bad url,please check it" exit 1} # defined get pagenum and courseid functionsfunction getnum () { curl -s $Url > $EduFile grep ' " Pagesgoend "' $EduFile &>/dev/null if [ $? -eq 0 ] then num= ' sed -rn ' s#.*page= ([0-9].*) " Class= "Pagesgoend". *$#\1#gp ' $EduFile ' else num= ' sed -rn ' s|. *page= ([0-9].*) # " class=" Pagesnum ". *$|\1|gp ' $EduFile ' fi pagenum=${num:-1} courseid= ' echo $Url |awk -f "[-.]" ' {print $4} '} # defined curl html functionsfunction curl () { getnum for i in ' seq $pagenum ' do curl "http://edu.51cto.com /index.php?do=course&m=lessions&course_id= $CourseId &page= $i " 1>> $EduFile 2>/dev /null done} # defined create table Functionsfunction table () { sum= "" index=1 sed -rn '/do=lesson/ s#<.* (<a href= ") (. *) </H4>$#\1HTTP://EDU.51CTO.COM\2#GP ' $EduFile > $ edufile2 while read line do sum= $sum "<tr><th width=" scope= "Row" > $index </th><td width= "520" >$ Line</td> "&NBSP;&Nbsp; ((index++)) done < $EduFile 2} # defined create html functionsfunction html () { cat >/tmp/oldboy.html<<-end #!/usr/bin/env python#coding:utf-8import urllib,urllib2,sys,os,rereload (SYS) sys.setdefaultencoding (' Utf-8 ') def get_course_id_page_num (URI): orgin = urllib2.urlopen ( URI). Read (). Decode (' Utf-8 '). Encode (' Utf-8 ') try: page_pattern = re.compile (R ' <a href= "/index.php?. *course_id= (. *) &page= (. *) " class=" Pagesgoend "') course _id = page_pattern.search (orgin). Group (1) pagenum = page_pattern.search (orgin). Group (2) except AttributeError: page_pattern = re.compile (R ' <a href= "/index.php?. *course_id= (. *) &page= (. 
*) # " class=" Pagesnum "') course_ Id = page_pattern.search (orgin). Group (1) pagenum = page_pattern.search (orgin). Group (2) return (Course_id,pagenum) def get_url_title (course,page): #cto = file ('/tmp/edu.51cto.html ', ' A + ') url= "http://edu.51cto.com/index.php ? do=course&m=lessions&course_id= " + str (course) + " &page= " + str ( page) request = urllib2.urlopen (URL). read (). Decode (' Utf-8 '). Encode (' Utf-8 ') url_title_pattern = re.compile (R ' <a href= "(. *)" target= "_blank" > (. *) </a>The following is the program run after the 650) this.width=650; "Src=" http://s3.51cto.com/wyfs02/M02/6E/88/wKioL1V_kIewN-AwAAUN6eNRuVc726.jpg " title= "QQ picture 20150616104922.jpg" alt= "Wkiol1v_kiewn-awaaun6enruvc726.jpg"/>
This article is from the "tireless learning ..." blog; when reposting, please be sure to keep this source link: http://jonyisme.blog.51cto.com/3690784/1662243
Using Python to crawl a 51CTO course