# Exercise 1: fetch every URL on the Sohu front page and keep only the pages
# whose content mentions "basketball".
# Algorithm:
#   1. download the Sohu front page
#   2. extract every href and keep only links with a valid URL format
#   3. download the content of each valid URL
#   4. check whether that content contains "basketball"
#   5. if it does, save the page to disk
import re

# Pre-compiled patterns, hoisted out of the loops.
# Static-resource suffixes we never want to crawl.
_RESOURCE_RE = re.compile(r"\.(?:jpg|jpeg|gif|ico|png|js|css)$", re.IGNORECASE)
# href="..." values on the page.
_HREF_RE = re.compile(r'href="(.*?)"')


def classify_links(links, base_url):
    """Split raw href values into (valid, invalid) lists of URLs.

    Resource files (images/js/css), empty or self anchors ("", "#", "/"),
    and javascript:/mailto: pseudo-links are invalid.  Protocol-relative
    ("//host/...") and site-relative ("/path") links are rewritten into
    absolute URLs against *base_url*; everything else is kept as-is.

    :param links: iterable of raw href strings (may carry whitespace)
    :param base_url: the URL the page was fetched from, used to
        absolutize site-relative paths
    :return: (valid_links, invalid_links) tuple of lists
    """
    valid, invalid = [], []
    for raw in links:
        link = raw.strip()
        if _RESOURCE_RE.search(link):                  # static resource file
            invalid.append(link)
        elif link in ("", "#", "/"):                   # empty / self anchor
            invalid.append(link)
        elif link.startswith("//"):                    # protocol-relative
            valid.append("http:" + link)
        elif "javascript" in link.lower() or "mailto:" in link.lower():
            invalid.append(link)                       # pseudo-link, skip
        elif re.match(r"/\w+", link):                  # site-relative path
            # Join scheme+host (no trailing slash) with the absolute path;
            # this fixes the original's "http://host//path" double slash.
            root = re.match(r"https?://[^/]*", base_url.strip())
            valid.append(root.group() + link if root else link)
        else:                                          # already absolute
            valid.append(link)
    return valid, invalid


def save_page(text, path):
    """Write page text to *path* as UTF-8.

    requests has already decoded the body into str, so we only need to
    pick the on-disk encoding here — no charset sniffing required.
    """
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(text.strip())


def main():
    # Imported lazily so the pure helpers above stay importable
    # in environments without the third-party requests package.
    import requests

    resp = requests.get("http://www.sohu.com")
    print(resp.text[:100])                 # first 100 chars of the response
    print("*" * 85)
    links = _HREF_RE.findall(resp.text)    # every href value on the page
    print(len(links))
    print("*" * 85)

    valid_links, _invalid_links = classify_links(links, resp.url)
    print("*" * 85)

    file_num = 1                           # sequential output file names
    save_dir = r"e:\test4\crawler"
    # sorted(set(...)): dedupe, and make the crawl order deterministic.
    for link in sorted(set(valid_links)):
        try:
            # verify=True keeps TLS certificate checking on; the timeout
            # stops one dead host from hanging the whole crawl.
            page = requests.get(link, verify=True, timeout=10)
        except requests.RequestException:
            continue                       # best effort: skip bad links
        if "basketball" in page.text:
            save_page(page.text, save_dir + "\\" + str(file_num) + ".html")
            file_num += 1
    print("done!")


if __name__ == "__main__":
    main()
Python crawler — part 1