Python crawler: "catch up with the new fan" Website Resource Link crawling, python Crawler
"Catch up with new fan" website
The new website provides the latest Japanese TV series and movies, which are updated quickly.
I personally prefer watching Japanese dramas, so I want to create a resource map by crawling the website.
You can view which Japanese dramas are available on the website and download them at any time.
Resource map
The crawled resource map is as follows:
On Linux, ls | grep <keyword> makes it easy to find the resource you want (on Windows you can simply search for it).
Crawler Script Development
1. Determine the crawling strategy
After opening the pages of several Japanese dramas, you can see that each drama's URL takes the following form:
Each drama's page corresponds to a serial number.
Therefore, we can crawl the site simply by iterating over these numbers.
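This numbering is exactly what the full script in the appendix relies on: every drama page sits at viewtvplay-<number>.html under the site's domain. A minimal sketch of the traversal idea (the helper name tv_page_url is made up for illustration):

# sketch: build each drama's page URL from its serial number
domain = 'http://www.zhuixinfan.com/'

def tv_page_url(num):
    ''' return the page URL for drama number num '''
    return domain + 'viewtvplay-{}.html'.format(num)

# iterate over the numbers and fetch each page in turn, e.g. 1..999
for num in range(1, 1000):
    url = tv_page_url(num)
    # ... fetch and parse url here ...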
2. Get the name of the Japanese drama
Open a drama's page and look at the source code around the title:
The title tag's id is "pdtname", so we only need the text of that tag to get the drama's name.
We fetch the tag's content through the BeautifulSoup interface (and strip the unwanted parts of the name):
# try get tv name
tag_name = soup.find(id='pdtname')
if tag_name is None:
    print('tv_{:0>4d}: not exist.'.format(num))
    return None

# remove unneeded characters from the name
name = tag_name.get_text().replace(' ', '')
try:
    name = name.replace(re.search(r'【.*】', name).group(0), '')
    name = name.replace(re.search(r'\(.*\)', name).group(0), '')
    name = name.replace('《', '')
    name = name.replace('》', '')
    name = name.replace('/', '')
except:
    pass
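For instance (a made-up title, only to show what the cleanup removes), the same logic wrapped into a small helper:

import re

def clean_name(raw):
    ''' same cleanup as above, wrapped into a helper for illustration '''
    name = raw.replace(' ', '')
    try:
        name = name.replace(re.search(r'【.*】', name).group(0), '')
        name = name.replace(re.search(r'\(.*\)', name).group(0), '')
        name = name.replace('《', '').replace('》', '').replace('/', '')
    except AttributeError:
        # one of the patterns did not match; keep what we have so far
        pass
    return name

# hypothetical raw title -> 某某剧第2季
print(clean_name('《某某剧》 第2季 【日语中字】(2018)'))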
3. Obtain the resource links
Each drama's page also contains the resource link addresses. The source code is as follows:
The resource links are laid out in a table whose id is "ajax_tbody".
Each episode is a row of that table, and each row has several columns with information about the resource.
We can therefore collect every episode's resource link by walking the rows of the table:
# try get tv resources list
tag_resources = soup.find(id='ajax_tbody')
if tag_resources is None:
    print('tv_{:0>4d}: has no resources.'.format(num))
    return None

# walk resources
for res in tag_resources.find_all('tr'):

    # get link tag
    tag_a = res.find('a')
    info = res.find_all('td')
    print('resource: ', tag_a.get_text())

    # get download link
    downlink = get_resources_link(session, tag_a.get('href'))

    # record resources
    tv.resources.append([tag_a.get_text(), info[2].get_text(), downlink, ''])
    delay(1)
4. Get the download link
Click a resource to go to its download page. The source code is as follows:
You can see that the id of the eMule (ed2k) download link tag is "emule_url", so we only need to get that tag's text (the magnet link works the same way).
First, we need to fetch the download page. The complete code for this step is as follows:
def get_resources_link(session, url):
    ''' get tv resources download link '''

    global domain
    res_url = domain + url

    # open resources page
    resp = session.get(res_url, timeout=10)
    resp.raise_for_status()

    soup = page_decode(resp.content, resp.encoding)

    tag_emule = soup.find(id='emule_url')
    return tag_emule.get_text() if tag_emule is not None else ''
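The magnet link can presumably be grabbed the same way. The sketch below reuses domain and page_decode from the script; note that the element id 'magnet_url' is only an assumption for illustration, so check the actual page source for the real id:

def get_magnet_link(session, url):
    ''' sketch: fetch the magnet link from the same download page '''
    res_url = domain + url
    resp = session.get(res_url, timeout=10)
    resp.raise_for_status()
    soup = page_decode(resp.content, resp.encoding)
    # 'magnet_url' is an assumed id, not confirmed on the site
    tag_magnet = soup.find(id='magnet_url')
    return tag_magnet.get_text() if tag_magnet is not None else ''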
5. Save the resource download links locally
Since crawling the download links of every drama is time-consuming, the script supports a catalog-only mode that records just the titles first; the download links of a particular drama can then be crawled later by its serial number (a sketch of that follows the code below).
def save_tv(tv):
    ''' save tv information on disk '''

    # make sure the save directory exists
    os.makedirs(os.path.abspath(save_dir), exist_ok=True)

    filename = os.path.join(os.path.abspath(save_dir), '{:0>4d}_{}.txt'.format(tv.num, tv.name))

    global only_catalog
    if only_catalog:
        with open(filename, 'a+') as f:
            pass
    else:
        with open(filename, 'w') as f:
            for info in tv.resources:
                f.write(os.linesep.join(info))
                f.write('========' + os.linesep)
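As a rough sketch of that two-phase workflow (it reuses the functions from the appendix script, and the serial numbers are made up), a later run could fill in the download links for just a few catalogued dramas:

import requests

# hypothetical serial numbers taken from the catalog files saved earlier
wanted = [12, 345, 678]

with requests.Session() as s:
    s.headers.update({'User-Agent': 'Mozilla/5.0'})  # minimal header, see main() for the full set
    open_home_page(s)            # visit the home page first, like a normal visitor
    for num in wanted:
        delay(3)                 # be polite between requests
        tv = spider_tv(s, num)   # only_catalog must be False so the links are fetched
        if tv is not None:
            save_tv(tv)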
That is the whole development process of the crawler script.
Welcome to my code repository: https://gitee.com/github-18274965/Python-Spider
The crawling scripts for other websites will be developed in the future.
Appendix
Overall code:
#!/usr/bin/python3
# -*- coding:utf-8 -*-

import os
import sys
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

# website domain
domain = 'http://www.zhuixinfan.com/'

# spider information save directory
save_dir = './tvinfo/'

# only tv catalog
only_catalog = False

class TVInfo:
    ''' TV information class '''

    def __init__(self, num, name):
        self.num = num
        self.name = name
        self.resources = []


def delay(seconds):
    ''' sleep for the given number of seconds '''

    while seconds > 0:
        sleep(1)
        seconds = seconds - 1

def page_decode(content, encoding):
    ''' decode page '''

    # lxml may fail, then try html.parser
    try:
        soup = BeautifulSoup(content, 'lxml', from_encoding=encoding)
    except:
        soup = BeautifulSoup(content, 'html.parser', from_encoding=encoding)

    return soup

def open_home_page(session):
    ''' open home page first, like a human being would '''

    global domain
    home_url = domain + 'main.php'

    # open home page
    resp = session.get(home_url, timeout=10)
    resp.raise_for_status()

    # do nothing else

def get_resources_link(session, url):
    ''' get tv resources download link '''

    global domain
    res_url = domain + url

    # open resources page
    resp = session.get(res_url, timeout=10)
    resp.raise_for_status()

    soup = page_decode(resp.content, resp.encoding)

    tag_emule = soup.find(id='emule_url')
    return tag_emule.get_text() if tag_emule is not None else ''


def spider_tv(session, num):
    ''' fetch tv information '''

    global domain
    tv_url = domain + 'viewtvplay-{}.html'.format(num)

    # open tv information page
    resp = session.get(tv_url, timeout=10)
    resp.raise_for_status()

    soup = page_decode(resp.content, resp.encoding)

    # try get tv name
    tag_name = soup.find(id='pdtname')
    if tag_name is None:
        print('tv_{:0>4d}: not exist.'.format(num))
        return None

    # try get tv resources list
    tag_resources = soup.find(id='ajax_tbody')
    if tag_resources is None:
        print('tv_{:0>4d}: has no resources.'.format(num))
        return None

    # remove unneeded characters from the name
    name = tag_name.get_text().replace(' ', '')
    try:
        name = name.replace(re.search(r'【.*】', name).group(0), '')
        name = name.replace(re.search(r'\(.*\)', name).group(0), '')
        name = name.replace('《', '')
        name = name.replace('》', '')
        name = name.replace('/', '')
    except:
        pass

    print('tv_{:0>4d}: {}'.format(num, name))

    tv = TVInfo(num, name)

    global only_catalog
    if only_catalog:
        return tv

    # walk resources
    for res in tag_resources.find_all('tr'):

        # get link tag
        tag_a = res.find('a')
        info = res.find_all('td')
        print('resource: ', tag_a.get_text())

        # get download link
        downlink = get_resources_link(session, tag_a.get('href'))

        # record resources
        tv.resources.append([tag_a.get_text(), info[2].get_text(), downlink, ''])
        delay(1)

    return tv


def save_tv(tv):
    ''' save tv information on disk '''

    # make sure the save directory exists
    os.makedirs(os.path.abspath(save_dir), exist_ok=True)

    filename = os.path.join(os.path.abspath(save_dir), '{:0>4d}_{}.txt'.format(tv.num, tv.name))

    global only_catalog
    if only_catalog:
        with open(filename, 'a+') as f:
            pass
    else:
        with open(filename, 'w') as f:
            for info in tv.resources:
                f.write(os.linesep.join(info))
                f.write('========' + os.linesep)

def main():

    start = 1
    end = 999

    if len(sys.argv) > 1:
        start = int(sys.argv[1])

    if len(sys.argv) > 2:
        end = int(sys.argv[2])

    global only_catalog
    ans = input("Only catalog ?[y/N] ")
    if ans == 'y' or ans == 'Y':
        only_catalog = True

    # headers: firefox_58 on ubuntu
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0)'
                      + ' Gecko/20100101 Firefox/58.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
    }

    # create spider session
    with requests.Session() as s:

        try:
            s.headers.update(headers)
            open_home_page(s)
            for num in range(start, end + 1):
                delay(3)
                tv = spider_tv(s, num)
                if tv is not None:
                    save_tv(tv)

        except Exception as err:
            print(err)
            sys.exit(-1)

if __name__ == '__main__':
    main()
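To run the script (as main() shows), pass optional start and end serial numbers on the command line, for example python3 spider.py 1 999 (the file name spider.py is just an assumption here), and answer the "Only catalog ?[y/N]" prompt with y to record only the titles.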