Python Regular Expressions: Crawling the Maoyan (Cat's Eye) Movie Top 100
This article uses regular expressions to crawl the Maoyan (Cat's Eye) movie Top 100 board. The full code, with comments, is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json  # quick import tip: put the cursor on the name to import, press Alt+Enter and pick the module
from multiprocessing.pool import Pool  # import the process pool
import requests
import re
import csv
from requests.exceptions import RequestException  # import the request exception


# Request one page and return the response content.
# To find the fields, open the Maoyan board page, right-click an entry and
# choose "Inspect element" to see the page structure.
def get_one_page(url, offset):
    try:
        response = requests.get(url=url, params={"offset": offset})
        if response.status_code == 200:  # judge by status code: 200 means success, 3xx/5xx indicate errors
            return response.text  # return the page content
        else:
            return None
    except RequestException:
        return None


# Parse one page
def parse_one_page(html):
    pattern = (r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
               + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
               + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>')
    # The pattern starts at <dd>; .*? matches any characters; board-index matches the class name;
    # (\d+) captures the rank, i.e. one or more digits, up to the closing </i>;
    # the "?" makes the match non-greedy, so it stops as soon as a match is found;
    # \( and \) match a literal "(" and ")"
    regex = re.compile(pattern, re.S)  # compile the pattern string into a regex object; re.S lets "." also match newlines
    items = regex.findall(html)  # returns all non-overlapping matches as a list: re.findall(pattern, string[, flags])
    for item in items:  # return each result as a dictionary of key-value pairs
        yield {  # yield turns this function into a generator
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # strip() removes the newlines; the [3:] slice drops the three-character '主演:' prefix
            'time': get_release_time(item[4].strip()[5:]),  # drop the five-character '上映时间:' prefix
            'region': get_release_area(item[4].strip()[5:]),
            'score': item[5] + item[6]  # join the integer and fractional parts of the score
        }


'''
# Save to a txt file. With encoding='utf-8' and ensure_ascii=False,
# Chinese characters are written out normally instead of as Unicode escapes.
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:  # mode 'a' appends directly
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # content is a dict; json.dumps turns it into a string, plus a newline

# json.dumps: dict to str
# json.loads: str to dict
'''


# Get the release time, e.g. <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
def get_release_time(data):
    pattern = r'^(.*?)(\(|$)'
    regex = re.compile(pattern)
    w = regex.search(data)
    return w.group(1)  # group(1) is the content captured by the first pair of parentheses


# Get the release area
def get_release_area(data):
    pattern = r'.*\((.*)\)'  # \( and \) match a literal '(' and ')'
    regex = re.compile(pattern)
    w = regex.search(data)
    if w is None:
        return 'unknown'
    return w.group(1)


# Get the full-size cover image (strip the @... resize suffix); not used here
# def get_large_thumb(url):
#     pattern = r'(.*?)@.*?'
#     regex = re.compile(pattern)
#     w = regex.search(url)
#     return w.group(1)


# Store the data
def store_data(item):
    with open('movie.csv', 'a', newline='', encoding='utf-8') as data_csv:
        # csv.writer's dialect parameter defaults to 'excel'; delimiter="\t" would switch the separator to tabs.
        # The newline parameter controls the line terminator in text mode: None, '', '\n', '\r', '\r\n', etc.
        csv_writer = csv.writer(data_csv)
        csv_writer.writerow([item['index'], item['image'], item['title'], item['actor'],
                             item['time'], item['region'], item['score']])
    '''
    # The write can also be wrapped in try/except to spot bad rows:
    try:
        csv_writer = csv.writer(data_csv)
        csv_writer.writerow([item['index'], item['image'], item['title'], item['actor'],
                             item['time'], item['region'], item['score']])
    except Exception as e:
        print(e)
        print(item)
    '''


# Download the cover image.
# File modes: 'r' read-only (no file is created), 'w' write (creates the file), 'a' append.
def download_thumb(title, image):
    try:
        response = requests.get(image)  # get the binary data
        # save the cover into the image folder under the current path, named <movie title>.jpg
        with open('image/' + title + '.jpg', 'wb') as f:
            f.write(response.content)
    except RequestException as e:
        print(e)


# Main scheduler
def main(offset):
    # starting URL
    start_url = 'http://maoyan.com/board/4?'
    # get the response text
    html = get_one_page(url=start_url, offset=offset)
    if html is None:
        print('link: {}offset={} exception'.format(start_url, offset))
        return
    for item in parse_one_page(html):
        # print(item)
        store_data(item)
        # download_thumb(item['title'], item['image'])


'''
# Single-process version: call main() once per page, about 10x slower
if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
'''

if __name__ == '__main__':
    # A Pool offers a fixed number of worker processes. When a new task is
    # submitted and the pool is not full, a new process is created to run it;
    # otherwise the task waits until a slot frees up.
    pool = Pool()
    # map takes each element of the list as an argument to main and runs the
    # calls in the pool; the list comprehension builds the ten page offsets.
    pool.map(main, [i * 10 for i in range(10)])
    # Much faster: about 1 s!
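Before letting the crawler loose on the real site, it helps to see what the non-greedy pattern actually captures. The sketch below feeds parse_one_page() a single hand-written <dd> fragment; run it in the same file as the functions above. The sample HTML is hypothetical and only mimics the board page's markup, so treat the expected output as illustrative:

# Hypothetical <dd> fragment mimicking the Maoyan board markup (not fetched live)
sample_html = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="http://example.com/poster.jpg@160w_220h" alt="" />
  <p class="name"><a href="/films/1203">霸王别姬</a></p>
  <p class="star">
    主演:张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>
'''

for item in parse_one_page(sample_html):
    print(item)
# Expected (illustrative) output, one dict per <dd> block:
# {'index': '1', 'image': 'http://example.com/poster.jpg@160w_220h',
#  'title': '霸王别姬', 'actor': '张国荣,张丰毅,巩俐',
#  'time': '1993-01-01', 'region': '中国香港', 'score': '9.6'}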
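The comments in the script describe how the process pool schedules tasks; a tiny self-contained sketch makes the fan-out visible. Here fetch_page() is a hypothetical stand-in for main(), and the offsets mirror the crawler's:

from multiprocessing.pool import Pool
import os

def fetch_page(offset):
    # hypothetical stand-in for main(offset); return the worker's pid to show the fan-out
    return offset, os.getpid()

if __name__ == '__main__':
    with Pool() as pool:  # defaults to one worker per CPU core
        for offset, pid in pool.map(fetch_page, [i * 10 for i in range(10)]):
            print('offset %3d handled by process %d' % (offset, pid))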
Save to the database

The variant below swaps the CSV storage for MySQL via pymysql, reusing get_one_page() and parse_one_page() from the script above:
from multiprocessing.pool import Pool
import pymysql  # the MySQL driver used for storage

# get_one_page() and parse_one_page() are reused from the script above


def main(offset):
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url, offset)
    # for item in parse_one_page(html):
    #     print(item['index'])  # prints correctly with charset="utf8"
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306,
                               db='test1', charset='utf8', use_unicode=False)
        cur = conn.cursor()  # create a cursor object
        for item in parse_one_page(html):
            # parse_one_page() yields index/image/title/actor/time/region/score
            try:
                # sql = ("INSERT INTO movies (number, picture, title, actors, time, area, score) "
                #        "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                # cur.execute(sql, (item['index'], item['image'], item['title'], item['actor'],
                #                   item['time'], item['region'], item['score']))
                sql = ("INSERT INTO test_movies (number, picture, title, actors, time, area, score) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                cur.execute(sql, (item['index'], item['image'], item['title'], item['actor'],
                                  item['time'], item['region'], item['score']))
            except pymysql.Error as e:
                print(e)
        print('--- data saved successfully ---')
        conn.commit()
        cur.close()
        conn.close()  # close the connection
    except pymysql.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))


if __name__ == '__main__':
    # connect to the database
    conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306,
                           db='test1', charset='utf8')
    cur = conn.cursor()  # create a cursor object
    cur.execute("DROP TABLE IF EXISTS test_movies")  # drop the table if it already exists
    # SQL statement that creates the table
    sqlc = ("CREATE TABLE test_movies ("
            "number INT NOT NULL PRIMARY KEY AUTO_INCREMENT, "
            "picture VARCHAR(100) NOT NULL, "
            "title VARCHAR(100) NOT NULL, "
            "actors VARCHAR(200) NOT NULL, "
            "time VARCHAR(100) NOT NULL, "
            "area VARCHAR(100), "
            "score VARCHAR(50) NOT NULL)")
    cur.execute(sqlc)  # execute the create-table statement
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
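After the pool finishes, a quick read-back confirms that the rows landed. This is a minimal sketch assuming the same local test1 database, root user with an empty password, and the test_movies table created above:

import pymysql

# Read back a few rows from the table created by the script above
conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306,
                       db='test1', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT number, title, score FROM test_movies ORDER BY number LIMIT 5")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()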
That is the full content of this article. I hope it is helpful for your learning, and thank you for your support of this site.