Parse HTML with Python: beautifulsoup

Last Update:2018-12-05 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

I needed to parse HTML in a project, implemented it in Python, and found BeautifulSoup very useful for the job. Read on its own, the program below may not make it obvious what it is for; its only purpose is to show everyone how BeautifulSoup can be used. Of course, I only use BeautifulSoup's basic features here. I have never learned its more advanced usage; that is beyond me.

# coding=utf-8
"""Build a template file from an HTML page with BeautifulSoup 3.

The script parses an HTML file, collects its non-blank text nodes, splits
them into sentences, replaces each text node with replacement text, copies
the referenced images into a dated directory, injects two <script> tags into
<head>, and writes both the resulting template and a ".data" file holding
the sentences.

NOTE(review): this listing was reconstructed from a web page whose code had
been collapsed onto a few lines and case-mangled by machine translation.
The BeautifulSoup 3 names (findAll, replaceWith, Tag) were restored from
that library's public API.  The casing of the project-local
``sentencespliter`` module, its class and its ``setlang``/``split`` methods
could not be recovered and is kept exactly as it appeared -- confirm
against the real module.
"""
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from sentencespliter import sentencespliter
from os.path import basename, dirname, isdir, isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re


class build_tpl:
    """Convert one HTML file into a template plus a sentence data file."""

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and precompute everything ``builder`` needs.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: path of the template file to create.
            cp_pic_dir: directory prefix images are copied into.
            show_pic_dir: prefix prepended to every image ``src``.
            js_path: prefix prepended to the injected script file names.
            set_lang: language code handed to the sentence splitter.

        The three prefixes are concatenated as-is with what follows them,
        so each should end with a path separator.
        """
        # Directory of the parsed file, always ending in "/".  The original
        # test ``len(dirname(...)) > 1`` wrongly treated a one-character
        # parent directory (e.g. "a/page.html") as "no directory".
        parent = dirname(parse_file)
        if parent:
            self.cur_dir = parent if parent.endswith("/") else parent + "/"
        else:
            self.cur_dir = "./"

        self.build_tpl_name = build_tpl_name
        self.cp_pic_dir = cp_pic_dir
        self.show_pic_dir = show_pic_dir
        self.js_path = js_path

        self.soup = self.get_soup(parse_file)
        # Every text node inside <body> that is not pure whitespace.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences, saved later by save_data_file().
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per entry of get_text_arr.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # Every <img> tag in the document.
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentence list to ``<build_tpl_name>.data``."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the list of sentences extracted from the document."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write ``write_arr`` to ``file_name``, one item per line, as UTF-8."""
        # The original encoded the return value of write() instead of the
        # joined text; encode first, then write.
        with io.open(file_name, "wb") as out:
            out.write('\n'.join(write_arr).encode('utf-8'))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` as UTF-8.

        ``file_contents`` may be a string or any object convertible to text;
        ``builder`` passes the soup object itself.
        """
        text_type = type(u"")  # unicode on Python 2, str on Python 3
        if not isinstance(file_contents, text_type):
            file_contents = text_type(file_contents)
        with io.open(file_name, "wb") as out:
            out.write(file_contents.encode('utf-8'))

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` relative directory."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Create the template file.

        Replaces the text nodes, rewrites and copies the images, adds the
        jquery.js / init.js script tags to <head> and writes the template to
        ``build_tpl_name``.

        Returns:
            The list of text nodes that could not be replaced.  The original
            collected this list but discarded it.
        """
        bug_msg = []
        for text, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                self.soup.body.find(text=text).replaceWith(replacement)
            except AttributeError:
                # find() returned None: the node is no longer in the tree.
                bug_msg.append(text)

        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic in self.cur_pic_arr:
            old_pic_src = pic['src']
            pic['src'] = show_pic_dir + old_pic_src

            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            # A src such as "img/a.png" needs its sub-directory to exist
            # under the destination before copyfile can succeed.
            dest_parent = dirname(cp_dis_file)
            if dest_parent and not isdir(dest_parent):
                makedirs(dest_parent)
            copyfile(cp_src_file, cp_dis_file)

        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        # Both go to index 2; inserting init.js first leaves jquery.js
        # ahead of it in the final document.
        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        self.write_file(self.build_tpl_name, self.soup)
        return bug_msg

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the replacement text for one sentence.

        Parameters:
            rep_id: index of the sentence, used in the placeholder.
            rep_data: literal content; when empty, the template placeholder
                ``$rep_arr[<rep_id>]`` is used instead.
        """
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        # NOTE(review): the published listing shows only spaces around
        # rep_str; markup that wrapped it was probably stripped when the
        # page was scraped -- confirm against the original script.
        return " " + rep_str + " "

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each text is split into sentences; the replacement is the
        concatenation of ``get_replace_html`` for every sentence, numbered
        consecutively across the whole document.
        """
        sp = sentencespliter()
        sp.setlang(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            slist = sp.split(text)
            if slist is None:
                # No split result: treat the whole text as one sentence.
                slist = [text]
            pieces = []
            for item in slist:
                pieces.append(self.get_replace_html(jump_i, item))
                jump_i += 1
            temp_sentence.append("".join(pieces))
        return temp_sentence

    def parse_text(self, text_arr, set_lang):
        """Split every text in ``text_arr`` and return the flat sentence list."""
        sp = sentencespliter()
        sp.setlang(set_lang)
        temp_sentence = []
        for text in text_arr:
            slist = sp.split(text)
            if slist is not None:
                temp_sentence.extend(slist)
            else:
                temp_sentence.append(text)
        return temp_sentence

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed as a BeautifulSoup tree.

        Raises:
            IOError: if the file cannot be read.  The original returned
                False here, which only postponed the failure to the first
                use of the soup in ``__init__``.
        """
        try:
            with io.open(parse_file, "rb") as src:
                doc = src.read()
        except IOError:
            print('error: %s file not found!' % parse_file)
            raise
        return BeautifulSoup(doc)


if __name__ == "__main__":
    from sys import argv, exit

    # Five arguments are read below; the original only checked for two and
    # then failed with IndexError on argv[3].
    if len(argv) < 6:
        print("Usage: python %s <input-file> <output-file> "
              "<save-pic-dir> <show-pic-dir> <js-path>" % argv[0])
        exit(255)

    if not isfile(argv[1]):
        print("No such input file: %s" % argv[1])
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]

    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Create the template, then save the sentence list beside it.
    so.builder()
    so.save_data_file()

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Parse HTML with Python: beautifulsoup

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Parse HTML with Python: beautifulsoup

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support