The project needs to parse HTML, and because it is implemented in Python, BeautifulSoup is a useful tool for the job. When you first write this kind of program you may not know where to start, so the purpose of this post is to show how BeautifulSoup can be used. Of course, I only use the basics of BeautifulSoup here; I have never learned its more advanced features.
# coding=utf-8
"""Turn an HTML page into a sentence-level template using BeautifulSoup 3.

Every non-blank text node in the page body is split into sentences, each
sentence is wrapped in a ``<span>`` carrying a running id, referenced images
are copied into a dated directory, and two script tags are added to the head.

NOTE(review): this listing was recovered from a web page in which identifier
casing and spacing had been mangled.  Standard-library and BeautifulSoup names
were restored to their documented spelling; the project-local
``sentencespliter`` module, its class and its ``setlang``/``split`` methods
are kept exactly as they appeared -- confirm their real casing.
"""
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from sentencespliter import sentencespliter
from os.path import basename, dirname, isdir, isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re


class build_tpl:
    """Parse one HTML file and build a template plus a sentence data file."""

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and precompute sentences and replacements.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: path of the template file to create.
            cp_pic_dir: directory prefix the images are copied into.
            show_pic_dir: path prefix written into the ``src`` of each image.
            js_path: path prefix for the injected script files.
            set_lang: language code handed to the sentence splitter.

        Raises:
            IOError: if ``parse_file`` could not be read.
        """
        # Directory of the parsed file; image paths are resolved against it.
        # Any non-empty dirname counts (the old "len > 1" test dropped
        # one-character directory names).
        parent = dirname(parse_file)
        if parent:
            self.cur_dir = parent + "/"
        else:
            self.cur_dir = "./"

        # Name of the template file to create.
        self.build_tpl_name = build_tpl_name
        # Directory prefix the images are copied to.
        self.cp_pic_dir = cp_pic_dir
        # Path prefix under which the images are displayed.
        self.show_pic_dir = show_pic_dir
        # Path prefix the script files are loaded from.
        self.js_path = js_path

        # Text nodes of the document body.
        self.get_text_arr = []
        # <img> tags of the document.
        self.cur_pic_arr = []

        # Parse the file; get_soup reports a read failure by returning False.
        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            # Fail here with a meaningful error instead of an AttributeError
            # on ``False.body`` a few lines further down.
            raise IOError("cannot read input file: %s" % parse_file)

        # Every text node of the body that is not pure whitespace.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences.
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # All image tags.
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentences to ``<build_tpl_name>.data``, one per line."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the list of sentences extracted from the document."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the items of ``write_arr`` to ``file_name``, newline-joined.

        The joined text is encoded as UTF-8 before writing; the mangled
        original applied ``encode`` to the result of ``write`` instead.
        """
        with io.FileIO(file_name, "w") as out:
            out.write('\n'.join(write_arr).encode('utf-8'))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents.encode('utf-8')`` to ``file_name``."""
        with io.FileIO(file_name, "w") as out:
            out.write(file_contents.encode('utf-8'))

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` directory fragment."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Create the template file and copy the referenced images.

        Returns:
            The list of text nodes whose replacement failed (empty when every
            node was replaced).
        """
        # Text nodes that could not be replaced.
        bug_msg = []
        # Swap each text node for its span-wrapped replacement.
        for text, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                self.soup.body.find(text=text).replaceWith(replacement)
            except AttributeError:
                # find() returned nothing usable for this text.
                bug_msg.append(text)

        # Dated sub-directory shared by the display and the storage path.
        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the image storage directory when it does not exist yet.
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic in self.cur_pic_arr:
            # Point the tag at the display path ...
            old_pic_src = pic['src']
            pic['src'] = show_pic_dir + old_pic_src
            # ... and copy the image file next to it.
            copyfile(self.cur_dir + old_pic_src, cp_pic_dir + old_pic_src)

        # Script tags to add to the head.
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        # Both go in at index 2; inserting tag2 first leaves tag ahead of it.
        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        # Write the template.
        self.write_file(self.build_tpl_name, self.soup)
        return bug_msg

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for one sentence.

        Parameters:
            rep_id: running sentence id, used in the span's ``id``.
            rep_data: span content; when empty the template placeholder
                ``$rep_arr[<rep_id>]`` is used instead.
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        # NOTE(review): attribute name "sty" and value "data" are kept as
        # recovered from the listing -- confirm against the consuming script.
        return ("<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">"
                + rep_str + "</span>")

    def _new_splitter(self, set_lang):
        """Return a sentence splitter configured for ``set_lang``."""
        sp = sentencespliter()
        sp.setlang(set_lang)
        return sp

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each text is split into sentences and every sentence is wrapped by
        ``get_replace_html`` with a running id; a text for which the splitter
        returns ``None`` is wrapped as a whole.
        """
        sp = self._new_splitter(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            slist = sp.split(text)
            if slist is None:
                slist = [text]
            pieces = []
            for item in slist:
                pieces.append(self.get_replace_html(jump_i, item))
                jump_i = jump_i + 1
            temp_sentence.append("".join(pieces))
        return temp_sentence

    def parse_text(self, text_arr, set_lang):
        """Return the flat list of sentences of all texts in ``text_arr``.

        A text for which the splitter returns ``None`` is kept unsplit.
        """
        sp = self._new_splitter(set_lang)
        temp_sentence = []
        for text in text_arr:
            slist = sp.split(text)
            if slist is not None:
                temp_sentence.extend(slist)
            else:
                temp_sentence.append(text)
        return temp_sentence

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by BeautifulSoup.

        Returns ``False`` (after printing a message) when the file cannot be
        read.
        """
        try:
            with io.FileIO(parse_file, "r") as src:
                doc = src.readall()
        except IOError:
            print('error: %s file not found!' % parse_file)
            return False
        # Start parsing the HTML document.
        return BeautifulSoup(doc)


if __name__ == "__main__":
    from sys import argv, exit

    # Five arguments are read below, so all five are required.
    if len(argv) < 6:
        print("Usage: python %s <input-file> <output-file> "
              "<save-pic-path> <show-pic-path> <js-path>" % argv[0])
        exit(255)

    if not isfile(argv[1]):
        print("No such input file: %s" % argv[1])
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Parse the file: input file, template name, image storage path,
    # image display path, script path.
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Create the template.
    so.builder()
    # Save the sentences.
    so.save_data_file()