Parse HTML with Python: BeautifulSoup

Source: Internet
Author: User

The code is as follows:

# coding=utf-8
import io
import re
import time
from os import makedirs
from os.path import basename, dirname, isdir, isfile
from shutil import copyfile

from BeautifulSoup import BeautifulSoup, Tag, NavigableString

from SentenceSpliter import SentenceSpliter

class build_tpl(object):
    """Turn an HTML page into a sentence-level template.

    The page is parsed with BeautifulSoup; every non-blank text node in
    ``<body>`` is split into sentences and each sentence is wrapped in a
    ``<span ... id="rep_N">`` element.  ``builder()`` then rewrites the
    document (wrapped text, re-pointed ``<img>`` sources, two extra
    ``<script>`` tags) and writes it out, while ``save_data_file()`` stores
    the extracted sentences next to it.

    Attributes set by ``__init__``:
        cur_dir           directory of the parsed file, with a trailing "/"
        build_tpl_name    path of the template file to write
        cp_pic_dir        base directory images are copied into
        show_pic_dir      base path/URL written into ``<img src>``
        js_path           prefix for the injected script ``src`` values
        soup              the parsed document
        get_text_arr      non-blank text nodes found in ``<body>``
        get_sentence_arr  flat list of sentences from those text nodes
        replace_list      one replacement HTML string per text node
        cur_pic_arr       all ``<img>`` tags of the document
        bug_msg           text nodes ``builder()`` could not replace
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and precompute sentences and replacements.

        Parameters: file to parse, template file name, directory images are
        saved to, path images are displayed from, JS path, and the language
        id handed to the sentence splitter (2052 is presumably the LCID for
        Simplified Chinese -- confirm against SentenceSpliter).

        Raises IOError if ``parse_file`` cannot be read.
        """
        # Any non-empty directory part counts; the former "length > 1" test
        # silently mapped single-character directory names to "./".
        parent = dirname(parse_file)
        if parent:
            self.cur_dir = parent if parent.endswith("/") else parent + "/"
        else:
            self.cur_dir = "./"

        self.build_tpl_name = build_tpl_name
        self.cp_pic_dir = cp_pic_dir
        self.show_pic_dir = show_pic_dir
        self.js_path = js_path

        # Filled by builder() with the text nodes it failed to replace.
        self.bug_msg = []

        self.soup = self.get_soup(parse_file)
        # Only text nodes that contain something besides whitespace.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda node: len(node.strip()) > 0)
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the extracted sentences to ``<build_tpl_name>.data``."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the flat list of extracted sentences."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the items of ``write_arr`` to ``file_name``, one per line.

        Items are joined with "\\n" (no trailing newline) and stored as UTF-8.
        """
        self.write_file(file_name, u"\n".join(write_arr))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` as UTF-8.

        Text is encoded as UTF-8 and bytes are written unchanged.  Any other
        object (builder() passes the soup itself) is converted to text first
        instead of relying on it having an ``encode`` method.
        """
        text_type = type(u"")
        if not isinstance(file_contents, (bytes, text_type)):
            file_contents = text_type(file_contents)
        if isinstance(file_contents, text_type):
            file_contents = file_contents.encode('utf-8')
        # The context manager closes the handle even if the write fails.
        with io.open(file_name, "wb") as handle:
            handle.write(file_contents)

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` sub-directory path."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Rewrite the parsed document and write the template file.

        Steps: replace each body text node with its prepared replacement,
        copy every image into ``cp_pic_dir/<date>/`` and point its ``src`` at
        ``show_pic_dir/<date>/``, insert script tags for jquery.js and
        init.js into ``<head>``, then write the document to
        ``build_tpl_name``.
        """
        # Text nodes whose replacement failed are kept for inspection.
        self.bug_msg = []
        body = self.soup.body
        for text, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                body.find(text=text).replaceWith(replacement)
            except AttributeError:
                # find() returned None, or the node could not be replaced.
                self.bug_msg.append(text)

        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic in self.cur_pic_arr:
            old_pic_src = pic.get('src')
            if not old_pic_src:
                # An <img> without a src has nothing to copy or rewrite.
                continue
            pic['src'] = show_pic_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            # The src may contain sub-directories ("img/a.png"); make sure
            # the destination directory exists before copying.
            dis_dir = dirname(cp_dis_file)
            if dis_dir and not isdir(dis_dir):
                makedirs(dis_dir)
            copyfile(self.cur_dir + old_pic_src, cp_dis_file)

        # Insert in ascending position so jquery.js always precedes init.js,
        # even when <head> has fewer than two children.
        head = self.soup.head
        for offset, script_name in enumerate(("jquery.js", "init.js")):
            script = Tag(self.soup, "script")
            script['type'] = "text/javascript"
            script['src'] = self.js_path + script_name
            head.insert(2 + offset, script)

        self.write_file(self.build_tpl_name, self.soup)

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` wrapper for replacement slot ``rep_id``.

        The span content is ``rep_data``; when that is empty the template
        placeholder ``$rep_arr[<rep_id>]`` is used instead.
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return ("<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">"
                + rep_str + "</span>")

    def _split_texts(self, text_arr, set_lang):
        """Return one list of sentences per entry of ``text_arr``.

        A text the splitter returns ``None`` for is kept whole.
        """
        # NOTE(review): the SetLang/Split method names are reconstructed
        # from a garbled listing -- confirm against SentenceSpliter.
        sp = SentenceSpliter()
        sp.SetLang(set_lang)
        groups = []
        for text in text_arr:
            pieces = sp.Split(text)
            groups.append([text] if pieces is None else pieces)
        return groups

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement HTML string per entry of ``text_arr``.

        Each sentence of a text gets its own span; span ids are numbered
        consecutively across the whole list, matching the order of
        ``parse_text``.
        """
        replacements = []
        next_id = 0
        for pieces in self._split_texts(text_arr, set_lang):
            spans = []
            for piece in pieces:
                spans.append(self.get_replace_html(next_id, piece))
                next_id += 1
            replacements.append("".join(spans))
        return replacements

    def parse_text(self, text_arr, set_lang):
        """Return the flat list of sentences contained in ``text_arr``."""
        sentences = []
        for pieces in self._split_texts(text_arr, set_lang):
            sentences.extend(pieces)
        return sentences

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by BeautifulSoup.

        Raises IOError if the file cannot be read.  The former behaviour of
        printing a message and returning False only postponed the failure to
        the first attribute access on the result.
        """
        with io.open(parse_file, "rb") as handle:
            doc = handle.read()
        return BeautifulSoup(doc)

def main(argv=None):
    """Command-line entry point: build a template from an HTML file.

    ``argv`` defaults to ``sys.argv`` and must hold, after the program name:
    input file, output template file, image save directory, image display
    path and JS path.

    Exits with status 255 when arguments are missing and with status 1 when
    the input file does not exist.
    """
    import sys

    if argv is None:
        argv = sys.argv

    # Five arguments are read below, so all five are required; checking for
    # only two let the later argv[3..5] lookups fail with IndexError.
    if len(argv) < 6:
        print("Usage: python %s <input-file> <output-file> "
              "<save-pic-dir> <show-pic-dir> <js-path>" % argv[0])
        sys.exit(255)

    if not isfile(argv[1]):
        print("No such input file: %s" % argv[1])
        sys.exit(1)

    paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path = argv[1:6]

    # Parse the input, then write the template and the sentence data file.
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    so.builder()
    so.save_data_file()


if __name__ == "__main__":
    main()

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.