Parsing HTML in Python with BeautifulSoup

Source: Internet
Author: User
The code is as follows:

# coding=utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from SentenceSpliter import SentenceSpliter
from os.path import basename, dirname, isdir, isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re

class build_tpl:
    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir, js_path, set_lang=2052):
        '''Parameters: file to parse, template file name, directory to copy pictures into,
        picture display path, js path, the language to use (default 2052)'''

        # Directory of the file being parsed
        if len(dirname(parse_file)) > 1:
            self.cur_dir = dirname(parse_file) + "/"
        else:
            self.cur_dir = "./"

        # Name of the template file to build
        self.build_tpl_name = build_tpl_name
        # Directory the pictures are copied to
        self.cp_pic_dir = cp_pic_dir
        # Directory the pictures are served from over http
        self.show_pic_dir = show_pic_dir
        # Path the js files are loaded from
        self.js_path = js_path

        # Sentence segments
        self.get_text_arr = []
        # Names of the pictures in the current document
        self.cur_pic_arr = []

        # Parse the file and get the soup object
        self.soup = self.get_soup(parse_file)
        # Get the non-empty text nodes of the html document
        self.get_text_arr = self.soup.body.findAll(text=lambda x: len(x.strip()) > 0)
        # Get the sentence pairs
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # Get the replacement list
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # Get the pictures
        self.cur_pic_arr = self.soup.findAll('img')

        #self.write_file_by_list("no.txt", self.get_text_arr)
        #self.write_file_by_list("yes.txt", self.get_sentence_arr)

    # Save the sentence segments to a file
    def save_data_file(self):
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    # Get the sentence segments
    def get_data(self):
        return self.get_sentence_arr

    # Write a list to a file
    def write_file_by_list(self, file_name, write_arr):
        file = io.FileIO(file_name, "w")
        file.write(('\n'.join(write_arr)).encode('utf-8'))
        file.close()

    # Write a string (or the soup object) to a file
    def write_file(self, file_name, file_contents):
        file = io.FileIO(file_name, "w")
        # Convert to unicode first so a BeautifulSoup object can be written as well
        file.write(unicode(file_contents).encode('utf-8'))
        file.close()

    # Build the date-based hash directory for pictures
    def get_pic_hash(self):
        return time.strftime("%y/%m/%d/")
    # Build the template file
    def builder(self):
        # Text nodes for which no replacement could be made
        bug_msg = []
        # Replace the document text with the template markup
        for i in range(len(self.get_text_arr)):
            # Placeholder marker for this index (not used below; the markup comes from replace_list)
            rep_str = "$rep_arr[{0}]".format(i)
            try:
                self.soup.body.find(text=self.get_text_arr[i]).replaceWith(self.replace_list[i])
            except AttributeError:
                bug_msg.append(self.get_text_arr[i])

        # Get the picture hash path
        hash_dir = self.get_pic_hash()
        # Build the display path of the pictures
        show_pic_dir = self.show_pic_dir + hash_dir
        # Build the save path of the pictures
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the picture save directory if it does not exist
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_name in self.cur_pic_arr:
            # Rewrite the picture src to the display path
            old_pic_src = pic_name['src']
            pic_name['src'] = show_pic_dir + old_pic_src
            # Copy the picture file
            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            copyfile(cp_src_file, cp_dis_file)

        # Write the nodes that could not be replaced to a file
        #self.write_file_by_list("bug.txt", bug_msg)

        # Add the js tags
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        # Write the template
        self.write_file(self.build_tpl_name, self.soup)
    # Get the replacement html for one segment
    def get_replace_html(self, rep_id, rep_data=""):
        '''
        Parameters: replacement id, replacement content (empty means use the template placeholder)
        '''
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return "<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">" + rep_str + "</span>"
    # Build the list of replacement html strings
    def get_replace_list(self, text_arr, set_lang):
        sp = SentenceSpliter()
        sp.setLang(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            slist = sp.split(text)
            replace_temp = ""
            if slist != None:
                for item in slist:
                    replace_temp = replace_temp + self.get_replace_html(jump_i, item)
                    jump_i = jump_i + 1
            else:
                replace_temp = self.get_replace_html(jump_i, text)
                jump_i = jump_i + 1
            temp_sentence.append(replace_temp)
        return temp_sentence
    # Split text into sentences
    def parse_text(self, text_arr, set_lang):
        sp = SentenceSpliter()
        sp.setLang(set_lang)
        temp_sentence = []
        for text in text_arr:
            slist = sp.split(text)
            if slist != None:
                for item in slist:
                    temp_sentence.append(item)
            else:
                temp_sentence.append(text)

        return temp_sentence

    # Read and parse the html file into a soup object
    def get_soup(self, parse_file):
        try:
            file = io.FileIO(parse_file, "r")
            doc = file.readall()
            file.close()
        except IOError:
            print 'ERROR: %s file not found!' % parse_file
            return False
        # Parse the html document
        return BeautifulSoup(''.join(doc))

if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 6:
        print "Usage: python %s <input-file> <output-file> <save-pic-path> <show-pic-path> <js-path>" % argv[0]
        exit(255)

    if not isfile(argv[1]):
        print 'No such input file: %s' % argv[1]
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: the file to parse, template name, picture save path, picture display path, js path
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path)
    # Build the template
    so.builder()
    # Save the sentence pairs
    so.save_data_file()
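
The listing leans on three BeautifulSoup operations: collecting non-empty text nodes with findAll(text=...), swapping a matched node out with replaceWith(), and inserting a freshly built script tag into the head. The following sketch is a minimal, self-contained illustration of just those calls; it assumes Python 2 with the old BeautifulSoup 3 package installed, and the sample HTML and the /js/jquery.js path are made up for the demo:

# coding=utf-8
from BeautifulSoup import BeautifulSoup, Tag

html = ("<html><head><title>demo</title></head>"
        "<body><p>Hello world.</p><p>   </p></body></html>")
soup = BeautifulSoup(html)

# Collect only the non-empty text nodes inside <body>; whitespace-only nodes are skipped.
texts = soup.body.findAll(text=lambda x: len(x.strip()) > 0)
print texts  # expected: [u'Hello world.']

# Replace the matched text node with the placeholder markup the template uses.
soup.body.find(text=texts[0]).replaceWith('<span sty="data" id="rep_0">$rep_arr[0]</span>')

# Build a <script> tag and insert it into <head>, the same way builder() does.
script = Tag(soup, "script")
script['type'] = "text/javascript"
script['src'] = "/js/jquery.js"
soup.head.insert(0, script)

print soup.prettify()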
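
SentenceSpliter is an in-house module that the original post never shows; only its interface can be inferred from the calls above (a no-argument constructor, setLang(), and split() returning a list of sentences or None). The stand-in below is purely hypothetical and exists only so the script can be exercised end to end; the real module presumably performs proper language-aware segmentation rather than this naive punctuation split. Save it as SentenceSpliter.py next to the script if you want to try the code without the original dependency:

# coding=utf-8
# SentenceSpliter.py -- hypothetical stand-in, NOT the original module.
import re

class SentenceSpliter:
    def __init__(self):
        # 2052 matches the default language id used by build_tpl.
        self.lang = 2052

    def setLang(self, lang):
        # The real splitter presumably switches rules per language; the stub only records the value.
        self.lang = lang

    def split(self, text):
        # Cut the text into chunks ending with Latin or CJK sentence punctuation.
        parts = re.findall(u'[^.!?\u3002\uff01\uff1f]+[.!?\u3002\uff01\uff1f]?', text)
        parts = [p.strip() for p in parts if len(p.strip()) > 0]
        # Return a list when there is more than one sentence, otherwise None,
        # which is the case build_tpl checks for ("if slist != None").
        if len(parts) > 1:
            return parts
        return None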
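
Assuming the listing is saved as build_tpl.py and the hypothetical stub above as SentenceSpliter.py, a run would look something like the line below; all paths are placeholders, and the script expects five arguments (input html, output template, picture save directory, picture display prefix, js path):

python build_tpl.py html/testpic.html testpic.tpl ./pics/ /static/pics/ /js/

builder() writes the rewritten html to testpic.tpl and copies the referenced images into ./pics/<yy>/<mm>/<dd>/, while save_data_file() drops the extracted sentences into testpic.tpl.data.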
