The code is as follows:
# coding=utf-8
import io
import re
import time
from os import makedirs
from os.path import basename, dirname, isdir, isfile
from shutil import copyfile

from BeautifulSoup import BeautifulSoup, Tag, NavigableString
# NOTE(review): project-local module; the spelling below is taken verbatim
# from this listing, whose letter case is unreliable -- confirm against the
# real module/class name.
from Sentencespliter import Sentencespliter
class Build_tpl:
    """Build a template from an HTML file by wrapping its text in spans.

    On construction the HTML file is parsed, every non-blank text node under
    ``<body>`` is collected and split into sentences, and a replacement
    string (one ``<span>`` per sentence) is prepared for each text node.
    ``builder()`` then applies the replacements, relocates the images,
    injects two ``<script>`` tags and writes the template file.
    ``save_data_file()`` writes the sentence list next to the template.

    NOTE(review): ``Sentencespliter`` and its ``setlang``/``split`` methods
    are spelled as they appear in this listing; confirm the real casing.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and prepare everything ``builder()`` needs.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: file name of the template to write.
            cp_pic_dir: directory prefix the images are copied into.
            show_pic_dir: path prefix written into each image ``src``.
            js_path: path prefix for the injected ``jquery.js``/``init.js``.
            set_lang: language code handed to the sentence splitter.

        Raises:
            IOError: if ``parse_file`` cannot be read.
        """
        # Directory of the parsed file, always ending in "/"; image sources
        # are resolved against it when the images are copied.
        parent_dir = dirname(parse_file)
        if parent_dir:
            if parent_dir.endswith("/"):
                self.cur_dir = parent_dir
            else:
                self.cur_dir = parent_dir + "/"
        else:
            self.cur_dir = "./"
        # File name of the template to build.
        self.build_tpl_name = build_tpl_name
        # Directory the images are copied to.
        self.cp_pic_dir = cp_pic_dir
        # Directory the images are displayed from.
        self.show_pic_dir = show_pic_dir
        # Path the JS files are loaded from.
        self.js_path = js_path
        # Parse the file. get_soup() reports a read failure by returning
        # False; stop here with a clear error instead of failing later on
        # an attribute lookup.
        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            raise IOError("unable to read input file: %s" % parse_file)
        # Every text node under <body> that is not blank.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences.
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # All <img> tags in the document.
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentence list to ``<build_tpl_name>.data``."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the flat list of sentences collected by ``__init__``."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write ``write_arr`` to ``file_name``, one item per line, as UTF-8."""
        payload = '\n'.join(write_arr).encode('utf-8')
        # The with-block closes the file even if the write fails.
        with io.FileIO(file_name, "w") as out:
            out.write(payload)

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` encoded as UTF-8."""
        payload = file_contents.encode('utf-8')
        with io.FileIO(file_name, "w") as out:
            out.write(payload)

    def get_pic_hash(self):
        """Return today's date as a ``yy/mm/dd/`` sub-directory path."""
        return time.strftime("%y/%m/%d/")

    def builder(self):
        """Apply the replacements, relocate images, add JS, write the template.

        Text nodes that could not be located for replacement are stored in
        ``self.bug_msg`` so the caller can inspect them afterwards.
        """
        # Text nodes for which no replacement took place.
        bug_msg = []
        for original, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                self.soup.body.find(text=original).replaceWith(replacement)
            except AttributeError:
                bug_msg.append(original)
        self.bug_msg = bug_msg

        # Date-based sub-directory shared by the display and copy paths.
        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir
        # Create the image target directory if it does not exist yet.
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)
        for pic in self.cur_pic_arr:
            # Point the tag at the display path, then copy the image file.
            old_pic_src = pic['src']
            pic['src'] = show_pic_dir + old_pic_src
            copyfile(self.cur_dir + old_pic_src, cp_pic_dir + old_pic_src)

        # Add the JS includes.
        jquery_tag = Tag(self.soup, "script")
        jquery_tag['type'] = "text/javascript"
        jquery_tag['src'] = self.js_path + "jquery.js"
        init_tag = Tag(self.soup, "script")
        init_tag['type'] = "text/javascript"
        init_tag['src'] = self.js_path + "init.js"
        # Both go in at position 2; inserting init.js first leaves jquery.js
        # ahead of it in the final document.
        self.soup.head.insert(2, init_tag)
        self.soup.head.insert(2, jquery_tag)

        # Write the template.
        self.write_file(self.build_tpl_name, self.soup)

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for replacement slot ``rep_id``.

        Parameters:
            rep_id: index of the slot; becomes part of the span's ``id``.
            rep_data: content of the span. When empty, the template
                placeholder ``$rep_arr[<rep_id>]`` is used instead.
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return ("<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">"
                + rep_str + "</span>")

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each text is split into sentences and every sentence is wrapped by
        ``get_replace_html`` with a running id. The ids advance exactly as
        the items appended by ``parse_text`` do, so id ``n`` corresponds to
        item ``n`` of the sentence list.
        """
        sp = Sentencespliter()
        sp.setlang(set_lang)
        replacements = []
        next_id = 0
        for text in text_arr:
            slist = sp.split(text)
            if slist is not None:
                spans = []
                for item in slist:
                    spans.append(self.get_replace_html(next_id, item))
                    next_id += 1
                replace_temp = "".join(spans)
            else:
                # No split result: wrap the whole text as a single slot.
                replace_temp = self.get_replace_html(next_id, text)
                next_id += 1
            replacements.append(replace_temp)
        return replacements

    def parse_text(self, text_arr, set_lang):
        """Split every text in ``text_arr`` and return a flat sentence list.

        A text for which the splitter returns ``None`` is kept whole.
        """
        sp = Sentencespliter()
        sp.setlang(set_lang)
        sentences = []
        for text in text_arr:
            slist = sp.split(text)
            if slist is not None:
                sentences.extend(slist)
            else:
                sentences.append(text)
        return sentences

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by ``BeautifulSoup``.

        Returns ``False`` (after printing an error) if the file cannot be
        read.
        """
        try:
            with io.FileIO(parse_file, "r") as source:
                doc = source.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        # Start parsing the HTML document.
        return BeautifulSoup(doc)
if __name__ == "__main__":
    # Command-line entry point: build the template for one HTML file and
    # save its sentence list.
    from sys import argv, exit

    # Five arguments are read below (argv[1]..argv[5]), so all of them must
    # be present.
    if len(argv) < 6:
        print("Usage: python %s <input-file> <output-file> <save-pic-dir> "
              "<show-pic-dir> <js-path>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print('No such input file: %s' % argv[1])
        exit(1)
    paser_file = argv[1]
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: file to parse, template name, image save path, image
    # display path, JS path.
    so = Build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Build the template.
    so.builder()
    # Save the split sentences.
    so.save_data_file()