A project of mine needed to parse HTML in Python, and I came across BeautifulSoup, which turned out to be very handy. I wrote a small program with it; you may not be able to tell at first glance what it is for, so the point here is simply to show how BeautifulSoup is used. Of course, everything I do here is quite basic. I have not learned the advanced features yet; they are still beyond me.
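For readers who have never touched BeautifulSoup, the script below really relies on only a handful of calls from the old BeautifulSoup 3 API (the `BeautifulSoup` module it imports): parsing a document, `findAll` with a `text=` filter, `replaceWith`, tag attribute access, and inserting a new `Tag`. Here is a minimal sketch of those calls; the HTML snippet is made up purely for illustration:

# coding=utf-8
# Minimal sketch of the BeautifulSoup 3 calls used by the script below.
# The HTML snippet is a made-up example, not part of the original project.
from BeautifulSoup import BeautifulSoup, Tag

html = '<html><head><title>t</title></head><body><p>Hello world.</p><img src="a.png"/></body></html>'
soup = BeautifulSoup(html)

# findAll with a text= callable returns the non-empty text nodes
texts = soup.body.findAll(text=lambda x: len(x.strip()) > 0)
print texts                      # e.g. [u'Hello world.']

# replaceWith swaps a node for new content
soup.body.find(text=texts[0]).replaceWith('<span id="rep_0">Hello world.</span>')

# tag attributes are read and written like dict entries
for img in soup.findAll('img'):
    img['src'] = '/pics/' + img['src']

# Tag(soup, name) builds a new element that can be inserted anywhere
script = Tag(soup, 'script')
script['type'] = 'text/javascript'
script['src'] = '/js/jquery.js'
soup.head.insert(0, script)

print soup                       # the modified document

The full script follows.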
# coding=utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from SentenceSpliter import SentenceSpliter
from os.path import basename, dirname, isdir, isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re


class build_tpl:
    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir, js_path, set_lang=2052):
        '''Arguments: file to parse, template name, image save path, image display path,
        js path, current language (used for sentence splitting)'''

        # Directory of the file being parsed
        if len(dirname(parse_file)) > 1:
            self.cur_dir = dirname(parse_file) + "/"
        else:
            self.cur_dir = "./"

        # Name of the template file to build
        self.build_tpl_name = build_tpl_name
        # Directory the images are copied to
        self.cp_pic_dir = cp_pic_dir
        # Directory the images are served from over HTTP
        self.show_pic_dir = show_pic_dir
        # Path the JS files are loaded from
        self.js_path = js_path

        # Text segments
        self.get_text_arr = []
        # Images in the current document
        self.cur_pic_arr = []

        # Parse the file and get the soup object
        self.soup = self.get_soup(parse_file)
        # Collect the non-empty text nodes of the document body
        self.get_text_arr = self.soup.body.findAll(text=lambda x: len(x.strip()) > 0)
        # Split the text nodes into sentences
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # Build the replacement list
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # Collect the images
        self.cur_pic_arr = self.soup.findAll('img')

        #self.write_file_by_list("no.txt", self.get_text_arr)
        #self.write_file_by_list("yes.txt", self.get_sentence_arr)

    # Save the sentence list to a file
    def save_data_file(self):
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    # Get the sentence list
    def get_data(self):
        return self.get_sentence_arr

    # Write a list to a file, one item per line
    def write_file_by_list(self, file_name, write_arr):
        file = io.FileIO(file_name, "w")
        file.write(('\n'.join(write_arr)).encode('utf-8'))
        file.close()

    # Write a string to a file
    def write_file(self, file_name, file_contents):
        file = io.FileIO(file_name, "w")
        file.write(file_contents.encode('utf-8'))
        file.close()

    # Build the hashed (date-based) image directory name
    def get_pic_hash(self):
        return time.strftime("%Y/%m/%d/")

    # Build the template file
    def builder(self):
        # Text nodes that could not be replaced
        bug_msg = []
        # Replace each text node with its placeholder markup
        for i in range(len(self.get_text_arr)):
            rep_str = "$rep_arr[{0}]".format(i)
            try:
                self.soup.body.find(text=self.get_text_arr[i]).replaceWith(self.replace_list[i])
            except AttributeError:
                bug_msg.append(self.get_text_arr[i])

        # Hashed image path
        hash_dir = self.get_pic_hash()
        # Path used to display the images
        show_pic_dir = self.show_pic_dir + hash_dir
        # Path the images are copied to
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the image directory if it does not exist
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_name in self.cur_pic_arr:
            # Rewrite the image src
            old_pic_src = pic_name['src']
            pic_name['src'] = show_pic_dir + old_pic_src
            # Copy the image file
            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            copyfile(cp_src_file, cp_dis_file)

        # Write the replacement failures to a file
        #self.write_file_by_list("bug.txt", bug_msg)

        # Add the JS tags
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        # Write out the template
        self.write_file(self.build_tpl_name, self.soup)

    # Build the replacement HTML for one segment
    def get_replace_html(self, rep_id, rep_data=""):
        '''Arguments: replacement id, replacement content
        (if empty, the template placeholder is used instead)'''
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return "<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">" + rep_str + "</span>"

    # Build the replacement list
    def get_replace_list(self, text_arr, set_lang):
        Sp = SentenceSpliter()
        Sp.SetLang(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            SList = Sp.Split(text)
            replace_temp = ""
            if SList != None:
                for item in SList:
                    replace_temp = replace_temp + self.get_replace_html(jump_i, item)
                    jump_i = jump_i + 1
            else:
                replace_temp = self.get_replace_html(jump_i, text)
                jump_i = jump_i + 1
            temp_sentence.append(replace_temp)
        return temp_sentence

    # Split text nodes into sentences
    def parse_text(self, text_arr, set_lang):
        Sp = SentenceSpliter()
        Sp.SetLang(set_lang)
        temp_sentence = []
        for text in text_arr:
            SList = Sp.Split(text)
            if SList != None:
                for item in SList:
                    temp_sentence.append(item)
            else:
                temp_sentence.append(text)
        return temp_sentence

    # Load and parse the source file
    def get_soup(self, parse_file):
        try:
            file = io.FileIO(parse_file, "r")
            doc = file.readall()
            file.close()
        except IOError:
            print 'ERROR: %s file not found!' % parse_file
            return False
        # Parse the HTML document
        return BeautifulSoup(''.join(doc))


if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 6:
        print "USAGE: python %s <input-file> <output-file> <save-pic-dir> <show-pic-dir> <js-path>" % argv[0]
        exit(255)

    if not isfile(argv[1]):
        print "no such input file: %s" % argv[1]
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: source file, template name, image save path, image display path, js path
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path)
    # Build the template
    so.builder()
    # Save the split sentences
    so.save_data_file()
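The script expects five command-line arguments, but you can also drive the class directly from another Python program. The sketch below assumes the listing above is saved as build_tpl.py; the paths are placeholders for illustration only, not part of the original project:

# Hypothetical usage sketch; the file names and paths are placeholders.
from build_tpl import build_tpl   # assuming the script above is saved as build_tpl.py

so = build_tpl("html/testpic.html",   # file to parse
               "testpic.tpl",         # template file to produce
               "/var/www/pics/",      # directory the images are copied into
               "/pics/",              # URL prefix used in the rewritten <img> tags
               "/js/")                # URL prefix for jquery.js and init.js
so.builder()          # writes the template with placeholders and <script> tags
so.save_data_file()   # writes testpic.tpl.data with one sentence per line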