A project of mine needed to parse HTML in Python, and I came across BeautifulSoup, which turned out to be very handy. I wrote a small program with it; you may not be able to tell at first glance what it is for, so the point here is simply to show how BeautifulSoup is used. Of course, everything I do here is quite basic. I have not learned the advanced features yet; they are still beyond me.
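For readers who have never touched BeautifulSoup, the script below really relies on only a handful of calls from the old BeautifulSoup 3 API (the `BeautifulSoup` module it imports): parsing a document, `findAll` with a `text=` filter, `replaceWith`, tag attribute access, and inserting a new `Tag`. Here is a minimal sketch of those calls; the HTML snippet is made up purely for illustration:

# coding=utf-8
# Minimal sketch of the BeautifulSoup 3 calls used by the script below.
# The HTML snippet is a made-up example, not part of the original project.
from BeautifulSoup import BeautifulSoup, Tag

html = '<html><head><title>t</title></head><body><p>Hello world.</p><img src="a.png"/></body></html>'
soup = BeautifulSoup(html)

# findAll with a text= callable returns the non-empty text nodes
texts = soup.body.findAll(text=lambda x: len(x.strip()) > 0)
print texts                      # e.g. [u'Hello world.']

# replaceWith swaps a node for new content
soup.body.find(text=texts[0]).replaceWith('<span id="rep_0">Hello world.</span>')

# tag attributes are read and written like dict entries
for img in soup.findAll('img'):
    img['src'] = '/pics/' + img['src']

# Tag(soup, name) builds a new element that can be inserted anywhere
script = Tag(soup, 'script')
script['type'] = 'text/javascript'
script['src'] = '/js/jquery.js'
soup.head.insert(0, script)

print soup                       # the modified document

The full script follows.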
# coding=utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from SentenceSpliter import SentenceSpliter
from os.path import basename, dirname, isdir, isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re


class build_tpl:
    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir, js_path, set_lang=2052):
        '''Arguments: file to parse, template name, image save path, image display path,
        js path, current language (used for sentence splitting)'''

        # Directory of the file being parsed
        if len(dirname(parse_file)) > 1:
            self.cur_dir = dirname(parse_file) + "/"
        else:
            self.cur_dir = "./"

        # Name of the template file to build
        self.build_tpl_name = build_tpl_name
        # Directory the images are copied to
        self.cp_pic_dir = cp_pic_dir
        # Directory the images are served from over HTTP
        self.show_pic_dir = show_pic_dir
        # Path the JS files are loaded from
        self.js_path = js_path

        # Text segments
        self.get_text_arr = []
        # Images in the current document
        self.cur_pic_arr = []

        # Parse the file and get the soup object
        self.soup = self.get_soup(parse_file)
        # Collect the non-empty text nodes of the document body
        self.get_text_arr = self.soup.body.findAll(text=lambda x: len(x.strip()) > 0)
        # Split the text nodes into sentences
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # Build the replacement list
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # Collect the images
        self.cur_pic_arr = self.soup.findAll('img')

        #self.write_file_by_list("no.txt", self.get_text_arr)
        #self.write_file_by_list("yes.txt", self.get_sentence_arr)

    # Save the sentence list to a file
    def save_data_file(self):
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    # Get the sentence list
    def get_data(self):
        return self.get_sentence_arr

    # Write a list to a file, one item per line
    def write_file_by_list(self, file_name, write_arr):
        file = io.FileIO(file_name, "w")
        file.write(('\n'.join(write_arr)).encode('utf-8'))
        file.close()

    # Write a string to a file
    def write_file(self, file_name, file_contents):
        file = io.FileIO(file_name, "w")
        file.write(file_contents.encode('utf-8'))
        file.close()

    # Build the hashed (date-based) image directory name
    def get_pic_hash(self):
        return time.strftime("%Y/%m/%d/")

    # Build the template file
    def builder(self):
        # Text nodes that could not be replaced
        bug_msg = []
        # Replace each text node with its placeholder markup
        for i in range(len(self.get_text_arr)):
            rep_str = "$rep_arr[{0}]".format(i)
            try:
                self.soup.body.find(text=self.get_text_arr[i]).replaceWith(self.replace_list[i])
            except AttributeError:
                bug_msg.append(self.get_text_arr[i])

        # Hashed image path
        hash_dir = self.get_pic_hash()
        # Path used to display the images
        show_pic_dir = self.show_pic_dir + hash_dir
        # Path the images are copied to
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the image directory if it does not exist
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_name in self.cur_pic_arr:
            # Rewrite the image src
            old_pic_src = pic_name['src']
            pic_name['src'] = show_pic_dir + old_pic_src
            # Copy the image file
            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            copyfile(cp_src_file, cp_dis_file)

        # Write the replacement failures to a file
        #self.write_file_by_list("bug.txt", bug_msg)

        # Add the JS tags
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        # Write out the template
        self.write_file(self.build_tpl_name, self.soup)

    # Build the replacement HTML for one segment
    def get_replace_html(self, rep_id, rep_data=""):
        '''Arguments: replacement id, replacement content
        (if empty, the template placeholder is used instead)'''
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return "<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">" + rep_str + "</span>"

    # Build the replacement list
    def get_replace_list(self, text_arr, set_lang):
        Sp = SentenceSpliter()
        Sp.SetLang(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            SList = Sp.Split(text)
            replace_temp = ""
            if SList != None:
                for item in SList:
                    replace_temp = replace_temp + self.get_replace_html(jump_i, item)
                    jump_i = jump_i + 1
            else:
                replace_temp = self.get_replace_html(jump_i, text)
                jump_i = jump_i + 1
            temp_sentence.append(replace_temp)
        return temp_sentence

    # Split text nodes into sentences
    def parse_text(self, text_arr, set_lang):
        Sp = SentenceSpliter()
        Sp.SetLang(set_lang)
        temp_sentence = []
        for text in text_arr:
            SList = Sp.Split(text)
            if SList != None:
                for item in SList:
                    temp_sentence.append(item)
            else:
                temp_sentence.append(text)
        return temp_sentence

    # Load and parse the source file
    def get_soup(self, parse_file):
        try:
            file = io.FileIO(parse_file, "r")
            doc = file.readall()
            file.close()
        except IOError:
            print 'ERROR: %s file not found!' % parse_file
            return False
        # Parse the HTML document
        return BeautifulSoup(''.join(doc))


if __name__ == "__main__":
    from sys import argv, exit

    if len(argv) < 6:
        print "USAGE: python %s <input-file> <output-file> <save-pic-dir> <show-pic-dir> <js-path>" % argv[0]
        exit(255)

    if not isfile(argv[1]):
        print "no such input file: %s" % argv[1]
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: source file, template name, image save path, image display path, js path
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path)
    # Build the template
    so.builder()
    # Save the split sentences
    so.save_data_file()
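The script expects five command-line arguments, but you can also drive the class directly from another Python program. The sketch below assumes the listing above is saved as build_tpl.py; the paths are placeholders for illustration only, not part of the original project:

# Hypothetical usage sketch; the file names and paths are placeholders.
from build_tpl import build_tpl   # assuming the script above is saved as build_tpl.py

so = build_tpl("html/testpic.html",   # file to parse
               "testpic.tpl",         # template file to produce
               "/var/www/pics/",      # directory the images are copied into
               "/pics/",              # URL prefix used in the rewritten <img> tags
               "/js/")                # URL prefix for jquery.js and init.js
so.builder()          # writes the template with placeholders and <script> tags
so.save_data_file()   # writes testpic.tpl.data with one sentence per line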