Parsing HTML in Python with BeautifulSoup

Python parsing HTML Beautifulsoup_python

Last Update:2017-01-18 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

The code is as follows:

# coding=utf-8
import io
import re
import time
from os import makedirs
from os.path import basename, dirname, isdir, isfile
from shutil import copyfile

from BeautifulSoup import BeautifulSoup, Tag, NavigableString

# NOTE(review): project-local module; confirm the exact module/class casing.
from Sentencespliter import Sentencespliter

class Build_tpl:
    """Build a sentence-level template from an HTML file.

    The constructor parses the HTML file, collects its non-blank text nodes
    and ``<img>`` tags, and splits the text into sentences.  ``builder()``
    then rewrites the document (wrapping sentences in ``<span>`` markers,
    re-pointing and copying pictures, adding two ``<script>`` tags) and
    writes it out; ``save_data_file()`` writes the sentence list next to it.

    NOTE(review): the calls used here (``findAll``, ``replaceWith``,
    ``Tag(soup, name)``) presumably target the BeautifulSoup 3 API, as
    suggested by ``from BeautifulSoup import ...`` -- confirm.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and precompute everything ``builder()`` needs.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: file name of the template to write.
            cp_pic_dir: directory prefix pictures are copied to.
            show_pic_dir: directory prefix used in rewritten ``src``
                attributes (the path pictures are displayed from).
            js_path: path prefix for the injected JavaScript files.
            set_lang: language code handed to the sentence splitter.
                NOTE(review): the meaning of the default 2052 is defined by
                the splitter -- confirm.

        Raises:
            IOError: if ``parse_file`` could not be read.
        """
        # Directory of the parsed file; picture paths are resolved against it.
        parent = dirname(parse_file)
        if len(parent) > 1:
            self.cur_dir = parent + "/"
        else:
            self.cur_dir = "./"

        # File name of the template to build.
        self.build_tpl_name = build_tpl_name
        # Directory pictures are copied to.
        self.cp_pic_dir = cp_pic_dir
        # Directory pictures are shown from over HTTP.
        self.show_pic_dir = show_pic_dir
        # Path the JavaScript files are loaded from.
        self.js_path = js_path

        # Text nodes of the document.
        self.get_text_arr = []
        # <img> tags of the document.
        self.cur_pic_arr = []

        # Parse the file and keep the soup.
        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            # get_soup() signals a read failure with False; fail here with a
            # clear error instead of an AttributeError on ``False.body``.
            raise IOError("cannot read input file: %s" % parse_file)

        # All text nodes of <body> that are not blank.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences.
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # All <img> tags.
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentence list to ``<build_tpl_name>.data``."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the list of sentences computed by the constructor."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the strings in ``write_arr`` to ``file_name``.

        The items are joined with newlines and encoded as UTF-8.
        """
        # ``with`` closes the file even if joining/encoding/writing fails.
        with io.FileIO(file_name, "w") as out:
            out.write('\n'.join(write_arr).encode('utf-8'))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` encoded as UTF-8.

        ``file_contents`` must provide ``.encode('utf-8')``.
        """
        with io.FileIO(file_name, "w") as out:
            out.write(file_contents.encode('utf-8'))

    def get_pic_hash(self):
        """Return a date-based sub-directory for pictures.

        NOTE(review): ``%y`` is the two-digit year; confirm that ``%Y`` was
        not intended.
        """
        return time.strftime("%y/%m/%d/")

    def builder(self):
        """Rewrite the document and write the template file.

        Steps: replace every collected text node with its replacement
        string, re-point each ``<img src>`` to the display directory and
        copy the picture file, insert the ``jquery.js`` / ``init.js``
        script tags into ``<head>``, then write the result to
        ``build_tpl_name``.

        Returns:
            The list of text nodes for which no replacement happened.
        """
        # Text nodes that could not be replaced.
        bug_msg = []
        for original, replacement in zip(self.get_text_arr,
                                         self.replace_list):
            try:
                self.soup.body.find(text=original).replaceWith(replacement)
            except AttributeError:
                # Raised e.g. when find() returns None (node not found).
                bug_msg.append(original)

        # Date-based sub-directory shared by both picture paths.
        hash_dir = self.get_pic_hash()
        # Path pictures are displayed from.
        show_pic_dir = self.show_pic_dir + hash_dir
        # Path pictures are copied to.
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the picture directory if it does not exist yet.
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic in self.cur_pic_arr:
            # Re-point the picture to the display path.
            old_pic_src = pic['src']
            pic['src'] = show_pic_dir + old_pic_src
            # Copy the picture file itself.
            copyfile(self.cur_dir + old_pic_src, cp_pic_dir + old_pic_src)

        # Script tags to add.
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        # Both are inserted at index 2, init.js first, so jquery.js ends up
        # in front of init.js.
        self.soup.head.insert(2, tag2)
        self.soup.head.insert(2, tag)

        # Write the template.
        self.write_file(self.build_tpl_name, self.soup)
        return bug_msg

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for one sentence.

        Parameters:
            rep_id: replacement id, used in the ``id="rep_<rep_id>"``
                attribute.
            rep_data: replacement content; when empty, the template
                placeholder ``$rep_arr[<rep_id>]`` is used instead.
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        # The closing quote of the id attribute is emitted explicitly so the
        # generated markup is well-formed.
        return ("<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">"
                + rep_str + "</span>")

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each text is split into sentences; every sentence is wrapped by
        ``get_replace_html`` with an id that keeps counting across all
        texts.  A text the splitter returns ``None`` for is wrapped whole.
        """
        # NOTE(review): confirm the casing of Sentencespliter / setlang
        # against the project-local Sentencespliter module.
        sp = Sentencespliter()
        sp.setlang(set_lang)
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            slist = sp.split(text)
            pieces = slist if slist is not None else [text]
            replace_temp = ""
            for item in pieces:
                replace_temp = replace_temp + self.get_replace_html(jump_i,
                                                                    item)
                jump_i = jump_i + 1
            temp_sentence.append(replace_temp)
        return temp_sentence

    def parse_text(self, text_arr, set_lang):
        """Split every text in ``text_arr`` and return the flat sentence list.

        A text the splitter returns ``None`` for is kept as one entry.
        """
        # NOTE(review): confirm the casing of Sentencespliter / setlang
        # against the project-local Sentencespliter module.
        sp = Sentencespliter()
        sp.setlang(set_lang)
        temp_sentence = []
        for text in text_arr:
            slist = sp.split(text)
            if slist is not None:
                temp_sentence.extend(slist)
            else:
                temp_sentence.append(text)
        return temp_sentence

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by ``BeautifulSoup``.

        Returns ``False`` (after printing an error message) when the file
        cannot be read.
        """
        try:
            with io.FileIO(parse_file, "r") as src:
                doc = src.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        # Hand the raw file contents to the HTML parser.
        return BeautifulSoup(doc)

if __name__ == "__main__":
    # Command-line entry point: parse an HTML file, build the template and
    # save the sentence list next to it.
    from sys import argv, exit

    # Five positional arguments (argv[1]..argv[5]) are read below, so fewer
    # than six entries in argv cannot work.
    if len(argv) < 6:
        print("Usage: python %s <input-file> <output-file> "
              "<save-pic-dir> <show-pic-dir> <js-path>" % argv[0])
        exit(255)

    if not isfile(argv[1]):
        print('No such input file: %s' % argv[1])
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: input file, template name, picture save path, picture
    # display path, JS path.
    so = Build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Build the template.
    so.builder()
    # Save the split sentences.
    so.save_data_file()

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Python parsing HTML Beautifulsoup_python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Python parsing HTML Beautifulsoup_python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support