Example page: http://xyzp.haitou.cc/article/722427.html
First, download each page locally. You can either shell out with os.system("wget " + str(url)) or fetch it with urllib2.urlopen(url); both are straightforward, so I won't go into detail.
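For completeness, here is a minimal download sketch. It assumes you already have the list of article URLs to fetch; the list contents and the file-naming scheme are only illustrative:

# -*- coding: utf-8 -*-
# Minimal download sketch (Python 2): save each article page as a local .html file.
# The `urls` list and the naming scheme are assumptions for illustration.
import urllib2

urls = [
    "http://xyzp.haitou.cc/article/722427.html",  # the example page above
]

for url in urls:
    page = urllib2.urlopen(url).read()
    fname = url.rstrip("/").split("/")[-1]        # e.g. 722427.html
    with open(fname, "w") as fw:
        fw.write(page)
    print fname, "saved"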
Then comes the main part: information extraction.
#!/usr/bin/env python
# coding=utf-8
from bs4 import BeautifulSoup
import codecs
import sys
import os
reload(sys)
sys.setdefaultencoding("utf-8")
import re
from pymongo import MongoClient


def get_jdstr(fname):
    """Parse one saved page and pull out the fields we care about."""
    retdict = {}
    with open(fname) as fr:
        soup = BeautifulSoup(fr.read().replace('""', '"'))
        jdstr = soup.get_text()
        retdict["inc_name"] = soup.title.string.split()[0]
        retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
        retdict["index_url"] = re.search(r"http://xyzp.haitou.cc/article/\d+\.html", jdstr).group()
        retdict["info_from"] = soup.find_all("p", "text-ellipsis")[0].contents[1].get_text()
        retdict["workplace"] = soup.find_all("p", "text-ellipsis")[1].contents[1].get_text()
        retdict["info_tag"] = soup.find_all("p", "text-ellipsis")[2].contents[1].get_text()
        retdict["pub_time"] = soup.find_all("p", "text-ellipsis")[3].contents[1].get_text()
    return retdict


def JD_extr():
    """Extract every downloaded .html page and dump the fields to a CSV file."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
    fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
    res = []
    for fname in fnames[1:500]:
        tmp = []
        retdict = get_jdstr(fname)
        res.append(retdict)
        for k, v in retdict.iteritems():
            tmp.append(v)
        fw.write(" , ".join(tmp) + "\n")
        fw.write("===" * 20 + "\n")
        print fname, "done!"
    return res


def change2html():
    """Rename downloaded .txt files to .html so they are picked up above."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
    for fname in fnames:
        cmd = "mv " + str(fname) + " " + fname[:-3] + "html"
        print cmd
        os.system(cmd)


def store2mongodb():
    """Insert the extracted documents into the JD_Haitou.haitouJD collection."""
    client = MongoClient("localhost", 27017)
    db = client.JD_Haitou
    documents = JD_extr()
    for d in documents:
        db.haitouJD.insert(d)
    mycol = db["haitouJD"]
    print mycol.count()


def split_jd_test_data(fname='./tmp_jd_haitou_clean.csv'):
    """Pull the article URLs back out of the CSV and write them with a label."""
    fw = codecs.open('./split_jd_res.csv', 'w', 'utf-8')
    fr = codecs.open(fname, 'r', 'utf-8')
    indexurl = re.compile(r"http://xyzp.haitou.cc/article/\d+\.html")
    for line in fr:
        if indexurl.search(line):
            url = indexurl.search(line).group()
            cnt = '1'  # label defaults to 1
            fw.write(url + "\t" + cnt + "\n")
    fr.close()
    fw.close()


if __name__ == "__main__":
    JD_extr()            # extract and write to the CSV file
    store2mongodb()
    split_jd_test_data()
    print "done"
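After the script has run, a quick way to sanity-check what landed in MongoDB is a few lines of pymongo. This is only a sketch; it reuses the database and collection names from the script above, and the printed keys are the ones filled in by get_jdstr:

# Quick check of the stored documents (Python 2 sketch, names taken from the script above).
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
col = client.JD_Haitou["haitouJD"]

print col.count()                 # number of stored postings
doc = col.find_one()              # inspect one document
for key in ("inc_name", "info_from", "workplace", "info_tag", "pub_time", "index_url"):
    print key, ":", doc.get(key)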
In short: use bs4 to extract the information from the Haitou (haitou.cc) pages and store it in a MongoDB database.