# Tags:
# -*- coding: utf-8 -*-
"""Extract document ids and passage text from CDR_sample.xml for DNorm.

Created on Thu Apr 16 23:18:27 2015

@author: shifeng

Purpose: parse ``CDR_sample.xml`` and write it out in the format DNorm
accepts -- one line per ``<document>``: the document id, a tab, then the
text of every passage written back to back (the title text is immediately
followed by the abstract text, with no separator).  Writing the
training-set "label" column is still to be done.

XML file: see the CSDN shared resources.
Reference blog: http://www.cnblogs.com/fnng/p/3581433.html

Notes
-----
* The script was written for Python 2; it is kept runnable on both
  Python 2 and Python 3 (single-argument ``print(...)`` calls, ``io.open``
  and ``u""`` literals).
* Two experiments that were never adopted have been dropped from the body:
  a ``xml.dom.minidom`` walk based on ``getElementsByTagName`` and an
  lxml / SAX / BeautifulSoup attempt.  Their imports are retained below.
"""
import codecs
import io
import xml

try:
    import StringIO  # Python 2 only; unused by the active code path.
except ImportError:
    StringIO = None

try:
    from lxml import etree  # Third-party; only the dropped experiment used it.
except ImportError:
    etree = None

from xml.sax import *  # noqa: F401,F403 -- kept from the original script
from xml.sax.handler import *  # noqa: F401,F403 -- kept from the original script
from xml.etree import ElementTree as ET
import xml.dom.minidom

INPUT_XML = "CDR_sample.xml"
OUTPUT_TEXT = "train_text.txt"


def _as_text(value):
    """Return *value* as a text string; ``None`` becomes the empty string.

    On Python 2, ElementTree hands back byte strings for ASCII-only content,
    while ``io.open`` only accepts text, so bytes are decoded here.
    """
    if value is None:
        return u""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _print_annotation(annotation):
    """Print the location, mention text and non-"type" infon of an annotation."""
    for field in annotation:
        if field.tag == "location":
            print("%s %s" % (field.attrib.get("offset"),
                             field.attrib.get("length")))
        elif field.tag == "text":
            print(field.text)
        elif field.tag == "infon" and field.attrib.get("key") != "type":
            # The "type" infon is skipped; only the other infon entry of the
            # annotation is reported.
            print("%s %s" % (field.attrib.get("key"), field.text))


def _document_line(document, verbose=True):
    """Build the output line (without trailing newline) for one document.

    The line is the document id followed by a tab, then every passage text
    concatenated in document order.  When *verbose* is true the visited
    fields are also printed, as the original script did.
    """
    pieces = []
    for child in document:
        if child.tag == "id":
            if verbose:
                print("%s : %s" % (child.tag, child.text))
            pieces.append(_as_text(child.text))
            pieces.append(u"\t")
        elif child.tag == "passage":
            for field in child:
                if field.tag == "text":
                    # Title and abstract texts are written consecutively.
                    if verbose:
                        print("%s :: %s" % (field.tag, field.text))
                    pieces.append(_as_text(field.text))
                elif verbose and field.tag == "offset":
                    # The offset is only reported, never written, so a
                    # malformed value must not abort the conversion.
                    print("offset: %s" % _as_text(field.text).strip())
                elif verbose and field.tag == "annotation":
                    print("*" * 10)
                    _print_annotation(field)
    return u"".join(pieces)


def convert(xml_path=INPUT_XML, out_path=OUTPUT_TEXT, verbose=True):
    """Write one ``id<TAB>text`` line per ``<document>`` of *xml_path*.

    Args:
        xml_path: XML file whose root element contains ``<document>``
            children.
        out_path: Text file to create (overwritten), encoded as UTF-8.
        verbose: Print the parsed fields to stdout while converting.

    Returns:
        The number of documents written.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_path* is not well-formed.
        IOError / OSError: if either file cannot be opened.
    """
    # Parse before opening the output so that a broken input file does not
    # leave a truncated output file behind.
    documents = ET.parse(xml_path).findall("./document")
    with io.open(out_path, "w", encoding="utf-8") as out:
        for document in documents:
            out.write(_document_line(document, verbose))
            out.write(u"\n")  # one line per document
            if verbose:
                print("*" * 30)
    return len(documents)


def main():
    """Convert CDR_sample.xml into train_text.txt in the working directory."""
    convert(INPUT_XML, OUTPUT_TEXT)
    # TODO: write the matching "label" column (to be continued).


if __name__ == "__main__":
    main()
# python xml parsing example