python xml解析例子

來源:互聯網
上載者:User

標籤:

# -*- coding: utf-8 -*-
"""Parse CDR_sample.xml into the text format accepted by DNorm.

Created on Thu Apr 16 23:18:27 2015
@author: shifeng

For every "document" element found under the XML root, the script writes
to train_text.txt, in the order the children appear in the XML: the
document id followed by a tab, then the text of each passage (title and
abstract text are written back to back with no separator), and finally a
newline.  Offsets and annotation details are only echoed to stdout.

Writing the training-set "label" to the output is a stated goal of the
script but is not implemented yet (see the TODO near the end).

XML file: see the CSDN shared resources.
Reference blog: http://www.cnblogs.com/fnng/p/3581433.html
"""
# print() then produces the same space-separated output on Python 2 (which
# the original print statements targeted) and on Python 3.
from __future__ import print_function

import codecs
import StringIO
import xml
from lxml import etree
from xml.sax import *
from xml.sax.handler import *
from xml.etree import ElementTree as ET
import xml.dom.minidom


def _report_annotation(annotation):
    """Print the location, mention text and non-"type" infon of one annotation.

    Raises KeyError if a "location" child lacks the "offset"/"length"
    attributes or an "infon" child lacks the "key" attribute.
    """
    for field in annotation:
        tag = field.tag
        if tag == "location":
            print(field.attrib["offset"], field.attrib["length"])
        elif tag == "text":
            # The annotated mention (a disease name in the sample data,
            # going by the original variable name -- TODO confirm).
            print(field.text)
        elif tag == "infon" and field.attrib["key"] != "type":
            # Per the original author's note each annotation carries two
            # infon tags; only the one whose key is not "type" is reported.
            print(field.attrib["key"], field.text)


def _handle_passage(passage, out):
    """Echo one passage to stdout and append its text to the file *out*.

    Raises ValueError/TypeError if an "offset" child does not hold an
    integer.
    """
    for part in passage:
        tag = part.tag
        if tag == "offset":
            print("offset:", int(part.text))
        elif tag == "text":
            # Either the title text or the abstract text.
            text = part.text
            print(tag, "::", text)
            # An empty text element has .text == None; writing None would
            # raise TypeError, so there is simply nothing to append.
            if text is not None:
                out.write(text)
        elif tag == "annotation":
            print(10 * "*")
            _report_annotation(part)


# DOM parse of the same file.  `dom` and `root` are referenced only by the
# disabled "method 1" variant below; the active code uses the ElementTree
# parse further down.
dom = xml.dom.minidom.parse("CDR_sample.xml")
root = dom.documentElement
# print root.nodeName
# print root.nodeValue
# print root.nodeType
# print root.ELEMENT_NODE

# -----------
# Method 1 (not adopted), kept for reference:
# When the child element names are known, fetch them with
# getElementsByTagName.  The collection is the root node and has four
# elements; knowing their names, root.getElementsByTagName(i) retrieves
# each child element.
#
# colloction_ele = ["source", "date", "key", "document"]
# for i in colloction_ele:
#     print root.getElementsByTagName(i)[0].nodeName      # tag name
# #    print root.getElementsByTagName(i)[0].getAttribute
# # a document has three tags
# document_ele = ["id", "passage", "annotation"]
# documents = root.getElementsByTagName("document")
# # print len(documents)
# for i in documents:                 # for each document,
#     for j in document_ele:          # take each tag
#         print i.getElementsByTagName(j)[0].nodeName         # tag name
#         print i.getElementsByTagName(j)[0].firstChild.data  # data between the tags
#         if j == "annotation":
#             print i.getElementsByTagName(j)[0].getAttribute("id")   # tag attribute
# -----------

# The output is opened through codecs with an explicit UTF-8 encoding so
# that non-ASCII passage text can be written on both Python 2 and 3, and
# inside a `with` block so the handle is closed even if parsing fails.
with codecs.open("train_text.txt", "w", encoding="utf-8") as write_text:
    root_2 = ET.parse("CDR_sample.xml")
    documents = root_2.findall("./document")
    for per in documents:               # every document element
        for child in per:               # its children: id, passage, annotation
            child_tag = child.tag
            if child_tag == "id":
                text_id = child.text
                print(child_tag, ":", text_id)
                write_text.write(text_id + "\t")    # id followed by a tab
            elif child_tag == "passage":
                _handle_passage(child, write_text)
            # Document-level annotation children are not processed.
        write_text.write("\n")  # one output line per document
        print(30 * "*")

# TODO: "label" mapping still to be done.

# Disabled experiment, kept for reference.
# NOTE(review): it references UserDecodeHandler and BeautifulSoup, neither of
# which is defined or imported in this file, and calls setContentHandle
# (probably meant setContentHandler) -- confirm before reviving.
#
# doc = etree.parse("CDR_sample.xml")
# xml_string = etree.tostring(doc)
# root = etree.fromstring(xml_string)
# parser = make_parser()
# # MarkDecodeHandler
# # MarkDecodeHandler
# handler = UserDecodeHandler()
# parser.setContentHandle(handler)
# parser.parse(root)
# for item in handler.marks:
#     for j in item.items():
#         print i,j
# print type(doc)
# print type(root)
# # print doc.tag
# print root.tag
# # with codecs.open("CDR_sample.xml") as xml:
# #     text = xml.readlines()
# # s_xml = ""
# # for i in text:
# #     i=i.strip("\n")
# #     s_xml+=i
# # print s_xml
# # soup = BeautifulSoup(s_xml)
# # print soup.title
# # for i in text:
# #     print i

python xml解析例子

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.