Detailed explanation of how Python processes XML-format data
This article describes how Python processes XML format data. We will share this with you for your reference. The details are as follows:
The operation here is based on the Python3 platform.
When using Python to process XML, the first thing we run into is encoding problems.
Python's XML parser does not support gb2312, so parsing an XML file whose declaration says encoding="gb2312" raises an error. Reading the file in Python may also raise an exception if the wrong encoding is assumed; in that case, you need to specify the encoding when opening the file. A further complication is the Chinese characters contained in the XML nodes.
Here, the processing is relatively simple. You only need to modify the XML encoding header.
#!/usr/bin/env python
import os, sys
import re


def replaceXmlEncoding(filepath, oldEncoding='gb2312', newEncoding='utf-8'):
    """Convert an XML file from ``oldEncoding`` to ``newEncoding`` in place.

    The encoding name in the XML declaration (``<?xml ... encoding="..."?>``)
    is rewritten and the file's bytes are transcoded to match, so the label
    and the actual content stay consistent.

    Only the declaration is touched: other occurrences of the encoding name
    in the document body are left alone. The name is matched literally and
    case-insensitively. If the file has no declaration naming
    ``oldEncoding`` (for example because it was already converted), the file
    is left untouched.

    Args:
        filepath: Path of the XML file to rewrite.
        oldEncoding: Encoding name currently declared in the file.
        newEncoding: Encoding name to declare and to re-encode the file in.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file content is not valid ``oldEncoding``.
        UnicodeEncodeError: If the content cannot be represented in
            ``newEncoding``.
    """
    # Work on raw bytes so the platform's default text encoding never
    # interferes with reading or writing.
    with open(filepath, mode='rb') as f:
        raw = f.read()

    declared = re.compile(
        rb'(<\?xml[^>]*?encoding\s*=\s*["\'])'
        + re.escape(oldEncoding.encode('ascii'))
        + rb'(["\'])',
        re.IGNORECASE,
    )
    new_name = newEncoding.encode('ascii')
    # A function replacement avoids backslash/group interpretation of the
    # new encoding name.
    patched, count = declared.subn(
        lambda m: m.group(1) + new_name + m.group(2), raw, count=1)
    if not count:
        return

    # Transcode before reopening the file for writing, so a decode failure
    # cannot truncate the original.
    converted = patched.decode(oldEncoding).encode(newEncoding)
    with open(filepath, mode='wb') as f:
        f.write(converted)


if __name__ == "__main__":
    replaceXmlEncoding('./ActivateAccount.xml')
Next, use xml.etree.ElementTree to operate on the XML file.
Defining the __call__ method in a class makes instances of that class callable; see, for example, the last few lines of the following code, in the __main__ block. This also highlights the fact that in the Python world, everything is an object, including the object itself :)
The __main__ block is very useful for testing.
#!/usr/bin/env python
import os, re
import xml.etree.ElementTree as etree

# Text file searched for <String id="...">...</String> entries.
Locale_Path = "./locale.txt"


class xmlExtractor(object):
    """Callable that summarises one XML resource file as a dict."""

    def __init__(self):
        pass

    def __call__(self, filepath):
        """Extract summary information from the XML file at ``filepath``.

        Args:
            filepath: Path of the XML file to inspect.

        Returns:
            A dict with these keys:

            * ``'Line'``: number of lines in the file.
            * ``'Title'``: the ``ID`` attribute of the first ``ResItem``
              child of the document root.
            * ``'ResItemCount'``: number of ``ResItem`` elements directly
              under that element, plus one for the element itself.
            * ``'ChineseTip'``: text of the ``<String id="...">`` entry in
              ``Locale_Path`` whose id equals the element's ``Caption``
              value (with a leading ``~`` stripped), or the string
              ``'None'`` when there is no usable caption or no matching
              entry.

        Raises:
            AttributeError: If the document root has no ``ResItem`` child.
            xml.etree.ElementTree.ParseError: If the file is not
                well-formed XML.
            OSError: If ``filepath`` cannot be read, or ``Locale_Path``
                cannot be read when a caption has to be looked up.
        """
        retDict = {}

        # Count lines on the raw bytes so the result does not depend on the
        # platform's default text encoding.
        with open(filepath, 'rb') as f:
            retDict['Line'] = sum(1 for _ in f)

        # Passing the path lets the parser honour the encoding declared in
        # the XML declaration.
        tree = etree.parse(filepath)
        root = tree.find("ResItem")
        retDict['Title'] = root.get("ID")
        retDict['ResItemCount'] = len(root.findall("ResItem")) + 1
        retDict['ChineseTip'] = 'None'

        chs = None  # locale file content, loaded on first use only
        for child in root:
            if child.get('Name') != "Caption":
                continue
            value = child.get('Value', '')
            if len(value) <= 1:
                continue
            title = value[1:] if value[0] == '~' else value

            if chs is None:
                with open(Locale_Path) as locale_file:
                    chs = locale_file.read()

            # Escape the title so regex metacharacters in it match literally.
            pattern = '<String id="' + re.escape(title) + '">[^>]+>'
            m = re.search(pattern, chs)
            if m is not None:
                # Strip the surrounding tags, leaving only the entry's text.
                retDict['ChineseTip'] = re.sub('<[^>]+>', '', m.group(0))

        return retDict


if __name__ == "__main__":
    fo = xmlExtractor()
    d = fo('./ActivateAccount.xml')
    print(d)
Finally, here is the entry file. It imports the two files above, and uses xml.dom and os.listdir to recursively process the XML files and generate a result set.
I always think that the Python UnboundLocalError is quite interesting. I don't know if it is a symbol table overwrite problem.
#!/usr/bin/env python
import os
from xmlExtractor import *
from replaceXmlEncoding import *
from xml.dom import minidom, Node

# Result document and running totals shared by every my_dir() call.
doc = minidom.Document()
extractor = xmlExtractor()
totalLines = 0
totalResItemCnt = 0
totalXmlFileCnt = 0
totalErrorCnt = 0
errorFileList = []
xmlRoot = doc.createElement("XmlResourceFile")
doc.appendChild(xmlRoot)


def _appendValueNode(parent, tag, value):
    """Append a ``<tag value="...">`` element to ``parent``."""
    node = doc.createElement(tag)
    node.setAttribute('value', value)
    parent.appendChild(node)


# NOTE(review): the function name is garbled in the source ("my-dir" at the
# definition, "my1_dir" at the recursive call); "my_dir" is a reconstruction.
def my_dir(level, path):
    """Recursively process every XML file under ``path``.

    For each directory entry whose name ends in ``xml`` the file is first
    converted with ``replaceXmlEncoding`` and then summarised with the
    module-level ``extractor``; the summary is appended to ``xmlRoot`` as an
    ``XmlFile`` element. A failure on one file is counted in
    ``totalErrorCnt``, recorded in ``errorFileList`` and printed, and the
    walk continues. Sub-directories are processed recursively.

    Args:
        level: Recursion depth of ``path`` (0 for the starting directory).
        path: Directory to scan.
    """
    global doc, extractor, totalLines, totalResItemCnt, totalXmlFileCnt
    global totalErrorCnt, errorFileList
    global xmlRoot
    for i in os.listdir(path):
        fullPath = os.path.join(path, i)
        if i[-3:] == 'xml':
            totalXmlFileCnt += 1
            try:
                # First convert the XML encoding from gb2312 to UTF-8.
                replaceXmlEncoding(fullPath)
                # Extract the information required from the XML document.
                info = extractor(fullPath)
                # Only reached when the two calls above raised nothing:
                # create the result node for this file.
                xmlNode = doc.createElement("XmlFile")
                xmlRoot.appendChild(xmlNode)
                _appendValueNode(xmlNode, "Filename", i)
                # NOTE(review): the hard-coded 34 presumably strips the
                # author's own base-directory prefix -- confirm/adjust.
                _appendValueNode(xmlNode, "Filepath", path[34:])
                _appendValueNode(xmlNode, "Title", str(info['Title']))
                _appendValueNode(xmlNode, "ChineseTip",
                                 str(info['ChineseTip']))
                _appendValueNode(xmlNode, "ResItemCount",
                                 str(info['ResItemCount']))
                _appendValueNode(xmlNode, "LineCount", str(info['Line']))
                _appendValueNode(xmlNode, "Description", '')
            except Exception as errorDetail:
                totalErrorCnt += 1
                errorFileList.append(fullPath)
                print(fullPath, errorDetail)
        if os.path.isdir(fullPath):
            my_dir(level + 1, fullPath)


if __name__ == "__main__":
    path = os.path.join(os.getcwd(), 'themes')
    my_dir(0, path)
    print(totalXmlFileCnt, totalErrorCnt)
    # toprettyxml() emits no encoding declaration, so readers will assume
    # UTF-8; write the file as UTF-8 to match.
    with open("./xmlResourceList.xml", "w", encoding="utf-8") as resultXml:
        resultXml.write(doc.toprettyxml(indent=""))
PS: Here are some online tools for xml operations for your reference:
Online XML/JSON conversion tools:
Http://tools.jb51.net/code/xmljson
Online formatting XML/online compression XML:
Http://tools.jb51.net/code/xmlformat
Online XML compression/formatting tools:
Http://tools.jb51.net/code/xml_format_compress
Online XML code formatting and beautification tools:
Http://tools.jb51.net/code/xmlcodeformat