This article mainly introduces how to process XML files using Python, including how to process XML large files using Python. For more information, see the following requirements.
There is a table with a large amount of data, which is updated every day. its fields can be configured through the xml configuration file, that is, the fields of each table may be different.
During the upstream running, the source file is extracted according to the configuration. to import the data to the database, you need to create a table based on the configuration.
Solution
A simple xml file is written, and fields and types are required for configuration.
Upstream reads the corresponding data
In this step, delete the original table and create a new table based on the configuration.
XML file
<? Xml version = "1.0" encoding = "UTF-8"?>
Id
Query
Varchar (200)
False
Query
Pv
Integer
False
Pv
Avg_money
Integer
False
Process scripts
#!/usr/bin/python# -*- coding:utf-8 -*-#author: wklken#desc: use to read db xml config.#-----------------------#2012-02-18 created#----------------------import sys,osfrom xml.dom import minidom, Nodedef read_dbconfig_xml(xml_file_path): content = {} root = minidom.parse(xml_file_path) table = root.getElementsByTagName("table")[0] #read dbname and table name. table_name = table.getAttribute("name") db_name = table.getAttribute("db_name") if len(table_name) > 0 and len(db_name) > 0: db_sql = "create database if not exists `" + db_name +"`; use " + db_name + ";" table_drop_sql = "drop " + table_name + " if exists " + table_name + ";" content.update({"db_sql" : db_sql}) content.update({"table_sql" : table_drop_sql }) else: print "Error:attribute is not define well! db_name=" + db_name + " ;table_name=" + table_name sys.exit(1) #print table_name, db_name table_create_sql = "create table " + table_name +"(" #read primary cell primary_key = table.getElementsByTagName("primary_key")[0] primary_key_name = primary_key.getElementsByTagName("name")[0].childNodes[0].nodeValue table_create_sql += primary_key_name + " INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY," #print primary_key.toxml() #read ordernary field fields = table.getElementsByTagName("field") f_index = 0 for field in fields: f_index += 1 name = field.getElementsByTagName("name")[0].childNodes[0].nodeValue type = field.getElementsByTagName("type")[0].childNodes[0].nodeValue table_create_sql += name + " " + type if f_index != len(fields): table_create_sql += "," is_index = field.getElementsByTagName("is_index")[0].childNodes[0].nodeValue table_create_sql += ");" content.update({"table_create_sql" : table_create_sql}) #character set latin1 collate latin1_danish_ci; print contentif __name__ == "__main__":read_dbconfig_xml(sys.argv[1])
Parse XML large files using PYTHON [SAX]
Requirement
Reads xml data files. the files are large and need to be inserted to the database in real time.
Xml document
100000
Male
Beijing, Haidian District
437
1989
333
242
Null
University
1
February 14
Processing
When processing a sax file, it does not read the file in a node-like dimension like dom. it can only end the tag with the start tag content.
The processing idea is: through a handler, there is a processing function for each start tag, content, and end tag
Code and annotation
Person processing class
From xml. sax import handler, parseStringclass PersonHandler (handler. contentHandler): def _ init _ (self, db_ops): # db op obj self. db_ops = db_ops # store the map self of a person. person = {}# current tag self. current_tag = "" # whether it is the content between tags. the content between tags is obtained for the purpose, and self is not disturbed by blank spaces. in_quote = 0 # Start, clear map def startElement (self, name, attr): # clear map if name = "person": self. person = {}# record status self. current_tag = name self. in_quote = 1 # end, insert database def endElement (self, name): # end with person indicates reading information of a person. if name = "person ": # do something in_fields = tuple ([('"' + self. person. get (I, "") + '"') for I in fields]) print in_ SQL % in_fields db_ops.insert (in_ SQL % (in_fields) # process self. in_quote = 0 def characters (self, content): # if the content is between tags, update it to the map if self. in_quote: self. person. update ({self. current_tag: content })
Complete Code for warehouse receiving
#! /Usr/bin/python #-*-coding: UTF-8-*-# parse_person.py # version: 0.1 # author: wukunliang@163.com # desc: parse person. xml and out sqlimport sys, osimport MySQLdbreload (sys) sys. setdefaultencoding ('utf-8') in_ SQL = "insert into person (id, sex, address, fansNum, summary, wbNum, gzNum, blog, edu, work, renZh, brithday) values (% s, % s) "fields = (" id "," sex "," address "," fansNum "," summary "," wbNum "," gzNum "," blog "," edu ", "work", "renZh", "brithday") # Database Method class Db_Connect: def _ init _ (self, db_host, user, pwd, db_name, charset = "utf8", use_unicode = True): print "init begin" print db_host, user, pwd, db_name, charset, use_unicode self. conn = MySQLdb. connection (db_host, user, pwd, db_name, charset = charset, use_unicode = use_unicode) print "init end" def insert (self, SQL): try: n = self.conn.cursor(cmd.exe cute (SQL) return n limit T MySQLdb. warning, e: print "Error: execute SQL '", SQL, "'failed' def close (self): self. conn. close () # person processing class from xml. sax import handler, parseStringclass PersonHandler (handler. contentHandler): def _ init _ (self, db_ops): # db op obj self. db_ops = db_ops # store the map self of a person. person = {}# current tag self. current_tag = "" # whether the content between tags is self. in_quote = 0 # Start, clear map def startElement (self, name, attr): # clear map if name = "person": self. person = {}# record status self. current_tag = name self. in_quote = 1 # end, insert database def endElement (self, name): # end with person indicates reading information of a person. if name = "person ": # do something in_fields = tuple ([('"' + self. person. get (I, "") + '"') for I in fields]) print in_ SQL % in_fields db_ops.insert (in_ SQL % (in_fields) # process self. in_quote = 0 def characters (self, content): # if the content is between tags, update it to the map if self. in_quote: self. person. update ({self. current_tag: content}) if _ name _ = "_ main _": f = open (". /person. xml ") # if the source file gbk is transcoded as UTF-8, remove decode. encode db_ops = Db_Connect ("127.0.0.1", "root", "root", "test") parseString (f. read (). decode ("gbk "). encode ("UTF-8"), PersonHandler (db_ops) f. close () db_ops.close ()