XML. DOM
Demand
There is a table with a large amount of data, a daily update, its fields can be configured through an XML configuration file, that is, the table may be built each time the field is different.
Upstream run will be extracted from the source file according to the configuration, to the storage this step needs to be built according to the configuration table.
Solve
Write a simple XML configuration that requires fields and types
Upstream reading to the corresponding data
Warehousing This step, first delete the original table, according to the configuration to build a new table
XML file
<?xml version= "1.0" encoding= "UTF-8"?>
ID
Query
varchar ($)
false
Query
PV
Integer
false
PV
Avg_money
integer
false
Processing scripts
#!/usr/bin/python#-*-coding:utf-8-*-#author: Wklken#desc:use to read db XML config.#-----------------------# 2012-02-18 created#----------------------Import sys,osfrom xml.dom import minidom, Nodedef read_dbconfig_xml (xml_file _path): content = {} root = Minidom.parse (xml_file_path) Table = root.getelementsbytagname ("table") [0] #read dbname an d table name. table_name = Table.getattribute ("name") Db_name = Table.getattribute ("db_name") If Len (table_name) > 0 and Len (db_nam e) > 0:db_sql = "CREATE database if not EXISTS '" + db_name + "; Use "+ Db_name +"; " Table_drop_sql = "Drop" + table_name + "if exists" + table_name + ";" Content.update ({"Db_sql": Db_sql}) content.update ({"Table_sql": Table_drop_sql}) Else:print "Error:attribute" is Not define well! Db_name= "+ db_name +"; table_name= "+ table_name Sys.exit (1) #print table_name, db_name table_create_sql =" Create Table "+ table_name +" ("#read primary cell Primary_key = Table.getelementSbytagname ("Primary_key") [0] primary_key_name = primary_key.getelementsbytagname ("name") [0].childnodes[0]. NodeValue Table_create_sql + = Primary_key_name + "INTEGER not NULL auto_increment primary key," #print PRIMARY_KEY.TOXM L () #read ordernary field fields = Table.getelementsbytagname ("field") F_index = 0 for field in Fields:f_index + = 1 name = Field.getelementsbytagname ("name") [0].childnodes[0].nodevalue type = Field.getelementsbytagname ("type") [0]. Childnodes[0].nodevalue Table_create_sql + = name + "" + type if f_index! = Len (fields): Table_create_sql + = "," Is_index = Field.getelementsbytagname ("Is_index") [0].childnodes[0].nodevalue table_create_sql + =]; " Content.update ({"Table_create_sql": Table_create_sql}) #character set latin1 collate latin1_danish_ci; Print Contentif __name__ = = "__main__": Read_dbconfig_xml (Sys.argv[1])
Methods involved
Root = Minidom.parse (Xml_file_path) Get DOM Object
Root.getelementsbytagname ("table") gets a list of nodes based on tag
Table.getattribute ("name") Gets the property
Primary_key.getelementsbytagname ("name") [0].childnodes[0].nodevalue gets the value of the child node (ID get ID)
Sax
Demand
Read XML data file, large file, need real-time processing insert INTO database
XML document
100000
male
Beijing, Haidian District
437
1989
333
242
null
University
1
February 14
Processing
Sax processing does not read like a DOM in a node-like dimension, it only starts with the end tag of the tag content
The process of thinking is: Through a handler, the start tag, the content, the end tag each has a processing function
Code and annotations
Person Handling class
From Xml.sax import Handler,parsestringclass Personhandler (handler. ContentHandler): Def __init__ (self, db_ops): #db op obj self.db_ops = db_ops #存储一个person的map Self.person = {} #当前的tag self.current_tag = "" #是否是tag之间的内容, for the purpose of getting the contents of the tag, not affected by the blank self.in_quote = 0 #开始, Empty map def startelement (self, Name, attr): #以person, clear the map if name = = "Person": Self.person = {} #记录 status C12/>self.current_tag = name self.in_quote = 1 #结束, inserting database def endElement (self, name): #以person结尾 Represents the end of reading a person's information if name = = "Person": #do something in_fields = tuple ([' "' + self.person.get (i," ") + '" ') for I in Fields] print in_sql% in_fields db_ops.insert (in_sql% (in_fields)) #处理 self.in_quote = 0 def characters (self, content): #若是在tag之间的内容, update to map if self.in_quote: self.person.update ({ Self.current_tag:content})
Add the complete code for the library
#!/usr/bin/python#-*-coding:utf-8-*-#parse_person. Py#version:0.1#author:wukunliang@163.com#desc:parse Person.xml and Out Sqlimport sys,osimport mysqldbreload (SYS) sys.setdefaultencoding (' utf-8 ') In_sql = "Insert to Person (Id,sex,address,fansnum,summary,wbnum,gznum,blog,edu,work,renzh,brithday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,% S,%s,%s) "Fields = (" id "," Sex "," address "," fansnum "," Summary "," Wbnum "," Gznum "," blogging "," edu "," Work "," Renzh "," Brithday ") #数据库方法class db_connect:def __init__ (self, db_host, user, pwd, db_name, charset=" UTF8 ", Use_unicode = True): print "init begin" Print db_host, user, pwd, db_name, charset, Use_unicode self.conn = Mysqldb.connection (Db_host, User, pwd, db_name, Charset=charset, use_unicode=use_unicode) print "init end" def insert (Self, SQL): Try:n = Self.conn.cursor (). Execute (SQL) return n except mysqldb.warning, e:print "Error:execute sql", SQL, "' FA Iled "Def Close (self): Self.conn.close () #personClass from Xml.sax import Handler,parsestringclass Personhandler (handler. ContentHandler): Def __init__ (self, Db_ops): #db op obj self.db_ops = db_ops #存储一个person的map Self.person = {} #当前的tag Self.current_tag = "" #是否是tag之间的内容 self.in_quote = 0 #开始, clear the map def startelement (self, Name, attr): #以person, clear the map if name = = "Person": Self.person = {} #记录 status Self.current_tag = Name Self.in_quote = 1 #结束, insert Database def endElement (self, name): #以person结尾 represents the end of reading a person's information if name = = "Person": #do something In_ Fields = Tuple ([(' "' + self.person.get (i," ") + ' ') ') print In_sql% in_fields Db_ops.insert ( in_sql% (in_fields)) #处理 self.in_quote = 0 def characters (self, content): #若是在tag之间的内容, update to map if Self.in_quot E:self.person.update ({self.current_tag:content}) if __name__ = = "__main__": F = Open ("./person.xml") #如果源文件gbk transcoding If utf-8, remove decode.encode db_ops = Db_connect ("127.0.0.1", "root", "root", "Test ") parsestring (F.read (). Decode (" GBK "). Encode (" Utf-8 "), Personhandler (Db_ops)) F.close () Db_ops.close ()
Usually take the python to analyze the data, tool scripts and Hadoop streamming, but with the face and depth of the lack can only say that the line is shallow, need a lot of practice