C#: Batch insertion of big data into several databases
Previously, we only knew that SQL Server supports batch data insertion; we did not know that Oracle, SQLite, and MySql support it as well, although Oracle requires the Oracle.DataAccess driver. This post presents a batch insertion solution for each of these databases.
First, IProvider exposes a pluggable service interface for batch insertion, IBatcherProvider, which was mentioned in the previous article.
/// <summary>
/// Provides a method for batch data processing.
/// </summary>
public interface IBatcherProvider : IProviderService
{
    /// <summary>
    /// Inserts the data of a <see cref="DataTable"/> into the database in batches.
    /// </summary>
    /// <param name="dataTable">The <see cref="DataTable"/> to be inserted in batches.</param>
    /// <param name="batchSize">The number of rows written in each batch.</param>
    void Insert(DataTable dataTable, int batchSize = 10000);
}
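Before looking at the individual providers, the sketch below shows roughly how a caller uses this interface. It assumes an IDatabase instance obtained from the framework; the GetService call mirrors the test case at the end of this article.

// A minimal usage sketch. The IDatabase instance and the populated DataTable
// are assumed to come from elsewhere, as in the test case further below.
private static void BulkInsert(IDatabase database, DataTable dataTable)
{
    var batcher = database.Provider.GetService<IBatcherProvider>();
    if (batcher == null)
    {
        Console.WriteLine("Batch insertion is not supported by this provider.");
        return;
    }
    batcher.Insert(dataTable);    // uses the default batch size of 10000
}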
I. Batch insert SQL Server data
Batch insertion for SQL Server is very simple: just use SqlBulkCopy. The following is the implementation of the class:
/// <summary>
/// Provides batch operations for System.Data.SqlClient.
/// </summary>
public sealed class MsSqlBatcher : IBatcherProvider
{
    /// <summary>
    /// Gets or sets the context of the provider service.
    /// </summary>
    public ServiceContext ServiceContext { get; set; }

    /// <summary>
    /// Inserts the data of a <see cref="DataTable"/> into the database in batches.
    /// </summary>
    /// <param name="dataTable">The <see cref="DataTable"/> to be inserted in batches.</param>
    /// <param name="batchSize">The number of rows written in each batch.</param>
    public void Insert(DataTable dataTable, int batchSize = 10000)
    {
        Checker.ArgumentNull(dataTable, "dataTable");
        if (dataTable.Rows.Count == 0)
        {
            return;
        }
        using (var connection = (SqlConnection)ServiceContext.Database.CreateConnection())
        {
            try
            {
                connection.TryOpen();
                // Quote the table name with the delimiters of the current syntax provider
                var tableName = DbUtility.FormatByQuote(ServiceContext.Database.Provider.GetService<ISyntaxProvider>(), dataTable.TableName);
                using (var bulk = new SqlBulkCopy(connection, SqlBulkCopyOptions.KeepIdentity, null)
                    {
                        DestinationTableName = tableName,
                        BatchSize = batchSize
                    })
                {
                    // Loop over all non-identity columns and add a mapping for the bulk copy
                    dataTable.EachColumn(c => bulk.ColumnMappings.Add(c.ColumnName, c.ColumnName), c => !c.AutoIncrement);
                    bulk.WriteToServer(dataTable);
                    bulk.Close();
                }
            }
            catch (Exception exp)
            {
                throw new BatcherException(exp);
            }
            finally
            {
                connection.TryClose();
            }
        }
    }
}
The column names in SqlBulkCopy's ColumnMappings are case-sensitive, so when constructing the DataTable make sure its column names match those of the destination table exactly.
The code above does not use a transaction, since a transaction has some impact on performance; if you want each batch written inside its own transaction, specify SqlBulkCopyOptions.UseInternalTransaction.
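For reference, the same idea without the framework plumbing looks roughly like the sketch below. It uses only System.Data.SqlClient; the connection string and the destination table name are placeholders, not part of the framework above.

using System.Data;
using System.Data.SqlClient;

static class BulkCopyDemo
{
    // Minimal sketch: write a DataTable to SQL Server with SqlBulkCopy.
    // The connection string and destination table name are assumptions.
    public static void Write(DataTable table)
    {
        using (var connection = new SqlConnection("Server=.;Database=Test;Integrated Security=true"))
        {
            connection.Open();
            // UseInternalTransaction makes each batch of BatchSize rows run in its own transaction.
            var options = SqlBulkCopyOptions.KeepIdentity | SqlBulkCopyOptions.UseInternalTransaction;
            using (var bulk = new SqlBulkCopy(connection, options, null)
                {
                    DestinationTableName = "Batcher",
                    BatchSize = 10000
                })
            {
                foreach (DataColumn column in table.Columns)
                {
                    // Mappings are case-sensitive: source and destination names must match exactly.
                    bulk.ColumnMappings.Add(column.ColumnName, column.ColumnName);
                }
                bulk.WriteToServer(table);
            }
        }
    }
}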
II. Batch insert Oracle data
System.Data.OracleClient does not support batch insertion, so the Oracle.DataAccess component has to be used as the provider instead.
/// <summary>
/// Provides batch operations for the Oracle.DataAccess component.
/// </summary>
public sealed class OracleAccessBatcher : IBatcherProvider
{
    /// <summary>
    /// Gets or sets the context of the provider service.
    /// </summary>
    public ServiceContext ServiceContext { get; set; }

    /// <summary>
    /// Inserts the data of a <see cref="DataTable"/> into the database in batches.
    /// </summary>
    /// <param name="dataTable">The <see cref="DataTable"/> to be inserted in batches.</param>
    /// <param name="batchSize">The number of rows written in each batch.</param>
    public void Insert(DataTable dataTable, int batchSize = 10000)
    {
        Checker.ArgumentNull(dataTable, "dataTable");
        if (dataTable.Rows.Count == 0)
        {
            return;
        }
        using (var connection = ServiceContext.Database.CreateConnection())
        {
            try
            {
                connection.TryOpen();
                using (var command = ServiceContext.Database.Provider.DbProviderFactory.CreateCommand())
                {
                    if (command == null)
                    {
                        throw new BatcherException(new ArgumentException("command"));
                    }
                    command.Connection = connection;
                    command.CommandText = GenerateInsertSql(ServiceContext.Database, command, dataTable);
                    command.ExecuteNonQuery();
                }
            }
            catch (Exception exp)
            {
                throw new BatcherException(exp);
            }
            finally
            {
                connection.TryClose();
            }
        }
    }

    /// <summary>
    /// Generates the SQL statement for inserting the data.
    /// </summary>
    /// <param name="database"></param>
    /// <param name="command"></param>
    /// <param name="table"></param>
    /// <returns></returns>
    private string GenerateInsertSql(IDatabase database, DbCommand command, DataTable table)
    {
        var names = new StringBuilder();
        var values = new StringBuilder();
        // Convert the DataTable into an array of column arrays (object[][])
        var data = table.ToArray();
        // Set the ArrayBindCount property of the OracleCommand via reflection
        command.GetType().GetProperty("ArrayBindCount").SetValue(command, table.Rows.Count, null);
        var syntax = database.Provider.GetService<ISyntaxProvider>();
        for (var i = 0; i < table.Columns.Count; i++)
        {
            var column = table.Columns[i];
            var parameter = database.Provider.DbProviderFactory.CreateParameter();
            if (parameter == null)
            {
                continue;
            }
            parameter.ParameterName = column.ColumnName;
            parameter.Direction = ParameterDirection.Input;
            parameter.DbType = column.DataType.GetDbType();
            // Each parameter value is the array of this column's values across all rows
            parameter.Value = data[i];
            if (names.Length > 0)
            {
                names.Append(",");
                values.Append(",");
            }
            names.AppendFormat("{0}", DbUtility.FormatByQuote(syntax, column.ColumnName));
            values.AppendFormat("{0}{1}", syntax.ParameterPrefix, column.ColumnName);
            command.Parameters.Add(parameter);
        }
        return string.Format("INSERT INTO {0}({1}) VALUES ({2})", DbUtility.FormatByQuote(syntax, table.TableName), names, values);
    }
}
The most important step above is converting the DataTable into an array representation, that is, an object[][], where the first index is the column and the second index is the row. When looping over the columns, each column's array becomes the value of the corresponding parameter, so every parameter value is an array, and ArrayBindCount tells Oracle how many rows those arrays contain. Apart from that, the INSERT statement is no different from an ordinary single-row insert.
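Stripped of the framework helpers, Oracle array binding looks roughly like the sketch below. It uses Oracle.DataAccess.Client directly; the connection string, table, and columns are placeholders.

using Oracle.DataAccess.Client;

static class OracleArrayBindDemo
{
    // Minimal sketch: insert three rows in a single round trip via array binding.
    // The connection string and the BATCHER table are assumptions.
    public static void Write()
    {
        var ids = new int[] { 1, 2, 3 };
        var names = new string[] { "a", "b", "c" };
        using (var connection = new OracleConnection("User Id=test;Password=test;Data Source=ORCL"))
        using (var command = connection.CreateCommand())
        {
            connection.Open();
            command.CommandText = "INSERT INTO BATCHER(ID, NAME) VALUES (:ID, :NAME)";
            command.ArrayBindCount = ids.Length;    // number of rows bound to each parameter
            command.Parameters.Add(new OracleParameter("ID", OracleDbType.Int32) { Value = ids });
            command.Parameters.Add(new OracleParameter("NAME", OracleDbType.Varchar2) { Value = names });
            command.ExecuteNonQuery();              // the statement is executed once per bound row
        }
    }
}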
III. Batch insert SQLite data
For SQLite, all that is needed is to wrap the inserts in a single transaction. The speed-up comes largely from the fact that, without an explicit transaction, SQLite commits and flushes to disk after every single INSERT, whereas inside one transaction the whole batch is committed once.
public sealed class SQLiteBatcher : IBatcherProvider
{
    /// <summary>
    /// Gets or sets the context of the provider service.
    /// </summary>
    public ServiceContext ServiceContext { get; set; }

    /// <summary>
    /// Inserts the data of a <see cref="DataTable"/> into the database in batches.
    /// </summary>
    /// <param name="dataTable">The <see cref="DataTable"/> to be inserted in batches.</param>
    /// <param name="batchSize">The number of rows written in each batch.</param>
    public void Insert(DataTable dataTable, int batchSize = 10000)
    {
        Checker.ArgumentNull(dataTable, "dataTable");
        if (dataTable.Rows.Count == 0)
        {
            return;
        }
        using (var connection = ServiceContext.Database.CreateConnection())
        {
            DbTransaction transaction = null;
            try
            {
                connection.TryOpen();
                transaction = connection.BeginTransaction();
                using (var command = ServiceContext.Database.Provider.DbProviderFactory.CreateCommand())
                {
                    if (command == null)
                    {
                        throw new BatcherException(new ArgumentException("command"));
                    }
                    command.Connection = connection;
                    command.CommandText = GenerateInsertSql(ServiceContext.Database, dataTable);
                    if (command.CommandText == string.Empty)
                    {
                        return;
                    }
                    var flag = new AssertFlag();
                    dataTable.EachRow(row =>
                        {
                            var first = flag.AssertTrue();
                            ProcessCommandParameters(dataTable, command, row, first);
                            command.ExecuteNonQuery();
                        });
                }
                transaction.Commit();
            }
            catch (Exception exp)
            {
                if (transaction != null)
                {
                    transaction.Rollback();
                }
                throw new BatcherException(exp);
            }
            finally
            {
                connection.TryClose();
            }
        }
    }

    private void ProcessCommandParameters(DataTable dataTable, DbCommand command, DataRow row, bool first)
    {
        for (var c = 0; c < dataTable.Columns.Count; c++)
        {
            DbParameter parameter;
            // Create the parameters on the first row, then reuse them for the remaining rows
            if (first)
            {
                parameter = ServiceContext.Database.Provider.DbProviderFactory.CreateParameter();
                parameter.ParameterName = dataTable.Columns[c].ColumnName;
                command.Parameters.Add(parameter);
            }
            else
            {
                parameter = command.Parameters[c];
            }
            parameter.Value = row[c];
        }
    }

    /// <summary>
    /// Generates the SQL statement for inserting the data.
    /// </summary>
    /// <param name="database"></param>
    /// <param name="table"></param>
    /// <returns></returns>
    private string GenerateInsertSql(IDatabase database, DataTable table)
    {
        var syntax = database.Provider.GetService<ISyntaxProvider>();
        var names = new StringBuilder();
        var values = new StringBuilder();
        var flag = new AssertFlag();
        table.EachColumn(column =>
            {
                if (!flag.AssertTrue())
                {
                    names.Append(",");
                    values.Append(",");
                }
                names.Append(DbUtility.FormatByQuote(syntax, column.ColumnName));
                values.AppendFormat("{0}{1}", syntax.ParameterPrefix, column.ColumnName);
            });
        return string.Format("INSERT INTO {0}({1}) VALUES ({2})", DbUtility.FormatByQuote(syntax, table.TableName), names, values);
    }
}
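The same pattern without the framework helpers is sketched below using System.Data.SQLite directly; the database file, table, and columns are placeholders.

using System.Data;
using System.Data.SQLite;

static class SqliteBatchDemo
{
    // Minimal sketch: reuse one parameterized command inside a single transaction.
    // The connection string and the Batcher table are assumptions.
    public static void Write(DataTable table)
    {
        using (var connection = new SQLiteConnection("Data Source=test.db"))
        {
            connection.Open();
            using (var transaction = connection.BeginTransaction())
            using (var command = connection.CreateCommand())
            {
                command.Transaction = transaction;
                command.CommandText = "INSERT INTO Batcher(Id, Name1) VALUES (@Id, @Name1)";
                command.Parameters.Add(new SQLiteParameter("@Id"));
                command.Parameters.Add(new SQLiteParameter("@Name1"));
                foreach (DataRow row in table.Rows)
                {
                    command.Parameters["@Id"].Value = row["Id"];
                    command.Parameters["@Name1"].Value = row["Name1"];
                    command.ExecuteNonQuery();      // runs inside the open transaction
                }
                transaction.Commit();               // one commit instead of one per row
            }
        }
    }
}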
IV. Batch insert MySql data
/// <summary>
/// Provides batch operations for the MySql.Data component.
/// </summary>
public sealed class MySqlBatcher : IBatcherProvider
{
    /// <summary>
    /// Gets or sets the context of the provider service.
    /// </summary>
    public ServiceContext ServiceContext { get; set; }

    /// <summary>
    /// Inserts the data of a <see cref="DataTable"/> into the database in batches.
    /// </summary>
    /// <param name="dataTable">The <see cref="DataTable"/> to be inserted in batches.</param>
    /// <param name="batchSize">The number of rows written in each batch.</param>
    public void Insert(DataTable dataTable, int batchSize = 10000)
    {
        Checker.ArgumentNull(dataTable, "dataTable");
        if (dataTable.Rows.Count == 0)
        {
            return;
        }
        using (var connection = ServiceContext.Database.CreateConnection())
        {
            try
            {
                connection.TryOpen();
                using (var command = ServiceContext.Database.Provider.DbProviderFactory.CreateCommand())
                {
                    if (command == null)
                    {
                        throw new BatcherException(new ArgumentException("command"));
                    }
                    command.Connection = connection;
                    command.CommandText = GenerateInsertSql(ServiceContext.Database, command, dataTable);
                    if (command.CommandText == string.Empty)
                    {
                        return;
                    }
                    command.ExecuteNonQuery();
                }
            }
            catch (Exception exp)
            {
                throw new BatcherException(exp);
            }
            finally
            {
                connection.TryClose();
            }
        }
    }

    /// <summary>
    /// Generates the SQL statement for inserting the data.
    /// </summary>
    /// <param name="database"></param>
    /// <param name="command"></param>
    /// <param name="table"></param>
    /// <returns></returns>
    private string GenerateInsertSql(IDatabase database, DbCommand command, DataTable table)
    {
        var names = new StringBuilder();
        var values = new StringBuilder();
        var types = new List<DbType>();
        var count = table.Columns.Count;
        var syntax = database.Provider.GetService<ISyntaxProvider>();
        table.EachColumn(c =>
            {
                if (names.Length > 0)
                {
                    names.Append(",");
                }
                names.AppendFormat("{0}", DbUtility.FormatByQuote(syntax, c.ColumnName));
                types.Add(c.DataType.GetDbType());
            });
        var i = 0;
        foreach (DataRow row in table.Rows)
        {
            if (i > 0)
            {
                values.Append(",");
            }
            values.Append("(");
            for (var j = 0; j < count; j++)
            {
                if (j > 0)
                {
                    values.Append(",");
                }
                var isStrType = IsStringType(types[j]);
                var parameter = CreateParameter(database.Provider, isStrType, types[j], row[j], syntax.ParameterPrefix, i, j);
                if (parameter != null)
                {
                    values.Append(parameter.ParameterName);
                    command.Parameters.Add(parameter);
                }
                else if (isStrType)
                {
                    values.AppendFormat("'{0}'", row[j]);
                }
                else
                {
                    values.Append(row[j]);
                }
            }
            values.Append(")");
            i++;
        }
        return string.Format("INSERT INTO {0}({1}) VALUES {2}", DbUtility.FormatByQuote(syntax, table.TableName), names, values);
    }

    /// <summary>
    /// Determines whether the type is a string type.
    /// </summary>
    /// <param name="dbType"></param>
    /// <returns></returns>
    private bool IsStringType(DbType dbType)
    {
        return dbType == DbType.AnsiString || dbType == DbType.AnsiStringFixedLength || dbType == DbType.String || dbType == DbType.StringFixedLength;
    }

    /// <summary>
    /// Creates a parameter.
    /// </summary>
    /// <param name="provider"></param>
    /// <param name="isStrType"></param>
    /// <param name="dbType"></param>
    /// <param name="value"></param>
    /// <param name="parPrefix"></param>
    /// <param name="row"></param>
    /// <param name="col"></param>
    /// <returns></returns>
    private DbParameter CreateParameter(IProvider provider, bool isStrType, DbType dbType, object value, char parPrefix, int row, int col)
    {
        // Generating a parameter for every value would be very slow, so parameters are
        // only created when the value is a string containing a single quote, or a date.
        if ((isStrType && value.ToString().IndexOf('\'') != -1) || dbType == DbType.DateTime)
        {
            var name = string.Format("{0}p_{1}_{2}", parPrefix, row, col);
            var parameter = provider.DbProviderFactory.CreateParameter();
            parameter.ParameterName = name;
            parameter.Direction = ParameterDirection.Input;
            parameter.DbType = dbType;
            parameter.Value = value;
            return parameter;
        }
        return null;
    }
}
MySql batch insertion puts all of the rows into the VALUES clause of a single statement, for example: INSERT INTO Batcher(Id, Name) VALUES (1, '1'), (2, '2'), (3, '3'), ..., (10, '10').
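If you are not using the framework at all, the same multi-row VALUES statement can be issued directly with the MySql.Data connector, roughly as sketched below; the connection string and the Batcher table are placeholders.

using System.Data;
using System.Text;
using MySql.Data.MySqlClient;

static class MySqlBatchDemo
{
    // Minimal sketch: build one INSERT ... VALUES (...),(...),... statement with parameters.
    // The connection string and the Batcher table are assumptions.
    public static void Write(DataTable table)
    {
        var sql = new StringBuilder("INSERT INTO Batcher(Id, Name1) VALUES ");
        using (var connection = new MySqlConnection("server=localhost;database=test;uid=root;pwd=root"))
        using (var command = connection.CreateCommand())
        {
            for (var i = 0; i < table.Rows.Count; i++)
            {
                if (i > 0)
                {
                    sql.Append(",");
                }
                sql.AppendFormat("(@Id{0}, @Name{0})", i);
                command.Parameters.AddWithValue("@Id" + i, table.Rows[i]["Id"]);
                command.Parameters.AddWithValue("@Name" + i, table.Rows[i]["Name1"]);
            }
            command.CommandText = sql.ToString();
            connection.Open();
            command.ExecuteNonQuery();              // inserts all rows with a single statement
        }
    }
}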
V. Test
Next, let's write a test case to see how batch insertion performs.
[Test]
public void TestBatchInsert()
{
    Console.WriteLine(TimeWatcher.Watch(() =>
        InvokeTest(database =>
            {
                var table = new DataTable("Batcher");
                table.Columns.Add("Id", typeof(int));
                table.Columns.Add("Name1", typeof(string));
                table.Columns.Add("Name2", typeof(string));
                table.Columns.Add("Name3", typeof(string));
                table.Columns.Add("Name4", typeof(string));
                // Construct 100,000 rows of data
                for (var i = 0; i < 100000; i++)
                {
                    table.Rows.Add(i, i.ToString(), i.ToString());
                }
                // Obtain the IBatcherProvider service
                var batcher = database.Provider.GetService<IBatcherProvider>();
                if (batcher == null)
                {
                    Console.WriteLine("Batch insertion is not supported.");
                }
                else
                {
                    batcher.Insert(table);
                }
                // Output the number of rows in the Batcher table
                var sql = new SqlCommand("SELECT COUNT(1) FROM Batcher");
                Console.WriteLine("There are currently {0} rows of data", database.ExecuteScalar(sql));
            })));
}
The following table lists the time each of the four databases took to insert the 100,000 rows.
Database | Time consumed
MsSql    | 00:00:02.9376300
Oracle   | 00:00:01.5155959
SQLite   | 00:00:01.6275634
MySql    | 00:00:05.4166891