This article shares how to use Python to import CSV file data into MySQL with multiple processes, together with the complete code; if you have a similar requirement, you can refer to it. I recently helped a colleague with a requirement to import CSV data into MySQL. There were two large CSV files: one of about 3 GB with 21 million records and one of about 7 GB with 35 million records. At this scale, a simple single-process/single-thread import takes far too long, so the import was finally implemented with multiple processes. The overall workflow is not described in detail here; instead, here are the key points:
Insert records in batches instead of one at a time.
To speed up insertion, do not create indexes before the import; add them afterwards (see the short sketch after this list).
Use a producer/consumer model: the main process reads the file, and multiple worker processes execute the inserts.
Control the number of workers to avoid putting too much pressure on MySQL.
Handle exceptions caused by dirty data.
The raw data is GBK encoded, so take care to convert it to UTF-8.
Wrap everything as a command-line tool with click.
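On the index point: the full script below does not create any indexes at all; the idea is to add them only after the bulk load finishes. A minimal sketch of that step might look like the following, where the table name example_table and the column record_date are hypothetical placeholders rather than names from the original task:

# Hypothetical post-import step: create indexes only after all rows are loaded,
# so MySQL does not have to maintain them during tens of millions of inserts.
# `example_table` and `record_date` are placeholder names for illustration.
import sqlalchemy

DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'
engine = sqlalchemy.create_engine(DB_URI)

def add_indexes_after_import():
    engine.execute(
        'ALTER TABLE `example_table` ADD INDEX `idx_record_date` (`record_date`)')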
The code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import logging
import multiprocessing
import os
import warnings

import click
import MySQLdb
import sqlalchemy

warnings.filterwarnings('ignore', category=MySQLdb.Warning)

# Number of rows inserted per batch
BATCH = 5000

DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'

engine = sqlalchemy.create_engine(DB_URI)


def get_table_cols(table):
    sql = 'SELECT * FROM `{table}` LIMIT 0'.format(table=table)
    res = engine.execute(sql)
    return res.keys()


def insert_values(table, cols, rows, cursor):
    sql = 'INSERT INTO `{table}` ({cols}) VALUES ({marks})'.format(
        table=table,
        cols=', '.join(cols),
        marks=', '.join(['%s'] * len(cols)))
    cursor.execute(sql, *rows)
    logging.info('process %s inserted %s rows into table %s',
                 os.getpid(), len(rows), table)


def insert_worker(table, cols, queue):
    rows = []
    # Each worker process creates its own engine object
    cursor = sqlalchemy.create_engine(DB_URI)
    while True:
        row = queue.get()
        if row is None:
            if rows:
                insert_values(table, cols, rows, cursor)
            break

        rows.append(row)
        if len(rows) == BATCH:
            insert_values(table, cols, rows, cursor)
            rows = []


def insert_parallel(table, reader, w=10):
    cols = get_table_cols(table)

    # Data queue: the main process reads the file and writes rows into it,
    # and the worker processes read rows from it.
    # Keep the queue size bounded so that slow consumers do not cause
    # too much data to pile up and use too much memory.
    queue = multiprocessing.Queue(maxsize=w * BATCH * 2)
    workers = []
    for i in range(w):
        p = multiprocessing.Process(target=insert_worker, args=(table, cols, queue))
        p.start()
        workers.append(p)
        logging.info('starting # %s worker process, pid: %s...', i + 1, p.pid)

    dirty_data_file = './{}_dirty_rows.csv'.format(table)
    xf = open(dirty_data_file, 'w')
    writer = csv.writer(xf, delimiter=reader.dialect.delimiter)

    for line in reader:
        # Record and skip dirty data: rows whose field count does not match
        if len(line) != len(cols):
            writer.writerow(line)
            continue

        # Replace 'NULL' strings with None so they are inserted as SQL NULL
        clean_line = [None if x == 'NULL' else x for x in line]

        # Put the row into the data queue
        queue.put(tuple(clean_line))
        if reader.line_num % 500000 == 0:
            logging.info('put %s tasks into queue.', reader.line_num)

    xf.close()

    # Send a close signal to every worker process
    logging.info('send close signal to worker processes')
    for i in range(w):
        queue.put(None)

    for p in workers:
        p.join()


def convert_file_to_utf8(f, rv_file=None):
    if not rv_file:
        name, ext = os.path.splitext(f)
        if isinstance(name, unicode):
            name = name.encode('utf8')
        rv_file = '{}_utf8{}'.format(name, ext)
    logging.info('start to process file %s', f)
    with open(f) as infd:
        with open(rv_file, 'w') as outfd:
            lines = []
            loop = 0
            chunck = 200000
            first_line = infd.readline().strip(codecs.BOM_UTF8).strip() + '\n'
            lines.append(first_line)
            for line in infd:
                clean_line = line.decode('gb18030').encode('utf8')
                clean_line = clean_line.rstrip() + '\n'
                lines.append(clean_line)
                if len(lines) == chunck:
                    outfd.writelines(lines)
                    lines = []
                    loop += 1
                    logging.info('processed %s lines.', loop * chunck)

            outfd.writelines(lines)
            logging.info('processed %s lines.', loop * chunck + len(lines))


@click.group()
def cli():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


@cli.command('gbk_to_utf8')
@click.argument('f')
def convert_gbk_to_utf8(f):
    convert_file_to_utf8(f)


@cli.command('load')
@click.option('-t', '--table', required=True, help='table name')
@click.option('-i', '--filename', required=True, help='input file')
@click.option('-w', '--workers', default=10, help='number of workers, default 10')
def load_fac_day_pro_nos_sal_table(table, filename, workers):
    with open(filename) as fd:
        fd.readline()  # skip header
        reader = csv.reader(fd)
        insert_parallel(table, reader, w=workers)


if __name__ == '__main__':
    cli()
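Assuming the script is saved as csv2mysql.py (the file name is just an example, not from the original post), a typical run first converts the encoding and then loads the converted file:

python csv2mysql.py gbk_to_utf8 data.csv
python csv2mysql.py load -t example_table -i data_utf8.csv -w 10

The gbk_to_utf8 and load subcommands come from the click definitions above; data.csv, example_table and the worker count are placeholders you would replace with your own file, table, and hardware limits.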