A while ago, I helped some colleagues import CSV data into MySQL: two large CSV files, one 3 GB with 21 million records and one 7 GB with 35 million records. At that scale a simple single-process, single-threaded import takes far too long, so the job ended up being implemented with multiple processes. I won't go through every detail; here are the main points:
Use bulk INSERT statements instead of inserting rows one by one
To speed up insertion, build the indexes after the import rather than before (see the sketch after this list)
Use a producer/consumer model: the main process reads the file, multiple worker processes execute the INSERTs
Keep the number of workers under control to avoid putting too much pressure on MySQL
Handle the exceptions caused by dirty data
The original data is GBK encoded, so it also has to be converted to UTF-8
Wrap the whole thing as a command-line tool with click
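On the index point, the idea is to create the table without secondary indexes, run the import, and only add the indexes afterwards. A minimal sketch of that step, assuming a hypothetical table example_table with a code column to index (both names are placeholders; the DB URI is the same one used in the script below):

# Sketch only: add a secondary index after the bulk import has finished.
# `example_table`, `code` and `idx_code` are placeholder names for illustration.
import sqlalchemy

engine = sqlalchemy.create_engine('mysql://root@localhost:3306/example?charset=utf8')
engine.execute('ALTER TABLE `example_table` ADD INDEX `idx_code` (`code`)')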
The full implementation is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import logging
import multiprocessing
import os
import warnings

import click
import MySQLdb
import sqlalchemy

warnings.filterwarnings('ignore', category=MySQLdb.Warning)

# Number of records per bulk INSERT
BATCH = 5000

DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'

engine = sqlalchemy.create_engine(DB_URI)


def get_table_cols(table):
    sql = 'SELECT * FROM `{table}` LIMIT 0'.format(table=table)
    res = engine.execute(sql)
    return res.keys()


def insert_many(table, cols, rows, cursor):
    sql = 'INSERT INTO `{table}` ({cols}) VALUES ({marks})'.format(
        table=table,
        cols=', '.join(cols),
        marks=', '.join(['%s'] * len(cols)))
    cursor.execute(sql, *rows)
    logging.info('process %s inserted %s rows into table %s',
                 os.getpid(), len(rows), table)


def insert_worker(table, cols, queue):
    rows = []
    # Each child process creates its own engine object
    cursor = sqlalchemy.create_engine(DB_URI)
    while True:
        row = queue.get()
        if row is None:
            if rows:
                insert_many(table, cols, rows, cursor)
            break

        rows.append(row)
        if len(rows) == BATCH:
            insert_many(table, cols, rows, cursor)
            rows = []


def insert_parallel(table, reader, w=10):
    cols = get_table_cols(table)

    # Data queue: the main process reads the file and puts rows into it,
    # worker processes take rows from it.
    # Limit the queue size so that slow consumers do not let too much data
    # pile up and use too much memory.
    queue = multiprocessing.Queue(maxsize=w * BATCH * 2)
    workers = []
    for i in range(w):
        p = multiprocessing.Process(target=insert_worker, args=(table, cols, queue))
        p.start()
        workers.append(p)
        logging.info('starting # %s worker process, pid: %s...', i + 1, p.pid)

    dirty_data_file = './{}_dirty_rows.csv'.format(table)
    xf = open(dirty_data_file, 'w')
    writer = csv.writer(xf, delimiter=reader.dialect.delimiter)

    for line in reader:
        # Record and skip dirty data: wrong number of fields
        if len(line) != len(cols):
            writer.writerow(line)
            continue

        # Replace 'NULL' strings with None
        clean_line = [None if x == 'NULL' else x for x in line]

        # Put the row into the queue
        queue.put(tuple(clean_line))
        if reader.line_num % 500000 == 0:
            logging.info('put %s tasks into queue.', reader.line_num)

    xf.close()

    # Send each worker a signal that the work is done
    logging.info('send close signal to worker processes')
    for i in range(w):
        queue.put(None)

    for p in workers:
        p.join()


def convert_file_to_utf8(f, rv_file=None):
    if not rv_file:
        name, ext = os.path.splitext(f)
        if isinstance(name, unicode):
            name = name.encode('utf8')
        rv_file = '{}_utf8{}'.format(name, ext)
    logging.info('start to process file %s', f)
    with open(f) as infd:
        with open(rv_file, 'w') as outfd:
            lines = []
            loop = 0
            chunck = 200000
            first_line = infd.readline().strip(codecs.BOM_UTF8).strip() + '\n'
            lines.append(first_line)
            for line in infd:
                clean_line = line.decode('gb18030').encode('utf8')
                clean_line = clean_line.rstrip() + '\n'
                lines.append(clean_line)
                if len(lines) == chunck:
                    outfd.writelines(lines)
                    lines = []
                    loop += 1
                    logging.info('processed %s lines.', loop * chunck)
            outfd.writelines(lines)
            logging.info('processed %s lines.', loop * chunck + len(lines))


@click.group()
def cli():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


@cli.command('gbk_to_utf8')
@click.argument('f')
def convert_gbk_to_utf8(f):
    convert_file_to_utf8(f)


@cli.command('load')
@click.option('-t', '--table', required=True, help='table name')
@click.option('-i', '--filename', required=True, help='input file')
@click.option('-w', '--workers', default=10, help='number of workers, default 10')
def load_fac_day_pro_nos_sal_table(table, filename, workers):
    with open(filename) as fd:
        fd.readline()  # skip header
        reader = csv.reader(fd)
        insert_parallel(table, reader, w=workers)


if __name__ == '__main__':
    cli()
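For reference, assuming the script above is saved as csv2mysql.py (the script name, table name, and file names here are just placeholders), it would be run roughly like this: first convert the GBK file to UTF-8, then load the converted file with, say, 10 workers:

python csv2mysql.py gbk_to_utf8 data.csv
python csv2mysql.py load -t example_table -i data_utf8.csv -w 10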
That's everything I wanted to share in this article. I hope you find it useful.