Reading ultra-large files in blocks with Python multiprocessing
This example shows how to read a very large file in blocks using multiple processes in Python. It is shared here for your reference; the details are as follows:
Read an ultra-large text file with several processes, each process reading its own block and writing it out to a separate output file.
# -*- coding: GBK -*-
import urlparse
import datetime
import os
from multiprocessing import Process, Queue, Array, RLock

"""
Multi-process, block-by-block file reading
"""

WORKERS = 4
BLOCKSIZE = 100000000  # bytes per block (the value in the source was garbled; assumed here)
FILE_SIZE = 0

def getFilesize(file):
    """Get the size of the file to be read."""
    global FILE_SIZE
    fstream = open(file, 'r')
    fstream.seek(0, os.SEEK_END)
    FILE_SIZE = fstream.tell()
    fstream.close()

def process_found(pid, array, file, rlock):
    """Worker process.
    Args:
        pid: process number
        array: shared array used to mark the end position of the block read by each process
        file: name of the file to read
        rlock: lock shared between the processes

    Each process first takes the current maximum of array as startpossition.
    The end position is endpossition = (startpossition + BLOCKSIZE) if that is
    smaller than FILE_SIZE, else FILE_SIZE.
    If startpossition == FILE_SIZE the process ends.
    If startpossition == 0 the process reads from the beginning of the file.
    If startpossition != 0, to keep a line from being cut in half by the block
    boundary, one line is read and discarded first, and processing starts from
    the next line.
    While the current position <= endpossition keep calling readline;
    once the boundary is crossed, look up the maximum in array again.
    """
    global FILE_SIZE
    global JOB
    global PREFIX

    fstream = open(file, 'r')

    while True:
        rlock.acquire()
        print 'pid%s' % pid, ','.join([str(v) for v in array])
        startpossition = max(array)
        endpossition = array[pid] = (startpossition + BLOCKSIZE) if (startpossition + BLOCKSIZE) < FILE_SIZE else FILE_SIZE
        rlock.release()

        if startpossition == FILE_SIZE:  # end of the file
            print 'pid%s end' % (pid)
            break
        elif startpossition != 0:
            fstream.seek(startpossition)
            fstream.readline()  # skip the line cut by the block boundary

        pos = ss = fstream.tell()
        ostream = open('/data/download/tmp_pid' + str(pid) + '_jobs' + str(endpossition), 'w')
        while pos < endpossition:
            # process the current block line by line
            line = fstream.readline()
            ostream.write(line)
            pos = fstream.tell()

        print 'pid:%s, startposition:%s, endposition:%s, pos:%s' % (pid, ss, endpossition, pos)
        ostream.flush()
        ostream.close()
        ee = fstream.tell()

    fstream.close()

def main():
    global FILE_SIZE
    print datetime.datetime.now().strftime("%Y/%d/%m %H:%M:%S")

    file = "/data/pds/download/scmcc_log/tmp_format_2011004.log"
    getFilesize(file)
    print FILE_SIZE

    rlock = RLock()
    array = Array('l', WORKERS, lock=rlock)
    threads = []
    for i in range(WORKERS):
        p = Process(target=process_found, args=[i, array, file, rlock])
        threads.append(p)

    for i in range(WORKERS):
        threads[i].start()

    for i in range(WORKERS):
        threads[i].join()

    print datetime.datetime.now().strftime("%Y/%d/%m %H:%M:%S")

if __name__ == '__main__':
    main()
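The listing above is Python 2 code (print statements, text-mode seek on raw byte offsets). Below is a minimal sketch of the same block-claiming idea adapted for Python 3, not the original author's code: the input path, the output file names, the worker/main function names and the BLOCK_SIZE value are placeholder assumptions, and the file is opened in binary mode because Python 3 text-mode streams do not accept arbitrary seek offsets.

import os
from multiprocessing import Process, Array, RLock

WORKERS = 4
BLOCK_SIZE = 64 * 1024 * 1024  # bytes per block; tune to your data (assumed value)

def worker(pid, array, path, file_size, rlock):
    with open(path, 'rb') as fstream:
        while True:
            # Claim the next block: start where the furthest worker stopped.
            with rlock:
                start = max(array)
                end = min(start + BLOCK_SIZE, file_size)
                array[pid] = end
            if start >= file_size:
                break  # the whole file has been claimed
            fstream.seek(start)
            if start != 0:
                fstream.readline()  # skip the line cut by the block boundary
            # Hypothetical output name, one file per claimed block.
            with open(f'out_pid{pid}_{end}.txt', 'wb') as ostream:
                while fstream.tell() < end:
                    ostream.write(fstream.readline())

def main():
    path = 'big.log'  # placeholder input file
    file_size = os.path.getsize(path)
    rlock = RLock()
    array = Array('l', WORKERS, lock=rlock)
    procs = [Process(target=worker, args=(i, array, path, file_size, rlock))
             for i in range(WORKERS)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    main()

The design is the same as in the original: each worker claims the next block under the lock, a block that starts at a non-zero offset discards the partial line at its start, and the previous block finishes the line that crosses its boundary, so every line ends up in exactly one output file. For example, with a 250-byte file and a 100-byte block size, the shared array of end positions evolves from all zeros to values such as 100, 200 and finally 250, at which point every worker sees the maximum equal to the file size and exits.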