This article shows, by example, how to sort very large files in Python. It is shared here for your reference. The implementation is as follows:
import gzip
import os
import shutil
from collections import deque
from datetime import datetime
from heapq import heappop, heappush
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support


def sort_worker(input, output):
    """Sort and de-duplicate the keys of each text chunk read from *input*.

    Each message on *input* is a string holding one or more lines.  The key
    of a line is the text before its first space.  For every chunk the
    unique keys are sorted and put on *output* as one newline-joined string.

    Args:
        input: Queue-like object whose ``get()`` yields text chunks, then
            ``None`` to request shutdown.
        output: Queue-like object receiving one sorted string per chunk.
    """
    while True:
        chunk = input.get()
        # None (not the string 'STOP') is the shutdown marker, so a data
        # line that happens to read "STOP" can no longer kill the worker.
        if chunk is None:
            return
        keys = set()
        for line in chunk.splitlines():
            # NOTE(review): the key separator is assumed to be a single
            # space -- confirm against the input format.
            key = line.split(' ')[0]
            # Blank keys are dropped: the merge step uses the empty string
            # to mean "file exhausted".
            if key:
                keys.add(key)
        if keys:
            output.put('\n'.join(sorted(keys)))


def write_worker(input, pre):
    """Write every string received on *input* to its own file under *pre*.

    Files are named ``0``, ``1``, ``2`` ... in arrival order.

    Args:
        input: Queue-like object yielding strings, then ``None`` to stop.
        pre: Directory that receives the files; created if missing.
    """
    os.makedirs(pre, exist_ok=True)
    index = 0
    while True:
        content = input.get()
        if content is None:
            return
        write_sorted_bulk(content, os.path.join(pre, str(index)))
        index += 1


def write_sorted_bulk(content, filename):
    """Write *content* to *filename*, replacing any existing file."""
    with open(filename, 'w') as f:
        f.write(content)


def _iter_line_chunks(stream, buf_size):
    """Yield pieces of *stream* that each end on a line boundary.

    Reads *buf_size* characters at a time.  Text after the last newline is
    carried over to the next piece; whatever is left at end of input is
    yielded as the final piece so an unterminated last line is not lost.
    """
    pending = ''
    while True:
        data = stream.read(buf_size)
        if not data:
            break
        pending += data
        cut = pending.rfind('\n') + 1
        if cut:
            yield pending[:cut]
            pending = pending[cut:]
    if pending:
        yield pending


def split_sort_file(filename, num_sort=3, buf_size=65536*64*4):
    """Split *filename* into sorted, de-duplicated chunk files.

    The file is read in pieces of about *buf_size* characters.  Each piece
    is sorted by one of *num_sort* ``sort_worker`` processes and written by
    a single ``write_worker`` process into a directory named after the file
    without its extension.  Files ending in ``.gz`` are read through gzip.

    Args:
        filename: Path of the text file to split.
        num_sort: Number of sorting processes.
        buf_size: Approximate number of characters per chunk.

    Returns:
        The number of chunks handed to the sort workers.
    """
    started = datetime.now()
    pre, ext = os.path.splitext(filename)
    if ext == '.gz':
        # Text mode, so chunks are str just like for plain files.
        source = gzip.open(filename, 'rt')
    else:
        source = open(filename)

    bulk_queue = Queue(10)
    sorted_queue = Queue(10)
    sort_worker_pool = [
        Process(target=sort_worker, args=(bulk_queue, sorted_queue))
        for _ in range(num_sort)
    ]
    # A single writer keeps the numbering of the chunk files unique.
    write_worker_pool = [
        Process(target=write_worker, args=(sorted_queue, pre))
    ]
    for proc in sort_worker_pool + write_worker_pool:
        proc.start()

    sorted_count = 0
    with source:
        for chunk in _iter_line_chunks(source, buf_size):
            bulk_queue.put(chunk)
            sorted_count += 1

    # Stop the sorters first; only once they have all finished is it safe
    # to tell the writer that no more sorted chunks will arrive.
    for _ in sort_worker_pool:
        bulk_queue.put(None)
    for proc in sort_worker_pool:
        proc.join()
    for _ in write_worker_pool:
        sorted_queue.put(None)
    for proc in write_worker_pool:
        proc.join()

    print('elasped  %s' % (datetime.now() - started))
    return sorted_count


class file_heap:
    """Merge the sorted files of a directory through a min-heap.

    Only a share of the files is handled: the directory listing is sorted
    and the file at position ``i`` is taken when ``i % count == idx``.
    Lines are expected to be non-empty and sorted within each file.
    """

    def __init__(self, dir, idx=0, count=1):
        """Open this instance's share of the files in *dir*.

        Args:
            dir: Directory containing the sorted files.
            idx: Which share (``0 <= idx < count``) this instance reads.
            count: Total number of shares the directory is divided into.
        """
        self.heap = []           # (next element, file index) pairs
        self.files = {}          # file index -> open file, None once closed
        self.bulks = {}          # file index -> text read but not yet consumed
        self.pre_element = None  # last element returned by poppush_uniq
        # Shares are assigned by position in the sorted listing rather than
        # by hash(name): str hashes are randomised per interpreter, so
        # separately started processes could disagree on who owns a file.
        for i, name in enumerate(sorted(os.listdir(dir))):
            if i % count != idx:
                continue
            self.files[i] = open(os.path.join(dir, name))
            self.bulks[i] = ''
            first = self.get_next_element_buffered(i)
            if first:
                heappush(self.heap, (first, i))

    def get_next_element_buffered(self, i):
        """Return the next non-empty line of file *i*, or ``''`` when done.

        The file is read 65536 characters at a time, and reading continues
        until a full line is buffered, so a long line is never cut in two.
        """
        while True:
            bulk = self.bulks[i]
            end_line = bulk.find('\n')
            while end_line == -1 and self.files[i] is not None:
                data = self.files[i].read(65536)
                if data:
                    searched = len(bulk)
                    bulk += data
                    end_line = bulk.find('\n', searched)
                else:
                    self.files[i].close()
                    self.files[i] = None
            if end_line == -1:
                # File exhausted: what is left is the last line (or '').
                self.bulks[i] = ''
                return bulk
            self.bulks[i] = bulk[end_line + 1:]
            element = bulk[:end_line]
            if element:
                return element
            # Blank line: skip it, '' is reserved for "file exhausted".

    def poppush_uniq(self):
        """Return the next element that differs from the previous one.

        Returns ``None`` once every file is exhausted.
        """
        while True:
            element = self.poppush()
            if element is None:
                return None
            if element != self.pre_element:
                self.pre_element = element
                return element

    def poppush(self):
        """Pop the smallest element and refill the heap from its file.

        Returns ``None`` when the heap is empty.
        """
        try:
            element, index = heappop(self.heap)
        except IndexError:
            return None
        new_element = self.get_next_element_buffered(index)
        if new_element:
            heappush(self.heap, (new_element, index))
        return element


def heappoppush(dir, queue, idx=0, count=1):
    """Stream the merged unique elements of one share of *dir* to *queue*.

    A final ``None`` is put on *queue* to mark the end of the stream.
    """
    heap = file_heap(dir, idx, count)
    while True:
        d = heap.poppush_uniq()
        queue.put(d)
        if d is None:
            return


def heappoppush2(dir, queue, count=1):
    """Merge *dir* with *count* ``heappoppush`` processes in parallel.

    Each process merges its own share of the files; this function merges
    their streams, drops duplicates and puts the result on *queue*,
    followed by a final ``None``.
    """
    procs = []
    queues = []
    for i in range(count):
        q_buf = queue_buffer(Queue(1024))
        queues.append(q_buf)
        p = Process(target=heappoppush, args=(dir, q_buf, i, count))
        procs.append(p)
        p.start()

    heap = []
    for i, q_buf in enumerate(queues):
        head = q_buf.get()
        # None marks an exhausted stream; it must not enter the heap
        # because None and str cannot be ordered against each other.
        if head is not None:
            heappush(heap, (head, i))

    pre_element = None
    while heap:
        d, i = heappop(heap)
        following = queues[i].get()
        if following is not None:
            heappush(heap, (following, i))
        if d != pre_element:
            pre_element = d
            queue.put(d)

    queue.put(None)
    for p in procs:
        p.join()


def merge_file(dir):
    """Merge all sorted files in *dir* into the single file ``dir + '.merge'``.

    Every unique element is written on its own line; an existing merge
    file is replaced.
    """
    heap = file_heap(dir)
    with open(dir + '.merge', 'w') as fmerge:
        # iter(callable, sentinel) stops at the end-of-stream None.
        for element in iter(heap.poppush_uniq, None):
            fmerge.write(element + '\n')


class queue_buffer:
    """Batch items before sending them through a queue.

    ``put`` collects items and forwards them as one list once more than
    1024 are pending or when ``None`` (the end marker) is put.  ``get``
    hands the items of a received list back one at a time.
    """

    def __init__(self, queue):
        self.q = queue
        self.rbuf = deque()  # received items not yet handed out
        self.wbuf = []       # items waiting to be sent as one batch

    def get(self):
        """Return the next item, fetching a new batch when needed."""
        while not self.rbuf:
            self.rbuf = deque(self.q.get())
        return self.rbuf.popleft()

    def put(self, d):
        """Queue *d*; flush the batch when it is large or *d* is ``None``."""
        self.wbuf.append(d)
        if d is None or len(self.wbuf) > 1024:
            self.q.put(self.wbuf)
            self.wbuf = []


def _count_lines(directory):
    """Return the total number of lines in the files of *directory*."""
    total = 0
    for name in os.listdir(directory):
        with open(os.path.join(directory, name)) as handle:
            total += sum(1 for _ in handle)
    return total


def diff_file(file_old, file_new, file_diff, buf=268435456):
    """Write the key differences between two large files to *file_diff*.

    Both inputs are split into sorted chunk files with ``split_sort_file``
    and the two sorted key streams are then compared.  A key found only in
    *file_new* is written as ``'< key'``, a key found only in *file_old* as
    ``'> key'``.

    Args:
        file_old: Path of the old file.
        file_new: Path of the new file.
        file_diff: Path of the result file; replaced if it exists.
        buf: Chunk size passed to ``split_sort_file``.
    """
    print('buffer size %s' % buf)
    # split_sort_file is defined in this listing; if it is kept in a
    # separate ``file_split`` module instead, import it from there.
    old_dir = os.path.splitext(file_old)[0]
    new_dir = os.path.splitext(file_new)[0]
    # Remove chunk directories left over from a previous run.
    shutil.rmtree(old_dir, ignore_errors=True)
    shutil.rmtree(new_dir, ignore_errors=True)

    t = datetime.now()
    split_sort_file(file_old, 5, buf)
    split_sort_file(file_new, 5, buf)
    print('split elasped  %s' % (datetime.now() - t))
    print(_count_lines(old_dir))
    print(_count_lines(new_dir))

    t = datetime.now()
    old_queue = queue_buffer(Queue(1024))
    new_queue = queue_buffer(Queue(1024))
    h1 = Process(target=heappoppush2, args=(old_dir, old_queue, 3))
    h2 = Process(target=heappoppush2, args=(new_dir, new_queue, 3))
    h1.start()
    h2.start()

    old_count, new_count = 0, 0
    with open(file_diff, 'w') as zdiff:
        old = old_queue.get()
        new = new_queue.get()
        while old is not None or new is not None:
            # The None checks come first: once one stream has ended, the
            # rest of the other stream is all difference, and None must
            # never be compared with a string.
            if old is None or (new is not None and new < old):
                zdiff.write('< ' + new + '\n')
                new = new_queue.get()
                new_count += 1
            elif new is None or old < new:
                zdiff.write('> ' + old + '\n')
                old = old_queue.get()
                old_count += 1
            else:
                old = old_queue.get()
                new = new_queue.get()

    print('new_count: %s' % new_count)
    print('old_count: %s' % old_count)
    print('diff elasped  %s' % (datetime.now() - t))
    h1.join()
    h2.join()
I hope this article helps you with your Python programming.