Python implementation of large file sorting methods

Source: Internet
Author: User
This article shows, by example, how to sort very large files in Python. It is shared here for your reference. The implementation is as follows:

"""Sort, merge and diff very large line-oriented text files.

The pipeline works in three stages:

1. ``split_sort_file`` reads the input in big buffers, hands every buffer to
   a pool of ``sort_worker`` processes (which keep the first
   space-separated token of each line, de-duplicate and sort), and a
   ``write_worker`` process stores every sorted chunk as a numbered file in
   a directory named after the input file without its extension.
2. ``file_heap`` / ``heappoppush`` / ``heappoppush2`` / ``merge_file``
   perform a k-way merge over those sorted chunk files using a heap.
3. ``diff_file`` merges two chunk directories in lock-step and writes the
   elements that are present in only one of them.

Files are processed as text with the platform default encoding.
Functions that start processes must be called from code protected by
``if __name__ == '__main__':`` on platforms that spawn instead of fork.
"""
import gzip
import os
import shutil
from collections import deque
from datetime import datetime
from heapq import heappop, heappush
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support

# Message that tells a worker to shut down.  Data messages always end with a
# newline, so they can never be equal to this token.
STOP_TOKEN = 'STOP'


def sort_worker(input, output):
    """Turn raw text chunks into sorted, de-duplicated element chunks.

    Reads chunks (strings of newline-terminated lines) from ``input`` until
    ``STOP_TOKEN`` arrives.  For every chunk the first space-separated token
    of each line is collected, duplicates are dropped, and the sorted
    elements are put on ``output`` as one string with one element per line.

    Args:
        input: queue-like object with a blocking ``get()``.
        output: queue-like object with ``put()``.
    """
    while True:
        chunk = input.get()
        # Compare the whole message, not each line: a data line that happens
        # to read "STOP" must not terminate the worker.
        if chunk == STOP_TOKEN:
            return
        elements = {line.split(' ')[0] for line in chunk.splitlines()}
        if elements:
            # Newline-terminate every element so the result can never be
            # mistaken for STOP_TOKEN by the consumer.
            output.put(''.join(element + '\n' for element in sorted(elements)))


def write_worker(input, pre):
    """Write every chunk received on ``input`` to ``pre/0``, ``pre/1``, ...

    Args:
        input: queue-like object with a blocking ``get()``; the worker
            returns when it receives ``STOP_TOKEN``.
        pre: directory for the chunk files; created if it does not exist.
    """
    os.makedirs(pre, exist_ok=True)
    i = 0
    while True:
        content = input.get()
        if content == STOP_TOKEN:
            return
        write_sorted_bulk(content, os.path.join(pre, str(i)))
        i += 1


def write_sorted_bulk(content, filename):
    """Write ``content`` to ``filename``, replacing any existing file."""
    with open(filename, 'w') as f:
        f.write(content)


def split_sort_file(filename, num_sort=3, buf_size=65536 * 64 * 4):
    """Split ``filename`` into sorted chunk files using worker processes.

    The chunk files are written to the directory obtained by stripping the
    extension from ``filename``.  A ``.gz`` extension selects transparent
    gzip decompression.

    Args:
        filename: path of the text file to split.
        num_sort: number of sorter processes (at least 1).
        buf_size: number of characters read from the file per step.

    Returns:
        The number of chunks handed to the sorter processes.

    Raises:
        ValueError: if ``num_sort`` is smaller than 1 (the pipeline would
            block forever without a consumer).
    """
    if num_sort < 1:
        raise ValueError('num_sort must be at least 1')
    t = datetime.now()
    pre, ext = os.path.splitext(filename)
    bulk_queue = Queue(10)
    sorted_queue = Queue(10)
    num_write = 1  # a single writer keeps the chunk numbering unique

    sort_worker_pool = [
        Process(target=sort_worker, args=(bulk_queue, sorted_queue))
        for _ in range(num_sort)
    ]
    write_worker_pool = [
        Process(target=write_worker, args=(sorted_queue, pre))
        for _ in range(num_write)
    ]
    for proc in sort_worker_pool + write_worker_pool:
        proc.start()

    sorted_count = 0
    try:
        # Text mode in both branches so the workers always receive str.
        if ext == '.gz':
            source = gzip.open(filename, 'rt')
        else:
            source = open(filename)
        with source:
            carry = ''  # incomplete trailing line of the previous buffer
            while True:
                data = source.read(buf_size)
                if not data:
                    break
                buf = carry + data
                end_line = buf.rfind('\n')
                if end_line == -1:
                    # No complete line yet: keep accumulating instead of
                    # dropping the data.
                    carry = buf
                    continue
                bulk_queue.put(buf[:end_line + 1])
                sorted_count += 1
                carry = buf[end_line + 1:]
            if carry:
                # Last line without a trailing newline.
                bulk_queue.put(carry + '\n')
                sorted_count += 1
    finally:
        # Always shut the workers down, sorters first so that everything
        # they produce reaches the writer before it is told to stop.
        for _ in sort_worker_pool:
            bulk_queue.put(STOP_TOKEN)
        for proc in sort_worker_pool:
            proc.join()
        for _ in write_worker_pool:
            sorted_queue.put(STOP_TOKEN)
        for proc in write_worker_pool:
            proc.join()
    print('elasped ', datetime.now() - t)
    return sorted_count


class file_heap:
    """K-way merge over the sorted files of a directory.

    Every file must contain one element per line in ascending order.  With
    ``count`` > 1 the directory listing is split round-robin and this
    instance only handles share number ``idx``, so several instances (one
    per process) can cover a directory together.

    Attributes are keyed by the position of the file in the sorted listing:
    ``files`` holds the open file objects (``None`` once exhausted) and
    ``bulks`` the text read from each file but not yet returned.
    """

    def __init__(self, dir, idx=0, count=1):
        self.heap = []
        self.files = {}
        self.bulks = {}
        self.pre_element = None
        # Partition by position in the sorted listing; hash(name) is not
        # stable across processes when string hashing is randomised.
        for i, name in enumerate(sorted(os.listdir(dir))):
            if i % count != idx:
                continue
            self.files[i] = open(os.path.join(dir, name))
            self.bulks[i] = ''
            first = self.get_next_element_buffered(i)
            if first is not None:
                heappush(self.heap, (first, i))

    def get_next_element_buffered(self, i):
        """Return the next line of file ``i`` or ``None`` when exhausted."""
        bulk = self.bulks[i]
        # Refill until a complete line is buffered or the file ends, so
        # lines longer than one read are never split.
        while '\n' not in bulk and self.files[i] is not None:
            buf = self.files[i].read(65536)
            if buf:
                bulk += buf
            else:
                self.files[i].close()
                self.files[i] = None
        if not bulk:
            self.bulks[i] = ''
            return None
        element, _, self.bulks[i] = bulk.partition('\n')
        return element

    def poppush_uniq(self):
        """Return the next element that differs from the previous one.

        Returns ``None`` when all files are exhausted.
        """
        while True:
            element = self.poppush()
            if element is None:
                return None
            if element != self.pre_element:
                self.pre_element = element
                return element

    def poppush(self):
        """Pop the smallest element and queue its file's next element.

        Returns ``None`` when the heap is empty.
        """
        try:
            element, index = heappop(self.heap)
        except IndexError:
            return None
        new_element = self.get_next_element_buffered(index)
        # ``is not None`` rather than truthiness: an empty line is a valid
        # element and must not end the file early.
        if new_element is not None:
            heappush(self.heap, (new_element, index))
        return element


def heappoppush(dir, queue, idx=0, count=1):
    """Stream the merged, unique elements of a directory share to ``queue``.

    Puts every element followed by a final ``None`` end marker.

    Args:
        dir: directory with sorted chunk files.
        queue: object with ``put()``.
        idx: share of the directory listing to handle.
        count: total number of shares.
    """
    heap = file_heap(dir, idx, count)
    while True:
        d = heap.poppush_uniq()
        queue.put(d)
        if d is None:
            return


def heappoppush2(dir, queue, count=1):
    """Merge a chunk directory with ``count`` parallel ``heappoppush`` workers.

    Each worker merges its share of the files; this function merges the
    worker streams, drops duplicates and puts the result on ``queue``,
    followed by a final ``None`` end marker.
    """
    procs = []
    queues = []
    for i in range(count):
        q_buf = queue_buffer(Queue(1024))
        queues.append(q_buf)
        p = Process(target=heappoppush, args=(dir, q_buf, i, count))
        procs.append(p)
        p.start()

    heap = []
    for i, q_buf in enumerate(queues):
        head = q_buf.get()
        # Never store the None end marker in the heap: it cannot be ordered
        # against strings.
        if head is not None:
            heappush(heap, (head, i))

    pre_element = None
    while heap:
        d, i = heappop(heap)
        following = queues[i].get()
        if following is not None:
            heappush(heap, (following, i))
        if d != pre_element:
            pre_element = d
            queue.put(d)
    queue.put(None)
    for p in procs:
        p.join()


def merge_file(dir):
    """Merge the sorted files in ``dir`` into the file ``dir + '.merge'``.

    The output contains every distinct element once, one per line, and
    replaces any existing file of that name.
    """
    heap = file_heap(dir)
    with open(dir + '.merge', 'w') as fmerge:
        # iter(callable, sentinel) stops at the None end marker.
        for element in iter(heap.poppush_uniq, None):
            fmerge.write(element + '\n')


class queue_buffer:
    """Batch items so that fewer, larger messages cross the wrapped queue.

    ``put`` collects items and sends them as one list when more than 1024
    are pending or when ``None`` (the end marker) is put.  ``get`` receives
    such lists and hands the items out one at a time.
    """

    def __init__(self, queue):
        self.q = queue
        self.rbuf = deque()
        self.wbuf = []

    def get(self):
        """Return the next item, blocking on the queue when the batch is used up."""
        if not self.rbuf:
            self.rbuf = deque(self.q.get())
        return self.rbuf.popleft()

    def put(self, d):
        """Buffer ``d``; flush when the batch is full or ``d`` is ``None``."""
        self.wbuf.append(d)
        if d is None or len(self.wbuf) > 1024:
            self.q.put(self.wbuf)
            self.wbuf = []


def _count_lines(directory):
    """Return the total number of lines in all files of ``directory``."""
    total = 0
    for name in os.listdir(directory):
        with open(os.path.join(directory, name)) as fh:
            total += sum(1 for _ in fh)
    return total


def _write_diff(old_queue, new_queue, out):
    """Write the symmetric difference of two sorted streams to ``out``.

    Elements only in the new stream are written as ``'< ' + element``,
    elements only in the old stream as ``'> ' + element``.  Both streams end
    with ``None``.

    Returns:
        Tuple ``(old_count, new_count)`` of lines written per kind.
    """
    old = old_queue.get()
    new = new_queue.get()
    old_count, new_count = 0, 0
    while old is not None or new is not None:
        # Test for the end markers before comparing: None cannot be ordered
        # against a string.
        if old is None or (new is not None and old > new):
            out.write('< ' + new + '\n')
            new = new_queue.get()
            new_count += 1
        elif new is None or old < new:
            out.write('> ' + old + '\n')
            old = old_queue.get()
            old_count += 1
        else:
            old = old_queue.get()
            new = new_queue.get()
    return old_count, new_count


def diff_file(file_old, file_new, file_diff, buf=268435456):
    """Write the elements that differ between two large files.

    Both files are split into sorted chunk directories (named after the
    files without their extension; existing directories are removed first),
    merged in parallel and compared.  The result replaces ``file_diff``.

    Args:
        file_old: path of the old file.
        file_new: path of the new file.
        file_diff: path of the diff file to write.
        buf: buffer size passed to ``split_sort_file``.
    """
    print('buffer size', buf)
    old_dir = os.path.splitext(file_old)[0]
    new_dir = os.path.splitext(file_new)[0]
    # Filesystem calls instead of shell commands: no quoting problems with
    # unusual file names and no dependency on rm/cat/wc.
    shutil.rmtree(old_dir, ignore_errors=True)
    shutil.rmtree(new_dir, ignore_errors=True)

    t = datetime.now()
    split_sort_file(file_old, 5, buf)
    split_sort_file(file_new, 5, buf)
    print('split elasped ', datetime.now() - t)
    print(_count_lines(old_dir))
    print(_count_lines(new_dir))

    t = datetime.now()
    old_queue = queue_buffer(Queue(1024))
    new_queue = queue_buffer(Queue(1024))
    h1 = Process(target=heappoppush2, args=(old_dir, old_queue, 3))
    h2 = Process(target=heappoppush2, args=(new_dir, new_queue, 3))
    h1.start()
    h2.start()
    with open(file_diff, 'w') as zdiff:
        old_count, new_count = _write_diff(old_queue, new_queue, zdiff)
    print('new_count:', new_count)
    print('old_count:', old_count)
    print('diff elasped ', datetime.now() - t)
    h1.join()
    h2.join()

Hopefully this article will help you with Python programming.

  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.