Multi-way Merge Sort [Java Implementation]

Source: Internet
Author: User

 

For data that is far larger than the available memory, sorting is done by merging several sorted runs at once (a multi-way merge); using a loser tree to compare the heads of the runs makes this more efficient.

This algorithm can be used, for example, when building an inverted index.

 

 

// Note: the listing originally declared "package my.sort;". The declaration is
// omitted here so that the class can be compiled and run from a single
// directory; add it back when placing this file under my/sort/.

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.PriorityQueue;
import java.util.Random;

/**
 * External sort for files of big-endian 32-bit integers that do not fit in
 * memory. The input is first split into sorted runs of at most
 * {@link #FILE_COUNT} integers each; the runs are then combined either by
 * repeated two-way merging ({@link #sort(File)}) or by a single multi-way
 * merge ({@link #mSort(File)}).
 *
 * <p>Any {@code int} value may appear in the data, including negative numbers,
 * {@code -1} and {@code Integer.MAX_VALUE}. Run files and intermediate merge
 * files are created in the working directory and deleted once consumed.
 *
 * @author java2king
 * @link http://blog.csdn.net/Java2King
 */
public class ExternalSort {

    /** Number of random integers that {@link #main(String[])} generates. */
    public static int ITEM_COUNT = 10000000;

    /** Size, in bytes, of the buffer behind every buffered input stream. */
    public static int BUFFER_SIZE = 1024 * 4 * 1000;

    /** Maximum number of integers sorted in memory at once, i.e. per run file. */
    public static int FILE_COUNT = 1024 * 1000 * 1 * 4;

    /** File generated by {@link #main(String[])}; also the output of {@link #mSort(File)}. */
    public static File MAIN_FILE = new File("mainset");

    /** Orders runs by the value currently at their head. */
    private static final Comparator<Run> BY_HEAD = new Comparator<Run>() {
        @Override
        public int compare(Run first, Run second) {
            if (first.head < second.head) {
                return -1;
            }
            return first.head > second.head ? 1 : 0;
        }
    };

    /**
     * Sorts {@code file} by splitting it into sorted runs and merging them
     * pairwise until one file is left.
     *
     * @param file file of big-endian 32-bit integers to sort; it is only read
     * @return a file in the working directory holding all values in ascending
     *         order (an empty file when the input holds no integers)
     * @throws IOException if reading the input or writing a work file fails
     */
    public File sort(File file) throws IOException {
        ArrayList<File> runs = split(file);
        return process(runs);
    }

    /**
     * Sorts {@code file} by splitting it into sorted runs and merging all runs
     * in one pass. The result is written to {@link #MAIN_FILE}, replacing its
     * previous content; {@code file} may be {@code MAIN_FILE} itself because
     * the input is fully consumed before the output is opened.
     *
     * @param file file of big-endian 32-bit integers to sort
     * @throws IOException if reading the input or writing the output fails
     */
    public void mSort(File file) throws IOException {
        ArrayList<File> runs = split(file);
        multipleMerge(runs);
    }

    /**
     * Merges the given sorted files pairwise, round after round, until a
     * single file remains. An empty list yields a new empty file instead of
     * recursing forever.
     */
    private File process(ArrayList<File> list) throws IOException {
        if (list.isEmpty()) {
            return newWorkFile("merged");
        }
        ArrayList<File> current = list;
        while (current.size() > 1) {
            ArrayList<File> next = new ArrayList<File>();
            for (Iterator<File> itr = current.iterator(); itr.hasNext();) {
                File one = itr.next();
                if (itr.hasNext()) {
                    next.add(merge(one, itr.next()));
                } else {
                    // Odd file out: carried unchanged into the next round.
                    next.add(one);
                }
            }
            current = next;
        }
        return current.get(0);
    }

    /**
     * Splits the original file into sorted run files of at most
     * {@link #FILE_COUNT} integers each.
     *
     * @throws IllegalArgumentException if {@code FILE_COUNT} is smaller than 1
     */
    private ArrayList<File> split(File file) throws IOException {
        if (FILE_COUNT < 1) {
            throw new IllegalArgumentException("FILE_COUNT must be at least 1 but is " + FILE_COUNT);
        }
        ArrayList<File> runs = new ArrayList<File>();
        int[] buffer = new int[FILE_COUNT];
        DataInputStream in = new DataInputStream(
                new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE));
        try {
            boolean exhausted = false;
            while (!exhausted) {
                int filled = 0;
                while (filled < buffer.length) {
                    try {
                        buffer[filled] = in.readInt();
                    } catch (EOFException endOfInput) {
                        // Only end-of-file ends the split; other I/O errors propagate.
                        exhausted = true;
                        break;
                    }
                    filled++;
                }
                if (filled > 0) {
                    Arrays.sort(buffer, 0, filled);
                    runs.add(writeRun(buffer, filled));
                }
            }
        } finally {
            in.close();
        }
        return runs;
    }

    /** Writes the first {@code count} values to a new run file and returns it. */
    private static File writeRun(int[] values, int count) throws IOException {
        File run = newWorkFile("set");
        DataOutputStream out = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(run)));
        try {
            for (int i = 0; i < count; i++) {
                out.writeInt(values[i]);
            }
        } finally {
            out.close();
        }
        return run;
    }

    /**
     * Merges all sorted run files into {@link #MAIN_FILE} in a single pass and
     * deletes the runs afterwards. A priority queue keyed on each run's head
     * value selects the next output value, so no in-band "exhausted" marker is
     * needed and every {@code int} value is handled. Zero runs produce an empty
     * output; a single run is copied.
     *
     * @param list sorted run files
     * @throws IOException if a run cannot be read or the output cannot be written
     */
    private void multipleMerge(ArrayList<File> list) throws IOException {
        ArrayList<Run> opened = new ArrayList<Run>(list.size());
        DataOutputStream out = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(MAIN_FILE)));
        try {
            PriorityQueue<Run> heads = new PriorityQueue<Run>(Math.max(1, list.size()), BY_HEAD);
            for (File runFile : list) {
                Run run = new Run(runFile);
                opened.add(run);
                if (run.advance()) {
                    heads.add(run);
                }
            }
            while (!heads.isEmpty()) {
                Run smallest = heads.poll();
                out.writeInt(smallest.head);
                if (smallest.advance()) {
                    heads.add(smallest);
                }
            }
        } finally {
            try {
                out.close();
            } finally {
                closeRuns(opened);
            }
        }
        for (File runFile : list) {
            discard(runFile);
        }
    }

    /**
     * Returns the index of the first entry that is not {@code -1}, or
     * {@code 0} when there is none. Legacy helper of the sentinel-based merge;
     * the merge itself no longer uses it.
     *
     * @param ext candidate values, with {@code -1} marking an exhausted slot
     * @return index of the first live slot, or {@code 0} if every slot is {@code -1}
     */
    public int getSIndex(int[] ext) {
        for (int i = 0; i < ext.length; i++) {
            if (ext[i] != -1) {
                return i;
            }
        }
        return 0;
    }

    /**
     * Returns the index of the smallest entry that is not {@code -1}; the
     * first such index wins on ties. Unlike the earlier version, an entry equal
     * to {@code Integer.MAX_VALUE} can be selected. Legacy helper of the
     * sentinel-based merge; the merge itself no longer uses it.
     *
     * @param ext candidate values, with {@code -1} marking an exhausted slot
     * @return index of the smallest live slot, or {@code -1} if there is none
     */
    public int getMinIndex(int[] ext) {
        int index = -1;
        for (int i = 0; i < ext.length; i++) {
            if (ext[i] != -1 && (index == -1 || ext[i] < ext[index])) {
                index = i;
            }
        }
        return index;
    }

    /**
     * Merges two sorted files into a new sorted file and deletes the inputs.
     *
     * @param one first sorted file
     * @param two second sorted file
     * @return the merged file, created in the working directory
     * @throws IOException if reading an input or writing the output fails
     */
    private File merge(File one, File two) throws IOException {
        File output = newWorkFile("merged");
        Run left = new Run(one);
        try {
            Run right = new Run(two);
            try {
                DataOutputStream out = new DataOutputStream(
                        new BufferedOutputStream(new FileOutputStream(output)));
                try {
                    boolean hasLeft = left.advance();
                    boolean hasRight = right.advance();
                    while (hasLeft && hasRight) {
                        // Equal heads are written one per iteration, each as a full int.
                        if (left.head <= right.head) {
                            out.writeInt(left.head);
                            hasLeft = left.advance();
                        } else {
                            out.writeInt(right.head);
                            hasRight = right.advance();
                        }
                    }
                    while (hasLeft) {
                        out.writeInt(left.head);
                        hasLeft = left.advance();
                    }
                    while (hasRight) {
                        out.writeInt(right.head);
                        hasRight = right.advance();
                    }
                } finally {
                    out.close();
                }
            } finally {
                right.close();
            }
        } finally {
            left.close();
        }
        discard(one);
        discard(two);
        return output;
    }

    /**
     * Generates {@link #ITEM_COUNT} random non-negative integers into
     * {@link #MAIN_FILE}, sorts the file with {@link #mSort(File)}, prints the
     * elapsed seconds and appends them to the log file.
     *
     * @param args unused
     * @throws IOException if the data file or the log cannot be written
     */
    public static void main(String[] args) throws IOException {
        Random random = new Random(System.currentTimeMillis());
        DataOutputStream out = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(MAIN_FILE)));
        try {
            for (int i = 0; i < ITEM_COUNT; i++) {
                // Clearing the sign bit is non-negative for every input, unlike
                // negation, which leaves Integer.MIN_VALUE negative.
                out.writeInt(random.nextInt() & Integer.MAX_VALUE);
            }
        } finally {
            out.close();
        }

        ExternalSort sort = new ExternalSort();
        System.out.println("Original:");

        long start = System.currentTimeMillis();
        sort.mSort(MAIN_FILE);
        long end = System.currentTimeMillis();

        System.out.println((end - start) / 1000 + "S");
        recordFile((end - start) / 1000, true);
    }

    /**
     * Appends one line with the current settings and the elapsed time to the
     * file {@code log} in the working directory.
     *
     * @param time     elapsed sorting time in seconds
     * @param isBuffer whether to include the buffer figure in the line
     */
    private static void recordFile(long time, boolean isBuffer)
            throws FileNotFoundException, IOException {
        final long bytesPerMb = 1024L * 1024L;
        BufferedWriter bw = new BufferedWriter(new FileWriter("log", true));
        try {
            bw.write("file_count =" + FILE_COUNT + "; for" + ITEM_COUNT + "data entries"
                    + ITEM_COUNT * 4L / bytesPerMb + "MB sorting time consumption: " + time + " S");
            if (isBuffer) {
                // NOTE(review): BUFFER_SIZE is passed to the streams as a byte count,
                // so the factor 4 kept from the earlier version may overstate it - confirm.
                bw.write("buffer usage:" + BUFFER_SIZE * 4L / bytesPerMb + "MB");
            }
            bw.newLine();
        } finally {
            bw.close();
        }
    }

    /** Creates a new, uniquely named, empty work file in the working directory. */
    private static File newWorkFile(String prefix) throws IOException {
        return File.createTempFile(prefix, ".tmp", new File("."));
    }

    /** Deletes a consumed work file, falling back to deletion at JVM exit. */
    private static void discard(File file) {
        if (!file.delete()) {
            file.deleteOnExit();
        }
    }

    /** Closes every run, then rethrows the first failure, if any. */
    private static void closeRuns(ArrayList<Run> runs) throws IOException {
        IOException firstFailure = null;
        for (Run run : runs) {
            try {
                run.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /** A sorted run file being read, together with the value at its head. */
    private static final class Run {

        private final DataInputStream in;

        /** Value most recently read; meaningful only after {@link #advance()} returned true. */
        private int head;

        Run(File source) throws IOException {
            in = new DataInputStream(
                    new BufferedInputStream(new FileInputStream(source), BUFFER_SIZE));
        }

        /**
         * Reads the next value into {@link #head}.
         *
         * @return {@code false} once the end of the run has been reached
         */
        boolean advance() throws IOException {
            try {
                head = in.readInt();
                return true;
            } catch (EOFException endOfRun) {
                return false;
            }
        }

        void close() throws IOException {
            in.close();
        }
    }
}

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.