對遠遠大於記憶體的資料進行外排序,在多路比較的時候用敗者樹效率會更高。
這個演算法可以在建立倒排索引的時候使用。下面的範例程式在多路歸併時以線性掃描選出最小值,並未實作敗者樹。
package my.sort;</p><p>import java.io.BufferedInputStream;<br />import java.io.BufferedOutputStream;<br />import java.io.BufferedWriter;<br />import java.io.DataInputStream;<br />import java.io.DataOutputStream;<br />import java.io.File;<br />import java.io.FileInputStream;<br />import java.io.FileNotFoundException;<br />import java.io.FileOutputStream;<br />import java.io.FileWriter;<br />import java.io.IOException;<br />import java.util.ArrayList;<br />import java.util.Arrays;<br />import java.util.Iterator;<br />import java.util.Random;</p><p>/**<br /> * 基於大資料量的外排序演算法,分為二路歸併和多路歸併<br /> * @author java2king<br /> * @link http://blog.csdn.net/Java2King<br /> *<br /> */<br />public class ExternalSort {</p><p>public static int ITEM_COUNT = 10000000; //總數 </p><p> public static int BUFFER_SIZE = 1024*4*1000;// 一次緩衝讀取</p><p> public static int FILE_COUNT = 1024*1000*1*4;// 每個檔案的記錄數1</p><p> public static File MAIN_FILE = new File("mainset");//要排序的檔案</p><p> /**<br /> * 二路歸併<br /> * @param file<br /> * @return<br /> * @throws IOException<br /> */<br /> public File sort(File file) throws IOException {<br /> ArrayList<File> files = split(file);<br /> return process(files);<br /> }<br /> /**<br /> * 多路歸併<br /> * @param file<br /> * @throws IOException<br /> */<br /> public void mSort(File file) throws IOException{<br /> ArrayList<File> files = split(file);<br /> multipleMerge(files);</p><p> }</p><p> // recursive method to merge the lists until we are left with a<br /> // single merged list<br /> private File process(ArrayList<File> list) throws IOException {<br /> if (list.size() == 1) {<br /> return list.get(0);<br /> }<br /> ArrayList<File> inter = new ArrayList<File>();<br /> for (Iterator<File> itr = list.iterator(); itr.hasNext();) {<br /> File one = itr.next();<br /> if (itr.hasNext()) {<br /> File two = itr.next();<br /> inter.add(merge(one, two));<br /> } else {<br /> // return one;<br /> inter.add(one);<br /> }<br /> }<br /> return process(inter);<br /> }<br /> /**<br 
/> * Splits the original file into a number of sub files.<br /> */<br /> private ArrayList<File> split(File file) throws IOException {<br /> ArrayList<File> files = new ArrayList<File>();<br /> int[] buffer = new int[FILE_COUNT];<br /> FileInputStream fr = new FileInputStream(file);<br /> BufferedInputStream bin = new BufferedInputStream(fr,BUFFER_SIZE);<br /> DataInputStream din=new DataInputStream(bin);<br /> boolean fileComplete = false;</p><p> while (!fileComplete) {<br /> int index = buffer.length;<br /> for (int i = 0; i < buffer.length && !fileComplete; i++) {<br /> try {<br /> buffer[i] = din.readInt();<br />} catch (Exception e) {<br />fileComplete = true;<br /> index = i;<br />}<br /> }<br /> if (index != 0 && buffer[0] > -1) {<br /> Arrays.sort(buffer, 0, index);<br /> File f = new File("set" + new Random().nextInt());<br /> // File temp = File.createTempFile("josp", ".tmp", f);<br /> FileOutputStream writer = new FileOutputStream(f);<br /> BufferedOutputStream bOutputStream = new BufferedOutputStream(writer);</p><p> DataOutputStream dout=new DataOutputStream(bOutputStream);<br /> for (int j = 0; j < index; j++) {<br /> dout.writeInt(buffer[j]);<br /> }<br /> dout.close();<br /> bOutputStream.close();<br /> writer.close();<br /> files.add(f);</p><p> }</p><p> }<br /> din.close();<br /> bin.close();<br /> fr.close();<br /> return files;<br /> }<br /> /**<br /> * 多路歸併<br /> * @param list<br /> * @throws IOException<br /> */<br />private void multipleMerge(ArrayList<File> list) throws IOException {</p><p>int fileSize = list.size();</p><p>if(fileSize == 1){<br />return;<br />}</p><p>ArrayList<DataInputStream> dinlist = new ArrayList<DataInputStream>();</p><p>int[] ext = new int[fileSize];//比較數組</p><p>//File output = new File("multipleMerged");<br />FileOutputStream os = new FileOutputStream(MAIN_FILE);<br />BufferedOutputStream bout = new BufferedOutputStream(os);<br />DataOutputStream dout = new DataOutputStream(bout);</p><p>for (int i = 0; i < fileSize; 
i++) {<br />try {<br />dinlist.add(i, new DataInputStream(new BufferedInputStream(<br />new FileInputStream(list.get(i)), BUFFER_SIZE)));<br />} catch (Exception e) {<br />e.printStackTrace();<br />}<br />}</p><p>int index = 0;</p><p>for (int i = 0; i < fileSize; i++) {<br />try {<br />ext[i] = dinlist.get(i).readInt();<br />} catch (Exception e) {<br />System.err.println("file_" + i + "為空白");<br />ext[i] = -1;<br />}<br />}<br />int count = fileSize;<br />int[] sum = new int[fileSize];</p><p>while (count > 1) {</p><p>index = getMinIndex(ext);<br />dout.writeInt(ext[index]);<br />sum[index]++;<br />try {<br />ext[index] = dinlist.get(index).readInt();<br />} catch (Exception e) {<br />ext[index] = -1;<br />count--;<br />dinlist.get(index).close();<br />//System.err.println(index + "空,寫進:" +sum[index]);</p><p>}<br />}<br />int sIndex = getSIndex(ext);<br />dout.writeInt(ext[sIndex]);<br />while (true) {<br />try {<br />dout.writeInt(dinlist.get(sIndex).readInt());<br />} catch (Exception e) {<br />dinlist.get(sIndex).close();<br />break;<br />}<br />}<br />dout.close();<br />}<br /> //找到剩下的最後一個檔案輸入資料流<br /> public int getSIndex(int[] ext){<br /> int result = 0;<br /> for (int i = 0; i < ext.length; i++) {<br />if(ext[i]!= -1){<br />result = i;<br />break;<br />}<br />}<br /> return result;<br /> }<br /> //找到資料中最小的一個<br /> public int getMinIndex(int[] ext){<br /> int min = 2147483647;<br /> int index = -1;<br /> for (int i = 0; i < ext.length; i++) {<br />if(ext[i] != -1 && ext[i] < min){<br />min = ext[i];<br />index = i;<br />}<br />}<br /> return index;<br /> }<br /> /**<br /> * 二路歸併<br /> *<br /> * @param one<br /> * @param two<br /> * @return<br /> * @throws IOException<br /> */<br /> private File merge(File one, File two) throws IOException {<br /> FileInputStream fis1 = new FileInputStream(one);<br /> FileInputStream fis2 = new FileInputStream(two);<br /> BufferedInputStream bin1 = new BufferedInputStream(fis1,BUFFER_SIZE);<br /> BufferedInputStream bin2 = new 
BufferedInputStream(fis2,BUFFER_SIZE);</p><p> DataInputStream din1=new DataInputStream(bin1);<br /> DataInputStream din2=new DataInputStream(bin2); </p><p> File output = new File("merged" + new Random().nextInt());<br /> FileOutputStream os = new FileOutputStream(output);<br /> BufferedOutputStream bout = new BufferedOutputStream(os);<br /> DataOutputStream dout=new DataOutputStream(bout); </p><p> int a = -1;//= din1.readInt();<br /> int b = -1;//= din2.readInt();</p><p> boolean finished = false;<br /> boolean emptyA = false;//<br /> int flag = 0;<br />while (!finished) {</p><p>if (flag != 1) {<br />try {<br />a = din1.readInt();<br />} catch (Exception e) {<br />emptyA = true;<br />break;<br />}<br />}<br />if (flag != 2) {<br />try {<br />b = din2.readInt();<br />} catch (Exception e) {<br />emptyA = false;<br />break;<br />}<br />}<br />if(a > b){<br />dout.writeInt(b);<br />flag = 1;<br />}else if( a < b){<br />dout.writeInt(a);<br />flag = 2;<br />}else if(a == b){<br />dout.write(a);<br />dout.write(b);<br />flag = 0;<br />}<br />}<br />finished = false;<br />if(emptyA){<br />dout.writeInt(b);<br />while(!finished){<br />try {<br />b = din2.readInt();<br />} catch (Exception e) {<br />break;<br />}<br />dout.writeInt(b);<br />}<br />}else{<br />dout.writeInt(a);<br />while(!finished){<br />try {<br />a = din1.readInt();<br />} catch (Exception e) {<br />break;<br />}<br />dout.writeInt(a);<br />}<br />}<br />dout.close();<br /> os.close();<br /> bin1.close();<br /> bin2.close();<br /> bout.close();<br /> return output;<br /> }</p><p> /**<br /> * @param args<br /> * @throws IOException<br /> */<br /> public static void main(String[] args) throws IOException {</p><p> Random random = new Random(System.currentTimeMillis());<br /> FileOutputStream fw = new FileOutputStream(MAIN_FILE);<br /> BufferedOutputStream bout = new BufferedOutputStream(fw);<br /> DataOutputStream dout=new DataOutputStream(bout); </p><p> for (int i = 0; i < ITEM_COUNT; i++) {<br /> int ger = 
random.nextInt();<br /> ger = ger < 0 ? -ger : ger;<br /> dout.writeInt(ger);</p><p> }<br /> dout.close();<br /> bout.close();<br /> fw.close();<br /> ExternalSort sort = new ExternalSort();<br /> System.out.println("Original:");</p><p> long start = System.currentTimeMillis();<br /> sort.mSort(MAIN_FILE);</p><p> long end = System.currentTimeMillis();<br /> System.out.println((end - start)/1000 + "s");<br /> recordFile((end - start)/1000 ,true);<br /> }</p><p> private static void recordFile(long time,boolean isBuffer)<br /> throws FileNotFoundException, IOException {<br /> BufferedWriter bw = new BufferedWriter(new FileWriter("log",true));<br /> bw.write("FILE_COUNT = "+FILE_COUNT+";對"+ ITEM_COUNT + "條資料 "+ ITEM_COUNT*4/(1024*1204) +"MB排序耗時:" + time + "s ");<br /> if(isBuffer){<br /> bw.write(" 使用緩衝:"+BUFFER_SIZE*4/(1024*1204) +"MB");<br /> }<br /> bw.newLine();<br /> bw.close();<br /> }</p><p>}