This problem has a precondition: the data in the large file contains no duplicates. If duplicates are present, this approach can only produce the sorted result of the data after deduplication.
Under that precondition, we can create one large array in which each array subscript (index) represents an integer. The array elements are assigned while traversing the file: if the integer represented by a subscript occurs in the file, the corresponding element is set to 1; otherwise it stays 0. Afterwards we traverse the array once more and write every subscript whose element is 1 to another file, so that the other file contains the data in sorted order.
In fact, we can use vector<bool> or bitset<N> to implement the array described above. This is the so-called bitmap algorithm, which is implemented as follows:
[Cpp]
// Sort the total number of 10 to 7 files in sequence, and write the collation result to the sorted_data.txt file.
// Use bitmap.
Void sort_big_file_with_single_data ()
{
Timer t;
// Generate a set of large numbers and arrange them randomly
Const int n= 5000000;
Vector <int>;
For (int I = 0; I <n; I ++)
A. push_back (I );
Variate_generator <mt19937, uniform_int <> myrandom (mt19937 (), uniform_int <> (0, n-1 ));
For (int I = 0; I <n; ++ I)
Swap (a [myrandom ()], a [myrandom ()]);
// Write data to a file
Ofstream fout ("data.txt", ios: binary );
If (! Fout)
{
Cout <"can not open file to write" <endl;
}
For (int I = 0; I <n; I ++)
{
Fout. write (reinterpret_cast <const char *> (& a [I]), sizeof (int ));
}
Fout. close ();
// Read the first 100 pieces of data from the file and display
Ifstream fin;
Fin. open ("data.txt", ios: binary );
For (int I = 0, num = 0; I <100; I ++)
{
Fin. read (reinterpret_cast <char *> (& num), sizeof (int ));
If (fin. eof ())
{
Break;
}
Cout <num <"";
}
Fin. close ();
// Sort files
Bitset <n> bit;
Bit. reset ();
// Ifstream fin;
Fin. open ("data.txt", ios: binary );
Int I = 0;
While (true)
{
Fin. read (reinterpret_cast <char *> (& I), sizeof (int ));
If (fin. eof ())
{
Break;
}
Bit [I] = true;
}
Fin. close ();
// Output the sorted file to sorted_data.txt
// Ofstream fout;
Fout. open ("sorted_data.txt", ios: binary );
For (int I = 0; I <n; I ++)
{
If (bit [I] = true)
{
Fout. write (reinterpret_cast <const char *> (& I), sizeof (int ));
}
}
Fout. close ();
Cout <"sort data" <t. elapsed () <"seconds needed" <endl;
// Read sorted files and display the first 100 data records
Fin. open ("sorted_data.txt", ios: binary );
For (int I = 0, num = 0; I <100; I ++)
{
Fin. read (reinterpret_cast <char *> (& num), sizeof (int ));
If (fin. eof ())
{
Break;
}
Cout <num <"";
}
Fin. close ();
}
// Sort the total number of 10 to 7 files in sequence, and write the collation result to the sorted_data.txt file.
// Use bitmap.
Void sort_big_file_with_single_data ()
{
Timer t;
// Generate a set of large numbers and arrange them randomly
Const int n= 5000000;
Vector <int>;
For (int I = 0; I <n; I ++)
A. push_back (I );
Variate_generator <mt19937, uniform_int <> myrandom (mt19937 (), uniform_int <> (0, n-1 ));
For (int I = 0; I <n; ++ I)
Swap (a [myrandom ()], a [myrandom ()]);
// Write data to a file
Ofstream fout ("data.txt", ios: binary );
If (! Fout)
{
Cout <"can not open file to write" <endl;
}
For (int I = 0; I <n; I ++)
{
Fout. write (reinterpret_cast <const char *> (& a [I]), sizeof (int ));
}
Fout. close ();
// Read the first 100 pieces of data from the file and display
Ifstream fin;
Fin. open ("data.txt", ios: binary );
For (int I = 0, num = 0; I <100; I ++)
{
Fin. read (reinterpret_cast <char *> (& num), sizeof (int ));
If (fin. eof ())
{
Break;
}
Cout <num <"";
}
Fin. close ();
// Sort files
Bitset <n> bit;
Bit. reset ();
// Ifstream fin;
Fin. open ("data.txt", ios: binary );
Int I = 0;
While (true)
{
Fin. read (reinterpret_cast <char *> (& I), sizeof (int ));
If (fin. eof ())
{
Break;
}
Bit [I] = true;
}
Fin. close ();
// Output the sorted file to sorted_data.txt
// Ofstream fout;
Fout. open ("sorted_data.txt", ios: binary );
For (int I = 0; I <n; I ++)
{
If (bit [I] = true)
{
Fout. write (reinterpret_cast <const char *> (& I), sizeof (int ));
}
}
Fout. close ();
Cout <"sort data" <t. elapsed () <"seconds needed" <endl;
// Read sorted files and display the first 100 data records
Fin. open ("sorted_data.txt", ios: binary );
For (int I = 0, num = 0; I <100; I ++)
{
Fin. read (reinterpret_cast <char *> (& num), sizeof (int ));
If (fin. eof ())
{
Break;
}
Cout <num <"";
}
Fin. close ();
}
The code above processes a file of 5 million integers, writes the sorted result to another file, and displays the first part of the input and output files as a test result.
The timer used here is the timer from the Boost library. If you do not have Boost installed, download it, decompress it, and add the boost folder to the include directories of the VC++ IDE.
Author: ClamReason