/**
* Generate the list of files and make them into filesplits.
* @param job The job context
* @throws IOException
*/
Public list<inputsplit> getsplits (Jobcontext job) throws IOException {
Stopwatch SW = new Stopwatch (). Start ();
//Get the minimum value a inputsplit can contain
Long minSize = Math.max (Getformatminsplitsize (), getminsplitsize (Job));
//Get the maximum value a inputsplit can contain
Long maxSize = getmaxsplitsize (Job);
Generate splits
List<inputsplit> splits = new arraylist<inputsplit> ();
list<filestatus> files = liststatus (Job);
/*
* As a result, if there are 1 million small files, it will loop 1 million times, and generate at least 1 million inputsplit, it will contain at least 1 million map tasks
* If the default size of a inputsplit is a block size, i.e. 64M
* A 20M file will produce a inputsplit, a map task
* A 80M file will generate two Inputsplit, two map tasks
* Two files of 20M respectively produce two inputsplit, two map tasks
* A 20M, a 70M file will produce three inputsplit, three map tasks in total
*/
for (Filestatus file:files) {
Path PATH = File.getpath ();
Long length = File.getlen ();
if (length! = 0) {
Blocklocation[] blklocations;
if (file instanceof locatedfilestatus) {
Blklocations = ((locatedfilestatus) file). Getblocklocations ();
} else {
FileSystem fs = Path.getfilesystem (Job.getconfiguration ());
blklocations = fs.getfileblocklocations (file, 0, length);
} if (issplitable (Job, path)) {
//Get HDFs the default block size
Long blockSize = File.getblocksize ();
//Calculate the size of a inputsplit
Long splitsize = Computesplitsize (BlockSize, MinSize, maxSize);
Long bytesremaining = length;
while ((double) bytesremaining)/splitsize > Split_ SLOP) {
int blkindex = Getblockindex ( Blklocations, length-bytesremaining);
Splits.add (makesplit (Path, Length-bytesremaining, Splitsize,
blklocations[blkindex].gethosts (),
blklocations[blkindex].getcachedhosts ()));
bytesremaining-= splitsize;
}
if (bytesremaining! = 0) {
int blkindex = Getblockindex (blklocations, length-bytesremaining);
Splits.add (Makesplit (Path, length-bytesremaining, BytesRemaining,
Blklocations[blkindex].gethosts (),
Blklocations[blkindex].getcachedhosts ()));
}
} else {//not splitable
Splits.add (makesplit (path, 0, length, blklocations[0].gethosts (),
Blklocations[0].getcachedhosts ()));
}
} else {
Create empty hosts array for zero length files
Splits.add (makesplit (path, 0, length, new string[0]));
}
}
Save the number of input files for Metrics/loadgen
Job.getconfiguration (). Setlong (Num_input_files, Files.size ());
Sw.stop ();
if (log.isdebugenabled ()) {
Log.debug ("Total # of splits generated by getsplits:" + splits.size () + ", Timetaken:" + sw.elapsedmillis ());
}
return splits;
}
// FileInputFormat guide: getSplits