Hadoop is known to process a single large file far more efficiently than many small files, and each file, however small, also consumes HDFS namespace (its metadata is held in NameNode memory). So small files are often merged together.
1. getmerge
Hadoop ships with a command-line tool, getmerge, which copies a set of files from HDFS to the local machine, concatenating them along the way.
Reference: http://hadoop.apache.org/common/docs/r0.19.2/cn/hdfs_shell.html
How to use: hadoop fs -getmerge <src> <localdst> [addnl]
It accepts a source directory and a destination file as input, and concatenates all files in the source directory into the local destination file. addnl is optional and specifies that a newline be appended at the end of each file.
A few words on the FS shell: FileSystem (FS) shell commands are invoked in the form bin/hadoop fs <args>. All FS shell commands take URI paths as parameters, where the URI format is scheme://authority/path.
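For example, a hypothetical invocation (the paths are made up) that pulls every file under an HDFS directory into a single local file, appending a newline after each one, in the r0.19-era syntax documented above:

hadoop fs -getmerge /user/hadoop/logs /tmp/logs-merged.txt addnl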
2. putmerge
Here local small files are merged as they are uploaded to the HDFS file system.
One method is to write a local script that first merges the small files into one large file and then uploads that large file; this method consumes a lot of local disk space.
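A minimal sketch of this first method, assuming the small files sit in a local directory smallfiles/ (all paths here are hypothetical):

cat smallfiles/* > merged.txt
hadoop fs -put merged.txt /user/hadoop/merged.txt

The intermediate merged.txt is why this approach needs as much free local disk as the combined size of the inputs.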
The other method is to merge during the copy itself, while uploading. Reference: "Hadoop in Action".
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

// Parameter 1 is a local directory; parameter 2 is a file on HDFS.
public class PutMerge {

    public static void putMergeFunc(String LocalDir, String fsFile) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);          // fs is the HDFS file system
        FileSystem local = FileSystem.getLocal(conf);  // local file system

        Path localDir = new Path(LocalDir);
        Path HDFSFile = new Path(fsFile);

        FileStatus[] status = local.listStatus(localDir);  // list the input directory
        FSDataOutputStream out = fs.create(HDFSFile);      // create the output file on HDFS

        for (FileStatus st : status) {
            Path temp = st.getPath();
            FSDataInputStream in = local.open(temp);
            // Copy the contents of the in stream into out with a 4096-byte
            // buffer. The final 'false' tells copyBytes not to close the
            // streams, so out stays open for the next file.
            IOUtils.copyBytes(in, out, 4096, false);
            in.close();  // close the current input stream once it is copied
        }
        out.close();
    }

    public static void main(String[] args) throws IOException {
        String l = "/home/kqiao/hadoop/MyHadoopCodes/putmergeFiles";
        String f = "hdfs://ubuntu:9000/user/kqiao/test/putmergeTest";
        putMergeFunc(l, f);
    }
}
3. A MapReduce job that packs small files into a SequenceFile
From: "The Hadoop authoritative guide"
The approach implements an InputFormat that treats an entire file as a single record. First, the job driver and mapper:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Packs small files into a SequenceFile, using the whole-file InputFormat below.
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {

    // Static inner class used as the mapper.
    static class SequenceFileMapper extends
            Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

        private Text filenameKey;

        // setup() is called once before the task starts; here it initializes filenameKey.
        @Override
        protected void setup(Context context) {
            InputSplit split = context.getInputSplit();
            Path path = ((FileSplit) split).getPath();
            filenameKey = new Text(path.toString());
        }

        @Override
        public void map(NullWritable key, BytesWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(filenameKey, value);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);

        Path output = new Path("hdfs://192.168.0.211:9000/inputredlinetext");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        FileInputFormat.addInputPath(job, new Path("/inputtext"));
        FileOutputFormat.setOutputPath(job, new Path("/inputredlinetext"));

        // Note the input and output formats set here: the input format decides
        // how files are divided into splits and read back as records.
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // These are the job's final output key/value types; be careful to
        // match them to the mapper's output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        job.setMapperClass(SequenceFileMapper.class);
        job.setJarByClass(SmallFilesToSequenceFileConverter.class);
        job.setJobName("SmallFilesToSequenceFileConverter");
        job.setPartitionerClass(HashPartitioner.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
        System.exit(exitCode);
    }
}
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// The InputFormat used by the job above; it delegates to the custom
// RecordReader implemented below.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    // Never split a file: each file is read in full as a single record.
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }
}
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// The custom RecordReader that serves WholeFileInputFormat. All six
// overridden methods are abstract methods required by RecordReader.
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable value = new BytesWritable();
    private boolean processed = false;  // whether the record has been processed

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            byte[] contents = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                // Read the whole file into the contents byte array using the
                // readFully method of the IOUtils utility class.
                IOUtils.readFully(in, contents, 0, contents.length);
                // BytesWritable is a byte sequence usable as a key or value
                // (ByteWritable, by contrast, holds a single byte). Set the
                // value to the file contents.
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // Nothing to do.
    }
}
With these classes in place, the custom FileInputFormat processes each entire file as a single record.
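A hypothetical way to launch the job, assuming the three classes are packed into smallfiles.jar (the input and output paths are hard-coded in run(), so no arguments are needed):

hadoop jar smallfiles.jar SmallFilesToSequenceFileConverter

To spot-check the result, here is a minimal reader sketch (not from the book; the class name SequenceFileLister is made up) that walks the generated SequenceFile and prints each original file name with its stored size:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Reads back the merged SequenceFile and prints each original file name
// together with the length of its contents, to verify the conversion.
public class SequenceFileLister {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);  // a part file produced by the job
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            Text key = new Text();                    // the original file name
            BytesWritable value = new BytesWritable(); // the original file bytes
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value.getLength() + " bytes");
            }
        } finally {
            reader.close();
        }
    }
}

Run against a part file of the job's output (for example /inputredlinetext/part-r-00000), it should list one entry per original small file.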