No lengthy discussion here — this post goes straight to the code.
package zhouls.bigdata.myWholeHadoop.HDFS.hdfs7;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;
/**
 * Merges many small local files into one per-date file on HDFS.
 */
public class Mergesmallfilestohdfs
{
private static FileSystem fs = null; Defines the file system object, which is on the HDFs
private static FileSystem local = NULL; Defines the file system object, which is the local
/**
* @function Main
* @param args
* @throws IOException
* @throws URISyntaxException
*/
public static void Main (string[] args) throws Ioexception,urisyntaxexception
{
List ();
}
/**
*
* @throws IOException
* @throws URISyntaxException
*/
public static void list () throws IOException, URISyntaxException
{
Read the Hadoop configuration file
Configuration conf = new configuration ();
File system Provider and creation of filesystem objects, running on-premises mode
Uri uri = new Uri ("hdfs://hadoopmaster:9000");
fs = Filesystem.get (URI, conf);
Get local file system
Local = filesystem.getlocal (conf);
The SVN file under the filter directory
filestatus[] Dirstatus = local.globstatus (New Path ("d://data/tvdata/*"), New Regexexcludepathfilter ("^.*svn$"));
Get all file paths under the D:\Data\tvdata directory
path[] dirs = fileutil.stat2paths (dirstatus);
Fsdataoutputstream out = null;
Fsdatainputstream in = null;
for (Path dir:dirs)
{//Take 2012-09-17 for example
Place folder name 2012-09-17-minus, direct, get 20120901 folder name
String fileName = Dir.getname (). Replace ("-", "");//File name
Only accept. txt files in the 20120917 date directory
filestatus[] Localstatus = local.globstatus (new Path (dir+ "/*"), New Regexacceptpathfilter ("^.*txt$"));
Get all the files in the 20120917 date directory
path[] listedpaths = fileutil.stat2paths (localstatus);
Output path
Path block = new Path ("hdfs://hadoopmaster:9000/middle/tv/" + fileName + ". txt");
SYSTEM.OUT.PRINTLN ("Merged file name:" +filename+ ". txt");
Open the output stream
Out = Fs.create (block);
Loop 20120917 All files under the date directory
for (Path p:listedpaths)
{
in = Local.open (P);//Open input stream
Ioutils.copybytes (in, out, 4096, false); Copying data
Close the input stream
In.close ();
}
if (out! = null)
{
Turn off the output stream
Out.close ();
}
After looping through all the files in the 20120917-date directory, then 20120918,20120919,,,
}
}
/**
*
* @function filter files in regex format
*
*/
public static class Regexexcludepathfilter implements Pathfilter
{
Private final String regex;
Public Regexexcludepathfilter (String regex)
{
This.regex = regex;
}
public boolean accept (path Path)
{
TODO auto-generated Method Stub
Boolean flag = Path.tostring (). Matches (regex);
return!flag;
}
}
/**
*
* @function accept files in regex format
*
*/
public static class Regexacceptpathfilter implements Pathfilter
{
Private final String regex;
Public Regexacceptpathfilter (String regex)
{
This.regex = regex;
}
public boolean accept (path Path)
{
TODO auto-generated Method Stub
Boolean flag = Path.tostring (). Matches (regex);
return flag;
}
}
}
Hadoop HDFS Programming API getting-started series: merging small files into HDFS (part 3).