Today I wrote a MapReduce program that pulls data out of an MS SQL Server 2008 database for analysis. When I deployed it to the Hadoop machine and ran it, the job failed with an SQLException complaining about a LIMIT clause.
Strangely, there is no LIMIT anywhere in my SQL statement, so where does it come from? I dug into the source code of the DBInputFormat class:
protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
    Configuration conf) throws IOException {

  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // use database product name to determine appropriate record reader.
    if (dbProductName.startsWith("ORACLE")) {
      // use Oracle-specific db reader.
      return new OracleDBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    } else if (dbProductName.startsWith("MYSQL")) {
      // use MySQL-specific db reader.
      return new MySQLDBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    } else {
      // Generic reader.
      return new DBRecordReader<T>(split, inputClass,
          conf, createConnection(), getDBConf(), conditions, fieldNames,
          tableName);
    }
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
And the relevant part of DBRecordReader's source code, getSelectQuery():
protected String getSelectQuery() {
  StringBuilder query = new StringBuilder();

  // Default codepath for MySQL, HSQLDB, etc. Relies on LIMIT/OFFSET for splits.
  if (dbConf.getInputQuery() == null) {
    query.append("SELECT ");

    for (int i = 0; i < fieldNames.length; i++) {
      query.append(fieldNames[i]);
      if (i != fieldNames.length - 1) {
        query.append(", ");
      }
    }

    query.append(" FROM ").append(tableName);
    query.append(" AS ").append(tableName); // in hsqldb this is necessary
    if (conditions != null && conditions.length() > 0) {
      query.append(" WHERE (").append(conditions).append(")");
    }

    String orderBy = dbConf.getInputOrderBy();
    if (orderBy != null && orderBy.length() > 0) {
      query.append(" ORDER BY ").append(orderBy);
    }
  } else {
    // PREBUILT QUERY
    query.append(dbConf.getInputQuery());
  }

  try {
    query.append(" LIMIT ").append(split.getLength());   // <-- this is where the problem lies
    query.append(" OFFSET ").append(split.getStart());
  } catch (IOException ex) {
    // ignore, will not throw.
  }

  return query.toString();
}
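So for every split the generic DBRecordReader blindly appends a LIMIT/OFFSET clause. With my job configuration, the statement sent to the server ends up looking roughly like this (illustrative, not the exact text from my logs):

select sid,source from LawDB.dbo.case_source where sid<1000 LIMIT 100 OFFSET 0

SQL Server 2008 has no LIMIT/OFFSET syntax (T-SQL only gained OFFSET ... FETCH in SQL Server 2012), so the statement fails with the SQLException I was seeing.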
Finally I found the reason.
It turns out that Hadoop only ships database-specific readers for MySQL (MySQLDBRecordReader) and Oracle (OracleDBRecordReader); every other database falls back to the generic DBRecordReader and its LIMIT/OFFSET query above.
With the cause identified, I used the OracleDBRecordReader implementation as a reference and wrote an MS SQL Server DBRecordReader. The code is as follows.
Code for MSSQLDBInputFormat:
/**
 *
 */
package org.apache.hadoop.mapreduce.lib.db;

import java.io.IOException;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;

/**
 * @author Summer
 * MICROSOFT SQL SERVER
 */
public class MSSQLDBInputFormat<T extends DBWritable> extends DBInputFormat<T> {

    public static void setInput(Job job,
            Class<? extends DBWritable> inputClass,
            String inputQuery, String inputCountQuery, String rowId) {
        job.setInputFormatClass(MSSQLDBInputFormat.class);
        DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
        dbConf.setInputClass(inputClass);
        dbConf.setInputQuery(inputQuery);
        dbConf.setInputCountQuery(inputCountQuery);
        dbConf.setInputFieldNames(new String[]{rowId});
    }

    @Override
    protected RecordReader<LongWritable, T> createDBRecordReader(
            org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split,
            Configuration conf) throws IOException {

        @SuppressWarnings("unchecked")
        Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
        try {
            return new MSSQLDBRecordReader<T>(split, inputClass,
                    conf, createConnection(), getDBConf(), conditions, fieldNames,
                    tableName);
        } catch (SQLException ex) {
            throw new IOException(ex.getMessage());
        }
    }
}
Code for MSSQLDBRecordReader:
/**
 *
 */
package org.apache.hadoop.mapreduce.lib.db;

import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;

/**
 * @author Summer
 *
 */
public class MSSQLDBRecordReader<T extends DBWritable> extends DBRecordReader<T> {

    public MSSQLDBRecordReader(DBInputFormat.DBInputSplit split,
            Class<T> inputClass, Configuration conf, Connection conn, DBConfiguration dbConfig,
            String cond, String[] fields, String table) throws SQLException {
        super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
    }

    @Override
    protected String getSelectQuery() {
        StringBuilder query = new StringBuilder();
        DBConfiguration dbConf = getDBConf();
        String conditions = getConditions();
        String tableName = getTableName();
        String[] fieldNames = getFieldNames();

        // SQL Server-specific codepath: paginate with TOP ... NOT IN instead of LIMIT/OFFSET.
        if (dbConf.getInputQuery() == null) {
            query.append("SELECT ");

            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);
                if (i != fieldNames.length - 1) {
                    query.append(", ");
                }
            }

            query.append(" FROM ").append(tableName);
            if (conditions != null && conditions.length() > 0)
                query.append(" WHERE ").append(conditions);
            String orderBy = dbConf.getInputOrderBy();
            if (orderBy != null && orderBy.length() > 0) {
                query.append(" ORDER BY ").append(orderBy);
            }
        } else {
            // PREBUILT QUERY
            query.append(dbConf.getInputQuery());
        }

        try {
            DBInputFormat.DBInputSplit split = getSplit();
            if (split.getLength() > 0) {
                String querystring = query.toString();
                String id = fieldNames[0];
                query = new StringBuilder();
                query.append("SELECT TOP " + split.getLength() + " * FROM ( ");
                query.append(querystring);
                query.append(" ) a WHERE " + id + " NOT IN (SELECT TOP ").append(split.getEnd());
                query.append(" " + id + " FROM ( ");
                query.append(querystring);
                query.append(" ) b");
                query.append(" )");
                System.out.println("----------------------MICROSOFT SQL SERVER QUERY STRING---------------------------");
                System.out.println(query.toString());
                System.out.println("----------------------MICROSOFT SQL SERVER QUERY STRING---------------------------");
            }
        } catch (IOException ex) {
            // ignore, will not throw.
        }

        return query.toString();
    }
}
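To make the pagination concrete: for a split where getLength() is 100 and getEnd() is 200 (illustrative numbers), the reader would print something roughly like this for my input query:

SELECT TOP 100 * FROM ( select sid,source from LawDB.dbo.case_source where sid<1000 ) a WHERE sid NOT IN (SELECT TOP 200 sid FROM ( select sid,source from LawDB.dbo.case_source where sid<1000 ) b )

That is the classic pre-2012 SQL Server paging trick: take the TOP n rows whose row id is not among the TOP m rows of the same inner query.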
The MapReduce job code:
/**
 *
 */
package com.nltk.sns.mapreduce;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.MSSQLDBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.nltk.sns.ETLUtils;

/**
 * @author Summer
 *
 */
public class LawDataETL {

    public static class CaseETLMapper extends
            Mapper<LongWritable, LawCaseRecord, LongWritable, Text> {

        static final int step = 6;

        LongWritable key = new LongWritable(1);
        Text value = new Text();

        @Override
        protected void map(LongWritable inKey, LawCaseRecord lawCaseRecord, Context context)
                throws IOException, InterruptedException {
            System.out.println("-----------------------------" + lawCaseRecord + "------------------------------");
            key.set(lawCaseRecord.id);
            String source = ETLUtils.format(lawCaseRecord.source);
            List<String> words = ETLUtils.split(source, step);
            for (String w : words) {
                value.set(w);
                context.write(key, value);
            }
        }
    }

    static final String driverClass = "com.microsoft.sqlserver.jdbc.SQLServerDriver";
    static final String dbUrl = "jdbc:sqlserver://192.168.0.100:1433;databaseName=LawDB";
    static final String uid = "sa";
    static final String pwd = "cistjava";
    static final String inputQuery = "select sid,source from LawDB.dbo.case_source where sid<1000";
    static final String inputCountQuery = "select count(1) from LawDB.dbo.case_source where sid<1000";
    static final String jarClassPath = "/user/lib/sqljdbc4.jar";
    static final String outputPath = "hdfs://ubuntu:9000/user/lawdata";
    static final String rowId = "sid";

    public static Job configureJob(Configuration conf) throws Exception {

        String jobName = "etlcase";
        Job job = Job.getInstance(conf, jobName);

        job.addFileToClassPath(new Path(jarClassPath));
        MSSQLDBInputFormat.setInput(job, LawCaseRecord.class, inputQuery, inputCountQuery, rowId);
        job.setJarByClass(LawDataETL.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(CaseETLMapper.class);

        return job;
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(outputPath), true);

        DBConfiguration.configureDB(conf, driverClass, dbUrl, uid, pwd);
        conf.set(MRJobConfig.NUM_MAPS, String.valueOf(10));
        Job job = configureJob(conf);
        System.out.println("------------------------------------------------");
        System.out.println(conf.get(DBConfiguration.DRIVER_CLASS_PROPERTY));
        System.out.println(conf.get(DBConfiguration.URL_PROPERTY));
        System.out.println(conf.get(DBConfiguration.USERNAME_PROPERTY));
        System.out.println(conf.get(DBConfiguration.PASSWORD_PROPERTY));
        System.out.println("------------------------------------------------");

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
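The LawCaseRecord class consumed by the mapper is not shown above; below is a minimal sketch of what it might look like, assuming only the two columns sid and source from the input query (my assumption, not code from the original job). DBInputFormat requires the value class to implement both Writable and DBWritable so it can be populated from the ResultSet:

package com.nltk.sns.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class LawCaseRecord implements Writable, DBWritable {

    long id;        // maps to the sid column
    String source;  // maps to the source column

    @Override
    public void readFields(ResultSet rs) throws SQLException {
        // column order follows the input query: sid, source
        id = rs.getLong(1);
        source = rs.getString(2);
    }

    @Override
    public void write(PreparedStatement ps) throws SQLException {
        ps.setLong(1, id);
        ps.setString(2, source);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readLong();
        source = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);
        out.writeUTF(source);
    }

    @Override
    public String toString() {
        return id + ":" + source;
    }
}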
Code for the helper class ETLUtils, which strips punctuation and whitespace and generates character n-grams:
/**
 *
 */
package com.nltk.sns;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

/**
 * @author Summer
 *
 */
public class ETLUtils {

    public final static String NULL_CHAR = "";
    public final static String PUNCTUATION_REGEX = "[(\\pP)&&[^\\|\\{\\}\\#]]+";
    public final static String WHITESPACE_REGEX = "[\\p{Space}]+";

    public static String format(String s) {
        return s.replaceAll(PUNCTUATION_REGEX, NULL_CHAR).replaceAll(WHITESPACE_REGEX, NULL_CHAR);
    }

    public static List<String> split(String s, int stepN) {
        List<String> splits = new ArrayList<String>();
        if (StringUtils.isEmpty(s) || stepN < 1)
            return splits;
        int len = s.length();
        if (len <= stepN)
            splits.add(s);
        else {
            for (int j = 1; j <= stepN; j++)
                for (int i = 0; i <= len - j; i++) {
                    String key = StringUtils.mid(s, i, j);
                    if (StringUtils.isEmpty(key))
                        continue;
                    splits.add(key);
                }
        }
        return splits;
    }

    public static void main(String[] args) {
        String s = "Shetingting etc. and Jiangpo etc.";
        int stepN = 2;
        List<String> splits = split(s, stepN);
        System.out.println(splits);
    }
}
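As a quick check of what split() produces: for s = "abcd" and stepN = 2 the result is [a, b, c, d, ab, bc, cd], i.e. all character n-grams of length 1 up to stepN, which is exactly what the mapper then emits as values.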
Running the job again, it completed successfully.
This is an initial implementation written mainly to satisfy my own needs; adapt it to your own requirements as necessary.
Frankly, DBRecordReader is not implemented very well. If you read the source of DBRecordReader, MySQLDBRecordReader and OracleDBRecordReader, you will see that DBRecordReader and MySQLDBRecordReader are far too tightly coupled: the generic reader already hard-codes MySQL-style LIMIT/OFFSET. In my opinion a database without its own DBRecordReader implementation should not blow up with an exception; it could simply fall back to reading everything with a single split and a single map task, as sketched below.
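For example, a fallback along these lines would leave the SELECT untouched and rely on a single split reading the whole result set (my own rough sketch to illustrate the idea, not code from Hadoop or from this post; the class name is made up):

package org.apache.hadoop.mapreduce.lib.db;

import java.sql.Connection;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;

public class GenericNoPagingDBRecordReader<T extends DBWritable> extends DBRecordReader<T> {

    public GenericNoPagingDBRecordReader(DBInputFormat.DBInputSplit split,
            Class<T> inputClass, Configuration conf, Connection conn,
            DBConfiguration dbConfig, String cond, String[] fields, String table)
            throws SQLException {
        super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
    }

    @Override
    protected String getSelectQuery() {
        // Build the plain SELECT exactly as DBRecordReader does, but stop before
        // the LIMIT/OFFSET clause; the single map task reads the whole result set.
        StringBuilder query = new StringBuilder();
        DBConfiguration dbConf = getDBConf();
        if (dbConf.getInputQuery() == null) {
            String[] fieldNames = getFieldNames();
            query.append("SELECT ");
            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);
                if (i != fieldNames.length - 1) {
                    query.append(", ");
                }
            }
            query.append(" FROM ").append(getTableName());
            if (getConditions() != null && getConditions().length() > 0) {
                query.append(" WHERE (").append(getConditions()).append(")");
            }
            String orderBy = dbConf.getInputOrderBy();
            if (orderBy != null && orderBy.length() > 0) {
                query.append(" ORDER BY ").append(orderBy);
            }
        } else {
            query.append(dbConf.getInputQuery());
        }
        return query.toString();
    }
}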
Using MapReduce to import HDFS data into HBase
package com.bank.service;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Use MapReduce to bulk-import data into HBase (a map-only job, no reduce function)
 * @author mengyao
 *
 */
public class DataImportToHBase extends Configured implements Tool {

    static class DataImportToHBaseMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        private static String familyName = "info";
        private static String[] qualifiers = {"gzh", "currency", "version", "valuta", "qfTime", "flag", "machineID"};

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] values = line.split("\t");
            if (values.length == 7 && values.length == qualifiers.length) {
                String row = values[0] + "_" + values[1] + "_" + values[2] + "_" + values[3];
                long timestamp = System.currentTimeMillis();
                ImmutableBytesWritable immutable = new ImmutableBytesWritable(Bytes.toBytes(row));
                Put put = new Put(Bytes.toBytes(row));
                for (int i = 0; i < values.length; i++) {
                    String qualifier = qualifiers[i];
                    String val = values[i];
                    put.add(Bytes.toBytes(familyName), Bytes.toBytes(qualifier), timestamp, Bytes.toBytes(val));
                }
                context.write(immutable, put);
            } else {
                System.err.println("error: value length must equal qualifiers length");
            }
        }
    }

    @Override
    public int run(String[] arg0) throws Exception {
        Job job = Job.getInstance(getConf(), DataImportToHBase.class.getSimpleName());
        job.setJarByClass(DataImportToHBase.class);

        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(arg0[0]));
        job.setMapperClass(DataImportToHBaseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);

        TableMapReduceUtil.initTableReducerJob(arg0[1], null, job);
        job.setNumReduceTasks(0);
        TableMapReduceUtil.addDependencyJars(job);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "h5:2181,h6:2181,h7:2181");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("dfs.socket.timeout", "3600000");
        String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("usage: <dataInputDir> <tableName>");
            System.exit(2);
        }
        int status = ToolRunner.run(conf, new DataImportToHBase(), otherArgs);
        System.exit(status);
    }
}
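As a concrete illustration of what the mapper does (the field values here are made up): a tab-separated input line such as

0001  RMB  v1  100  20150101  0  M001

would produce the row key 0001_RMB_v1_100 and seven cells in the info column family, one per qualifier (gzh, currency, version, valuta, qfTime, flag, machineID). Since the job is map-only, each Put is written straight into the HBase table named by the second command-line argument; the job is launched with just two arguments, an HDFS input directory and an existing table name.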