Importing Hive statistical analysis results into a MySQL table (Part 3): using a Hive UDF or GenericUDF

Tags: hive mysql udf genericudf
In the previous two posts I covered two ways to export Hive analysis results into MySQL tables: importing with Sqoop, and using the Hive and MySQL JDBC drivers. This post introduces a third, and widely used, approach: a Hive user-defined function (UDF or GenericUDF) that inserts each record into the database table.
Part 1: Using a UDF
The UDF approach is straightforward: extend the UDF class and override the evaluate method.
1. Write the implementation class
package com.gxnzx.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

import com.gxnzx.hive.util.DBSqlHelper;

public class AnalyzeStatistics extends UDF {

    public String evaluate(String clxxbh, String hphm) {
        // jtxx2 is the target table in the MySQL database
        String sql = "insert into jtxx2 values(?,?)";
        // insert this record into the database
        if (DBSqlHelper.addBatch(sql, clxxbh, hphm)) {
            return clxxbh + " SUCCESS " + hphm;
        } else {
            return clxxbh + " FAILED " + hphm;
        }
    }
}
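The definition of the target table jtxx2 is not shown in the original post. Based on the two string values inserted above and the query output in step 7 (where the first column appears as cllxbh), it was presumably created with something like the following; the column lengths, the primary key, and the character set are assumptions:

create table jtxx2 (
    cllxbh varchar(40) not null,   -- record id taken from the Hive column clxxbh (length assumed)
    hphm   varchar(20),            -- licence plate number (length assumed)
    primary key (cllxbh)           -- a unique key is assumed, as recommended for this export pattern
) default charset=utf8;            -- utf8 assumed so the Chinese plate prefixes store correctly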
2. The database helper method
public static boolean addBatch(String sql, String clxxbh, String hphm) {
    boolean flag = false;
    try {
        conn = DBSqlHelper.getConn();   // open a database connection
        ps = (PreparedStatement) conn.prepareStatement(sql);
        ps.setString(1, clxxbh);
        ps.setString(2, hphm);
        System.out.println(ps.toString());
        ps.execute();
        flag = true;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            ps.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    return flag;
}
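The helper class com.gxnzx.hive.util.DBSqlHelper itself is not shown in the original post. A minimal sketch of what it might look like is below; the JDBC URL, user name, and password are assumptions borrowed from the GenericUDF test in Part 2, not values taken from the missing class:

package com.gxnzx.hive.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class DBSqlHelper {

    // shared by getConn() and the addBatch(...) method shown in step 2
    static Connection conn;
    static PreparedStatement ps;

    static {
        try {
            // register the MySQL driver; optional with JDBC 4 drivers
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    // open (or reuse) a connection to the target MySQL database;
    // the URL and credentials here are assumed values for illustration only
    public static Connection getConn() throws SQLException {
        if (conn == null || conn.isClosed()) {
            conn = DriverManager.getConnection(
                    "jdbc:mysql://192.168.2.133:3306/transport", "hive", "hive");
        }
        return conn;
    }

    // the addBatch(String, String, String) method from step 2 also lives in this class
}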
3. Use Eclipse to package the project as a jar and add it to the Hive class path
hive> add jar hiveudf2.jar;
4. Add the MySQL JDBC driver jar to the Hive class path
hive> add jar /home/hadoopUser/cloud/hive/apache-hive-0.13.1-bin/lib/mysql-connector-java-5.1.18-bin.jar;
5. Create a Hive temporary function
hive> create temporary function analyze as 'com.gxnzx.hive.udf.AnalyzeStatistics';
6. Test
hive> select analyze(clxxbh,hphm) from transjtxx_hbase limit 10;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1428394594787_0034, Tracking URL = http://secondmgt:8088/proxy/application_1428394594787_0034/
Kill Command = /home/hadoopUser/cloud/hadoop/programs/hadoop-2.2.0/bin/hadoop job -kill job_1428394594787_0034
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2015-04-23 10:15:34,355 Stage-1 map = 0%, reduce = 0%
2015-04-23 10:15:51,032 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 7.14 sec
MapReduce Total cumulative CPU time: 7 seconds 140 msec
Ended Job = job_1428394594787_0034
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 7.14 sec   HDFS Read: 256   HDFS Write: 532   SUCCESS
Total MapReduce CPU Time Spent: 7 seconds 140 msec
OK
32100017000000000220140317000015 SUCCESS 魯Q58182
32100017000000000220140317000016 SUCCESS 魯QV4662
32100017000000000220140317000019 SUCCESS 蘇LL8128
32100017000000000220140317000020 SUCCESS 蘇CAH367
32100017000000000220140317000023 SUCCESS 魯Q7899W
32100017000000000220140317000029 SUCCESS 蘇HN3819
32100017000000000220140317000038 SUCCESS 魯C01576
32100017000000000220140317000044 SUCCESS 蘇DT9178
32100017000000000220140317000049 SUCCESS 蘇LZ1112
32100017000000000220140317000052 SUCCESS 蘇K9795警
Time taken: 35.815 seconds, Fetched: 10 row(s)
7. Check the data in the MySQL table
mysql> select * from jtxx2;
+----------------------------------+-------------+
| cllxbh                           | hphm        |
+----------------------------------+-------------+
| 32100017000000000220140317000015 | 魯Q58182    |
| 32100017000000000220140317000016 | 魯QV4662    |
| 32100017000000000220140317000019 | 蘇LL8128    |
| 32100017000000000220140317000020 | 蘇CAH367    |
| 32100017000000000220140317000023 | 魯Q7899W    |
| 32100017000000000220140317000029 | 蘇HN3819    |
| 32100017000000000220140317000038 | 魯C01576    |
| 32100017000000000220140317000044 | 蘇DT9178    |
| 32100017000000000220140317000049 | 蘇LZ1112    |
| 32100017000000000220140317000052 | 蘇K9795警   |
+----------------------------------+-------------+
10 rows in set (0.00 sec)
Part 2: Using a GenericUDF
The GenericUDF approach is more involved to implement; the code below is adapted from someone else's example:
1. Write the UDF class
package com.gxnzx.hive.main;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.IntWritable;

/**
 * AnalyzeGenericUDFDBOutput is designed to output data directly from Hive to a
 * JDBC datastore. This UDF is useful for exporting small to medium summaries
 * that have a unique key.
 *
 * Due to the nature of Hadoop, individual mappers, reducers or entire jobs can
 * fail. If a failure occurs, a mapper or reducer may be retried. This UDF has
 * no way of detecting failures or rolling back a transaction. Consequently,
 * you should only use this to export to a table with a unique key. The unique
 * key should safeguard against duplicate data.
 *
 * To use this UDF, follow these steps: first, package the UDF into a jar file;
 * second, use Hive's "add jar" feature to add the UDF jar to the current class
 * path; third, use "add jar" again to add the JDBC driver jar to the current
 * class path; fourth, use "create temporary function" to create a temporary
 * function backed by the UDF class.
 *
 * Examples for MySQL:
 *   hive> add jar udf.jar
 *   hive> add jar mysql-connector-java-5.1.18-bin.jar
 *   hive> create temporary function analyzedboutput as 'com.gxnzx.hive.main.AnalyzeGenericUDFDBOutput'
 */
@Description(name = "analyzedboutput", value = "_FUNC_(jdbcstring,username,password,preparedstatement,[arguments])"
        + " - sends data to a jdbc driver", extended = "argument 0 is the JDBC connection string\n"
        + "argument 1 is the database user name\n"
        + "argument 2 is the database user's password\n"
        + "argument 3 is an SQL query to be used in the PreparedStatement\n"
        + "argument (4-n) The remaining arguments must be primitive and are "
        + "passed to the PreparedStatement object\n")
@UDFType(deterministic = false)
public class AnalyzeGenericUDFDBOutput extends GenericUDF {

    private static final Log LOG = LogFactory
            .getLog(AnalyzeGenericUDFDBOutput.class.getName());

    private transient ObjectInspector[] argumentOI;
    private transient Connection connection = null;
    private String url;
    private String user;
    private String pass;
    private final IntWritable result = new IntWritable(-1);

    /**
     * @param arguments
     *            argument 0 is the JDBC connection string, argument 1 is the
     *            user name, argument 2 is the password, argument 3 is an SQL
     *            query to be used in the PreparedStatement, arguments (4-n)
     *            must be primitive and are passed to the PreparedStatement
     *            object
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments)
            throws UDFArgumentTypeException {
        argumentOI = arguments;

        // this should be connection
        // url,username,password,query,column1[,columnn]*
        for (int i = 0; i < 4; i++) {
            if (arguments[i].getCategory() == ObjectInspector.Category.PRIMITIVE) {
                PrimitiveObjectInspector poi = ((PrimitiveObjectInspector) arguments[i]);

                if (!(poi.getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING)) {
                    throw new UDFArgumentTypeException(i,
                            "The argument of function should be \""
                                    + Constants.STRING_TYPE_NAME + "\", but \""
                                    + arguments[i].getTypeName() + "\" is found");
                }
            }
        }
        for (int i = 4; i < arguments.length; i++) {
            if (arguments[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentTypeException(i,
                        "The argument of function should be primitive"
                                + ", but \"" + arguments[i].getTypeName()
                                + "\" is found");
            }
        }

        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }

    /**
     * @return 0 on success, 1 on a SQL error, 2 on a connection error
     *         (the result starts at -1)
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        url = ((StringObjectInspector) argumentOI[0])
                .getPrimitiveJavaObject(arguments[0].get());
        user = ((StringObjectInspector) argumentOI[1])
                .getPrimitiveJavaObject(arguments[1].get());
        pass = ((StringObjectInspector) argumentOI[2])
                .getPrimitiveJavaObject(arguments[2].get());

        try {
            connection = DriverManager.getConnection(url, user, pass);
        } catch (SQLException ex) {
            LOG.error("Driver loading or connection issue", ex);
            result.set(2);
        }

        if (connection != null) {
            try {
                PreparedStatement ps = connection
                        .prepareStatement(((StringObjectInspector) argumentOI[3])
                                .getPrimitiveJavaObject(arguments[3].get()));
                for (int i = 4; i < arguments.length; ++i) {
                    PrimitiveObjectInspector poi = ((PrimitiveObjectInspector) argumentOI[i]);
                    ps.setObject(i - 3,
                            poi.getPrimitiveJavaObject(arguments[i].get()));
                }
                ps.execute();
                ps.close();
                result.set(0);
            } catch (SQLException e) {
                LOG.error("Underlying SQL exception", e);
                result.set(1);
            } finally {
                try {
                    connection.close();
                } catch (Exception ex) {
                    LOG.error("Underlying SQL exception during close", ex);
                }
            }
        }

        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        StringBuilder sb = new StringBuilder();
        sb.append("dboutput(");
        if (children.length > 0) {
            sb.append(children[0]);
            for (int i = 1; i < children.length; i++) {
                sb.append(",");
                sb.append(children[i]);
            }
        }
        sb.append(")");
        return sb.toString();
    }
}
2. Package the class as a jar and add it to the Hive class path
hive> add jar hiveGenericUdf.jar;
3. Add the MySQL JDBC driver jar
hive> add jar /home/hadoopUser/cloud/hive/apache-hive-0.13.1-bin/lib/mysql-connector-java-5.1.18-bin.jar;
4. Create a temporary function
hive> create temporary function analyzedboutput as 'com.gxnzx.hive.main.AnalyzeGenericUDFDBOutput';
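Optionally (this verification step is an addition, not part of the original post), you can confirm the function is registered and view the usage text supplied by its @Description annotation:

hive> describe function extended analyzedboutput;   -- optional check, not in the original post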
5. Test
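Note that jtxx2 still held the ten rows inserted in Part 1, while the check in step 6 below starts from an empty table, so the table was presumably cleared before this run, for example with:

mysql> truncate table jtxx2;   -- cleanup step assumed, not shown in the original post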
hive> select analyzedboutput('jdbc:mysql://192.168.2.133:3306/transport','hive','hive','insert into jtxx2 values(?,?)',clxxbh,hphm) from transjtxx_hbase limit 5;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1428394594787_0043, Tracking URL = http://secondmgt:8088/proxy/application_1428394594787_0043/
Kill Command = /home/hadoopUser/cloud/hadoop/programs/hadoop-2.2.0/bin/hadoop job -kill job_1428394594787_0043
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2015-04-23 22:01:46,205 Stage-1 map = 0%, reduce = 0%
2015-04-23 22:02:01,985 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 9.37 sec
MapReduce Total cumulative CPU time: 9 seconds 370 msec
Ended Job = job_1428394594787_0043
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 9.37 sec   HDFS Read: 256   HDFS Write: 10   SUCCESS
Total MapReduce CPU Time Spent: 9 seconds 370 msec
OK
0
0
0
0
0
Time taken: 32.118 seconds, Fetched: 5 row(s)

The six arguments of analyzedboutput are: the MySQL JDBC connection string, the MySQL user name, the password, the SQL insert statement, and the two queried Hive columns clxxbh and hphm that are bound to it. Each returned row is 0, the UDF's success code (1 would indicate an SQL error and 2 a connection failure).
6. Check the data in the MySQL table
mysql> select * from jtxx2;
Empty set (0.00 sec)

mysql> select * from jtxx2;
+----------------------------------+-----------+
| cllxbh                           | hphm      |
+----------------------------------+-----------+
| 32100017000000000220140317000015 | 魯Q58182  |
| 32100017000000000220140317000016 | 魯QV4662  |
| 32100017000000000220140317000019 | 蘇LL8128  |
| 32100017000000000220140317000020 | 蘇CAH367  |
| 32100017000000000220140317000023 | 魯Q7899W  |
+----------------------------------+-----------+
5 rows in set (0.00 sec)