Operating HBase with Spark


import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client._
import org.apache.spark.SparkContext
import scala.collection.JavaConversions._

/**
 * HBase 1.0.0 new API: basic CRUD operation example
 */
object HBaseNewAPI {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "SparkHBase")

    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "master")

    // Creating a Connection is heavyweight work; it is thread-safe and is the entry point for operating HBase
    val conn = ConnectionFactory.createConnection(conf)
    // Get the Admin object from the Connection (equivalent to the former HBaseAdmin)
    val admin = conn.getAdmin

    // The table this example operates on
    val userTable = TableName.valueOf("user")

    // Create the "user" table
    val tableDescr = new HTableDescriptor(userTable)
    tableDescr.addFamily(new HColumnDescriptor("basic".getBytes))
    println("Creating table `user`.")
    if (admin.tableExists(userTable)) {
      admin.disableTable(userTable)
      admin.deleteTable(userTable)
    }
    admin.createTable(tableDescr)
    println("Done!")

    try {
      // Get the "user" table
      val table = conn.getTable(userTable)
      try {
        // Prepare to insert a row with key id001
        val p = new Put("id001".getBytes)
        // Specify column and value for the Put operation (the former Put.add method is deprecated)
        p.addColumn("basic".getBytes, "name".getBytes, "wuchong".getBytes)
        // Submit
        table.put(p)

        // Query one row
        val g = new Get("id001".getBytes)
        val result = table.get(g)
        val value = Bytes.toString(result.getValue("basic".getBytes, "name".getBytes))
        println("GET id001: " + value)

        // Scan data
        val s = new Scan()
        s.addColumn("basic".getBytes, "name".getBytes)
        val scanner = table.getScanner(s)
        try {
          for (r <- scanner) {
            println("Found row: " + r)
            println("Found value: " + Bytes.toString(r.getValue("basic".getBytes, "name".getBytes)))
          }
        } finally {
          // Make sure the scanner is closed
          scanner.close()
        }

        // Delete a row, in a way similar to Put
        val d = new Delete("id001".getBytes)
        d.addColumn("basic".getBytes, "name".getBytes)
        table.delete(d)
      } finally {
        if (table != null) table.close()
      }
    } finally {
      conn.close()
    }
  }
}
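To compile and run the example above, the project needs the Spark core and HBase client artifacts on the classpath. The following sbt snippet is a minimal sketch; the exact artifact versions are assumptions and should be matched to the Spark and HBase releases actually running on your cluster.

// build.sbt (sketch) -- versions are assumptions, adjust to your environment
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"   % "1.3.1" % "provided",
  "org.apache.hbase" %  "hbase-client" % "1.0.0",
  "org.apache.hbase" %  "hbase-common" % "1.0.0",
  // hbase-server provides TableInputFormat/TableOutputFormat used in the next example
  "org.apache.hbase" %  "hbase-server" % "1.0.0"
)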

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.client._

/**
 * Spark reads and writes HBase
 */
object SparkOnHBase {

  def convertScanToString(scan: Scan) = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray)
  }

  def main(args: Array[String]) {
    val sc = new SparkContext("local", "SparkOnHBase")

    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "master")

    // ====== Save an RDD to HBase ========
    // step 1: JobConf setup
    val jobConf = new JobConf(conf, this.getClass)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "user")

    // step 2: map the RDD to the table
    // The schema of an HBase table generally looks like:
    //   row   cf:col_1   cf:col_2
    // while in Spark we operate on an RDD of tuples such as (1, "lilei", 14), (2, "hanmei", 18).
    // We need to convert RDD[(uid: Int, name: String, age: Int)] into RDD[(ImmutableBytesWritable, Put)],
    // so we define a convert function to do this conversion work
    def convert(triple: (Int, String, Int)) = {
      val p = new Put(Bytes.toBytes(triple._1))
      p.addColumn(Bytes.toBytes("basic"), Bytes.toBytes("name"), Bytes.toBytes(triple._2))
      p.addColumn(Bytes.toBytes("basic"), Bytes.toBytes("age"), Bytes.toBytes(triple._3))
      (new ImmutableBytesWritable, p)
    }

    // step 3: read RDD data from somewhere and convert
    val rawData = List((1, "lilei", 14), (2, "hanmei", 18), (3, "someone", 38))
    val localData = sc.parallelize(rawData).map(convert)

    // step 4: use `saveAsHadoopDataset` to save the RDD to HBase
    localData.saveAsHadoopDataset(jobConf)
    // =================================

    // ====== Load an RDD from HBase ========
    // Use `newAPIHadoopRDD` to load an RDD from HBase:
    // read data directly from HBase and turn it into an RDD[K, V] that Spark can operate on

    // Set the table name to query
    conf.set(TableInputFormat.INPUT_TABLE, "user")

    // Add a filter: age must be greater than or equal to 18
    val scan = new Scan()
    scan.setFilter(new SingleColumnValueFilter("basic".getBytes, "age".getBytes,
      CompareOp.GREATER_OR_EQUAL, Bytes.toBytes(18)))
    conf.set(TableInputFormat.SCAN, convertScanToString(scan))

    val usersRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count = usersRDD.count()
    println("Users RDD Count: " + count)
    usersRDD.cache()

    // Traverse and print the results
    usersRDD.foreach { case (_, result) =>
      val key = Bytes.toInt(result.getRow)
      val name = Bytes.toString(result.getValue("basic".getBytes, "name".getBytes))
      val age = Bytes.toInt(result.getValue("basic".getBytes, "age".getBytes))
      println("Row key: " + key + " Name: " + name + " Age: " + age)
    }
    // =================================
  }
}
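The write path above uses the old-style org.apache.hadoop.hbase.mapred.TableOutputFormat together with a JobConf. If you prefer to stay on the new MapReduce API throughout, the same RDD[(ImmutableBytesWritable, Put)] can be written with saveAsNewAPIHadoopDataset. The snippet below is a sketch, not part of the original example; it assumes the conf and localData values defined in the code above are in scope.

// Alternative write path (sketch): new-API TableOutputFormat instead of the mapred one.
// Assumes `conf` (HBaseConfiguration) and `localData: RDD[(ImmutableBytesWritable, Put)]` from above.
import org.apache.hadoop.hbase.mapreduce.{TableOutputFormat => NewTableOutputFormat}
import org.apache.hadoop.mapreduce.Job

val job = Job.getInstance(conf)
job.setOutputFormatClass(classOf[NewTableOutputFormat[ImmutableBytesWritable]])
job.getConfiguration.set(NewTableOutputFormat.OUTPUT_TABLE, "user")
localData.saveAsNewAPIHadoopDataset(job.getConfiguration)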


Original gist: https://gist.github.com/wuchong/95630f80966d07d7453b#file-hbasenewapi-scala

Related: http://wuchong.me/blog/2015/04/04/spark-on-yarn-cluster-deploy/
