ImportJava.io.BufferedReader;ImportJava.io.File;ImportJava.io.FileReader;Importjava.io.IOException;ImportJava.util.Random;Importorg.apache.hadoop.conf.Configuration;ImportOrg.apache.hadoop.fs.Path;ImportOrg.apache.log4j.Logger;ImportOrg.apache.parquet.example.data.Group;Importorg.apache.parquet.example.data.GroupFactory;Importorg.apache.parquet.example.data.simple.SimpleGroupFactory;ImportOrg.apache.parquet.hadoop.ParquetReader;ImportOrg.apache.parquet.hadoop.ParquetReader.Builder;ImportOrg.apache.parquet.hadoop.ParquetWriter;ImportOrg.apache.parquet.hadoop.example.GroupReadSupport;ImportOrg.apache.parquet.hadoop.example.GroupWriteSupport;ImportOrg.apache.parquet.schema.MessageType;ImportOrg.apache.parquet.schema.MessageTypeParser; Public classReadparquet {StaticLogger Logger=logger.getlogger (Readparquet.class); Public Static voidMain (string[] args)throwsException {//parquetwriter ("Test\\parquet-out2", "Input.txt");ParquetReaderV2 ("Test\\parquet-out2"); } Static voidParquetReaderV2 (String Inpath)throwsexception{groupreadsupport Readsupport=NewGroupreadsupport (); Builder<Group> reader= Parquetreader.builder (Readsupport,NewPath (Inpath)); Parquetreader<Group> build=Reader.build (); Group Line=NULL; while((Line=build.read ())! =NULL) {System.out.println (line.tostring ()); } System.out.println ("Read End"); }//New Parquetreader () all construction methods seem to have been deprecated, using the builder above to construct the object Static voidParquetreader (String Inpath)throwsexception{groupreadsupport Readsupport=NewGroupreadsupport (); Parquetreader<Group> reader =New Parquetreader<Group> (NewPath (Inpath), readsupport); Group Line=NULL; while((Line=reader.read ())! 
=NULL) {System.out.println (line.tostring ()); }SYSTEM.OUT.PRINTLN ("Read End"); } /** * * @paramoutpath Output Parquet format *@paramInpath entering plain text files *@throwsIOException*/ Static voidParquetwriter (String outpath,string Inpath)throwsioexception{MessageType Schema= Messagetypeparser.parsemessagetype ("message Pair {\ n" + "required binary city (UTF8); \ n" + "Required binary IP (UTF8); \ n" + "repeated group time {\ n" + "required int32 ttl;\n" + "Required binary ttl2;\n" + "}\n" + "}"); Groupfactory Factory=Newsimplegroupfactory (Schema); Path Path=NewPath (Outpath); Configuration Configuration=NewConfiguration (); Groupwritesupport Writesupport=NewGroupwritesupport (); Writesupport.setschema (schema,configuration); Parquetwriter<Group> writer =NewParquetwriter<group>(Path,configuration,writesupport);
Local files are read in to generate Parquet format files BufferedReader br=NewBufferedReader (NewFileReader (NewFile (Inpath)); String Line=""; Random R=NewRandom (); while((Line=br.readline ())! =NULL) {string[] STRs=line.split ("\\s+"); if(strs.length==2) {Group Group=Factory.newgroup (). Append ("City", strs[0]). Append ("IP", strs[1]); Group Tmpg=group.addgroup ("Time"); Tmpg.append ("TTL", R.nextint (9) +1); Tmpg.append ("Ttl2", R.nextint (9) + "_a"); Writer.write (group); }} System.out.println ("Write End"); Writer.close (); }}
About the schema: writing Parquet-format data requires an explicit schema, while reading "automatically recognizes" the schema from the file itself.
/* * Each field has three attributes: repetition, data type, and field name. The repetition can be one of: * required (appears exactly 1 time) * repeated (appears 0 or more times) * optional (appears 0 or 1 times) * The data type of each field falls into two categories: * group (complex type) * primitive (basic type)
* Data types are
* INT64, INT32, BOOLEAN, BINARY, FLOAT, DOUBLE, INT96, Fixed_len_byte_array
*/
Maven dependency (version 1.7.0 is used here):
<dependency> <groupId>org.apache.parquet</groupId> <artifactId>parquet-hadoop</artifactId> <version>1.7.0</version> </dependency>
Reading and writing Parquet-format data in Java — a Parquet example.