The reason for implementing this code is:
- I wanted to use MapReduce; I have used AWS EMR and set up a pseudo-distributed Hadoop cluster before, but both felt too heavyweight for this;
- I know a little MySQL (I would have liked to use MongoDB, but I am not very familiar with it);
- The amount of data is not very large, at least for me.
- I hope reliability will not be a problem; the local file system can still be trusted.
Design ideas are as follows:
- Init stage: add the paths of all required input files to a list file, input_file_list.txt.
- Map stage: read input_file_list.txt, then read each line of every listed file and map it to a key-value pair.
Since a key may contain special characters (which are unsafe in file names), MySQL is used to store a mapping from a numeric ID to each key.
- Reduce phase: for each key, read the corresponding temporary file and eventually build a list of name-value maps, each corresponding to one JSON object, such as:
{ "name": "zifeiy", "age": 88 }
, all the JSON objects stored in a result file reduceResult.txt
.
- Result-processing stage: parse the reduceResult.txt file and finally generate the resulting CSV or Excel file.
Main code:
package com.zifeiy.snowflake.tools.mapreduce.v1;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.google.gson.Gson;
import com.zifeiy.snowflake.assist.CsvOneLineParser;
import com.zifeiy.snowflake.assist.FileHelper;
import com.zifeiy.snowflake.assist.ReduceReader;

import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;

/**
 * A single-machine "MapReduce" skeleton backed by the local file system and MySQL.
 *
 * <p>Pipeline (driven by {@link #mapReduce()}):
 * <ol>
 *   <li>init: subclasses register input files via {@link #addInputPath(File)};</li>
 *   <li>map: every line of every input file is passed to {@link #map(String)},
 *       which stores key/value pairs via {@link #setKeyValuePair(String, String)};
 *       keys are mapped to numeric ids in a per-task MySQL temp table because
 *       keys may contain characters that are unsafe in file names;</li>
 *   <li>reduce: for each distinct key, {@link #reduce(String, ReduceReader)} reads
 *       that key's value file and emits JSON rows via {@link #addParamList(List)};</li>
 *   <li>generate: {@link #generateFile(String[], String[])} turns the JSON rows
 *       into a CSV (large results) or Excel (small results) file.</li>
 * </ol>
 */
public abstract class MapReduceBaseVersion1 {

    // JDBC connection parameters for the local MySQL instance.
    private static final String APPENDED_DB_INFO =
            "?useUnicode=true&characterEncoding=utf8"
            + "&rewriteBatchedStatements=true"
            + "&useLegacyDatetimeCode=false"
            + "&serverTimezone=Asia/Shanghai"
            + "&useSSL=false";
    private static final String CLASS_NAME = "com.mysql.cj.jdbc.Driver";
    private static final String URL =
            "jdbc:mysql://localhost:3306/snowflake" + APPENDED_DB_INFO;
    private static final String USERNAME = "root";
    private static final String PASSWORD = "password";

    /** Root directory under which every task gets its own working directory. */
    public static final String TASK_ROOT_PATH = "D:\\snowflake\\task";

    private Connection connection = null;
    private File inputListFile = null;
    private File reduceResultFile = null;
    private File resultFile = null;
    private int taskId;

    /**
     * Init stage: register an input file by appending its absolute path
     * (one path per line, CRLF-terminated) to input_file_list.txt.
     */
    public void addInputPath(File file) throws IOException {
        FileHelper.appendFile(inputListFile, file.getAbsolutePath() + "\r\n");
    }

    /**
     * Map stage: record one key/value pair.
     *
     * <p>The key is translated to a numeric id through the per-task MySQL table
     * (inserting it on first sight), and the value is appended to the id's
     * temporary file under {@code <task>/tmp/<id>.txt}.
     *
     * @throws Exception when the id cannot be resolved or on I/O / SQL errors
     */
    public void setKeyValuePair(String key, String value) throws Exception {
        int id = lookupKeyId(key);
        if (id == -1) {
            // First time this key is seen: insert it and read back the
            // auto-increment id directly (no fragile re-SELECT needed).
            try (PreparedStatement ps = connection.prepareStatement(
                    "INSERT INTO tmp" + taskId + " (kname) VALUES (?)",
                    Statement.RETURN_GENERATED_KEYS)) {
                ps.setString(1, key);
                ps.executeUpdate();
                try (ResultSet rs = ps.getGeneratedKeys()) {
                    if (rs.next()) {
                        id = rs.getInt(1);
                    }
                }
            }
        }
        if (id == -1) {
            throw new Exception("Set key value pair failed: key = " + key + ", value = " + value);
        }
        File tmpFile = new File(TASK_ROOT_PATH + File.separator + taskId
                + File.separator + "tmp" + File.separator + id + ".txt");
        if (!tmpFile.exists()) {
            tmpFile.createNewFile();
        }
        FileHelper.appendFile(tmpFile, value + "\r\n");
    }

    /**
     * Look up the numeric id of {@code key} in the per-task table.
     * Uses a parameterized query (keys may contain quotes or other
     * SQL-significant characters).
     *
     * @return the id, or -1 when the key is not present yet
     */
    private int lookupKeyId(String key) throws Exception {
        try (PreparedStatement ps = connection.prepareStatement(
                "SELECT id FROM tmp" + taskId + " WHERE kname = ?")) {
            ps.setString(1, key);
            try (ResultSet rs = ps.executeQuery()) {
                return rs.next() ? rs.getInt(1) : -1;
            }
        }
    }

    /**
     * Reduce stage helper: serialize each row map to a JSON object (one object
     * per CRLF-terminated line) and append them all to the reduce result file.
     */
    public void addParamList(List<Map<String, String>> paramList) throws Exception {
        Gson gson = new Gson();
        StringBuilder content = new StringBuilder();
        for (Map<String, String> params : paramList) {
            content.append(gson.toJson(params)).append("\r\n");
        }
        FileHelper.appendFile(reduceResultFile, content.toString());
    }

    /**
     * Result stage: turn the reduce result (one JSON object per line) into
     * either a CSV file (results larger than 1 MB) or an Excel sheet.
     *
     * @param columns     the JSON keys to extract, in output column order
     * @param nameColumns the header captions, parallel to {@code columns}
     * @throws Exception when the reduce result file is missing or a line
     *                   cannot be parsed as a JSON object
     */
    public void generateFile(String[] columns, String[] nameColumns) throws Exception {
        if (reduceResultFile == null || !reduceResultFile.exists()) {
            throw new Exception("[mapreduce.v1] in generateFile function: reduceResultFile does not exist!");
        }
        Gson gson = new Gson();
        if (reduceResultFile.length() > 1024 * 1024) {
            // Larger than 1 MB: stream out a CSV file.
            resultFile = new File(TASK_ROOT_PATH + File.separator + taskId
                    + File.separator + "result.csv");
            try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(reduceResultFile), "UTF-8"));
                 OutputStreamWriter osw = new OutputStreamWriter(
                        new FileOutputStream(resultFile), "UTF-8")) {
                // Header row.
                StringBuilder header = new StringBuilder();
                for (int i = 0; i < nameColumns.length; i++) {
                    if (i > 0) {
                        header.append(',');
                    }
                    header.append('"').append(nameColumns[i]).append('"');
                }
                osw.write(header + "\r\n");
                String line;
                while ((line = br.readLine()) != null) {
                    Map<String, String> map = gson.fromJson(line, Map.class);
                    if (map == null) {
                        throw new Exception("Map is null by parsing line: " + line);
                    }
                    StringBuilder row = new StringBuilder();
                    for (int i = 0; i < columns.length; i++) {
                        if (i > 0) {
                            row.append(',');
                        }
                        String v = map.get(columns[i]);
                        if (v != null) {
                            row.append('"').append(v).append('"');
                        }
                    }
                    // NOTE: write each row exactly once (the original also
                    // re-wrote the last row after the loop).
                    osw.write(row + "\r\n");
                }
                osw.flush();
            }
        } else {
            // 1 MB or smaller: export an .xls workbook via the JXL library.
            resultFile = new File(TASK_ROOT_PATH + File.separator + taskId
                    + File.separator + "result.xls");
            WritableWorkbook workbook = Workbook.createWorkbook(resultFile);
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(reduceResultFile), "UTF-8"))) {
                WritableSheet sheet = workbook.createSheet("sheet1", 0);
                for (int i = 0; i < nameColumns.length; i++) {
                    sheet.addCell(new Label(i, 0, nameColumns[i]));
                }
                int rowId = 1;
                String line;
                while ((line = br.readLine()) != null) {
                    Map<String, String> map = gson.fromJson(line, Map.class);
                    if (map == null) {
                        throw new Exception("Map is null by parsing line: " + line);
                    }
                    for (int i = 0; i < columns.length; i++) {
                        String v = map.get(columns[i]);
                        sheet.addCell(new Label(i, rowId, v != null ? v : ""));
                    }
                    rowId++;
                }
                workbook.write();
            } finally {
                workbook.close();
            }
        }
    }

    /** Init stage hook: call {@link #addInputPath(File)} for every input file. */
    public abstract void init() throws Exception;

    /** Map stage hook: called once per input line; should call {@link #setKeyValuePair}. */
    public abstract void map(String line) throws Exception;

    /** Reduce stage hook: called once per distinct key with a reader over its values. */
    public abstract void reduce(String key, ReduceReader reduceReader) throws Exception;

    /** Result stage hook: should call {@link #generateFile(String[], String[])}. */
    public abstract void generate() throws Exception;

    /**
     * Drive the whole pipeline: init -> map -> reduce -> generate.
     *
     * @return the absolute path of the generated result file, or {@code null}
     *         when any stage failed (the exception is printed, not rethrown)
     */
    public String mapReduce() {
        try {
            Class.forName(CLASS_NAME);
            connection = DriverManager.getConnection(URL, USERNAME, PASSWORD);

            // Allocate a task id from the auto-increment key of the task table.
            // Prepare with RETURN_GENERATED_KEYS and executeUpdate — the
            // original called execute(sql, flags) on a PreparedStatement,
            // which is an API misuse.
            PreparedStatement preparedStatement = connection.prepareStatement(
                    "INSERT INTO task () VALUES ()", Statement.RETURN_GENERATED_KEYS);
            preparedStatement.executeUpdate();
            ResultSet resultSet = preparedStatement.getGeneratedKeys();
            if (resultSet.next()) {
                taskId = resultSet.getInt(1);
            } else {
                throw new Exception("[mapreduce.v1] exception: can not generate taskId");
            }

            // Create the task working directory and its tmp/ subdirectory.
            String taskPath = TASK_ROOT_PATH + File.separator + taskId;
            File taskPathDir = new File(taskPath);
            if (taskPathDir.exists()) {
                throw new Exception("[mapreduce.v1] exception: task directory already exists");
            }
            taskPathDir.mkdirs();
            String tmpDirPath = taskPath + File.separator + "tmp";
            File tmpDir = new File(tmpDirPath);
            tmpDir.mkdirs();
            this.inputListFile = new File(taskPath + File.separator + "input_file_list.txt");
            inputListFile.createNewFile();

            // Period 1: init() — subclasses register their input files.
            init();

            // Period 2: map(line) — create the per-task key table, then feed
            // every line of every registered input file to map().
            Statement statement = connection.createStatement();
            statement.execute("CREATE TEMPORARY TABLE tmp" + taskId
                    + " (id int not null auto_increment primary key, kname varchar(200))");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(inputListFile), "UTF-8"));
            String inputFileName;
            while ((inputFileName = br.readLine()) != null) {
                File inputFile = new File(inputFileName);
                if (!inputFile.exists()) {
                    throw new Exception("[mapreduce.v1] exception: input file "
                            + inputFileName + " do not exists!");
                }
                // Input files are read as GBK (the expected source encoding).
                BufferedReader br2 = new BufferedReader(
                        new InputStreamReader(new FileInputStream(inputFile), "GBK"));
                try {
                    String line;
                    while ((line = br2.readLine()) != null) {
                        map(line);
                    }
                } finally {
                    br2.close(); // the original leaked this reader on exception
                }
            }
            br.close();

            // Period 3: reduce(key, reduceReader) — one call per distinct key.
            reduceResultFile = new File(taskPath + File.separator + "reduce.txt");
            if (reduceResultFile.exists()) {
                throw new Exception("[mapreduce.v1] reduce file already exists!");
            }
            reduceResultFile.createNewFile();
            ResultSet rs = statement.executeQuery("SELECT * FROM tmp" + taskId);
            while (rs.next()) {
                int id = rs.getInt(1);
                String key = rs.getString(2);
                File reduceFile = new File(tmpDirPath + File.separator + id + ".txt");
                if (!reduceFile.exists()) {
                    throw new Exception("[mapreduce.v1] exception: reduce file "
                            + reduceFile.getName() + " not exists!");
                }
                ReduceReader reduceReader = new ReduceReader(reduceFile);
                reduce(key, reduceReader);
            }

            // Period 4: generate() — produce the final CSV/Excel result file.
            generate();
            connection.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultFile == null ? null : resultFile.getAbsolutePath();
    }

    /**
     * Smoke test: maps each line to key {@code line.substring(1, 3)}, reduces
     * CSV lines into JSON objects keyed "1".."n", then exports six columns.
     */
    public static void main(String[] args) {
        MapReduceBaseVersion1 mapReduceBaseVersion1 = new MapReduceBaseVersion1() {
            @Override
            public void init() throws Exception {
                addInputPath(new File("D:\\test\\test.del"));
            }

            @Override
            public void map(String line) throws Exception {
                setKeyValuePair(line.substring(1, 3), line);
            }

            @Override
            public void reduce(String key, ReduceReader reduceReader) throws Exception {
                List<Map<String, String>> paramList = new ArrayList<Map<String, String>>();
                String line;
                while ((line = reduceReader.next()) != null) {
                    List<String> rowList = CsvOneLineParser.parseLine(line);
                    Map<String, String> tmpMap = new HashMap<String, String>();
                    int idx = 0;
                    for (String s : rowList) {
                        idx++;
                        tmpMap.put("" + idx, s);
                    }
                    paramList.add(tmpMap);
                }
                addParamList(paramList);
            }

            @Override
            public void generate() throws Exception {
                generateFile(new String[] { "1", "2", "3", "4", "5", "6" },
                        new String[] { "one", "two", "three", "four", "five", "six" });
            }
        };
        System.out.println(mapReduceBaseVersion1.mapReduce());
    }
}
Implementing MapReduce in Java on top of the local file system (and MySQL)