Using Java to implement MapReduce based on the file system (and MySQL)

The reasons for implementing this code are:

    • I want to use MapReduce. I have used AWS EMR before and have also set up a pseudo-distributed cluster, but both feel like too much trouble.
    • I know a little MySQL (I would have preferred MongoDB, but I am not very good with it).
    • The amount of data is not very large, at least for my use case.
    • I hope this will not be a problem; the file system can still be trusted.

Design ideas are as follows:

    • Init stage: append the paths of all required input files to a list file, input_file_list.txt.
    • Map stage: read each line of every file listed in input_file_list.txt and map it to a key-value pair.
      Because the key may contain special characters, MySQL is used here to store the mapping between an integer ID and each key; the values for a key are appended to a temporary file named after that ID.
    • Reduce stage: for each key, read the corresponding temporary file and eventually produce a list of name-value maps. Each map corresponds to a JSON object, such as { "name": "zifeiy", "age": 88 }, and all the JSON objects are written, one per line, to a result file reduceResult.txt.
    • Result-processing stage: parse reduceResult.txt and generate the final CSV file or Excel file (a small sketch of this parsing step follows this list).
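For illustration, here is a minimal, self-contained sketch of the result-processing step: parsing one JSON line from reduceResult.txt with Gson and turning it into a CSV row. The sample line and the column keys are hypothetical; in the real code below they correspond to the columns array passed to generateFile.

import com.google.gson.Gson;
import java.util.Map;

public class ReduceLineToCsvDemo {
    public static void main(String[] args) {
        // One JSON line, as the reduce stage would write it to reduceResult.txt (hypothetical sample)
        String line = "{\"1\": \"zifeiy\", \"2\": \"88\"}";
        // Hypothetical column keys; the real ones are passed to generateFile
        String[] columns = { "1", "2" };
        Map<?, ?> map = new Gson().fromJson(line, Map.class);
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < columns.length; i++) {
            if (i > 0) row.append(',');
            Object v = map.get(columns[i]);
            row.append('"').append(v == null ? "" : v.toString()).append('"');
        }
        System.out.println(row); // prints: "zifeiy","88"
    }
}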

Main code:

package com.zifeiy.snowflake.tools.mapreduce.v1;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.google.gson.Gson;
import com.zifeiy.snowflake.assist.CsvOneLineParser;
import com.zifeiy.snowflake.assist.FileHelper;

import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;

public abstract class MapReduceBaseVersion1 {

    private static final String APPENDED_DB_INFO = "?useUnicode=true&characterEncoding=utf8"
            + "&rewriteBatchedStatements=true"
            + "&useLegacyDatetimeCode=false"
            + "&serverTimezone=Asia/Shanghai"
            + "&useSSL=false";
    private static final String CLASSNAME = "com.mysql.cj.jdbc.Driver";
    private static final String URL = "jdbc:mysql://localhost:3306/snowflake" + APPENDED_DB_INFO;
    private static final String USERNAME = "root";
    private static final String PASSWORD = "password";

    public static final String TASK_ROOT_PATH = "D:\\snowflake\\task";

    private Connection connection = null;
    private File inputListFile = null;
    private File reduceResultFile = null;
    private File resultFile = null;
    private int taskId;

    // Register an input file: its absolute path is appended to input_file_list.txt.
    public void addInputPath(File file) throws IOException {
        FileHelper.appendFile(inputListFile, file.getAbsolutePath() + "\r\n");
    }

    // Map output: the key is mapped to an integer ID via the temporary MySQL table,
    // and the value is appended to the tmp/<id>.txt file for that key.
    public void setKeyValuePair(String key, String value) throws Exception {
        int id = -1;
        Statement statement = connection.createStatement();
        ResultSet resultSet = statement.executeQuery(String.format(
                "SELECT id FROM tmp" + taskId + " WHERE kname='%s'", key.replaceAll("'", "''")));
        if (resultSet.next()) {
            id = resultSet.getInt(1);
        } else {
            statement.execute(String.format(
                    "INSERT INTO tmp" + taskId + " (kname) VALUES ('%s')", key.replaceAll("'", "''")));
            resultSet = statement.executeQuery(String.format(
                    "SELECT id FROM tmp" + taskId + " WHERE kname='%s'", key.replaceAll("'", "''")));
            if (resultSet.next()) {
                id = resultSet.getInt(1);
            }
        }
        if (id == -1)
            throw new Exception("Set key value pair failed: key = " + key + ", value = " + value);
        File tmpFile = new File(TASK_ROOT_PATH + File.separator + taskId + File.separator + "tmp"
                + File.separator + id + ".txt");
        if (!tmpFile.exists()) {
            tmpFile.createNewFile();
        }
        FileHelper.appendFile(tmpFile, value + "\r\n");
    }

    // Reduce output: each map in paramList is serialized as one JSON line in the reduce result file.
    public void addParamList(List<Map<String, String>> paramList) throws Exception {
        String content = "";
        Gson gson = new Gson();
        for (Map<String, String> params : paramList) {
            String jsonString = gson.toJson(params);
            content += jsonString + "\r\n";
        }
        FileHelper.appendFile(reduceResultFile, content);
    }

    // Generate the final result file: CSV for large reduce results, otherwise an Excel (.xls) file.
    public void generateFile(String[] columns, String[] nameColumns) throws Exception {
        if (reduceResultFile == null || !reduceResultFile.exists()) {
            throw new Exception("[mapreduce.v1] in generateFile function: reduceResultFile does not exist!");
        }
        // if (false) { // test
        if (reduceResultFile.length() > 1 * 1024 * 1024) {
            // If the file size exceeds 1MB, export to CSV
            resultFile = new File(TASK_ROOT_PATH + File.separator + taskId + File.separator + "result.csv");
            Gson gson = new Gson();
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(reduceResultFile), "UTF-8"));
            FileOutputStream fos = new FileOutputStream(resultFile);
            OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
            String content = "";
            for (int i = 0; i < nameColumns.length; i++) {
                if (i > 0) content += ",";
                content += '"' + nameColumns[i] + '"';
            }
            osw.write(content + "\r\n");
            String line = null;
            while ((line = br.readLine()) != null) {
                content = "";
                Map<String, String> map = gson.fromJson(line, Map.class);
                if (map == null) {
                    throw new Exception("map is null by parsing line: " + line);
                }
                for (int i = 0; i < columns.length; i++) {
                    if (i > 0) content += ",";
                    String c = columns[i];
                    String v = map.get(c);
                    if (v != null) {
                        content += '"' + v + '"';
                    }
                }
                osw.write(content + "\r\n");
            }
            br.close();
            osw.flush();
            osw.close();
        } else {
            // If the file size is less than 1MB, export to an Excel file
            resultFile = new File(TASK_ROOT_PATH + File.separator + taskId + File.separator + "result.xls");
            WritableWorkbook workbook = Workbook.createWorkbook(resultFile);
            WritableSheet sheet = workbook.createSheet("sheet1", 0);
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(reduceResultFile), "UTF-8"));
            String line = null;
            for (int i = 0; i < nameColumns.length; i++) {
                sheet.addCell(new Label(i, 0, nameColumns[i]));
            }
            int rowId = 1;
            while ((line = br.readLine()) != null) {
                Gson gson = new Gson();
                Map<String, String> map = gson.fromJson(line, Map.class);
                if (map == null) {
                    throw new Exception("map is null by parsing line: " + line);
                }
                for (int i = 0; i < columns.length; i++) {
                    String c = columns[i];
                    String v = map.get(c);
                    String innerContent = "";
                    if (v != null) {
                        innerContent = v;
                    }
                    sheet.addCell(new Label(i, rowId, innerContent));
                }
                rowId++;
            }
            br.close();
            workbook.write();
            workbook.close();
        }
    }

    public abstract void init() throws Exception;

    public abstract void map(String line) throws Exception;

    public abstract void reduce(String key, ReduceReader reduceReader) throws Exception;

    public abstract void generate() throws Exception;

    // Run the whole pipeline: init -> map -> reduce -> generate.
    // Returns the absolute path of the generated result file, or null on failure.
    public String mapReduce() {
        try {
            Class.forName(CLASSNAME);
            connection = DriverManager.getConnection(URL, USERNAME, PASSWORD);

            // Generate taskId
            PreparedStatement preparedStatement = connection.prepareStatement(
                    "INSERT INTO task () VALUES ()", Statement.RETURN_GENERATED_KEYS);
            preparedStatement.execute();
            ResultSet resultSet = preparedStatement.getGeneratedKeys();
            if (resultSet.next()) {
                taskId = resultSet.getInt(1);
            } else {
                throw new Exception("[mapreduce.v1] exception: can not generate taskId");
            }

            // Generate the task file path
            String taskPath = TASK_ROOT_PATH + File.separator + taskId;
            File taskPathDir = new File(taskPath);
            if (taskPathDir.exists()) {
                throw new Exception("[mapreduce.v1] exception: task directory already exists");
            }
            taskPathDir.mkdirs();
            String tmpDirPath = taskPath + File.separator + "tmp";
            File tmpDir = new File(tmpDirPath);
            tmpDir.mkdirs();
            this.inputListFile = new File(taskPath + File.separator + "input_file_list.txt");
            inputListFile.createNewFile();

            // Period 1: init()
            // During the init period, addInputPath is used to add all the input files we need.
            init();

            // Period 2: map(line) -- read every line of each input file
            // DB prepare
            Statement statement = connection.createStatement();
            statement.execute("CREATE TEMPORARY TABLE tmp" + taskId
                    + " (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, kname VARCHAR(200))");
            // File content prepare
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(inputListFile), "UTF-8"));
            String inputFileName = null;
            while ((inputFileName = br.readLine()) != null) {
                File inputFile = new File(inputFileName);
                if (!inputFile.exists()) {
                    throw new Exception("[mapreduce.v1] exception: input file " + inputFileName + " does not exist!");
                }
                BufferedReader br2 = new BufferedReader(
                        new InputStreamReader(new FileInputStream(inputFile), "GBK"));
                String line = null;
                while ((line = br2.readLine()) != null) {
                    map(line);
                }
            }
            br.close();

            // Period 3: reduce(key, valueList)
            reduceResultFile = new File(taskPath + File.separator + "reduce.txt");
            if (reduceResultFile.exists()) {
                throw new Exception("[mapreduce.v1] reduce file already exists!");
            }
            reduceResultFile.createNewFile();
            resultSet = statement.executeQuery("SELECT * FROM tmp" + taskId);
            while (resultSet.next()) {
                int id = resultSet.getInt(1);
                String key = resultSet.getString(2);
                File reduceFile = new File(tmpDirPath + File.separator + id + ".txt");
                if (!reduceFile.exists()) {
                    throw new Exception("[mapreduce.v1] exception: reduce file " + reduceFile.getName() + " does not exist!");
                }
                ReduceReader reduceReader = new ReduceReader(reduceFile);
                reduce(key, reduceReader);
            }

            // Period 4: generate
            // Generate the result file
            generate();

            connection.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (resultFile == null) return null;
        else return resultFile.getAbsolutePath();
    }

    // main for test
    public static void main(String[] args) {
        MapReduceBaseVersion1 mapReduceBaseVersion1 = new MapReduceBaseVersion1() {

            @Override
            public void reduce(String key, ReduceReader reduceReader) throws Exception {
                List<Map<String, String>> paramList = new ArrayList<Map<String, String>>();
                String line;
                while ((line = reduceReader.next()) != null) {
                    List<String> rowList = CsvOneLineParser.parseLine(line);
                    Map<String, String> tmpMap = new HashMap<String, String>();
                    int idx = 0;
                    for (String s : rowList) {
                        idx++;
                        tmpMap.put("" + idx, s);
                    }
                    paramList.add(tmpMap);
                }
                addParamList(paramList);
            }

            @Override
            public void map(String line) throws Exception {
                setKeyValuePair(line.substring(1, 3), line);
            }

            @Override
            public void init() throws Exception {
                addInputPath(new File("D:\\test\\test.del"));
            }

            @Override
            public void generate() throws Exception {
                generateFile(new String[] { "1", "2", "3", "4", "5", "6" },
                        new String[] { "one", "two", "three", "four", "five", "six" });
            }
        };
        System.out.println(mapReduceBaseVersion1.mapReduce());
    }
}
