Micro-BO Data Cleaning (Java version)

Source: Internet
Author: User
Tags html tags locale script tag

Large data Public Welfare university provides a data, obligation to deal with, the original data is Excel, contains HTML tags, as follows:



request to clean out the HTML tags, and the URL address in the microblogging content.


Mainly divided into two parts:

1. Process the text and clean the data.

2. Handle Excel Read and write operations.


Code on:

Excelutil class, which contains excel2003-2007 read and write operations, Excel uses Apache POI to operate with a jar package as follows:


Package dat.datadeal;
Import Java.io.File;
Import Java.io.FileInputStream;
Import java.io.FileNotFoundException;
Import Java.io.FileOutputStream;
Import java.io.IOException;
Import Java.io.InputStream;
Import java.text.ParseException;
Import Java.text.SimpleDateFormat;
Import java.util.ArrayList;
Import Java.util.Date;
Import java.util.List;
Import Java.util.Locale;
Import Java.util.logging.Level;

Import Java.util.logging.Logger;
Import Org.apache.poi.hssf.usermodel.HSSFCell;
Import Org.apache.poi.hssf.usermodel.HSSFCellStyle;
Import Org.apache.poi.hssf.usermodel.HSSFRow;
Import Org.apache.poi.hssf.usermodel.HSSFSheet;
Import Org.apache.poi.hssf.usermodel.HSSFWorkbook;
Import Org.apache.poi.ss.usermodel.Cell;
Import Org.apache.poi.ss.usermodel.DateUtil;
Import Org.apache.poi.ss.usermodel.Row;
Import Org.apache.poi.ss.usermodel.Sheet;
Import Org.apache.poi.ss.usermodel.Workbook;

Import Org.apache.poi.xssf.usermodel.XSSFWorkbook; /** * * @author daT dev.tao@gmail.com *2003,2007 version exCEL read/write Tools/public class excelutil{/** * Excel file Read * @param filePath * @return string[] The row is stored, and the list is listed. * One Excel reads all at once in memory (Excel oversized needs to be processed separately)/public list<string[]> Readexcel (String filePath) {list<string[  
        ]> dataList = new arraylist<string[]> ();  
        Boolean isExcel2003 = true;  
        if (isExcel2007 (FilePath)) {isExcel2003 = false;  
        File File = new file (FilePath);  
        InputStream is = null;  
        try {is = new FileInputStream (file); The catch (FileNotFoundException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex)  
        ;  
        Workbook WB = null;  
        try {wb = isExcel2003 new Hssfworkbook (IS): New Xssfworkbook (IS);  
        The catch (IOException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex);  
   } Sheet Sheet = Wb.getsheetat (0);     int totalrows = Sheet.getphysicalnumberofrows ();  
        int totalcells = 0; if (totalrows >= 1 && sheet.getrow (0)!= null) {totalcells = Sheet.getrow (0). getphysicalnumberof  
        Cells ();  
            for (int r = 0; r < totalrows; r++) {Row row = Sheet.getrow (r);  
            if (row = = null) {continue;  
            } string[] rowlist = new String[totalcells];  
                for (int c = 0; c < totalcells; C + +) {cell cell = Row.getcell (c);  
                String cellvalue = "";  
                    if (cell = = null) {Rowlist[c] = (cellvalue);  
                Continue  
                } Cellvalue = Convertcellstr (cell, cellvalue);  
            ROWLIST[C] = (cellvalue);  
        } datalist.add (Rowlist);  
    return dataList; Private String convertcellstr (cell cell, STring cellstr) {switch (Cell.getcelltype ()) {case cell.cell_type_string://Read  
                String cellstr = Cell.getstringcellvalue (). toString ();  
            Break Case Cell.cell_type_boolean://Get BOOLEAN object Method cellstr = string.valueof (Cell.getboolea  
                Ncellvalue ());  
            Break  
                    Case Cell.cell_type_numeric://First See if the date format if (dateutil.iscelldateformatted (Cell)) {  
                Read date format cellstr = Formattime (Cell.getdatecellvalue (). toString ());  
                else {//read the number Cellstr = String.valueof (Cell.getnumericcellvalue ());  
            } break;  
                Case Cell.cell_type_formula://Read Formula CELLSTR = Cell.getcellformula (). toString ();  
        Break } RETurn cellstr; Private Boolean isExcel2007 (String fileName) {return filename.matches ("^.+\\").  
	 i) (xlsx) $ "); private String Formattime (string s) {SimpleDateFormat SF = new SimpleDateFormat ("EEE MMM dd hh:mm:ss z y  
        YYY ", locale.english);  
        Date date = null;  
        try {date = Sf.parse (s);  
        The catch (ParseException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex);  
        SimpleDateFormat SDF = new SimpleDateFormat ("Yyyy-mm-dd HH:mm:ss");  
        String result = Sdf.format (date);  
    return result; /** * Excel write operation, simple or use memory data once write * @param filePath output file path name * @param dataList output file content,list<string> line Li St Column * @throws IOException */public void Writeexcel (String filepath,list<list<string>> dataList) throws  
	        ioexception{Hssfworkbook wb = new Hssfworkbook (); Hssfsheet sheet = Wb.createsheet("sheet");//Add sheet//table styles Hssfcellstyle style = Wb.createcellstyle (); Style.setalignment (hssfcellstyle.align_center);//Specify Cell center alignment////Border//Style.setborderbottom (HSSFCE  
	        Llstyle.border_medium);  
	        Style.setbordertop (Hssfcellstyle.border_medium);  
	        Style.setborderleft (Hssfcellstyle.border_medium);  
	        Style.setborderright (Hssfcellstyle.border_medium);  
	        Set font//Hssffont F = wb.createfont ();  
	        F.setfontheightinpoints ((short) 10);  
	        F.setboldweight (Hssffont.boldweight_normal);  
	        Style.setfont (f);  
	        Set column width//sheet.setcolumnwidth (short) 0, (short) 9600);  
	        Sheet.setcolumnwidth ((short) 1, (short) 4000);  
	        Sheet.setcolumnwidth ((short) 2, (short) 8000);  
	  
	        Sheet.setcolumnwidth ((short) 3, (short) 8000); Create the first line in the position of index 0 for (int i = 0; i < datalist.size (); i++) { 
	            Hssfrow row = Sheet.createrow (i);  
	            list<string> list = Datalist.get (i);  
	                for (int j = 0; J < List.size (); j + +) {Hssfcell cell = Row.createcell (j);  
	                Cell.setcellvalue (List.get (j));  
	            Cell.setcellstyle (style);  
	        }///export file FileOutputStream fout = new FileOutputStream (FilePath);  
	        Wb.write (Fout);  
	Fout.close (); }
	
}

Dataclean class, which contains the cleaning of HTML tags, urls in the information.

Package dat.datadeal;
Import java.io.IOException;
Import java.util.ArrayList;
Import java.util.List;
Import Java.util.regex.Matcher;

Import Java.util.regex.Pattern; /** * * @author daT dev.tao@gmail.com * */public class Dataclean {/** * Clean HTML tags * @param inputstring * @r
        Eturn/public static string delhtml (String inputstring) {string htmlstr = inputstring;//String containing HTML tags
        String textstr = "";
        Java.util.regex.Pattern P_script;
        Java.util.regex.Matcher M_script;
        Java.util.regex.Pattern p_html;
        Java.util.regex.Matcher m_html; try {string regex_html = ' <[^>]+> ';//define the regular expression String regex_script = "<[/s]*?s for HTML tags Cript[^>]*?>[/s/s]*?<[/s]*?//[/s]*?script[/s]*?> "; Defines a script's regular expression {or <script[^>]*?>[/s/S]*?<//script> P_script = Pattern.compile (Regex_script, Pa Ttern.
            case_insensitive);
            M_script = P_script.matcher (HTMLSTR);Htmlstr = M_script.replaceall ("");
            Filter script Tag p_html = Pattern.compile (regex_html, pattern.case_insensitive);
            m_html = P_html.matcher (HTMLSTR); Htmlstr = M_html.replaceall ("");
        Filter HTML tags textstr = htmlstr;
        catch (Exception e) {System.err.println ("Html2text:" + e.getmessage ()); Return textstr;//returns text string}/** * processing the URL address in the message * * public static string Dealwithurl (String str) {St Ring regEx = "[http|https]+[://]+[0-9a-za-z:/[-]_#[?] [=] [.] [[;]]
        *";   
        Pattern p = pattern.compile (regEx);
        Matcher m = p.matcher (str);
	Return M.replaceall ("");
		public static void Main (string[] args) throws ioexception{excelutil excelutil = new Excelutil ();
		list<list<string>> writelist = new arraylist<list<string>> ();
		list<string[]> readlist =excelutil.readexcel ("/home/dat/javatest/Weibo data _.xlsx"); For (string[] linearray:readlist) {LIST&LT
			string> strlist = new arraylist<string> ();
				for (string str:linearray) {string strtmp = Dataclean.dealwithurl (dataclean.delhtml (str));
				Strlist.add (strtmp);
			System.out.println (strtmp);
		} writelist.add (Strlist);  
	    } excelutil.writeexcel ("/home/dat/javatest/weibo.xlsx", writelist);
	The System.out.println ("job has finished ..."); }
}


After cleaning data:


Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.