Large data Public Welfare university provides a data, obligation to deal with, the original data is Excel, contains HTML tags, as follows:
request to clean out the HTML tags, and the URL address in the microblogging content.
Mainly divided into two parts:
1. Process the text and clean the data.
2. Handle Excel Read and write operations.
Code on:
Excelutil class, which contains excel2003-2007 read and write operations, Excel uses Apache POI to operate with a jar package as follows:
Package dat.datadeal;
Import Java.io.File;
Import Java.io.FileInputStream;
Import java.io.FileNotFoundException;
Import Java.io.FileOutputStream;
Import java.io.IOException;
Import Java.io.InputStream;
Import java.text.ParseException;
Import Java.text.SimpleDateFormat;
Import java.util.ArrayList;
Import Java.util.Date;
Import java.util.List;
Import Java.util.Locale;
Import Java.util.logging.Level;
Import Java.util.logging.Logger;
Import Org.apache.poi.hssf.usermodel.HSSFCell;
Import Org.apache.poi.hssf.usermodel.HSSFCellStyle;
Import Org.apache.poi.hssf.usermodel.HSSFRow;
Import Org.apache.poi.hssf.usermodel.HSSFSheet;
Import Org.apache.poi.hssf.usermodel.HSSFWorkbook;
Import Org.apache.poi.ss.usermodel.Cell;
Import Org.apache.poi.ss.usermodel.DateUtil;
Import Org.apache.poi.ss.usermodel.Row;
Import Org.apache.poi.ss.usermodel.Sheet;
Import Org.apache.poi.ss.usermodel.Workbook;
Import Org.apache.poi.xssf.usermodel.XSSFWorkbook; /** * * @author daT dev.tao@gmail.com *2003,2007 version exCEL read/write Tools/public class excelutil{/** * Excel file Read * @param filePath * @return string[] The row is stored, and the list is listed. * One Excel reads all at once in memory (Excel oversized needs to be processed separately)/public list<string[]> Readexcel (String filePath) {list<string[
]> dataList = new arraylist<string[]> ();
Boolean isExcel2003 = true;
if (isExcel2007 (FilePath)) {isExcel2003 = false;
File File = new file (FilePath);
InputStream is = null;
try {is = new FileInputStream (file); The catch (FileNotFoundException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex)
;
Workbook WB = null;
try {wb = isExcel2003 new Hssfworkbook (IS): New Xssfworkbook (IS);
The catch (IOException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex);
} Sheet Sheet = Wb.getsheetat (0); int totalrows = Sheet.getphysicalnumberofrows ();
int totalcells = 0; if (totalrows >= 1 && sheet.getrow (0)!= null) {totalcells = Sheet.getrow (0). getphysicalnumberof
Cells ();
for (int r = 0; r < totalrows; r++) {Row row = Sheet.getrow (r);
if (row = = null) {continue;
} string[] rowlist = new String[totalcells];
for (int c = 0; c < totalcells; C + +) {cell cell = Row.getcell (c);
String cellvalue = "";
if (cell = = null) {Rowlist[c] = (cellvalue);
Continue
} Cellvalue = Convertcellstr (cell, cellvalue);
ROWLIST[C] = (cellvalue);
} datalist.add (Rowlist);
return dataList; Private String convertcellstr (cell cell, STring cellstr) {switch (Cell.getcelltype ()) {case cell.cell_type_string://Read
String cellstr = Cell.getstringcellvalue (). toString ();
Break Case Cell.cell_type_boolean://Get BOOLEAN object Method cellstr = string.valueof (Cell.getboolea
Ncellvalue ());
Break
Case Cell.cell_type_numeric://First See if the date format if (dateutil.iscelldateformatted (Cell)) {
Read date format cellstr = Formattime (Cell.getdatecellvalue (). toString ());
else {//read the number Cellstr = String.valueof (Cell.getnumericcellvalue ());
} break;
Case Cell.cell_type_formula://Read Formula CELLSTR = Cell.getcellformula (). toString ();
Break } RETurn cellstr; Private Boolean isExcel2007 (String fileName) {return filename.matches ("^.+\\").
i) (xlsx) $ "); private String Formattime (string s) {SimpleDateFormat SF = new SimpleDateFormat ("EEE MMM dd hh:mm:ss z y
YYY ", locale.english);
Date date = null;
try {date = Sf.parse (s);
The catch (ParseException ex) {Logger.getlogger (ExcelUtil.class.getName ()). log (Level.severe, NULL, ex);
SimpleDateFormat SDF = new SimpleDateFormat ("Yyyy-mm-dd HH:mm:ss");
String result = Sdf.format (date);
return result; /** * Excel write operation, simple or use memory data once write * @param filePath output file path name * @param dataList output file content,list<string> line Li St Column * @throws IOException */public void Writeexcel (String filepath,list<list<string>> dataList) throws
ioexception{Hssfworkbook wb = new Hssfworkbook (); Hssfsheet sheet = Wb.createsheet("sheet");//Add sheet//table styles Hssfcellstyle style = Wb.createcellstyle (); Style.setalignment (hssfcellstyle.align_center);//Specify Cell center alignment////Border//Style.setborderbottom (HSSFCE
Llstyle.border_medium);
Style.setbordertop (Hssfcellstyle.border_medium);
Style.setborderleft (Hssfcellstyle.border_medium);
Style.setborderright (Hssfcellstyle.border_medium);
Set font//Hssffont F = wb.createfont ();
F.setfontheightinpoints ((short) 10);
F.setboldweight (Hssffont.boldweight_normal);
Style.setfont (f);
Set column width//sheet.setcolumnwidth (short) 0, (short) 9600);
Sheet.setcolumnwidth ((short) 1, (short) 4000);
Sheet.setcolumnwidth ((short) 2, (short) 8000);
Sheet.setcolumnwidth ((short) 3, (short) 8000); Create the first line in the position of index 0 for (int i = 0; i < datalist.size (); i++) {
Hssfrow row = Sheet.createrow (i);
list<string> list = Datalist.get (i);
for (int j = 0; J < List.size (); j + +) {Hssfcell cell = Row.createcell (j);
Cell.setcellvalue (List.get (j));
Cell.setcellstyle (style);
}///export file FileOutputStream fout = new FileOutputStream (FilePath);
Wb.write (Fout);
Fout.close (); }
}
Dataclean class, which contains the cleaning of HTML tags, urls in the information.
Package dat.datadeal;
Import java.io.IOException;
Import java.util.ArrayList;
Import java.util.List;
Import Java.util.regex.Matcher;
Import Java.util.regex.Pattern; /** * * @author daT dev.tao@gmail.com * */public class Dataclean {/** * Clean HTML tags * @param inputstring * @r
Eturn/public static string delhtml (String inputstring) {string htmlstr = inputstring;//String containing HTML tags
String textstr = "";
Java.util.regex.Pattern P_script;
Java.util.regex.Matcher M_script;
Java.util.regex.Pattern p_html;
Java.util.regex.Matcher m_html; try {string regex_html = ' <[^>]+> ';//define the regular expression String regex_script = "<[/s]*?s for HTML tags Cript[^>]*?>[/s/s]*?<[/s]*?//[/s]*?script[/s]*?> "; Defines a script's regular expression {or <script[^>]*?>[/s/S]*?<//script> P_script = Pattern.compile (Regex_script, Pa Ttern.
case_insensitive);
M_script = P_script.matcher (HTMLSTR);Htmlstr = M_script.replaceall ("");
Filter script Tag p_html = Pattern.compile (regex_html, pattern.case_insensitive);
m_html = P_html.matcher (HTMLSTR); Htmlstr = M_html.replaceall ("");
Filter HTML tags textstr = htmlstr;
catch (Exception e) {System.err.println ("Html2text:" + e.getmessage ()); Return textstr;//returns text string}/** * processing the URL address in the message * * public static string Dealwithurl (String str) {St Ring regEx = "[http|https]+[://]+[0-9a-za-z:/[-]_#[?] [=] [.] [[;]]
*";
Pattern p = pattern.compile (regEx);
Matcher m = p.matcher (str);
Return M.replaceall ("");
public static void Main (string[] args) throws ioexception{excelutil excelutil = new Excelutil ();
list<list<string>> writelist = new arraylist<list<string>> ();
list<string[]> readlist =excelutil.readexcel ("/home/dat/javatest/Weibo data _.xlsx"); For (string[] linearray:readlist) {LIST<
string> strlist = new arraylist<string> ();
for (string str:linearray) {string strtmp = Dataclean.dealwithurl (dataclean.delhtml (str));
Strlist.add (strtmp);
System.out.println (strtmp);
} writelist.add (Strlist);
} excelutil.writeexcel ("/home/dat/javatest/weibo.xlsx", writelist);
The System.out.println ("job has finished ..."); }
}
After cleaning data: