This article shows how to extract the publication time, in its various formats, from a web page and normalize it to the regular "yyyy-MM-dd HH:mm:ss" format. The extraction is best-effort only: publication times on the web are formatted so freely that a correct result cannot be guaranteed in every case.
package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;
/**
 * Best-effort extraction of a web page's publication time.
 *
 * <p>The time is looked up first in the page URL and, failing that, in the page
 * text. Every result is normalized to {@code yyyy-MM-dd HH:mm:ss}; a candidate
 * that is not a real calendar date, or that lies in the future, is rejected.
 * All methods return {@code null} when no acceptable time is found.
 *
 * <p>Created on March 13, 2014.
 */
public class FetchPubTime {

    /** Eight consecutive digits after a separator, e.g. http://www.baidu.com/20140311/2356.html */
    private static final Pattern URL_COMPACT_DATE =
            Pattern.compile("[-/_](20\\d{2})(\\d{2})(\\d{2})");

    /**
     * Year, month and day split by '-', '/' or '_', e.g. http://www.baidu.com/2014-3-11/2356.html.
     * The trailing look-ahead stops the day from swallowing the first digits of a longer number.
     */
    private static final Pattern URL_YEAR_MONTH_DAY =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})[-/_](\\d{1,2})(?!\\d)");

    /** Year and month only, e.g. http://www.baidu.com/2014-3/2356.html */
    private static final Pattern URL_YEAR_MONTH =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})(?!\\d)");

    /** A line break, with or without a preceding carriage return. */
    private static final Pattern LINE_BREAK = Pattern.compile("\\r?\\n");

    /**
     * Preferred forms in page text: a date followed by h:m:s (groups 1-6), or a date
     * written with the CJK year/month/day markers U+5E74, U+6708, U+65E5 (groups 7-9).
     */
    private static final Pattern TEXT_DETAILED = Pattern.compile(
            "(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2}) (\\d{1,2}):(\\d{1,2}):(\\d{1,2})"
            + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5");

    /** Fallback form in page text: a bare date such as 2013-12-19 or 2012/3/05. */
    private static final Pattern TEXT_DATE_ONLY =
            Pattern.compile("(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})");

    /**
     * Gets the publication time from the URL, falling back to the page text.
     *
     * @param url        the page URL; may be {@code null}
     * @param urlContent the page text; consulted only when the URL yields nothing
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String pubTime = getPubTimeFromUrl(url);
        // Nothing usable in the link itself: try the text instead.
        if (pubTime == null && urlContent != null && !urlContent.trim().equals("")) {
            return extractPageDate(urlContent);
        }
        return pubTime;
    }

    /**
     * Gets the publication time from a date embedded in the URL.
     *
     * <p>Three shapes are tried in order: eight consecutive digits, separated
     * year-month-day, and separated year-month (reported as the first of the month).
     * For each shape only the first occurrence is examined; if it is not a valid
     * past-or-present date the next shape is tried.
     *
     * @param url the page URL; may be {@code null}
     * @return the time as {@code yyyy-MM-dd 00:00:00}, or {@code null} if none was found
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }

        Matcher compact = URL_COMPACT_DATE.matcher(url);
        if (compact.find()) {
            String found = toTimestamp(num(compact, 1), num(compact, 2), num(compact, 3), 0, 0, 0);
            if (found != null) {
                return found;
            }
        }

        Matcher yearMonthDay = URL_YEAR_MONTH_DAY.matcher(url);
        if (yearMonthDay.find()) {
            String found = toTimestamp(
                    num(yearMonthDay, 1), num(yearMonthDay, 2), num(yearMonthDay, 3), 0, 0, 0);
            if (found != null) {
                return found;
            }
        }

        Matcher yearMonth = URL_YEAR_MONTH.matcher(url);
        if (yearMonth.find()) {
            return toTimestamp(num(yearMonth, 1), num(yearMonth, 2), 1, 0, 0, 0);
        }
        return null;
    }

    /**
     * Extracts the publication time from page text.
     *
     * <p>Recognizes forms such as {@code 2013-12-19 15:48:33}, {@code 2013-12-19},
     * {@code 2012/3/05} and dates written with the CJK year/month/day markers.
     * A date-with-time or CJK date is preferred over a bare date; within each
     * category only the first occurrence in the text is considered. A date without
     * a time of day is reported as midnight.
     *
     * @param text the text to search; may be {@code null}
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        // Join lines with a space so a date and its time split across lines still match.
        String flat = LINE_BREAK.matcher(text).replaceAll(" ");

        Matcher detailed = TEXT_DETAILED.matcher(flat);
        if (detailed.find()) {
            if (detailed.group(1) != null) {
                return toTimestamp(num(detailed, 1), num(detailed, 2), num(detailed, 3),
                        num(detailed, 4), num(detailed, 5), num(detailed, 6));
            }
            return toTimestamp(num(detailed, 7), num(detailed, 8), num(detailed, 9), 0, 0, 0);
        }

        Matcher dateOnly = TEXT_DATE_ONLY.matcher(flat);
        if (dateOnly.find()) {
            return toTimestamp(num(dateOnly, 1), num(dateOnly, 2), num(dateOnly, 3), 0, 0, 0);
        }
        return null;
    }

    /** Reads an all-digit capture group as an int. */
    private static int num(Matcher matcher, int group) {
        return Integer.parseInt(matcher.group(group));
    }

    /**
     * Formats the components as {@code yyyy-MM-dd HH:mm:ss}, or returns {@code null}
     * when they do not form a real date and time or the date is after the current moment.
     */
    private static String toTimestamp(int year, int month, int day,
                                      int hour, int minute, int second) {
        if (month < 1 || month > 12 || day < 1
                || hour > 23 || minute > 59 || second > 59) {
            return null;
        }
        // Calendar months are zero-based; start on day 1 so the month length can be checked.
        Calendar date = new GregorianCalendar(year, month - 1, 1);
        if (day > date.getActualMaximum(Calendar.DAY_OF_MONTH)) {
            return null;
        }
        date.set(Calendar.DAY_OF_MONTH, day);
        // Read the clock on every call: a publication date cannot lie in the future.
        if (date.getTimeInMillis() > System.currentTimeMillis()) {
            return null;
        }
        return String.format(Locale.ROOT, "%04d-%02d-%02d %02d:%02d:%02d",
                year, month, day, hour, minute, second);
    }
}
That is the entire content of this article; I hope it helps you in learning Java programming.