Accurately extracting a web page's publishing time in Java

Accurately extracting a web page's publishing time in Java

Last Update:2017-01-19 Source: Internet

Author: User

Tags: current time

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

The code below extracts publishing times that appear in web pages in various formats and normalizes them to the regular "yyyy-MM-dd HH:mm:ss" format. Because publishing-time formats on the web are very flexible, the extraction is as accurate as possible but cannot be guaranteed to be correct in every case.

package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;
 
/**
 * Extracts the publishing time of a web page, either from its URL or from its text,
 * and reports it as a {@code yyyy-MM-dd HH:mm:ss} string.
 *
 * <p>Every candidate must be a real calendar date and must not lie in the future
 * (compared with the system clock at the time of the call); otherwise it is skipped.
 * When the source carries no time of day, {@code 00:00:00} is used. When it carries no
 * day of month, the first day of the month is used.
 */
public class FetchPubTime {

    /** Eight consecutive digits in a URL, e.g. http://www.baidu.com/20140311/2356.html */
    private static final Pattern URL_WHOLE =
            Pattern.compile("[-|/_](20\\d{2})(\\d{2})(\\d{2})");

    /**
     * Year, month and day separated by '-', '/', '_' or '|',
     * e.g. http://www.baidu.com/2014-3-11/2356.html.
     * The trailing lookahead keeps the day from swallowing the first digits of a
     * following numeric path segment such as "/2014-3/2356.html".
     */
    private static final Pattern URL_SEP_YMD =
            Pattern.compile("[-|/_](20\\d{2})[-|/_](\\d{1,2})[-|/_](\\d{1,2})(?!\\d)");

    /** Year and month only, e.g. http://www.baidu.com/2014-3/2356.html */
    private static final Pattern URL_SEP_YM =
            Pattern.compile("[-|/_](20\\d{2})[-|/_](\\d{1,2})(?!\\d)");

    /**
     * Page text carrying either a full date-time ("2013-12-19 15:48:33") or a date
     * written with the CJK year / month / day markers (U+5E74, U+6708, U+65E5).
     */
    private static final Pattern PAGE_DETAIL = Pattern.compile(
            "(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2}) (\\d{1,2}):(\\d{1,2}):(\\d{1,2})"
                    + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5");

    /** Page text carrying only a date, e.g. "2013-12-19" or "2012/3/05". */
    private static final Pattern PAGE_DATE_ONLY =
            Pattern.compile("(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})");

    /**
     * Returns the publishing time found in the URL, falling back to the page text
     * when the URL carries no usable date.
     *
     * @param url        the page URL; may be {@code null}
     * @param urlContent the page text; may be {@code null} or blank
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String pubTime = getPubTimeFromUrl(url);
        // Nothing usable in the link itself: look in the page text instead.
        if (pubTime == null && urlContent != null && !urlContent.trim().isEmpty()) {
            return extractPageDate(urlContent);
        }
        return pubTime;
    }

    /**
     * Extracts the publishing time from a URL. Tried in order: an eight-digit date,
     * a separated year-month-day, and finally a separated year-month.
     *
     * @param url the page URL; may be {@code null}
     * @return the time as {@code yyyy-MM-dd 00:00:00}, or {@code null} if the URL
     *         contains no valid date that is not in the future
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }
        Calendar now = Calendar.getInstance();
        String found = firstPastTimestamp(URL_WHOLE.matcher(url), now);
        if (found == null) {
            found = firstPastTimestamp(URL_SEP_YMD.matcher(url), now);
        }
        if (found == null) {
            found = firstPastTimestamp(URL_SEP_YM.matcher(url), now);
        }
        return found;
    }

    /**
     * Extracts the publishing time from page text. Accepts forms such as
     * "2013-12-19 15:48:33", "2013-12-19", "2012/3/05" and dates written with the
     * CJK year / month / day markers. Date-times and CJK-marked dates are preferred
     * over plain dates wherever they occur in the text.
     *
     * @param text the text to search; may be {@code null}
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if the text
     *         contains no valid date that is not in the future
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        // Join lines so a date and its time of day split across a line break still match.
        String flat = text.replaceAll("\\r?\\n", " ");
        Calendar now = Calendar.getInstance();
        String found = firstPastTimestamp(PAGE_DETAIL.matcher(flat), now);
        if (found == null) {
            found = firstPastTimestamp(PAGE_DATE_ONLY.matcher(flat), now);
        }
        return found;
    }

    /**
     * Walks the matches of {@code matcher} and returns the first one that forms a
     * valid timestamp not later than {@code now}, or {@code null} if there is none.
     * The non-null capture groups are read, in order, as year, month, day, hour,
     * minute and second; missing trailing fields default to day 1 and 00:00:00.
     */
    private static String firstPastTimestamp(Matcher matcher, Calendar now) {
        while (matcher.find()) {
            int[] fields = {0, 1, 1, 0, 0, 0};
            int filled = 0;
            for (int g = 1; g <= matcher.groupCount(); g++) {
                String digits = matcher.group(g);
                // Groups of the alternation branch that did not match are null.
                if (digits != null) {
                    fields[filled++] = Integer.parseInt(digits);
                }
            }
            String timestamp = toTimestamp(fields, now);
            if (timestamp != null) {
                return timestamp;
            }
        }
        return null;
    }

    /**
     * Formats {year, month, day, hour, minute, second} as {@code yyyy-MM-dd HH:mm:ss},
     * or returns {@code null} when the fields are not a real date-time or lie after
     * {@code now}.
     */
    private static String toTimestamp(int[] fields, Calendar now) {
        int year = fields[0];
        int month = fields[1];
        int day = fields[2];
        int hour = fields[3];
        int minute = fields[4];
        int second = fields[5];
        if (hour > 23 || minute > 59 || second > 59) {
            return null;
        }
        // Calendar months are zero-based. The calendar is lenient, so an impossible
        // date (month 13, February 30, day 0) rolls over to a different day and is
        // detected by the round-trip comparison below.
        Calendar candidate =
                new java.util.GregorianCalendar(year, month - 1, day, hour, minute, second);
        boolean realDate = candidate.get(Calendar.YEAR) == year
                && candidate.get(Calendar.MONTH) == month - 1
                && candidate.get(Calendar.DAY_OF_MONTH) == day;
        if (!realDate || candidate.after(now)) {
            return null;
        }
        return year + "-" + twoDigits(month) + "-" + twoDigits(day)
                + " " + twoDigits(hour) + ":" + twoDigits(minute) + ":" + twoDigits(second);
    }

    /** Left-pads a value in the range 0..99 to two digits. */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : Integer.toString(value);
    }
}

That is the entire content of this article; I hope it helps you learn Java programming.

This article is an English version of an article that was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More