Accurately extracting the publishing time of a web page in Java

Source: Internet
Author: User
Tags current time

The code below extracts the publishing time, in whatever format it appears, from a web page and normalizes it to the regular "yyyy-MM-dd HH:mm:ss" format. It can only be as accurate as possible: because publishing times on the web are written in very flexible formats, correct extraction cannot be guaranteed for every page.

package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;
/**
 * Extracts the publishing time of a web page, looking first at its URL and then at its text.
 *
 * <p>Every successful result is normalised to the zero-padded {@code yyyy-MM-dd HH:mm:ss} form.
 * Extraction is best effort: dates on the web are written too freely for every page to be
 * handled, so each method returns {@code null} when nothing plausible is found. A candidate is
 * rejected when it is not a real calendar date or when it lies after the moment of the call.
 *
 * <p>Created on March 13, 2014.
 */
public class FetchPubTime {

    /** Eight consecutive date digits in the URL, e.g. http://www.baidu.com/20140311/2356.html. */
    private static final Pattern URL_WHOLE =
            Pattern.compile("[-|/_](20\\d{2})(\\d{2})(\\d{2})");

    /**
     * Separated year, month and day, e.g. http://www.baidu.com/2014-3-11/2356.html.
     *
     * <p>Both separators must be the same character and the day must not be followed by another
     * digit; otherwise a year-month URL such as /2014-3/2356.html would be read as 2014-3-23.
     */
    private static final Pattern URL_SEP_YMD =
            Pattern.compile("[-|/_](20\\d{2})([-|/_])(\\d{1,2})\\2(\\d{1,2})(?!\\d)");

    /** Separated year and month only, e.g. http://www.baidu.com/2014-3/2356.html. */
    private static final Pattern URL_SEP_YM =
            Pattern.compile("[-|/_](20\\d{2})[-|/_](\\d{1,2})");

    /** URL patterns in the order they are tried: most specific first. */
    private static final Pattern[] URL_PATTERNS = {URL_WHOLE, URL_SEP_YMD, URL_SEP_YM};

    /**
     * Page text carrying either a full timestamp (2013-12-19 15:48:33, 2012/3/05 1:02:03) or a
     * date written with the CJK year, month and day markers. The markers are given as Unicode
     * escapes so the source file stays pure ASCII.
     */
    private static final Pattern PAGE_DETAIL = Pattern.compile(
            "(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2}) (\\d{1,2}):(\\d{1,2}):(\\d{1,2})"
                    + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5");

    /** Page text carrying a bare date such as 2013-12-19 or 2012/3/05. */
    private static final Pattern PAGE_DATE =
            Pattern.compile("(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})");

    /**
     * Returns the publishing time of a page, preferring the URL over the page text.
     *
     * @param url        address of the page; may be {@code null}
     * @param urlContent text or markup of the page; may be {@code null} or blank
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String pubTime = getPubTimeFromUrl(url);
        // Nothing usable in the link itself: fall back to matching the page text.
        if (pubTime == null && urlContent != null && !urlContent.trim().isEmpty()) {
            return extractPageDate(urlContent);
        }
        return pubTime;
    }

    /**
     * Extracts the publishing date embedded in a URL.
     *
     * <p>Three layouts are tried in turn: eight consecutive digits, separated year-month-day,
     * and separated year-month (reported as the first day of that month). The time of day is
     * always {@code 00:00:00}.
     *
     * @param url address to inspect; may be {@code null}
     * @return the date as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if the URL holds no
     *         valid date that is not in the future
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }
        for (Pattern pattern : URL_PATTERNS) {
            String pubTime = firstValidMatch(pattern, url);
            if (pubTime != null) {
                return pubTime;
            }
        }
        return null;
    }

    /**
     * Extracts the publishing time from the text of a page.
     *
     * <p>Full timestamps and CJK-style dates are preferred wherever they occur in the text;
     * only when none of them is usable are bare dates considered, with the time of day reported
     * as {@code 00:00:00}. Line breaks are treated as single spaces.
     *
     * @param text string to search; may be {@code null}
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if the text holds no
     *         valid date that is not in the future
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        String flattened = text.replaceAll("\r?\n", " ");
        String pubTime = firstValidMatch(PAGE_DETAIL, flattened);
        if (pubTime == null) {
            pubTime = firstValidMatch(PAGE_DATE, flattened);
        }
        return pubTime;
    }

    /** Returns the normalised form of the first match that is a real, non-future date. */
    private static String firstValidMatch(Pattern pattern, String input) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            // year, month, day, hour, minute, second; fields a pattern lacks keep these defaults.
            int[] fields = {0, 1, 1, 0, 0, 0};
            int next = 0;
            for (int group = 1; group <= matcher.groupCount(); group++) {
                String captured = matcher.group(group);
                // Skip groups of the alternative that did not match (null) and the captured
                // separator used for the back-reference (not a digit).
                if (captured != null && Character.isDigit(captured.charAt(0))) {
                    fields[next++] = Integer.parseInt(captured);
                }
            }
            String normalised = formatIfPlausible(fields);
            if (normalised != null) {
                return normalised;
            }
        }
        return null;
    }

    /** Formats the six date-time fields, or returns null if they are invalid or in the future. */
    private static String formatIfPlausible(int[] fields) {
        int year = fields[0];
        int month = fields[1];
        int day = fields[2];
        int hour = fields[3];
        int minute = fields[4];
        int second = fields[5];
        if (month < 1 || month > 12 || day < 1 || day > daysInMonth(year, month)
                || hour > 23 || minute > 59 || second > 59) {
            return null;
        }
        // Calendar months are zero-based. "Now" is read on every call rather than cached, so a
        // long-running process keeps comparing against the real current time.
        Calendar candidate = new GregorianCalendar(year, month - 1, day, hour, minute, second);
        if (candidate.getTimeInMillis() > System.currentTimeMillis()) {
            return null;
        }
        return String.format(Locale.ROOT, "%04d-%02d-%02d %02d:%02d:%02d",
                year, month, day, hour, minute, second);
    }

    /** Returns the number of days in the given month (1-12) of the given Gregorian year. */
    private static int daysInMonth(int year, int month) {
        switch (month) {
            case 2:
                boolean leap = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
                return leap ? 29 : 28;
            case 4:
            case 6:
            case 9:
            case 11:
                return 30;
            default:
                return 31;
        }
    }
}

That is the entire content of this article; I hope it helps you in learning Java programming.

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.