package com.smilezl.scrapy;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal link scraper: downloads one HTML page, extracts at most one link per
 * source line, and stores the distinct links in a PostgreSQL table.
 *
 * <p>All methods are static and keep no state between calls.
 */
public class Scrapyurl {

    private static final Logger LOGGER = Logger.getLogger(Scrapyurl.class.getName());

    /** Page scraped by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_START_URL = "http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml";

    /** Captures the charset token of a Content-Type value, without quotes or trailing parameters. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Absolute http/https URL made of word characters, dots, dashes, slashes and colons. */
    private static final Pattern ABSOLUTE_LINK_PATTERN = Pattern.compile("https?://[\\w.\\-/:]+");

    /** Root-relative href whose path ends in "html" (covers .html and .shtml); group 1 is the path. */
    private static final Pattern ROOT_RELATIVE_LINK_PATTERN = Pattern.compile("href=\"(/[^\"]*html)");

    private static final String JDBC_DRIVER = "org.postgresql.Driver";
    // NOTE(review): useUnicode/characterEncoding look like MySQL Connector/J options;
    // they are kept from the original URL - confirm the PostgreSQL driver needs them.
    private static final String JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&characterEncoding=GBK";
    // NOTE(review): credentials are hard-coded as in the original; the user name's
    // casing was restored to the conventional "postgres" - confirm against the database.
    private static final String JDBC_USER = "postgres";
    private static final String JDBC_PASSWORD = "password";
    private static final String INSERT_SQL = "INSERT INTO Scrapyurl (Url, type) VALUES (?, 0)";

    /**
     * Downloads a page and collects the links found in it.
     *
     * <p>The page is read line by line and {@link #getHref(String, String)} is applied
     * to each line, so at most one link per line is collected. Duplicates are dropped
     * and first-seen order is kept. I/O failures are logged, not thrown; links gathered
     * before the failure are still returned.
     *
     * @param htmlUrl absolute http/https URL of the page to download
     * @return a new mutable list of distinct links, possibly empty, never {@code null}
     */
    public static List<String> parserHtml(String htmlUrl) {
        // A LinkedHashSet de-duplicates in O(1) per link while preserving order.
        Set<String> links = new LinkedHashSet<>();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(htmlUrl).openConnection();
            // No request body is sent, so doOutput is deliberately left at its default.
            Charset charset = resolveCharset(connection.getContentType());
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String link = getHref(line, htmlUrl);
                    if (link != null) {
                        links.add(link);
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was collected so far.
            LOGGER.log(Level.WARNING, "Failed to read page " + htmlUrl, e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return new ArrayList<>(links);
    }

    /**
     * Extracts the charset name from a Content-Type header value.
     *
     * @param str Content-Type value such as {@code "text/html; charset=GBK"}; may be {@code null}
     * @return the charset name without quotes or trailing parameters, or {@code null}
     *         when {@code str} is {@code null} or carries no charset
     */
    public static String getCharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads the first link from one line of HTML.
     *
     * <p>An absolute http/https URL anywhere in the line wins. Otherwise a root-relative
     * {@code href="/...html"} attribute is resolved against {@code htmlUrl}.
     *
     * @param str     one line of page source; may be {@code null}
     * @param htmlUrl URL of the page the line came from, used as the base for relative links
     * @return the absolute link, or {@code null} when the line has no recognised link
     *         or the relative link cannot be resolved against {@code htmlUrl}
     */
    public static String getHref(String str, String htmlUrl) {
        if (str == null) {
            return null;
        }
        Matcher absolute = ABSOLUTE_LINK_PATTERN.matcher(str);
        if (absolute.find()) {
            return absolute.group();
        }
        Matcher relative = ROOT_RELATIVE_LINK_PATTERN.matcher(str);
        if (relative.find()) {
            try {
                // Resolve against the page URL so "/a.html" maps to scheme://host/a.html
                // instead of being appended to the page's own path.
                return new URL(new URL(htmlUrl), relative.group(1)).toString();
            } catch (MalformedURLException e) {
                return null;
            }
        }
        return null;
    }

    /**
     * Scrapes a page and saves every link found into the {@code Scrapyurl} table.
     *
     * <p>Each link is printed to standard output before it is inserted. Database
     * failures are logged, not thrown.
     *
     * @param hrefUrl URL of the page to scrape
     */
    public static void saveUrlList(String hrefUrl) {
        List<String> links = parserHtml(hrefUrl);
        if (links.isEmpty()) {
            return;
        }
        try {
            // Explicit load keeps pre-JDBC-4 drivers working; newer drivers self-register.
            Class.forName(JDBC_DRIVER);
            try (Connection con = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
                 PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
                for (String link : links) {
                    System.out.println(link);
                    // Bound parameter: scraped text is untrusted and must not be spliced into SQL.
                    insert.setString(1, link);
                    insert.executeUpdate();
                }
            }
        } catch (ClassNotFoundException | SQLException e) {
            LOGGER.log(Level.SEVERE, "Failed to save links scraped from " + hrefUrl, e);
        }
    }

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} overrides the default start page
     */
    public static void main(String[] args) {
        String startUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_START_URL;
        saveUrlList(startUrl);
    }

    /** Maps a Content-Type value to a usable charset, falling back to UTF-8. */
    private static Charset resolveCharset(String contentType) {
        String name = getCharset(contentType);
        if (name == null) {
            return StandardCharsets.UTF_8;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return StandardCharsets.UTF_8;
        }
    }
}