// Code listing follows:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.*;
/*
Gets the URL by constructing a regular expression based on the specified rules
*/
/**
 * Minimal link scraper: downloads one page, cuts out the region between two
 * marker strings, and prints every anchor found in that region together with
 * its title text and its {@code href} value.
 *
 * <p>Naming note: the {@code getXxx(String)} methods below are setters despite
 * their names (each one only stores its argument). The names are kept so that
 * existing callers keep working.
 */
public class Urls
{
    /** Default acquisition rule: any anchor element, matched non-greedily. */
    private static final String DEFAULT_REGEX = "<a.*?/a>";
    // Alternative rule left by the original author: "http://.*?>". It is not used
    // because its matches contain neither "href=" nor "</a>", which the two
    // sub-patterns below rely on.

    /** Extracts the link text, i.e. everything from a '>' up to the closing tag. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</a>");

    /** Extracts the href attribute, i.e. everything from "href=" up to the next '>'. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=.*?>");

    private String startUrl;                    // URL at which collection starts
    String urlContent;                          // page text fetched from startUrl
    String contentArea;                         // part of urlContent between the two markers
    private String strAreaBegin, strAreaEnd;    // markers delimiting the collection area
    private String stringInUrl, stringNotInUrl; // keyword filters; stored but not applied yet
    String strContent;                          // collected content (not used yet)
    String[] allUrls;                           // all collected URLs (not used yet)
    private String regex = DEFAULT_REGEX;       // acquisition rule used by Urls()
    UrlAndTitle urlAndTitle = new UrlAndTitle(); // holder for one URL/title pair (not used yet)

    /**
     * Demo entry point: fetches a fixed page, extracts the body region and
     * prints the links found in it followed by the collected state.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        Urls myUrl = new Urls("<body", "/body>");
        myUrl.getStartUrl("http://www.zuzwn.com/");
        myUrl.getUrlContent();
        myUrl.getContentArea();
        myUrl.getStringNotInUrl("Google");
        myUrl.Urls();
        System.out.println("StartURL:" + myUrl.startUrl);
        System.out.println("urlcontent:" + myUrl.urlContent);
        System.out.println("Contentarea:" + myUrl.contentArea);
    }

    /**
     * Creates a scraper for the region delimited by the two markers.
     *
     * @param strAreaBegin text that precedes the collection area
     * @param strAreaEnd   text that follows the collection area
     */
    public Urls(String strAreaBegin, String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every match of the acquisition rule inside {@link #contentArea},
     * followed by its title(s) and URL(s), then a total count. A {@code null}
     * content area is treated as empty so the method never throws on a page
     * that could not be fetched.
     */
    public void Urls()
    {
        int count = 0;
        String area = (contentArea == null) ? "" : contentArea;
        Matcher anchors = Pattern.compile(regex).matcher(area);
        while (anchors.find())
        {
            String anchor = anchors.group();
            System.out.println(anchor);
            count++;

            // Title: strip the leading '>' and the trailing "</a>".
            Matcher title = TITLE_PATTERN.matcher(anchor);
            while (title.find())
            {
                System.out.println("title:" + title.group().replaceAll(">|</a>", ""));
            }

            // URL: strip "href=" and the closing '>'; surrounding quotes are kept.
            Matcher href = HREF_PATTERN.matcher(anchor);
            while (href.find())
            {
                System.out.println("URL:" + href.group().replaceAll("href=|>", ""));
            }
            System.out.println();
        }
        System.out.println("A total of" + count + "consistent results");
    }

    /**
     * Stores the URL at which collection starts (setter, despite the name).
     *
     * @param startUrl page address to download in {@link #getUrlContent()}
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the start URL into {@link #urlContent}. Lines are concatenated
     * without separators, exactly as {@code readLine()} returns them. On failure
     * a message and the stack trace are printed and {@code urlContent} keeps its
     * previous value.
     */
    public void getUrlContent()
    {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the stream even when reading fails.
        // NOTE(review): the page is decoded as UTF-8; confirm for non-UTF-8 sites.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(startUrl).openStream(), StandardCharsets.UTF_8)))
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                page.append(line);
            }
            urlContent = page.toString();
        }
        catch (IOException e)
        {
            System.out.println("Web site file failed to output");
            e.printStackTrace();
        }
    }

    /**
     * Copies the text between the begin and end markers of {@link #urlContent}
     * into {@link #contentArea}. The area becomes the empty string when the page
     * has not been loaded or either marker is missing.
     */
    public void getContentArea()
    {
        contentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null)
        {
            return;
        }
        int begin = urlContent.indexOf(strAreaBegin);
        if (begin < 0)
        {
            return;
        }
        int from = begin + strAreaBegin.length();
        int to = urlContent.indexOf(strAreaEnd, from);
        if (to < 0)
        {
            return;
        }
        contentArea = urlContent.substring(from, to);
    }

    // The next two methods store the keyword a URL should contain and the keyword
    // it must not contain. Only a preliminary experiment: nothing reads them yet,
    // and later each should accept more than one keyword.

    /**
     * Stores the keyword a collected URL should contain (setter, despite the name).
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword a collected URL must not contain (setter, despite the name).
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for URL collection; intentionally empty in this version. */
    public void getUrl()
    {
    }

    /**
     * Returns the acquisition rule.
     *
     * @return the regular expression that {@link #Urls()} matches against the content area
     */
    public String getRegex()
    {
        return regex;
    }

    /** Simple holder pairing a URL with its link title. */
    class UrlAndTitle
    {
        String myUrl;
        String title;
    }
}