// Learning Java regular expressions: crawling web pages and parsing parts of their HTML
package com.xiaofeng.picup;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls a page and extracts its title and other parts of its content with
 * regular expressions.
 *
 * <p>This is test code: the URL to crawl is supplied by the caller. It can be
 * extended to crawl the whole content of a site automatically.
 *
 * <p>All extraction methods work on the raw HTML text; they do not parse the
 * document, so they only recognise markup that literally matches the patterns
 * declared below (tag names are matched case-sensitively).
 */
public class webcontent {

    /** A {@code <title>} element without attributes; {@code .} does not cross line breaks. */
    private static final Pattern TITLE =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /**
     * An {@code <a ... href=...>...</a>} element whose href value is
     * double-quoted, single-quoted or unquoted.
     */
    private static final Pattern LINK = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.DOTALL);

    /** A {@code <script>} element including its body, which may span lines. */
    private static final Pattern SCRIPT =
            Pattern.compile("<script.*?</script>", Pattern.DOTALL);

    /** A {@code <style>} element including its body, which may span lines. */
    private static final Pattern STYLE =
            Pattern.compile("<style.*?</style>", Pattern.DOTALL);

    /** Any single tag that opens and closes on the same line. */
    private static final Pattern TAG = Pattern.compile("<.*?>");

    /**
     * Reads the full content of a web page.
     *
     * <p>The page is decoded as UTF-8 and read line by line; the lines are
     * concatenated without their line terminators.
     *
     * @param htmlurl the URL of the page to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL; a
     *         hint is printed to standard output before the exception is rethrown
     * @throws IOException if the page cannot be opened or read
     */
    public String getonehtml(String htmlurl) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when reading fails part-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(htmlurl).openStream(), StandardCharsets.UTF_8))) {
            String temp;
            // Read the entire contents of the page.
            while ((temp = in.readLine()) != null) {
                sb.append(temp);
            }
        } catch (MalformedURLException me) {
            System.out.println("The URL format you entered has a problem! Please carefully enter ");
            throw me;
        }
        return sb.toString();
    }

    /**
     * Gets the page title.
     *
     * <p>Every {@code <title>...</title>} match is concatenated in order of
     * appearance and all tags are then stripped from the result.
     *
     * @param s the HTML text to search
     * @return the title text, or an empty string when no title element matches
     */
    public String GetTitle(String s) {
        StringBuilder title = new StringBuilder();
        Matcher ma = TITLE.matcher(s);
        while (ma.find()) {
            title.append(ma.group());
        }
        return Outtag(title.toString());
    }

    /**
     * Gets the links.
     *
     * @param s the HTML text to search
     * @return every complete {@code <a ... href=...>...</a>} element found, in
     *         order of appearance; empty when there is none
     */
    public List<String> getlink(String s) {
        return findAll(LINK, s);
    }

    /**
     * Gets the script code.
     *
     * @param s the HTML text to search
     * @return every complete {@code <script>...</script>} element found, in
     *         order of appearance; empty when there is none
     */
    public List<String> Getscript(String s) {
        return findAll(SCRIPT, s);
    }

    /**
     * Gets the CSS.
     *
     * @param s the HTML text to search
     * @return every complete {@code <style>...</style>} element found, in
     *         order of appearance; empty when there is none
     */
    public List<String> getcss(String s) {
        return findAll(STYLE, s);
    }

    /**
     * Removes tags.
     *
     * @param s the text to clean
     * @return {@code s} with every {@code <...>} tag removed
     */
    public String Outtag(String s) {
        return TAG.matcher(s).replaceAll("");
    }

    /** Collects the full text of every match of {@code pattern} in {@code s}. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> list = new ArrayList<String>();
        Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }
}