Chinese Online Handbook
Download Address
Jsoup is a Java HTML parser that makes it easy to fetch and parse HTML documents.
1. Get title:
public class Testjsoup {public
static void Main (string[] args) {
Document doc;
try {
//Get document
doc = Jsoup.connect ("http://news.ifeng.com/").
Gets the body
Element content = Doc.body ();
Tag collection, is the title and link per page
Elements divs = Content.select ("div");
for (int i = 0; i < divs.size (); i++) {
//Today's Headline News
if ("box_02". Equals (Divs.get (i). attributes (). Get ("class")) {
Elements titles = Divs.get (i). Select ("a");
for (Element title:titles) {
if (""!=title.text ()) {
System.out.println (Title.text ());
Link
System.out.printlin (title.select ("a"). attr ("href");
Exception e) {
//TODO auto-generated catch block
e.printstacktrace ();
}
}
}
Results:
2. Get news content:
public static void Main (string[] args) {
Document doc;
try {
//get
doc = Jsoup
. Connect (
"http://news.ifeng.com/world/special/malaixiyakejishilian/ Content-4/detail_2014_03/30/35274873_0.shtml ")
. Get ();
Gets
the body Element content = Doc.getelementbyid ("main_content");
Tag collection, is the title and link of each page
Elements divs = Content.select ("P");
for (Element ele:divs) {
//If the label is a picture address
if ("Detailpic". Equals (Ele.attributes (). Get ("class")) {
System.out.println (Ele.select ("img"). attr ("src"));
else {
System.out.println (Ele.text () + "\ n");} \
catch (Exception e) {
//TODO Auto-generated Catch block
e.printstacktrace ();
}
Results:
3. Get video information:
Some news articles embed a video. To get the video information, we need to parse the parameters defined inside the paragraph's script element.
// Video: the parameters are defined inside the paragraph's script element.
// (`p` is the current paragraph Element and `s` an Elements variable declared by the
// enclosing code.)
s = p.select("script");
if (!s.isEmpty()) {
    // Convert the argument list to text
    String data = s.first().data();
    data = data.replaceAll("var", "");
    String[] arr;
    arr = data.split("',");
    for (int i = 0; i < arr.length; i++) {
        // NOTE(review): the declaration of `str` was commented out in the garbled copy while
        // the two statements below were not; restored so the loop compiles - confirm whether
        // this first pass was meant to be active.
        String str = arr[i];
        str = str.replaceAll("flash_[\\s]{0,} = '", "");
        System.out.println(str.trim());
    }
    arr = data.split(",");
    for (String str : arr) {
        String[] ir = str.split("=");
        // Skip pieces without a "name = value" shape instead of failing on ir[1].
        if (ir.length < 2) {
            continue;
        }
        System.out.println(ir[0].trim() + ";" + ir[1].trim());
    }
}
Results:
Complete Example:
public class Artcle {private String artlink;
Private Document Doc;
Private Element ele;
Private Elements eles;
"http://news.qq.com/a/20141001/018175.htm?tu_biz=1.114.2.1" Public artcle (String str) {this.artlink = str;
Public Document Getdoc (String web) {Document doc = null;
try {doc = Jsoup.connect (web). get ();
catch (IOException e) {//TODO auto-generated catch block E.printstacktrace ();
return null;
return doc;
public void Getartcle () {if (null = = Getdoc (artlink))) {return;
} ele = Doc.select ("Div#cnt-main-article-qq").
Eles = Ele.select ("P");
for (Element p:eles) {Elements s = null;
Video s = p.select ("script");
if (!s.isempty ()) {//Convert argument list to text String data = S.first (). data ();
data = Data.replaceall ("var", "");
String[] arr;
arr= data.split ("',");
for (int i = 0; i < arr.length i++) {//String str = arr[i]; str = Str.replaceall ("flash_[\\s]{0,} = ' "," ");
System.out.println (Str.trim ());
} arr = Data.split (",");
for (String Str:arr) {string[] ir = str.split ("=");
System.out.println (Ir[0].trim () + ";" + Ir[1].trim ()); }
}
}
}
}
Attention:
1. The API is very simple to use, but fetching a web page can throw an IOException, so make sure you catch and handle it.
2. // Convert the argument list to text
String data = s.first().data();
This step is important: it converts the parameter list inside the script into a string, and we can then extract the information we need with ordinary string operations.