Playing with Hibernate (2): hibernate-spider, a spider crawler
Create a new project and import the previously created libraries.
Create the Hibernate configuration file, hibernate.cfg.xml.
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE hibernate-configuration PUBLIC
        "-//Hibernate/Hibernate Configuration DTD 3.0//EN"
        "http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">
<!-- NOTE(review): the <hibernate-configuration>/<session-factory> body
     (JDBC connection settings, dialect, and the edNews.hbm.xml mapping
     resource) is not shown in the original snippet and must be filled in
     before cfg.configure() in MainTest can work. -->
Create a new 'hibernateSpider' package: open HibernateSpider -> right-click src -> New -> Package.
Create a new 'edNews' class: HibernateSpider -> src -> hibernateSpider -> New -> Class.
Public class edNews {private int id; private String ednews; public int getId () {return id;} public void setId (int id) {this. id = id;} public String getNews () {return ednews;} public void setNews (news ednews) {this. ednews = ednews. ednews ;}}EdNews
Create an edNews.hbm.xml mapping file for the new class (it must be in the same package as edNews).
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE hibernate-mapping PUBLIC
        "-//Hibernate/Hibernate Mapping DTD 3.0//EN"
        "http://hibernate.sourceforge.net/hibernate-mapping-3.0.dtd">
<!-- Maps the edNews entity (id + announcement text) to the database.
     NOTE(review): the original snippet contained only the prolog/DOCTYPE;
     the mapping body below is reconstructed from the edNews class and
     should be verified (table/column names in particular). -->
<hibernate-mapping package="hibernateSpider">
    <class name="edNews">
        <id name="id" type="int">
            <generator class="native"/>
        </id>
        <!-- property name "news" matches the getNews()/setNews() accessor pair -->
        <property name="news" type="string"/>
    </class>
</hibernate-mapping>
Create a news class (used for display).
/**
 * Simple display/transfer object holding one crawled announcement.
 *
 * NOTE(review): by Java convention the class would be named News, but the
 * name is kept as-is because Spider, edNews, and MainTest reference "news".
 */
public class news {

    // Announcement text; public because Spider and edNews access it directly.
    public String ednews;

    /** Initializes the announcement text to the empty string. */
    public news() {
        ednews = "";
    }

    /** Renders the announcement for console output, one per line. */
    @Override
    public String toString() {
        return "Announcement:" + ednews + "\n";
    }
}
Create a new Spider class, which implements the crawler code.
package hibernateSpider;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler: downloads a page over HTTP and extracts announcement
 * titles from its HTML with a regular expression.
 */
public class Spider {

    // Site-specific pattern for announcement links; group(1) is the title.
    // Compiled once instead of on every GetNews call.
    // NOTE(review): "\\d000042.16.shtml" was reconstructed from a garbled
    // source; verify the real URL pattern against the target site's HTML.
    private static final Pattern NEWS_PATTERN =
            Pattern.compile("ggtz/\\d000042.16.shtml.+?>(.+?)<");

    /**
     * Sends an HTTP GET request to the given URL and returns the response
     * body as a single string (line breaks dropped). On any failure the
     * error is logged and whatever was read so far is returned.
     *
     * @param url the page address to fetch
     * @return the page content, possibly empty if the request failed
     */
    public static String SendGet(String url) {
        // StringBuilder instead of repeated String concatenation in the loop.
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Turn the string into a URL object and open a connection to it.
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // Read the response as UTF-8 text.
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("An exception occurred while sending the GET request! " + e);
            e.printStackTrace();
        } finally {
            // Always close the input stream, even after a failure.
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Extracts every announcement title from the given page content.
     *
     * @param content raw HTML of the crawled page
     * @return one news object per match, in page order (empty list if none)
     */
    public static ArrayList<news> GetNews(String content) {
        ArrayList<news> results = new ArrayList<news>();
        // Matches the announcement title.
        Matcher questionMatcher = NEWS_PATTERN.matcher(content);
        // Matches the announcement link.
        // NOTE(review): urlMatcher's groups are never read; it only gates the
        // loop. Kept to preserve the original behavior.
        Matcher urlMatcher = NEWS_PATTERN.matcher(content);

        // Both the title and the link must match for a result to be kept.
        boolean isFind = questionMatcher.find() && urlMatcher.find();
        while (isFind) {
            // One news (announcement) object per captured title.
            news newsTemp = new news();
            newsTemp.ednews = questionMatcher.group(1);
            results.add(newsTemp);
            // Advance to the next pair of matches.
            isFind = questionMatcher.find() && urlMatcher.find();
        }
        return results;
    }
}
Finally, test the result.
1 public class MainTest {2 3 4 public static void main (String [] args) {5 6 // define the link to be accessed 7 8 String url = "http://jwc.gdut.edu.cn /"; 9 // access the link and obtain the page content 10 String content = Spider. sendGet (url); 11 // obtain all proposition objects on the page 12 ArrayList <news> myNews = Spider. getNews (content); 13 // print the result 14 for (int I = 0; I <myNews. size (); I ++) {15 System. out. println (myNews. get (I); 16 17 edNews aNew = new edNews (); // create a class object to be stored and set some attributes of the object 18 aNew. setId (I); 19 aNew. setNews (myNews. get (I); 20 21 {22 // Configuration is mainly used to read the Configuration file 23 Configuration cfg = new Configuration (); 24 SessionFactory sf = cfg. configure (). buildSessionFactory (); 25 // buildSessionFactory (); get a workshop for creating a Session 26 Session ss = sf. openSession (); 27 ss. beginTransaction (); // OK, put the operation into the transaction 28 ss. save (aNew); // save your object 29 ss. getTransaction (). commit (); // get the transaction and submit 30 31 ss. close (); // Session closes 32 sf. close (); // factory close 33 34 35} 36} 37} 38}MainTest