標籤:
資料庫建立指令碼:
/*Navicat MySQL Data TransferSource Server : localhostSource Server Version : 50621Source Host : localhost:3306Source Database : cmm-dbTarget Server Type : MYSQLTarget Server Version : 50621File Encoding : 65001Date: 2015-05-25 19:05:58*/SET FOREIGN_KEY_CHECKS=0;-- ------------------------------ Table structure for `region`-- ----------------------------DROP TABLE IF EXISTS `region`;CREATE TABLE `region` ( `area_code` varchar(50) NOT NULL COMMENT ‘地區編碼‘, `area_name` varchar(50) NOT NULL COMMENT ‘地區名稱‘, `parent_code` varchar(50) NOT NULL COMMENT ‘地區父編碼‘, `place_order` varchar(50) NOT NULL DEFAULT ‘0‘ COMMENT ‘顯示順序‘, PRIMARY KEY (`area_code`)) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;-- ------------------------------ Records of region-- ----------------------------
maven依賴:
<dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.18</version></dependency>
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.8.2</version></dependency>
原始碼:
import java.io.FileWriter;import java.io.IOException;import java.sql.Connection;import java.sql.DriverManager;import java.sql.PreparedStatement;import java.sql.SQLException;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class JsoupRegion {/** 整數 */private static final String V_INTEGER = "^-?[1-9]\\d*$";class Region {public String getCode() {return code;}public void setCode(String code) {this.code = code;}public String getName() {return name;}public void setName(String name) {this.name = name;}public int getParentId() {return parenId;}public void setParentId(int parenId) {this.parenId = parenId;}public String getType() {return type;}public void setType(String type) {this.type = type;}private String code;private String name;private String type;// 鄉鎮類型private int parenId;}/** * @說明: url2Document * @param @param url * @param @return * @param @throws IOException * @return Document * @throws */public static Document url2Doc(String url) throws IOException {// 此種方式403// return Jsoup.connect(url).get();// return Jsoup.connect(url).timeout(600 * 1000)// .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36").get();//return Jsoup.connect(url).timeout(600 * 1000).get();}/** * 驗證是不是整數 * * @param value * 要驗證的字串 要驗證的字串 * @return 如果是符合格式的字串,返回 <b>true </b>,否則為 <b>false </b> */public static boolean Integer(String value) {return match(V_INTEGER, value);}/** * @param regex * Regex字串 * @param str * 要匹配的字串 * @return 如果str 符合 regex的Regex格式,返回true, 否則返回 false; */private static boolean match(String regex, String str) {Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(str);return matcher.matches();}/** * 追加檔案:使用FileWriter * * @param fileName * @param content */public static void appendFile(String fileName, String content) {FileWriter writer = null;try {// 開啟一個寫檔案器,建構函式中的第二個參數true表示以追加形式寫檔案writer = new FileWriter(fileName, true);writer.write(content);} catch (IOException e) {e.printStackTrace();} finally {try {if (writer != null) {writer.close();}} catch (IOException e) {e.printStackTrace();}}}/** * @說明: 擷取省份 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */private static List<Region> getProvince(String url) throws IOException {List<Region> list = new ArrayList<Region>();Document doc = url2Doc(url);Elements proviceTr = doc.getElementsByAttributeValue("class", "provincetr");// 通過css擷取trfor (Element e : proviceTr) {Elements tds = e.select("a[href]");for (Element element : tds) {Region region = new JsoupRegion().new Region();// region.setCode("13");region.setCode(element.attr("href").substring(0, 2));region.setName(element.text().replaceAll("<br />", ""));region.setType("");region.setParentId(0);list.add(region);}}return list;}/** * @說明: 擷取省份下的市 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */private static List<Region> getCity(String url) throws IOException {List<Region> list = new ArrayList<Region>();Document doc = url2Doc(url);Elements proviceTr = doc.getElementsByAttributeValue("class", "citytr");// 通過css擷取trfor (Element e : proviceTr) {Elements tds = e.select("a[href]");for (Element element : tds) {if (Integer(element.text())) {continue;}Region region = new JsoupRegion().new Region();String code = element.attr("href").substring(3, 7);region.setCode(code);region.setName(element.text());region.setParentId(Integer.valueOf(code.substring(0, 2)));region.setType("");list.add(region);}}return list;}/** * @說明: 縣 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */private static List<Region> getCounty(String url) throws IOException {List<Region> list = new ArrayList<Region>();Document doc = url2Doc(url);Elements proviceTr = doc.getElementsByAttributeValue("class", "countytr");// 通過css擷取trfor (Element e : proviceTr) {Elements tds = e.select("a[href]");for (Element element : tds) {if (Integer(element.text())) {continue;}Region region = new JsoupRegion().new Region();String code = element.attr("href").substring(3, 9);region.setCode(code);region.setName(element.text());region.setParentId(Integer.valueOf(code.substring(0, 4)));region.setType("");list.add(region);}}return list;}/** * @說明: 鎮 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */private static List<Region> getTown(String url) throws IOException {List<Region> list = new ArrayList<Region>();Document doc = url2Doc(url);Elements proviceTr = doc.getElementsByAttributeValue("class", "towntr");// 通過css擷取trfor (Element e : proviceTr) {Elements tds = e.select("a[href]");for (Element element : tds) {if (Integer(element.text())) {continue;}Region region = new JsoupRegion().new Region();String code = element.attr("href").substring(3, 12);region.setCode(code);region.setName(element.text());region.setParentId(Integer.valueOf(code.substring(0, 6)));region.setType("");list.add(region);}}return list;}/** * @說明: 村 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */private static List<Region> getVillage(String url) throws IOException {List<Region> list = new ArrayList<Region>();Document doc = url2Doc(url);Elements proviceTr = doc.getElementsByAttributeValue("class", "villagetr");// 通過css擷取trfor (Element e : proviceTr) {Elements trs = e.select("tr");for (Element element : trs) {Elements tds = element.select("td");Region region = new JsoupRegion().new Region();for (Element element2 : tds) {String value = element2.text();if (Integer(value) && value.length() == 3) {region.setType(element2.text());}if (Integer(value) && value.length() > 3) {region.setCode(value);region.setParentId(Integer.valueOf(value.substring(0, 9)));} else {region.setName(value);}}list.add(region);}}return list;}public static void insertDb() {}/** * @Description: * @param * @return Connection 傳回型別 * @throws */private static Connection connection() throws ClassNotFoundException, SQLException {// 調用Class.forName()方法載入驅動程式Class.forName("com.mysql.jdbc.Driver");String url = "jdbc:mysql://localhost:3306/cmm-db?useUnicode=true&characterEncoding=utf-8&autoReconnect=true"; // JDBC的URLConnection conn = DriverManager.getConnection(url, "root", "root");return conn;}public static void insertRegion(Region region) throws ClassNotFoundException, SQLException {Connection conn = connection();conn.setAutoCommit(false);// 插入資料的代碼String sql2 = "insert into region(area_code,area_name,parent_code,place_order) values(?,?,?,?) ";PreparedStatement pst = conn.prepareStatement(sql2);pst.setString(1, region.getCode());pst.setString(2, region.getName());pst.setString(3, region.getParentId()+"");pst.setString(4, region.getCode());pst.addBatch();// 執行批次更新pst.executeBatch();// 語句執行完畢,提交本事務conn.commit();pst.close();conn.close();}public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/";List<Region> all = new ArrayList<Region>();List<Region> province = getProvince(url + "/index.html");all.addAll(province);List<String> done = new ArrayList<String>();// 用來存已經抓取過的省份for (Region regionProvince : province) {// 遍曆省if (done.contains(regionProvince.getCode())) {continue;}insertRegion(regionProvince);System.out.println(regionProvince.getCode() + regionProvince.getName());List<Region> city = getCity(url + regionProvince.getCode() + ".html");for (Region regionCity : city) {// 遍曆市insertRegion(regionCity);System.out.println(regionCity.getCode() + "||" + regionCity.getName());List<Region> county = getCounty(url + regionProvince.getCode() + "/" + regionCity.getCode() + ".html");all.addAll(county);for (Region regionCounty : county) {// 遍曆縣insertRegion(regionCounty);System.out.println(regionCounty.getCode() + "||" + regionCounty.getName());List<Region> town = getTown(url + regionProvince.getCode() + "/" + regionCity.getCode().substring(2, 4) + "/" + regionCounty.getCode() + ".html");all.addAll(town);for (Region regionTown : town) {// 遍曆鎮insertRegion(regionTown);System.out.println(regionTown.getCode() + "||" + regionTown.getName());List<Region> village = getVillage(url + regionProvince.getCode() + "/" + regionCity.getCode().substring(2, 4) + "/" + regionCounty.getCode().substring(4, 6) + "/"+ regionTown.getCode() + ".html");all.addAll(village);for (Region regionVillage : village) {// 遍曆村insertRegion(regionVillage);System.out.println(regionVillage.getCode() + "||" + regionVillage.getName());}}}}}}}
抓取國家統計局2013年全國行政組織機構並插入mysql