java抓取某網站上的醫院資訊

來源:互聯網
上載者:User

抓取某網站上的醫院資訊,幫一位同學寫的,完全是現學現賣,使用jsoup解析返回的HTML代碼

HttpRequestProxy.java

import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.URL;import java.util.ArrayList;import java.util.List;import java.util.Scanner;import java.util.regex.Matcher;import java.util.regex.Pattern;import java.io.FileOutputStream;import java.io.OutputStreamWriter;import org.htmlparser.util.ParserException;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class HttpRequestProxy {private static List<MZinfo> mzinfos = new ArrayList<MZinfo>();private static List<MZinfo> levelinfo = new ArrayList<MZinfo>();private static List<MZinfo> cityinfo = new ArrayList<MZinfo>();public static String getWebContent(String urlString, final String charset,int timeout) throws IOException {if (urlString == null || urlString.length() == 0) {return null;}urlString = (urlString.startsWith("http://") || urlString.startsWith("https://")) ? 
urlString : ("http://" + urlString).intern();URL url = new URL(urlString);HttpURLConnection conn = (HttpURLConnection) url.openConnection();conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");conn.setRequestProperty("Accept", "text/html");conn.setConnectTimeout(timeout);try {if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {return null;}} catch (IOException e) {e.printStackTrace();return null;}InputStream input = conn.getInputStream();BufferedReader reader = new BufferedReader(new InputStreamReader(input,charset));String line = null;StringBuffer sb = new StringBuffer();while ((line = reader.readLine()) != null) {sb.append(line).append("\r\n");}if (reader != null) {reader.close();}if (conn != null) {conn.disconnect();}return sb.toString();}public static String getWebContent(String urlString) throws IOException {return getWebContent(urlString, "iso-8859-1", 5000);}public static void getHospitolInfo(String url) {int pagesSum =1;String path = "./result_"+System.currentTimeMillis()+".csv";String s = null;try {s = getWebContent(url);s = new String(s.getBytes("iso-8859-1"), "utf8");} catch (IOException e1) {// TODO Auto-generated catch blocke1.printStackTrace();}Document doc = Jsoup.parse(s);Elements pageEle = doc.select("div[class=page]>a");if(pageEle!=null&&!"".equals(pageEle.text().trim())){for(Element pages:pageEle){if("最後一頁".equals(pages.text())){String lastUrl=pages.attr("href");int a=lastUrl.lastIndexOf(".");String str=lastUrl.substring(a-3, a);String regEx="[^0-9]";   Pattern p = Pattern.compile(regEx);   Matcher m = p.matcher(str);   pagesSum= Integer.parseInt(m.replaceAll("").trim());System.out.println("數據頁數:"+pagesSum);}}}FileOutputStream fos=null;OutputStreamWriter osw=null;BufferedWriter fw=null;try {fos = new FileOutputStream(path);osw = new OutputStreamWriter(fos, "GBK");fw = new BufferedWriter(osw);for (int i = 1; i <= pagesSum; i++) 
{System.out.println("當前正在處理第"+i+"頁的數據");String fir = url.substring(0, url.lastIndexOf(".") - 1);s= getWebContent(fir + i + ".html");s = new String(s.getBytes("iso-8859-1"), "utf8");doc = Jsoup.parse(s);Elements ele = doc.select("div[class=part-list]");for (Element element : ele) {String title = element.select("h4 > a").text();String keshi = element.select("p > a[target=_self]").text();String dengji= element.select("h4").text();int index=dengji.lastIndexOf("(");int last=dengji.lastIndexOf(")");//System.out.println(keshi + "\t\t" + title);fw.write(title+","+keshi+","+dengji.substring(index+1, last)+"\n");}}} catch (Exception e) {e.printStackTrace();} finally {try {fw.close();osw.close();fos.close();} catch (IOException e) {// TODO Auto-generated catch blockSystem.out.println("IOException");}}}public static void main(String[] args) throws IOException, ParserException {while(true){// 初始化連結資料 System.out.println("擷取醫院分級資訊"); getHostLevel(); System.out.println("分級資訊擷取完畢。請輸入分級編號:"); Scanner inputLevel = new Scanner(System.in); int levelBianhao=inputLevel.nextInt(); System.out.println("開始初始化地區資料...."); MZinfo mzinfoLevel=levelinfo.get(levelBianhao);//getHospitolInfo(mzinfoLevel.getUrl()); String s=getWebContent(mzinfoLevel.getUrl()); getNative(s); System.out.println("請輸入地區編號:"); Scanner input = new Scanner(System.in); int bianhao=input.nextInt(); System.out.println("是否按照城市顯示醫院資訊[Y/N]"); Scanner inputYn = new Scanner(System.in); String flag=inputYn.next(); MZinfo mzinfo=mzinfos.get(bianhao); if("Y".endsWith(flag)){ System.out.println("開始擷取城市資訊"); getCityHospital(mzinfo); System.out.println("城市資訊擷取成功,請輸入城市編號"); Scanner inputcity = new Scanner(System.in); int cityNum=inputcity.nextInt(); mzinfo=cityinfo.get(cityNum); } System.out.println("開始抓取資訊:"); getHospitolInfo(mzinfo.getUrl()); System.out.println("抓取資訊成功,是否繼續[Y/N]"); Scanner inputYN = new Scanner(System.in); String flag2=inputYN.next(); if("N".equals(flag2)){ break; } Runtime.getRuntime().exec( "cmd   cls ");}}private 
static void getCityHospital(MZinfo mzinfo) throws IOException {String s = getWebContent(mzinfo.getUrl());s = new String(s.getBytes("iso-8859-1"), "utf8");Document doc = Jsoup.parse(s);Elements sf= doc.select("div[class=find-hospital]>h4");for(Element el:sf){if(mzinfo.getDiqu().equals(el.select("h4>a").text())){Elements ele = el.select(" h4 > div > ul >li");//System.out.println(mzinfo.getDiqu());for(int i=0;i<ele.size();i++){Element element=ele.get(i);String url = element.select("a").attr("href");String name = element.select("a").text();MZinfo mZinfo2=new MZinfo();mZinfo2.setDiqu(name);mZinfo2.setUrl(url);cityinfo.add(mZinfo2);System.out.println("       "+i+":"+name);}}}}private static  void getNative(String s) throws IOException {s = new String(s.getBytes("iso-8859-1"), "utf8");Document doc = Jsoup.parse(s);Elements ele = doc.select("div[class=find-hospital]>h4");// 迴圈省for (int i = 0; i < ele.size(); i++) {Element element = ele.get(i);Elements ele1 = element.select("h4>a");MZinfo mZinfo = new MZinfo();String diqu = ele1.text();mZinfo.setDiqu(diqu);mZinfo.setUrl(ele1.attr("href").toString());mzinfos.add(mZinfo);System.out.println(i + ":" + diqu);}}private static void getHostLevel() throws IOException {// TODO Auto-generated method stubString s = getWebContent("http://hospital.qqyy.com/list-p110000c0a110108k0v1r0d0n0.html");s = new String(s.getBytes("iso-8859-1"), "utf8");Document doc = Jsoup.parse(s);// <h4 class='show' id='proset' >Elements ele = doc.select("div[class=find-departments-tab tab2]>span>a");// 迴圈分級for (int i = 0; i < ele.size(); i++) {Element element = ele.get(i);System.out.println(i+":"+element.text());MZinfo mZinfo = new MZinfo();mZinfo.setDiqu(element.text());mZinfo.setUrl(element.attr("href").toString());levelinfo.add(mZinfo);}}}

MZinfo.java

/**
 * Simple mutable holder pairing a display name with the link it points to.
 * Both properties start out as {@code null} until a setter is called.
 */
public class MZinfo {

    /** Display name of the entry. */
    private String diqu;

    /** Link associated with the entry. */
    private String url;

    /** @return the stored display name, or {@code null} if none was set */
    public String getDiqu() {
        return this.diqu;
    }

    /** @return the stored link, or {@code null} if none was set */
    public String getUrl() {
        return this.url;
    }

    /** @param diqu display name to store */
    public void setDiqu(String diqu) {
        this.diqu = diqu;
    }

    /** @param url link to store */
    public void setUrl(String url) {
        this.url = url;
    }
}

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關。如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件後我們將在 5 個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.