Java微博搜尋索引鍵採集

來源:互聯網
上載者:User

標籤:

import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.ObjectInputStream;import java.io.UnsupportedEncodingException;import java.net.MalformedURLException;import java.text.SimpleDateFormat;import java.util.List;import java.util.Random;import java.util.concurrent.Callable;import org.apache.http.client.CookieStore;import org.apache.log4j.Logger;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.html.HtmlPage;import com.gargoylesoftware.htmlunit.util.Cookie;public class SinaSearchCrawlerCommand implements Callable<Object> {    private static Logger logger = Logger.getLogger(SinaSearchCrawlerCommand.class);    private static String word="如家";    private static String cookiePath="E:\\學習\\微博爬蟲\\cookie\\cookie.file";    private static String outputpath="E:\\學習\\微博爬蟲\\";    //public Object call(){    public static void main(String[] args){        try {            word= java.net.URLEncoder.encode(word, "utf-8");        } catch (UnsupportedEncodingException e2) {            // TODO Auto-generated catch block            e2.printStackTrace();        }        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);        webClient.getCookieManager().setCookiesEnabled(true);        for(int i=1;i<=100;i++){        System.out.println(cookiePathAppendRandom());        File file = new File(cookiePathAppendRandom());        if (file.exists()) {            FileInputStream fin = null;            try {                fin = new FileInputStream(file);            } catch (FileNotFoundException e1) {                e1.printStackTrace();            }            CookieStore cookieStore = null;            ObjectInputStream in;            try {                in = new ObjectInputStream(fin);                cookieStore = 
(CookieStore) in.readObject();                in.close();            } catch (IOException e) {                logger.error(e);            } catch (ClassNotFoundException e) {                logger.error(e);            }            List<org.apache.http.cookie.Cookie> l = cookieStore.getCookies();            for (org.apache.http.cookie.Cookie temp : l) {                Cookie cookie = new Cookie(temp.getDomain(), temp.getName(),                        temp.getValue(), temp.getPath(), temp.getExpiryDate(),                        false);                webClient.getCookieManager().addCookie(cookie);            }            /*HtmlPage page = null;            try {                page = webClient.getPage("http://weibo.cn/search/?tf=5_012");            } catch (FailingHttpStatusCodeException e) {                logger.error(e);            } catch (MalformedURLException e) {                logger.error(e);            } catch (IOException e) {                logger.error(e);            }            HtmlForm form = page.getForms().get(0);            HtmlSubmitInput button = form.getInputByName("smblog");            form.getInputByName("keyword").setValueAttribute(word);            logger.info("search:" + word);            try {                page = button.click();            } catch (IOException e1) {                logger.error(e1);            }*/                        HtmlPage page = null;            try {                //logger.info("execution:"+this);                page = webClient.getPage("http://weibo.cn/search/mblog?hideSearchFrame=&keyword="+word+"&page="+i);            } catch (FailingHttpStatusCodeException e) {                logger.error(e);            } catch (MalformedURLException e) {                logger.error(e);            } catch (IOException e) {                logger.error(e);            }            SimpleDateFormat dayformat = new SimpleDateFormat("yyyyMMdd");            long start = System.currentTimeMillis();            start = 
System.currentTimeMillis();            String path = null;            File file2 = null;            path = new String(outputpath + "/" + dayformat.format(start)                    + "/" + System.currentTimeMillis() + file.getName()+".html" );            file2 = new File(outputpath + "/" + dayformat.format(start));            if (!file2.exists())                file2.mkdirs();            file2 = new File(path);            System.out.println("當前頁"+i+",採集至"+path);            if (file2.exists())                logger.warn("outfile exit!");            else {                FileOutputStream outputStream;                try {                    outputStream = new FileOutputStream(file2);                    outputStream.write(page.getWebResponse().getContentAsString().getBytes());                    outputStream.close();                } catch (FileNotFoundException e) {                    logger.error(e);                } catch (IOException e) {                    logger.error(e);                }            }            webClient.closeAllWindows();        } else {            logger.warn("CookiePath doesn`t exit !!!");        }                logger.info("execution:");        try {            Thread.sleep(10000);        } catch (InterruptedException e) {            logger.error(e);            return;        }        }        return;            }        private static String cookiePathAppendRandom() {        Random random = new Random();        return cookiePath+random.nextInt(7);    }        public SinaSearchCrawlerCommand(String word, String cookiePath, String outputpath) {        if(word.contains("&")) {            word = word.replace("&", " ");        }        this.word = word;        this.cookiePath = cookiePath;        this.outputpath = outputpath;    }    @Override    public String toString() {        return "SinaSearchCrawlerCommand [word=" + word + ", outputpath="                + outputpath + "]";    }    @Override    public Object call() throws Exception {       
 // TODO Auto-generated method stub        return null;    }}

 

Java微博搜尋索引鍵採集

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.