The crawler fetches the posts and returns them as plain text, like this:
Required packages: http://pan.baidu.com/s/1o69myOq
The code consists of two classes:
Weibocn.java
ImportJava.util.Set;Importcn.edu.hfut.dmic.webcollector.*;ImportOrg.openqa.selenium.Cookie;ImportOrg.openqa.selenium.WebElement;ImportOrg.openqa.selenium.htmlunit.HtmlUnitDriver; Public class weibocn { /** * Code is provided by Webcollector, if not used in webcollector, you need to import the selenium related jar package */ /** * Access to Sina Weibo cookie, this method is valid for weibo.cn, weibo.com Invalid * weibo.cn to transmit data in clear text, please use the trumpet * @param Username Sina Weibo user name * @param password sina Weibo password * @return * @throws Exception */ Public StaticStringGetsinacookie(string username, string password)throwsexception{StringBuilder sb =NewStringBuilder (); Htmlunitdriver Driver =NewHtmlunitdriver (); Driver.setjavascriptenabled (true); Driver.get ("http://login.weibo.cn/login/"); Webelement mobile = Driver.findelementbycssselector ("Input[name=mobile]"); Mobile.sendkeys (username); Webelement pass = Driver.findelementbycssselector ("Input[name^=password]"); Pass.sendkeys (password); Webelement rem = Driver.findelementbycssselector ("Input[name=remember]"); Rem.click (); Webelement Submit = Driver.findelementbycssselector ("Input[name=submit]"); Submit.click (); set<cookie> Cookieset = Driver.manage (). GetCookies (); Driver.close (); for(Cookie Cookie:cookieset) {Sb.append (Cookie.getname () +"="+cookie.getvalue () +";"); } String result=sb.tostring ();if(Result.contains ("GSID_CTANDWM")){returnResult }Else{Throw NewException ("Weibo login failed"); } }}
Weibocrawler.java
ImportOrg.jsoup.nodes.Element;Importorg.jsoup.select.Elements;ImportCn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;ImportCn.edu.hfut.dmic.webcollector.model.Links;ImportCn.edu.hfut.dmic.webcollector.model.Page;ImportCn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl; Public class weibocrawler extends deepcrawler{ Public Weibocrawler(String Crawlpath)throwsException {Super(Crawlpath);/ * For Sina Weibo cookies, account passwords are transmitted in clear text, please use the trumpet * /String Cookie=weibocn.getsinacookie ("Your Weibo account","Password"); Httprequesterimpl myrequester= (Httprequesterimpl) This. Gethttprequester (); Myrequester.setcookie (cookie); } PublicLinksvisitandgetnextlinks(Page page) {/ * Extract Micro-blog * /Elements weibos=page.getdoc (). Select ("DIV.C"); for(Element Weibo:weibos) {System.out.println (Weibo.text ()); }/ * If you want to crawl a comment, here you can extract the URL of the comment page, return * / return NULL; } Public Static void Main(string[] args)throwsexception{Weibocrawler crawler=NewWeibocrawler ("/home/hu/data/weibo"); Crawler.setthreads (3);/ * Crawl the top 5 pages of someone's Weibo * / for(intI=0;i<5; i++) {Crawler.addseed ("Http://weibo.cn/vipgcu?vt=4&page="+i); } crawler.start (1); }}
Author: Rodan (http://blog.csdn.net/sunyuan_software)
Personally tested: a very easy way to crawl Sina Weibo data with WebCollector (all required crawler packages are available for download)