package com.letv.cloud.spider;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class Moviepaperpageprocessor implements Pageprocessor {
Private Site page = Site.me (). Setretrytimes (3). Setsleeptime (1000);
Public Site Getsite () {
Return page;http://www.huiyi8.com/moban/
public void Process (Page page) {page template
List<string> links = page.gethtml (). Links (). Regex (
"Http://posters.imdb.cn/poster/\\d+"). All ();
Links = removeduplicate (links);
Page.addtargetrequests (links);
Page.putfield ("title", Page.gethtml (). XPath (
"//div[@id = ' IMDBLEFTSECC ']/center/h1/text ()"). ToString ());
Page.putfield ("Imgurl", page.gethtml (). XPath (
"//div[@id = ' IMDBLEFTSECC ']/center/img/@src"). toString ());
public static void Main (string[] args) {for (int i = 1; I <= 3; i++) {
Spider.create (New Moviepaperpageprocessor ()). Addurl (
"http://posters.imdb.cn/poster_page/" + i). Thread (5). Run ();
public static list removeduplicate (List list) {
HashSet hs = new HashSet (list);
List.clear ();
List.addall (HS);
return list;