Cloning an entire site is an interesting problem, because several conditions have to be met.
You need to ensure the saved files are viewable as static files, which requires that every path in the HTML be a relative path.
That in turn means a link-rewriting pass over each HTML file: if a link is not rewritten, the page will not load its resources from the local copy when opened.
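To make the rewriting idea concrete, here is a minimal, self-contained sketch using java.nio. The example.com URLs and the simplified url2path (which just strips the scheme) are illustrative only; the full version, including suffix handling, is path2relative in the code further down.

import java.nio.file.Path;
import java.nio.file.Paths;

public class RelativeLinkDemo {
    // Map a URL to a local file path by stripping the "scheme://" part.
    static Path url2path(String url) {
        return Paths.get("haha").resolve(url.substring(url.indexOf(":") + 3));
    }

    public static void main(String[] args) {
        // Page http://example.com/a/page.html references http://example.com/css/site.css
        Path page = url2path("http://example.com/a/page.html");
        Path res  = url2path("http://example.com/css/site.css");
        // Relativizing from the page's directory gives the href to write back into the HTML.
        String href = page.getParent().relativize(res).toString().replace('\\', '/');
        System.out.println(href); // prints: ../css/site.css
    }
}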
One big pitfall: if redirection is not disabled, you can end up in an infinite loop.
Suppose you visit a page a, a contains a link to b, and b redirects back to a; the crawler then produces a/a/a/a/a ...
In this situation the simplest solution is to disable redirects entirely; detecting the loop instead is more troublesome, since you have to notice the repeating path and stop.
The root cause is that after a redirect, the recorded URL no longer matches the URL that was actually requested, so the computed local paths go wrong.
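The code below takes the simple route and disables redirects via setMaxRedirect(0). For completeness, a rough sketch of the more troublesome detection approach could look like the following; the repetition threshold of 3 is an arbitrary assumption, not something from the original post.

import java.util.HashMap;
import java.util.Map;

public class PathLoopCheck {
    // Heuristic: if the same path segment repeats many times (e.g. /a/a/a/a),
    // the URL almost certainly came from a redirect cycle and should be dropped.
    static boolean looksLikePathLoop(String url) {
        String[] segments = url.replaceFirst("^[a-z]+://", "").split("/");
        Map<String, Integer> counts = new HashMap<>();
        for (String seg : segments) {
            if (seg.isEmpty()) continue;
            // merge() returns the updated count for this segment
            if (counts.merge(seg, 1, Integer::sum) >= 3) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        System.out.println(looksLikePathLoop("http://example.com/a/a/a/page.htm")); // true
        System.out.println(looksLikePathLoop("http://example.com/a/b/page.htm"));   // false
    }
}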
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>wyf</groupId>
    <artifactId>clonesite</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class Main extends BreadthCrawler {

    static String seed = "http://www.xqbase.com/computer.htm";
    // prefix of the HTML pages that should be followed
    static String prefix = "http://www.xqbase.com/computer";
    static Path targetFolder = Paths.get("haha").toAbsolutePath();
    int maxRedirect = 0; // disable redirects to avoid the a/a/a/... loop

    public Main(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.getConf().setMaxRedirect(maxRedirect);
        this.addSeed(seed);
    }

    boolean isInvalidPathChar(char c) {
        return c < ' ' || "<>:\"|?*".indexOf(c) != -1;
    }

    boolean isInvalidPath(String path) {
        for (int i = 0; i < path.length(); i++)
            if (isInvalidPathChar(path.charAt(i)))
                return true;
        return false;
    }

    /**
     * Convert a URL to a local path, for saving web page content locally.
     *
     * @param url  absolute URL
     * @param type file type, used to decide the suffix to save with
     */
    Path url2path(String url, String type) {
        int beg = url.indexOf(":") + 3; // skip the "scheme://" part
        String path = url.substring(beg);
        // if the file name contains illegal characters, use its hashCode instead
        if (isInvalidPath(path)) {
            path = path.hashCode() + "";
        }
        if (type != null && !path.endsWith("." + type)) {
            path += "." + type;
        }
        return targetFolder.resolve(path);
    }

    /**
     * htmlUrl is the URL of the current HTML page, resourceUrl the URL of a
     * resource it references; returns the relative path between the two.
     * resourceType, when non-null, forces the suffix of the resource file.
     */
    String path2relative(String htmlUrl, String resourceUrl, String resourceType) {
        return url2path(htmlUrl, "html").getParent()
                .relativize(url2path(resourceUrl, resourceType))
                .toString().replace('\\', '/');
    }

    /** Recursively create the directories needed for a file. */
    void mkdir(Path p) {
        p = p.toAbsolutePath();
        if (Files.exists(p)) return;
        if (Files.notExists(p.getParent())) mkdir(p.getParent());
        try {
            Files.createDirectory(p);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Save a text file. */
    void writeFile(Path path, String content, Charset encoding) {
        mkdir(path.getParent());
        try (BufferedWriter cout = Files.newBufferedWriter(path, encoding)) {
            cout.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Save a binary file. */
    void writeFile(Path path, byte[] data) {
        mkdir(path.getParent());
        try (OutputStream cout = Files.newOutputStream(path)) {
            cout.write(data);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Rewrite src attributes (script/svg/img) and queue those resources as binary. */
    void src(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] src = new String[]{"script", "svg", "img"};
        for (String tag : src) {
            for (Element i : doc.select(tag)) {
                if (!i.hasAttr("src")) continue;
                String s = i.absUrl("src");
                if (s.trim().length() == 0) continue;
                i.attr("src", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    /** Rewrite href attributes of resource tags (link) and queue those resources as binary. */
    void hrefOfResource(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] href = new String[]{"link"};
        for (String tag : href) {
            for (Element i : doc.select(tag)) {
                if (!i.hasAttr("href")) continue;
                String s = i.absUrl("href");
                if (s.trim().length() == 0) continue;
                i.attr("href", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    /** Rewrite hrefs of <a> tags; only URLs under the prefix are crawled further. */
    void hrefOfHtml(Page page, CrawlDatums crawlDatums, Document doc) {
        for (Element i : doc.select("a")) {
            if (i.hasAttr("href")) {
                String s = i.absUrl("href");
                if (s.trim().length() == 0) continue;
                i.attr("href", path2relative(page.url(), s, "html"));
                if (s.startsWith(prefix)) {
                    crawlDatums.add(s);
                }
            }
        }
    }

    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.matchType("binary")) {
            writeFile(url2path(page.url(), null), page.content());
        } else {
            Document doc = page.doc();
            src(page, crawlDatums, doc);
            hrefOfResource(page, crawlDatums, doc);
            hrefOfHtml(page, crawlDatums, doc);
            writeFile(url2path(page.url(), "html"), doc.html(), doc.charset());
        }
    }

    public static void main(String[] args) throws Exception {
        // autoParse indicates whether the engine is allowed to control URL parsing
        Main blog = new Main("webcollector", false);
        Configuration conf = blog.getConf();
        conf.setConnectTimeout(3000); // original value garbled in the source; 3000 ms is a plausible placeholder
        blog.start(Integer.MAX_VALUE);
    }
}
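When run as-is, the crawler starts from the computer.htm seed, only follows <a> links that start with the prefix, and mirrors everything under the haha directory: a page's local path is its URL with the scheme stripped, plus an .html suffix appended when missing (so the seed ends up at haha/www.xqbase.com/computer.htm.html). Since all links are rewritten to relative paths before saving, the clone can be browsed offline by opening that file directly in a browser.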
A WebCollector-based whole-site cloning tool