Webcollector-based full-site cloning tool

Source: Internet
Author: User
Tags file url

Cloning an entire site is an interesting problem, and several conditions must be met for it to work.

You need to ensure that the files are statically accessible, which requires that the paths in the HTML file are relative paths.

This involves a link-rewriting pass over each HTML file: if a link is not rewritten to a relative path, the page will still reference the live site instead of loading the resource from the local copy.

A big pitfall: if redirection is not disabled, an infinite loop can occur.
Suppose you visit page A, which links to B, and B redirects back to A — the crawler then produces paths like a/a/a/a/a...
The simplest solution is to disable redirects entirely; detecting the cycle programmatically is more work, since you would have to stop whenever a path loop is found.
The root cause is that after a redirect, the URL the crawler records no longer matches the URL that was actually fetched.

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>wyf</groupId>
    <artifactId>clonesite</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
import cn.edu.hfut.dmic.webcollector.conf.Configuration;import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;import cn.edu.hfut.dmic.webcollector.model.Page;import Cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import Java.io.BufferedWriter;import java.io.IOException;import Java.io.OutputStream;import Java.nio.charset.Charset;import java.nio.file.Files;import Java.nio.file.Path;import java.nio.file.Paths; Public classMainextendsBreadthcrawler {StaticString seed ="Http://www.xqbase.com/computer.htm";//html Page PrefixesStaticString prefix ="Http://www.xqbase.com/computer";StaticPath TargetFolder = Paths.Get("haha").Toabsolutepath();intMaxredirect =0; Public Main(String Crawlpath,BooleanAutoparse) {Super(Crawlpath, Autoparse); This.getconf().Setmaxredirect(Maxredirect); This.Addseed(seed);}Boolean Isinvalidpathchar(CharVAR0) {returnVar0 < ' | |"<>:\"|? *".indexOf(VAR0)! =-1;}Boolean Isinvalidpath(String Path) { for(inti =0; I < path.length(); i++)if(Isinvalidpathchar(Path.charAt(i)))return true;return false;}/*** Convert URL to local path for saving Web page content to local * * @param url: absolute path URL * @param type: File type, used to determine the name of the suffix to be saved */PathUrl2path(string URL, String type) {intBeg = URL.indexOf(":") +3; String path = URL.substring(beg);//If the file name contains illegal characters, then use Hashcode    if(Isinvalidpath(path)) {Path = path.hashcode() +""; }if(Type! =NULL&&!path.EndsWith("."+ type) {path + = '. 
' + type; }returnTargetFolder.Resolve(path);}/*** Now indicates that the current HTML page Url,resource represents the resource file URL and returns the relative position of the two* ResourceType Indicates whether to force ResourceUrl to change */Stringpath2relative(String Htmlurl, String ResourceUrl, String resourcetype) {return Url2path(Htmlurl,"HTML").getParent().relativize(Url2path(ResourceUrl, ResourceType)).toString().Replace('\\', '/');}/*** Recursive creation of directories for creating files */void mkdir(Path p) {p = p.Toabsolutepath();if(Files.exists(p))return;if(Files.notexists(p.getParent()))mkdir(p.getParent());Try{Files.CreateDirectory(p); }Catch(IOException e) {e.Printstacktrace(); }}/*** Save text file */void WriteFile(path path, String content, Charset encoding) {mkdir(Path.getParent());Try(BufferedWriter cout = Files.)Newbufferedwriter(path, encoding)) {cout.Write(content); }Catch(Exception e) {e.Printstacktrace(); }}/*** Save a binary file */void WriteFile(Path Path,byte[] data) {mkdir(Path.getParent());Try(OutputStream cout = Files.)Newoutputstream(path)) {cout.Write(data); }Catch(Exception e) {e.Printstacktrace(); }}void src(Page page, Crawldatums crawldatums, Document Doc) {String src[] =Newstring[]{"Script","SVG","img"}; for(intIND =0; IND < SRC.length; ind++) {String j = Src[ind]; for(Element I:doc.Select(j)) {if(i.hasattr("src") ==false)Continue; String s = i.Absurl("src");if(S.Trim().length() ==0)Continue; I.attr("src",path2relative(page.URL(), S,NULL)); Crawldatum next =New crawldatum(S,"Binary"); Crawldatums.Add(next); }    }}void Hrefofresource(Page page, Crawldatums crawldatums, Document Doc) {String href[] =Newstring[]{"link"}; for(intIND =0; IND < href.length; ind++) {String j = Href[ind]; for(Element I:doc.Select(j)) {if(i.hasattr("href") ==false)Continue; String s = i.Absurl("href");if(S.Trim().length() ==0)Continue; I.attr("href",path2relative(page.URL(), S,NULL)); Crawldatum next =New crawldatum(S,"Binary"); 
Crawldatums.Add(next); }    }}void hrefofhtml(Page page, Crawldatums crawldatums, Document Doc) { for(Element I:doc.Select("a")) {if(i.hasattr("href") {String s = i.Absurl("href");if(S.Trim().length() ==0)Continue; I.attr("href",path2relative(page.URL(), S,"HTML"));if(S.StartsWith(prefix)) {crawldatums.Add(s); }        }    }}@Override Public void Visit(Page page, crawldatums crawldatums) {if(page.MatchType("Binary")) {WriteFile(Url2path(page.URL(),NULL), page.content()); }Else{Document doc = page.Doc();src(page, crawldatums, doc);Hrefofresource(page, crawldatums, doc);hrefofhtml(page, crawldatums, doc);WriteFile(Url2path(page.URL(),"HTML"), Doc.HTML(), Doc.CharSet()); }} Public Static void Main(string[] args)throwsException {//autoparse Indicates whether the engine is allowed to control URL parsingMain blog =New Main("Webcollector",false); Configuration conf = blog.getconf(); Conf.Setconnecttimeout( the); Blog.Start(Integer.Max_value);}}

Webcollector-based full-site cloning tool

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.