// The following program requires htmlparser.jar. You can download it directly from
// http://umn.dl.sourceforge.net/sourceforge/htmlparser/htmlparser1_5_20040728.zip
// The home page of htmlparser is http://htmlparser.sourceforge.net
// Copy from here.
/*******************************************************************************
 * $Header$
 * $Revision$
 * $Date$
 *
 *==============================================================================
 *
 * Copyright (c) 2001-2004 XXX Technologies, Ltd.
 * All rights reserved.
 *
 * Created on 2004-12-3
 *******************************************************************************/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.Div;
import org.htmlparser.util.ParserException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
*
* @ Author Fei (mailto: mr_yanfei & Yahoo.com)
*/
/*
* Modification history
* $ Log $
*/
Public final class blogbackuptool {
Private Static final string rss_url = "http://blog.csdn.net/mr_yanfei/Rss.aspx ";
Private Static final string save_path = "D: // Temp ";
Private Static final string channel = "channel ";
Private Static final string channel_item = "item ";
Private Static final string item_title = "title ";
Private Static final string item_link = "Link ";
Private static final boolean FILTER = true;
Class Blog {
Private String fTitle;
Private String fLink;
Public Blog (String title, String link ){
FTitle = title;
Flink = link;
}
Public String gettitle (){
Return ftitle;
}
Public String getLink (){
Return fLink;
}
}
Private Blog [] getBlogs (String rssUrl ){
DocumentBuilderFactory factory =
DocumentBuilderFactory. newInstance ();
Factory. setNamespaceAware (true );
List result = new ArrayList ();
Try {
URL url = new URL (rssUrl );
DocumentBuilder builder = factory. newDocumentBuilder ();
Document document = builder. parse (url. openStream ());
Element channel = document. getDocumentElement ();
Channel = (Element) document. getElementsByTagName (CHANNEL). item (0 );
If (CHANNEL. equals (channel. getLocalName ())){
NodeList nodes = channel. getChildNodes ();
For (int I = 0; I <nodes. getLength (); I ++ ){
Org. w3c. dom. Node item = nodes. item (I );
If (CHANNEL_ITEM.equals (item. getLocalName ())){
String title = getChildNodeText (item, ITEM_TITLE );
String link = getChildNodeText (item, ITEM_LINK );
Result. add (new Blog (title, link ));
}
}
}
} Catch (Exception ex ){
Ex. printStackTrace ();
}
Return (Blog []) result. toArray (new Blog [result. size ()]);
}
Private String getChildNodeText (org. w3c. dom. Node item, String nodeName ){
NodeList nodes = item. getChildNodes ();
For (int I = 0; I <nodes. getLength (); I ++ ){
Org. w3c. dom. Node node = nodes. item (I );
If (nodeName. equals (node. getLocalName ())){
Return node. getFirstChild (). getNodeValue ();
}
}
Return null;
}
Private String validFilename (String name ){
String result = name. replace (':','_');
Result = result. replace ('/','_');
Result = result. replace ('//','_');
Result = result. replace ('? ','? ');
Result = result. replace ('*','_');
Result = result. replace ('<','_');
Result = result. Replace ('> ','_');
Result = result. Replace ('| ','_');
Result = result. Replace ('"','_');
Return result;
}
Private void saveblogs (blog [] blogs) throws exception {
String title, link;
For (int I = 0; I <blogs. length; I ++ ){
Title = blogs [I]. getTitle ();
Link = blogs [I]. getLink ();
System. out. println ("Get Blog" + title );
System. out. println ("URL:" + link );
If (FILTER ){
Parser parser = null;
Try {
Parser = new Parser (Link );
} Catch (parserexception ex ){
Continue;
}
Page page = parser. getlexer (). getpage ();
String pageurl = page. geturl ();
Node [] bases = parser. extractallnodesthatare (Div. Class );
For (Int J = 0; j <bases. length; j ++ ){
String ATTR = (DIV) bases [J]). getattribute ("class ");
If (ATTR = NULL)
ATTR = "";
If (attr. equals ("post ")){
String content = (Div) bases [j]). getChildrenHTML ();
SaveBlogToFile (title + ". html", content );
Break;
}
}
Parser. reset ();
}
Else {
StringBuffer buffer = getHtmlFromURL (link );
SaveBlogToFile (title + ". html", buffer. toString ());
}
}
}
Private StringBuffer getHtmlFromURL (String url ){
StringBuffer buffer = new StringBuffer ();
Try {
URL pageUrl = new URL (url );
BufferedReader in = new BufferedReader (new InputStreamReader (pageUrl. openStream ()));
String str;
While (str = in. readLine ())! = Null ){
Buffer. append (str );
}
In. close ();
} Catch (MalformedURLException e ){
E. printStackTrace ();
} Catch (IOException e ){
E. printStackTrace ();
}
Return buffer;
}
Private void saveblogtofile (string filename, string content ){
Try {
Filename = validfilename (filename );
File file = new file (save_path, filename );
Outputstream out = new fileoutputstream (File );
OutputStreamWriter writer = new OutputStreamWriter (out );
Writer. write (content );
Writer. close ();
} Catch (IOException ex ){
}
}
Public static void main (string [] ARGs) throws exception {
Blogbackuptool reader = new blogbackuptool ();
Blog [] blogs = reader. getblogs (rss_url );
Reader. saveblogs (blogs );
String MSG = messageformat. Format ("totle {0} blogs saved.", new string [] {INTEGER. tostring (blogs. Length )});
System. Out. println (MSG );
}
}
// End