package com.bjsxt.ly;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
public class HelloSpider {

    public static void main(String[] args) throws Exception {
        // Build the output path: <project dir>/src/postcode.xml
        String path = System.getProperty("user.dir") + File.separator + "src" + File.separator + "postcode.xml";
        // Postal code to look up
        String postcode = "100088";
        // Crawl the page for this postal code
        CharSequence charSequence = webSpider("http://tool.cncn.com/youbian/" + postcode);
        // Match the crawled HTML with a regular expression to extract the text before each </li>
        List<String> list = regexpPostcode("([\\u4e00-\\u9fa5\\w\\(\\)58-]+)(?=</li>)", 0, charSequence);
        // Store the crawled data in an XML file
        createXml(postcode, list, path);
    }
    /**
     * Create an XML document from the crawled addresses
     * @param postcode
     * @param list
     * @param path
     * @throws IOException
     * @throws FileNotFoundException
     * @throws UnsupportedEncodingException
     */
    private static void createXml(String postcode, List<String> list, String path)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        // Create the root node
        Element rootElement = DocumentHelper.createElement("postcodes");
        // Create the child node for this postal code
        Element postcodeElement = DocumentHelper.createElement("postcode");
        postcodeElement.addAttribute("code", postcode);
        // Create one node per crawled address
        for (String address : list) {
            Element addressElement = DocumentHelper.createElement("address");
            addressElement.setText(address);
            // Attach the address node
            postcodeElement.add(addressElement);
        }
        // Attach the postcode node to the root node
        rootElement.add(postcodeElement);
        // Create the document object model
        Document document = DocumentHelper.createDocument(rootElement);
        // Write the document with pretty printing, then close the stream
        XMLWriter writer = new XMLWriter(new FileOutputStream(path), OutputFormat.createPrettyPrint());
        writer.write(document);
        writer.close();
    }
    /**
     * Extract the addresses for the postal code with a regular expression
     * @param regex
     * @param flags
     * @param charSequence
     * @return
     */
    private static List<String> regexpPostcode(String regex, int flags, CharSequence charSequence) {
        // Container for the matched addresses
        List<String> list = new ArrayList<>();
        // Compile the pattern
        Pattern pattern = Pattern.compile(regex, flags);
        // Get the matcher
        Matcher matcher = pattern.matcher(charSequence);
        // Collect every match
        while (matcher.find()) {
            list.add(matcher.group());
        }
        // Return the results
        return list;
    }
    /**
     * Web crawler
     * @param spec
     * @return
     * @throws IOException
     */
    private static CharSequence webSpider(String spec) throws IOException {
        // Build the URL
        URL url = new URL(spec);
        // Open the connection
        URLConnection connection = url.openConnection();
        // Pretend to be a browser
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
        // Get the input stream
        InputStream inputStream = connection.getInputStream();
        // Decode the GBK-encoded page
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "GBK"));
        // Read the page line by line and concatenate it
        StringBuffer buffer = new StringBuffer();
        String line = "";
        while ((line = reader.readLine()) != null) {
            buffer.append(line);
        }
        reader.close();
        // Return the crawled content
        return buffer;
    }
}
The crawler fetches the addresses that belong to a postal code and stores them in an XML file that serves as a simple database.
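Since the XML file is meant to act as a small database, it helps to see how the stored data can be read back. The sketch below parses the generated file with dom4j's SAXReader and prints the postal code and its addresses; the class name ReadPostcodeXml and the relative path "src/postcode.xml" are assumptions for illustration, chosen to match the path built in main.

import java.io.File;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class ReadPostcodeXml {
    public static void main(String[] args) throws DocumentException {
        // Parse the XML file written by HelloSpider (path assumed for illustration)
        Document document = new SAXReader().read(new File("src/postcode.xml"));
        // The root is <postcodes>, holding one <postcode code="..."> node
        Element postcodeElement = document.getRootElement().element("postcode");
        System.out.println("postcode: " + postcodeElement.attributeValue("code"));
        // dom4j 1.x returns a raw List here, so each entry is cast to Element explicitly
        for (Object node : postcodeElement.elements("address")) {
            System.out.println("address: " + ((Element) node).getText());
        }
    }
}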