Analysis
Open the page http://www.coobobo.com/free-http-proxy/. Something looks off about the port numbers at first glance, so, as usual, press Ctrl+Shift+C and select one to inspect it:
Unfortunately, the port numbers are rendered as images:
But that doesn't matter: these images are clean and free of noise, so recognizing the characters in them is very easy.
Next, select the IP address to inspect it:
The IP address may be written into the page by JavaScript. To be sure, we have to look at the raw HTML that the server returned, so view the page source and locate this IP:
It seems the IP address can only be extracted from this piece of JavaScript. That is not difficult: we just need to strip the quotation marks, plus signs, parentheses, document.write, and whitespace characters, which a single regular expression can do.
Code implementation
Recognizing the port images from scratch would be troublesome, but I previously wrote a small utility library for exactly this kind of simple character recognition. It saves some effort, so that library is used here.
The recognition works by first collecting some images and labeling which character each one represents; every new image is then recognized by comparing it against these labeled samples. So we first need to collect some images to label:
/** * Collects character pictures that need to be labeled */public static void Grabtrainimage (String basepath) {for (int i = 1; i <=; i++) {Sys Tem.out.println ("page" + i); Document document = GetDocument (URL + i); Elements images = Document.select ("table.table-condensed tbody tr img"); Images.foreach (ELT, {String Imglink = host + elt.attr ("src"); byte[] imgbytes = Download (imglink); try {String OutputPath = BasePath + system.currenttimemillis () + ". png"; BufferedImage img = imageio.read (new Bytearrayinputstream (imgbytes)); Imageio.write (IMG, "PNG", New File (OutputPath)); System.out.println (Imglink); } catch (IOException e) {e.printstacktrace (); } }); }}
Download the images to a local directory and generate the character images that need to be labeled:
/**
 * One-off preparation step: downloads the raw port images and lets the OCR
 * utility split them into individual character images ready for labeling.
 */
public static void main(String[] args) throws IOException {
    String rawImageSaveDir = "e:/test/proxy/kubobo/raw/";
    String distinctCharSaveDir = "e:/test/proxy/kubobo/char/";
    grabTrainImage(rawImageSaveDir);
    // NOTE(review): init is assumed to be callable on the shared ocrUtil instance - confirm against the OcrUtil library.
    ocrUtil.init(rawImageSaveDir, distinctCharSaveDir);
}
Then open e:/test/proxy/kubobo/char/. Every distinct character used in the previously downloaded images has been split out and placed in this directory:
Now rename each file to the character that its image represents:
Be careful not to label anything incorrectly, or all later recognition results will be wrong.
Then tell OcrUtil where this directory is so that it knows where to load the labeled characters from:
// Point the OCR utility at the directory holding the labeled character images.
ocrUtil.loadDictionaryMap("e:/test/proxy/kubobo/char/");
Then it is ready to use: just pass an image to ocrUtil.ocr(BufferedImage) and it returns the characters shown in the image. The complete code is as follows:
Package Org.cc11001100.t1;import Cc11001100.ocr.ocrutil;import Org.apache.commons.lang3.stringutils;import Org.jsoup.jsoup;import Org.jsoup.nodes.document;import Org.jsoup.select.elements;import Javax.imageio.ImageIO; Import Java.awt.image.bufferedimage;import Java.io.bytearrayinputstream;import Java.io.file;import Java.io.ioexception;import Java.nio.charset.standardcharsets;import Java.util.arraylist;import java.util.List; Import Static java.util.stream.collectors.tolist;/** * @author CC11001100 */public class Kuboboproxygrab {private stat IC String host = "http://www.coobobo.com"; private static String URL = "http://www.coobobo.com/free-http-proxy/"; private static Ocrutil Ocrutil; static {Ocrutil = new ocrutil (); Ocrutil.loaddictionarymap ("e:/test/proxy/kubobo/char/"); }/** * Collects character pictures that need to be labeled */public static void Grabtrainimage (String basepath) {for (int i = 1; I <= 1 0; i++) {System.out.println ("page" + i); DoCument document = getdocument (URL + i); Elements images = Document.select ("table.table-condensed tbody tr img"); Images.foreach (ELT, {String Imglink = host + elt.attr ("src"); byte[] imgbytes = Download (imglink); try {String OutputPath = BasePath + system.currenttimemillis () + ". png"; BufferedImage img = imageio.read (new Bytearrayinputstream (imgbytes)); Imageio.write (IMG, "PNG", New File (OutputPath)); System.out.println (Imglink); } catch (IOException e) {e.printstacktrace (); } }); }} private static Document getdocument (String URL) {byte[] responsebytes = download (URL); String html = new String (responsebytes, standardcharsets.utf_8); return Jsoup.parse (HTML); } private static byte[] Download (String URL) {for (int i = 0; i < 3; I+ +) {try {return Jsoup.connect (URL). Execute (). 
Bodyasbytes (); } catch (IOException e) {e.printstacktrace (); }} return new byte[0]; } public static list<string> Grabproxyiplist () {list<string> resultlist = new arraylist<> (); for (int i = 1; i <=; i++) {System.out.println ("page" + i); Document document = GetDocument (URL + i); Elements ipelts = Document.select ("table.table-condensed tbody tr"); list<string> pageiplist = Ipelts.stream (). Map (ELT, {String rawtext = Elt.select ("td:eq (0) Scrip T "). First (). data (); String IP = rawtext.replaceall ("document.write|[ \ ' \ ' () +]|\\s+ "," "). Trim (); String Imglink = host + Elt.select ("Td:eq (1) img"). attr ("src"); byte[] imgbytes = Download (imglink); try {bufferedimage img = imageio.reAD (new Bytearrayinputstream (imgbytes)); String port = OCRUTIL.OCR (IMG); return IP + ":" + port; } catch (IOException e) {e.printstacktrace (); } return ""; }). filter (Stringutils::isnotempty). Collect (ToList ()); Resultlist.addall (pageiplist); } return resultlist; } public static void Main (string[] args) throws IOException {//String Rawimagesavedir = "e:/test/proxy/kubobo/r aw/";//String Distinctcharsavedir =" e:/test/proxy/kubobo/char/";//Grabtrainimage (RAWIMAGESAVEDIR);// Ocrutil.init (Rawimagesavedir, Distinctcharsavedir); Grabproxyiplist (). ForEach (System.out::p rintln); }}
Kubobo (coobobo.com) real-time free HTTP proxy IP crawling (ports displayed as images + IPs written with document.write)