After learning Jsoup, I found that some dynamically generated web content cannot be crawled with it, so I went on to learn HtmlUnit. Below are two examples that capture music links from Kugou ("Cool Dog") Music and QQ Music:
Kugou (Cool Dog) Music:
Import Java.io.bufferedinputstream;import java.io.fileoutputstream;import java.io.inputstream;import Java.net.URL; Import Java.net.urlencoder;import java.util.uuid;import java.util.regex.matcher;import Java.util.regex.Pattern; Import Org.jsoup.nodes.element;import Com.alibaba.fastjson.jsonarray;import com.alibaba.fastjson.JSONObject; Import Com.gargoylesoftware.htmlunit.browserversion;import Com.gargoylesoftware.htmlunit.nicelyresynchronizingajaxcontroller;import Com.gargoylesoftware.htmlunit.Page; Import Com.gargoylesoftware.htmlunit.webclient;public class Worm7 {private static String name= "Li Sao"; public static WebClient Getwebclient (Boolean flag) {WebClient WebClient = new WebClient (browserversion.firefox_45); Webclient.getoptions (). Setuseinsecuressl (True); Webclient.getoptions (). setcssenabled (false); Webclient.getoptions (). Setthrowexceptiononfailingstatuscode (false); Webclient.getoptions (). Setthrowexceptiononscripterror (false); Webclient.getoPtions (). Setredirectenabled (True); Webclient.getoptions (). setappletenabled (false); Webclient.getoptions (). setjavascriptenabled (flag); Webclient.getoptions (). SetTimeout (60000); Webclient.getoptions (). Setprintcontentonfailingstatuscode (false); Webclient.setajaxcontroller (New Nicelyresynchronizingajaxcontroller ()); return webClient; public static String Getmp3url (WebClient WebClient) {FileOutputStream outputstream = null; InputStream inputstream = null; Bufferedinputstream bis = null; try {page page=webclient.getpage ("Http://songsearch.kugou.com/song_search_v2?") + "callback=jquery112408395432201569397_1532930925600" + "&keyword=" +urlencoder.encode (name, "Utf-8") + "& Page=1 "+" &pagesize=30 "+" &userid=-1 "+" &clientver= "+" &platform=webfilter "+" &tag=em "+" & filter=2 "+" &iscorrection=1 "+" &privilege_filter=0 "+" &_= "+system.currenttimemillis ());// System.out.println (PagE.getwebresponse (). getcontentasstring ()),//system.out.println (Zzee (Page.getwebresponse (). 
getContentAsString (), "(? <=\\ (\\{). *? =\\}\\))")); Jsonobject Job=jsonobject.parseobject ("{" +zzee (Page.getwebresponse (). getcontentasstring (), "(? <=\\ (\\{). *? =\\}\\)) (") +"} "). Getjsonobject (" Data "); System.out.println ("Job:" +job); Jsonarray List=job.getjsonarray ("lists"); SYSTEM.OUT.PRINTLN ("list" +list); for (int i=0;i<list.size (); i++) {String id1=list.getjsonobject (i). GetString (" Filehash "); String Id2=list.getjsonobject (i). getString ("albumID"); String detailurl= "Http://www.kugou.com/yy/index.php?r=play/getdata" + "&hash=" +id1+ "&album_id=" +id2+ " &_= "+system.currenttimemillis (); Page page2=webclient.getpage (Detailurl); Jsonobject Job2=jsonobject.parseobject (Page2.getwebresponse (). getcontentasstring ()). GetJSONObject ("Data"); System.out.println ("title:" +job2.getstring ("Audio_name"));//system.out.println ("Lyrics:" +job2.getstring ("lyrics")); System.out.println ("MP3:" +job2.getstring ("Play_url")); String outimage = job2.getstring ("audio_name") + ". mp3"; URL imgurl = new URL (job2.getstring ("Play_url"));//Get input stream inputstream = Imgurl.openconnection (). getinputstre AM (); The input stream information is put into the buffer stream to improve read and write speed bis = new Bufferedinputstream (InputStream); Read bytes Lou byte[] buf = new byte[1024]; Generate File OutputStream = new FileOutputStream ("f://" + outimage); int size = 0; Read-side write while (size = Bis.read (BUF))! =-1) {outputstream.write (buf, 0, size); }//Refresh file stream Outputstream.flush (); }} catch (Exception e) {e.printstacktrace ();} return name; } private static string Zzee (String str, String zz) {string list = null; Pattern p = pattern.compile (ZZ); Matcher m = p.matcher (str); while (M.find ()) {list = M.group ();} return list; }public static void Main (StriNg[] args) {WebClient webclient=getwebclient (false); Getmp3url (WebClient);}}
Operation Result:
QQ Music Capture Example:
Import Java.io.bufferedinputstream;import Java.io.fileoutputstream;import Java.io.ioexception;import Java.io.inputstream;import Java.net.malformedurlexception;import Java.net.url;import Java.net.URLEncoder;import Java.util.uuid;import Java.util.regex.matcher;import Java.util.regex.pattern;import org.jsoup.nodes.Element; Import Com.alibaba.fastjson.json;import Com.alibaba.fastjson.jsonarray;import com.alibaba.fastjson.JSONObject; Import Com.gargoylesoftware.htmlunit.browserversion;import Com.gargoylesoftware.htmlunit.nicelyresynchronizingajaxcontroller;import Com.gargoylesoftware.htmlunit.Page; Import Com.gargoylesoftware.htmlunit.webclient;public class Worm6 {private static string Name= "Li Sao"; static string Id1=nu ll Static String Id2=null; Static String Id3=null; Static String Id4=null; Static String Name1=null; Static String Name2=null; static String URL = null; Static Jsonobject Job2=null; public static WebClient Getwebclient (Boolean flag) {WebClient WebClient = new WebClient (BROWSERVERSION.FIREFOX_45); Webclient.getoptions (). Setuseinsecuressl (True); Webclient.getoptions (). setcssenabled (false); Webclient.getoptions (). Setthrowexceptiononfailingstatuscode (false); Webclient.getoptions (). Setthrowexceptiononscripterror (false); Webclient.getoptions (). Setredirectenabled (True); Webclient.getoptions (). setappletenabled (false); Webclient.getoptions (). setjavascriptenabled (flag); Webclient.getoptions (). SetTimeout (60000); Webclient.getoptions (). Setprintcontentonfailingstatuscode (false); Webclient.setajaxcontroller (New Nicelyresynchronizingajaxcontroller ()); return webClient; } public static String Getmp3url (WebClient WebClient) {try {page page=webclient.getpage ("Https://c.y.qq.com/so So/fcgi-bin/client_search_cp? 
" + "ct=24" + "&qqmusic_ver=1298" + "&new_json=1" + "&remoteplace=txt.yqq.center" + "&searchid= 36047978388657978 "+" &t=0 "+" &aggr=1 "+" & Cr=1 "+" &catzhida=1 "+" &lossless=0 "+" &p=1 "+" &n=20 "+" &w= "+urlencoder.encode (name," Utf-8 ") +" &g_tk=5381 "+" &jsonpcallback=musicjsoncallback6176591962889693 "+" &loginuin=0 "+" &hostUin=0 "+" &format=jsonp "+" &incharset=utf8 "+" &outcharset=utf-8 "+" ¬ice=0 "+" &PLATFORM=YQQ "+" & Neednewcode=0 ");//system.out.println (" page: "+page);//system.out.println ("------"+page.getwebresponse (). Getcontentasstring ());//system.out.println ("======" +zzee (Page.getwebresponse (). getcontentasstring (), "(? <=\\ (\\{).*? (?=\\}\\))")); Jsonobject Job=jsonobject.parseobject ("{" +zzee (Page.getwebresponse (). getcontentasstring (), "(? <=\\ (\\{). *? =\\}\\)) (") +"} "). Getjsonobject (" Data "),//system.out.println (" job: "+job); String job0=job.getstring ("song");//system.out.println ("job0" +job0); Job=json.parseobject (job0); Jsonarray List=job.getjsonarray ("list");//system.out.println ("list:" +list); for (int i=0;i<list.size (); i++) {ID1 =list.getjsonobject (i). GetString ("Mid");//system.out.println ("Id1" +id1); Id2=list.getjsonobject (i). getString ("file");//system.out.println ("id" +id2); Id2= "C400" +jsonobject.parseobject (ID2). getString ("Media_mid") + ". m4a";//SYSTEM.OUT.PRINTLN ("id" +id2); name1= List.getjsonobject (i). GetString ("title"), Name2=list.getjsonobject (i). GetString ("singer");//system.out.println ( NAME2); Jsonarray Name=json.parsearray (name2);//system.out.println ("JOB4:" +name); name2=name.getjsonobject (0). getString ( "Name");//system.out.println (Name.getjsonobject (0). getString ("name"));/*string detailurl= "Https://c.y.qq.com/v8 /FCG-BIN/FCG_PLAY_SINGLE_SONG.FCG? 
" + "songmid=" +id1+ "&tpl=yqq_song_detail&format=jsonp&callback=getonesonginfocallback&g_tk=5381 &jsonpcallback=getonesonginfocallback&loginuin=0&hostuin=0&format=jsonp&incharset=utf8 &outcharset=utf-8¬ice=0&platform=yqq&neednewcode=0 "; Page page2=webclient.getpage (Detailurl);//system.out.println (Page2); String b= "{" +zzee (Page2.getwebresponse (). getcontentasstrING (), "(? <=\\ (\\{). =\\}\\) ") +"} ";//system.out.println (" B "+b); Jsonobject Job1=jsonobject.parseobject ("{" +zzee (Page2.getwebresponse (). getcontentasstring (), "(? <=\\ (\\{). *? =\\}\\)) (") +"} "). Getjsonobject (" url "); System.out.println ("JOB1:" +job1); String job2=job1.getstring (ID2); System.out.println ("Job2" +job2); */string url1= "Https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?g _tk=5381&jsonpcallback=musicjsoncallback32651599216689386&loginuin=0&hostuin=0&format=json &incharset=utf8&outcharset=utf-8¬ice=0&platform=yqq&neednewcode=0&cid=205361747 &callback=musicjsoncallback32651599216689386&uin=0 "+" &songmid= "+id1+" &filename= "+id2+" & guid=2241489759 ";; Page Page2=webclient.getpage (URL1);//system.out.println ("Page2" +page2); Jsonobject Job2=jsonobject.parseobject ("{" +zzee (Page2.getwebresponse (). getcontentasstring (), "(? <=\\ (\\{). *? =\\}\\) (") +"} "). Getjsonobject (" Data "),//system.out.println (" title: "+job2.getstring (" items")); String job3=job2.getstring ("items"); Jsonarray Job4=json.parsearray (JOB3);//system.out.println ("JOB4:" +JOB4);//system.out.println (Job4.getjsonobject (0). getString ("Vkey")), url = "http://dl.stream.qqmusic.qq.com/" +id2+ "? vkey=" +job4.getjsonobject (0). 
getString (" Vkey ") +" &guid=2241489759&uin=0&fromtag=66 "; System.out.println ("Name:" +name1+ "--" +name2); System.out.println ("URL:" +url);d ownload ();}} catch (Exception e) {e.printstacktrace ();} return name; } private static string Zzee (String str, String zz) {string list = null; Pattern p = pattern.compile (ZZ); Matcher m = p.matcher (str); while (M.find ()) {list = M.group ();} return list; } private static void Download () throws ioexception{FileOutputStream outputstream = null; InputStream inputstream = null; Bufferedinputstream bis = null; String outimage = name1+ "--" +name2+ ". mp3"; URL imgurl = new URL (URL);//Get input stream inputstream = Imgurl.openconnection (). getinputstrEAM (); The input stream information is put into the buffer stream to improve read and write speed bis = new Bufferedinputstream (InputStream); Read bytes Lou byte[] buf = new byte[1024]; Generate File OutputStream = new FileOutputStream ("f://" + outimage); int size = 0; Read-side write while (size = Bis.read (BUF))! =-1) {outputstream.write (buf, 0, size); }//Refresh file stream Outputstream.flush (); }public static void Main (string[] args) {WebClient webclient=getwebclient (false); Getmp3url (WebClient);}}
Operation Result:
By comparison, Kugou Music is relatively easy to crawl, while QQ Music is somewhat more cumbersome...
HtmlUnit + fastjson: capturing Kugou Music and QQ Music links and downloading the songs