上次学了 Jsoup 之后,发现一些由 JavaScript 动态生成的网页内容无法抓取,于是又学习了 HtmlUnit。下面是抓取酷狗音乐与QQ音乐链接的例子:
酷狗音乐:
import java.io.BufferedInputStream;import java.io.FileOutputStream;import java.io.InputStream;import java.net.URL;import java.net.URLEncoder;import java.util.UUID;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.nodes.Element;import com.alibaba.fastjson.JSONArray;import com.alibaba.fastjson.JSONObject;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.Page;import com.gargoylesoftware.htmlunit.WebClient;public class worm7 { private static String name="离骚"; public static WebClient getWebClient(boolean flag){ WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); webClient.getOptions().setUseInsecureSSL(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setAppletEnabled(false); webClient.getOptions().setJavaScriptEnabled(flag); webClient.getOptions().setTimeout(60000); webClient.getOptions().setPrintContentOnFailingStatusCode(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); return webClient; } public static String getMp3Url(WebClient webClient){ FileOutputStream outputStream = null; InputStream inputStream = null; BufferedInputStream bis = null; try { Page page=webClient.getPage("http://songsearch.kugou.com/song_search_v2?" 
+ "callback=jQuery112408395432201569397_1532930925600" + "&keyword="+URLEncoder.encode(name, "utf-8") + "&page=1" + "&pagesize=30" + "&userid=-1" + "&clientver=" + "&platform=WebFilter" + "&tag=em" + "&filter=2" + "&iscorrection=1" + "&privilege_filter=0" + "&_="+System.currentTimeMillis()); //System.out.println(page.getWebResponse().getContentAsString()); //System.out.println(zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")); JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("data"); System.out.println("job:"+job); JSONArray list=job.getJSONArray("lists"); System.out.println("list"+list); for(int i=0;i
运行结果:
QQ音乐抓取实例:
import java.io.BufferedInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.net.URLEncoder;import java.util.UUID;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.nodes.Element;import com.alibaba.fastjson.JSON;import com.alibaba.fastjson.JSONArray;import com.alibaba.fastjson.JSONObject;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.Page;import com.gargoylesoftware.htmlunit.WebClient;public class worm6 { private static String name="离骚"; static String id1=null; static String id2=null; static String id3=null; static String id4=null; static String name1=null; static String name2=null; static String url = null; static JSONObject job2=null; public static WebClient getWebClient(boolean flag){ WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); webClient.getOptions().setUseInsecureSSL(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setAppletEnabled(false); webClient.getOptions().setJavaScriptEnabled(flag); webClient.getOptions().setTimeout(60000); webClient.getOptions().setPrintContentOnFailingStatusCode(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); return webClient; } public static String getMp3Url(WebClient webClient){ try { Page page=webClient.getPage("https://c.y.qq.com/soso/fcgi-bin/client_search_cp?" 
+ "ct=24" + "&qqmusic_ver=1298" + "&new_json=1" + "&remoteplace=txt.yqq.center" + "&searchid=36047978388657978" + "&t=0" + "&aggr=1" + "&cr=1" + "&catZhida=1" + "&lossless=0" + "&p=1" + "&n=20" + "&w="+URLEncoder.encode(name, "utf-8") + "&g_tk=5381" + "&jsonpCallback=MusicJsonCallback6176591962889693" + "&loginUin=0" + "&hostUin=0" + "&format=jsonp" + "&inCharset=utf8" + "&outCharset=utf-8" + "&notice=0" + "&platform=yqq" + "&needNewCode=0" ); //System.out.println("page:"+page); //System.out.println("------"+page.getWebResponse().getContentAsString()); //System.out.println("======"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")); JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("data"); //System.out.println("job:"+job); String job0=job.getString("song"); //System.out.println("job0"+job0); job=JSON.parseObject(job0); JSONArray list=job.getJSONArray("list"); //System.out.println("list:"+list); for(int i=0;i
运行结果:
相比之下,酷狗音乐相对好爬一些,QQ音乐则有些繁琐……