用 Java 爬小姐姐圖片,這個厲害了。。。

源?/?? ? ? ??文/?
目的
爬取搜狗圖片上千張美女圖片并下載到本地
準備工作
爬取地址:https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3

分析
打開上面的地址,按F12打開開發者工具 - Network - XHR - 頁面往下滑動,XHR欄出現請求信息如下:
Request URL :https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3
分析這段請求URL的主要幾個參數:
start=48 表示從第48張圖片開始檢索
xml_len=48 表示從第48張往後獲取48張圖片
query= 搜索關鍵詞(例:美女,這裡瀏覽器自動做了轉碼,不影響我們使用)

點擊Response,找個JSON格式化工具輔助看看。

JSON格式:https://www.bejson.com/
分析Response返回的信息,可以發現我們想要的圖片地址放在 picUrl 裡。

思路
通過以上分析,不難實現下載方法,思路如下:
1. 設置URL請求參數 2. 訪問URL請求,獲取圖片地址 3. 圖片地址存入List 4. 遍歷List,使用線程池下載到本地
代碼
SougouImgProcessor.java 爬取圖片類
import?com.alibaba.fastjson.JSONObject;
import?us.codecraft.webmagic.utils.HttpClientUtils;
import?victor.chang.crawler.pipeline.SougouImgPipeline;
import?java.util.ArrayList;
import?java.util.List;
/**
?*?A?simple?PageProcessor.
?*?@[email protected]?
?*?@since?0.1.0
?*/
public?class?SougouImgProcessor?{
????private?String?url;
????private?SougouImgPipeline?pipeline;
????private?List?dataList;
????private?List?urlList;
????private?String?word;
????public?SougouImgProcessor(String?url,String?word)?{
????????this.url?=?url;
????????this.word?=?word;
????????this.pipeline?=?new?SougouImgPipeline();
????????this.dataList?=?new?ArrayList<>();
????????this.urlList?=?new?ArrayList<>();
????}
????public?void?process(int?idx,?int?size)?{
????????String?res?=?HttpClientUtils.get(String.format(this.url,?idx,?size,?this.word));
????????JSONObject?object?=?JSONObject.parseObject(res);
????????List?items?=?(List)((JSONObject)object.get("data")).get("items");
????????for(JSONObject?item?:?items){
????????????this.urlList.add(item.getString("picUrl"));
????????}
????????this.dataList.addAll(items);
????}
????//?下載
????public?void?pipelineData(){
????????//?多線程
????????pipeline.processSync(this.urlList,?this.word);
????}
????public?static?void?main(String[]?args)?{
????????String?url?=?"https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
????????SougouImgProcessor?processor?=?new?SougouImgProcessor(url,"美女");
????????int?start?=?0,?size?=?50,?limit?=?1000;?//?定義爬取開始索引、每次爬取數(shù)量、總共爬取數(shù)量
????????for(int?i=start;ilimit;i+=size)
????????????processor.process(i,?size);
????????processor.pipelineData();
????}
}
SougouImgPipeline.java 圖片下載類
import?java.io.File;
import?java.io.FileOutputStream;
import?java.io.InputStream;
import?java.net.URL;
import?java.net.URLConnection;
import?java.util.List;
import?java.util.Objects;
import?java.util.concurrent.ExecutorService;
import?java.util.concurrent.Executors;
import?java.util.concurrent.TimeUnit;
import?java.util.concurrent.atomic.AtomicInteger;
/**
 * Downloads pictures by URL and stores them as files under
 * {@code <path>/<category>/}.
 *
 * <p>The success and failure counters belong to the instance and are shared by every
 * download method, including tasks running on worker threads.
 *
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Extension used when none can be derived from the picture URL. */
    private String extension = ".jpg";

    /** Root directory below which one sub-directory per category is created. */
    private String path;

    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that stores files below {@code E:/pipeline/sougou}. */
    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    /**
     * @param path root directory for downloaded files
     */
    public SougouImgPipeline(String path) {
        this.path = path;
    }

    /**
     * @param path      root directory for downloaded files
     * @param extension fallback file extension (including the dot) for URLs that carry none
     */
    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture into {@code <path>/<cate>/<name><ext>}.
     *
     * <p>If the target file already exists the download is skipped. A download that
     * fails part-way removes the partial file so that a later run retries it instead of
     * treating the truncated file as complete.
     *
     * @param url  picture URL
     * @param cate category; used as the sub-directory name
     * @param name file name without extension; any {@code -} is removed
     * @throws Exception if the directory cannot be created or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        File dir = new File(this.path + "/" + cate + "/");
        // Re-check after mkdirs: another worker may have created the directory meanwhile.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory " + dir);
        }
        String fileName = (name + extensionOf(url)).replace("-", "");
        File img = new File(dir, fileName);
        if (img.exists()) {
            System.out.println(String.format("文件%s已存在本地目錄", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        boolean completed = false;
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[8192];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
            completed = true;
        } finally {
            // Both streams are already closed here, so the partial file can be removed.
            if (!completed) {
                img.delete();
            }
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下載第%s張圖片", suc.getAndIncrement()));
    }

    /**
     * Derives the file extension (including the dot) from the last path segment of the
     * URL, ignoring any query string or fragment. Falls back to the configured
     * extension when the segment has no short alphanumeric extension.
     */
    private String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        int slash = url.lastIndexOf('/', end - 1);
        int dot = url.lastIndexOf('.', end - 1);
        // The dot must belong to the last segment and be followed by 1-5 characters.
        if (dot <= slash || dot >= end - 1 || end - dot > 6) {
            return extension;
        }
        for (int i = dot + 1; i < end; i++) {
            char c = url.charAt(i);
            boolean alphanumeric = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!alphanumeric) {
                return extension;
            }
        }
        return url.substring(dot, end);
    }

    /** Derives a file-system-safe name that stays the same for the same URL across runs. */
    private static String nameFor(String url) {
        return String.format("%08x", url.hashCode());
    }

    /** Downloads one picture, counting and reporting a failure instead of propagating it. */
    private void tryDownload(String picUrl, String word, String name) {
        try {
            downloadImg(picUrl, word, name);
        } catch (Exception e) {
            fails.incrementAndGet();
            System.err.println("Download failed: " + picUrl + " (" + e + ")");
        }
    }

    /**
     * Downloads all URLs sequentially on the calling thread and prints a summary.
     *
     * @param data picture URLs; null entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            // The URL itself is not a valid file name, so a name is derived from it.
            tryDownload(picUrl, word, nameFor(picUrl));
        }
        System.out.println("下載成功: " + suc.get());
        System.out.println("下載失敗: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗時:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all URLs concurrently, one task per URL, waits up to 60 seconds for the
     * tasks to finish and prints a summary. Files are named after the zero-padded list
     * index of their URL.
     *
     * @param data picture URLs; null entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; indexes of 1000 and above keep their own distinct name.
            String name = String.format("%04d", i);
            executorService.execute(() -> tryDownload(picUrl, word, name));
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Running downloads are deliberately not interrupted; the summary below
                // then reflects only what has finished so far.
                System.err.println("Timed out after 60 seconds; some downloads are still running");
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下載成功: " + suc);
            System.out.println("下載失敗: " + fails);
            File dir = new File(this.path + "/" + word + "/");
            // list() returns null when the directory was never created.
            String[] names = dir.list();
            int len = names == null ? 0 : names.length;
            System.out.println("當前共有文件: " + len);
            long end = System.currentTimeMillis();
            System.out.println("耗時:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for downloads: " + e);
        }
    }

    /**
     * Splits the URLs into {@code threadNum} contiguous segments and downloads each
     * segment sequentially on its own worker thread. Returns without waiting for the
     * workers; every worker prints its own summary from the shared counters.
     *
     * <p>Falls back to {@link #process(List, String)} on the calling thread when
     * {@code threadNum} is not positive or exceeds the number of URLs.
     *
     * @param data      picture URLs; null entries are skipped
     * @param word      category; used as the sub-directory name
     * @param threadNum number of segments / worker threads
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}
HttpClientUtils.java http請求工具類
import?org.apache.http.Header;
import?org.apache.http.HttpEntity;
import?org.apache.http.NameValuePair;
import?org.apache.http.client.entity.UrlEncodedFormEntity;
import?org.apache.http.client.methods.CloseableHttpResponse;
import?org.apache.http.client.methods.HttpGet;
import?org.apache.http.client.methods.HttpPost;
import?org.apache.http.client.methods.HttpUriRequest;
import?org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import?org.apache.http.conn.ssl.TrustStrategy;
import?org.apache.http.entity.StringEntity;
import?org.apache.http.impl.client.CloseableHttpClient;
import?org.apache.http.impl.client.HttpClients;
import?org.apache.http.message.BasicNameValuePair;
import?org.apache.http.ssl.SSLContextBuilder;
import?org.apache.http.util.EntityUtils;
import?org.slf4j.Logger;
import?org.slf4j.LoggerFactory;
import?javax.net.ssl.HostnameVerifier;
import?javax.net.ssl.SSLContext;
import?javax.net.ssl.SSLSession;
import?java.io.IOException;
import?java.io.UnsupportedEncodingException;
import?java.security.GeneralSecurityException;
import?java.security.cert.CertificateException;
import?java.security.cert.X509Certificate;
import?java.util.ArrayList;
import?java.util.HashMap;
import?java.util.List;
import?java.util.Map;
/**
?*?@[email protected]
?*?Date:?17/3/27
?*/
public?abstract?class?HttpClientUtils?{
????public?static?Map>?convertHeaders(Header[]?headers)?{
????????Map>?results?=?new?HashMap>();
????????for?(Header?header?:?headers)?{
????????????List?list?=?results.get(header.getName());
????????????if?(list?==?null)?{
????????????????list?=?new?ArrayList();
????????????????results.put(header.getName(),?list);
????????????}
????????????list.add(header.getValue());
????????}
????????return?results;
????}
????/**
?????*?http的get請求
?????*?@param?url
?????*/
????public?static?String?get(String?url)?{
????????return?get(url,?"UTF-8");
????}
????public?static?Logger?logger?=?LoggerFactory.getLogger(HttpClientUtils.class);
????/**
?????*?http的get請求
?????*?@param?url
?????*/
????public?static?String?get(String?url,?String?charset)?{
????????HttpGet?httpGet?=?new?HttpGet(url);
????????return?executeRequest(httpGet,?charset);
????}
????/**
?????*?http的get請求,增加異步請求頭參數(shù)
?????*?@param?url
?????*/
????public?static?String?ajaxGet(String?url)?{
????????return?ajaxGet(url,?"UTF-8");
????}
????/**
?????*?http的get請求,增加異步請求頭參數(shù)
?????*
?????*?@param?url
?????*/
????public?static?String?ajaxGet(String?url,?String?charset)?{
????????HttpGet?httpGet?=?new?HttpGet(url);
????????httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");
????????return?executeRequest(httpGet,?charset);
????}
????/**
?????*?@param?url
?????*?@return
?????*/
????public?static?String?ajaxGet(CloseableHttpClient?httpclient,?String?url)?{
????????HttpGet?httpGet?=?new?HttpGet(url);
????????httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");
????????return?executeRequest(httpclient,?httpGet,?"UTF-8");
????}
????/**
?????*?http的post請求,傳遞map格式參數(shù)
?????*/
????public?static?String?post(String?url,?Map?dataMap)?{
????????return?post(url,?dataMap,?"UTF-8");
????}
????/**
?????*?http的post請求,傳遞map格式參數(shù)
?????*/
????public?static?String?post(String?url,?Map?dataMap,?String?charset)?{
????????HttpPost?httpPost?=?new?HttpPost(url);
????????try?{
????????????if?(dataMap?!=?null)?{
????????????????List?nvps?=?new?ArrayList();
????????????????for?(Map.Entry?entry?:?dataMap.entrySet())?{
????????????????????nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));
????????????????}
????????????????UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);
????????????????formEntity.setContentEncoding(charset);
????????????????httpPost.setEntity(formEntity);
????????????}
????????}?catch?(UnsupportedEncodingException?e)?{
????????????e.printStackTrace();
????????}
????????return?executeRequest(httpPost,?charset);
????}
????/**
?????*?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)
?????*/
????public?static?String?ajaxPost(String?url,?Map?dataMap)?{
????????return?ajaxPost(url,?dataMap,?"UTF-8");
????}
????/**
?????*?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)
?????*/
????public?static?String?ajaxPost(String?url,?Map?dataMap,?String?charset)?{
????????HttpPost?httpPost?=?new?HttpPost(url);
????????httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");
????????try?{
????????????if?(dataMap?!=?null)?{
????????????????List?nvps?=?new?ArrayList();
????????????????for?(Map.Entry?entry?:?dataMap.entrySet())?{
????????????????????nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));
????????????????}
????????????????UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);
????????????????formEntity.setContentEncoding(charset);
????????????????httpPost.setEntity(formEntity);
????????????}
????????}?catch?(UnsupportedEncodingException?e)?{
????????????e.printStackTrace();
????????}
????????return?executeRequest(httpPost,?charset);
????}
????/**
?????*?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)
?????*/
????public?static?String?ajaxPostJson(String?url,?String?jsonString)?{
????????return?ajaxPostJson(url,?jsonString,?"UTF-8");
????}
????/**
?????*?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)
?????*/
????public?static?String?ajaxPostJson(String?url,?String?jsonString,?String?charset)?{
????????HttpPost?httpPost?=?new?HttpPost(url);
????????httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");
????????
????????StringEntity?stringEntity?=?new?StringEntity(jsonString,?charset);//?解決中文亂碼問題
????????stringEntity.setContentEncoding(charset);
????????stringEntity.setContentType("application/json");
????????httpPost.setEntity(stringEntity);
????????return?executeRequest(httpPost,?charset);
????}
????/**
?????*?執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)
?????*/
????public?static?String?executeRequest(HttpUriRequest?httpRequest)?{
????????return?executeRequest(httpRequest,?"UTF-8");
????}
????/**
?????*?執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)
?????*/
????public?static?String?executeRequest(HttpUriRequest?httpRequest,?String?charset)?{
????????CloseableHttpClient?httpclient;
????????if?("https".equals(httpRequest.getURI().getScheme()))?{
????????????httpclient?=?createSSLInsecureClient();
????????}?else?{
????????????httpclient?=?HttpClients.createDefault();
????????}
????????String?result?=?"";
????????try?{
????????????try?{
????????????????CloseableHttpResponse?response?=?httpclient.execute(httpRequest);
????????????????HttpEntity?entity?=?null;
????????????????try?{
????????????????????entity?=?response.getEntity();
????????????????????result?=?EntityUtils.toString(entity,?charset);
????????????????}?finally?{
????????????????????EntityUtils.consume(entity);
????????????????????response.close();
????????????????}
????????????}?finally?{
????????????????httpclient.close();
????????????}
????????}?catch?(IOException?ex)?{
????????????ex.printStackTrace();
????????}
????????return?result;
????}
????public?static?String?executeRequest(CloseableHttpClient?httpclient,?HttpUriRequest?httpRequest,?String?charset)?{
????????String?result?=?"";
????????try?{
????????????try?{
????????????????CloseableHttpResponse?response?=?httpclient.execute(httpRequest);
????????????????HttpEntity?entity?=?null;
????????????????try?{
????????????????????entity?=?response.getEntity();
????????????????????result?=?EntityUtils.toString(entity,?charset);
????????????????}?finally?{
????????????????????EntityUtils.consume(entity);
????????????????????response.close();
????????????????}
????????????}?finally?{
????????????????httpclient.close();
????????????}
????????}?catch?(IOException?ex)?{
????????????ex.printStackTrace();
????????}
????????return?result;
????}
????/**
?????*?創(chuàng)建?SSL連接
?????*/
????public?static?CloseableHttpClient?createSSLInsecureClient()?{
????????try?{
????????????SSLContext?sslContext?=?new?SSLContextBuilder().loadTrustMaterial(new?TrustStrategy()?{
????????????????@Override
????????????????public?boolean?isTrusted(X509Certificate[]?chain,?String?authType)?throws?CertificateException?{
????????????????????return?true;
????????????????}
????????????}).build();
????????????SSLConnectionSocketFactory?sslsf?=?new?SSLConnectionSocketFactory(sslContext,?new?HostnameVerifier()?{
????????????????@Override
????????????????public?boolean?verify(String?hostname,?SSLSession?session)?{
????????????????????return?true;
????????????????}
????????????});
????????????return?HttpClients.custom().setSSLSocketFactory(sslsf).build();
????????}?catch?(GeneralSecurityException?ex)?{
????????????throw?new?RuntimeException(ex);
????????}
????}
}
運行
由於網絡等原因,我們發現並不能全部下載成功,不過可以多次運行嘗試,可以實現較高的下載成功率。



666,厲害了。。
END


頂級程序員:topcoding
做最好的程序員社區:Java後端開發、Python、大數據、AI
一鍵三連「分享」、「點贊」和「在看」
