用 Java 爬小姐姐圖片,這個厲害了!
不點藍字關注,我們哪來故事?

正文如下
來自:blog.csdn.net/qq_35402412/article/details/113627625
目的
準備工作
https://pic.sogou.com/pics?query=美女
分析
https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=美女
start=48 表示從第48張圖片開始檢索
xml_len=48 從第48張往後獲取48張圖片
query=? 搜索關鍵詞(例:美女,這裡瀏覽器自動做了轉碼,不影響我們使用)



思路
1. 設置URL請求參數
2. 訪問URL請求,獲取圖片地址
3. 圖片地址存入List
4. 遍歷List,使用線程池下載到本地
代碼
import?com.alibaba.fastjson.JSONObject;??
import?us.codecraft.webmagic.utils.HttpClientUtils;??
import?victor.chang.crawler.pipeline.SougouImgPipeline;??
??
import?java.util.ArrayList;??
import?java.util.List;??
??
/**??
?*?A?simple?PageProcessor.??
?*?@author[email protected]?
??
?*?@since?0.1.0??
?*/??
public?class?SougouImgProcessor?{??
??
????private?String?url;??
????private?SougouImgPipeline?pipeline;??
????private?List?dataList;??
????private?List?urlList;??
????private?String?word;??
??
????public?SougouImgProcessor(String?url,String?word)?{??
????????this.url?=?url;??
????????this.word?=?word;??
????????this.pipeline?=?new?SougouImgPipeline();??
????????this.dataList?=?new?ArrayList<>();??
????????this.urlList?=?new?ArrayList<>();??
????}??
??
????public?void?process(int?idx,?int?size)?{??
????????String?res?=?HttpClientUtils.get(String.format(this.url,?idx,?size,?this.word));??
????????JSONObject?object?=?JSONObject.parseObject(res);??
????????List?items?=?(List )((JSONObject)object.get("data")).get("items");??
????????for(JSONObject?item?:?items){??
????????????this.urlList.add(item.getString("picUrl"));??
????????}??
????????this.dataList.addAll(items);??
????}??
??
????//?下載??
????public?void?pipelineData(){??
????????//?多線程??
????????pipeline.processSync(this.urlList,?this.word);??
????}??
??
??
????public?static?void?main(String[]?args)?{??
????????String?url?=?"https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";??
????????SougouImgProcessor?processor?=?new?SougouImgProcessor(url,"美女");??
??
????????int?start?=?0,?size?=?50,?limit?=?1000;?//?定義爬取開始索引、每次爬取數(shù)量、總共爬取數(shù)量??
??
????????for(int?i=start;i????????????processor.process(i,?size);??
??
????????processor.pipelineData();??
??
????}??
??
}??
import?java.io.File;??
import?java.io.FileOutputStream;??
import?java.io.InputStream;??
import?java.net.URL;??
import?java.net.URLConnection;??
import?java.util.List;??
import?java.util.Objects;??
import?java.util.concurrent.ExecutorService;??
import?java.util.concurrent.Executors;??
import?java.util.concurrent.TimeUnit;??
import?java.util.concurrent.atomic.AtomicInteger;??
??
/**
 * Stores downloaded images as files, one sub-directory per category.
 *
 * <p>Downloads can run sequentially ({@link #process}), as one task per URL
 * ({@link #processSync}) or in contiguous segments ({@link #processSync2}). All three share
 * the same success and failure counters, which are never reset.
 *
 * @author [email protected]
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Longest extension (without the dot) that is taken over from a URL. */
    private static final int MAX_EXTENSION_LENGTH = 5;

    /** Extension used when the URL path does not end in a usable one. */
    private String extension = ".jpg";
    private String path;

    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    public SougouImgPipeline(String path) {
        this.path = path;
    }

    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one image into {@code <path>/<cate>/<name><ext>}.
     *
     * <p>An existing target file is treated as already downloaded and left untouched. A
     * transfer that fails part-way removes the partial file, so a later run retries it
     * instead of skipping it.
     *
     * @param url  address of the image
     * @param cate category, used as the sub-directory name
     * @param name file name without extension
     * @throws Exception if the URL is malformed, the directory cannot be created, or the
     *                   transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        // mkdirs() also returns false when another worker created the directory first,
        // hence the second isDirectory() check.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory " + dirPath);
        }

        URL source = new URL(url);
        String fileName = name + extensionOf(source);
        File img = new File(dir, fileName);
        if (img.exists()) {
            System.out.println(String.format("文件%s已存在本地目錄", fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);

        boolean complete = false;
        try (InputStream in = con.getInputStream();
             FileOutputStream out = new FileOutputStream(img)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            // Both streams are closed by now; drop a truncated file.
            if (!complete && img.exists()) {
                img.delete();
            }
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下載第%s張圖片", suc.incrementAndGet()));
    }

    /**
     * Returns the extension (including the dot) of the last path segment of {@code source},
     * or the configured default when the path has no short alphanumeric extension. Query
     * string and fragment are ignored because only the URL path is inspected.
     */
    private String extensionOf(URL source) {
        String resource = source.getPath();
        int dot = resource.lastIndexOf('.');
        int slash = resource.lastIndexOf('/');
        if (dot > slash && dot < resource.length() - 1) {
            String candidate = resource.substring(dot);
            if (candidate.length() - 1 <= MAX_EXTENSION_LENGTH && isAlphanumeric(candidate, 1)) {
                return candidate;
            }
        }
        return extension;
    }

    /** Tells whether every character of {@code text} from index {@code from} is an ASCII letter or digit. */
    private static boolean isAlphanumeric(String text, int from) {
        for (int i = from; i < text.length(); i++) {
            char c = text.charAt(i);
            boolean ok = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!ok) {
                return false;
            }
        }
        return true;
    }

    /** Downloads one image, counting and reporting a failure instead of propagating it. */
    private void downloadCounted(String picUrl, String word, String name) {
        try {
            downloadImg(picUrl, word, name);
        } catch (Exception e) {
            fails.incrementAndGet();
            System.err.println("Download failed for " + picUrl + ": " + e);
        }
    }

    /**
     * Downloads the images one after another on the calling thread.
     *
     * <p>Each file is named after the hash of its URL, so the name is a valid file name and
     * stays the same no matter where the URL sits in {@code data}.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        Objects.requireNonNull(data, "data");
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            downloadCounted(picUrl, word, Integer.toHexString(picUrl.hashCode()));
        }
        System.out.println("下載成功: " + suc.get());
        System.out.println("下載失敗: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗時:" + (end - start) / 1000 + "秒");
    }


    /**
     * Downloads the images concurrently, one task per URL, waits up to 60 seconds for the
     * tasks to finish and then prints a summary.
     *
     * <p>Files are named after the zero-padded position of their URL in {@code data}
     * ({@code 0000}, {@code 0001}, ...); positions of 10000 and above simply use more digits.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        Objects.requireNonNull(data, "data");
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            String name = String.format(java.util.Locale.ROOT, "%04d", i);
            executorService.execute(() -> downloadCounted(picUrl, word, name));
        }
        executorService.shutdown();
        try {
            // Tasks still running after the timeout are not interrupted; the summary
            // below then reflects only the work finished so far.
            executorService.awaitTermination(60, TimeUnit.SECONDS);
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下載成功: " + suc);
            System.out.println("下載失敗: " + fails);

            // list() returns null when the directory was never created (e.g. no URLs).
            String[] stored = new File(this.path + "/" + word + "/").list();
            int len = stored == null ? 0 : stored.length;
            System.out.println("當前共有文件: " + len);

            long end = System.currentTimeMillis();
            System.out.println("耗時:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

    }


    /**
     * Splits the URLs into {@code threadNum} contiguous segments and runs {@link #process}
     * on each segment in its own task. Returns without waiting for the tasks to finish.
     *
     * <p>When there are fewer URLs than threads, or {@code threadNum} is not positive, the
     * whole list is processed on the calling thread instead.
     *
     * @param data      image URLs; {@code null} entries are skipped
     * @param word      category, used as the sub-directory name
     * @param threadNum number of segments
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        Objects.requireNonNull(data, "data");
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum;    // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder of the division.
            int end = (i == threadNum - 1) ? data.size() : start + num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }

}
import?org.apache.http.Header;??
import?org.apache.http.HttpEntity;??
import?org.apache.http.NameValuePair;??
import?org.apache.http.client.entity.UrlEncodedFormEntity;??
import?org.apache.http.client.methods.CloseableHttpResponse;??
import?org.apache.http.client.methods.HttpGet;??
import?org.apache.http.client.methods.HttpPost;??
import?org.apache.http.client.methods.HttpUriRequest;??
import?org.apache.http.conn.ssl.SSLConnectionSocketFactory;??
import?org.apache.http.conn.ssl.TrustStrategy;??
import?org.apache.http.entity.StringEntity;??
import?org.apache.http.impl.client.CloseableHttpClient;??
import?org.apache.http.impl.client.HttpClients;??
import?org.apache.http.message.BasicNameValuePair;??
import?org.apache.http.ssl.SSLContextBuilder;??
import?org.apache.http.util.EntityUtils;??
import?org.slf4j.Logger;??
import?org.slf4j.LoggerFactory;??
??
import?javax.net.ssl.HostnameVerifier;??
import?javax.net.ssl.SSLContext;??
import?javax.net.ssl.SSLSession;??
import?java.io.IOException;??
import?java.io.UnsupportedEncodingException;??
import?java.security.GeneralSecurityException;??
import?java.security.cert.CertificateException;??
import?java.security.cert.X509Certificate;??
import?java.util.ArrayList;??
import?java.util.HashMap;??
import?java.util.List;??
import?java.util.Map;??
??
/**??
?*?@author[email protected]??
?*?Date:?17/3/27??
?*/??
public?abstract?class?HttpClientUtils?{??
??
????public?static?Map>?convertHeaders(Header[]?headers)?{??
????????Map>?results?=?new?HashMap >();??
????????for?(Header?header?:?headers)?{??
????????????List?list?=?results.get(header.getName());??
????????????if?(list?==?null)?{??
????????????????list?=?new?ArrayList();??
????????????????results.put(header.getName(),?list);??
????????????}??
????????????list.add(header.getValue());??
????????}??
????????return?results;??
????}??
??
????/**??
?????*?http的get請求??
?????*?@param?url??
?????*/??
????public?static?String?get(String?url)?{??
????????return?get(url,?"UTF-8");??
????}??
??
????public?static?Logger?logger?=?LoggerFactory.getLogger(HttpClientUtils.class);??
??
????/**??
?????*?http的get請求??
?????*?@param?url??
?????*/??
????public?static?String?get(String?url,?String?charset)?{??
????????HttpGet?httpGet?=?new?HttpGet(url);??
????????return?executeRequest(httpGet,?charset);??
????}??
??
????/**??
?????*?http的get請求,增加異步請求頭參數(shù)??
?????*?@param?url??
?????*/??
????public?static?String?ajaxGet(String?url)?{??
????????return?ajaxGet(url,?"UTF-8");??
????}??
??
????/**??
?????*?http的get請求,增加異步請求頭參數(shù)??
?????*??
?????*?@param?url??
?????*/??
????public?static?String?ajaxGet(String?url,?String?charset)?{??
????????HttpGet?httpGet?=?new?HttpGet(url);??
????????httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");??
????????return?executeRequest(httpGet,?charset);??
????}??
??
????/**??
?????*?@param?url??
?????*?@return??
?????*/??
????public?static?String?ajaxGet(CloseableHttpClient?httpclient,?String?url)?{??
????????HttpGet?httpGet?=?new?HttpGet(url);??
????????httpGet.setHeader("X-Requested-With",?"XMLHttpRequest");??
????????return?executeRequest(httpclient,?httpGet,?"UTF-8");??
????}??
??
????/**??
?????*?http的post請求,傳遞map格式參數(shù)??
?????*/??
????public?static?String?post(String?url,?Map?dataMap) ?{??
????????return?post(url,?dataMap,?"UTF-8");??
????}??
??
????/**??
?????*?http的post請求,傳遞map格式參數(shù)??
?????*/??
????public?static?String?post(String?url,?Map?dataMap,?String?charset) ?{??
????????HttpPost?httpPost?=?new?HttpPost(url);??
????????try?{??
????????????if?(dataMap?!=?null)?{??
????????????????List?nvps?=?new?ArrayList ();??
????????????????for?(Map.Entry?entry?:?dataMap.entrySet())?{??
????????????????????nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));??
????????????????}??
????????????????UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);??
????????????????formEntity.setContentEncoding(charset);??
????????????????httpPost.setEntity(formEntity);??
????????????}??
????????}?catch?(UnsupportedEncodingException?e)?{??
????????????e.printStackTrace();??
????????}??
????????return?executeRequest(httpPost,?charset);??
????}??
??
????/**??
?????*?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)??
?????*/??
????public?static?String?ajaxPost(String?url,?Map?dataMap) ?{??
????????return?ajaxPost(url,?dataMap,?"UTF-8");??
????}??
??
????/**??
?????*?http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)??
?????*/??
????public?static?String?ajaxPost(String?url,?Map?dataMap,?String?charset) ?{??
????????HttpPost?httpPost?=?new?HttpPost(url);??
????????httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");??
????????try?{??
????????????if?(dataMap?!=?null)?{??
????????????????List?nvps?=?new?ArrayList ();??
????????????????for?(Map.Entry?entry?:?dataMap.entrySet())?{??
????????????????????nvps.add(new?BasicNameValuePair(entry.getKey(),?entry.getValue()));??
????????????????}??
????????????????UrlEncodedFormEntity?formEntity?=?new?UrlEncodedFormEntity(nvps,?charset);??
????????????????formEntity.setContentEncoding(charset);??
????????????????httpPost.setEntity(formEntity);??
????????????}??
????????}?catch?(UnsupportedEncodingException?e)?{??
????????????e.printStackTrace();??
????????}??
????????return?executeRequest(httpPost,?charset);??
????}??
??
????/**??
?????*?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)??
?????*/??
????public?static?String?ajaxPostJson(String?url,?String?jsonString)?{??
????????return?ajaxPostJson(url,?jsonString,?"UTF-8");??
????}??
??
????/**??
?????*?http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)??
?????*/??
????public?static?String?ajaxPostJson(String?url,?String?jsonString,?String?charset)?{??
????????HttpPost?httpPost?=?new?HttpPost(url);??
????????httpPost.setHeader("X-Requested-With",?"XMLHttpRequest");??
??????????
????????StringEntity?stringEntity?=?new?StringEntity(jsonString,?charset);//?解決中文亂碼問題??
????????stringEntity.setContentEncoding(charset);??
????????stringEntity.setContentType("application/json");??
????????httpPost.setEntity(stringEntity);??
????????return?executeRequest(httpPost,?charset);??
????}??
??
????/**??
?????*?執(zhí)行一個(gè)http請求,傳遞HttpGet或HttpPost參數(shù)??
?????*/??
????public?static?String?executeRequest(HttpUriRequest?httpRequest)?{??
????????return?executeRequest(httpRequest,?"UTF-8");??
????}??
??
????/**??
?????*?執(zhí)行一個(gè)http請求,傳遞HttpGet或HttpPost參數(shù)??
?????*/??
????public?static?String?executeRequest(HttpUriRequest?httpRequest,?String?charset)?{??
????????CloseableHttpClient?httpclient;??
????????if?("https".equals(httpRequest.getURI().getScheme()))?{??
????????????httpclient?=?createSSLInsecureClient();??
????????}?else?{??
????????????httpclient?=?HttpClients.createDefault();??
????????}??
????????String?result?=?"";??
????????try?{??
????????????try?{??
????????????????CloseableHttpResponse?response?=?httpclient.execute(httpRequest);??
????????????????HttpEntity?entity?=?null;??
????????????????try?{??
????????????????????entity?=?response.getEntity();??
????????????????????result?=?EntityUtils.toString(entity,?charset);??
????????????????}?finally?{??
????????????????????EntityUtils.consume(entity);??
????????????????????response.close();??
????????????????}??
????????????}?finally?{??
????????????????httpclient.close();??
????????????}??
????????}?catch?(IOException?ex)?{??
????????????ex.printStackTrace();??
????????}??
????????return?result;??
????}??
??
????public?static?String?executeRequest(CloseableHttpClient?httpclient,?HttpUriRequest?httpRequest,?String?charset)?{??
????????String?result?=?"";??
????????try?{??
????????????try?{??
????????????????CloseableHttpResponse?response?=?httpclient.execute(httpRequest);??
????????????????HttpEntity?entity?=?null;??
????????????????try?{??
????????????????????entity?=?response.getEntity();??
????????????????????result?=?EntityUtils.toString(entity,?charset);??
????????????????}?finally?{??
????????????????????EntityUtils.consume(entity);??
????????????????????response.close();??
????????????????}??
????????????}?finally?{??
????????????????httpclient.close();??
????????????}??
????????}?catch?(IOException?ex)?{??
????????????ex.printStackTrace();??
????????}??
????????return?result;??
????}??
??
????/**??
?????*?創(chuàng)建?SSL連接??
?????*/??
????public?static?CloseableHttpClient?createSSLInsecureClient()?{??
????????try?{??
????????????SSLContext?sslContext?=?new?SSLContextBuilder().loadTrustMaterial(new?TrustStrategy()?{??
????????????????@Override??
????????????????public?boolean?isTrusted(X509Certificate[]?chain,?String?authType)?throws?CertificateException?{??
????????????????????return?true;??
????????????????}??
????????????}).build();??
????????????SSLConnectionSocketFactory?sslsf?=?new?SSLConnectionSocketFactory(sslContext,?new?HostnameVerifier()?{??
????????????????@Override??
????????????????public?boolean?verify(String?hostname,?SSLSession?session)?{??
????????????????????return?true;??
????????????????}??
????????????});??
????????????return?HttpClients.custom().setSSLSocketFactory(sslsf).build();??
????????}?catch?(GeneralSecurityException?ex)?{??
????????????throw?new?RuntimeException(ex);??
????????}??
????}??
}??
運行



往期推薦
-END-
↑ 點擊上方關注我公號 ↑
我是泥瓦匠,堅持分享編程、算法、Java 等乾貨教程
一枚醫科大本科生,開源小作者,半吊子創業愛好者...
半吊子的自己在試錯,不知道以後會幹什麼,但享受現在的試錯,試錯給我驚訝的生活
喜歡公號的互動分享,感謝關注,路上遇見了你,同一小段時間之路,相伴 ~
長按識別,加我微信

點個在看,結對編程一把

評論
圖片
表情
