来源:blog.csdn.net/qq_35402412/article/details/113627625
目的
爬取搜狗图片上千张美女图片并下载到本地
准备工作
爬取地址:https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3
分析
打开上面的地址,按 F12 打开开发者工具 - Network - XHR,页面往下滑动后,XHR 栏会出现如下请求信息:
Request URL :https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3
分析这段请求URL的主要几个参数:
start=48 表示从第48张图片开始检索
xml_len=48 从第48张往后获取48张图片
query=?搜索关键词(例:美女,这里浏览器自动做了转码,不影响我们使用)
点击 Response,找个 JSON 格式化工具辅助查看返回内容。
JSON格式:https://www.bejson.com/
分析 Response 返回的信息,可以发现我们想要的图片地址放在 picUrl 字段里。
思路
通过以上分析,不难实现下载方法,思路如下:
- 设置URL请求参数
- 访问URL请求,获取图片地址
- 图片地址存入List
- 遍历List,使用线程池下载到本地
代码
SougouImgProcessor.java 爬取图片类
import com.alibaba.fastjson.JSONObject; | |
import us.codecraft.webmagic.utils.HttpClientUtils; | |
import victor.chang.crawler.pipeline.SougouImgPipeline; | |
import java.util.ArrayList; | |
import java.util.List; | |
/** | |
* A simple PageProcessor. | |
* @author code4crafter@gmail.com <br> | |
* @since 0.1.0 | |
*/ | |
public class SougouImgProcessor { | |
private String url; | |
private SougouImgPipeline pipeline; | |
private List<JSONObject> dataList; | |
private List<String> urlList; | |
private String word; | |
public SougouImgProcessor(String url,String word) { | |
this.url = url; | |
this.word = word; | |
this.pipeline = new SougouImgPipeline(); | |
this.dataList = new ArrayList<>(); | |
this.urlList = new ArrayList<>(); | |
} | |
public void process(int idx, int size) { | |
String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word)); | |
JSONObject object = JSONObject.parseObject(res); | |
List<JSONObject> items = (List<JSONObject>)((JSONObject)object.get("data")).get("items"); | |
for(JSONObject item : items){ | |
this.urlList.add(item.getString("picUrl")); | |
} | |
this.dataList.addAll(items); | |
} | |
// 下载 | |
public void pipelineData(){ | |
// 多线程 | |
pipeline.processSync(this.urlList, this.word); | |
} | |
public static void main(String[] args) { | |
String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s"; | |
SougouImgProcessor processor = new SougouImgProcessor(url,"美女"); | |
int start = 0, size = 50, limit = 1000; // 定义爬取开始索引、每次爬取数量、总共爬取数量 | |
for(int i=start;i<start+limit;i+=size) | |
processor.process(i, size); | |
processor.pipelineData(); | |
} | |
} |
SougouImgPipeline.java 图片下载类
import java.io.File; | |
import java.io.FileOutputStream; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.List; | |
import java.util.Objects; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import java.util.concurrent.TimeUnit; | |
import java.util.concurrent.atomic.AtomicInteger; | |
/**
 * Downloads images referenced by URL into {@code <path>/<category>/} on the
 * local disk, either sequentially or on a thread pool.
 *
 * <p>Files are named after the zero-padded position of their URL in the input
 * list (for example {@code 0007.jpg}); a file that already exists is not
 * downloaded again. The success and failure counters belong to the pipeline
 * instance and are shared by all worker threads.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Extension (including the dot) used when none can be derived from the image URL. */
    private String extension = ".jpg";
    /** Root directory below which one sub-directory per category is created. */
    private String path;
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        setPath("E:/pipeline/sougou");
    }

    public SougouImgPipeline(String path) {
        setPath(path);
    }

    public SougouImgPipeline(String path, String extension) {
        setPath(path);
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads a single image unless the target file already exists.
     *
     * @param url  address of the image
     * @param cate category, used as the sub-directory name
     * @param name file name without extension
     * @throws Exception if the URL is malformed or the transfer fails; a
     *                   partially written file is removed before rethrowing
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        URL source = new URL(url);

        File dir = new File(this.path + "/" + cate + "/");
        if (!dir.exists()) {
            dir.mkdirs();
        }

        String fileName = name + resolveExtension(source.getPath());
        File img = new File(dir, fileName);
        if (img.exists()) { // downloaded by an earlier run: skip
            System.out.println(String.format("文件%s已存在本地目录",fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);

        boolean complete = false;
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[8192];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
            complete = true;
        } finally {
            // Runs after both streams are closed. A truncated file must not
            // survive, otherwise the next run would treat it as downloaded.
            if (!complete) {
                img.delete();
            }
        }

        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.getAndIncrement()));
    }

    /**
     * Derives the file extension from the last segment of a URL path.
     *
     * @param urlPath path component of the URL (no query string or fragment)
     * @return the extension including the dot when the last segment ends in a
     *         short ASCII alphanumeric suffix, otherwise the configured default
     */
    private String resolveExtension(String urlPath) {
        String segment = urlPath == null ? "" : urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = segment.lastIndexOf('.');
        if (dot >= 0 && dot < segment.length() - 1 && segment.length() - dot <= 6) {
            boolean plain = true;
            for (int i = dot + 1; i < segment.length(); i++) {
                char c = segment.charAt(i);
                if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
                    plain = false;
                    break;
                }
            }
            if (plain) {
                return segment.substring(dot);
            }
        }
        return extension == null ? "" : extension;
    }

    /** Zero-pads an index to at least four digits; larger indexes keep all their digits. */
    private static String indexName(int index) {
        return String.format("%04d", index);
    }

    /** Downloads one image and records a failure instead of propagating it. */
    private void tryDownload(String picUrl, String word, String name) {
        try {
            downloadImg(picUrl, word, name);
        } catch (Exception e) {
            fails.incrementAndGet();
            System.err.println("下载失败: " + picUrl + " (" + e + ")");
        }
    }

    /**
     * Downloads the URLs one after another on the calling thread and prints
     * the counters and the elapsed time.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        processFrom(data, word, 0);
    }

    /**
     * Sequential download of one segment.
     *
     * @param offset position of {@code data.get(0)} in the complete list, so
     *               that segments handled by different threads do not produce
     *               the same file names
     */
    private void processFrom(List<String> data, String word, int offset) {
        long start = System.currentTimeMillis();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            tryDownload(picUrl, word, indexName(offset + i));
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads every URL as its own task on a cached thread pool, waits up to
     * 60 seconds for the tasks to finish and prints a summary.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            final String name = indexName(i);
            executorService.execute(() -> tryDownload(picUrl, word, name));
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Running tasks are deliberately not interrupted; they finish
                // in the background, so the numbers below may be incomplete.
                System.err.println("等待超时,仍有下载任务在后台运行,以下统计可能不完整");
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: "+data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);
            // list() returns null when the directory was never created.
            String[] files = new File(this.path + "/" + word + "/").list();
            int len = files == null ? 0 : files.length;
            System.out.println("当前共有文件: "+len);
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
            e.printStackTrace();
        }
    }

    /**
     * Splits the URLs into {@code threadNum} contiguous segments and downloads
     * each segment sequentially on its own pool thread. Returns without
     * waiting for the segments to finish. Falls back to {@link #process} when
     * there are fewer URLs than threads or {@code threadNum} is not positive.
     *
     * @param data      image URLs; {@code null} entries are skipped
     * @param word      category, used as the sub-directory name
     * @param threadNum number of segments / worker threads
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment; the last one also takes the remainder
        for (int i = 0; i < threadNum; i++) {
            final int start = i * num;
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> processFrom(cutList, word, start));
        }
        executorService.shutdown();
    }
}
HttpClientUtils.java http请求工具类
import org.apache.http.Header; | |
import org.apache.http.HttpEntity; | |
import org.apache.http.NameValuePair; | |
import org.apache.http.client.entity.UrlEncodedFormEntity; | |
import org.apache.http.client.methods.CloseableHttpResponse; | |
import org.apache.http.client.methods.HttpGet; | |
import org.apache.http.client.methods.HttpPost; | |
import org.apache.http.client.methods.HttpUriRequest; | |
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; | |
import org.apache.http.conn.ssl.TrustStrategy; | |
import org.apache.http.entity.StringEntity; | |
import org.apache.http.impl.client.CloseableHttpClient; | |
import org.apache.http.impl.client.HttpClients; | |
import org.apache.http.message.BasicNameValuePair; | |
import org.apache.http.ssl.SSLContextBuilder; | |
import org.apache.http.util.EntityUtils; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import javax.net.ssl.HostnameVerifier; | |
import javax.net.ssl.SSLContext; | |
import javax.net.ssl.SSLSession; | |
import java.io.IOException; | |
import java.io.UnsupportedEncodingException; | |
import java.security.GeneralSecurityException; | |
import java.security.cert.CertificateException; | |
import java.security.cert.X509Certificate; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
/** | |
* @author code4crafter@gmail.com | |
* Date: 17/3/27 | |
*/ | |
public abstract class HttpClientUtils { | |
public static Map<String, List<String>> convertHeaders(Header[] headers) { | |
Map<String, List<String>> results = new HashMap<String, List<String>>(); | |
for (Header header : headers) { | |
List<String> list = results.get(header.getName()); | |
if (list == null) { | |
list = new ArrayList<String>(); | |
results.put(header.getName(), list); | |
} | |
list.add(header.getValue()); | |
} | |
return results; | |
} | |
/** | |
* http的get请求 | |
* @param url | |
*/ | |
public static String get(String url) { | |
return get(url, "UTF-8"); | |
} | |
public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class); | |
/** | |
* http的get请求 | |
* @param url | |
*/ | |
public static String get(String url, String charset) { | |
HttpGet httpGet = new HttpGet(url); | |
return executeRequest(httpGet, charset); | |
} | |
/** | |
* http的get请求,增加异步请求头参数 | |
* @param url | |
*/ | |
public static String ajaxGet(String url) { | |
return ajaxGet(url, "UTF-8"); | |
} | |
/** | |
* http的get请求,增加异步请求头参数 | |
* | |
* @param url | |
*/ | |
public static String ajaxGet(String url, String charset) { | |
HttpGet httpGet = new HttpGet(url); | |
httpGet.setHeader("X-Requested-With", "XMLHttpRequest"); | |
return executeRequest(httpGet, charset); | |
} | |
/** | |
* @param url | |
* @return | |
*/ | |
public static String ajaxGet(CloseableHttpClient httpclient, String url) { | |
HttpGet httpGet = new HttpGet(url); | |
httpGet.setHeader("X-Requested-With", "XMLHttpRequest"); | |
return executeRequest(httpclient, httpGet, "UTF-8"); | |
} | |
/** | |
* http的post请求,传递map格式参数 | |
*/ | |
public static String post(String url, Map<String, String> dataMap) { | |
return post(url, dataMap, "UTF-8"); | |
} | |
/** | |
* http的post请求,传递map格式参数 | |
*/ | |
public static String post(String url, Map<String, String> dataMap, String charset) { | |
HttpPost httpPost = new HttpPost(url); | |
try { | |
if (dataMap != null) { | |
List<NameValuePair> nvps = new ArrayList<NameValuePair>(); | |
for (Map.Entry<String, String> entry : dataMap.entrySet()) { | |
nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); | |
} | |
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset); | |
formEntity.setContentEncoding(charset); | |
httpPost.setEntity(formEntity); | |
} | |
} catch (UnsupportedEncodingException e) { | |
e.printStackTrace(); | |
} | |
return executeRequest(httpPost, charset); | |
} | |
/** | |
* http的post请求,增加异步请求头参数,传递map格式参数 | |
*/ | |
public static String ajaxPost(String url, Map<String, String> dataMap) { | |
return ajaxPost(url, dataMap, "UTF-8"); | |
} | |
/** | |
* http的post请求,增加异步请求头参数,传递map格式参数 | |
*/ | |
public static String ajaxPost(String url, Map<String, String> dataMap, String charset) { | |
HttpPost httpPost = new HttpPost(url); | |
httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); | |
try { | |
if (dataMap != null) { | |
List<NameValuePair> nvps = new ArrayList<NameValuePair>(); | |
for (Map.Entry<String, String> entry : dataMap.entrySet()) { | |
nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); | |
} | |
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset); | |
formEntity.setContentEncoding(charset); | |
httpPost.setEntity(formEntity); | |
} | |
} catch (UnsupportedEncodingException e) { | |
e.printStackTrace(); | |
} | |
return executeRequest(httpPost, charset); | |
} | |
/** | |
* http的post请求,增加异步请求头参数,传递json格式参数 | |
*/ | |
public static String ajaxPostJson(String url, String jsonString) { | |
return ajaxPostJson(url, jsonString, "UTF-8"); | |
} | |
/** | |
* http的post请求,增加异步请求头参数,传递json格式参数 | |
*/ | |
public static String ajaxPostJson(String url, String jsonString, String charset) { | |
HttpPost httpPost = new HttpPost(url); | |
httpPost.setHeader("X-Requested-With", "XMLHttpRequest"); | |
StringEntity stringEntity = new StringEntity(jsonString, charset);// 解决中文乱码问题 | |
stringEntity.setContentEncoding(charset); | |
stringEntity.setContentType("application/json"); | |
httpPost.setEntity(stringEntity); | |
return executeRequest(httpPost, charset); | |
} | |
/** | |
* 执行一个http请求,传递HttpGet或HttpPost参数 | |
*/ | |
public static String executeRequest(HttpUriRequest httpRequest) { | |
return executeRequest(httpRequest, "UTF-8"); | |
} | |
/** | |
* 执行一个http请求,传递HttpGet或HttpPost参数 | |
*/ | |
public static String executeRequest(HttpUriRequest httpRequest, String charset) { | |
CloseableHttpClient httpclient; | |
if ("https".equals(httpRequest.getURI().getScheme())) { | |
httpclient = createSSLInsecureClient(); | |
} else { | |
httpclient = HttpClients.createDefault(); | |
} | |
String result = ""; | |
try { | |
try { | |
CloseableHttpResponse response = httpclient.execute(httpRequest); | |
HttpEntity entity = null; | |
try { | |
entity = response.getEntity(); | |
result = EntityUtils.toString(entity, charset); | |
} finally { | |
EntityUtils.consume(entity); | |
response.close(); | |
} | |
} finally { | |
httpclient.close(); | |
} | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
return result; | |
} | |
public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) { | |
String result = ""; | |
try { | |
try { | |
CloseableHttpResponse response = httpclient.execute(httpRequest); | |
HttpEntity entity = null; | |
try { | |
entity = response.getEntity(); | |
result = EntityUtils.toString(entity, charset); | |
} finally { | |
EntityUtils.consume(entity); | |
response.close(); | |
} | |
} finally { | |
httpclient.close(); | |
} | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
return result; | |
} | |
/** | |
* 创建 SSL连接 | |
*/ | |
public static CloseableHttpClient createSSLInsecureClient() { | |
try { | |
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() { | |
public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException { | |
return true; | |
} | |
}).build(); | |
SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() { | |
public boolean verify(String hostname, SSLSession session) { | |
return true; | |
} | |
}); | |
return HttpClients.custom().setSSLSocketFactory(sslsf).build(); | |
} catch (GeneralSecurityException ex) { | |
throw new RuntimeException(ex); | |
} | |
} | |
} |
运行
由于网络等原因,我们发现并不能全部下载成功,不过可以多次运行尝试,可以实现较高的下载成功率。
666,厉害了。。