技术标签: ssm框架 jsoup httpclient java爬虫
首先访问京东,搜索手机,分析页面,我们抓取以下商品数据:
商品图片、价格、标题、商品详情页
技术要求:springboot spring data jpa httpclient jsoup
数据库准备:
-- Switch to the schema used by the crawler.
use crawler;
-- One row per crawled JD product SKU: identifiers, title, price, the stored
-- picture name, the detail-page URL and bookkeeping timestamps.
-- NOTE(review): spu/sku are bigint here, while the Item entity further down
-- declares them as String; confirm the implicit conversion is intended.
CREATE TABLE `jd_item` (
`id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键 id',
`spu` bigint(15) DEFAULT NULL COMMENT '商品集合 id',
`sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元 id',
`title` varchar(100) DEFAULT NULL COMMENT '商品标题',
`price` bigint(10) DEFAULT NULL COMMENT '商品价格',
`pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
`url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
`created` datetime DEFAULT NULL COMMENT '创建时间',
`updated` datetime DEFAULT NULL COMMENT '更新时间',
PRIMARY KEY (`id`),
-- Secondary index on sku to support lookups by SKU id.
KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表'
1.创建springboot 工程 ,导入依赖
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.1.RELEASE</version>
</parent>
<groupId>com.itheima</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
    <!-- Spring MVC / embedded web container (also brings in Jackson) -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Spring Data JPA: required by @Entity / JpaRepository used in the Item and ItemDao code -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- MySQL JDBC driver (version managed by the Spring Boot parent) -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <!-- Logging -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- Jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- Utilities -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
</dependencies>
2.编写配置文件
3.编写启动类
@SpringBootApplication
public class SpringBootRun {
public static void main(String[] args) {
SpringApplication.run(SpringBootRun.class,args);
}
}
4.编写持久化类
/**
 * JPA entity mapped to the {@code jd_item} table: one crawled product SKU.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Id of the product group (SPU) this SKU belongs to. */
    private String spu;

    /** Id of the individual sellable unit (SKU). */
    private String sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Long price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time of the row. */
    private Date created;

    /** Last update time of the row. */
    private Date updated;

    // ---- identifiers ----

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getSpu() {
        return this.spu;
    }

    public void setSpu(String spu) {
        this.spu = spu;
    }

    public String getSku() {
        return this.sku;
    }

    public void setSku(String sku) {
        this.sku = sku;
    }

    // ---- product data ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Long getPrice() {
        return this.price;
    }

    public void setPrice(Long price) {
        this.price = price;
    }

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- timestamps ----

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
5.编写dao接口
/**
 * Spring Data repository for {@link Item} rows keyed by their {@code Long} id.
 * All operations are inherited from {@code JpaRepository}; no custom queries
 * are declared.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
6.编写service接口以及实现类
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 * The class is annotated {@code @Transactional}, so the annotation applies
 * to every public method.
 */
@Service
@Transactional
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the given item.
     *
     * @param item the item to store
     */
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Query-by-example lookup: the given item is used as the probe.
     *
     * @param item probe whose populated properties form the search criteria
     * @return the items matching the probe
     */
    @Override
    public List<Item> findAll(Item item) {
        // Parameterized Example instead of the raw type avoids an unchecked call.
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }
}
封装HttpClient
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.context.annotation.ComponentScan;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
@ComponentScan
public class HttpUtil {
//http连接池
private static PoolingHttpClientConnectionManager pool;
static{
pool = new PoolingHttpClientConnectionManager();
pool.setMaxTotal(100);
pool.setDefaultMaxPerRoute(50);
}
/**
* 获取页面源码
*/
public String getHtml(String url){
CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
//通过get请求
HttpGet httpGet = new HttpGet();
httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
httpGet.setConfig(this.getConfig());
//发送请求
try {
CloseableHttpResponse response = build.execute(httpGet);
//判断发送的返回的状态
if (response.getStatusLine().getStatusCode()==200){
String string = EntityUtils.toString(response.getEntity(), "UTF-8");
return string;
}
} catch (IOException e) {
e.printStackTrace();
return null;
}
return null;
}
/**
* 下载图片
*/
public String getImage(String imgUrl){
CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
HttpGet httpGet = new HttpGet();
httpGet.setConfig(this.getConfig());
try {
CloseableHttpResponse response = build.execute(httpGet);
if (response.getStatusLine().getStatusCode()==200){
//后缀名 .jpg .png
String suffix = imgUrl.substring(imgUrl.lastIndexOf("."));
String newImg = UUID.randomUUID()+suffix;
//保存图片
FileOutputStream fileOutputStream = new FileOutputStream(new File("F:\\img\\" + newImg));
response.getEntity().writeTo(fileOutputStream);
return newImg;
}
} catch (IOException e) {
e.printStackTrace();
return null;
}
return null;
}
private RequestConfig getConfig(){
RequestConfig requestConfig = RequestConfig.custom()
.setConnectionRequestTimeout(500)
.setConnectTimeout(500)
.setSocketTimeout(1000 * 10)
.build();
return requestConfig;
}
}
使用定时任务编写页面抓取代码
package com.itheima.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.itheima.po.Item;
import com.itheima.service.ItemService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.util.Date;
@Component
public class ItemTask {
@Autowired
private ItemService itemService;
@Autowired
private HttpUtil httpUtil;
public static final ObjectMapper MAPPER = new ObjectMapper();
//设置定时任务,间隔100秒执行一次
@Scheduled(fixedDelay = 1000 * 50)
public void process(){
//京东的url 地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=59&click=0&page=";
for (int i = 1; i < 10; i=i+2) {
String html = this.httpUtil.getHtml(url+i);
//解析页面数据保存到数据库
this.parseHtml(html);
}
System.out.println("执行完成");
}
//解析页面数据库保存到数据库
private void parseHtml(String html) {
//通过Jsoup解析文件
Document document = Jsoup.parse(html);
//获取spu 的dom 数据
Elements spuelements = document.select("div#J_goodsList li.gl-item");
for (Element spuelement : spuelements) {
String spuId = spuelement.attr("data-spu");
Elements skuelements = spuelement.select("div.p-scroll li.ps-item");
for (Element skuelement : skuelements) {
Item item = new Item();
item.setSpu(spuId);
//获得skuid
String skuid = skuelement.select("img").attr("data-sku");
item.setSku(skuid);
//获得图片的url路径
String skuUrl = "https://item.jd.com/"+skuid+".html";
item.setUrl(skuUrl);
//发送请求获得商品详情页的数据
String skuHtml = httpUtil.getHtml(skuUrl);
//获得详情页的dom树
Document skuDocumet = Jsoup.parse(skuHtml);
String skuTitle = skuDocumet.select("div.sku-name").text();
item.setTitle(skuTitle);
//注意:因为价格是异步请求的,所以我们通过ajax获得
String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_"+skuid;
String parceHtml = httpUtil.getHtml(priceUrl);
ObjectMapper objectMapper = new ObjectMapper();
try {
long price = objectMapper.readTree(priceUrl).get(0).get("p").asLong();
item.setPrice(price);
} catch (IOException e) {
e.printStackTrace();
}
//获取路径
String imgUrl = skuelement.select("img").attr("src");
if (StringUtils.isEmpty(imgUrl)){
imgUrl = skuelement.select("img").attr("data-lazy-img");
}
imgUrl = imgUrl.replace("/n9/","/n7/");
String imageNewName= httpUtil.getImg("http:" + imgUrl);
item.setPic(imageNewName);
item.setCreated(new Date());
itemService.save(item);
}
}
}
}
使用json解析获得的页面
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
/**
 * HttpClient wrapper used by the crawler to fetch page source and download
 * images, backed by one shared connection pool.
 */
@Component
public class HttpUtil {
    // Shared HTTP connection pool
    private static PoolingHttpClientConnectionManager pool;

    static {
        pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(50);
    }

    /**
     * Fetches a page with HttpClient.
     *
     * @param url address of the page to fetch
     * @return the response body decoded as UTF-8, or {@code null} when the
     *         status is not 200, the response has no body, or an I/O error occurs
     */
    public String getHtml(String url) {
        // The client is deliberately not closed: closing it would shut down the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
        httpGet.setConfig(this.getConfig());
        // Closing the response returns the connection to the pool on every path,
        // including non-200 responses whose body is never read.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads an image and saves it under {@code F:\img\} with a random name.
     *
     * @param imgUrl address of the image; its suffix (text from the last '.') is kept
     * @return the new file name, or {@code null} when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String getImg(String imgUrl) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(imgUrl);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Image suffix, e.g. .jpg / .png
            String suffix = imgUrl.substring(imgUrl.lastIndexOf("."));
            // New file name
            String imgNewName = UUID.randomUUID() + suffix;
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Write the image to disk; the stream is closed even if the write fails.
                try (FileOutputStream outputStream = new FileOutputStream("F:\\img\\" + imgNewName)) {
                    response.getEntity().writeTo(outputStream);
                }
                return imgNewName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds the per-request timeouts: 1 s to obtain a pooled connection,
     * 1 s to connect, 10 s socket timeout.
     *
     * @return the request configuration
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectionRequestTimeout(1000)
                .setConnectTimeout(1000)
                .setSocketTimeout(1000 * 10)
                .build();
    }
}
文章浏览阅读7.2k次。前段时间开发手持机上的软件,因为A8手持机的射频卡可存储的内容太小,并且需要存储16进制数据,因此就写了一个工具类。上代码:package cn.com.szh;import java.io.UnsupportedEncodingException;public class Main { public static void main(String[] args) { Stri..._bytearraytohexstring
文章浏览阅读4.9k次。#include #include using namespace std;using namespace cv;int main(){Mat src; //源图像Mat tmp; //临时图像Mat dst_bw; //去掉背景后的目标二值图像Mat dst_contours;//轮廓图像src=imread("E:\\单板图片\\求孔洞数_边缘的最小外接矩形
文章浏览阅读865次。中介者,说白了跟市面上黑中介类似。当然这个中介,开发者是可以控制其行为的。也是在一定的信任关系上建立的。该模式要解决的问题是,一堆对象之间交叉耦合问题。网上看过群聊的例子。如果没有任何一个平台,多人之间的会话会是什么样的呢?不举多人,就三个吧A想把一句话说给BC,那么他首先要知道B和C在哪儿,然后分别告诉对方,自己想说的事情。如果再加一个人呢?问题很明显,此时各种群聊工具应运而生。我写
文章浏览阅读1.8k次。AUTO_INCREMENT两种情况1、在载入语句执行前,已经不确定要插入多少条记录。在执行插入语句时在表级别加一个auto-inc锁,然后为每条待插入记录的auto-increment修饰的列分配递增的值,语句执行结束后,再把auto-inc锁释放掉。一个事务再持有auto-inc锁的过程中,其他事务的插入语句都要被阻塞,可以保证一个语句中分配的递增值是连续的。AUTO-INC锁的..._mysql 自增序列生成原理
文章浏览阅读3.5k次,点赞2次,收藏17次。半导体能带结构示意图:上方两条白色带为没有电子填充的带,下面三条灰色带为充满电子的带,其中最高一条灰色带为价带,它与最低一条白色带之间的空隙为能隙空穴又称电洞(Electron hole),在固体物理学中指共价键上流失一个电子,最后在共价键上留下空位的现象导带(英语:conduction band),又名传导带,是指半导体或是绝缘体材料中,一种电子所具有能量的范围。这个能量的范围高..._掺杂半导体的带隙
文章浏览阅读3.5k次,点赞2次,收藏26次。基于C++和OpenCV的中心线提取算法加权平方灰度重心法介绍算法演示加权平方灰度重心法介绍详情见 https://blog.csdn.net/u010518385/article/details/101015604算法演示下面展示 函数-输入图像和阈值,输出点。void get_median_line(Mat& src, int thresh, vector<Point2d>& points){ if (src.empty()) return; // 一、_图像中心线提取c++
文章浏览阅读2.1k次,点赞9次,收藏12次。这里我看其他博主运行完 config set registry https://registry.npm.taobao.org/这个之后又运行了npm install -g cnpm --registry=https://registry.npm.taobao.org ,结果我还是一直报错,可能是没理解其他博主的意思,反正运行完config set registry https://registry.npm.taobao.org/之后直接安装就好了。如果是其他,你使用的是代理,需要在 npm 中配置代理。_getaddrinfo enotfound registry.cnpmjs.org
文章浏览阅读5k次。在使用QT时,运行程序时,可能出现QT找不到DLL的问题,这种情况大多数情况是因为没有将QT添加到环境变量的原因。解决方式:我的电脑-高级设置-环境变量将QT的两个bin文件目录路径添加到环境变量中,即可解决这个问题!..._qt打包缺少libgcc_s_dw2-1.dll
文章浏览阅读1.5w次,点赞15次,收藏74次。Socket1 环境查看通过cmd窗口的命令:ipconfig查看本机IP地址查看网络情况是否正常:ping百度官网用来进行本地测试的地址 127.0.0.1,回环测试地址,默认代表的就是本机的IP2 Socket概述socket编程也叫套接字编程,应用程序可以通过它发送或者接受数据,可对其像打开文件一样打开/关闭/读写等操作.套接字允许应用程序将I/O插入到网络中,并与网络中的其他应用程序进行通信.网络套接字是IP地址与端口号TCP协议的组合Socket就是为网络编程提供的一_socket网络编程
文章浏览阅读574次。I installed the mp3spi to support reading mp3 files in my Java 8 project usng the javax.sound* libraries. My goal now is to write mp3 to a wav file. However, the result is incorrect. Here's the code i..._java mp3转wav
文章浏览阅读2.7w次,点赞4次,收藏18次。正好要做一个天空的场景,想添加上行星和恒星的自转和公转,代码如下1.自转。public float _RotationSpeed; //定义自转的速度transform.Rotate(Vector3.down*_RotationSpeed,Space.World); //物体自转2.公转 public GameObject Axis; //物体需要公转的参_unity2d 公转
文章浏览阅读7.6k次。 很多其他语言的libary都会有去除string类的首尾空格的库函数,但是标准C++的库却不提供这个功能。但是C++string也提供很强大的功能,实现trim这种功能也不难。下面是几种方法: 1.使用string的find_first_not_of,和find_last_not_of方法 /* Filename : StringTrim1...._c++ string trim