1. Create the table structure
This crawler scrapes the brand, series, and model data of new energy vehicles from Autohome and stores the results in MySQL. First, create the table that holds the brand, series, and model data in the MySQL database with the following script:
CREATE TABLE `base_brand_series_model` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT 'id',
  `brand` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'brand',
  `sub_brand` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'sub-brand',
  `series` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'series',
  `model` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'model',
  `sale_status` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'sale status',
  `level` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT 'vehicle class',
  `endurance` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'driving range',
  `year` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT 'model year',
  `energy_type` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'energy type',
  `first_char` char(1) COLLATE utf8_bin DEFAULT NULL COMMENT 'first letter of brand',
  `create_by` bigint DEFAULT NULL COMMENT 'created by',
  `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
  `update_by` bigint DEFAULT NULL COMMENT 'updated by',
  `update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='brand/series/model table';
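To confirm the table was created, you can query it from Python before writing the crawler itself. This is a minimal sketch, assuming the same connection parameters used later in section 4.1 (localhost, user root, password root, database test):

import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="test", charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute("SHOW CREATE TABLE base_brand_series_model")
        print(cur.fetchone())  # prints (table name, CREATE TABLE statement)
finally:
    conn.close()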
2. Required packages
import pymysql
import requests
from bs4 import BeautifulSoup
import pinyin
Notes:
pymysql: used to operate on the MySQL database
requests: used to issue the HTTP requests
BeautifulSoup: used to parse the HTML
pinyin: used to convert the Chinese brand name to pinyin so its first letter can be extracted
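The first_char column is filled later (section 4.5) by taking the first letter of the brand's pinyin. As a rough illustration of how the pinyin package behaves (exact output may differ by package version):

import pinyin

name = '比亚迪'
py = pinyin.get(name)     # e.g. 'bǐyàdí' -- tone marks are included by default
print(py[0:1].upper())    # 'B', the value stored in first_char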
3. Basic code structure
class CarBrandSeriesModel(object):
    """Scrape brand, series, and model data for new energy vehicles"""

    def __init__(self):
        """Initialize the database connection and define the attributes that hold the data"""
        pass

    def __del__(self):
        """Close the database connection"""
        pass

    def get_brand_data(self):
        """Get the list of all new energy vehicle brands and their link URLs"""
        pass

    def get_series_data(self, url_b, car_brand):
        """For each brand link, get the corresponding sub-brands and the list of series"""
        pass

    def get_model_data(self, brand, url):
        """For each series, get all models and their detail data"""
        pass

    def insert_to_db(self, data):
        """Save a record to the MySQL database"""
        pass

    def get_data(self):
        """Public interface that fetches all brand, series, and model data"""
        pass

if __name__ == '__main__':
    """Program entry point"""
    pass
4. Implementation details
4.1 Initialization
def __init__(self):
    """Initialize the database connection and define the attributes that hold the data"""
    self.conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="test", charset='utf8')
    self.cur = self.conn.cursor()
    self.brand_data = {}   # brand label -> brand listing URL
    self.series_data = {}  # (brand label, series label) -> series URL
4.2 Object teardown
def __del__(self):
    """Close the database connection"""
    if self.cur:
        self.cur.close()
    if self.conn:
        self.conn.close()
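Note that Python does not guarantee exactly when __del__ runs, so relying on it alone can delay releasing the connection. A more explicit alternative (a sketch, not part of the original code) is to add a close() method and call it from the entry point in a finally block:

def close(self):
    """Explicitly release the cursor and the connection"""
    if self.cur:
        self.cur.close()
        self.cur = None
    if self.conn:
        self.conn.close()
        self.conn = None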
4.3 Fetching brand data
def get_brand_data(self):
    """Get the list of all new energy vehicle brands and their link URLs"""
    url = 'https://car.autohome.com.cn/diandongche/index.html'
    headers = {
        'Referer': 'https://car.autohome.com.cn/',
        'Sec-Fetch-Mode': 'no-cors',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = r.apparent_encoding  # switch to the page's detected encoding to avoid garbled text
            soup = BeautifulSoup(r.text, "lxml")
            car_tree = soup.find('div', id='cartree')
            for i in car_tree.find_all('li'):
                for j in i.find_all('a'):
                    print("Brand:", j.text.strip(), "=", 'https://car.autohome.com.cn' + j.get('href'))
                    self.brand_data[j.text.strip()] = 'https://car.autohome.com.cn' + j.get('href')
    except Exception as e:
        print("Crawl failed:", e)
4.4 Fetching series data
def get_series_data(self, url_b, car_brand):
    """For each brand link, get the corresponding sub-brands and the list of series"""
    headers = {
        'authority': 'car.autohome.com.cn',
        'method': 'GET',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'fvlid=156974583432110wygoXZiH; sessionid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; autoid=851072202da5829e1b4e6cbb05975388; cookieCityId=110100; __ah_uuid_ng=c_D7FE9717-245E-4F8D-8D42-AAF453D1F470; area=460106; ahpau=1; sessionuid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; ahsids=3170; sessionip=153.0.3.115; Hm_lvt_9924a05a5a75caf05dbbfb51af638b07=1585205934,1585207311,1585266321; clubUserShow=87236155|692|2|%E6%B8%B8%E5%AE%A2|0|0|0||2020-03-27+08%3A35%3A50|0; clubUserShowVersion=0.1; sessionvid=0F2198AC-5A75-47E2-B476-EAEC2AF05F04; Hm_lpvt_9924a05a5a75caf05dbbfb51af638b07=1585269508; ahpvno=45; v_no=8; visit_info_ad=D7FE9717-245E-4F8D-8D42-AAF453D1F470||0F2198AC-5A75-47E2-B476-EAEC2AF05F04||-1||-1||8; ref=www.baidu.com%7C0%7C0%7C0%7C2020-03-27+08%3A38%3A40.425%7C2019-10-07+22%3A52%3A34.733',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    # headers must be passed as a keyword argument; passing it positionally would send it as query params
    resp = requests.get(url_b, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')  # parse directly
    brand_id = "b" + url_b[url_b.index('brand-') + 6: url_b.index('.html')]
    li_node = soup.find('li', id=brand_id)
    for i in li_node.find_all('dd'):
        for j in i.find_all('a'):
            print("Series:", (car_brand, j.text), "=", 'https://car.autohome.com.cn' + j.get('href'))
            self.series_data[(car_brand, j.text)] = 'https://car.autohome.com.cn' + j.get('href')
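The brand_id line pulls the numeric id out of the brand URL and prefixes it with "b" to match the id of the brand's li node on the page. A worked example with a hypothetical URL:

url_b = 'https://car.autohome.com.cn/diandongche/brand-15.html'  # hypothetical brand URL
brand_id = "b" + url_b[url_b.index('brand-') + 6: url_b.index('.html')]
print(brand_id)  # -> 'b15'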
4.5 Fetching model and detail data
def get_model_data(self, brand, url):
    """For each series, get all models and their detail data"""
    headers = {
        'authority': 'car.autohome.com.cn',
        'method': 'GET',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'fvlid=156974583432110wygoXZiH; sessionid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; autoid=851072202da5829e1b4e6cbb05975388; cookieCityId=110100; __ah_uuid_ng=c_D7FE9717-245E-4F8D-8D42-AAF453D1F470; area=460106; ahpau=1; sessionuid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; ahsids=3170; sessionip=153.0.3.115; Hm_lvt_9924a05a5a75caf05dbbfb51af638b07=1585205934,1585207311,1585266321; clubUserShow=87236155|692|2|%E6%B8%B8%E5%AE%A2|0|0|0||2020-03-27+08%3A35%3A50|0; clubUserShowVersion=0.1; sessionvid=0F2198AC-5A75-47E2-B476-EAEC2AF05F04; Hm_lpvt_9924a05a5a75caf05dbbfb51af638b07=1585269508; ahpvno=45; v_no=8; visit_info_ad=D7FE9717-245E-4F8D-8D42-AAF453D1F470||0F2198AC-5A75-47E2-B476-EAEC2AF05F04||-1||-1||8; ref=www.baidu.com%7C0%7C0%7C0%7C2020-03-27+08%3A38%3A40.425%7C2019-10-07+22%3A52%3A34.733',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # sub-brand
    sub_brand = soup.find("div", {'class': 'cartab-title'}).find("h2").find("a").text
    # sale tabs (on sale, upcoming, discontinued)
    sale_list = soup.find("div", {'class': 'tab tab02 brandtab-cont'}).find("ul", {"data-trigger": "click"}).find_all("li")
    for li in sale_list:
        a = li.find("a")
        if a is None:
            continue
        sale_name = a.text
        sale_href = 'https://car.autohome.com.cn' + a.get('href')
        sale_resp = requests.get(sale_href, headers=headers)
        sale_soup = BeautifulSoup(sale_resp.text, 'html.parser')
        # list of models under this sale tab
        interval01_list = sale_soup.find("div", id="divSeries").find_all("ul", {"class": "interval01-list"})
        for interval in interval01_list:
            cars_list = interval.find_all("div", {"class": "interval01-list-cars"})
            for car in cars_list:
                car_info = car.find("div", {"class": "interval01-list-cars-infor"}).find("p").find("a")
                car_name = car_info.text
                car_href = 'https:' + car_info.get('href')
                car_resp = requests.get(car_href, headers=headers)
                car_soup = BeautifulSoup(car_resp.text, 'html.parser')
                # parameter cells on the model detail page (positions assumed fixed by the page layout)
                param_list = car_soup.find("div", {'class': 'spec-param'}).find("div", {'class': 'param-list'}).find_all("div", {"class": "cell"})
                # vehicle class
                car_level = param_list[0].find("p").text
                # driving range
                endurance = param_list[1].find("p").text
                # energy type
                energy_type = param_list[4].find("p").text
                # brand: strip the trailing count from the label
                car_brand = brand[0][:brand[0].index('(')].strip()
                # series: same stripping
                car_series = brand[1][:brand[1].index('(')].strip()
                # write the record to the database
                self.insert_to_db(
                    {"brand": car_brand, "sub_brand": sub_brand, "sale_status": sale_name,
                     "series": car_series, "model": car_name, "level": car_level,
                     "endurance": endurance, "energy_type": energy_type, "year": car_name[:car_name.index(' ')],
                     "first_char": pinyin.get(car_brand)[0:1].upper()})
4.6 Saving data to the database
def insert_to_db(self, data):
    """Save a record to the MySQL database"""
    try:
        print("Vehicle detail:", data)
        sql = 'insert into base_brand_series_model(brand,sub_brand,series,model,sale_status,level,endurance,year,' \
              'energy_type,first_char,create_by,update_by) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        param = (data.get("brand"), data.get("sub_brand"), data.get("series"), data.get("model"),
                 data.get("sale_status"), data.get("level"), data.get("endurance"), data.get("year"),
                 data.get("energy_type"), data.get("first_char"),
                 1, 1)
        self.cur.execute(sql, param)
        self.conn.commit()
    except Exception as e:
        print(e)
        self.conn.rollback()
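Committing one row at a time is simple and keeps failures isolated, but for larger runs you could accumulate rows and use cursor.executemany() to cut down round trips. A hypothetical batching variant under the same schema:

def insert_many(self, rows):
    """Batch insert; rows is a list of 12-tuples in the column order used by insert_to_db (hypothetical variant)"""
    sql = 'insert into base_brand_series_model(brand,sub_brand,series,model,sale_status,level,endurance,year,' \
          'energy_type,first_char,create_by,update_by) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        self.cur.executemany(sql, rows)
        self.conn.commit()
    except Exception as e:
        print(e)
        self.conn.rollback()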
4.7 Public data-fetching interface
def get_data(self):
    """Public interface that fetches all brand, series, and model data"""
    # step 1: brand data
    print("*********** Fetching brand data: start **********")
    self.get_brand_data()
    print("*********** Fetching brand data: done **********")
    # step 2: series data
    print("*********** Fetching series data: start **********")
    for car_brand in self.brand_data:
        self.get_series_data(self.brand_data[car_brand], car_brand)
    print("*********** Fetching series data: done **********")
    # step 3: model detail data
    print("*********** Fetching model detail data: start **********")
    for brand in self.series_data:
        self.get_model_data(brand, self.series_data[brand])
    print("*********** Fetching model detail data: done **********")
4.8 Program entry point
if __name__ == '__main__':
    """Program entry point"""
    car_brand_series_model = CarBrandSeriesModel()
    car_brand_series_model.get_data()
5. Run results
(screenshot of the console output omitted)
6. Stored results after the crawl
(screenshot of the populated table omitted)