1. Create the table structure
This crawler scrapes province/city/district data from the National Bureau of Statistics website and stores it in MySQL. First, create a table in the MySQL database to hold the data:
CREATE TABLE `t_areas` (
  `id` int NOT NULL AUTO_INCREMENT COMMENT 'Primary key ID',
  `pid` int DEFAULT NULL COMMENT 'Parent ID',
  `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT 'Name',
  `url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT 'URL of the child-level page',
  `level` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT 'Level',
  `code` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT 'Code',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='Province/city/district table';
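Once populated, the table forms a tree keyed by pid: provinces carry pid = 0, cities carry their province's id, and so on. As a quick sanity check after a crawl, the children of any node can be listed with a short pymysql query (a minimal sketch; the connection parameters mirror the ones used later in this article, and the id value 1 is just an example):

import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="root", db="test", charset='utf8')
cur = conn.cursor()
# List the direct children (e.g. the cities of a province) of node id 1
cur.execute("SELECT id, name, code FROM t_areas WHERE pid = %s", (1,))
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()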
2. Required packages
import pymysql
import requests
import lxml.etree as etree
import os
Notes:
pymysql: for working with the MySQL database
requests: for sending HTTP requests
lxml.etree: for parsing the HTML DOM tree via XPath
os: for path handling (os.path.basename is used below to build sub-page URLs)
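The three third-party packages can be installed with pip (os ships with the standard library):
pip install pymysql requests lxml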
3. Basic code structure

class ChineseArea(object):
    """Fetch province/city/district data"""

    def __init__(self):
        """Initialize the database connection and define data-storage attributes"""
        pass

    def __del__(self):
        """Close the database connection"""
        pass

    def get_province_data(self, url):
        """Fetch a page of the administrative-division-code bulletin"""
        pass

    def parse_province(self):
        """Parse the province index page and return a list"""
        pass

    def parse_sub_data(self, level, pid, url):
        """Parse a sub-page according to its level"""
        pass

    def parse_villager(self, level, pid, url):
        """Parse a village/community page"""
        pass

    def insert_to_db(self, value):
        """Save a record to the MySQL database"""
        pass

    def parse_areas(self):
        """Public entry point: fetch all province/city/district data"""
        pass


if __name__ == '__main__':
    """Program entry point"""
    pass
4. Implementation details
4.1 Initialization
def __init__(self):
    """Initialize the database connection and define data-storage attributes"""
    self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
    self.conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="test", charset='utf8')
    self.cur = self.conn.cursor()
    # XPath expressions for the table rows at each administrative level
    self.levels = {
        1: '//tr[@class="provincetr"]',
        2: '//tr[@class="citytr"]',
        3: '//tr[@class="countytr"]',
        4: '//tr[@class="towntr"]',
        5: '//tr[@class="villagetr"]'
    }
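To see why these XPath expressions work: each level's page lays its entries out as table rows with the class names above, each cell holding a link to the next level. The snippet below runs the province XPath against a small illustrative fragment (the HTML is shaped like the real page, not copied from it):

import lxml.etree as etree

# Illustrative fragment in the shape of the province index page
sample = '<table><tr class="provincetr"><td><a href="11.html">北京市</a></td></tr></table>'
tree = etree.HTML(sample)
for td in tree.xpath('//tr[@class="provincetr"]/td'):
    print(td.xpath('./a/@href'), td.xpath('./a/text()'))
# prints: ['11.html'] ['北京市']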
4.2 Object teardown
def __del__(self):
    """Close the database connection"""
    if self.cur:
        self.cur.close()
    if self.conn:
        self.conn.close()
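Note that __del__ only runs when the object is garbage-collected, and its timing at interpreter shutdown is not guaranteed. If that matters, an explicit teardown method called from a finally block is a safer pattern (a sketch; this close method is an addition, not part of the original class):

def close(self):
    """Explicit teardown, callable from a finally block."""
    if self.cur:
        self.cur.close()
    if self.conn:
        self.conn.close()

# usage:
# chinese_area = ChineseArea()
# try:
#     chinese_area.parse_areas()
# finally:
#     chinese_area.close()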
4.3 Fetching a page
def get_province_data(self, url):
    """Fetch a page of the administrative-division-code bulletin (the province index and all sub-pages)"""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
    i = 0
    while i < 3:
        try:
            html = requests.get(url, headers=headers, timeout=20)
            html.encoding = 'gbk'  # the bulletin pages are GBK-encoded
            return html.text
        except requests.exceptions.RequestException:
            i += 1
            print('timeout: ' + url)
    return None  # all three attempts failed
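Since the crawler issues thousands of requests once the deeper levels are enabled, the site may occasionally refuse or throttle them, and retrying immediately rarely helps. A small fixed delay between attempts is a gentle variation (a sketch, assuming a time.sleep pause between retries is acceptable; fetch_with_retry is a hypothetical standalone helper):

import time

def fetch_with_retry(url, headers, retries=3, delay=2):
    """Like get_province_data, but pauses between attempts; returns None on failure."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=20)
            resp.encoding = 'gbk'  # the bulletin pages are GBK-encoded
            return resp.text
        except requests.exceptions.RequestException:
            print('timeout: ' + url)
            time.sleep(delay)  # brief pause before the next attempt
    return None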
4.4 Parsing province-level data
def parse_province(self):
    """Parse the province index page and return the list of provinces"""
    html = self.get_province_data(self.baseUrl)
    tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    nodes = tree.xpath('//tr[@class="provincetr"]')
    values = []
    for node in nodes:
        items = node.xpath('./td')
        for item in items:
            value = {}
            next_url = item.xpath('./a/@href')
            province = item.xpath('./a/text()')
            if province:
                print(province)
                value['url'] = self.base + "".join(next_url)
                value['name'] = "".join(province)
                # the href starts with the two-digit province code
                value['code'] = "".join(next_url)[:2] + "0000000000"
                value['pid'] = 0
                value['level'] = 1
                print(repr(value['name']))
                last_id = self.insert_to_db(value)
                value['id'] = last_id
                values.append(value)
                print(value)
    return values
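For reference, each record returned by parse_province has the following shape (illustrative values: Beijing's index link is 11.html, so its code becomes 110000000000; id is whatever AUTO_INCREMENT assigned):

# Illustrative shape of one returned province record
{'url': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/11.html',
 'name': '北京市', 'code': '110000000000', 'pid': 0, 'level': 1, 'id': 1}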
4.5 Parsing sub-pages by level
def parse_sub_data(self, level, pid, url):
    """Parse a sub-page (city/county/town) according to its level"""
    if url.strip() == '':
        return None
    html = self.get_province_data(url)
    tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    if level == 3:
        nodes = tree.xpath(self.levels.get(level))
        if len(nodes) == 0:
            # some city pages have no county level and list towns directly
            nodes = tree.xpath(self.levels.get(4))
            print('city with towns directly below it: ' + url)
    else:
        nodes = tree.xpath(self.levels.get(level))
    # relative hrefs are resolved against the current page's directory
    path = os.path.basename(url)
    base_url = url.replace(path, '')
    values = []
    # one row per sub-division
    for node in nodes:
        value = {}
        next_url = node.xpath('./td[1]/a/@href')
        if len(next_url) == 0:
            next_url = ''
        code = node.xpath('./td[1]/a/text()')
        if len(code) == 0:
            code = node.xpath('./td[1]/text()')
        name = node.xpath('./td[2]/a/text()')
        if len(name) == 0:
            name = node.xpath('./td[2]/text()')
        value['code'] = "".join(code)
        temp_url = "".join(next_url)
        if len(temp_url) != 0:
            value['url'] = base_url + "".join(next_url)
        else:
            value['url'] = ''  # leaf node: no child page
        value['name'] = "".join(name)
        print(repr(value['name']))
        print(value['url'])
        value['pid'] = pid
        value['level'] = level
        last_id = self.insert_to_db(value)
        value['id'] = last_id
        values.append(value)
        print(value)
    return values
4.6 Parsing village/community pages
def parse_villager(self, level, pid, url):
    """Parse a village/community page"""
    html = self.get_province_data(url)
    tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    nodes = tree.xpath(self.levels.get(level))
    values = []
    # one row per village/community; these rows have three columns and no links
    for node in nodes:
        value = {}
        nexturl = node.xpath('./td[1]/a/@href')
        code = node.xpath('./td[1]/text()')
        vcode = node.xpath('./td[2]/text()')  # urban-rural classification code (parsed but not stored)
        name = node.xpath('./td[3]/text()')
        value['code'] = "".join(code)
        value['url'] = "".join(nexturl)
        value['name'] = "".join(name)
        print(repr(value['name']))
        value['pid'] = pid
        value['level'] = level
        last_id = self.insert_to_db(value)
        value['id'] = last_id
        values.append(value)
        print(value)
    return values
4.7 Saving data to the database
def insert_to_db(self, value):
    """Save one record to the MySQL database and return its auto-generated id"""
    last_id = 0
    try:
        sql = 'INSERT INTO t_areas(pid,name,url,level,code) values(%s,%s,%s,%s,%s)'
        param = (value.get("pid"), value.get("name"), value.get("url"), value.get("level"), value.get("code"))
        self.cur.execute(sql, param)
        last_id = self.cur.lastrowid
        self.conn.commit()
    except Exception as e:
        print(e)
        self.conn.rollback()
    return last_id
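Committing one row at a time keeps the parent-child linking simple (each child needs its parent's lastrowid), but it becomes slow once the town and village levels are enabled. Since village rows have no children, they could instead be buffered and written in one round trip with executemany (a sketch, not part of the original crawler; note that batched inserts do not return per-row ids):

def insert_many(self, values):
    """Insert a batch of records in a single round trip; ids are not fetched back."""
    sql = 'INSERT INTO t_areas(pid,name,url,level,code) values(%s,%s,%s,%s,%s)'
    params = [(v.get("pid"), v.get("name"), v.get("url"), v.get("level"), v.get("code"))
              for v in values]
    try:
        self.cur.executemany(sql, params)
        self.conn.commit()
    except Exception as e:
        print(e)
        self.conn.rollback()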
4.8 Public data-fetching interface
def parse_areas(self):
    """Public entry point: fetch all province/city/district data"""
    values = self.parse_province()
    for value in values:
        cities = self.parse_sub_data(2, value['id'], value['url'])
        if cities:
            for city in cities:
                counties = self.parse_sub_data(3, city['id'], city['url'])
                # The block below fetches town and village/community data as well;
                # leave it commented out if those levels are not needed.
                # if counties:
                #     for county in counties:
                #         towns = self.parse_sub_data(4, county['id'], county['url'])
                #         if towns:
                #             for town in towns:
                #                 self.parse_villager(5, town['id'], town['url'])
4.9 Program entry point
if __name__ == '__main__':
    """Program entry point"""
    chinese_area = ChineseArea()
    chinese_area.parse_areas()
5. Run results
6. Stored data after the crawl completes