
A Python crawler for the National Bureau of Statistics: scraping the statistical division and urban-rural classification codes for 2009-2020 (three levels: province / city / county) and storing them in a MySQL database

National Bureau of Statistics -> statistical standards URL: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

Process

Crawl the statistical standards site level by level: the index page lists the provinces, each province page lists its cities, and each city page lists its counties and districts.
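The three levels map onto three kinds of pages. A quick sketch of the URL pattern (the paths are taken from the crawler below; 11 is Beijing's province code and 1101 one of its city-level codes):

base = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/"
province_page = base + "index.html"    # level 1: all provinces
city_page = base + "11.html"           # level 2: cities of province 11
county_page = base + "11/1101.html"    # level 3: counties/districts of city 1101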


Code

import time

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree

def get_area(year):
    year = str(year)
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/index.html"
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'GBK'  # the site serves GBK-encoded pages
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    # Level 1: the province rows. There are 4 <tr class="provincetr"> rows;
    # the first runs from 北京市 to 黑龙江省.
    all_province = soup.find_all('tr', class_='provincetr')
    """
    The markup looks like this:
    <tr class="provincetr"><td><a href="11.html">北京市<br/></a></td>
    <td><a href="12.html">天津市<br/></a></td>
    <td><a href="13.html">河北省<br/></a></td>
    <td><a href="14.html">山西省<br/></a></td>
    <td><a href="15.html">内蒙古自治区<br/></a></td>
    <td><a href="21.html">辽宁省<br/></a></td><td>
    """
    province_str = ""  # concatenate the province rows into one string for easier handling
    for i in range(len(all_province)):
        province_str = province_str + str(all_province[i])
    # extract the href and text of every <a> tag
    province = {}
    province_soup = BeautifulSoup(province_str, 'lxml')
    province_href = province_soup.find_all("a")  # all <a> tags
    for i in province_href:
        href_str = str(i)
        # build the province dict {name: link}
        a_tag = BeautifulSoup(href_str, 'lxml').find("a")
        province.update({a_tag.text: a_tag["href"]})
    """
    Sample of the resulting province dict:
    {'北京市': '11.html', '天津市': '12.html', '河北省': '13.html', '山西省': '14.html',
    '内蒙古自治区': '15.html', '辽宁省': '21.html', '吉林省': '22.html', '黑龙江省': '23.html',
    '上海市': '31.html', '江苏省': '32.html', '浙江省': '33.html', '安徽省': '34.html',
    '福建省': '35.html', '江西省': '36.html', '山东省': '37.html', '河南省': '41.html',
    '湖北省': '42.html', '湖南省': '43.html', '广东省': '44.html', '广西壮族自治区': '45.html',
    '海南省': '46.html', '重庆市': '50.html', '四川省': '51.html', '贵州省': '52.html', '云南省': '53.html',
    '西藏自治区': '54.html', '陕西省': '61.html', '甘肃省': '62.html', '青海省': '63.html',
    '宁夏回族自治区': '64.html', '新疆维吾尔自治区': '65.html'}
    """
    # Level 2: follow each province link and build the city-level dicts
    city = []
    city_url = ""
    city_tr = []
    temp_list = []
    for item in province.items():
        city_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/" + item[1]
        city_html = requests.get(city_url, headers=headers)
        city_html.encoding = 'GBK'
        city_text = city_html.text
        city_tr.append(BeautifulSoup(city_text, 'lxml').find_all('tr', class_="citytr"))
    # city_tr now has 31 entries, one per province-level region.
    # Build the city dicts {"name": "link"}.
    province_key = []  # list of the province names
    for key in province.keys():
        province_key.append(key)
    num = 0
    for i in city_tr:
        for j in i:
            # j: <tr class="citytr"><td><a href="11/1101.html">110100000000</a></td><td><a href="11/1101.html">市辖区</a></td></tr>
            etree_ = etree.HTML(str(j))
            temp_list.append({
                etree_.xpath('//tr/td[2]/a/text()')[0]:
                etree_.xpath('//tr/td[2]/a/@href')[0]
            })
        city.append({province_key[num]: temp_list})
        num = num + 1
        temp_list = []
    print(len(city))

    """
    Sample, city[11]:
    {'安徽省': [{'合肥市': '34/3401.html'}, {'芜湖市': '34/3402.html'}, {'蚌埠市': '34/3403.html'},
    {'淮南市': '34/3404.html'}, {'马鞍山市': '34/3405.html'}, {'淮北市': '34/3406.html'}, {'铜陵市': '34/3407.html'},
    {'安庆市': '34/3408.html'}, {'黄山市': '34/3410.html'}, {'滁州市': '34/3411.html'}, {'阜阳市': '34/3412.html'},
    {'宿州市': '34/3413.html'}, {'六安市': '34/3415.html'}, {'亳州市': '34/3416.html'}, {'池州市': '34/3417.html'},
    {'宣城市': '34/3418.html'}]}
    """

    # Level 3: with the city dicts done, fetch the county/district level
    province_name = ""
    city_name = ""
    area_name = ""
    area_tr = []
    area_list = []
    temp_area_list = []

    for item1 in city:
        for k1, v1 in item1.items():
            province_name = k1
            # normalize short province names to their full official names
            if province_name in ["北京", "天津", "上海", "重庆"]:
                province_name = province_name + "市"
            if province_name == "宁夏":
                province_name = province_name + "回族自治区"
            if province_name in ["西藏", "内蒙古"]:
                province_name = province_name + "自治区"
            if province_name == "新疆":
                province_name = province_name + "维吾尔自治区"
            if province_name == "广西":
                province_name = province_name + "壮族自治区"
            if province_name == "黑龙江":
                province_name = province_name + "省"
            if len(province_name) == 2 and province_name not in ["西藏", "宁夏", "新疆", "广西", "北京", "天津", "上海", "重庆"]:
                province_name = province_name + "省"
            for item2 in v1:
                for k2, v2 in item2.items():
                    city_name = k2
                    area_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/" + v2
                    print(area_url)
                    area_response = requests.get(area_url, headers=headers)
                    area_response.encoding = 'GBK'
                    area_text = area_response.text
                    area_soup = BeautifulSoup(area_text, 'lxml')
                    area_tr = area_soup.find_all("tr", class_="countytr")
                    for i in range(len(area_tr)):
                        etree_area = etree.HTML(str(area_tr[i]))
                        # some rows carry no <a> tag, so fall back to plain text
                        try:
                            area_name = etree_area.xpath("//tr/td[2]/a/text()")[0]
                        except IndexError:
                            area_name = etree_area.xpath("//tr/td[2]/text()")[0]
                        # the first column holds the 12-digit code; keep the first 6 digits
                        try:
                            temp_area_list.append({
                                etree_area.xpath("//tr/td[1]/a/text()")[0][0:6]:
                                    province_name + "·" + city_name + "·" + area_name
                            })
                        except IndexError:
                            temp_area_list.append({
                                etree_area.xpath("//tr/td[1]/text()")[0][0:6]:
                                    province_name + "·" + city_name + "·" + area_name
                            })
                    area_list.append(temp_area_list)
                    temp_area_list = []
                    time.sleep(1)  # be polite to the server
    return area_list
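get_area() returns a list of per-city lists, each entry mapping a six-digit code to a "省·市·县" name string. into_mysql() below writes these into a table named std_area; the original post never shows its schema, so the following DDL helper is only a minimal sketch (the column names come from the INSERT statement, the types and lengths are assumptions):

def create_table():
    # assumed schema for std_area; adjust types/lengths as needed
    conn, cursor = get_mysql_conn()
    cursor.execute("CREATE TABLE IF NOT EXISTS std_area ("
                   "year CHAR(4) NOT NULL,"
                   "area_code CHAR(6) NOT NULL,"
                   "area_name VARCHAR(100) NOT NULL"
                   ") DEFAULT CHARSET=utf8")
    conn.commit()
    close_conn(conn, cursor)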

def into_mysql(year):
    year = str(year)
    SQL = ""
    conn, cursor = get_mysql_conn()
    res = get_area(year)
    try:
        for item in res:  # item is the list of county dicts for one city
            for entry in item:
                for k, v in entry.items():
                    print(k)
                    print(v)
                    SQL = "insert into std_area (year,area_code, area_name) values ('" + year + "','" + k + "','" + v + "')"
                    print(SQL)
                    cursor.execute(SQL)
                    conn.commit()
    except:
        print("An error occurred")
    close_conn(conn, cursor)
    return None
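The INSERT above is assembled by string concatenation, which breaks if a name ever contains a quote and is open to SQL injection. A safer variant of the inner statement (a sketch using pymysql's standard %s parameter binding, same table and data):

SQL = "insert into std_area (year, area_code, area_name) values (%s, %s, %s)"
cursor.execute(SQL, (year, k, v))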

def query(sql, *args):
    """
    Generic query helper.
    :param sql:
    :param args:
    :return: the query results as a tuple of tuples ((), ())
    """
    conn, cursor = get_mysql_conn()
    print(sql)
    cursor.execute(sql)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    return res
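A usage sketch for query(), assuming the rows for 2009 have already been inserted:

rows = query("select area_code, area_name from std_area where year = '2009'")
print(rows[:3])  # e.g. (('110101', '北京市·市辖区·东城区'), ...)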
\"\"\"
------------------------------------------------------------------------------------
\"\"\"
def get_mysql_conn():
\"\"\"
:return: 连接,游标
\"\"\"
# 创建连接
conn = pymysql.connect(host=\"127.0.0.1\",
user
=\"root\",
password
=\"000429\",
db
=\"data_cleaning\",
charset
=\"utf8\")
# 创建游标
cursor = conn.cursor() # 执行完毕返回的结果集默认以元组显示
return conn, cursor

def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()

if __name__ == '__main__':
    # res = get_area()
    into_mysql('2009')
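To cover the full 2009-2020 range from the title, the single call can be replaced with a loop (a sketch; with the one-second delay per city page, a full run takes a long while):

for y in range(2009, 2021):  # 2009 through 2020 inclusive
    into_mysql(str(y))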

Source: https://www.cnblogs.com/rainbow-1/p/15769241.html