用python脚本抓取省市县区乡镇村庄(五级行政区划)的过程如下:
1,抓取国家统计局官网上的行政区划名称和对应的代码(5级是不同的网页,所以涉及多层跳转);
2,数据量大约几十万条,频繁访问考虑防屏蔽问题;
3,各层级网页结构有微调需要做兼容处理;
4,大量http/https请求需做好异常处理以提高成功抓取全部结果的概率;
完整python代码:
import requests
from bs4 import BeautifulSoup
import random
import time

urlPrefix = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'

# Pool of User-Agent strings; a random one is used per request to reduce the
# chance of being blocked during the large crawl (hundreds of thousands of rows).
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]


def myRequest(url):
    """GET *url* with a random User-Agent, retrying once after a short
    random back-off on timeout / connection errors.

    Returns the requests.Response; any other exception propagates.
    """
    headers = {'User-Agent': random.choice(USER_AGENT_LIST)}
    requests.adapters.DEFAULT_RETRIES = 5  # 增加重连次数
    s = requests.session()
    s.keep_alive = False  # 关闭多余连接
    try:
        # BUG FIX: the original passed no timeout, so the Timeout handlers
        # below could never fire; 30s keeps one hung connection from
        # stalling the entire multi-hour crawl.
        return s.get(url, headers=headers, timeout=30)
    # ReadTimeout / ConnectTimeout are subclasses of Timeout, so one
    # Timeout clause covers all three the original listed; ConnectionError
    # had an identical handler, so the two branches are merged.
    except (requests.exceptions.Timeout,
            requests.exceptions.ConnectionError) as e:
        print(e)
        time.sleep(random.randint(1, 5))  # 延时后重试一次
        return s.get(url, headers=headers, timeout=30)


def Writefile(content, tag):
    """Append *content* as one line to webSpiders-Region-<tag>.txt.

    注意:1. 所指定的盘存在;2. 使用 file= 参数输出。
    a+ 如果文件不存在就创建,存在就在文件内容的后面继续追加。
    """
    # 'with' guarantees the handle is closed even if print() raises
    # (the original opened/closed manually and could leak on error).
    with open("D:/pythonWorkspace/webSpiders-Region-" + tag + ".txt",
              "a+", encoding='utf-8') as fp:
        print(content, file=fp)


def villageFun(TownID, villagePage):
    """Level 5 (村庄): scrape village rows of *villagePage* under town *TownID*."""
    print('villageFun> ' + TownID + ',' + villagePage)
    page = myRequest(urlPrefix + villagePage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.villagetr"):
            cells = row.select("td")
            if cells:
                # village rows have no <a>: td[0]=code, td[1]=城乡分类代码, td[2]=name
                record = '5' + ',' + cells[0].text + ',' + cells[2].text + ',' + TownID
                print(record)
                Writefile(record, TownID[0:2])
            else:
                print('跳过:ID=' + TownID + 'page=' + villagePage)
        print('村庄遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def townFun(CountyID, townPage):
    """Level 4 (乡镇): scrape towns under county *CountyID*, recursing into villages."""
    print('townFun> ' + CountyID + ',' + townPage)
    page = myRequest(urlPrefix + townPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.towntr"):
            cells = row.select("td a")
            if cells:
                record = '4' + ',' + cells[0].text + ',' + cells[1].text + ',' + CountyID
                print(record)
                Writefile(record, CountyID[0:2])
                # village pages live under <province>/<city two digits>/<href>
                villageFun(cells[0].text,
                           CountyID[0:2] + '/' + CountyID[2:4] + '/' + cells[0]['href'])
            else:
                print('跳过:ID=' + CountyID + 'page=' + townPage)
            time.sleep(0.5)  # 延时,避免太频繁
        print('乡镇遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def countyFun(CityID, countyPage):
    """Level 3 (县区): scrape counties under city *CityID*, recursing into towns."""
    print('countyFun> ' + CityID + ',' + countyPage)
    page = myRequest(urlPrefix + countyPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.countytr"):
            cells = row.select("td a")
            if cells:
                record = '3' + ',' + cells[0].text + ',' + cells[1].text + ',' + CityID
                print(record)
                Writefile(record, CityID[0:2])
                # town pages live under <province two digits>/<href>
                townFun(cells[0].text, CityID[0:2] + '/' + cells[0]['href'])
            else:
                print('跳过:ID=' + CityID + 'page=' + countyPage)
            time.sleep(0.5)  # 延时,避免太频繁
        print('县区遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def cityFun(ProvinceID, cityPage):
    """Level 2 (城市): scrape cities under province *ProvinceID*, recursing into counties."""
    print('cityFun> ' + ProvinceID + ',' + cityPage)
    page = myRequest(urlPrefix + cityPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.citytr"):
            cells = row.select("td a")
            if cells:
                record = '2' + ',' + cells[0].text + ',' + cells[1].text + ',' + ProvinceID
                print(record)
                Writefile(record, ProvinceID)
                countyFun(cells[0].text, cells[0]['href'])
            else:
                print('跳过:ID=' + ProvinceID + 'page=' + cityPage)
            # time.sleep(0.5) # 延时,避免太频繁
        print('城市遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def ProvinceFun():
    """Level 1 entry point (省份): scrape index.html and walk the whole tree."""
    page = myRequest(urlPrefix + 'index.html')
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for anchor in soup.select("table tbody tr.provincetr td a"):
            href = anchor['href']
            # BUG FIX: rstrip('.html') strips any run of the CHARACTERS
            # '.', 'h', 't', 'm', 'l' from the end, not the literal suffix —
            # slice the suffix off explicitly instead.
            ProvinceID = href[:-len('.html')] if href.endswith('.html') else href
            record = '1' + ',' + ProvinceID + ',' + anchor.text + ',' + '0'
            print(record)
            Writefile(record, ProvinceID)
            cityFun(ProvinceID, href)
            # time.sleep(3) # 延时,避免太频繁
        print('省份遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


if __name__ == '__main__':
    ProvinceFun()
    # cityFun('43', '43.html')
运行完成后控制台回显:
村庄遍历完成。
乡镇遍历完成。
县区遍历完成。
城市遍历完成。
省份遍历完成。
数据结果会写入到txt文本中,示例 webSpiders-Region-43.txt 是这样(文件内容太多,这里只选取文件前几行):
1,43,湖南省,0 2,430100000000,长沙市,43 3,430102000000,芙蓉区,430100000000 4,430102001000,文艺路街道,430102000000 5,430102001001,识字里社区居委会,430102001000 5,430102001002,文艺新村社区居委会,430102001000 5,430102001003,韭菜园社区居委会,430102001000
下一步将这个文件导入到 mysql里面。
新建mysql表:
CREATE DATABASE /*!32312 IF NOT EXISTS*/`db_xiongzaiqiren` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci */; USE `db_xiongzaiqiren`; /*Table structure for table `tb_region` */ DROP TABLE IF EXISTS `tb_region`; CREATE TABLE `tb_region` ( `regionID` varchar(36) NOT NULL COMMENT '地区ID', `regionName` varchar(256) NOT NULL COMMENT '地区名称', `regionLevel` tinyint(4) NOT NULL DEFAULT '1' COMMENT '地区级别', `regionParentID` varchar(36) NOT NULL DEFAULT '0' COMMENT '地区上级ID', `regionIsEnabled` tinyint(4) NOT NULL DEFAULT '1' COMMENT '是否启用', PRIMARY KEY (`regionID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
创建数据库,表后,就可以导入txt里的数据了。
LOAD DATA LOCAL INFILE 'D:/pythonWorkspace/webSpiders-Region-43.txt' INTO TABLE db_xiongzaiqiren.tb_region FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n' (regionLevel,regionID,regionName,regionParentID,regionIsEnabled) SET regionIsEnabled=1;
注意:导入语句可能会报错,因为mysql默认没有开启 local_infile ,需要手动设置开启才能导入:
# 服务器端,local_infile默认开启;客户端,local_infile默认关闭,因此用的时候需要打开。 SHOW GLOBAL VARIABLES LIKE 'local_infile'; SET GLOBAL local_infile = 'ON';
等待导入完成,看下受影响的行数,与txt里面的行数对比一下,数据条数是一样的。
【完】
原创文章,作者:6024010,如若转载,请注明出处:https://blog.ytso.com/tech/python/275166.html