用python脚本抓取省市县区乡镇村庄(五级行政区划)的过程如下:
1,抓取国家统计局官网上的行政区划名称和对应的代码(5级是不同的网页,所以涉及多层跳转);
2,数据量大约几十万条,频繁访问考虑防屏蔽问题;
3,各层级网页结构有微调需要做兼容处理;
4,大量http/https请求需做好异常处理以提高成功抓取全部结果的概率;
完整python代码:
import requests
from bs4 import BeautifulSoup
import random
import time

urlPrefix = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'

# Pool of User-Agent strings; a random one is used per request to reduce the
# chance of being blocked during the large crawl (hundreds of thousands of rows).
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]


def myRequest(url):
    """GET *url* with a random User-Agent, retrying once after a short
    random back-off on timeout / connection errors.

    Returns the requests.Response; any other exception propagates.
    """
    headers = {'User-Agent': random.choice(USER_AGENT_LIST)}
    requests.adapters.DEFAULT_RETRIES = 5  # 增加重连次数
    s = requests.session()
    s.keep_alive = False  # 关闭多余连接
    try:
        # BUG FIX: the original passed no timeout, so the Timeout handlers
        # below could never fire; 30s keeps one hung connection from
        # stalling the entire multi-hour crawl.
        return s.get(url, headers=headers, timeout=30)
    # ReadTimeout / ConnectTimeout are subclasses of Timeout, so one
    # Timeout clause covers all three the original listed; ConnectionError
    # had an identical handler, so the two branches are merged.
    except (requests.exceptions.Timeout,
            requests.exceptions.ConnectionError) as e:
        print(e)
        time.sleep(random.randint(1, 5))  # 延时后重试一次
        return s.get(url, headers=headers, timeout=30)


def Writefile(content, tag):
    """Append *content* as one line to webSpiders-Region-<tag>.txt.

    注意:1. 所指定的盘存在;2. 使用 file= 参数输出。
    a+ 如果文件不存在就创建,存在就在文件内容的后面继续追加。
    """
    # 'with' guarantees the handle is closed even if print() raises
    # (the original opened/closed manually and could leak on error).
    with open("D:/pythonWorkspace/webSpiders-Region-" + tag + ".txt",
              "a+", encoding='utf-8') as fp:
        print(content, file=fp)


def villageFun(TownID, villagePage):
    """Level 5 (村庄): scrape village rows of *villagePage* under town *TownID*."""
    print('villageFun> ' + TownID + ',' + villagePage)
    page = myRequest(urlPrefix + villagePage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.villagetr"):
            cells = row.select("td")
            if cells:
                # village rows have no <a>: td[0]=code, td[1]=城乡分类代码, td[2]=name
                record = '5' + ',' + cells[0].text + ',' + cells[2].text + ',' + TownID
                print(record)
                Writefile(record, TownID[0:2])
            else:
                print('跳过:ID=' + TownID + 'page=' + villagePage)
        print('村庄遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def townFun(CountyID, townPage):
    """Level 4 (乡镇): scrape towns under county *CountyID*, recursing into villages."""
    print('townFun> ' + CountyID + ',' + townPage)
    page = myRequest(urlPrefix + townPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.towntr"):
            cells = row.select("td a")
            if cells:
                record = '4' + ',' + cells[0].text + ',' + cells[1].text + ',' + CountyID
                print(record)
                Writefile(record, CountyID[0:2])
                # village pages live under <province>/<city two digits>/<href>
                villageFun(cells[0].text,
                           CountyID[0:2] + '/' + CountyID[2:4] + '/' + cells[0]['href'])
            else:
                print('跳过:ID=' + CountyID + 'page=' + townPage)
            time.sleep(0.5)  # 延时,避免太频繁
        print('乡镇遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def countyFun(CityID, countyPage):
    """Level 3 (县区): scrape counties under city *CityID*, recursing into towns."""
    print('countyFun> ' + CityID + ',' + countyPage)
    page = myRequest(urlPrefix + countyPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.countytr"):
            cells = row.select("td a")
            if cells:
                record = '3' + ',' + cells[0].text + ',' + cells[1].text + ',' + CityID
                print(record)
                Writefile(record, CityID[0:2])
                # town pages live under <province two digits>/<href>
                townFun(cells[0].text, CityID[0:2] + '/' + cells[0]['href'])
            else:
                print('跳过:ID=' + CityID + 'page=' + countyPage)
            time.sleep(0.5)  # 延时,避免太频繁
        print('县区遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def cityFun(ProvinceID, cityPage):
    """Level 2 (城市): scrape cities under province *ProvinceID*, recursing into counties."""
    print('cityFun> ' + ProvinceID + ',' + cityPage)
    page = myRequest(urlPrefix + cityPage)
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for row in soup.select("table tbody tr.citytr"):
            cells = row.select("td a")
            if cells:
                record = '2' + ',' + cells[0].text + ',' + cells[1].text + ',' + ProvinceID
                print(record)
                Writefile(record, ProvinceID)
                countyFun(cells[0].text, cells[0]['href'])
            else:
                print('跳过:ID=' + ProvinceID + 'page=' + cityPage)
            # time.sleep(0.5) # 延时,避免太频繁
        print('城市遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


def ProvinceFun():
    """Level 1 entry point (省份): scrape index.html and walk the whole tree."""
    page = myRequest(urlPrefix + 'index.html')
    if 200 == page.status_code:
        soup = BeautifulSoup(page.content, 'html.parser')
        for anchor in soup.select("table tbody tr.provincetr td a"):
            href = anchor['href']
            # BUG FIX: rstrip('.html') strips any run of the CHARACTERS
            # '.', 'h', 't', 'm', 'l' from the end, not the literal suffix —
            # slice the suffix off explicitly instead.
            ProvinceID = href[:-len('.html')] if href.endswith('.html') else href
            record = '1' + ',' + ProvinceID + ',' + anchor.text + ',' + '0'
            print(record)
            Writefile(record, ProvinceID)
            cityFun(ProvinceID, href)
            # time.sleep(3) # 延时,避免太频繁
        print('省份遍历完成。')
    else:
        print('ERROR: status_code=' + str(page.status_code))


if __name__ == '__main__':
    ProvinceFun()
    # cityFun('43', '43.html')
运行完成后控制台回显:
村庄遍历完成。
乡镇遍历完成。
县区遍历完成。
城市遍历完成。
省份遍历完成。
数据结果会写入到txt文本中,示例 webSpiders-Region-43.txt 是这样(文件内容太多,这里只选取文件前几行):
1,43,湖南省,0 2,430100000000,长沙市,43 3,430102000000,芙蓉区,430100000000 4,430102001000,文艺路街道,430102000000 5,430102001001,识字里社区居委会,430102001000 5,430102001002,文艺新村社区居委会,430102001000 5,430102001003,韭菜园社区居委会,430102001000
下一步将这个文件导入到 mysql里面。
新建mysql表:
CREATE DATABASE /*!32312 IF NOT EXISTS*/`db_xiongzaiqiren` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci */; USE `db_xiongzaiqiren`; /*Table structure for table `tb_region` */ DROP TABLE IF EXISTS `tb_region`; CREATE TABLE `tb_region` ( `regionID` varchar(36) NOT NULL COMMENT '地区ID', `regionName` varchar(256) NOT NULL COMMENT '地区名称', `regionLevel` tinyint(4) NOT NULL DEFAULT '1' COMMENT '地区级别', `regionParentID` varchar(36) NOT NULL DEFAULT '0' COMMENT '地区上级ID', `regionIsEnabled` tinyint(4) NOT NULL DEFAULT '1' COMMENT '是否启用', PRIMARY KEY (`regionID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
创建数据库,表后,就可以导入txt里的数据了。
LOAD DATA LOCAL INFILE 'D:/pythonWorkspace/webSpiders-Region-43.txt' INTO TABLE db_xiongzaiqiren.tb_region FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n' (regionLevel,regionID,regionName,regionParentID,regionIsEnabled) SET regionIsEnabled=1;
注意:导入语句可能会报错,因为mysql默认没有开启 local_infile ,需要手动设置开启才能导入:
# 服务器端,local_infile默认开启;客户端,local_infile默认关闭,因此用的时候需要打开。 SHOW GLOBAL VARIABLES LIKE 'local_infile'; SET GLOBAL local_infile = 'ON';
等待导入完成,看下受影响的行数,与txt里面的行数对比一下,数据条数是一样的。
【完】
原创文章,作者:6024010,如若转载,请注明出处:https://blog.ytso.com/tech/python/275166.html