1. Create the project
In the target directory, open a cmd window and run the command to create a Scrapy project:
scrapy startproject scrapy_xinlangweibo

2. Create the spider file
Change into the spiders directory and run the command to generate a spider:
scrapy genspider weibo www.weibo.com

3. Adjust the robots.txt setting
Change it in settings.py.
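Scrapy obeys robots.txt by default, which would block these requests; the usual change is to switch that flag off:

# settings.py
ROBOTSTXT_OBEY = False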

4. Test run
scrapy crawl weibo

The test passes, so on to the rest of the code!
5. XPath parsing
Sometimes an XPath expression that works in the browser's XPath plugin returns an empty result in Python.

Don't panic when that happens: it means the target site has anti-scraping measures. Set a User-Agent first, and if that is not enough, add a cookie as well; it is that kind of cat-and-mouse game.
Configure both in settings.py.
Start with the User-Agent:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
Run again and check the result.

Still not working. Judging from the response, a login is required, which means the spider needs a cookie. Log in to Weibo, copy the cookie, and add it to the headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'cookie': 'SUB=_2A25PiFDGDeRhGeRH7lYR9C_LzTmIHXVtc3COrDV8PUJbkNAKLRH2kW1NTbHM8wP7UPnTjvTAmox62rVYXbj0cyiW; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFFUpSiA2MzUQrY0aSxSJlG5JpX5oz75NHD95QE1K-XehBpS0qfWs4Dqcj1i--Xi-iFiKnpehnp9sMt; SINAGLOBAL=4669199551025.64.1653350613001; PC_TOKEN=87ef8d1632; _s_tentry=weibo.com; Apache=5712738189316.748.1658709963128; ULV=1658709963132:2:1:1:5712738189316.748.1658709963128:1653350613062',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
Everyone's cookie is different, so replace it with your own and run again.
Barring surprises, the data should come through now.

When you see that big wall of text, you have taken another small step toward success. Now check the parsed test data.

The blogger's name is coming through! What remains is extracting the fields we need, which is just XPath syntax; I won't repeat it here. If you are not familiar with XPath, see my earlier articles and follow along step by step. Here is the parsing code:
import scrapy
from datetime import datetime
from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem


class WeiboSpider(scrapy.Spider):
    # Spider name, the value used when running it (command: scrapy crawl weibo)
    name = 'weibo'
    # Allowed domains
    allowed_domains = ['www.weibo.com']
    base_url = 'https://s.weibo.com/weibo?q={}'.format('农业')
    # Start URL, i.e. the first page to request: 'https://s.weibo.com/weibo?q=农业'
    start_urls = [base_url]
    page = 1

    def parse(self, response):
        print('~~~~~~~~~~ Xiaotangwan is on the move, guji guji ~~~~~~~~~~')
        # UA anti-scraping check
        #print(response.request.headers['User-Agent'])
        # Cookie anti-scraping check
        #print(response.request.headers['cookie'])
        # Raw response check
        #print(response.text)
        # Each Weibo post block
        weibo_list = response.xpath('//div[@action-type="feed_list_item" and contains(@mid,"47951")]')
        for wb in weibo_list:
            # Blogger name
            name = wb.xpath('.//div[@class="card-feed"]/div[@class="content"]/p[@node-type="feed_list_content"]/@nick-name').extract_first()
            # Post time
            time = wb.xpath('.//div[@class="card-feed"]/div[@class="content"]/p[@class="from"]/a[@target="_blank"]/text()').extract_first()
            # Source
            source = wb.xpath('.//div[@class="card-feed"]/div[@class="content"]/p[@class="from"]/a[@rel="nofollow"]/text()').extract_first()
            # Post text
            txtExtract = wb.xpath('.//div[@class="card-feed"]/div[@class="content"]/p[3]/text()').extract()
            txt = ''
            # Concatenate the post text
            for string in txtExtract:
                txt += string.replace('\n', '').strip()  # strip \n and whitespace
            #print('post>>>' + txt)
            # Forwards
            forward = wb.xpath('.//div[@class="card-act"]/ul/li/a[@action-type="feed_list_forward"]').extract_first().split('</span>')[1].replace('</a>', '').replace('转发', '0')
            print('forward>>>' + forward)
            # Comments
            comment = wb.xpath('.//div[@class="card-act"]//a[@action-type="feed_list_comment"]').extract_first().split('</span>')[1].replace('</a>', '').replace('评论', '0')
            # Likes
            fabulous = wb.xpath('.//div[@class="card-act"]//a[@action-type="feed_list_like"]/button/span[2]/text()').extract_first().replace('赞', '0')
            # Collection time
            createTime = datetime.now()
            # Hand over to the pipeline
            wb = ScrapyXinlangweiboItem(name=name, time=time, source=source, txt=txt, forward=forward, comment=comment, fabulous=fabulous, createTime=createTime)
            yield wb
        # The scraping logic is the same for every page, so just request the next page and let parse handle it again
        if self.page < 20:
            self.page = self.page + 1
            url = self.base_url + '&page=' + str(self.page)
            print('next url>>>' + url)
            # scrapy.Request is Scrapy's GET request; parse is the callback
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
The XPath expressions here take repeated rounds of trial and error to get right; this is where most of the time goes.
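A quick way to iterate on them is scrapy shell, which picks up the project settings (including the headers and cookie above) when started inside the project directory; the expressions below just mirror the ones used in the spider:

scrapy shell "https://s.weibo.com/weibo?q=农业"
# then, inside the shell:
response.xpath('//div[@action-type="feed_list_item"]').get()
response.xpath('//div[@action-type="feed_list_item"]//p[@node-type="feed_list_content"]/@nick-name').getall()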
6. Enable the item pipeline
Uncomment the following in settings.py:
ITEM_PIPELINES = {
    'scrapy_xinlangweibo.pipelines.ScrapyXinlangweiboPipeline': 300,
}

7. Define the data structure for persistence
Define the item fields in items.py:
import scrapy


# Sina Weibo data structure
class ScrapyXinlangweiboItem(scrapy.Item):
    # define the fields for your item here like:
    # Blogger name
    name = scrapy.Field()
    # Post time
    time = scrapy.Field()
    # Source
    source = scrapy.Field()
    # Post text
    txt = scrapy.Field()
    # Forwards
    forward = scrapy.Field()
    # Comments
    comment = scrapy.Field()
    # Likes
    fabulous = scrapy.Field()
    # Collection time
    createTime = scrapy.Field()

8. Hand the parsed data to the item through the pipeline
The spider has to import the item class:
from scrapy_xinlangweibo.items import ScrapyXinlangweiboItem
9. Configure the database
Add the database connection info to settings.py:
# Database settings; the names must be uppercase
DB_HOST = 'ip'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = 'password'
DB_NAME = 'database'
# Write the charset as 'utf8' without the dash, otherwise it raises an error
DB_CHARSET = 'utf8'
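The pipeline in the next step inserts into a table named xinlangweibo whose columns match the item fields. A minimal schema sketch (the column types are my assumption; adjust them to your data):

CREATE TABLE xinlangweibo (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100),
    time VARCHAR(100),
    source VARCHAR(100),
    txt TEXT,
    forward VARCHAR(20),
    comment VARCHAR(20),
    fabulous VARCHAR(20),
    createTime DATETIME(6)
) DEFAULT CHARSET=utf8;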
10. Write the database insert in the pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# Load the settings file
from scrapy.utils.project import get_project_settings
# Import pymysql
import pymysql


class ScrapyXinlangweiboPipeline:
    def process_item(self, item, spider):
        return item


# Pipeline for MySQL persistence
class MysqlPipeline:
    def open_spider(self, spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.database,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Insert into the database
        sql = 'insert into xinlangweibo(name,time,source,txt,forward,comment,fabulous,createTime) values("{}","{}","{}","{}","{}","{}","{}","{}")'.format(
            item['name'], item['time'].strip(), item['source'], item['txt'], item['forward'], item['comment'], item['fabulous'], item['createTime'])
        # Execute the SQL statement
        self.cursor.execute(sql)
        # Commit
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
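Note that MysqlPipeline only runs if it is also registered in ITEM_PIPELINES in settings.py (the number is the priority; lower values run first). A minimal sketch, keeping the default pipeline from step 6:

ITEM_PIPELINES = {
    'scrapy_xinlangweibo.pipelines.ScrapyXinlangweiboPipeline': 300,
    'scrapy_xinlangweibo.pipelines.MysqlPipeline': 301,
}

As a side note, passing the values separately with self.cursor.execute(sql, params) and %s placeholders would avoid quoting problems when a post text contains double quotes; the string-formatted version above follows the original code.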
Original article by 254126420. If you repost it, please credit the source: https://blog.ytso.com/tech/pnotes/277088.html