Python 3: Scraping News Titles and Content from Sina, NetEase, Toutiao, and UC

The examples below scrape the society-news channel of each site:

1. Sina:

Sina's news is the easiest to scrape: the pages are not loaded asynchronously with JavaScript, so I simply fetch the HTML and parse it with BeautifulSoup.
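
Before the full script, here is the core of the approach in a few lines: fetch the listing page, parse it with BeautifulSoup, and walk the headline list. It uses the same ul.seo_data_list selector as the script below, so treat it as an illustration that depends on Sina keeping that page layout.

from urllib import request
from bs4 import BeautifulSoup

# Minimal sketch: print every headline and link on the society-news listing page.
html = request.urlopen('http://news.sina.com.cn/society/').read().decode('utf-8', 'ignore')
soup = BeautifulSoup(html, 'lxml')
for li in soup.find('ul', class_='seo_data_list').find_all('li'):
    if li.a is not None:
        print(li.a.string, li.a.get('href'))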

'''
Sina news: http://news.sina.com.cn/society/
Date: 20180920
Author: lizm
Description: fetch Sina society news
'''
import requests
from bs4 import BeautifulSoup
from urllib import request
import sys
import re
import os

def getNews(title, url, m):
    Hostreferer = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    req = request.Request(url)
    response = request.urlopen(req)
    # ignore pages that are not valid utf-8
    response = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(response, 'lxml')
    tag = soup.find('div', class_='article')
    if tag is None:
        return 0
    # publication date of the article
    fb_date = soup.find('div', 'date-source').span.string
    # name of the publishing site
    fb_www = soup.find('div', 'date-source').a.string
    # strip punctuation from the title so it can be used as a file name
    rep = re.compile("[\s+\.\!\/_,$%^*(+\"\']+|[+<>?、~*()]+")
    title = rep.sub('', title)
    title = title.replace(':', '')
    # the text output goes into a "news" folder next to this script
    filename = sys.path[0] + "/news/" + title + ".txt"
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(fb_date + " " + fb_www)
        file_object.write("\n")
        file_object.write("URL: " + url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(tag.get_text())
    i = 0
    for image in tag.find_all('div', 'img_wrapper'):
        title_img = title + str(i)
        # save the article's images into a sub-directory named after the title
        if os.path.exists(sys.path[0] + "/news/" + title):
            pass
        else:
            # create the directory if it does not exist yet
            os.mkdir(sys.path[0] + "/news/" + title)
        os.chdir(sys.path[0] + "/news/" + title)
        file_name = "http://news.sina.com.cn/" + image.img.get('src').replace('//', '')
        html = requests.get(file_name, headers=Hostreferer)
        # images are binary, so write html.content rather than html.text
        title_img = title_img + ".jpg"
        f = open(title_img, 'wb')
        f.write(html.content)
        f.close()
        i += 1
    print('Crawled news item', m, ':', title)
    return 0

# fetch the society-news list (the latest 162 items)
def getTitle(url):
    req = request.Request(url)
    response = request.urlopen(req)
    response = response.read().decode('utf8')
    soup = BeautifulSoup(response, 'lxml')
    y = 0
    for tag in soup.find('ul', class_='seo_data_list').find_all('li'):
        if tag.a is not None:
            # if y == 27:
            print(y, tag.a.string, tag.a.get('href'))
            temp = tag.a.string
            getNews(temp, tag.a.get('href'), y)
            y += 1

if __name__ == '__main__':
    url = 'http://news.sina.com.cn/society/'
    getTitle(url)
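
One practical note: getNews() writes into a news folder next to the script (sys.path[0] + "/news/") but never creates that folder itself, so the first open() fails on a clean setup. A small guard before calling getTitle() avoids this; the folder name is simply the one the script assumes, nothing required by Sina.

import os
import sys

# Create the output directory the script expects before starting the crawl.
os.makedirs(os.path.join(sys.path[0], 'news'), exist_ok=True)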

2. NetEase:

NetEase loads its headlines and article bodies asynchronously with JavaScript, so simply downloading the page source gives you neither. The data we need can be found among the JS requests in the browser's Network panel. Here I use regular expressions to pull out the titles and their links, and BeautifulSoup to fetch the body of each article.
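
If you want to see what those JS responses look like before writing any regular expressions, it is enough to fetch one of them and print the beginning of the body: the payload is a data_callback(...) wrapper around the article records, and the "title" and "tlink" fields targeted below appear in it. This is only an inspection helper and assumes the endpoint still responds as it did when this article was written.

from urllib import request

# Peek at the structure of one asynchronously loaded feed.
url = 'http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback'
res = request.urlopen(url).read().decode('gbk')   # the feed is gbk-encoded
print(res[:500])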

import re
from urllib import request
from bs4 import BeautifulSoup

def download(title, url):
    req = request.urlopen(url)
    res = req.read()
    soup = BeautifulSoup(res, 'lxml')
    # print(soup.prettify())
    tag = soup.find('div', class_='post_text')
    # print(tag.get_text())
    # strip characters that are not allowed in Windows file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    # print(title)
    file_name = r'D:\code\python\spider_news\NetEase_news\sociaty\\' + title + '.txt'
    file = open(file_name, 'w', encoding='utf-8')
    file.write(tag.get_text())
    file.close()

if __name__ == '__main__':
    urls = ['http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback']
    for url in urls:
        # url = 'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback'
        req = request.urlopen(url)
        res = req.read().decode('gbk')
        # print(res)
        # pull the title and link of every article out of the JS feed
        pat1 = r'"title":"(.*?)",'
        pat2 = r'"tlink":"(.*?)",'
        m1 = re.findall(pat1, res)
        news_title = []
        for i in m1:
            news_title.append(i)
        m2 = re.findall(pat2, res)
        news_url = []
        for j in m2:
            news_url.append(j)
        for i in range(0, len(news_url)):
            # print(news_title[i], news_body[i])
            download(news_title[i], news_url[i])
            print('Crawling news item ' + str(i), news_title[i])
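
The chain of title.replace() calls above only strips characters that Windows forbids in file names, and the same chain reappears in the Toutiao and UC scripts below. A small helper (sanitize_title is my own name, not something from the original code) does the same job in one line and could replace the chain in all three scripts:

import re

def sanitize_title(title):
    # Remove the characters the scripts strip by hand: full-width colon, " | / \ * < > ?
    return re.sub(r'[:"|/\\*<>?]', '', title)

Usage would simply be title = sanitize_title(title) in place of the nine replace() calls.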

 

3. Toutiao:

Toutiao differs from the previous two sites as well: its headlines and links are packaged in a JSON feed, but the feed URL's parameters are generated by a JavaScript routine, so we have to reproduce those parameters ourselves or we cannot work out the feed's exact URL. I learned how the URL is constructed from the post at http://www.jianshu.com/p/5a93673ce1c0, which also solved the problem of repeatedly downloading the same news items. The site has basic anti-scraping measures, so a cookie has to be sent with the requests. For the article bodies, I used a regular expression to extract the Chinese text.
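
For reference, the script that follows only relies on a few fields of that JSON feed. Schematically, and trimmed to just the keys the code reads (so this is an illustration, not the full response), each page looks like this:

# Shape of one feed page as consumed by the script below (all other fields omitted).
page = {
    'data': [
        {'title': '...', 'source_url': '...', 'ad_label': '...'},   # ad_label only appears on promoted items
    ],
    'next': {'max_behot_time': 0},   # cursor that gets fed into the next feed URL
}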

from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup

def get_url(max_behot_time, AS, CP):
    # build the feed URL; max_behot_time is the paging cursor, AS/CP come from get_ASCP()
    url = 'https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1' \
          '&max_behot_time={0}' \
          '&max_behot_time_tmp={0}' \
          '&tadrequire=true' \
          '&as={1}' \
          '&cp={2}'.format(max_behot_time, AS, CP)
    return url

def get_ASCP():
    # reproduce the JS routine that derives the as/cp parameters from the current timestamp
    t = int(math.floor(time.time()))
    e = hex(t).upper()[2:]                     # timestamp in upper-case hex
    m = hashlib.md5()
    m.update(str(t).encode(encoding='utf-8'))
    i = m.hexdigest().upper()                  # md5 of the decimal timestamp
    if len(e) != 8:
        AS = '479BB4B7254C150'
        CP = '7E0AC8874BB0985'
        return AS, CP
    n = i[0:5]
    a = i[-5:]
    s = ''
    r = ''
    for o in range(5):
        # interleave the md5 digest with the hex timestamp
        s += n[o] + e[o]
        r += e[o + 3] + a[o]
    AS = 'A1' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    # print("AS:" + AS, "CP:" + CP)
    return AS, CP

def download(title, news_url):
    # print('crawling')
    req = request.urlopen(news_url)
    if req.getcode() != 200:
        return 0
    res = req.read().decode('utf-8')
    # print(res)
    # the article body sits in a JS "content:" field; keep only the Chinese text
    pat1 = r'content:(.*?),'
    pat2 = re.compile('[\u4e00-\u9fa5]+')
    result1 = re.findall(pat1, res)
    # print(len(result1))
    if len(result1) == 0:
        return 0
    print(result1)
    result2 = re.findall(pat2, str(result1))
    result3 = []
    for i in result2:
        if i not in result3:
            result3.append(i)
    # print(result2)
    # strip characters that are not allowed in Windows file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    with open(r'D:\code\python\spider_news\Toutiao_news\society\\' + title + '.txt', 'w') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('News URL: ')
        file_object.write(news_url)
        file_object.write('\n')
        for i in result3:
            # print(i)
            file_object.write(i)
            file_object.write('\n')
        # file_object.write(tag.get_text())
    # print('crawling')

def get_item(url):
    # time.sleep(5)
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url, cookies=cookies)
    wbdata2 = json.loads(wbdata.text)
    data = wbdata2['data']
    for news in data:
        title = news['title']
        news_url = news['source_url']
        news_url = 'https://www.toutiao.com' + news_url
        print(title, news_url)
        if 'ad_label' in news:
            # skip promoted items
            print(news['ad_label'])
            continue
        download(title, news_url)
    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
    # print("next_max_behot_time:{0}".format(next_max_behot_time))
    return next_max_behot_time

if __name__ == '__main__':
    refresh = 50
    for x in range(0, refresh + 1):
        print('Round {0}:'.format(x))
        if x == 0:
            max_behot_time = 0
        else:
            max_behot_time = next_max_behot_time
            # print(next_max_behot_time)
        AS, CP = get_ASCP()
        url = get_url(max_behot_time, AS, CP)
        next_max_behot_time = get_item(url)

 

4. UC

UC is much like Sina: there is no elaborate anti-scraping to deal with, so you can fetch and parse the pages directly.

from bs4 import BeautifulSoup
from urllib import request

def download(title, url):
    req = request.Request(url)
    response = request.urlopen(req)
    response = response.read().decode('utf-8')
    soup = BeautifulSoup(response, 'lxml')
    tag = soup.find('div', class_='sm-article-content')
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    title = title.replace(':', '')
    title = title.replace('"', '')
    title = title.replace('|', '')
    title = title.replace('/', '')
    title = title.replace('\\', '')
    title = title.replace('*', '')
    title = title.replace('<', '')
    title = title.replace('>', '')
    title = title.replace('?', '')
    with open(r'D:\code\python\spider_news\UC_news\society\\' + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('News URL: ')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(tag.get_text())
    # print('crawling')

if __name__ == '__main__':
    for i in range(0, 7):
        # NOTE: the URL never changes, so every pass fetches the same listing page
        url = 'https://news.uc.cn/c_shehui/'
        # headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
        #            "cookie": "sn=3957284397500558579; _uc_pramas=%7B%22fr%22%3A%22pc%22%7D"}
        # res = request.Request(url, headers=headers)
        res = request.urlopen(url)
        req = res.read().decode('utf-8')
        soup = BeautifulSoup(req, 'lxml')
        # print(soup.prettify())
        tag = soup.find_all('div', class_='txt-area-title')
        # print(tag.name)
        for x in tag:
            news_url = 'https://news.uc.cn' + x.a.get('href')
            print(x.a.string, news_url)
            download(x.a.string, news_url)
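
Because the loop in __main__ requests the same listing URL on every pass, the same articles are downloaded again and again. If you keep the loop, a simple seen set (my addition, not part of the original code) skips links that were already saved on an earlier pass:

from urllib import request
from bs4 import BeautifulSoup

seen = set()   # links already downloaded across passes
for i in range(0, 7):
    res = request.urlopen('https://news.uc.cn/c_shehui/')
    soup = BeautifulSoup(res.read().decode('utf-8'), 'lxml')
    for x in soup.find_all('div', class_='txt-area-title'):
        news_url = 'https://news.uc.cn' + x.a.get('href')
        if news_url in seen:
            continue
        seen.add(news_url)
        print(x.a.string, news_url)
        download(x.a.string, news_url)   # download() as defined in the script above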

 

 

 

Original article by Maggie-Hunter. If you repost it, please credit the source: https://blog.ytso.com/16705.html
