一.python与数据库连接
1.下载相关库pymysql
可以使用pip或者直接用pycharm进行下载。
2.导入所需库
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re
3.数据库连接和创建所需的表
# Connect to the local MySQL server (replace passwd with your own password).
conn = pymysql.connect(host='127.0.0.1',
                       user='root',
                       passwd='自己的密码',
                       db='mysql',
                       charset='utf8')
cur = conn.cursor()
# One-time table setup, commented out after the first run:
# sql = '''CREATE TABLE pages (
# id BIGINT(7) NOT NULL AUTO_INCREMENT,
# title VARCHAR(200),
# content VARCHAR (10000),
# createed TIMESTAMP DEFAULT CURRENT_TIMESTAMP ,
# PRIMARY KEY(id))'''
# cur.execute(sql)
# Make sure subsequent statements run against the intended database.
cur.execute("USE mysql")
4.读取内容并存储
def store(title, content):
    """Insert one scraped article into the ``pages`` table and commit.

    Uses the module-level cursor ``cur``. The query is parameterized so
    pymysql handles quoting and escaping; the original wrapped the
    placeholders in double quotes (``"%s"``), which inserted literal
    quote characters around every value and defeated the escaping.
    """
    cur.execute('INSERT INTO pages (title,content) VALUES (%s, %s)',
                (title, content))
    cur.connection.commit()
def getContent(articleUrl):
    """Fetch a news article page and store its title and body text.

    articleUrl: URL of an article whose title is in an <h1> and whose
    body paragraphs live under ``.article p``.

    Fix vs. original: the paragraph list was passed directly to store(),
    but pymysql cannot serialize a Python list as a single column value —
    join the paragraphs into one newline-separated string first.
    """
    html = urlopen(articleUrl)
    bs = BeautifulSoup(html, 'html.parser')
    title = bs.find('h1').get_text()
    paragraphs = [p.text.strip() for p in bs.select('.article p')]
    store(title, '\n'.join(paragraphs))
# Demo: scrape a single Sina news article and store it in the database.
getContent('https://news.sina.com.cn/c/2022-07-30/doc-imizirav6039897.shtml')
二.爬取壁纸网站的壁纸并保存图片
以壁纸网站https://wallhaven.cc为例
原理:因为该壁纸网站是分页的,所以先获取每页的壁纸的url,再统一进行下载

通过class为:preview的a标签获得壁纸的网页链接,
获得壁纸的网页链接后,再通过id为:wallpaper的img标签获取到src进行下载。
代码如下:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import eventlet
import datetime
import random
import pymysql
import re
import requests
import time
def getImageUrl(response):
    """Collect full-size wallpaper URLs from one wallhaven search page.

    response: a requests.Response for a search-results page.
    Returns a list of ``src`` URLs, one per wallpaper whose detail page
    (the link with class ``preview``) loaded within the timeout.

    Fixes vs. original: uses requests' own ``timeout`` instead of a
    silently-suppressed ``eventlet.Timeout`` — the old code could reuse
    the previous iteration's ``response2`` (or raise NameError on the
    first iteration) whenever a fetch timed out, and it re-ran
    ``eventlet.monkey_patch()`` on every loop iteration.
    """
    bs = BeautifulSoup(response.text, 'html.parser')
    pagesrcs = []
    for link in bs.find_all("a", {"class": "preview"}):
        time.sleep(1.0)  # be polite: throttle requests to the site
        try:
            # 2-second timeout; skip this wallpaper if its page is slow.
            response2 = requests.get(link.get('href'), headers=headers,
                                     timeout=2)
        except requests.RequestException:
            continue
        bs2 = BeautifulSoup(response2.text, 'html.parser')
        img = bs2.find("img", {'id': "wallpaper"})
        if img is not None:
            print(img.get("src"))
            pagesrcs.append(img.get("src"))
    return pagesrcs
def downloadImage(srcs, dest_dir='D://Samurai+Champloo//'):
    """Download each image URL in ``srcs`` as sequentially numbered .jpg files.

    srcs: iterable of image URLs.
    dest_dir: target directory prefix; defaults to the original
              hard-coded path for backward compatibility.

    Fixes vs. original: requests' own ``timeout`` replaces the suppressed
    ``eventlet.Timeout`` (which could silently reuse a stale response),
    the file handle is closed via ``with`` instead of being leaked, and
    the stale "2秒" comment is dropped — the actual timeout is 20 s.
    """
    i = 1
    for src in srcs:
        time.sleep(1.0)  # throttle to avoid hammering the server
        try:
            r = requests.get(src, stream=True, headers=headers, timeout=20)
        except requests.RequestException:
            continue
        if r.status_code == 200:
            with open(dest_dir + str(i) + '.jpg', 'wb') as f:
                f.write(r.content)  # write the image bytes to disk
            i = i + 1  # number only the successful downloads
    print("下载成功")  # printed once after the batch, as in the original
# Browser-like request headers so wallhaven.cc serves normal HTML pages
# instead of rejecting the scraper's requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36 Edg/103.0.1264.77',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}
#分页代码
def searchPaging(startUrl, end=2):
    """Walk wallhaven search-result pages 1..end-1, collecting image URLs.

    startUrl: the search URL for page 1 (e.g. .../search?q=...).
    end: exclusive upper page bound. Defaults to 2, preserving the
         original single-page behavior, but is now a parameter instead
         of a hard-coded constant.
    Returns the concatenated getImageUrl() results for every page.

    Fixes vs. original: removes the dead ``page = 1`` initializer and the
    useless ``page = page + 1`` (the for-loop already advances ``page``),
    and collapses the duplicated request/parse code from the two branches.
    """
    allsrcs = []
    for page in range(1, end):
        if page == 1:
            # Page 1 is the bare search URL.
            url = startUrl
        else:
            # Later pages append &page=N; original also echoed these URLs.
            url = startUrl + "&page=" + str(page)
            print(url)
        response = requests.get(url, headers=headers)
        allsrcs += getImageUrl(response)
    return allsrcs
# Demo: collect wallpaper URLs for the "Samurai Champloo" search results,
# then download each image to disk.
allsrcs=searchPaging('https://wallhaven.cc/search?q=Samurai+Champloo')
downloadImage(allsrcs)
原创文章,作者:,如若转载,请注明出处:https://blog.ytso.com/tech/pnotes/278271.html