python爬虫爬取壁纸练习及与爬取新闻储存到mysql数据库


一.python与数据库连接

1.下载相关库pymysql

可以使用pip或者直接用pycharm进行下载。

2.导入所需库

from urllib.request import urlopen
from bs4 import BeautifulSoup
import  datetime
import random
import pymysql
import re

3.数据库连接和创建所需的表

# Connect to a local MySQL server. The password value '自己的密码' is a
# placeholder ("your own password") and must be replaced before running.
# NOTE(review): this reuses the built-in `mysql` system database — a
# dedicated application database would be safer; confirm before deploying.
conn=pymysql.connect(host='127.0.0.1',user='root',passwd='自己的密码',db='mysql',charset='utf8')
cur=conn.cursor()
# One-off DDL that created the `pages` table; kept commented out so reruns
# do not fail with "table already exists". NOTE(review): the timestamp
# column is spelled `createed` — any later queries must match that spelling.
#sql='''CREATE TABLE pages (
#id BIGINT(7) NOT NULL AUTO_INCREMENT,
#title  VARCHAR(200),
#content VARCHAR (10000),
#createed TIMESTAMP DEFAULT  CURRENT_TIMESTAMP ,
#PRIMARY  KEY(id))'''

#cur.execute(sql)
cur.execute("USE mysql")  # select the database for all subsequent statements

4.读取内容并存储

def store(title, content):
    """Insert one article row into the `pages` table and commit.

    :param title:   article title string (VARCHAR(200) column).
    :param content: article body string (VARCHAR(10000) column).

    Uses the module-level cursor ``cur`` opened at import time.
    """
    # Let the driver do the quoting: with pymysql, %s placeholders must NOT
    # be wrapped in quotes. The original '"%s"' form stored literal quote
    # characters around each value and defeated proper escaping.
    cur.execute('INSERT INTO pages (title,content) VALUES (%s,%s)',
                (title, content))
    cur.connection.commit()
def getContent(articleUrl):
    """Fetch a news article page, extract title and body, and store them.

    :param articleUrl: URL of the article page. The title is taken from the
        first <h1>; the body is every <p> inside the `.article` container.

    The paragraphs are joined with newlines into ONE string before calling
    ``store`` — the original passed a Python list, which pymysql cannot map
    onto a single VARCHAR placeholder.
    """
    html = urlopen(articleUrl)
    try:
        bs = BeautifulSoup(html, 'html.parser')
    finally:
        # urlopen returns a response object that should be closed explicitly.
        html.close()
    title = bs.find('h1').get_text()
    paragraphs = [p.text.strip() for p in bs.select('.article p')]
    store(title, '\n'.join(paragraphs))
getContent('https://news.sina.com.cn/c/2022-07-30/doc-imizirav6039897.shtml')

二.爬取壁纸网站的壁纸并保存图片

以壁纸网站https://wallhaven.cc为例

原理:因为该壁纸网站是分页的,所以先获取每页的壁纸的url,再统一进行下载

python爬虫爬取壁纸练习及与爬取新闻储存到mysql数据库

 

通过 class 为 preview 的 a 标签获得每张壁纸详情页的链接,

获得壁纸的网页链接后,再通过id为:wallpaper的img标签获取到src进行下载。

代码如下:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import eventlet
import  datetime
import random
import pymysql
import re
import requests
import time
def getImageUrl(response):
    """Collect full-size wallpaper image URLs from a search-results page.

    :param response: a requests.Response for a wallhaven search page. Each
        result is an <a class="preview"> linking to a detail page whose
        <img id="wallpaper"> carries the actual image src.
    :return: list of image src URLs (detail pages that time out, error, or
        lack the wallpaper <img> are skipped).
    """
    bs = BeautifulSoup(response.text, 'html.parser')
    pagesrcs = []
    for link in bs.find_all("a", {"class": "preview"}):
        time.sleep(1.0)  # throttle so the site is not hammered
        try:
            # requests' native timeout replaces the original
            # eventlet.monkey_patch()/Timeout pair: monkey-patching inside
            # the loop — after requests was already imported — could not
            # reliably interrupt the blocking socket reads.
            detail = requests.get(link.get('href'), headers=headers, timeout=2)
        except requests.exceptions.RequestException:
            continue  # mirror the old behaviour: silently skip on timeout
        detail_bs = BeautifulSoup(detail.text, 'html.parser')
        img = detail_bs.find("img", {'id': "wallpaper"})
        if img is not None:
            print(img.get("src"))
            pagesrcs.append(img.get("src"))
    return pagesrcs
def downloadImage(srcs, dest_dir='D://Samurai+Champloo//'):
    """Download every image URL in *srcs* into *dest_dir* as 1.jpg, 2.jpg, ...

    :param srcs:     iterable of direct image URLs.
    :param dest_dir: destination folder prefix; defaults to the original
        hard-coded path for backward compatibility. The directory must
        already exist.

    Failed downloads (non-200 status or network error) are skipped and do
    not consume an index number, matching the original behaviour.
    """
    index = 1
    for src in srcs:
        time.sleep(1.0)  # polite throttle between downloads
        try:
            # requests' own timeout replaces the per-iteration
            # eventlet.monkey_patch()/Timeout dance, which patched too late
            # (after requests was imported) to take effect. The original
            # comment also claimed "2秒" while the timeout was 20 seconds.
            r = requests.get(src, stream=True, headers=headers, timeout=20)
        except requests.exceptions.RequestException:
            continue
        if r.status_code == 200:
            # `with` guarantees the handle is closed; the original
            # open(...).write(...) leaked one file handle per image.
            with open(dest_dir + str(index) + '.jpg', 'wb') as f:
                f.write(r.content)
            index += 1
            print("下载成功")
# Request headers imitating a desktop Edge/Chromium browser so the target
# site serves normal HTML instead of rejecting the scraper.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36 Edg/103.0.1264.77',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}
#分页代码
#分页代码
def searchPaging(startUrl, end=2):
    """Walk the paginated search results and collect all wallpaper srcs.

    :param startUrl: first search-results URL (page 1).
    :param end: one past the last page number to fetch. Exposed as a
        parameter (default 2, i.e. only page 1, matching the original
        hard-coded value) so callers can scrape more pages.
    :return: concatenated list of image URLs from every fetched page.
    """
    allsrcs = []
    for page in range(1, end):
        # Page 1 is the bare search URL; later pages append the &page= query.
        if page == 1:
            url = startUrl
        else:
            url = startUrl + "&page=" + str(page)
            print(url)  # original only logged URLs for pages > 1
        response = requests.get(url, headers=headers)
        allsrcs += getImageUrl(response)
        # Dead code removed: the original re-initialised `page=1` before the
        # loop and incremented `page` by hand, both redundant under range().
    return allsrcs
# Entry point: collect image URLs from the "Samurai Champloo" search results,
# then download each one into the local destination folder.
allsrcs=searchPaging('https://wallhaven.cc/search?q=Samurai+Champloo')
downloadImage(allsrcs)

 

原创文章,作者:,如若转载,请注明出处:https://blog.ytso.com/278271.html

(0)
上一篇 2022年8月1日
下一篇 2022年8月1日

相关推荐

发表回复

登录后才能评论