Python 抓取图片示例详解 - 编程语言

#!/usr/bin/python 
# -*- coding:utf-8 -*- 
 
import re 
import os 
import urllib, urllib2, cookielib 
import shutil 
from BeautifulSoup import BeautifulSoup  
 
# ---- utils ---- 
def normalize_url(url):
    """Return *url* with an explicit HTTP scheme.

    A URL that already starts with ``http://`` or ``https://`` is returned
    unchanged; anything else gets ``http://`` prepended.  The check is
    case-sensitive, as in the original implementation.

    :param url: URL string, with or without a scheme.
    :returns: the URL string, guaranteed to start with ``http://`` or
        ``https://``.
    """
    # https:// must be recognised too, otherwise it would be turned into the
    # invalid "http://https://...".
    if url.startswith(("http://", "https://")):
        return url
    return "http://" + url
 
def safeDir(dir):
    """Return *dir* with every forward slash removed.

    The result is used as a single directory name, so it must not contain
    the path separator.
    """
    pieces = dir.split('/')
    return ''.join(pieces)
 
# ---- variable ----
# Forum root; thread links scraped from a listing page are appended to it.
threadPrefix = "http://60dxw.comww1.baisex.me/"
# A listing page URL is homepagePrefix + <page number> + homepageSuffix.
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
# Name of the output directory, created relative to the current directory.
homedir = "baixingge"

# ---- login ----
# Opener whose requests go through an in-memory cookie jar.
# NOTE(review): nothing in the visible script calls opener.open(); the crawl
# loop fetches with urllib.urlopen -- confirm whether this opener is needed.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)

# ---- file ----
# Create the output directory on first run, then make it the working
# directory so that everything below uses paths relative to it.
if not os.path.exists(homedir):
    os.mkdir(homedir)
os.chdir(homedir)
 
# ---- crawl ----
# Walk forum listing pages 1..24.  For every thread found on a page, create
# <page>/<sanitised thread title>/ under the current working directory and
# save the thread's images there as 0.<ext>, 1.<ext>, ...
# A thread whose directory already exists is skipped, so the script can be
# re-run without downloading the same thread twice.

# The patterns do not depend on the page, so compile them once.
urlPattern = re.compile(r'href="([^"]*)"')
titlePattern = re.compile(r'>([^<]*)</a>')
imgPattern = re.compile(r'img src="([^"]*)" onload')

for page in range(1, 25):
    pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)
    # ---- mkdir ----
    pageDir = str(page)
    if not os.path.exists(pageDir):
        os.mkdir(pageDir)
    print(pageUrl)

    # ---- download ----
    html_body = urllib.urlopen(pageUrl).read()
    soup = BeautifulSoup(html_body)

    # ---- extract ----
    urlRaws = soup.findAll('th', attrs={'class': ['new', 'common']})
    for urlRaw in urlRaws:
        cell = str(urlRaw)
        h = urlPattern.search(cell)
        t = titlePattern.search(cell)
        # A header cell without a link or a title cannot be processed.
        if h is None or t is None:
            continue
        threadUrl = h.group(1)
        threadTitle = t.group(1)

        # Sanitise the title once and use the same name for the existence
        # check and for the directory that is created; checking the raw
        # title would miss directories made from titles containing '/'.
        threadDirName = safeDir(threadTitle)
        threadDir = os.path.join(pageDir, threadDirName)
        if os.path.exists(threadDir):
            continue
        os.mkdir(threadDir)

        page_url = threadPrefix + threadUrl
        print("---->{0}".format(page_url))
        print("---->{0}".format(threadDirName))
        page_body = urllib.urlopen(page_url).read()
        page_soup = BeautifulSoup(page_body)

        for index, img in enumerate(imgPattern.findall(str(page_soup))):
            print("-------->{0}".format(img))
            # Take the extension from the last path segment only, ignoring
            # any query string; a URL without an extension yields ''.
            imgSuffix = os.path.splitext(img.split('?', 1)[0])[1]
            imgName = os.path.join(threadDir,
                                   "{0}{1}".format(index, imgSuffix))
            try:
                urllib.urlretrieve(img, imgName)
            except IOError as err:
                # One unreachable image should not abort the whole crawl.
                print("-------->failed: {0}".format(err))

原创文章,作者:ItWorker,如若转载,请注明出处:https://blog.ytso.com/8503.html

(0)
上一篇 2021年7月18日
下一篇 2021年7月18日

相关推荐

发表回复

登录后才能评论