Python 抓妹子图 + 多进程详解(编程语言)

Pic_downloader.py  

# -*- coding: utf-8 -*- 
""" 
Created on Fri Aug 07 17:30:58 2015 
 
@author: Dreace 
""" 
import urllib2 
import sys 
import time 
import os 
import random 
from multiprocessing.dummy import Pool as ThreadPool  
# Encoding name reported by the interpreter for the file system. The status
# messages printed at the end of this script are decoded from UTF-8 and
# re-encoded to this encoding before being written out.
type_ = sys.getfilesystemencoding()
def rename():
    """Return the current local time as a compact ``YYYYmmddHHMMSS`` string."""
    timestamp_format = "%Y%m%d%H%M%S"
    return time.strftime(timestamp_format)
def rename_2(name):
    """Build a ``.jpg`` file name from *name*, zero-padding short names.

    One-character names get two leading zeros, two-character names get one,
    and names of any other length (including the empty string) are used
    unchanged before the ``.jpg`` suffix is appended.
    """
    # Padding is keyed on the exact length; anything else gets no padding.
    padding_by_length = {1: '00', 2: '0'}
    prefix = padding_by_length.get(len(name), '')
    return prefix + name + '.jpg'
def download_pic(i):
    """Download the image at URL *i* into the current working directory.

    The URL is skipped when ``Filter(i)`` is falsy. Otherwise the body is
    fetched with the module-level ``time_out`` and, if it is at least
    ``1024 * 11`` bytes long, written to a file named
    ``<random int>_<zero-padded count>.jpg``; the module-level ``count`` is
    then incremented. Smaller payloads are discarded.

    Any exception raised along the way is printed and swallowed so that one
    bad URL does not abort the surrounding pool of downloads.

    NOTE(review): ``count`` is a plain module global updated without a lock.
    When this function runs on several threads, increments may be lost and
    two files may share the same counter suffix (the random prefix keeps
    names distinct in practice) -- confirm whether exact counts matter.
    """
    global count
    if not Filter(i):
        return
    min_size = 1024 * 11
    try:
        response = urllib2.urlopen(i, timeout=time_out)
        try:
            url_content = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        # The on-disk size equals the payload length, so undersized payloads
        # can be rejected without writing and then deleting a file.
        if len(url_content) < min_size:
            return
        file_name = repr(random.randint(10000, 999999999)) + "_" + rename_2(repr(count))
        # ``with`` guarantees the handle is closed if the write raises.
        with open(file_name, "wb") as f:
            f.write(url_content)
        count += 1
    except Exception as e:
        print(e)
def Filter(content, blocked_hosts=None):
    """Return True when *content* contains none of the blocked substrings.

    Args:
        content: The URL (or any string) to check.
        blocked_hosts: Optional iterable of substrings to reject. When
            omitted, the module-level ``Filter_list`` is used.

    Returns:
        ``True`` if no blocked substring occurs in *content* (including when
        the block list is empty), ``False`` otherwise.

    The previous implementation returned True as soon as *any single* entry
    was absent, so a URL was only rejected if it contained every entry at
    once; in practice nothing was ever filtered.
    """
    if blocked_hosts is None:
        blocked_hosts = Filter_list
    return not any(host in content for host in blocked_hosts)
def get_pic(url_address):
    """Fetch one page and collect the fragments of it that mention ``.jpg``.

    The page at *url_address* is requested with the module-level ``headers``
    and ``time_out``. Its body is split on the two-character sequence ``/'``
    and every resulting piece containing ``.jpg`` is appended to the
    module-level ``pic_list``.

    Any exception is printed and swallowed so that one failing page does not
    abort the surrounding pool of requests. Returns ``None``.
    """
    try:
        req = urllib2.Request(url=url_address, headers=headers)
        response = urllib2.urlopen(req, timeout=time_out)
        try:
            str_ = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        # NOTE(review): splitting on "/'" is unusual for extracting image
        # URLs from HTML -- confirm the delimiter against the real markup.
        pieces = str_.split("/'")
        pic_list.extend([piece for piece in pieces if ".jpg" in piece])
    except Exception as e:
        print(e)
MAX = 100 
count = 0 
time_out = 60 
thread_num = 50 
pic_list = [] 
page_list = [] 
pic_kind = ["hot","share","mm","taiwan","japan","model"] 
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"] 
dir_name = "C:/Photos//"+rename() 
os.makedirs(dir_name) 
os.chdir(dir_name) 
start_time = time.time() 
url_address = "http://www.mzitu.com/model/page/" 
headers = {"User-Agent":" Mozilla/5.0 (Windows NT 10.0; rv:39.0) Gecko/20100101 Firefox/39.0"} 
for pic_i in pic_kind:      
    for i in range(1,MAX + 1):   
        page_list.append(url_address + pic_i + "/page/" + repr(i)) 
page_pool = ThreadPool(thread_num) 
page_pool.map(get_pic,page_list) 
page_pool.close() 
page_pool.join() 
print "获取到".decode("utf-8").encode(type_),len(pic_list),"张图片,开始下载!".decode("utf-8").encode(type_) 
pool = ThreadPool(thread_num)  
pool.map(download_pic,pic_list) 
pool.close()  
pool.join() 
print count,"张图片保存在".decode("utf-8").encode(type_) + dir_name 
print "共耗时".decode("utf-8").encode(type_),time.time() - start_time,"s" 

如果在爬取过程中出现 403 错误,可以按如下方式为请求添加浏览器的 User-Agent 请求头:

# Send a browser-like User-Agent header with the request, then read the body.
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; rv:39.0) Gecko/20100101 Firefox/39.0',
}
req = urllib2.Request(url_address, headers=headers)
str_ = urllib2.urlopen(req, timeout=time_out).read()

原创文章,作者:Maggie-Hunter,如若转载,请注明出处:https://blog.ytso.com/8120.html

(0)
上一篇 2021年7月18日
下一篇 2021年7月18日

相关推荐

发表回复

登录后才能评论