Python: Crawling a Given Web Page and All of Its Links, Explained in Detail

The Python 2 script below fetches a page, collects every <a href> link on it, downloads each linked page into the current directory, and finally writes a Devhelp-style .devhelp2 index of the saved files.

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# **************************************************************************** 
# Copyright (C) 2010 [email protected] 
  
# Author: yangyingchao <[email protected]> 
  
# This program is free software; you can redistribute it and/or modify it 
# under the terms of the GNU General Public License as published by the Free 
# Software Foundation; either version 2, or (at your option) any later 
# version. 
  
# This program is distributed in the hope that it will be useful, but WITHOUT 
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 
# more details. 
  
# You should have received a copy of the GNU General Public License along with 
# GNU Emacs; see the file COPYING.  If not, write to the Free Software 
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# **************************************************************************** 
  
from copy import deepcopy 
from sgmllib import SGMLParser 
from xml.dom.minidom import * 
import os 
import re 
import sys 
import urllib2 
  
title = "Untitled" 
  
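# MyParser scans the HTML and records two things: the text of the page's
# <title> tag (in self.title) and, for every <a href=...> tag, a dict of
# {"name": link text, "link": href} appended to self.links.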
class MyParser(SGMLParser): 
  
    def __init__(self):
        self.data = ""
        self.links = []
        self.link = ""           # href of the tag currently being parsed
        self.title = "Untitled"  # fallback when the page has no <title> tag
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)
  
    def handle_data(self, data):
        # Accumulate text only while we are inside a <title> or <a> tag.
        if self.TAG_BEG and not self.TAG_END:
            self.data += data
  
    def start_title(self, attrs): 
        self.link = "" 
        self.data="" 
  
        self.TAG_BEG = True 
        self.TAG_END = False 
        for (key, val) in attrs: 
            if key == "href": 
                self.link = val 
  
    def end_title(self): 
        self.TAG_BEG = False 
        self.TAG_END = True 
  
        self.title = self.data.strip() 
  
  
    def flush(self): 
        pass 
  
    def handle_comment(self, data): 
        pass 
  
    def start_a(self, attrs): 
        self.data="" 
  
        self.TAG_BEG = True 
        self.TAG_END = False 
        for (key, val) in attrs: 
            if key == "href": 
                self.link = val 
  
    def end_a(self): 
        self.TAG_BEG = False 
        self.TAG_END = True 
        tmp = {} 
        tmp["name"] = self.data 
        tmp["link"] = self.link 
        self.links.append(deepcopy(tmp)) 
  
  
    def unknown_starttag(self, tag, attrs): 
        pass 
  
    def unknown_endtag(self, tag): 
        pass 
  
  
    def unknown_entityref(self, ref): 
        pass 
  
    def unknown_charref(self, ref): 
        pass 
  
    def unknown_decl(self, data): 
        pass 
  
    def close(self): 
        SGMLParser.close(self) 
        self.flush() 
  
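# Join a list of strings into one newline-separated string (currently unused).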
def lst2str(lst): 
    string = "" 
    for item in lst: 
        string += item.strip() + "\n"
    return string 
  
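# Fetch url and write the response body to filename. Returns 1 on success,
# 0 if opening the URL raised an exception.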
def downURL(url, filename): 
    print "Download %s, save as %s"%(url, filename) 
    try: 
        fp = urllib2.urlopen(url) 
    except: 
        print "download exception" 
        print sys.exc_info() 
        return 0 
    op = open(filename, "wb") 
    while 1: 
        s = fp.read() 
        if not s: 
            break 
        op.write(s) 
    fp.close( ) 
    op.close( ) 
    return 1 
  
  
def reptile(base_url): 
    """ 
    Download all articles from base_url. 
    Arguments: 
    - `base_url`: Url of website. 
    """ 
    page_list = [] 
    if not len(base_url): 
        print "No page to reptile!" 
        sys.exit(1) 
  
    parser = MyParser() 
  
    if base_url.startswith("http"): 
        myopen = urllib2.urlopen 
    else: 
        myopen = open 
  
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s."%base_url
        print sys.exc_info()
        sys.exit(1)

    parser.feed(content)
  
    for tmp in parser.links: 
        page_list.append(tmp.get("link")) 
  
    global title 
    title = parser.title 
    parser.close() 
  
    item_list = list(set(page_list)) 
  
    for item in item_list: 
        # Strip '#' from url. 
        pos = item.find('#') 
        if pos != -1: 
            item = item[:pos] 
  
        # Prepend base_url to relative links.
        if not item.startswith("http"):
            item = base_url.rstrip("/")+"/"+item
  
        local_file = item.split("/")[-1] 
        print item, local_file 
        if not local_file: 
            print "Empty local file! Continue from next one!" 
            continue 
  
        if os.access(local_file, os.F_OK): 
            print "File: %s existed, skip ..."%local_file 
        else: 
            ret = downURL(item, local_file) 
  
    # Remember to download the index file! 
    downURL(base_url, "index.html") 
    print "Total: %d articles."%(len(item_list)) 
    pass 
  
  
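# Callback for os.path.walk(): for every HTML file under dirname, parse it
# and append {"file": filename, "title": page title} to lst.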
def walk_dir(lst, dirname, filenames): 
    for filename in filenames: 
        fn = os.path.join(dirname, filename) 
        if os.path.isdir(fn) or \
               not filename.endswith("html"):
            continue 
        print "Processing: %s"%fn 
        tmp = {} 
        parser = MyParser() 
        content = open(fn).read() 
        parser.feed(content)
        tmp["file"] = filename 
        tmp["title"] = parser.title 
        parser.close() 
        lst.append(deepcopy(tmp)) 
    pass 
  
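# The .devhelp2 file written below is the XML book-index format used by the
# GNOME Devhelp documentation browser.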
def gen_index(): 
    """ 
    Generate index of all htmls in this directory. 
    """ 
    file_lists = [] 
    os.path.walk(".", walk_dir, file_lists) 
  
    fp = open("%s.devhelp2"%os.path.basename(os.getcwd()), "w") 
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
        ' language="c" link="index.html" name="" title="%s"'%title + \
        ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>'
    for item in file_lists: 
        link = item.get("file") 
        try: 
            name = item.get("title").decode('gbk').encode('utf-8')
        except: 
            name = item.get("title") 
        finally: 
            string += '<sub link="%s" name="%s"/>\n'%(link, name)
  
    string += '\n</chapters>\n</book>\n'
    fp.write(string) 
    fp.close() 
  
if __name__ == '__main__': 
    if len(sys.argv) != 2: 
        print "Usage: %s url of baidu space"%sys.argv[0] 
        print "Such as: %s http://hi.baidu.com/Username" 
        gen_index() 
        sys.exit(1) 
    base_url = sys.argv[1] 
    reptile(base_url)
    gen_index()
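
Usage: pass the page to mirror as the only argument. Assuming the script is saved as reptile.py (the filename is my choice, not stated in the original):

    python reptile.py http://hi.baidu.com/Username

Run it with no arguments inside a directory of already-downloaded HTML files and it only rebuilds the .devhelp2 index before exiting.

Note that sgmllib, urllib2 and os.path.walk were all removed in Python 3. For readers on Python 3, here is a minimal sketch of the same title/link extraction using html.parser and urllib.request; the class and variable names are my own and this is an illustrative port, not part of the original script:

from html.parser import HTMLParser
from urllib.request import urlopen

class LinkParser(HTMLParser):
    """Collect the page title and the (text, href) of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.title = "Untitled"
        self.links = []      # list of (link text, href) tuples
        self._tag = None     # tag whose text is being accumulated
        self._text = ""
        self._href = ""

    def handle_starttag(self, tag, attrs):
        if tag in ("title", "a"):
            self._tag = tag
            self._text = ""
            self._href = dict(attrs).get("href", "")

    def handle_data(self, data):
        if self._tag:
            self._text += data

    def handle_endtag(self, tag):
        if tag == "title" and self._tag == "title":
            self.title = self._text.strip()
        elif tag == "a" and self._tag == "a":
            self.links.append((self._text.strip(), self._href))
        self._tag = None

parser = LinkParser()
html = urlopen("http://example.com/").read().decode("utf-8", "replace")
parser.feed(html)
print(parser.title, parser.links)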
