The Go Programming Language (Go语言圣经): Concurrent Web Crawler, Explained

The book presents two versions of the concurrent crawler, each limiting concurrency in a different way:

crawler.go

The first version spawns one goroutine per unseen link and caps the number of simultaneous HTTP requests at 20 with a counting semaphore, the buffered tokens channel.

package main 
 
import (
        "fmt"
        "log"
        "os"

        "gopl.io/ch8/links"
)
 
func main() { 
        worklist := make(chan []string) 
 
        // Start with the command-line arguments. 
        go func() { worklist <- os.Args[1:] }()  
        // Crawl the web concurrently. 
        seen := make(map[string]bool) 
        for list := range worklist { 
                for _, link := range list { 
                        if !seen[link] { 
                                seen[link] = true 
                                // The send runs in its own goroutine so this
                                // loop never blocks while draining worklist.
                                go func(link string) {
                                        worklist <- crawl(link)
                                }(link)
                        }    
                }    
        }    
} 
 
// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)
 
// crawl prints the URL, fetches the page while holding a semaphore
// token, and returns the links found on it.
func crawl(url string) []string {
        fmt.Println(url)
        tokens <- struct{}{} // acquire a token
        list, err := links.Extract(url)
        <-tokens // release the token
        if err != nil {
                log.Print(err)
        }
        return list
}
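Both versions call links.Extract. The import path above is the book's gopl.io/ch8/links package; below is a minimal sketch of what it does, assuming the golang.org/x/net/html parser is available. It follows the book's approach but is not a verbatim copy:

package links

import (
        "fmt"
        "net/http"

        "golang.org/x/net/html"
)

// Extract makes an HTTP GET request to the given URL, parses the
// response as HTML, and returns the links found in the document.
func Extract(url string) ([]string, error) {
        resp, err := http.Get(url)
        if err != nil {
                return nil, err
        }
        defer resp.Body.Close()
        if resp.StatusCode != http.StatusOK {
                return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
        }
        doc, err := html.Parse(resp.Body)
        if err != nil {
                return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
        }
        var links []string
        var visit func(n *html.Node)
        visit = func(n *html.Node) {
                if n.Type == html.ElementNode && n.Data == "a" {
                        for _, a := range n.Attr {
                                if a.Key != "href" {
                                        continue
                                }
                                // Resolve relative links against the request URL.
                                link, err := resp.Request.URL.Parse(a.Val)
                                if err != nil {
                                        continue // ignore bad URLs
                                }
                                links = append(links, link.String())
                        }
                }
                for c := n.FirstChild; c != nil; c = c.NextSibling {
                        visit(c)
                }
        }
        visit(doc)
        return links, nil
}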

crawler2.go

The second version needs no semaphore: it starts a fixed pool of 20 long-lived crawler goroutines, all reading from the shared unseenLinks channel, so at most 20 fetches are ever in flight.

package main 
 
import (
        "fmt"
        "log"
        "os"

        "gopl.io/ch8/links"
)
 
func main() { 
        worklist := make(chan []string) 
        unseenLinks := make(chan string) 
 
        // Start with the command-line arguments. 
        go func() { worklist <- os.Args[1:] }()  
        // Create 20 crawler goroutines to fetch each unseen link.
        for i := 0; i < 20; i++ {
                go func() {
                        for link := range unseenLinks {
                                foundLinks := crawl(link)
                                // Send from a new goroutine to avoid deadlock:
                                // the main goroutine may itself be blocked
                                // sending on unseenLinks.
                                go func() { worklist <- foundLinks }()
                        }
                }()
        }
 
        // The main goroutine de-duplicates worklist items 
        // and sends the unseen ones to the crawlers. 
        seen := make(map[string]bool) 
        for list := range worklist { 
                for _, link := range list { 
                        if !seen[link] { 
                                seen[link] = true 
                                unseenLinks <- link 
                        }    
                }    
        }    
} 
 
// crawl prints the URL, fetches the page, and returns the links found
// on it. No semaphore is needed here: the fixed pool of 20 workers
// already bounds concurrency.
func crawl(url string) []string {
        fmt.Println(url)
        list, err := links.Extract(url)
        if err != nil {
                log.Print(err)
        }
        return list
}
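The fixed-pool idiom generalizes beyond crawling. The toy program below is a self-contained sketch of the same shape with the network removed (the jobs/results names are illustrative, not from the original): a fixed number of workers range over one shared channel, so the pool of consumers, not a semaphore, bounds concurrency.

package main

import "fmt"

func main() {
        jobs := make(chan int)
        results := make(chan string)

        // A fixed pool of 3 workers, mirroring the 20 crawler goroutines:
        // each ranges over the shared jobs channel until it is closed.
        for w := 0; w < 3; w++ {
                go func(id int) {
                        for j := range jobs {
                                results <- fmt.Sprintf("worker %d handled job %d", id, j)
                        }
                }(w)
        }

        // Produce a few jobs, then close the channel so the workers exit.
        go func() {
                for j := 1; j <= 5; j++ {
                        jobs <- j
                }
                close(jobs)
        }()

        for i := 0; i < 5; i++ {
                fmt.Println(<-results)
        }
}

Unlike this toy, neither crawler above terminates on its own: unseenLinks is never closed, and every fetched page feeds more links back into worklist.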
