前几天和BBAE的销售大佬聊天, 他说他家的网站内容随便爬, 我说好, 所以就趁着周末撸了一个爬虫; 这个爬虫特意用了 FAN-OUT 和 FAN-IN 模式, 对这块感兴趣的同学可以自己看一下。
代码如下:
package jobs
import (
"bytes"
"encoding/json"
"fmt"
"github.com/gocolly/colly"
"net/http"
"sync"
)
// Article is one crawled post as it moves through the pipeline and as it is
// serialized to JSON for the sync API.
type Article struct {
	// Title is the text of the headline link on the list page.
	Title string `json:"title"`
	// SourceUrl is the link target of the headline; it is the URL visited to
	// fetch the article page.
	SourceUrl string `json:"source_url"`
	// Body is the content fetched for the article page; it stays empty until
	// the detail stage fills it in.
	Body string `json:"body"`
}
// SpiderList crawls the 5imeigu.com front page and streams one Article per
// headline link found there.
//
// Each emitted Article has only Title (the link text) and SourceUrl (the link
// target, resolved to an absolute URL) set; Body is left empty. The returned
// channel is closed once the page visit has finished, whether or not it
// succeeded. A failed visit is reported on standard output and results in a
// channel that is closed without any articles.
func SpiderList() <-chan Article {
	out := make(chan Article, 100)
	go func() {
		defer close(out)

		c := colly.NewCollector()
		c.OnHTML("div.la-list02 a.htitle-color", func(r *colly.HTMLElement) {
			// Resolve the href against the page URL so relative links can
			// still be visited by the detail stage. Links that cannot be
			// resolved (or are bare fragments) come back empty and are
			// skipped, since there is nothing to fetch for them.
			link := r.Request.AbsoluteURL(r.Attr("href"))
			if link == "" {
				return
			}
			out <- Article{SourceUrl: link, Title: r.Text}
		})

		if err := c.Visit("https://5imeigu.com/"); err != nil {
			fmt.Printf("抓取列表页失败: %v\n", err)
		}
	}()
	return out
}
// SpiderDetail is one detail worker of the pipeline: it visits the SourceUrl
// of every Article received on articles and forwards the article, with Body
// filled in, on the returned channel.
//
// An article is forwarded only when its page contains a "div.data-article"
// element; pages without one, and pages whose visit fails, produce no output.
// Failed visits are reported on standard output. The returned channel is
// closed after articles has been closed and drained. Several workers may read
// from the same articles channel to fan the work out.
func SpiderDetail(articles <-chan Article) <-chan Article {
	out := make(chan Article, 10)
	go func() {
		defer close(out)
		for article := range articles {
			c := colly.NewCollector()
			c.OnHTML("div.data-article", func(r *colly.HTMLElement) {
				// NOTE(review): Body receives the whole response body, not
				// just the matched element; the selector only acts as an
				// "is this an article page" filter. A page with several
				// matching elements would be emitted once per match —
				// confirm both are intended.
				article.Body = string(r.Response.Body)
				out <- article
			})

			if err := c.Visit(article.SourceUrl); err != nil {
				fmt.Printf("抓取文章失败 %s: %v\n", article.SourceUrl, err)
			}
		}
	}()
	return out
}
// Merge fans in any number of Article channels into a single channel.
//
// One forwarding goroutine is started per input. When every input has been
// closed and drained, "全部完成...." is printed and the returned channel is
// closed. The returned channel is unbuffered, so forwarding proceeds only as
// fast as the caller receives. No ordering between inputs is guaranteed.
func Merge(inputs ...<-chan Article) <-chan Article {
	merged := make(chan Article)

	var wg sync.WaitGroup
	wg.Add(len(inputs))
	for _, src := range inputs {
		go func(src <-chan Article) {
			defer wg.Done()
			for item := range src {
				merged <- item
			}
		}(src)
	}

	// Close the output only after every forwarder is done, so that no
	// forwarder can send on a closed channel.
	go func() {
		wg.Wait()
		fmt.Println("全部完成....")
		close(merged)
	}()

	return merged
}
// SyncTopicToInvest POSTs article, JSON-encoded, to the investguider API.
//
// It panics if the payload cannot be encoded, the request cannot be built, or
// the HTTP round trip fails. A response with a non-2xx status does not panic;
// it is reported on standard output so a rejected sync is not mistaken for a
// successful one.
func SyncTopicToInvest(article Article) {
	const endpoint = "https://investguider.com/api接口之马赛克"

	var payload bytes.Buffer
	if err := json.NewEncoder(&payload).Encode(&article); err != nil {
		panic(err)
	}

	// Check the construction error before touching req: on failure req is
	// nil and setting a header on it would crash with a less useful message.
	req, err := http.NewRequest(http.MethodPost, endpoint, &payload)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	// NOTE(review): this client has no timeout, so a stalled server blocks
	// the caller indefinitely; consider a shared client with Timeout set.
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		fmt.Printf("同步失败 %s: %s\n", article.SourceUrl, resp.Status)
	}
}
// SpiderXueqiuCompany runs the whole crawl pipeline and blocks until it has
// finished.
//
// The article list is fanned out to two SpiderDetail workers reading from the
// same channel, and their outputs are fanned back in with Merge. Every merged
// article is logged and pushed with SyncTopicToInvest. The return value is
// whatever SuccessResp produces for a {"title": "success"} payload.
func SpiderXueqiuCompany() interface{} {
	listing := SpiderList()

	// Fan-out: both workers pull from the same listing channel.
	merged := Merge(SpiderDetail(listing), SpiderDetail(listing))

	// Fan-in: drain the merged stream until Merge closes it.
	for {
		art, ok := <-merged
		if !ok {
			break
		}
		fmt.Println("完成: ", art.SourceUrl)
		SyncTopicToInvest(art)
	}

	return SuccessResp(map[string]string{"title": "success"})
}
运行结果如下:
2019/12/01 15:21:10 start consume persist.5imeigu
完成: https://5imeigu.com/archives/1476
完成: https://5imeigu.com/archives/1466
完成: https://5imeigu.com/archives/1453
完成: https://5imeigu.com/archives/1450
完成: https://5imeigu.com/archives/1445
完成: https://5imeigu.com/archives/1442
完成: https://5imeigu.com/archives/1440
完成: https://5imeigu.com/archives/1430
完成: https://5imeigu.com/archives/1414
完成: https://5imeigu.com/archives/1413
全部完成....
- 更多阅读推荐 -
一张图带你了解英股市场
技术分析 - 压力位与支撑位
技术分析 - RSI 指标说明
美股技术分析 - MA、EMA、MACD 指标说明
赠送《原则》等十余本电子书
< END >
点一下你会更好看耶↓↓↓
文章评论