goquery is an HTML parsing library written in Go that lets you work with a DOM document in much the same way you would with jQuery.
colly, the well-known Go crawler framework, is itself built on top of goquery.
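Before looking at a full scraper, a minimal sketch shows what the jQuery-style API feels like; the HTML snippet and selectors below are made up purely for illustration:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// A small in-memory HTML document, just for demonstration.
	html := `<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}

	// jQuery-style selection: find every <a> inside an <li>, read its text and href.
	doc.Find("li a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		fmt.Println(i, s.Text(), href)
	})
}
```

The complete program below applies the same Find/Each pattern to a real page: it collects every chapter link from a novel's index page and appends them to a text file.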
package main import ( "github.com/PuerkitoBio/goquery" "fmt" "os" "strings" ) //读取所有的城市分站 func GetAllCitySites() { fd,_:=os.OpenFile("./xiaoshuo.txt",os.O_RDWR|os.O_CREATE|os.O_APPEND,0644) citySiteUrl := "https://www.23wx.cc/du/80/80892/" query, err := goquery.NewDocument(citySiteUrl) if err != nil { panic(err) } query.Find("dd").Find("a").Each(func(i int, s *goquery.Selection) { partUri, _ := s.Attr("href") partName := s.Text() fd_content:=strings.Join([]string{partName," ",partUri,"\n"}," ") buf:=[]byte(fd_content) fd.Write(buf) /* // 遍历每个节点,查找对应内容 prov := s.Find("span").First().Text() //省份 s.Find("a").Each(func(i int, selection *goquery.Selection) { // 遍历每个节点,查找对应内容 href, _ := selection.Attr("href") text := selection.Text() fd_content:=strings.Join([]string{prov," ",text," ",href,"\n"}," ") buf:=[]byte(fd_content) fd.Write(buf) })*/ }) fmt.Println("==========全部抓取完成==============") fd.Close() } func main() { GetAllCitySites() }
```
go mod tidy
go run main.go
# go build
```
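One detail worth noting: go mod tidy needs an existing go.mod. If you are starting from an empty directory, initialize the module first (the module path below is just a placeholder); tidy will then pull in github.com/PuerkitoBio/goquery automatically:

```
go mod init example.com/goquery-demo
```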
package main import ( "fmt" "math/rand" "github.com/crawlab-team/crawlab-go-sdk/entity" "github.com/gocolly/colly/v2" ) const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" func RandomString() string { b := make([]byte, rand.Intn(10)+10) for i := range b { b[i] = letterBytes[rand.Intn(len(letterBytes))] } return string(b) } func main() { // 生成 colly 采集器 c := colly.NewCollector( colly.AllowedDomains("blog.zhonggeyuan.cn"), colly.Async(true), //colly.UserAgent(RandomString()), colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"), ) // 抓取结果数据钩子函数 c.OnHTML(".article-list .article", func(e *colly.HTMLElement) { // 抓取结果实例 item := entity.Item{ "title": e.ChildText("h2.title > a"), "url": e.ChildAttr("h2.title > a", "href"), } // 打印抓取结果 fmt.Println(item) // 取消注释调用 Crawlab Go SDK 存入数据库 //_ = crawlab.SaveItem(item) }) c.OnError(func(r *colly.Response, err error) { fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err) }) // 分页钩子函数 ul倒数第二个子元素(下一页)的url,循环采集 c.OnHTML(".pager li:nth-last-child(2) a", func(e *colly.HTMLElement) { fmt.Println("page:", e.Attr("href")) _ = c.Visit("https://blog.zhonggeyuan.cn" + e.Attr("href")) }) // 访问初始 URL startUrl := "https://blog.zhonggeyuan.cn" _ = c.Visit(startUrl) // 等待爬虫结束 c.Wait() }