goquery is an HTML parsing library written in Go that lets you work with a DOM document in much the same way you would with jQuery.
colly, the well-known Go crawler framework, is itself built on top of goquery.
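Before looking at a full scraper, a minimal sketch shows what the jQuery-style API feels like; the HTML snippet and selectors below are made up purely for illustration:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// A small in-memory HTML document, just for demonstration.
	html := `<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}

	// jQuery-style selection: find every <a> inside an <li>, read its text and href.
	doc.Find("li a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		fmt.Println(i, s.Text(), href)
	})
}
```

The complete program below applies the same Find/Each pattern to a real page: it collects every chapter link from a novel's index page and appends them to a text file.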
package main import ( "github.com/PuerkitoBio/goquery" "fmt" "os" "strings" ) //读取所有的城市分站 func GetAllCitySites() { fd,_:=os.OpenFile("./xiaoshuo.txt",os.O_RDWR|os.O_CREATE|os.O_APPEND,0644) citySiteUrl := "https://www.23wx.cc/du/80/80892/" query, err := goquery.NewDocument(citySiteUrl) if err != nil { panic(err) } query.Find("dd").Find("a").Each(func(i int, s *goquery.Selection) { partUri, _ := s.Attr("href") partName := s.Text() fd_content:=strings.Join([]string{partName," ",partUri,"\n"}," ") buf:=[]byte(fd_content) fd.Write(buf) /* // 遍历每个节点,查找对应内容 prov := s.Find("span").First().Text() //省份 s.Find("a").Each(func(i int, selection *goquery.Selection) { // 遍历每个节点,查找对应内容 href, _ := selection.Attr("href") text := selection.Text() fd_content:=strings.Join([]string{prov," ",text," ",href,"\n"}," ") buf:=[]byte(fd_content) fd.Write(buf) })*/ }) fmt.Println("==========全部抓取完成==============") fd.Close() } func main() { GetAllCitySites() }
```
go mod tidy
go run main.go
# go build
```
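One detail worth noting: go mod tidy needs an existing go.mod. If you are starting from an empty directory, initialize the module first (the module path below is just a placeholder); tidy will then pull in github.com/PuerkitoBio/goquery automatically:

```
go mod init example.com/goquery-demo
```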
package main import ( "fmt" "math/rand" "github.com/crawlab-team/crawlab-go-sdk/entity" "github.com/gocolly/colly/v2" ) const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" func RandomString() string { b := make([]byte, rand.Intn(10)+10) for i := range b { b[i] = letterBytes[rand.Intn(len(letterBytes))] } return string(b) } func main() { // 生成 colly 采集器 c := colly.NewCollector( colly.AllowedDomains("blog.zhonggeyuan.cn"), colly.Async(true), //colly.UserAgent(RandomString()), colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"), ) // 抓取结果数据钩子函数 c.OnHTML(".article-list .article", func(e *colly.HTMLElement) { // 抓取结果实例 item := entity.Item{ "title": e.ChildText("h2.title > a"), "url": e.ChildAttr("h2.title > a", "href"), } // 打印抓取结果 fmt.Println(item) // 取消注释调用 Crawlab Go SDK 存入数据库 //_ = crawlab.SaveItem(item) }) c.OnError(func(r *colly.Response, err error) { fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err) }) // 分页钩子函数 ul倒数第二个子元素(下一页)的url,循环采集 c.OnHTML(".pager li:nth-last-child(2) a", func(e *colly.HTMLElement) { fmt.Println("page:", e.Attr("href")) _ = c.Visit("https://blog.zhonggeyuan.cn" + e.Attr("href")) }) // 访问初始 URL startUrl := "https://blog.zhonggeyuan.cn" _ = c.Visit(startUrl) // 等待爬虫结束 c.Wait() }