goquery 是一个使用 Go 语言编写的 HTML 解析库，可以让你以类似 jQuery 的方式来操作 DOM 文档。
Go 著名的爬虫框架 colly 的 HTML 回调就是基于 goquery 实现的。
package main
import (
"github.com/PuerkitoBio/goquery"
"fmt"
"os"
"strings"
)
// GetAllCitySites fetches the chapter index page of a novel site and
// appends each chapter's name and relative URL to ./xiaoshuo.txt, one
// "name url" pair per line.
//
// NOTE(review): the function name is a leftover from an earlier
// city-site crawler; the current body scrapes novel chapter links.
func GetAllCitySites() {
	// Original code ignored this error; a failed open would make every
	// later fd.Write fail silently.
	fd, err := os.OpenFile("./xiaoshuo.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		panic(err)
	}
	// Close the file even if a later step panics.
	defer fd.Close()

	citySiteUrl := "https://www.23wx.cc/du/80/80892/"
	// goquery.NewDocument performs the HTTP GET itself; it is deprecated
	// upstream — newer code should use http.Get + NewDocumentFromReader.
	query, err := goquery.NewDocument(citySiteUrl)
	if err != nil {
		panic(err)
	}
	// Each chapter link lives in a <dd><a href="..."> element.
	query.Find("dd").Find("a").Each(func(i int, s *goquery.Selection) {
		partUri, _ := s.Attr("href") // a missing href yields ""
		partName := s.Text()
		// The original joined literal " " elements WITH a " " separator,
		// double-spacing the output; write a single separating space.
		line := strings.Join([]string{partName, partUri}, " ") + "\n"
		if _, err := fd.Write([]byte(line)); err != nil {
			panic(err)
		}
	})
	fmt.Println("==========全部抓取完成==============")
}
func main() {
GetAllCitySites()
}go mod tidy go run main.go #go build
package main
import (
"fmt"
"math/rand"
"github.com/crawlab-team/crawlab-go-sdk/entity"
"github.com/gocolly/colly/v2"
)
// letterBytes is the alphabet RandomString draws characters from.
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// RandomString returns a random ASCII-letter string whose length is
// between 10 and 19 characters, inclusive.
func RandomString() string {
	length := rand.Intn(10) + 10
	out := make([]byte, 0, length)
	for len(out) < length {
		out = append(out, letterBytes[rand.Intn(len(letterBytes))])
	}
	return string(out)
}
// main crawls blog.zhonggeyuan.cn with colly, printing the title and
// URL of every article on each listing page, and follows the
// "next page" link until the listing is exhausted.
func main() {
	// Build the colly collector, restricted to the target domain and
	// running requests asynchronously.
	c := colly.NewCollector(
		colly.AllowedDomains("blog.zhonggeyuan.cn"),
		colly.Async(true),
		//colly.UserAgent(RandomString()),
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"),
	)
	// Result hook: fires once per article entry on a listing page.
	c.OnHTML(".article-list .article", func(e *colly.HTMLElement) {
		item := entity.Item{
			"title": e.ChildText("h2.title > a"),
			"url":   e.ChildAttr("h2.title > a", "href"),
		}
		// Print the scraped item.
		fmt.Println(item)
		// Uncomment to persist via the Crawlab Go SDK.
		//_ = crawlab.SaveItem(item)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})
	// Pagination hook: the second-to-last <li> of the pager holds the
	// "next page" link; queue it for crawling. Errors were silently
	// discarded before; log them (ErrAlreadyVisited is benign here).
	c.OnHTML(".pager li:nth-last-child(2) a", func(e *colly.HTMLElement) {
		fmt.Println("page:", e.Attr("href"))
		if err := c.Visit("https://blog.zhonggeyuan.cn" + e.Attr("href")); err != nil {
			fmt.Println("visit next page failed:", err)
		}
	})
	// Kick off the crawl from the landing page.
	startUrl := "https://blog.zhonggeyuan.cn"
	if err := c.Visit(startUrl); err != nil {
		fmt.Println("visit start URL failed:", err)
	}
	// Async mode: block until every queued request has finished.
	c.Wait()
}
访客评论