如今,随着互联网技术的不断发展,网络爬虫已经成为了一项非常重要的技能
而golang作为一门新兴的编程语言,已经得到了广泛的应用。本课程将为大家介绍golang爬虫的使用方式
由于爬虫的不稳定性,很多写法今天可以用,可能明天就用不了了
所以本课程自己写了一个服务端作为被爬取的对象,简称"自己爬自己"
package main
import (
"encoding/json"
"fmt"
"github.com/gin-gonic/gin"
"io"
)
// ping is the handler registered for GET /ping. It only logs that a request
// arrived and writes nothing to the response itself.
func ping(c *gin.Context) {
	const msg = "成功请求"
	fmt.Println(msg)
}
// get is the handler registered for GET /get. It logs the request method
// and writes nothing to the response itself.
func get(c *gin.Context) {
	const msg = "get请求"
	fmt.Println(msg)
}
// post is the handler registered for POST /post. It logs the request method
// and writes nothing to the response itself.
func post(c *gin.Context) {
	const msg = "post请求"
	fmt.Println(msg)
}
// put is the handler registered for PUT /put. It logs the request method
// and writes nothing to the response itself.
func put(c *gin.Context) {
	const msg = "put请求"
	fmt.Println(msg)
}
// Delete is the handler registered for DELETE /delete. It logs the request
// method and writes nothing to the response itself.
func Delete(c *gin.Context) {
	const msg = "delete请求"
	fmt.Println(msg)
}
// form is the handler registered for POST /form. It reads the raw request
// body and logs it together with the read error and the request's
// Content-Type header, so the client-side encoding can be inspected.
func form(c *gin.Context) {
	raw, readErr := io.ReadAll(c.Request.Body)
	contentType := c.Request.Header.Get("Content-Type")
	body := string(raw)
	fmt.Println(body, readErr, contentType)
}
// jsonM is the handler registered for POST /json. It logs the raw request
// body followed by the error returned while reading it.
func jsonM(c *gin.Context) {
	raw, readErr := io.ReadAll(c.Request.Body)
	body := string(raw)
	fmt.Println(body, readErr)
}
// query is the handler registered for GET /query. It logs the parsed URL
// query parameters as JSON, followed by the marshalling error.
func query(c *gin.Context) {
	params := c.Request.URL.Query()
	encoded, marshalErr := json.Marshal(params)
	fmt.Println(string(encoded), marshalErr)
}
// head is the handler registered for GET /head. It logs all request headers
// as JSON, followed by the marshalling error.
func head(c *gin.Context) {
	headers := c.Request.Header
	encoded, marshalErr := json.Marshal(headers)
	fmt.Println(string(encoded), marshalErr)
}
// file is the handler registered for POST /file. It takes the upload from the
// multipart form field "file" and stores it under uploads/file/ using the
// file name supplied by the client.
//
// Failures are logged and reported to the client instead of being dropped:
// a missing or unreadable upload answers 400, a failed save answers 500.
// On success nothing is written to the response, as before.
func file(c *gin.Context) {
	fileHeader, err := c.FormFile("file")
	if err != nil {
		fmt.Println(err)
		c.JSON(400, gin.H{"code": 1, "msg": err.Error()})
		return
	}
	fmt.Println(fileHeader.Filename)

	// NOTE(review): Filename is client-controlled; confirm it is already
	// reduced to a base name before it is joined into a server-side path.
	dst := "uploads/file/" + fileHeader.Filename
	if err := c.SaveUploadedFile(fileHeader, dst); err != nil {
		fmt.Println(err)
		c.JSON(500, gin.H{"code": 1, "msg": err.Error()})
		return
	}
}
// getFile is the handler registered for GET /get_file. It serves
// uploads/image.jpg with a Content-Disposition header that marks it as an
// attachment named image.jpg.
func getFile(c *gin.Context) {
	const downloadName = "image.jpg"
	disposition := fmt.Sprintf(`attachment; filename="%s"`, downloadName)
	c.Header("Content-Disposition", disposition)
	c.File("uploads/image.jpg")
}
// getJson is the handler registered for GET /get_json. It answers 200 with a
// fixed JSON envelope: a zero code, a placeholder message and an empty data
// object.
func getJson(c *gin.Context) {
	payload := gin.H{
		"code": 0,
		"msg":  "xxx",
		"data": gin.H{},
	}
	c.JSON(200, payload)
}
// getHtml is the handler registered for GET /get_html. It renders the
// index.html template with no template data.
func getHtml(c *gin.Context) {
	const page = "index.html"
	c.HTML(200, page, nil)
}
// douban is the handler registered for GET /douban. It renders the
// douban.html template with no template data.
func douban(c *gin.Context) {
	const page = "douban.html"
	c.HTML(200, page, nil)
}
// main wires every demo endpoint to its handler and serves them on :7070.
// Templates are loaded from the template/ directory.
func main() {
	router := gin.Default()
	router.LoadHTMLGlob("template/**")

	// Plain request-method demos.
	router.GET("/ping", ping)
	router.GET("/get", get)
	router.POST("/post", post)
	router.POST("/form", form)
	router.POST("/json", jsonM)
	router.PUT("/put", put)
	router.DELETE("/delete", Delete)

	// Parameter and header inspection.
	router.GET("/query", query)
	router.GET("/head", head)

	// Upload, download and response-format demos.
	router.POST("/file", file)
	router.GET("/get_file", getFile)
	router.GET("/get_json", getJson)
	router.GET("/get_html", getHtml)
	router.GET("/douban", douban)

	// The error from Run used to be discarded, so a server that could not
	// start exited without any message.
	if err := router.Run(":7070"); err != nil {
		fmt.Println(err)
	}
}
1. 网络请求
package main
import "net/http"
// main sends a bare GET request to the local /ping endpoint.
func main() {
	res, err := http.Get("http://127.0.0.1:7070/ping")
	if err != nil {
		// Without a reachable server there is nothing left to demonstrate.
		panic(err)
	}
	// The body has to be closed even when it is not read; otherwise the
	// underlying connection is leaked.
	defer res.Body.Close()
}
如果你只是想对一个url发起一个get请求,不带任何头部信息,那么http.Get方法非常适合你
2. 不同的请求方式
post请求用的最多,我们先来看看两种不同的post请求
- 通过 kv 形式传送,例如 form-data 和 x-www-form-urlencoded
- 通过 json 形式传送,例如 application/json
两种post请求
package main
import (
"bytes"
"net/http"
"net/url"
)
// main sends the same kind of POST in two encodings: a URL-encoded form to
// /form and a JSON document to /json.
func main() {
	param := url.Values{}
	param.Add("name", "test")
	param.Add("age", "22")

	// application/x-www-form-urlencoded
	res, err := http.PostForm("http://127.0.0.1:7070/form", param)
	if err != nil {
		panic(err)
	}
	// Close each response body so its connection is not leaked.
	res.Body.Close()

	// application/json
	params := bytes.NewBuffer([]byte(`{"data": "哈哈哈", "code": 0}`))
	res, err = http.Post("http://127.0.0.1:7070/json", "application/json", params)
	if err != nil {
		panic(err)
	}
	res.Body.Close()
}
在服务端,接收到的值为:
age=22&name=test
{"data": "哈哈哈", "code": 0}
如果想要发送put,delete这样的请求,我们则需要构造请求方式
req, _ := http.NewRequest("PUT", "http://localhost:7070/put", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("POST", "http://localhost:7070/post", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("DELETE", "http://localhost:7070/delete", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("GET", "http://localhost:7070/get", nil)
http.DefaultClient.Do(req)
3. 不同的参数
携带查询参数
- 直接写在url上
- 构造url.Values{}
http.Get("http://127.0.0.1:7070/query?name=zhangsan&name=xxx&age=23")
client, _ := http.NewRequest("GET", "http://127.0.0.1:7070/query", nil)
query := url.Values{}
query.Add("name", "zhangsan")
client.URL.RawQuery = query.Encode()
http.DefaultClient.Do(client)
body参数
package main
import (
	"bytes"
	"mime/multipart"
	"net/http"
	"net/url"
)
// main posts the same key/value pairs to /form twice: once URL-encoded and
// once as a real multipart/form-data body.
func main() {
	param := url.Values{}
	param.Add("name", "test")
	param.Add("age", "22")

	// application/x-www-form-urlencoded
	res, err := http.PostForm("http://127.0.0.1:7070/form", param)
	if err != nil {
		panic(err)
	}
	res.Body.Close()

	// multipart/form-data
	// A multipart body needs a boundary and one part per field; sending the
	// URL-encoded string under this content type is not a valid request.
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	for key, values := range param {
		for _, value := range values {
			if err := writer.WriteField(key, value); err != nil {
				panic(err)
			}
		}
	}
	// Close writes the terminating boundary and must happen before sending.
	if err := writer.Close(); err != nil {
		panic(err)
	}
	// FormDataContentType carries the boundary parameter the server needs.
	res, err = http.Post("http://127.0.0.1:7070/form", writer.FormDataContentType(), body)
	if err != nil {
		panic(err)
	}
	res.Body.Close()
}
它们两个都是以键值对的方式传输数据
个人认为,form-data和x-www-form-urlencoded的区别就是,form-data可以传文件
https://www.jianshu.com/p/cbc34df2f008
json参数
package main
import (
"bytes"
"encoding/json"
"net/http"
)
// main marshals a nested map to JSON and posts it to /json.
func main() {
	byteData, err := json.Marshal(map[string]any{
		"name": "枫枫",
		"age":  23,
		"data": map[string]string{
			"name": "name",
		},
	})
	if err != nil {
		panic(err)
	}

	res, err := http.Post("http://127.0.0.1:7070/json", "application/json", bytes.NewBuffer(byteData))
	if err != nil {
		panic(err)
	}
	// Close the body so the connection is not leaked.
	defer res.Body.Close()
}
head参数
package main
import (
"net/http"
)
// main sends a GET to /head with custom headers, contrasting Header.Set
// (replace) with Header.Add (append).
func main() {
	request, err := http.NewRequest("GET", "http://127.0.0.1:7070/head", nil)
	if err != nil {
		panic(err)
	}

	// Set then Add: "name" ends up with both values.
	request.Header.Set("name", "zhangsan")
	request.Header.Add("name", "wangwu")
	// Add then Set: the later Set replaces what Add stored for "nb".
	request.Header.Add("nb", "xxx")
	request.Header.Set("nb", "yyy")

	res, err := http.DefaultClient.Do(request)
	if err != nil {
		panic(err)
	}
	// Close the body so the connection is not leaked.
	defer res.Body.Close()
}
Set和Add的区别:
当我们使用Set的时候,如果这一项已存在,就会用新值覆盖已有的值。所以这里nb最终的结果就是yyy
当使用Add的时候,如果原本不存在,则添加,如果已存在,就再追加一个。所以这里name最终有zhangsan和wangwu两个值
cookie参数
package main
import (
"net/http"
)
// main sends a GET to /head carrying two cookies, which the server sees in
// the Cookie request header.
func main() {
	request, err := http.NewRequest("GET", "http://127.0.0.1:7070/head", nil)
	if err != nil {
		panic(err)
	}
	request.AddCookie(&http.Cookie{Name: "name", Value: "zhangsan"})
	request.AddCookie(&http.Cookie{Name: "age", Value: "123"})

	res, err := http.DefaultClient.Do(request)
	if err != nil {
		panic(err)
	}
	// Close the body so the connection is not leaked.
	defer res.Body.Close()
}
文件
package main
import (
"bytes"
"fmt"
"io"
"log"
"mime/multipart"
"net/http"
"os"
)
// main uploads the local file server.go to /file as a multipart form, using
// the form field "file" and the remote file name server_1.go.
func main() {
	bodyBuf := &bytes.Buffer{}
	bodyWrite := multipart.NewWriter(bodyBuf)

	// Open the local file that will be uploaded.
	file, err := os.Open("server.go")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	// Create the file part of the form. The error must be checked here:
	// on failure fileWrite is unusable and copying into it would crash.
	fileWrite, err := bodyWrite.CreateFormFile("file", "server_1.go")
	if err != nil {
		log.Println(err)
		return
	}

	// Stream the local file into the form part.
	if _, err = io.Copy(fileWrite, file); err != nil {
		log.Println(err)
		return
	}

	// Close writes the terminating boundary; it has to run before the
	// buffer is sent.
	if err = bodyWrite.Close(); err != nil {
		log.Println(err)
		return
	}

	contentType := bodyWrite.FormDataContentType()
	res, err := http.Post("http://127.0.0.1:7070/file", contentType, bodyBuf)
	if err != nil {
		log.Println(err)
		return
	}
	defer res.Body.Close()
}
4. 响应
json数据
package main
import (
"encoding/json"
"fmt"
"io"
"net/http"
)
// main fetches /get_json, prints the raw body, then decodes it into a
// generic map and prints that too.
func main() {
	res, err := http.Get("http://127.0.0.1:7070/get_json")
	if err != nil {
		// res is nil on failure, so reading res.Body would crash.
		fmt.Println(err)
		return
	}
	defer res.Body.Close()

	byteData, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(byteData))

	var data map[string]any
	if err := json.Unmarshal(byteData, &data); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(data)
}
文本数据
package main
import (
"fmt"
"io"
"net/http"
)
// main fetches /get_html and prints the response body as text.
func main() {
	res, err := http.Get("http://127.0.0.1:7070/get_html")
	if err != nil {
		// res is nil on failure, so reading res.Body would crash.
		fmt.Println(err)
		return
	}
	defer res.Body.Close()

	byteData, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(byteData))
}
文件
package main
import (
"io"
"net/http"
"os"
)
// main downloads /get_file and saves the bytes to uploads/file/image.jpg.
func main() {
	res, err := http.Get("http://127.0.0.1:7070/get_file")
	if err != nil {
		// res is nil on failure, so reading res.Body would crash.
		panic(err)
	}
	defer res.Body.Close()

	byteData, err := io.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}

	// 0644 = owner read/write, everyone else read-only. The previous mode
	// 077 gave the owner no permissions on the file just written.
	if err := os.WriteFile("uploads/file/image.jpg", byteData, 0644); err != nil {
		panic(err)
	}
}
5. goquery
爬虫很大概率会获取html中的一些数据
goquery就是一个很好用的解析html的第三方库
go get github.com/PuerkitoBio/goquery
douban.html
<!DOCTYPE html>
<html lang="zh-CN" class="ua-windows ua-webkit">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>豆瓣电影</title>
</head>
<body>
<div id="wrapper">
<div id="content">
<div class="grid-16-8 clearfix">
<div class="article">
<div id="screening" class="s" data-dstat-areaid="70" data-dstat-mode="click,expose">
<div class="screening-bd">
<ul class="ui-slide-content" data-slide-index="1" data-index-max="7">
<li class="ui-slide-item" data-title="长空之王" data-release="2023" data-rate="6.6" data-star="35" data-trailer="https://movie.douban.com/subject/35209731/trailer" data-ticket="https://movie.douban.com/ticket/redirect/?movie_id=35209731" data-duration="127分钟" data-region="中国大陆" data-director="刘晓世" data-actors="王一博 / 胡军 / 周冬雨" data-intro="" data-enough="true" data-rater="226341">
<ul>
<li class="poster"><a href="https://movie.douban.com/subject/35209731/?from=showing"><img src="https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2889598060.webp" alt="长空之王" rel="nofollow" class=""/></a></li>
<li class="title"><a href="https://movie.douban.com/subject/35209731/?from=showing" class="">长空之王</a></li>
<li class="rating"><span class="rating-star allstar35"></span><span class="subject-rate">6.6</span></li>
<li class="ticket_btn"><span><a href="https://movie.douban.com/ticket/redirect/?movie_id=35209731" target="_blank">选座购票</a></span></li>
</ul>
</li>
<li class="ui-slide-item" data-title="宇宙探索编辑部" data-release="2021" data-rate="8.1" data-star="40" data-trailer="https://movie.douban.com/subject/34941536/trailer" data-ticket="https://movie.douban.com/ticket/redirect/?movie_id=34941536" data-duration="118分钟" data-region="中国大陆" data-director="孔大山" data-actors="杨皓宇 / 艾丽娅 / 王一通" data-intro="" data-enough="true" data-rater="316005">
<ul>
<li class="poster"><a href="https://movie.douban.com/subject/34941536/?from=showing"><img src="https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2889865405.webp" alt="宇宙探索编辑部"/></a></li>
<li class="title"><a href="https://movie.douban.com/subject/34941536/?from=showing" class="">宇宙探索编辑...</a></li>
<li class="rating"><span class="rating-star allstar40"></span><span class="subject-rate">8.1</span></li>
<li class="ticket_btn"><span><a href="https://movie.douban.com/ticket/redirect/?movie_id=34941536" target="_blank">选座购票</a></span></li>
</ul>
</li>
</ul>
</div>
</div>
<div id="gallery-frames">
<div id="hot-gallery">
<ul class='ui-slide-content'>
<li class='ui-slide-item'>
<div class="gallery-frame">
<a href="https://movie.douban.com/trailer/304261/?from=gallery" target="_blank"
data-fid="3719">
<img src="https://img9.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/dfb3da9cdc07a25.jpg"
alt="诺兰新片《奥本海默》中字新预告" width="350" height="240"/>
</a>
<div class="gallery-detail">
<div class="gallery-hd">
<a href="https://movie.douban.com/trailer/304261/?from=gallery"
target="_blank" data-fid="3719">
<h3>诺兰新片《奥本海默》中字新预告</h3>
</a>
</div>
<div class="gallery-bd">
<p>
克里斯托弗·诺兰编剧执导,基里安·墨菲饰演“原子弹之父”罗伯特·奥本海默,影片有望引进内地。
</p>
</div>
</div>
</div>
</li>
<li class='ui-slide-item'>
<div class="gallery-frame">
<a href="https://movie.douban.com/annual/2022?source=doubanmovie&fullscreen=1?from=gallery"
target="_blank" data-fid="3744">
<img src="https://img2.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/dc7b1d0f254a4af.jpg"
alt="「豆瓣2022年度电影榜单」上线" width="350" height="240"/>
</a>
<div class="gallery-detail">
<div class="gallery-hd">
<a href="https://movie.douban.com/annual/2022?source=doubanmovie&fullscreen=1?from=gallery"
target="_blank" data-fid="3744">
<h3>「豆瓣2022年度电影榜单」上线</h3>
</a>
</div>
<div class="gallery-bd">
<p>
点击查看完整榜单,开启全年佳片好剧大赏。
</p>
</div>
</div>
</div>
</li>
<li class='ui-slide-item'>
<div class="gallery-frame">
<a href="https://movie.douban.com/trailer/302881/?from=gallery" target="_blank"
data-fid="3743">
<img src="https://img1.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/9dd7ad4ff6beed7.jpg"
alt="真人电影《芭比》中字预告" width="350" height="240"/>
</a>
<div class="gallery-detail">
<div class="gallery-hd">
<a href="https://movie.douban.com/trailer/302881/?from=gallery"
target="_blank" data-fid="3743">
<h3>真人电影《芭比》中字预告</h3>
</a>
</div>
<div class="gallery-bd">
<p>
玛格特·罗比饰演芭比,瑞恩·高斯林饰演肯,7月21日北美上映。
</p>
</div>
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>
goquery使用
goquery就是jquery的go实现,会jquery就会goquery
本课程只讲一些高频使用的操作
https://blog.csdn.net/yang731227/article/details/89338745
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"net/http"
)
// main fetches the /douban page and demonstrates the most common goquery
// operations: reading text, inner HTML, an attribute, and iterating over a
// selection.
func main() {
	res, err := http.Get("http://127.0.0.1:7070/douban")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Close the body once the document has been parsed from it.
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Text of the <title> element.
	fmt.Println(doc.Find("title").Text())
	// Inner HTML of a matched node (Html also returns an error, printed too).
	fmt.Println(doc.Find(".gallery-hd:nth-child(1) h3").Html())
	// Attribute value (Attr also returns whether the attribute exists).
	fmt.Println(doc.Find(".gallery-hd:nth-child(1)>a").Attr("href"))
	// Visit every matched link.
	doc.Find(".gallery-hd > a").Each(func(i int, selection *goquery.Selection) {
		href, _ := selection.Attr("href")
		// Read the nested <h3> so only the title is printed, without the
		// whitespace that surrounds it inside the <a> element.
		h3 := selection.Find("h3").Text()
		fmt.Println(href, h3)
	})
}
6. 正则表达式
package main
import (
"fmt"
"regexp"
)
// main walks through the most frequently used regexp matching methods on the
// sample input "abc-123|hrd-134".
func main() {
	// The pattern is a constant, so MustCompile is used: a typo in it fails
	// loudly at startup instead of leaving r nil behind a discarded error.
	r := regexp.MustCompile(`[a-z]{3}-(\d+)`)

	// First match only: abc-123
	fmt.Println(r.FindString("abc-123|hrd-134"))
	// First match plus its capture group: [abc-123 123]
	fmt.Println(r.FindStringSubmatch("abc-123|hrd-134"))
	// Every match: [abc-123 hrd-134]
	fmt.Println(r.FindAllString("abc-123|hrd-134", -1))
	// Every match plus its capture group: [[abc-123 123] [hrd-134 134]]
	fmt.Println(r.FindAllStringSubmatch("abc-123|hrd-134", -1))
	// Whether the input matches at all: true
	fmt.Println(r.MatchString("abc-123"))
}