go 爬虫
如今,随着互联网技术的不断发展,网络爬虫已经成为了一项非常重要的技能。而golang作为一门新兴的编程语言,已经得到了广泛的应用。本课程将为大家介绍golang爬虫的使用方式。由于爬虫的不稳定性,很多写法今天可以用,可能明天就用不了了。
go 爬虫
发布时间:2023-10-11 (2023-10-11)

如今,随着互联网技术的不断发展,网络爬虫已经成为了一项非常重要的技能

而golang作为一门新兴的编程语言,已经得到了广泛的应用。本课程将为大家介绍golang爬虫的使用方式

由于爬虫的不稳定性,很多写法今天可以用,可能明天就用不了了

所以自己写了一个服务端,简称自己爬自己

package main

import (
  "encoding/json"
  "fmt"
  "github.com/gin-gonic/gin"
  "io"
)

// ping is registered for GET /ping. It only logs that a request arrived and
// writes nothing to the response itself.
func ping(c *gin.Context) {
  const msg = "成功请求"
  fmt.Println(msg)
}

// get is registered for GET /get. It logs the request method name and writes
// nothing to the response itself.
func get(c *gin.Context) {
  const msg = "get请求"
  fmt.Println(msg)
}
// post is registered for POST /post. It logs the request method name and
// writes nothing to the response itself.
func post(c *gin.Context) {
  const msg = "post请求"
  fmt.Println(msg)
}
// put is registered for PUT /put. It logs the request method name and writes
// nothing to the response itself.
func put(c *gin.Context) {
  const msg = "put请求"
  fmt.Println(msg)
}
// Delete is registered for DELETE /delete. It logs the request method name
// and writes nothing to the response itself.
func Delete(c *gin.Context) {
  const msg = "delete请求"
  fmt.Println(msg)
}

// form is registered for POST /form. It prints the raw request body as text,
// the error from reading it (nil on success) and the request's Content-Type
// header, in that order.
func form(c *gin.Context) {
  contentType := c.Request.Header.Get("Content-Type")
  raw, readErr := io.ReadAll(c.Request.Body)
  fmt.Println(string(raw), readErr, contentType)
}
// jsonM is registered for POST /json. It reads the whole request body and
// prints it as text followed by the read error (nil on success).
func jsonM(c *gin.Context) {
  payload, readErr := io.ReadAll(c.Request.Body)
  text := string(payload)
  fmt.Println(text, readErr)
}

// query is registered for GET /query. It prints the parsed URL query
// parameters as JSON followed by the marshal error (nil on success).
func query(c *gin.Context) {
  params := c.Request.URL.Query()
  encoded, marshalErr := json.Marshal(params)
  fmt.Println(string(encoded), marshalErr)
}
// head is registered for GET /head. It prints the request headers as JSON
// followed by the marshal error (nil on success).
func head(c *gin.Context) {
  headers := c.Request.Header
  encoded, marshalErr := json.Marshal(headers)
  fmt.Println(string(encoded), marshalErr)
}
// file is registered for POST /file. It expects a multipart form containing a
// part named "file", prints the uploaded file name and stores the upload
// under uploads/file/.
//
// A missing or unreadable part is answered with 400 and a failed save with
// 500, so the client can tell that the upload did not succeed.
func file(c *gin.Context) {
  fileHeader, err := c.FormFile("file")
  if err != nil {
    c.String(400, "read form file: %v", err)
    return
  }
  fmt.Println(fileHeader.Filename)
  if err := c.SaveUploadedFile(fileHeader, "uploads/file/"+fileHeader.Filename); err != nil {
    c.String(500, "save uploaded file: %v", err)
  }
}

// getFile is registered for GET /get_file. It serves uploads/image.jpg with a
// Content-Disposition header that asks the client to save it as image.jpg.
func getFile(c *gin.Context) {
  const downloadName = "image.jpg"
  disposition := fmt.Sprintf(`attachment; filename="%s"`, downloadName)
  c.Header("Content-Disposition", disposition)
  c.File("uploads/image.jpg")
}
// getJson is registered for GET /get_json. It answers 200 with a fixed JSON
// object holding "code", "msg" and an empty "data" object.
func getJson(c *gin.Context) {
  payload := gin.H{
    "code": 0,
    "msg":  "xxx",
    "data": gin.H{},
  }
  c.JSON(200, payload)
}
// getHtml is registered for GET /get_html. It renders the index.html
// template with no template data and status 200.
func getHtml(c *gin.Context) {
  const tmpl = "index.html"
  c.HTML(200, tmpl, nil)
}
// douban is registered for GET /douban. It renders the douban.html template
// with no template data and status 200.
func douban(c *gin.Context) {
  const tmpl = "douban.html"
  c.HTML(200, tmpl, nil)
}

// main registers every demo route on a gin engine and serves it on port 7070.
// The routes give the client examples a local target to send requests to.
func main() {
  router := gin.Default()
  // Templates rendered by the /get_html and /douban handlers.
  router.LoadHTMLGlob("template/**")

  router.GET("/ping", ping)
  router.GET("/get", get)
  router.POST("/post", post)
  router.POST("/form", form)
  router.POST("/json", jsonM)
  router.PUT("/put", put)
  router.DELETE("/delete", Delete)
  router.GET("/query", query)
  router.GET("/head", head)
  router.POST("/file", file)
  router.GET("/get_file", getFile)
  router.GET("/get_json", getJson)
  router.GET("/get_html", getHtml)
  router.GET("/douban", douban)

  // Run only returns when the server could not start or has stopped; print
  // the reason instead of dropping it.
  if err := router.Run(":7070"); err != nil {
    fmt.Println(err)
  }
}

1. 网络请求

package main

import "net/http"

// main sends a plain GET request, without custom headers, to the /ping route
// of the local test server.
func main() {
  res, err := http.Get("http://127.0.0.1:7070/ping")
  if err != nil {
    // net/http is the only package this example imports, so fail loudly
    // rather than dropping the error.
    panic(err)
  }
  // The body has to be closed even when it is not read.
  defer res.Body.Close()
}

如果你只是想对一个url发起一个get请求,不带任何头部信息,那么http.Get方法非常适合你

2. 不同的请求方式

post请求用的最多,我们先来看看两种不同的post请求

  1. 通过 kv 形式传送,例如 form-data、x-www-form-urlencoded
  2. 通过 json 形式传送,例如 application/json

两种post请求

package main

import (
  "bytes"
  "net/http"
  "net/url"
)

// main sends the two common kinds of POST request to the local test server:
// a URL-encoded key/value form and a JSON document.
func main() {
  // application/x-www-form-urlencoded: PostForm encodes the values and sets
  // the Content-Type header.
  form := url.Values{}
  form.Add("name", "test")
  form.Add("age", "22")
  res, err := http.PostForm("http://127.0.0.1:7070/form", form)
  if err != nil {
    // No logging package is imported by this example, so fail loudly.
    panic(err)
  }
  res.Body.Close()

  // application/json: the body is sent as-is with an explicit Content-Type.
  payload := bytes.NewBufferString(`{"data": "哈哈哈", "code": 0}`)
  res, err = http.Post("http://127.0.0.1:7070/json", "application/json", payload)
  if err != nil {
    panic(err)
  }
  res.Body.Close()
}

在服务端,接收到的值为:

age=22&name=test
{"data": "哈哈哈", "code": 0}

如果想要发送put,delete这样的请求,我们则需要构造请求方式

req, _ := http.NewRequest("PUT", "http://localhost:7070/put", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("POST", "http://localhost:7070/post", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("DELETE", "http://localhost:7070/delete", nil)
http.DefaultClient.Do(req)
req, _ = http.NewRequest("GET", "http://localhost:7070/get", nil)
http.DefaultClient.Do(req)

3. 不同的参数

携带查询参数

  1. 直接写在url上
  2. 构造url.Values{}
http.Get("http://127.0.0.1:7070/query?name=zhangsan&name=xxx&age=23")
client, _ := http.NewRequest("GET", "http://127.0.0.1:7070/query", nil)
query := url.Values{}
query.Add("name", "zhangsan")
client.URL.RawQuery = query.Encode()
http.DefaultClient.Do(client)

body参数

package main

import (
  "bytes"
  "log"
  "mime/multipart"
  "net/http"
  "net/url"
)

// main posts the same key/value pairs to /form twice: once URL-encoded and
// once as a real multipart/form-data body.
func main() {
  param := url.Values{}
  param.Add("name", "test")
  param.Add("age", "22")

  // application/x-www-form-urlencoded
  res, err := http.PostForm("http://127.0.0.1:7070/form", param)
  if err != nil {
    log.Fatal(err)
  }
  res.Body.Close()

  // multipart/form-data: every field becomes its own part, and the
  // Content-Type must carry the boundary that separates the parts. Sending a
  // URL-encoded body under a bare "multipart/form-data" header is not a valid
  // multipart request.
  body := &bytes.Buffer{}
  writer := multipart.NewWriter(body)
  for key, values := range param {
    for _, value := range values {
      if err := writer.WriteField(key, value); err != nil {
        log.Fatal(err)
      }
    }
  }
  // Close writes the final boundary; the body is incomplete without it.
  if err := writer.Close(); err != nil {
    log.Fatal(err)
  }
  res, err = http.Post("http://127.0.0.1:7070/form", writer.FormDataContentType(), body)
  if err != nil {
    log.Fatal(err)
  }
  res.Body.Close()
}

它们两个都是以键值对的方式传输数据

个人认为,form-data和x-www-form-urlencoded的区别就是,form-data可以传文件

https://www.jianshu.com/p/cbc34df2f008

json参数

package main

import (
  "bytes"
  "encoding/json"
  "net/http"
)

// main marshals a nested map to JSON and posts it to /json on the local test
// server with the application/json content type.
func main() {
  byteData, err := json.Marshal(map[string]any{
    "name": "枫枫",
    "age":  23,
    "data": map[string]string{
      "name": "name",
    },
  })
  if err != nil {
    // No logging package is imported by this example, so fail loudly.
    panic(err)
  }

  res, err := http.Post("http://127.0.0.1:7070/json", "application/json", bytes.NewReader(byteData))
  if err != nil {
    panic(err)
  }
  defer res.Body.Close()
}

head参数

package main

import (
  "net/http"
)

// main sends a GET request with custom headers to /head and shows how
// Header.Set and Header.Add differ.
func main() {
  request, err := http.NewRequest("GET", "http://127.0.0.1:7070/head", nil)
  if err != nil {
    // No logging package is imported by this example, so fail loudly.
    panic(err)
  }

  // Set replaces whatever values the key already has; Add appends one more.
  request.Header.Set("name", "zhangsan")
  request.Header.Add("name", "wangwu") // name now carries zhangsan and wangwu
  request.Header.Add("nb", "xxx")
  request.Header.Set("nb", "yyy") // nb is left with the single value yyy

  res, err := http.DefaultClient.Do(request)
  if err != nil {
    panic(err)
  }
  defer res.Body.Close()
}

Set和Add的区别:

当我们使用Set时候,如果原来这一项已存在,后面的就修改已有的。所以这里 nb 最终的值只有 yyy

当使用Add时候,如果原本不存在,则添加,如果已存在,就再追加一个

cookie参数

package main

import (
  "net/http"
)

// main sends a GET request carrying two cookies to /head, where the server
// prints the request headers it received.
func main() {
  request, err := http.NewRequest("GET", "http://127.0.0.1:7070/head", nil)
  if err != nil {
    // No logging package is imported by this example, so fail loudly.
    panic(err)
  }
  request.AddCookie(&http.Cookie{Name: "name", Value: "zhangsan"})
  request.AddCookie(&http.Cookie{Name: "age", Value: "123"})

  res, err := http.DefaultClient.Do(request)
  if err != nil {
    panic(err)
  }
  defer res.Body.Close()
}

文件

package main

import (
  "bytes"
  "fmt"
  "io"
  "log"
  "mime/multipart"
  "net/http"
  "os"
)

// main uploads the local file server.go to /file as a multipart form part
// named "file", using server_1.go as the file name sent to the server.
func main() {
  bodyBuf := &bytes.Buffer{}
  bodyWrite := multipart.NewWriter(bodyBuf)

  // Open the file that will be uploaded.
  file, err := os.Open("server.go")
  if err != nil {
    log.Println(fmt.Errorf("open upload source: %w", err))
    return
  }
  defer file.Close()

  // Start a new form part for the file.
  fileWrite, err := bodyWrite.CreateFormFile("file", "server_1.go")
  if err != nil {
    log.Println(fmt.Errorf("create form file: %w", err))
    return
  }
  // Copy the opened file into that part.
  if _, err := io.Copy(fileWrite, file); err != nil {
    log.Println(fmt.Errorf("copy file into form: %w", err))
    return
  }
  // Close writes the final boundary; the body is incomplete without it.
  if err := bodyWrite.Close(); err != nil {
    log.Println(fmt.Errorf("finish multipart body: %w", err))
    return
  }

  // The content type carries the boundary that separates the parts.
  contentType := bodyWrite.FormDataContentType()
  res, err := http.Post("http://127.0.0.1:7070/file", contentType, bodyBuf)
  if err != nil {
    log.Println(fmt.Errorf("post upload: %w", err))
    return
  }
  defer res.Body.Close()
}

4. 响应

json数据

package main

import (
  "encoding/json"
  "fmt"
  "io"
  "net/http"
)

// main requests /get_json from the local test server, prints the raw
// response body and then the same data decoded into a map.
func main() {
  res, err := http.Get("http://127.0.0.1:7070/get_json")
  if err != nil {
    fmt.Println(err)
    return
  }
  defer res.Body.Close()

  byteData, err := io.ReadAll(res.Body)
  if err != nil {
    fmt.Println(err)
    return
  }
  fmt.Println(string(byteData))

  var data map[string]any
  if err := json.Unmarshal(byteData, &data); err != nil {
    fmt.Println(err)
    return
  }
  fmt.Println(data)
}

文本数据

package main

import (
  "fmt"
  "io"
  "net/http"
)

// main requests /get_html from the local test server and prints the response
// body as text.
func main() {
  res, err := http.Get("http://127.0.0.1:7070/get_html")
  if err != nil {
    fmt.Println(err)
    return
  }
  defer res.Body.Close()

  byteData, err := io.ReadAll(res.Body)
  if err != nil {
    fmt.Println(err)
    return
  }
  fmt.Println(string(byteData))
}

文件

package main

import (
  "io"
  "net/http"
  "os"
)

// main downloads /get_file from the local test server and stores the bytes
// as uploads/file/image.jpg.
func main() {
  res, err := http.Get("http://127.0.0.1:7070/get_file")
  if err != nil {
    // No logging package is imported by this example, so fail loudly.
    panic(err)
  }
  defer res.Body.Close()

  byteData, err := io.ReadAll(res.Body)
  if err != nil {
    panic(err)
  }
  // Owner read/write, everyone else read-only. A mode such as 077 would leave
  // the owner without any access to the newly created file.
  if err := os.WriteFile("uploads/file/image.jpg", byteData, 0o644); err != nil {
    panic(err)
  }
}

5. goquery

爬虫很大概率会获取html中的一些数据

goquery就是一个很好用的解析html的第三方库

go get github.com/PuerkitoBio/goquery

douban.html

<!DOCTYPE html>
<html lang="zh-CN" class="ua-windows ua-webkit">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>豆瓣电影</title>
</head>
<body>
<div id="wrapper">
    <div id="content">
        <div class="grid-16-8 clearfix">
            <div class="article">
                <div id="screening" class="s" data-dstat-areaid="70" data-dstat-mode="click,expose">
                    <div class="screening-bd">
                        <ul class="ui-slide-content" data-slide-index="1" data-index-max="7">
                            <li class="ui-slide-item" data-title="长空之王" data-release="2023" data-rate="6.6" data-star="35" data-trailer="https://movie.douban.com/subject/35209731/trailer" data-ticket="https://movie.douban.com/ticket/redirect/?movie_id=35209731" data-duration="127分钟" data-region="中国大陆" data-director="刘晓世" data-actors="王一博 / 胡军 / 周冬雨" data-intro="" data-enough="true" data-rater="226341">
                                <ul>
                                    <li class="poster"><a href="https://movie.douban.com/subject/35209731/?from=showing"><img src="https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2889598060.webp" alt="长空之王" rel="nofollow" class=""/></a></li>
                                    <li class="title"><a href="https://movie.douban.com/subject/35209731/?from=showing" class="">长空之王</a></li>
                                    <li class="rating"><span class="rating-star allstar35"></span><span class="subject-rate">6.6</span></li>
                                    <li class="ticket_btn"><span><a href="https://movie.douban.com/ticket/redirect/?movie_id=35209731" target="_blank">选座购票</a></span></li>
                                </ul>
                            </li>
                            <li class="ui-slide-item" data-title="宇宙探索编辑部" data-release="2021" data-rate="8.1" data-star="40" data-trailer="https://movie.douban.com/subject/34941536/trailer" data-ticket="https://movie.douban.com/ticket/redirect/?movie_id=34941536" data-duration="118分钟" data-region="中国大陆" data-director="孔大山" data-actors="杨皓宇 / 艾丽娅 / 王一通" data-intro="" data-enough="true" data-rater="316005">
                                <ul>
                                    <li class="poster"><a href="https://movie.douban.com/subject/34941536/?from=showing"><img src="https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2889865405.webp" alt="宇宙探索编辑部"/></a></li>
                                    <li class="title"><a href="https://movie.douban.com/subject/34941536/?from=showing" class="">宇宙探索编辑...</a></li>
                                    <li class="rating"><span class="rating-star allstar40"></span><span class="subject-rate">8.1</span></li>
                                    <li class="ticket_btn"><span><a href="https://movie.douban.com/ticket/redirect/?movie_id=34941536" target="_blank">选座购票</a></span></li>
                                </ul>
                            </li>
                        </ul>
                    </div>
                </div>

                <div id="gallery-frames">
                    <div id="hot-gallery">
                        <ul class='ui-slide-content'>
                            <li class='ui-slide-item'>

                                <div class="gallery-frame">
                                    <a href="https://movie.douban.com/trailer/304261/?from=gallery" target="_blank"
                                       data-fid="3719">
                                        <img src="https://img9.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/dfb3da9cdc07a25.jpg"
                                             alt="诺兰新片《奥本海默》中字新预告" width="350" height="240"/>
                                    </a>
                                    <div class="gallery-detail">
                                        <div class="gallery-hd">
                                            <a href="https://movie.douban.com/trailer/304261/?from=gallery"
                                               target="_blank" data-fid="3719">
                                                <h3>诺兰新片《奥本海默》中字新预告</h3>
                                            </a>
                                        </div>
                                        <div class="gallery-bd">
                                            <p>
                                                克里斯托弗·诺兰编剧执导,基里安·墨菲饰演“原子弹之父”罗伯特·奥本海默,影片有望引进内地。
                                            </p>
                                        </div>
                                    </div>
                                </div>

                            </li>
                            <li class='ui-slide-item'>
                                <div class="gallery-frame">
                                    <a href="https://movie.douban.com/annual/2022?source=doubanmovie&amp;fullscreen=1?from=gallery"
                                       target="_blank" data-fid="3744">
                                        <img src="https://img2.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/dc7b1d0f254a4af.jpg"
                                             alt="「豆瓣2022年度电影榜单」上线" width="350" height="240"/>
                                    </a>
                                    <div class="gallery-detail">
                                        <div class="gallery-hd">
                                            <a href="https://movie.douban.com/annual/2022?source=doubanmovie&amp;fullscreen=1?from=gallery"
                                               target="_blank" data-fid="3744">
                                                <h3>「豆瓣2022年度电影榜单」上线</h3>
                                            </a>
                                        </div>
                                        <div class="gallery-bd">
                                            <p>
                                                点击查看完整榜单,开启全年佳片好剧大赏。
                                            </p>
                                        </div>
                                    </div>
                                </div>
                            </li>
                            <li class='ui-slide-item'>
                                <div class="gallery-frame">
                                    <a href="https://movie.douban.com/trailer/302881/?from=gallery" target="_blank"
                                       data-fid="3743">
                                        <img src="https://img1.doubanio.com/view/movie_gallery_frame_hot_rec/m/public/9dd7ad4ff6beed7.jpg"
                                             alt="真人电影《芭比》中字预告" width="350" height="240"/>
                                    </a>
                                    <div class="gallery-detail">
                                        <div class="gallery-hd">
                                            <a href="https://movie.douban.com/trailer/302881/?from=gallery"
                                               target="_blank" data-fid="3743">
                                                <h3>真人电影《芭比》中字预告</h3>
                                            </a>
                                        </div>
                                        <div class="gallery-bd">
                                            <p>
                                                玛格特·罗比饰演芭比,瑞恩·高斯林饰演肯,7月21日北美上映。
                                            </p>
                                        </div>
                                    </div>
                                </div>
                            </li>
                        </ul>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
</body>
</html>


goquery使用

goquery就是jquery的go实现,会jquery就会goquery

本课程只讲一些高频使用的操作

https://blog.csdn.net/yang731227/article/details/89338745

package main

import (
  "fmt"
  "github.com/PuerkitoBio/goquery"
  "net/http"
)

// main fetches the /douban page from the local test server and demonstrates
// common goquery operations: reading text, inner HTML and attributes, and
// iterating over a selection.
func main() {
  res, err := http.Get("http://127.0.0.1:7070/douban")
  if err != nil {
    fmt.Println(err)
    return
  }
  // Close the body once the function is done with the parsed document.
  defer res.Body.Close()

  doc, err := goquery.NewDocumentFromReader(res.Body)
  if err != nil {
    fmt.Println(err)
    return
  }
  // Read the text of the <title> element.
  fmt.Println(doc.Find("title").Text())
  // Read the inner HTML of a node; both the HTML and the error are printed.
  fmt.Println(doc.Find(".gallery-hd:nth-child(1) h3").Html())
  // Read an attribute; both the value and the "exists" flag are printed.
  fmt.Println(doc.Find(".gallery-hd:nth-child(1)>a").Attr("href"))
  // Visit every matched node.
  doc.Find(".gallery-hd > a").Each(func(i int, selection *goquery.Selection) {
    // A missing href simply prints as an empty string.
    href, _ := selection.Attr("href")
    // Take the text from the nested <h3> so the title is printed without the
    // whitespace that surrounds it inside the link.
    h3 := selection.Find("h3").Text()
    fmt.Println(href, h3)
  })
}

6. 正则表达式

package main

import (
  "fmt"
  "regexp"
)

// main demonstrates the most frequently used regexp matching methods on the
// sample input "abc-123|hrd-134".
func main() {
  // The pattern is a constant, so MustCompile is used: a typo in it fails
  // immediately instead of leaving a nil *Regexp behind.
  r := regexp.MustCompile(`[a-z]{3}-(\d+)`)
  // First match as a string: abc-123
  fmt.Println(r.FindString("abc-123|hrd-134"))
  // First match plus its capture groups: [abc-123 123]
  fmt.Println(r.FindStringSubmatch("abc-123|hrd-134"))
  // Every match as a string: [abc-123 hrd-134]
  fmt.Println(r.FindAllString("abc-123|hrd-134", -1))
  // Every match plus its capture groups: [[abc-123 123] [hrd-134 134]]
  fmt.Println(r.FindAllStringSubmatch("abc-123|hrd-134", -1))
  // Whether the input contains a match: true
  fmt.Println(r.MatchString("abc-123"))
}