README
htmlscraper
- automated HTML scraping with jQuery-like selectors in Go
Table of Contents
Installation
To install htmlscraper
, use go get
:
go get github.com/branow/htmlscraper
This will then make the following package available to you:
github.com/branow/htmlscraper/scrape
To update htmlscraper
to the latest version, use go get -u github.com/branow/htmlscraper
.
We currently support the most recent major Go versions from 1.23
onward.
Examples
Let's scrape the following body tag of the catalog.html file.
- Scrape the catalog name
- Scrape the product names
- Scrape the products
- Scrape the catalog
- Scrape absent data
<body>
<div class="container">
<h1>Product Catalog</h1>
<div class="catalog">
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 1">
<h2>Product 1</h2>
<p>Great product for your needs.</p>
<p class="price">$29.99</p>
</div>
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 2">
<h2>Product 2</h2>
<p>Top-rated product with excellent reviews.</p>
<p class="price">$39.99</p>
</div>
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 3">
<h2>Product 3</h2>
<p>Best value for your money.</p>
<p class="price">$19.99</p>
</div>
<div class="product">
<h2>Product 4</h2>
<p>The product that you want to buy.</p>
<p class="price">$10.99</p>
</div>
</div>
</div>
</body>
Scrape the catalog name
package examples
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
)
// ScrapeString extracts a single string value (the catalog heading)
// from catalog.html using a CSS selector and the "text" extractor.
func ScrapeString() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// a zero-value Scraper is ready to use
	s := scrape.Scraper{}

	// scrape the heading text into a plain string
	var catalog string // product catalog name
	err = s.Scrape(doc, &catalog, ".container > h1", "text")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", catalog)
}
It prints:
Got Error: <nil>
Got Output: Product Catalog
Scrape the product names
package examples
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
)
// ScrapeSliceOfStrings collects the text of every element matching a
// selector into a []string — here, all product names in the catalog.
func ScrapeSliceOfStrings() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// a zero-value Scraper is ready to use
	s := scrape.Scraper{}

	// every matched <h2> contributes one element to the slice
	var products []string // product names
	err = s.Scrape(doc, &products, ".product > h2", "text")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", products)
}
It prints:
Got Error: <nil>
Got Output: [Product 1 Product 2 Product 3 Product 4]
Scrape the products
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapeSliceOfStructs maps every ".product" element onto a struct value,
// using struct tags to drive per-field selection and extraction.
func ScrapeSliceOfStructs() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// tags choose the sub-selector and extractor for each field
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	var products []Product
	err = s.Scrape(doc, &products, ".product", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}
It prints:
Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }
Scrape the catalog
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapeStruct scrapes a nested structure: a Catalog whose Products
// field is itself a slice of tagged structs, all in one Scrape call.
func ScrapeStruct() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// inner struct: one product per ".product" element
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	// outer struct: the whole catalog, including its heading
	type Catalog struct {
		Name     string    `select:"h1" extract:"text"`
		Products []Product `select:".product"`
	}
	var catalog Catalog
	err = s.Scrape(doc, &catalog, ".container", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	fmt.Println("Catalog {")
	fmt.Println(catalog.Name)
	for _, p := range catalog.Products {
		fmt.Println(p)
	}
	fmt.Println("}")
}
It prints:
Got Error: scrape: .container .product:n(3) img no nodes found
Got Output:
Catalog {
Product Catalog
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }
}
Scrape absent data
If some data could be absent from an HTML document, use pointer fields. Pointers can save a little memory, but more importantly, checking whether a pointer is nil tells you whether that piece of data was present in the HTML document at all.
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapePointers shows how pointer fields model optional data: a field
// left nil signals that the corresponding node was absent from the HTML.
func ScrapePointers() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// Image is optional: Product 4 has no <img>, so Image stays nil
	type Image struct {
		Src string `extract:"@src"`
		Alt string `extract:"@alt"`
	}
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       *Image `select:"img"`
	}
	var products []Product
	err = s.Scrape(doc, &products, ".product", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}
It prints:
Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 0xc00009ade0}
{Product 2 Top-rated product with excellent reviews. 39.99 0xc00009ae80}
{Product 3 Best value for your money. 19.99 0xc00009af20}
{Product 4 The product that you want to buy. 10.99 <nil>}
Contributing
Please feel free to submit issues, fork the repository and send pull requests!
License
This project is licensed under the terms of the MIT license.
Click to show internal directories.
Click to hide internal directories.