README
htmlscraper
- automated HTML scraping with jQuery-like selectors in Go
Table of Contents
Installation
To install htmlscraper
, use go get
:
go get github.com/branow/htmlscraper
This will then make the following package available to you:
github.com/branow/htmlscraper/scrape
To update htmlscraper
to the latest version, use go get -u github.com/branow/htmlscraper
.
We currently support the most recent major Go versions from 1.23
onward.
Examples
Let's scrape the following body tag of the catalog.html file.
- Scrape the catalog name
- Scrape the product names
- Scrape the products
- Scrape the catalog
- Scrape absent data
<body>
<div class="container">
<h1>Product Catalog</h1>
<div class="catalog">
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 1">
<h2>Product 1</h2>
<p>Great product for your needs.</p>
<p class="price">$29.99</p>
</div>
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 2">
<h2>Product 2</h2>
<p>Top-rated product with excellent reviews.</p>
<p class="price">$39.99</p>
</div>
<div class="product">
<img src="https://via.placeholder.com/200" alt="Product 3">
<h2>Product 3</h2>
<p>Best value for your money.</p>
<p class="price">$19.99</p>
</div>
<div class="product">
<h2>Product 4</h2>
<p>The product that you want to buy.</p>
<p class="price">$10.99</p>
</div>
</div>
</div>
</body>
Scrape the catalog name
package examples
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
)
// ScrapeString extracts a single string value (the catalog heading)
// from catalog.html using a CSS selector and the "text" extractor.
func ScrapeString() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// a zero-value Scraper is ready to use
	s := scrape.Scraper{}

	// scrape the heading text into a plain string
	var catalog string // product catalog name
	err = s.Scrape(doc, &catalog, ".container > h1", "text")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", catalog)
}
It prints:
Got Error: <nil>
Got Output: Product Catalog
Scrape the product names
package examples
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
)
// ScrapeSliceOfStrings collects the text of every element matching a
// selector into a []string — here, all product names in the catalog.
func ScrapeSliceOfStrings() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// a zero-value Scraper is ready to use
	s := scrape.Scraper{}

	// every matched <h2> contributes one element to the slice
	var products []string // product names
	err = s.Scrape(doc, &products, ".product > h2", "text")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", products)
}
It prints:
Got Error: <nil>
Got Output: [Product 1 Product 2 Product 3 Product 4]
Scrape the products
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapeSliceOfStructs maps every ".product" element onto a struct value,
// using struct tags to drive per-field selection and extraction.
func ScrapeSliceOfStructs() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// tags choose the sub-selector and extractor for each field
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	var products []Product
	err = s.Scrape(doc, &products, ".product", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}
It prints:
Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }
Scrape the catalog
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapeStruct scrapes a nested structure: a Catalog whose Products
// field is itself a slice of tagged structs, all in one Scrape call.
func ScrapeStruct() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// inner struct: one product per ".product" element
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	// outer struct: the whole catalog, including its heading
	type Catalog struct {
		Name     string    `select:"h1" extract:"text"`
		Products []Product `select:".product"`
	}
	var catalog Catalog
	err = s.Scrape(doc, &catalog, ".container", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	fmt.Println("Catalog {")
	fmt.Println(catalog.Name)
	for _, p := range catalog.Products {
		fmt.Println(p)
	}
	fmt.Println("}")
}
It prints:
Got Error: scrape: .container .product:n(3) img no nodes found
Got Output:
Catalog {
Product Catalog
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }
}
Scrape absent data
If some data could be absent from an HTML document, use pointer fields. Pointers can save a little memory, but more importantly, checking whether a pointer is nil tells you whether that piece of data was present in the HTML document at all.
package examples
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/branow/htmlscraper/scrape"
"golang.org/x/net/html"
)
// ScrapePointers shows how pointer fields model optional data: a field
// left nil signals that the corresponding node was absent from the HTML.
func ScrapePointers() {
	// build a goquery document from the sample catalog.html file
	f := getCatalogFile()
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	raisePanic(err)

	// register a custom extractor keyed on "*price" that strips the "$"
	match := scrape.GetEqualMatch("*price")
	extractPrice := func(node *html.Node, extract string) (string, error) {
		raw := node.FirstChild.Data
		return strings.Replace(raw, "$", "", 1), nil
	}
	extractors := map[*scrape.Match]scrape.Extractor{&match: extractPrice}

	// Tolerant mode reports missing nodes but keeps scraping
	s := scrape.Scraper{Mode: scrape.Tolerant, Extractors: extractors}

	// Image is optional: Product 4 has no <img>, so Image stays nil
	type Image struct {
		Src string `extract:"@src"`
		Alt string `extract:"@alt"`
	}
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       *Image `select:"img"`
	}
	var products []Product
	err = s.Scrape(doc, &products, ".product", "")

	// report the result
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}
It prints:
Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 0xc00009ade0}
{Product 2 Top-rated product with excellent reviews. 39.99 0xc00009ae80}
{Product 3 Best value for your money. 19.99 0xc00009af20}
{Product 4 The product that you want to buy. 10.99 <nil>}
Contributing
Please feel free to submit issues, fork the repository and send pull requests!
License
This project is licensed under the terms of the MIT license.
Click to show internal directories.
Click to hide internal directories.