goscrapper

package module
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 27, 2020 License: MIT Imports: 10 Imported by: 0

README

GO Scraper

An opinionated & limited way to access the web using Go.

Examples

Here are a few examples of how the library works. More examples are in the project's Go docs.

For complete working example please refer to _examples project directory.

Initialization
// with context
ctx := context.Background();
ctx, cancel := context.WithTimeout(ctx, time.Second * 5)
defer cancel()

web, err := goscrapper.NewContextScrapper(ctx, "https://www.domain.com")
if err != nil {
    log.Fatal(err)
}

// without context
web, err := goscrapper.NewScrapper("https://www.domain.com")
if err != nil {
    log.Fatal(err)
}
Usage
// scrape headers info
fmt.Println(web.Title())
fmt.Println(web.CSRFToken())
fmt.Println(web.ContentType())

// scrape all headers
fmt.Println(web.Headers())

// scrape paragraphs
fmt.Println(web.Paragraphs())
fmt.Println(web.CleanParagraphs())

// scrape images and links and commonly interesting details
fmt.Println(web.Links())
fmt.Println(web.InternalLinks())
fmt.Println(web.ExternalLinks())
fmt.Println(web.LinksWithDetails())
fmt.Println(web.Images())
fmt.Println(web.ImagesWithDetails())

// scrape emails
fmt.Println(web.Emails())

// scrape using custom query
quotes := web.Query(goscrapper.Query{Name: "Quotes", Selector: "quotes p"})
for _, q := range quotes {
    fmt.Printf("Attributes: %v, Value: %v\n", q.Attr, q.Text)
}

See the full documentation for more information and examples.

Documentation

Overview

Example
package main

import (
	"fmt"
	"goscrapper"
)

func main() {
	web, _ := goscrapper.NewScrapper("https://www.domain.com")

	// scrape headers info
	fmt.Println(web.Title())
	fmt.Println(web.CSRFToken())
	fmt.Println(web.ContentType())

	// scrape all headers
	fmt.Println(web.Headers())

	// scrape paragraphs
	fmt.Println(web.Paragraphs())
	fmt.Println(web.CleanParagraphs())

	// scrape images and links and commonly interesting details
	fmt.Println(web.Links())
	fmt.Println(web.InternalLinks())
	fmt.Println(web.ExternalLinks())
	fmt.Println(web.LinksWithDetails())
	fmt.Println(web.Images())
	fmt.Println(web.ImagesWithDetails())

	// scrape emails
	fmt.Println(web.Emails())

	// scrape using custom query
	quotes := web.Query(goscrapper.Query{Name: "Quotes", Selector: "quotes p"})
	for _, q := range quotes {
		fmt.Printf("Attributes: %v, Value: %v\n", q.Attr, q.Text)
	}
}
Output:

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type HeadingLevel

type HeadingLevel string
const (
	H1 HeadingLevel = "h1"
	H2 HeadingLevel = "h2"
	H3 HeadingLevel = "h3"
	H4 HeadingLevel = "h4"
	H5 HeadingLevel = "h5"
	H6 HeadingLevel = "h6"
)

type Query

type Query struct {
	Name     string
	Selector string
}

type QueryResult

type QueryResult struct {
	Attr map[string]interface{}
	Text string
}

type Viewport

type Viewport struct {
	Val map[string]string
}

func NewViewport

func NewViewport(doc *goquery.Document) *Viewport

func (*Viewport) Fetch

func (v *Viewport) Fetch(doc *goquery.Document)

func (*Viewport) String

func (v *Viewport) String() string

String representation of viewport

type Web

type Web struct {
	URL string

	Doc *goquery.Document
	// contains filtered or unexported fields
}

func NewContextScrapper added in v1.1.0

func NewContextScrapper(ctx context.Context, url string) (*Web, error)

initialize new scrapper instance with context

func NewScrapper

func NewScrapper(url string) (*Web, error)

initialize new scrapper instance

func (*Web) CSRFToken

func (w *Web) CSRFToken() string

Fetch meta info of csrf token from head

Example:

html: <meta name="csrf-token" content="token" />

Result: token

func (*Web) Canonical

func (w *Web) Canonical() string

Fetch canonical meta url from head

Example:

html: <link rel="canonical" href="https://test-page.goscrapper.com/page.html" />

Result: https://test-page.goscrapper.com/page.html

func (*Web) Charset

func (w *Web) Charset() string

Fetch the charset meta info from head, if a tag wasn't found because it's missing in the source HTML, empty string will be returned.

Example:

html: <meta charset="utf-8" /> Result: utf-8

func (*Web) CleanParagraphs

func (w *Web) CleanParagraphs() []string

Empty p-tags would lead to empty strings in the returned array. To avoid this you can call w.CleanParagraphs() instead. This will filter empty paragraphs and only return those with content.

func (*Web) ContentType

func (w *Web) ContentType() string

Fetch content type meta info from head

Example:

<meta http-equiv="Content-type" content="text/html; charset=utf-8" />

Result: [text/html, utf-8]

func (*Web) Emails

func (w *Web) Emails() ([]string, error)

scrape all emails from current web page

func (w *Web) ExternalLinks() []string

get all external links on the page as absolute URLs

func (*Web) Fetch

func (w *Web) Fetch() error

func (*Web) Headers

func (w *Web) Headers() map[string]string

get the headers collected as a map

func (*Web) Heading

func (w *Web) Heading(opt ...HeadingLevel) [][]string

Fetch slice of heading text; the default level is h1. You can pass different heading levels -> w.Heading(H2, H3)

Example:

html: <h1>Heading 1</h1>

Result: Heading 1

func (*Web) Headings

func (w *Web) Headings() [][]string

Fetch slice of all the heading tags text (h1, h2, h3, h4, h5, h6)

Example:

html: <h1>Heading 1</h1>

<h1>Heading 1</h1>
<h2>Heading 2</h2>
<h2>Heading 2</h2>

Result: [[Heading 1, Heading 1], [Heading 2, Heading 2]]

func (*Web) Images

func (w *Web) Images() []string

get slice of all images on the page with absolute URLs

Example:

html: <img src="https://test-pages.de/assets/cat.jpg" alt="absolute path">

Result: ['https://test-pages.de/assets/cat.jpg']

func (*Web) ImagesWithDetails

func (w *Web) ImagesWithDetails() []map[string]interface{}

get all images on the page with commonly interesting details

Example:

html: <img src="https://test-pages.de/assets/cat.jpg" alt="absolute path">

Result: [

'url' => 'https://test-pages.de/assets/cat.jpg',
'alt' => 'absolute path',
'width' => null,
'height' => null,

]

func (w *Web) InternalLinks() []string

get all internal links (same root or sub-domain) on the page as absolute URLs

func (w *Web) Links() []string

get slice of all links on the page as absolute URLs

func (*Web) LinksWithDetails

func (w *Web) LinksWithDetails() []map[string]interface{}

get all links on the page with commonly interesting details

Example:

html: <a href="https://placekitten.com/432/287" rel="nofollow">external kitten</a>

Result: [

'url' => 'https://placekitten.com/432/287',
'text' => 'external kitten',
'title' => null,
'target' => null,
'rel' => 'nofollow',
'isNofollow' => true,
'isUGC' => false,
'isNoopener' => false,
'isNoreferrer' => false,

]

func (*Web) Paragraphs

func (w *Web) Paragraphs() []string

Fetch all the paragraphs (<p>) on a website

func (*Web) Query

func (w *Web) Query(query Query) []QueryResult

get attributes and value of the given query selector. Returns a slice of results, since an element may match multiple times

Example
package main

import (
	"fmt"
	"goscrapper"
)

func main() {
	web, _ := goscrapper.NewScrapper("https://www.metalsucks.net/")

	metaResult := web.Query(goscrapper.Query{Name: "Meta Info", Selector: "meta[property='og:locale']"})
	fmt.Println(metaResult[0].Attr)
}
Output:

map[content:en_US property:og:locale]

func (*Web) Title

func (w *Web) Title() string

Fetch the title from head, if a tag wasn't found because it's missing in the source HTML, empty string will be returned.

Example:

html: <title>Lorem Ipsum</title>

Result: Lorem Ipsum

Example
package main

import (
	"fmt"
	"goscrapper"
)

func main() {
	web, _ := goscrapper.NewScrapper("https://www.metalsucks.net/")

	fmt.Println(web.Title())
}
Output:

MetalSucks | Metal News, Tour Dates, Reviews and Videos

func (*Web) Viewport

func (w *Web) Viewport() *Viewport

Fetch viewport meta info from head

Examples:

html: <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" />

Results: w.Viewport().Val -> ['width=device-width', 'initial-scale=1', 'maximum-scale=1', 'user-scalable=no'] w.Viewport().String() -> 'width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no'

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL