Documentation
¶
Overview ¶
Package html2data - extract data from HTML via CSS selectors
Install package and command line utility:
go get -u github.com/msoap/html2data/cmd/html2data
Install package only:
go get -u github.com/msoap/html2data
Allowed pseudo-selectors:
:attr(attr_name) - for getting attributes instead of text
:html - for getting HTML instead of text
:get(N) - get the N-th element from the list
Command line utility:
html2data URL "css selector"
html2data file.html "css selector"
cat file.html | html2data "css selector"
Example ¶
package main import ( "fmt" "log" "github.com/msoap/html2data" ) func main() { doc := html2data.FromURL("http://example.com") // or with config // doc := FromURL("http://example.com", URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: true}) if doc.Err != nil { log.Fatal(doc.Err) } // get title title, _ := doc.GetDataSingle("title") fmt.Println("Title is:", title) title, _ = doc.GetDataSingle("title", html2data.Cfg{DontTrimSpaces: true}) fmt.Println("Title as is, with spaces:", title) texts, _ := doc.GetData(map[string]string{"h1": "h1", "links": "a:attr(href)"}) // get all H1 headers: if textOne, ok := texts["h1"]; ok { for _, text := range textOne { fmt.Println(text) } } // get all urls from links if links, ok := texts["links"]; ok { for _, text := range links { fmt.Println(text) } } }
Output:
Index ¶
- type CSSSelector
- type Cfg
- type Doc
- func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
- func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
- func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
- func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
- func (doc Doc) GetDataSingle(selector string, configs ...Cfg) (result string, err error)
- type URLCfg
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CSSSelector ¶
type CSSSelector struct {
// contains filtered or unexported fields
}
CSSSelector - selector with settings
type Cfg ¶
type Cfg struct {
DontTrimSpaces bool // get text as is, by default trim spaces
}
Cfg - config for GetData* methods
type Doc ¶
type Doc struct { Err error // contains filtered or unexported fields }
Doc - HTML document for parsing
func FromFile ¶
FromFile - get doc from file
Example ¶
package main import ( "log" "github.com/msoap/html2data" ) func main() { doc := html2data.FromFile("file_name.html") if doc.Err != nil { log.Fatal(doc.Err) } }
Output:
func FromReader ¶
FromReader - get doc from io.Reader
Example ¶
package main import ( "bufio" "log" "os" "github.com/msoap/html2data" ) func main() { doc := html2data.FromReader(bufio.NewReader(os.Stdin)) if doc.Err != nil { log.Fatal(doc.Err) } }
Output:
func FromURL ¶
FromURL - get doc from URL
FromURL("https://url")
FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})
Example ¶
package main import ( "log" "github.com/msoap/html2data" ) func main() { doc := html2data.FromURL("http://example.com") if doc.Err != nil { log.Fatal(doc.Err) } // or with config doc = html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false}) if doc.Err != nil { log.Fatal(doc.Err) } }
Output:
func (Doc) GetData ¶
func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
GetData - extract data by CSS-selectors
texts, err := doc.GetData(map[string]string{"h1": "h1"})
Example ¶
package main import ( "fmt" "github.com/msoap/html2data" ) func main() { texts, _ := html2data.FromURL("http://example.com").GetData(map[string]string{"headers": "h1", "links": "a:attr(href)"}) // get all H1 headers: if textOne, ok := texts["headers"]; ok { for _, text := range textOne { fmt.Println(text) } } // get all urls from links if links, ok := texts["links"]; ok { for _, text := range links { fmt.Println(text) } } }
Output:
func (Doc) GetDataFirst ¶
func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
GetDataFirst - extract data by CSS-selectors, get first entry for each selector or ""
texts, err := doc.GetDataFirst(map[string]string{"h1": "h1"})
Example ¶
package main import ( "fmt" "log" "github.com/msoap/html2data" ) func main() { texts, err := html2data.FromURL("http://example.com").GetDataFirst(map[string]string{"header": "h1", "first_link": "a:attr(href)"}) if err != nil { log.Fatal(err) } // get H1 header: fmt.Println("header: ", texts["header"]) // get URL in first link: fmt.Println("first link: ", texts["first_link"]) }
Output:
func (Doc) GetDataNested ¶
func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
GetDataNested - extract nested data by CSS-selectors from another CSS-selector
texts, err := doc.GetDataNested("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main import ( "fmt" "github.com/msoap/html2data" ) func main() { texts, _ := html2data.FromFile("test.html").GetDataNested("div.article", map[string]string{"headers": "h1", "links": "a:attr(href)"}) for _, article := range texts { // get all H1 headers inside each <div class="article">: if textOne, ok := article["headers"]; ok { for _, text := range textOne { fmt.Println(text) } } // get all urls from links inside each <div class="article"> if links, ok := article["links"]; ok { for _, text := range links { fmt.Println(text) } } } }
Output:
func (Doc) GetDataNestedFirst ¶
func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
GetDataNestedFirst - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""
texts, err := doc.GetDataNestedFirst("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main import ( "fmt" "log" "github.com/msoap/html2data" ) func main() { texts, err := html2data.FromFile("cmd/html2data/test.html").GetDataNestedFirst("div.block", map[string]string{"header": "h1", "link": "a:attr(href)", "sp": "span"}) if err != nil { log.Fatal(err) } fmt.Println("") for _, block := range texts { // get first H1 header fmt.Printf("header - %s\n", block["header"]) // get first link fmt.Printf("first URL - %s\n", block["link"]) // get not exists span fmt.Printf("span - '%s'\n", block["span"]) } }
Output: header - Head1.1 first URL - http://url1 span - '' header - Head2.1 first URL - http://url2 span - ''
func (Doc) GetDataSingle ¶
GetDataSingle - extract data by one CSS-selector
title, err := doc.GetDataSingle("title")
Example ¶
package main import ( "fmt" "log" "github.com/msoap/html2data" ) func main() { // get title title, err := html2data.FromFile("cmd/html2data/test.html").GetDataSingle("title") if err != nil { log.Fatal(err) } fmt.Println("Title is:", title) }
Output: Title is: Title