Documentation
¶
Overview ¶
An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS DOM API while keeping the API simple by not forcing the JS DOM model onto GoHTML. Because of this, GoHTML has a node-tree model. The GoHTML tokenizer uses the net/html package (golang.org/x/net/html) for tokenizing in its underlying layer. Therefore, it is the user's responsibility to make sure that input to GoHTML is UTF-8 encoded. GoHTML allows direct access to the node tree.
Index ¶
- Constants
- Variables
- func Encode(w io.Writer, rootNode *Node)
- func IsVoidTag(tagName string) bool
- func NodeTreeToHTML(rootNode *Node) string
- func QuerySearch(node *Node, selector string) iter.Seq[*Node]
- type BasicSelector
- type ClassList
- func (classList ClassList) AppendClass(className string)
- func (classList ClassList) Contains(className string) bool
- func (classList ClassList) DecodeFrom(node *Node)
- func (classList ClassList) DeleteClass(className string)
- func (classList ClassList) Encode() string
- func (classList ClassList) EncodeTo(node *Node)
- type Combinator
- type CombinatorEl
- type Node
- func (node *Node) Append(newNode *Node)
- func (node *Node) AppendChild(childNode *Node)
- func (node *Node) AppendText(text string)
- func (node *Node) Closest(selector string) *Node
- func (node *Node) GetAttribute(attributeName string) (string, bool)
- func (node *Node) GetChildNode() *Node
- func (node *Node) GetElementByClassName(className string) *Node
- func (node *Node) GetElementByID(idName string) *Node
- func (node *Node) GetElementByTagName(tagName string) *Node
- func (node *Node) GetElementsByClassName(className string) NodeList
- func (node *Node) GetElementsById(idName string) NodeList
- func (node *Node) GetElementsByTagName(tagName string) NodeList
- func (node *Node) GetFirstNode() *Node
- func (node *Node) GetInnerText() string
- func (node *Node) GetLastNode() *Node
- func (node *Node) GetNextNode() *Node
- func (node *Node) GetParent() *Node
- func (node *Node) GetPreviousNode() *Node
- func (node *Node) GetTagName() string
- func (node *Node) GetText() string
- func (node *Node) IsTextNode() bool
- func (node *Node) IterateAttributes(callback func(attribute, value string))
- func (node *Node) QuerySelector(selector string) *Node
- func (node *Node) QuerySelectorAll(selector string) NodeList
- func (node *Node) RemoveAttribute(attributeName string)
- func (node *Node) RemoveNode()
- func (node *Node) SetAttribute(attribute, value string)
- func (node *Node) SetNextNode(nextNode *Node)
- func (node *Node) SetPreviousNode(previousNode *Node)
- func (node *Node) SetTagName(tagName string)
- func (node *Node) SetText(text string)
- type NodeList
- type NodeTreeBuilder
- type Selector
- type Tokenizer
- type TraverseCondition
- type Traverser
Examples ¶
Constants ¶
const ( Area string = "area" Base string = "base" Br string = "br" Col string = "col" Embed string = "embed" Hr string = "hr" Img string = "img" Input string = "input" Link string = "link" Meta string = "meta" Param string = "param" Source string = "source" Track string = "track" Wbr string = "wbr" )
Void tags
const ( //This is not a void el. but added it anyway. DOCTYPEDTD string = "!DOCTYPE" )
A DTD defines the structure and the legal elements and attributes of an XML document.
Variables ¶
var (
SyntaxError error = fmt.Errorf("Syntax error")
)
Functions ¶
func NodeTreeToHTML ¶
NodeTreeToHTML returns encoding of node-tree as a string.
func QuerySearch ¶ added in v0.2.3
QuerySearch returns an iterator that traverses the node tree from the given node and yields the nodes that match the given selector.
Example ¶
package main import ( "fmt" "net/http" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { //Request the html res, err := http.Get("https://example.com/") if err != nil || res.StatusCode != http.StatusOK { return } defer res.Body.Close() //Decode the html rootNode, _ := GoHtml.Decode(res.Body) //Iterate over every node that matches the query. for node := range GoHtml.QuerySearch(rootNode, ".event-columns .column .event-block h4 a") { //Convert the node and it's children nodes to text html and print it. fmt.Println(GoHtml.NodeTreeToHTML(node)) } }
Types ¶
type BasicSelector ¶ added in v0.2.3
type BasicSelector int
const ( Id BasicSelector = iota Class Tag )
type ClassList ¶
type ClassList struct {
// contains filtered or unexported fields
}
func (ClassList) AppendClass ¶
AppendClass appends className to classList. A className that contains multiple classes is also a valid className.
func (ClassList) Contains ¶
Contains returns whether the className exists or not.
Example ¶
package main import ( "fmt" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { //Creates a div that has classes video-container and main-contents div := GoHtml.CreateNode("div") div.SetAttribute("class", "video-container main-contents") classList := GoHtml.NewClassList() //Add the classes in the div to the class list classList.DecodeFrom(div) //Checks wether the following classes exists in the classList fmt.Println(classList.Contains("container")) fmt.Println(classList.Contains("video-container")) }
Output: false true
func (ClassList) DecodeFrom ¶ added in v0.0.1
DecodeFrom appends the classes in the node to classList. If node is nil, DecodeFrom does nothing.
func (ClassList) DeleteClass ¶
DeleteClass deletes the specified classes in className.
func (ClassList) Encode ¶
Encode returns the full className.
Example ¶
package main import ( "fmt" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { classList := GoHtml.NewClassList() //Add classes to the class list classList.AppendClass("container") classList.AppendClass("warper") classList.AppendClass("main-content") //This would output something like this "warper container main-content". Order of the output is not guaranteed. fmt.Println(classList.Encode()) }
type Combinator ¶ added in v0.2.3
type Combinator int
const ( Descendant Combinator = iota Child NextSibling SubsequentSibling //if no combinator NoneCombinator )
type CombinatorEl ¶ added in v0.2.3
type CombinatorEl struct { Type Combinator Selector1 Selector Selector2 Selector }
CombinatorEl is used to represent selectors that are around a combinator.
func TokenizeSelectorsAndCombinators ¶ added in v0.2.3
func TokenizeSelectorsAndCombinators(selector string) []CombinatorEl
TokenizeSelectorsAndCombinators takes a single selector, or a sequence of selectors and combinators, and returns a slice of CombinatorEl.
type Node ¶
type Node struct {
// contains filtered or unexported fields
}
Node is a struct that represents an HTML element. A node can have sibling nodes (a next node and a previous node) and a child node that represents its child elements. Text is also stored as a node, which can be checked by using the IsTextNode method.
func CloneNode ¶
CloneNode copies the node. The copy has one-way connections to the original's parent, next and previous nodes. If node is nil, CloneNode returns nil.
func CreateTextNode ¶
CreateTextNode returns a new node that represents the given text. HTML tags in text get escaped.
func Decode ¶
Decode reads from rd and creates a node-tree, then returns the root node and nil.
Example ¶
package main import ( "fmt" "strings" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { r := strings.NewReader(` <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>User Profile</title> </head> <body> <h1 class="username">Udan</h1> <p class="email">udanjayanith@gmail.com</p> <p>Joined: 01/08/2024</p> </body> </html> `) rootNode, _ := GoHtml.Decode(r) titleNode := rootNode.QuerySelector("title") title := "" if titleNode != nil { title = titleNode.GetInnerText() } fmt.Println(title) }
Output: User Profile
func DeepCloneNode ¶
DeepCloneNode clones the node without keeping references to its original parent node, previous node and next node. If node is nil, DeepCloneNode returns nil.
func HTMLToNodeTree ¶
HTMLToNodeTree returns the HTML code as a node-tree. If an error occurs, it is SyntaxError.
func (*Node) AppendChild ¶
The AppendChild() method of the Node adds a node to the end of the list of children of a specified parent node.
func (*Node) AppendText ¶
AppendText appends text to the node.
func (*Node) Closest ¶ added in v0.2.3
Closest traverses the node tree and its parents (heading toward the root node) until it finds a node that matches the selector and returns that node. Adapted from [MDN Element: closest() method](https://developer.mozilla.org/en-US/docs/Web/API/Element/closest)
func (*Node) GetAttribute ¶
GetAttribute returns the specified attribute value from the node. If the specified attribute doesn't exist, GetAttribute returns an empty string and false.
func (*Node) GetChildNode ¶
GetChildNode returns the first child node of this node.
func (*Node) GetElementByClassName ¶ added in v0.0.1
GetElementByClassName returns the first node that matches the given className by advancing from the node.
func (*Node) GetElementByID ¶ added in v0.0.1
GetElementByID returns the first node that matches the given idName by advancing from the node.
func (*Node) GetElementByTagName ¶ added in v0.0.1
GetElementByTagName returns the first node that matches the given tagName by advancing from the node.
func (*Node) GetElementsByClassName ¶ added in v0.0.1
GetElementsByClassName returns a NodeList containing nodes that have the given className from the node.
func (*Node) GetElementsById ¶ added in v0.0.1
GetElementsById returns a NodeList containing nodes that have the given idName from the node.
func (*Node) GetElementsByTagName ¶ added in v0.0.1
GetElementsByTagName returns a NodeList containing nodes that have the given tagName from the node.
func (*Node) GetFirstNode ¶
GetFirstNode returns the first node of the node branch.
func (*Node) GetInnerText ¶
GetInnerText returns all of the text inside the node.
func (*Node) GetLastNode ¶
GetLastNode returns the last node in the node branch.
func (*Node) GetNextNode ¶
GetNextNode returns node next to the node.
func (*Node) GetPreviousNode ¶
GetPreviousNode returns the previous node.
func (*Node) GetTagName ¶
GetTagName returns a string with the name of the tag for the given node.
func (*Node) GetText ¶
GetText returns the text on the node. It does not return the text of its child nodes. If you also want the child nodes' text, use the GetInnerText method on the node. HTML tags in the returned value are escaped.
func (*Node) IsTextNode ¶
IsTextNode returns a boolean value indicating node is a text node or not.
func (*Node) IterateAttributes ¶
IterateAttributes calls callback for every attribute in the node, passing the attribute and its value.
func (*Node) QuerySelector ¶ added in v0.0.2
QuerySelector returns the first node that matches with the selector from the node.
Example ¶
package main import ( "fmt" "net/http" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { res, err := http.Get("https://example.com/") if err != nil || res.StatusCode != http.StatusOK { return } defer res.Body.Close() rootNode, _ := GoHtml.Decode(res.Body) res.Body.Close() title := rootNode.QuerySelector("title") if title != nil { fmt.Println(title.GetInnerText()) //Example Domain } }
func (*Node) QuerySelectorAll ¶ added in v0.0.2
QuerySelectorAll returns a NodeList containing the nodes that match the selector from the node.
func (*Node) RemoveAttribute ¶
RemoveAttribute removes the specified attribute.
func (*Node) RemoveNode ¶
func (node *Node) RemoveNode()
RemoveNode removes the node from the branch safely by connecting sibling nodes.
func (*Node) SetAttribute ¶
SetAttribute adds an attribute to the node.
func (*Node) SetNextNode ¶
SetNextNode sets the node's next node to nextNode.
func (*Node) SetPreviousNode ¶
SetPreviousNode sets the node's previous node to previousNode.
func (*Node) SetTagName ¶
SetTagName changes the html tag name to the tagName.
type NodeList ¶ added in v0.0.1
type NodeList struct {
// contains filtered or unexported fields
}
NodeList stores nodes in the order they were appended and can be iterated over by invoking the IterNodeList method.
Example ¶
package main import ( "fmt" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { nodeList := GoHtml.NewNodeList() nodeList.Append(GoHtml.CreateNode("br")) nodeList.Append(GoHtml.CreateNode("hr")) nodeList.Append(GoHtml.CreateNode("div")) iter := nodeList.IterNodeList() for node := range iter { fmt.Println(node.GetTagName()) } }
Output: br hr div
func NewNodeList ¶ added in v0.0.1
func NewNodeList() NodeList
NewNodeList returns an initialized node list.
func (*NodeList) Back ¶ added in v0.0.1
Back returns the last node of list or nil if the list is empty.
func (*NodeList) Front ¶ added in v0.0.1
Front returns the first node of list or nil if the list is empty.
func (*NodeList) IterNodeList ¶ added in v0.0.1
IterNodeList returns an iterator over the node list.
func (*NodeList) Len ¶ added in v0.0.1
Len returns the number of nodes in the list. The complexity is O(1).
type NodeTreeBuilder ¶ added in v0.2.3
type NodeTreeBuilder struct {
// contains filtered or unexported fields
}
NodeTreeBuilder is used to build a node tree given a node and its type.
func NewNodeTreeBuilder ¶ added in v0.2.3
func NewNodeTreeBuilder() NodeTreeBuilder
NewNodeTreeBuilder returns a new NodeTreeBuilder.
func (*NodeTreeBuilder) GetRootNode ¶ added in v0.2.3
func (ntb *NodeTreeBuilder) GetRootNode() *Node
GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder.
func (*NodeTreeBuilder) WriteNodeTree ¶ added in v0.2.3
func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType)
WriteNodeTree appends the node to the node tree according to the given html.TokenType.
type Selector ¶ added in v0.2.3
type Selector struct {
// contains filtered or unexported fields
}
Selector represents a single CSS selector, e.g. .my-class, #video, div.
func NewSelector ¶ added in v0.2.3
NewSelector takes a single CSS selector and returns a Selector struct. The selector string should contain only a basic selector.
type Tokenizer ¶ added in v0.2.3
type Tokenizer struct {
// contains filtered or unexported fields
}
Tokenizer contains a *html.Tokenizer.
Example ¶
package main import ( "fmt" "net/http" GoHtml "github.com/udan-jayanith/GoHTML" "golang.org/x/net/html" ) func main() { //Request the html res, err := http.Get("https://go.dev/") if err != nil || res.StatusCode != http.StatusOK { return } defer res.Body.Close() //NewTokenizer takes a io.reader that receives UTF-8 encoded html code and returns a Tokenizer. t := GoHtml.NewTokenizer(res.Body) //NewNodeTreeBuilder return a new NodeTreeBuilder that can be used to build a node tree. nodeTreeBuilder := GoHtml.NewNodeTreeBuilder() for { //Advanced scans the next token and returns its type. tt := t.Advanced() if tt == html.ErrorToken { break } //WriteNodeTree takes a node and a token type. The node can be nil so if token type is EndTagToken. nodeTreeBuilder.WriteNodeTree(t.GetCurrentNode(), tt) } //Prints the root node of the node tree in the nodeTreeBuilder. fmt.Println(nodeTreeBuilder.GetRootNode()) }
func NewTokenizer ¶ added in v0.2.3
NewTokenizer returns a new Tokenizer.
func (*Tokenizer) GetCurrentNode ¶ added in v0.2.3
GetCurrentNode returns the current node. The returned value can be nil regardless of token type.
type TraverseCondition ¶
type TraverseCondition = bool
const ( StopWalkthrough TraverseCondition = false ContinueWalkthrough TraverseCondition = true )
type Traverser ¶
type Traverser struct {
// contains filtered or unexported fields
}
func NewTraverser ¶ added in v0.0.1
NewTraverser returns a new traverser that can be used to navigate the node tree.
func (*Traverser) GetCurrentNode ¶
GetCurrentNode returns the current node.
func (*Traverser) Next ¶
Next returns the node next to the current node and changes CurrentNode to the new node. Make sure t.currentNode is not nil, otherwise the program will panic.
func (*Traverser) Previous ¶
Previous returns the previous node and changes CurrentNode to the new node. Make sure t.currentNode is not nil, otherwise the program will panic.
func (*Traverser) SetCurrentNodeTo ¶
SetCurrentNodeTo changes the current node to the newNode.
func (*Traverser) Walkthrough ¶
func (t *Traverser) Walkthrough(callback func(node *Node) TraverseCondition)
Walkthrough traverses the node tree from the current node to the end of the node tree, visiting every node. Walkthrough traverses the node tree iteratively, in a DFS-like order, without revisiting already-visited nodes. Walkthrough can be used as a range-over iterator or as a function that takes a callback and passes it every node one by one.
Example ¶
package main import ( "fmt" GoHtml "github.com/udan-jayanith/GoHTML" ) func main() { //Creation of the node tree. body := GoHtml.CreateNode("body") h1 := GoHtml.CreateNode("h1") h1.AppendText("This is a heading") body.AppendChild(h1) p := GoHtml.CreateNode("p") p.AppendText("The HTML <p>tag is a fundamental element used for creating paragraphs in web development. It helps structure content, separating text into distinct blocks. When you wrap text within <p>... </p>tags, you tell browsers to treat the enclosed content as a paragraph.") body.AppendChild(p) traverser := GoHtml.NewTraverser(body) for node := range traverser.Walkthrough { fmt.Println(node) } //or traverser.Walkthrough(func(node *GoHtml.Node) GoHtml.TraverseCondition { fmt.Println(node) return true }) }