htmlsimplifier

package
v0.0.8 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 20, 2025 | License: MIT | Imports: 7 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Document

// Document is a recursive node type: besides its own string and bool
// fields it holds a slice of further Document values in Children.
// Every field carries an omitempty YAML tag.
type Document struct {
	Tag      string     `yaml:"tag,omitempty"`      // NOTE(review): presumably the HTML element name — confirm
	Attrs    string     `yaml:"attrs,omitempty"`    // Simplified attributes as space-separated key=value pairs
	Text     string     `yaml:"text,omitempty"`     // For text-only nodes
	Markdown string     `yaml:"markdown,omitempty"` // For markdown-converted content
	IsSVG    bool       `yaml:"svg,omitempty"`      // Mark SVG elements to potentially skip details
	Children []Document `yaml:"children,omitempty"` // Nested Document values (the type is recursive)
}

func (Document) IsEmpty

func (d Document) IsEmpty() bool

IsEmpty reports whether the document is empty (has no content).

type FilterConfig

// FilterConfig holds a list of Selector entries, stored under the YAML
// key "selectors".
type FilterConfig struct {
	Selectors []Selector `yaml:"selectors"`
}

type NodeHandler

// NodeHandler determines how to process different types of nodes.
type NodeHandler struct {
	// contains filtered or unexported fields
}

NodeHandler determines how to process different types of nodes

func NewNodeHandler

func NewNodeHandler(opts Options) *NodeHandler

NewNodeHandler creates a new NodeHandler with the given options

func (*NodeHandler) GetStrategy

func (h *NodeHandler) GetStrategy(node *html.Node) NodeHandlingStrategy

GetStrategy returns the handling strategy for a given node

func (*NodeHandler) IsMarkdownable

func (h *NodeHandler) IsMarkdownable(node *html.Node) bool

IsMarkdownable reports whether the node and all its children can be converted to markdown.

func (*NodeHandler) IsTextOnly

func (h *NodeHandler) IsTextOnly(node *html.Node) bool

IsTextOnly reports whether all children of the node can be converted to text.

type NodeHandlingStrategy

type NodeHandlingStrategy int

NodeHandlingStrategy defines how a node should be processed

// Handling strategies. The values are assigned with iota, so they run
// consecutively from 0 in declaration order, and StrategyDefault is the
// zero value of NodeHandlingStrategy.
const (
	// StrategyDefault processes the node normally, keeping its tag and attributes.
	StrategyDefault NodeHandlingStrategy = iota

	// StrategyUnwrap removes the node but keeps its children.
	StrategyUnwrap

	// StrategyFilter removes the node and all its children.
	StrategyFilter

	// StrategyTextOnly converts the node and its children to text if possible.
	StrategyTextOnly

	// StrategyMarkdown converts the node and its children to markdown if possible.
	StrategyMarkdown

	// StrategyPreserveWhitespace keeps all whitespace in text nodes.
	StrategyPreserveWhitespace
)

func (NodeHandlingStrategy) String

func (s NodeHandlingStrategy) String() string

String returns a string representation of the strategy

type Options

// Options is the configuration value accepted by NewSimplifier and
// NewNodeHandler.
//
// NOTE(review): the effect of the individual fields is not documented
// here. Their names suggest feature toggles (the bool fields) and size
// limits (MaxListItems, MaxTableRows) — confirm against the
// implementation before relying on that. FilterConfig is a pointer and
// can therefore be nil.
type Options struct {
	StripScripts bool
	StripCSS     bool
	ShortenText  bool
	CompactSVG   bool
	StripSVG     bool
	MaxListItems int
	MaxTableRows int
	FilterConfig *FilterConfig
	SimplifyText bool
	Markdown     bool // Convert text with important elements to markdown
}

type Selector

// Selector is a single selector entry made up of a selector language
// (Type), a SelectorMode (Mode), and the selector expression itself.
// Each field maps to a YAML key of the same lowercase name.
type Selector struct {
	Type     string       `yaml:"type"`     // "css" or "xpath"
	Mode     SelectorMode `yaml:"mode"`     // "select" or "filter"
	Selector string       `yaml:"selector"` // The actual selector string
}

type SelectorMode

// SelectorMode is the string type used for a Selector's Mode field.
type SelectorMode string

// Declared SelectorMode values; these are the strings "select" and
// "filter".
// NOTE(review): what each mode does to matched nodes is not shown
// here — confirm against the implementation.
const (
	SelectorModeSelect SelectorMode = "select"
	SelectorModeFilter SelectorMode = "filter"
)

type Simplifier

// Simplifier handles HTML simplification with configurable options.
type Simplifier struct {
	// contains filtered or unexported fields
}

Simplifier handles HTML simplification with configurable options

func NewSimplifier

func NewSimplifier(opts Options) *Simplifier

NewSimplifier creates a new HTML simplifier with the given options

func (*Simplifier) ProcessHTML

func (s *Simplifier) ProcessHTML(htmlContent string) ([]Document, error)

ProcessHTML simplifies the given HTML content according to the configured options

type TextSimplifier

// TextSimplifier handles text-related simplification operations.
type TextSimplifier struct {
	// contains filtered or unexported fields
}

TextSimplifier handles text-related simplification operations

func NewTextSimplifier

func NewTextSimplifier(markdownEnabled bool) *TextSimplifier

NewTextSimplifier creates a new text simplifier

func (*TextSimplifier) ConvertToMarkdown

func (t *TextSimplifier) ConvertToMarkdown(node *html.Node) (string, bool)

ConvertToMarkdown converts a node and its children to markdown format

func (*TextSimplifier) ExtractText

func (t *TextSimplifier) ExtractText(node *html.Node) string

ExtractText extracts text from a node and its children, preserving whitespace if needed

func (*TextSimplifier) SimplifyText

func (t *TextSimplifier) SimplifyText(node *html.Node) (string, bool)

SimplifyText attempts to convert a node and its children to a single text string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL