Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Document ¶
type Document struct {
	Tag      string     `yaml:"tag,omitempty"`
	Attrs    string     `yaml:"attrs,omitempty"`    // Simplified attributes as space-separated key=value pairs
	Text     string     `yaml:"text,omitempty"`     // For text-only nodes
	Markdown string     `yaml:"markdown,omitempty"` // For markdown-converted content
	IsSVG    bool       `yaml:"svg,omitempty"`      // Mark SVG elements to potentially skip details
	Children []Document `yaml:"children,omitempty"`
}
type FilterConfig ¶
type FilterConfig struct {
Selectors []Selector `yaml:"selectors"`
}
type NodeHandler ¶
type NodeHandler struct {
// contains filtered or unexported fields
}
NodeHandler determines how to process different types of nodes
func NewNodeHandler ¶
func NewNodeHandler(opts Options) *NodeHandler
NewNodeHandler creates a new NodeHandler with the given options
func (*NodeHandler) GetStrategy ¶
func (h *NodeHandler) GetStrategy(node *html.Node) NodeHandlingStrategy
GetStrategy returns the handling strategy for a given node
func (*NodeHandler) IsMarkdownable ¶
func (h *NodeHandler) IsMarkdownable(node *html.Node) bool
IsMarkdownable returns true if the node and all its children can be converted to markdown
func (*NodeHandler) IsTextOnly ¶
func (h *NodeHandler) IsTextOnly(node *html.Node) bool
IsTextOnly returns true if all children of the node can be converted to text
type NodeHandlingStrategy ¶
type NodeHandlingStrategy int
NodeHandlingStrategy defines how a node should be processed
const (
	// StrategyDefault processes the node normally, keeping its tag and attributes
	StrategyDefault NodeHandlingStrategy = iota
	// StrategyUnwrap removes the node but keeps its children
	StrategyUnwrap
	// StrategyFilter removes the node and all its children
	StrategyFilter
	// StrategyTextOnly converts the node and its children to text if possible
	StrategyTextOnly
	// StrategyMarkdown converts the node and its children to markdown if possible
	StrategyMarkdown
	// StrategyPreserveWhitespace keeps all whitespace in text nodes
	StrategyPreserveWhitespace
)
func (NodeHandlingStrategy) String ¶
func (s NodeHandlingStrategy) String() string
String returns a string representation of the strategy
type Selector ¶
type Selector struct {
	Type     string       `yaml:"type"`     // "css" or "xpath"
	Mode     SelectorMode `yaml:"mode"`     // "select" or "filter"
	Selector string       `yaml:"selector"` // The actual selector string
}
type SelectorMode ¶
type SelectorMode string
const (
	SelectorModeSelect SelectorMode = "select"
	SelectorModeFilter SelectorMode = "filter"
)
type Simplifier ¶
type Simplifier struct {
// contains filtered or unexported fields
}
Simplifier handles HTML simplification with configurable options
func NewSimplifier ¶
func NewSimplifier(opts Options) *Simplifier
NewSimplifier creates a new HTML simplifier with the given options
func (*Simplifier) ProcessHTML ¶
func (s *Simplifier) ProcessHTML(htmlContent string) ([]Document, error)
ProcessHTML simplifies the given HTML content according to the configured options
type TextSimplifier ¶
type TextSimplifier struct {
// contains filtered or unexported fields
}
TextSimplifier handles text-related simplification operations
func NewTextSimplifier ¶
func NewTextSimplifier(markdownEnabled bool) *TextSimplifier
NewTextSimplifier creates a new text simplifier
func (*TextSimplifier) ConvertToMarkdown ¶
func (t *TextSimplifier) ConvertToMarkdown(node *html.Node) (string, bool)
ConvertToMarkdown converts a node and its children to markdown format
func (*TextSimplifier) ExtractText ¶
func (t *TextSimplifier) ExtractText(node *html.Node) string
ExtractText extracts text from a node and its children, preserving whitespace if needed
func (*TextSimplifier) SimplifyText ¶
func (t *TextSimplifier) SimplifyText(node *html.Node) (string, bool)
SimplifyText attempts to convert a node and its children to a single text string