html

package
v0.62.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 30, 2024 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
// HTMLParseEndErrorMsg is the message text announcing that the HTML parser
// is exiting because it received a stop command.
// NOTE(review): presumably this is what HTMLParseEndError.Error returns — confirm.
const HTMLParseEndErrorMsg = "received stop command, exiting HTML parser"

Variables

View Source
// ProcessHTML is the model.ProcessFunc for HTML input: it parses the markup
// read from reader and builds a model.Result carrying the document's tags,
// title and content hash.
//
// The reader is always closed before returning. When c.FullSite is set and
// c.Source is non-empty the input is handed to a web crawler; otherwise it is
// parsed as a single document. If c.TagWeights is nil it is filled in with a
// private copy of the package defaults, and c.ExtraTagWeights is merged on
// top of it, so c is modified by this call.
//
// It returns model.ErrResult(err) if the crawler cannot be created,
// model.EmptyResult() when nothing was parsed, and a populated result
// otherwise.
var ProcessHTML model.ProcessFunc = func(c *config.Config, reader io.ReadCloser) *model.Result {
	defer reader.Close()

	if c.Verbose {
		fmt.Println("--> parsing HTML...")
	}

	exts := extHTML(c.Extensions)

	// Copy the defaults instead of aliasing them: the merge below writes into
	// c.TagWeights, and writing into the shared defaultTagWeights map would
	// leak one call's extra weights into every later call (and race when
	// calls run concurrently).
	if c.TagWeights == nil {
		c.TagWeights = cloneTagWeights(defaultTagWeights)
	}
	// Ranging over a nil map is a no-op, so no nil check is needed.
	for k, v := range c.ExtraTagWeights {
		c.TagWeights[k] = v
	}

	var parseFn parseFunc = ParseHTML
	var contents *HTMLContents

	if c.FullSite && c.Source != "" {
		crawler, err := newWebCrawler(parseFn, exts, c.Source, c.Verbose)
		if err != nil {
			return model.ErrResult(err)
		}
		contents = crawler.run(reader)
	} else {
		contents = parseFn(reader, c, exts, nil)
	}

	if c.Verbose {
		fmt.Printf("--> parsed: %s\n", contents)
	}

	// NOTE(review): whether the parser or crawler can hand back nil is not
	// visible here; nil is treated as "nothing parsed" rather than
	// dereferenced.
	if contents == nil || len(contents.lines) == 0 {
		return model.EmptyResult()
	}

	tags, title := tagifyHTML(contents, c, exts)

	return &model.Result{
		Meta: &model.Meta{
			ContentType: config.HTML,
			DocTitle:    title,
			DocHash:     fmt.Sprintf("%x", contents.hash()),
			Lang:        c.Lang,
		},
		RawTags:    tags,
		Extensions: extension.MapResults(c.Extensions),
	}
}

// cloneTagWeights returns a fresh, non-nil map holding the entries of src, so
// the result can be modified without touching src.
func cloneTagWeights[M ~map[K]V, K comparable, V any](src M) M {
	dst := make(M, len(src))
	for k, v := range src {
		dst[k] = v
	}
	return dst
}

ParseHTML receives lines of raw HTML markup text from the Web and returns simple text, plus a list of prioritised tags (if tagify == true) based on the importance of the HTML tags that wrap the sentences.

Example:

<h1>A story about foo
<p> Foo was a good guy but, had a quite poor time management skills,
therefore he had issues with shipping all his tasks. Though foo had heaps
of other amazing skills, which gained him a fortune.

Result:

foo: 2 + 1 = 3, story: 2, management: 1 + 1 = 2, skills: 1 + 1 = 2.

Functions

This section is empty.

Types

type HTMLContents added in v0.50.0

type HTMLContents struct {
	// contains filtered or unexported fields
}

HTMLContents stores text from target tags.

func ParseHTML

func ParseHTML(reader io.Reader, cfg *config.Config, exts []HTMLExt, c *webCrawler) *HTMLContents

func (*HTMLContents) Append added in v0.50.0

func (cnt *HTMLContents) Append(lineIndex int, tag string, data []byte)

func (*HTMLContents) Last added in v0.59.0

func (cnt *HTMLContents) Last() *HTMLLine

func (*HTMLContents) Len added in v0.50.0

func (cnt *HTMLContents) Len() int

func (*HTMLContents) String added in v0.50.0

func (cnt *HTMLContents) String() string

func (*HTMLContents) Weigh added in v0.50.0

func (cnt *HTMLContents) Weigh(lineIndex int, weight float64)

type HTMLExt added in v0.50.0

type HTMLExt interface {
	// Embedding extension.Extension gives HTMLExt the same method set;
	// no HTML-specific methods are added at this level.
	extension.Extension
}

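HTMLExt is the base interface for HTML extensions; it embeds extension.Extension and is in turn embedded by HTMLExtParseTag, HTMLExtParseText and HTMLExtTagify.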

type HTMLExtParseTag added in v0.50.0

type HTMLExtParseTag interface {
	HTMLExt

	// ParseTag is given the config, the current HTML token, a line index and
	// the HTMLContents. It returns true if the contents have been appended
	// to, and false otherwise.
	// NOTE(review): how a non-nil error affects the rest of parsing is not
	// visible here — confirm against ParseHTML.
	ParseTag(cfg *config.Config, token *html.Token, lineIdx int, cnts *HTMLContents) (bool, error)
}

HTMLExtParseTag is executed during the HTML parsing phase, when the parser is dealing with an HTML tag.

type HTMLExtParseText added in v0.50.0

type HTMLExtParseText interface {
	HTMLExt

	// ParseText is given the config, a tag name, a piece of text and a line
	// index, and reports failure through the returned error.
	// NOTE(review): presumably tagName is the tag enclosing text and lineIdx
	// is the line the text belongs to — confirm against ParseHTML.
	ParseText(cfg *config.Config, tagName, text string, lineIdx int) error
}

HTMLExtParseText is executed during the HTML parsing phase, when the parser is dealing with the text inside an HTML tag.

type HTMLExtTagify added in v0.50.0

type HTMLExtTagify interface {
	HTMLExt

	// Tagify is given the config, a single HTMLLine and tokenIndex, a map of
	// *model.Tag keyed by string, and reports failure through the returned
	// error.
	// NOTE(review): presumably implementations read or update tokenIndex in
	// place for the given line — confirm against tagifyHTML.
	Tagify(cfg *config.Config, line *HTMLLine, tokenIndex map[string]*model.Tag) error
}

HTMLExtTagify is executed during the token counting phase.

type HTMLLine added in v0.50.0

type HTMLLine struct {
	// contains filtered or unexported fields
}

func (*HTMLLine) String added in v0.50.0

func (l *HTMLLine) String() string

type HTMLParseEndError added in v0.50.0

// HTMLParseEndError is an error type with no fields.
// NOTE(review): presumably it signals a deliberate stop of the HTML parser,
// with HTMLParseEndErrorMsg as its message — confirm against its Error method.
type HTMLParseEndError struct {
}

func NewHTMLParseEndError added in v0.50.0

func NewHTMLParseEndError() *HTMLParseEndError

func (*HTMLParseEndError) Error added in v0.50.0

func (e *HTMLParseEndError) Error() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL