classifier

package
v0.0.14 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 4, 2026 License: MIT Imports: 12 Imported by: 0

Documentation

Overview

Package classifier implements form and field type classification.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ElemFeatures

func ElemFeatures(elem *goquery.Selection, form *goquery.Selection) map[string]any

ElemFeatures extracts per-field features for CRF classification.

func GetFormFeatures

func GetFormFeatures(form *goquery.Selection, formType string, fieldElems []*goquery.Selection) []map[string]any

GetFormFeatures extracts CRF feature sequences for a form.

Types

type ClassifyProbaResult

type ClassifyProbaResult struct {
	Form   map[string]float64            `json:"form"`
	Fields map[string]map[string]float64 `json:"fields,omitempty"`
}

ClassifyProbaResult holds probability-based classification results.

type ClassifyResult

type ClassifyResult struct {
	Form   string            `json:"form"`
	Fields map[string]string `json:"fields,omitempty"`
}

ClassifyResult holds the classification result for a form.

type FeaturePipeline

type FeaturePipeline struct {
	Name           string
	Extractor      FormFeatureExtractor
	VecType        string // "dict", "count", "tfidf"
	NgramRange     [2]int
	MinDF          int
	Binary         bool
	Analyzer       string
	StopWords      map[string]bool
	UseEnglishStop bool
}

FeaturePipeline describes a feature extraction + vectorization pipeline.

func DefaultFeaturePipelines

func DefaultFeaturePipelines() []FeaturePipeline

DefaultFeaturePipelines returns the 9 feature extraction pipelines matching Formasaurus's FEATURES list.

type FieldTypeModel

type FieldTypeModel struct {
	CRF *crf.Model
}

FieldTypeModel wraps a CRF model for field type classification.

func TrainFieldType

func TrainFieldType(sequences []crf.TrainingSequence, config crf.TrainerConfig) *FieldTypeModel

TrainFieldType trains a CRF model for field type classification.

func (*FieldTypeModel) Classify

func (m *FieldTypeModel) Classify(form *goquery.Selection, formType string) map[string]string

Classify returns field types for a form given the form type.

func (*FieldTypeModel) ClassifyProba

func (m *FieldTypeModel) ClassifyProba(form *goquery.Selection, formType string) map[string]map[string]float64

ClassifyProba returns field type probabilities for a form.

type FormCSS

type FormCSS struct{}

FormCSS extracts form CSS class and ID.

func (FormCSS) ExtractDict

func (f FormCSS) ExtractDict(form *goquery.Selection) map[string]any

func (FormCSS) ExtractString

func (f FormCSS) ExtractString(form *goquery.Selection) string

func (FormCSS) IsDict

func (f FormCSS) IsDict() bool

type FormElements

type FormElements struct{}

FormElements extracts structural boolean features from a form.

func (FormElements) ExtractDict

func (f FormElements) ExtractDict(form *goquery.Selection) map[string]any

func (FormElements) ExtractString

func (f FormElements) ExtractString(_ *goquery.Selection) string

func (FormElements) IsDict

func (f FormElements) IsDict() bool

type FormFeatureExtractor

type FormFeatureExtractor interface {
	ExtractString(form *goquery.Selection) string
	ExtractDict(form *goquery.Selection) map[string]any
	IsDict() bool
}

FormFeatureExtractor extracts features from a form element.

type FormFieldClassifier

type FormFieldClassifier struct {
	FormModel  *FormTypeModel
	FieldModel *FieldTypeModel
	PageModel  *PageTypeModel
}

FormFieldClassifier detects HTML form, field, and page types.

func LoadClassifier

func LoadClassifier(path string) (*FormFieldClassifier, error)

LoadClassifier loads a FormFieldClassifier from disk.

func (*FormFieldClassifier) Classify

func (c *FormFieldClassifier) Classify(form *goquery.Selection, fields bool) ClassifyResult

Classify returns the form type and field types.

func (*FormFieldClassifier) ClassifyPage added in v0.0.3

func (c *FormFieldClassifier) ClassifyPage(doc *goquery.Document) string

ClassifyPage classifies the page type using form results as features.

func (*FormFieldClassifier) ClassifyPageProba added in v0.0.3

func (c *FormFieldClassifier) ClassifyPageProba(doc *goquery.Document, threshold float64) map[string]float64

ClassifyPageProba returns page type probabilities.

func (*FormFieldClassifier) ClassifyProba

func (c *FormFieldClassifier) ClassifyProba(form *goquery.Selection, threshold float64, fields bool) ClassifyProbaResult

ClassifyProba returns probabilities for form and field types.

func (*FormFieldClassifier) ExtractForms

func (c *FormFieldClassifier) ExtractForms(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)

ExtractForms extracts and classifies all forms from HTML.

func (*FormFieldClassifier) ExtractFormsFromReader

func (c *FormFieldClassifier) ExtractFormsFromReader(r *strings.Reader, proba bool, threshold float64, classifyFields bool) ([]FormResult, error)

ExtractFormsFromReader extracts and classifies forms read from r. Despite the name, r must be a *strings.Reader; the signature does not accept an arbitrary io.Reader.

func (*FormFieldClassifier) ExtractPage added in v0.0.3

func (c *FormFieldClassifier) ExtractPage(htmlStr string, proba bool, threshold float64, classifyFields bool) ([]FormResult, ClassifyResult, ClassifyProbaResult, error)

ExtractPage classifies both the page type and forms from HTML.

func (*FormFieldClassifier) SaveModel

func (c *FormFieldClassifier) SaveModel(path string) error

SaveModel saves the classifier to disk.

type FormInputCSS

type FormInputCSS struct{}

FormInputCSS extracts CSS of non-hidden inputs.

func (FormInputCSS) ExtractDict

func (f FormInputCSS) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputCSS) ExtractString

func (f FormInputCSS) ExtractString(form *goquery.Selection) string

func (FormInputCSS) IsDict

func (f FormInputCSS) IsDict() bool

type FormInputNames

type FormInputNames struct{}

FormInputNames extracts names of non-hidden inputs.

func (FormInputNames) ExtractDict

func (f FormInputNames) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputNames) ExtractString

func (f FormInputNames) ExtractString(form *goquery.Selection) string

func (FormInputNames) IsDict

func (f FormInputNames) IsDict() bool

type FormInputTitle

type FormInputTitle struct{}

FormInputTitle extracts title attributes of non-hidden inputs.

func (FormInputTitle) ExtractDict

func (f FormInputTitle) ExtractDict(form *goquery.Selection) map[string]any

func (FormInputTitle) ExtractString

func (f FormInputTitle) ExtractString(form *goquery.Selection) string

func (FormInputTitle) IsDict

func (f FormInputTitle) IsDict() bool

type FormLabelText

type FormLabelText struct{}

FormLabelText extracts label text inside the form.

func (FormLabelText) ExtractDict

func (f FormLabelText) ExtractDict(form *goquery.Selection) map[string]any

func (FormLabelText) ExtractString

func (f FormLabelText) ExtractString(form *goquery.Selection) string

func (FormLabelText) IsDict

func (f FormLabelText) IsDict() bool

type FormLinksText

type FormLinksText struct{}

FormLinksText extracts link text inside the form.

func (FormLinksText) ExtractDict

func (f FormLinksText) ExtractDict(_ *goquery.Selection) map[string]any

func (FormLinksText) ExtractString

func (f FormLinksText) ExtractString(form *goquery.Selection) string

func (FormLinksText) IsDict

func (f FormLinksText) IsDict() bool

type FormResult

type FormResult struct {
	FormHTML string              `json:"form_html"`
	Result   ClassifyResult      `json:"result,omitempty"`
	Proba    ClassifyProbaResult `json:"proba,omitempty"`
}

FormResult holds the result for a single form.

type FormTypeModel

type FormTypeModel struct {
	Classes   []string             `json:"classes"`
	Coef      [][]float64          `json:"coef"`      // [numClasses][numFeatures]
	Intercept []float64            `json:"intercept"` // [numClasses]
	Pipelines []SerializedPipeline `json:"pipelines"`
	// contains filtered or unexported fields
}

FormTypeModel holds a trained form type classifier.

func TrainFormType

func TrainFormType(forms []*goquery.Selection, labels []string, config FormTypeTrainConfig) *FormTypeModel

TrainFormType trains a form type classifier.

func (*FormTypeModel) Classify

func (m *FormTypeModel) Classify(form *goquery.Selection) string

Classify returns the predicted form type.

func (*FormTypeModel) ClassifyProba

func (m *FormTypeModel) ClassifyProba(form *goquery.Selection) map[string]float64

ClassifyProba returns probabilities for each form type.

func (*FormTypeModel) InitRuntime

func (m *FormTypeModel) InitRuntime()

InitRuntime initializes runtime state from serialized pipelines.

type FormTypeSummaryExtractor added in v0.0.3

type FormTypeSummaryExtractor struct{}

FormTypeSummaryExtractor extracts features from form classification results.

func (FormTypeSummaryExtractor) ExtractDict added in v0.0.3

func (e FormTypeSummaryExtractor) ExtractDict(_ *goquery.Document, formResults []ClassifyResult) map[string]any

func (FormTypeSummaryExtractor) ExtractString added in v0.0.3

func (e FormTypeSummaryExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string

func (FormTypeSummaryExtractor) IsDict added in v0.0.3

func (e FormTypeSummaryExtractor) IsDict() bool

type FormTypeTrainConfig

type FormTypeTrainConfig struct {
	C       float64
	MaxIter int
	Verbose bool
}

FormTypeTrainConfig holds training configuration.

func DefaultFormTypeTrainConfig

func DefaultFormTypeTrainConfig() FormTypeTrainConfig

DefaultFormTypeTrainConfig returns default training config.

type FormURL

type FormURL struct{}

FormURL extracts the form action URL (normalized).

func (FormURL) ExtractDict

func (f FormURL) ExtractDict(form *goquery.Selection) map[string]any

func (FormURL) ExtractString

func (f FormURL) ExtractString(form *goquery.Selection) string

func (FormURL) IsDict

func (f FormURL) IsDict() bool

type PageBodyTextExtractor added in v0.0.3

type PageBodyTextExtractor struct{}

PageBodyTextExtractor extracts visible body text (first 2000 chars).

func (PageBodyTextExtractor) ExtractDict added in v0.0.3

func (e PageBodyTextExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageBodyTextExtractor) ExtractString added in v0.0.3

func (e PageBodyTextExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageBodyTextExtractor) IsDict added in v0.0.3

func (e PageBodyTextExtractor) IsDict() bool

type PageCSSExtractor added in v0.0.3

type PageCSSExtractor struct{}

PageCSSExtractor extracts body/main class and id attributes.

func (PageCSSExtractor) ExtractDict added in v0.0.3

func (e PageCSSExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageCSSExtractor) ExtractString added in v0.0.3

func (e PageCSSExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageCSSExtractor) IsDict added in v0.0.3

func (e PageCSSExtractor) IsDict() bool

type PageFeatureExtractor added in v0.0.3

type PageFeatureExtractor interface {
	ExtractString(doc *goquery.Document, formResults []ClassifyResult) string
	ExtractDict(doc *goquery.Document, formResults []ClassifyResult) map[string]any
	IsDict() bool
}

PageFeatureExtractor extracts features from a page document.

type PageFeaturePipeline added in v0.0.3

type PageFeaturePipeline struct {
	Name           string
	Extractor      PageFeatureExtractor
	VecType        string // "dict", "tfidf"
	NgramRange     [2]int
	MinDF          int
	Binary         bool
	Analyzer       string
	StopWords      map[string]bool
	UseEnglishStop bool
}

PageFeaturePipeline describes a page feature extraction + vectorization pipeline.

func DefaultPageFeaturePipelines added in v0.0.3

func DefaultPageFeaturePipelines() []PageFeaturePipeline

DefaultPageFeaturePipelines returns the 9 page feature extraction pipelines.

type PageH1Extractor added in v0.0.3

type PageH1Extractor struct{}

PageH1Extractor extracts <h1> text.

func (PageH1Extractor) ExtractDict added in v0.0.3

func (e PageH1Extractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageH1Extractor) ExtractString added in v0.0.3

func (e PageH1Extractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageH1Extractor) IsDict added in v0.0.3

func (e PageH1Extractor) IsDict() bool

type PageHeadingsExtractor added in v0.0.3

type PageHeadingsExtractor struct{}

PageHeadingsExtractor extracts all h1-h6 text concatenated.

func (PageHeadingsExtractor) ExtractDict added in v0.0.3

func (e PageHeadingsExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageHeadingsExtractor) ExtractString added in v0.0.3

func (e PageHeadingsExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageHeadingsExtractor) IsDict added in v0.0.3

func (e PageHeadingsExtractor) IsDict() bool

type PageMetaDescriptionExtractor added in v0.0.3

type PageMetaDescriptionExtractor struct{}

PageMetaDescriptionExtractor extracts <meta name="description"> content.

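func (PageMetaDescriptionExtractor) ExtractDict added in v0.0.3

func (e PageMetaDescriptionExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageMetaDescriptionExtractor) ExtractString added in v0.0.3

func (e PageMetaDescriptionExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageMetaDescriptionExtractor) IsDict added in v0.0.3

func (e PageMetaDescriptionExtractor) IsDict() bool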

type PageNavTextExtractor

type PageNavTextExtractor struct{}

PageNavTextExtractor extracts <nav> text.

func (PageNavTextExtractor) ExtractDict

func (e PageNavTextExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageNavTextExtractor) ExtractString

func (e PageNavTextExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageNavTextExtractor) IsDict

func (e PageNavTextExtractor) IsDict() bool

type PageStructureExtractor added in v0.0.3

type PageStructureExtractor struct{}

PageStructureExtractor extracts structural features + error indicators.

func (PageStructureExtractor) ExtractDict added in v0.0.3

func (e PageStructureExtractor) ExtractDict(doc *goquery.Document, _ []ClassifyResult) map[string]any

func (PageStructureExtractor) ExtractString added in v0.0.3

func (e PageStructureExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string

func (PageStructureExtractor) IsDict added in v0.0.3

func (e PageStructureExtractor) IsDict() bool

type PageTitleExtractor added in v0.0.3

type PageTitleExtractor struct{}

PageTitleExtractor extracts <title> text.

func (PageTitleExtractor) ExtractDict added in v0.0.3

func (e PageTitleExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageTitleExtractor) ExtractString added in v0.0.3

func (e PageTitleExtractor) ExtractString(doc *goquery.Document, _ []ClassifyResult) string

func (PageTitleExtractor) IsDict added in v0.0.3

func (e PageTitleExtractor) IsDict() bool

type PageTypeModel added in v0.0.3

type PageTypeModel struct {
	Classes   []string             `json:"classes"`
	Coef      [][]float64          `json:"coef"`
	Intercept []float64            `json:"intercept"`
	Pipelines []SerializedPipeline `json:"pipelines"`
	// contains filtered or unexported fields
}

PageTypeModel holds a trained page type classifier.

func TrainPageType added in v0.0.3

func TrainPageType(docs []*goquery.Document, formResults [][]ClassifyResult, urls []string, labels []string, config PageTypeTrainConfig) *PageTypeModel

TrainPageType trains a page type classifier.

func (*PageTypeModel) Classify added in v0.0.3

func (m *PageTypeModel) Classify(doc *goquery.Document, formResults []ClassifyResult) string

Classify returns the predicted page type.

func (*PageTypeModel) ClassifyProba added in v0.0.3

func (m *PageTypeModel) ClassifyProba(doc *goquery.Document, formResults []ClassifyResult) map[string]float64

ClassifyProba returns probabilities for each page type.

func (*PageTypeModel) InitRuntime added in v0.0.3

func (m *PageTypeModel) InitRuntime()

InitRuntime initializes runtime state from serialized pipelines.

type PageTypeTrainConfig added in v0.0.3

type PageTypeTrainConfig struct {
	C            float64
	MaxIter      int
	Verbose      bool
	BalanceClass bool // use balanced class weights
}

PageTypeTrainConfig holds training configuration for the page type model.

func DefaultPageTypeTrainConfig added in v0.0.3

func DefaultPageTypeTrainConfig() PageTypeTrainConfig

DefaultPageTypeTrainConfig returns default training config.

type PageURLExtractor added in v0.0.3

type PageURLExtractor struct {
	URL string // set per-document before extraction
}

PageURLExtractor extracts URL path patterns.

func (PageURLExtractor) ExtractDict added in v0.0.3

func (e PageURLExtractor) ExtractDict(_ *goquery.Document, _ []ClassifyResult) map[string]any

func (PageURLExtractor) ExtractString added in v0.0.3

func (e PageURLExtractor) ExtractString(_ *goquery.Document, _ []ClassifyResult) string

func (PageURLExtractor) IsDict added in v0.0.3

func (e PageURLExtractor) IsDict() bool

type SerializedPipeline

type SerializedPipeline struct {
	Name          string                      `json:"name"`
	ExtractorType string                      `json:"extractor_type"`
	VecType       string                      `json:"vec_type"`
	DictVec       *vectorizer.DictVectorizer  `json:"dict_vec,omitempty"`
	CountVec      *vectorizer.CountVectorizer `json:"count_vec,omitempty"`
	TfidfVec      *vectorizer.TfidfVectorizer `json:"tfidf_vec,omitempty"`
}

SerializedPipeline holds the serialized state of a feature pipeline.

type SubmitText

type SubmitText struct{}

SubmitText extracts submit button text.

func (SubmitText) ExtractDict

func (f SubmitText) ExtractDict(_ *goquery.Selection) map[string]any

func (SubmitText) ExtractString

func (f SubmitText) ExtractString(form *goquery.Selection) string

func (SubmitText) IsDict

func (f SubmitText) IsDict() bool

type UnifiedModel

type UnifiedModel struct {
	FormModel  *FormTypeModel `json:"form_model"`
	FieldModel *crf.Model     `json:"field_model"`
	PageModel  *PageTypeModel `json:"page_model"`
}

UnifiedModel holds form, field, and page models for serialization.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL