scoring

package
v0.6.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 25, 2026 License: MIT Imports: 9 Imported by: 0

Documentation

Overview

Package scoring provides content scoring functionality for the defuddle content extraction system. It implements algorithms to score DOM elements based on content quality and relevance.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func FindBestElement

func FindBestElement(elements []*goquery.Selection, minScore float64) *goquery.Selection

FindBestElement finds the best-scoring element from a list. JavaScript original code:

/**
 * Picks the element with the highest score from `elements`.
 *
 * Each candidate is scored with `scoreElement`; the running best starts at 0,
 * so a candidate must score strictly above 0 to be considered at all.
 *
 * @param elements Candidates to score.
 * @param minScore Threshold the winning score must strictly exceed.
 * @returns The top-scoring element, or `null` when the best score does not
 *          exceed `minScore`.
 */
static findBestElement(elements: Element[], minScore: number = 50): Element | null {
	let winner: Element | null = null;
	let topScore = 0;

	for (const candidate of elements) {
		const candidateScore = this.scoreElement(candidate);
		// Strict comparison: on a tie the earliest candidate keeps the lead.
		if (candidateScore > topScore) {
			topScore = candidateScore;
			winner = candidate;
		}
	}

	if (topScore > minScore) {
		return winner;
	}
	return null;
}

func IsProtectedNode added in v0.4.0

func IsProtectedNode(el *goquery.Selection, mainContent *goquery.Selection) bool

IsProtectedNode returns true if el should never be removed:

  • el is an ancestor of mainContent (removing it would destroy the content)
  • el is inside a code block (pre or code)

func NodeContains

func NodeContains(ancestor, descendant *goquery.Selection) bool

NodeContains returns true if ancestor contains descendant in the DOM tree.

func ScoreAndRemove

func ScoreAndRemove(doc *goquery.Document, debug bool, mainContent *goquery.Selection)

ScoreAndRemove scores blocks and removes those that are likely not content. JavaScript original code:

/**
 * Scores every block-level element in `doc` and removes those whose
 * non-content score is negative.
 *
 * Elements are first collected into a set and only detached from the document
 * once the scan is finished, so the scan itself never walks a mutated tree.
 *
 * @param doc   Document to clean; it is modified in place.
 * @param debug When true, logs the number of removed blocks and the elapsed time.
 */
public static scoreAndRemove(doc: Document, debug: boolean = false) {
	const startedAt = Date.now();
	const pendingRemoval = new Set<Element>();
	let removedCount = 0;

	const blockSelector = BLOCK_ELEMENTS.join(',');
	const candidates = Array.from(doc.querySelectorAll(blockSelector));

	for (const block of candidates) {
		// Nothing to do for blocks already queued for removal.
		if (pendingRemoval.has(block)) {
			continue;
		}

		// Blocks that look like real content are kept without scoring.
		if (ContentScorer.isLikelyContent(block)) {
			continue;
		}

		// A negative non-content score queues the block for removal.
		const blockScore = ContentScorer.scoreNonContentBlock(block);
		if (blockScore < 0) {
			pendingRemoval.add(block);
			removedCount++;
		}
	}

	// Detach everything that was queued, in one pass.
	for (const block of pendingRemoval) {
		block.remove();
	}

	const finishedAt = Date.now();
	if (!debug) {
		return;
	}
	console.log('Defuddle', 'Removed non-content blocks:', {
		count: removedCount,
		processingTime: `${(finishedAt - startedAt).toFixed(2)}ms`
	});
}

func ScoreElement

func ScoreElement(element *goquery.Selection) float64

ScoreElement scores an element based on various content indicators. JavaScript original code:

/**
 * Scores an element using heuristics that suggest it holds main content.
 *
 * Bonuses: +1 per word, +10 per descendant `<p>`, +5 for right float or
 * right alignment, +10 for a date-like string, +10 for a byline-like string,
 * +15 when the class attribute mentions content/article/post, +10 for inline
 * footnote references, +10 for a footnote list, and +10 for an interior `<td>`
 * of a table that looks like a layout table.
 * Penalties: link density x 5, image density x 3, and 5 per descendant table.
 *
 * @param element Element to score.
 * @returns The accumulated score; it may be fractional or negative.
 */
static scoreElement(element: Element): number {
	let score = 0;

	// Text density: one point per word. Trim first so leading/trailing
	// whitespace does not produce empty tokens, and report 0 words for empty
	// text (splitting '' would yield [''], i.e. a count of 1).
	const text = element.textContent || '';
	const trimmedText = text.trim();
	const words = trimmedText === '' ? 0 : trimmedText.split(/\s+/).length;
	score += words;

	// Paragraph ratio
	const paragraphs = element.getElementsByTagName('p').length;
	score += paragraphs * 10;

	// Link density (penalize high link density); `|| 1` avoids dividing by zero
	const links = element.getElementsByTagName('a').length;
	const linkDensity = links / (words || 1);
	score -= linkDensity * 5;

	// Image ratio (penalize high image density)
	const images = element.getElementsByTagName('img').length;
	const imageDensity = images / (words || 1);
	score -= imageDensity * 3;

	// Position bonus for right-floated or right-aligned elements
	try {
		const style = element.getAttribute('style') || '';
		const align = element.getAttribute('align') || '';
		const isRightSide = style.includes('float: right') ||
			style.includes('text-align: right') ||
			align === 'right';
		if (isRightSide) score += 5;
	} catch (e) {
		// Ignore position if we can't get style
	}

	// Content indicators
	const hasDate = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b/i.test(text);
	if (hasDate) score += 10;

	const hasAuthor = /\b(?:by|written by|author:)\s+[A-Za-z\s]+\b/i.test(text);
	if (hasAuthor) score += 10;

	// Check for common content classes/attributes. Read the attribute rather
	// than `className`: on SVG elements `className` is an SVGAnimatedString,
	// which has no `toLowerCase` and would throw.
	const className = (element.getAttribute('class') || '').toLowerCase();
	if (className.includes('content') || className.includes('article') || className.includes('post')) {
		score += 15;
	}

	// Check for footnotes/references
	const hasFootnotes = element.querySelector(FOOTNOTE_INLINE_REFERENCES);
	if (hasFootnotes) score += 10;

	const hasFootnotesList = element.querySelector(FOOTNOTE_LIST_SELECTORS);
	if (hasFootnotesList) score += 10;

	// Check for nested tables (penalize)
	const nestedTables = element.getElementsByTagName('table').length;
	score -= nestedTables * 5;

	// Additional scoring for table cells
	if (element.tagName.toLowerCase() === 'td') {
		// Table cells get a bonus for being in the main content area
		const parentTable = element.closest('table');
		if (parentTable) {
			// Only favor cells in tables that look like old-style content layouts.
			// Explicit radix keeps the width parse decimal; a missing or
			// non-numeric width yields 0/NaN and simply fails the comparison.
			const tableWidth = parseInt(parentTable.getAttribute('width') || '0', 10);
			const tableAlign = parentTable.getAttribute('align') || '';
			const tableClass = (parentTable.getAttribute('class') || '').toLowerCase();
			const isTableLayout =
				tableWidth > 400 || // Common width for main content tables
				tableAlign === 'center' ||
				tableClass.includes('content') ||
				tableClass.includes('article');

			if (isTableLayout) {
				// Additional checks to ensure this is likely the main content cell:
				// it must be neither the first nor the last cell of the table.
				const allCells = Array.from(parentTable.getElementsByTagName('td'));
				const cellIndex = allCells.indexOf(element as HTMLTableCellElement);
				const isCenterCell = cellIndex > 0 && cellIndex < allCells.length - 1;

				if (isCenterCell) {
					score += 10;
				}
			}
		}
	}

	return score;
}

Types

type ContentScore

type ContentScore struct {
	// Score is the numeric score assigned to Element.
	Score   float64
	// Element is the selection that was scored.
	Element *goquery.Selection
}

ContentScore represents a scored element. JavaScript original code:

/** An element paired with the score computed for it. */
export interface ContentScore {
  /** Numeric score assigned to `element`. */
  score: number;
  /** The element that was scored. */
  element: Element;
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL