web

package
v0.0.34 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 12, 2026 License: Apache-2.0 Imports: 17 Imported by: 3

README

web

URL manipulation, text normalization, media type detection, binary file fetching, and web search abstractions for web crawling and content processing. Provides utilities for normalizing URLs, resolving relative links, cleaning text, identifying media files, downloading binary content, and implementing search functionality.

Usage Examples

URL Normalization
package main

import (
	"fmt"
	"log"

	"github.com/deepnoodle-ai/wonton/web"
)

// main normalizes a handful of representative inputs and prints each result.
// Every call's error is checked, so a failed normalization stops the program
// instead of dereferencing a nil *url.URL.
func main() {
	inputs := []string{
		"example.com/path?query=1#fragment", // adds https://, drops query and fragment
		"http://example.com",                // http:// is converted to https://
		"https://example.com/path",          // already normalized
		"  example.com  ",                   // surrounding whitespace is trimmed
	}

	for _, in := range inputs {
		u, err := web.NormalizeURL(in)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(u.String())
	}
	// Output:
	// https://example.com/path
	// https://example.com
	// https://example.com/path
	// https://example.com
}
// resolveLinks prints one line per entry in links: the link followed by the
// result of resolving it against baseDomain, or "(invalid)" when
// web.ResolveLink rejects it.
func resolveLinks(baseDomain string, links []string) {
	for i := range links {
		target, valid := web.ResolveLink(baseDomain, links[i])
		if !valid {
			fmt.Printf("%s -> (invalid)\n", links[i])
			continue
		}
		fmt.Printf("%s -> %s\n", links[i], target)
	}
}

// main resolves a mix of relative, absolute, and non-HTTP links against a
// single base and prints the outcome of each.
func main() {
	const base = "https://example.com/blog"

	candidates := []string{
		"/about",                  // Absolute path
		"post/123",                // Relative path
		"../contact",              // Parent path
		"https://other.com/page",  // Absolute URL
		"mailto:test@example.com", // Non-HTTP (rejected)
		"#section",                // Fragment (removed)
	}

	resolveLinks(base, candidates)
	// Output:
	// /about -> https://example.com/about
	// post/123 -> https://example.com/blog/post/123
	// ../contact -> https://example.com/contact
	// https://other.com/page -> https://other.com/page
	// mailto:test@example.com -> (invalid)
	// #section -> https://example.com/blog
}
Host Comparison
// compareHosts contrasts exact host equality (AreSameHost) with
// registrable-domain equality (AreRelatedHosts) on four sample URLs.
// Normalization errors are discarded with _ in this example.
func compareHosts() {
	first, _ := web.NormalizeURL("https://example.com/path1")
	sameHost, _ := web.NormalizeURL("https://example.com/path2")
	subdomain, _ := web.NormalizeURL("https://sub.example.com/path")
	unrelated, _ := web.NormalizeURL("https://other.com/path")

	// Exact host match.
	sameAsSibling := web.AreSameHost(first, sameHost)
	fmt.Println(sameAsSibling)
	// Output: true

	sameAsSubdomain := web.AreSameHost(first, subdomain)
	fmt.Println(sameAsSubdomain)
	// Output: false

	// Shared domain, regardless of subdomain.
	relatedToSubdomain := web.AreRelatedHosts(first, subdomain)
	fmt.Println(relatedToSubdomain)
	// Output: true (both *.example.com)

	relatedToOther := web.AreRelatedHosts(first, unrelated)
	fmt.Println(relatedToOther)
	// Output: false (different domains)
}
Sorting URLs
// sortURLs normalizes each input, drops the ones that fail to normalize,
// sorts the remainder with web.SortURLs, and prints them one per line.
func sortURLs(urls []string) {
	var normalized []*url.URL
	for i := range urls {
		u, err := web.NormalizeURL(urls[i])
		if err != nil {
			continue // skip inputs that cannot be normalized
		}
		normalized = append(normalized, u)
	}

	// In-place alphabetical sort.
	web.SortURLs(normalized)

	for i := range normalized {
		fmt.Println(normalized[i].String())
	}
}

// main feeds three unsorted URLs to sortURLs.
func main() {
	sortURLs([]string{
		"example.com/zebra",
		"example.com/alpha",
		"example.com/beta",
	})
	// Output:
	// https://example.com/alpha
	// https://example.com/beta
	// https://example.com/zebra
}
Text Normalization
// cleanText returns input after it has been passed through
// web.NormalizeText.
func cleanText(input string) string {
	cleaned := web.NormalizeText(input)
	return cleaned
}

// main walks through the transformations applied by cleanText.
func main() {
	// Trim whitespace
	fmt.Println(cleanText("  Hello  "))
	// Output: Hello

	// Unescape HTML entities: the inputs carry the escaped form so the
	// unescaping step is actually exercised.
	fmt.Println(cleanText("Hello &amp; goodbye"))
	// Output: Hello & goodbye

	fmt.Println(cleanText("&lt;div&gt;"))
	// Output: <div>

	// Non-printable characters are each replaced with a space
	fmt.Println(cleanText("Hello\x00\x01World"))
	// Output: Hello  World

	// Combined transformations
	fmt.Println(cleanText("  &quot;Hello&quot; \x00"))
	// Output: "Hello"
}
Checking Punctuation
// checkPunctuation prints the result of web.EndsWithPunctuation for a few
// sample strings, including the empty string.
func checkPunctuation() {
	samples := []string{
		"Hello.", // true
		"Hello?", // true
		"Hello",  // false
		"Hello!", // true
		"",       // false
	}
	for _, s := range samples {
		fmt.Println(web.EndsWithPunctuation(s))
	}
	// Output:
	// true
	// true
	// false
	// true
	// false
}
Media URL Detection
// filterMediaURLs normalizes each input URL, splits the results into page
// URLs and media URLs using web.IsMediaURL, and prints both groups. Inputs
// that fail normalization are skipped.
func filterMediaURLs(urls []string) {
	var media, pages []string

	for i := range urls {
		u, err := web.NormalizeURL(urls[i])
		if err != nil {
			continue
		}

		if !web.IsMediaURL(u) {
			pages = append(pages, u.String())
			continue
		}
		media = append(media, u.String())
	}

	fmt.Println("Page URLs:")
	for i := range pages {
		fmt.Printf("  %s\n", pages[i])
	}

	fmt.Println("\nMedia URLs:")
	for i := range media {
		fmt.Printf("  %s\n", media[i])
	}
}

// main classifies a small mixed list of page and media URLs.
func main() {
	filterMediaURLs([]string{
		"example.com/page.html",
		"example.com/image.jpg",
		"example.com/document.pdf",
		"example.com/video.mp4",
		"example.com/about",
	})
	// Output:
	// Page URLs:
	//   https://example.com/page.html
	//   https://example.com/about
	//
	// Media URLs:
	//   https://example.com/image.jpg
	//   https://example.com/document.pdf
	//   https://example.com/video.mp4
}
Web Crawler URL Processing
// Crawler collects the links accepted by AddLink for a single base domain.
// The zero value is usable: the visited set is allocated on first use.
type Crawler struct {
	baseDomain string          // base that links are resolved and host-checked against
	visited    map[string]bool // normalized URLs already accepted by AddLink
	queue      []string        // accepted normalized URLs, in insertion order
}

// AddLink resolves link against the crawler's base domain and appends the
// normalized URL to the queue when it is valid, is not a media file, is on
// the same host as the base domain, and has not been added before. Links
// that fail any of these checks are dropped without error.
func (c *Crawler) AddLink(link string) {
	// Resolve relative link
	resolved, ok := web.ResolveLink(c.baseDomain, link)
	if !ok {
		return
	}

	// Parse normalized URL
	u, err := web.NormalizeURL(resolved)
	if err != nil {
		return
	}

	// Skip media files
	if web.IsMediaURL(u) {
		return
	}

	// Check if same host; an unusable base domain can never match.
	baseURL, err := web.NormalizeURL(c.baseDomain)
	if err != nil {
		return
	}
	if !web.AreSameHost(u, baseURL) {
		return
	}

	// Add to queue if not visited
	key := u.String()
	if c.visited[key] {
		return
	}
	if c.visited == nil {
		// Writing to a nil map panics, so allocate lazily.
		c.visited = make(map[string]bool)
	}
	c.visited[key] = true
	c.queue = append(c.queue, key)
}
// extractPageLinks resolves every entry of htmlLinks against baseURL,
// normalizes it, and returns the normalized URLs that are not media files.
// Links that cannot be resolved or normalized are skipped.
func extractPageLinks(baseURL string, htmlLinks []string) []string {
	var kept []string

	for i := range htmlLinks {
		absolute, ok := web.ResolveLink(baseURL, htmlLinks[i])
		if !ok {
			continue
		}

		// The media check only runs when normalization succeeded.
		u, err := web.NormalizeURL(absolute)
		if err != nil || web.IsMediaURL(u) {
			continue
		}

		kept = append(kept, u.String())
	}

	return kept
}
HTML Text Cleaning
// Patterns used by extractCleanText, compiled once rather than on every call.
var (
	htmlTagPattern       = regexp.MustCompile(`<[^>]*>`)
	whitespaceRunPattern = regexp.MustCompile(`\s+`)
)

// extractCleanText strips HTML tags from htmlText and returns the remaining
// text after normalizing it with web.NormalizeText.
//
// Each tag is replaced with a space rather than the empty string, so text
// from adjacent elements (for example "</p><div>") does not run together.
// Runs of whitespace are then collapsed to a single space. The regex-based
// tag stripping is a simplification; use a proper HTML parser in production.
func extractCleanText(htmlText string) string {
	text := htmlTagPattern.ReplaceAllString(htmlText, " ")
	text = whitespaceRunPattern.ReplaceAllString(text, " ")

	// Trims the leading/trailing space left by the outermost tags and
	// unescapes entities such as &amp;.
	return web.NormalizeText(text)
}

// main runs extractCleanText on a two-element HTML fragment.
func main() {
	const markup = `<p>Hello &amp; welcome!</p><div>This is a test.</div>`
	cleaned := extractCleanText(markup)
	fmt.Println(cleaned)
	// Output: Hello & welcome! This is a test.
}
Deduplicate URLs
// deduplicateURLs returns the normalized form of each input URL, keeping
// only the first occurrence of every normalized value and preserving input
// order. Inputs that fail normalization are skipped.
func deduplicateURLs(urls []string) []string {
	var unique []string
	seen := map[string]bool{}

	for i := range urls {
		// Compare normalized forms so equivalent spellings collapse.
		u, err := web.NormalizeURL(urls[i])
		if err != nil {
			continue
		}

		key := u.String()
		if seen[key] {
			continue
		}
		seen[key] = true
		unique = append(unique, key)
	}

	return unique
}

// main deduplicates five spellings that all normalize to the same URL.
func main() {
	inputs := []string{
		"example.com",
		"http://example.com",
		"https://example.com",
		"example.com?foo=bar",
		"example.com#section",
	}

	for _, unique := range deduplicateURLs(inputs) {
		fmt.Println(unique)
	}
	// Output:
	// https://example.com
}
Binary File Fetching
// downloadFile shows three ways of calling FetchBinary: into memory, to an
// explicit file path, and into a directory with a size limit and MIME type
// verification. Any failure terminates the program via log.Fatal.
func downloadFile() {
	fetcher := web.NewDefaultBinaryFetcher()
	ctx := context.Background()

	// No OutputPath: the file contents are returned in inMemory.Data.
	inMemory, err := fetcher.FetchBinary(ctx, &web.BinaryFetchInput{
		URL: "https://example.com/document.pdf",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Downloaded %d bytes: %s\n", inMemory.Size, inMemory.Filename)

	// OutputPath names a file; CreateDirs makes missing parent directories.
	onDisk, err := fetcher.FetchBinary(ctx, &web.BinaryFetchInput{
		URL:        "https://example.com/image.png",
		OutputPath: "/tmp/downloads/image.png",
		CreateDirs: true,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Saved to: %s\n", onDisk.DownloadPath)

	// Size limit plus MIME verification; only the error is inspected.
	_, err = fetcher.FetchBinary(ctx, &web.BinaryFetchInput{
		URL:            "https://example.com/file.pdf",
		OutputPath:     "/tmp/downloads/", // Directory - filename from response
		MaxSizeBytes:   10 * 1024 * 1024,  // 10MB limit
		ExpectedType:   "application/pdf",
		VerifyMimeType: true,
	})
	if err != nil {
		log.Fatal(err)
	}
}
Error Handling
// fetchWithRetry fetches url once and returns the fetch error, if any. When
// the error is a *web.FetchError it prints the HTTP status and, for
// recoverable errors, a notice that a retry would follow. The retry itself
// is left as a placeholder.
func fetchWithRetry(url string) error {
	fetcher := web.NewDefaultBinaryFetcher()

	input := &web.BinaryFetchInput{URL: url}
	_, err := fetcher.FetchBinary(context.Background(), input)
	if err == nil {
		return nil
	}

	// Only errors carrying an HTTP status code get the extra reporting.
	var fetchErr *web.FetchError
	if !errors.As(err, &fetchErr) {
		return err
	}
	fmt.Printf("HTTP %d: %s\n", fetchErr.StatusCode, fetchErr.Error())

	if fetchErr.IsRecoverable() {
		fmt.Println("Error is recoverable, will retry...")
		// Retry logic here
	}
	return err
}
Implementing a Search Provider
// MySearcher is a sample implementation of the web.Searcher interface for a
// custom search backend.
type MySearcher struct {
	apiKey string
}

// Search returns two fixed results; a real implementation would call the
// search API here. The ctx and input arguments are not used by this stub.
func (s *MySearcher) Search(ctx context.Context, input *web.SearchInput) (*web.SearchOutput, error) {
	// Call your search API
	// ...

	first := &web.SearchItem{
		URL:         "https://example.com/result1",
		Title:       "First Result",
		Description: "Description of the first result",
	}
	second := &web.SearchItem{
		URL:         "https://example.com/result2",
		Title:       "Second Result",
		Description: "Description of the second result",
	}

	out := &web.SearchOutput{Items: []*web.SearchItem{first, second}}
	return out, nil
}

// searchExample runs one query through MySearcher and prints the title and
// URL of every returned item. A search failure terminates the program.
func searchExample() {
	searcher := &MySearcher{apiKey: "..."}
	query := &web.SearchInput{
		Query: "golang web scraping",
		Limit: 10,
	}

	results, err := searcher.Search(context.Background(), query)
	if err != nil {
		log.Fatal(err)
	}

	for i := range results.Items {
		hit := results.Items[i]
		fmt.Printf("%s: %s\n", hit.Title, hit.URL)
	}
}

API Reference

URL Functions
Function Description Inputs Outputs
NormalizeURL Normalizes URL with transformations value string *url.URL, error
ResolveLink Resolves relative URL against base domain, value string string, bool
AreSameHost Checks if URLs have same host url1, url2 *url.URL bool
AreRelatedHosts Checks if URLs share domain url1, url2 *url.URL bool
SortURLs Sorts URLs alphabetically urls []*url.URL none (in-place)
IsMediaURL Checks if URL points to media file u *url.URL bool
IsMediaExtension Checks if extension is a media type ext string bool
Text Functions
Function Description Inputs Outputs
NormalizeText Cleans and normalizes text text string string
EndsWithPunctuation Checks if string ends with punctuation s string bool
Binary Fetcher Types
Type Description
BinaryFetcher Interface for fetching binary files from URLs
DefaultBinaryFetcher Standard implementation with timeouts and security features
BinaryFetchInput Configuration: URL, headers, output path, size limits, MIME verification
BinaryFetchResult Result containing filename, size, content type, and data or path
Error Types
Type Description
FetchError HTTP fetch error with status code and recoverability check
NewFetchError(code, err) Create a new FetchError
(*FetchError).IsRecoverable() Returns true for 429, 500, 502, 503, 504 status codes
Search Types
Type Description
Searcher Interface for web search implementations
SearchInput Search query and limit
SearchOutput Container for search results
SearchItem Individual result: URL, title, description, icon, image

URL Normalization Behavior

NormalizeURL applies these transformations:

  1. Trims whitespace
  2. Adds https:// prefix if missing
  3. Converts http:// to https://
  4. Removes query parameters
  5. Removes URL fragments
  6. Removes trailing / if path is just /

Examples:

  • "example.com" → "https://example.com"
  • "http://example.com" → "https://example.com"
  • "example.com/path?q=1#frag" → "https://example.com/path"
  • " example.com/ " → "https://example.com"

Text Normalization Behavior

NormalizeText applies these transformations:

  1. Trims whitespace
  2. Unescapes HTML entities (&amp; → &)
  3. Replaces non-printable characters with spaces

Examples:

  • " text " → "text"
  • "Hello &amp; goodbye" → "Hello & goodbye"
  • "&lt;div&gt;" → "<div>"
  • "text\x00\x01here" → "text  here"

Supported Media Extensions

IsMediaURL and IsMediaExtension recognize these common file types:

  • Images: .jpg, .jpeg, .png, .gif, .svg, .webp, .bmp, .ico, .tiff
  • Documents: .pdf, .doc, .docx, .xls, .xlsx, .ppt, .pptx
  • Video: .mp4, .avi, .mov, .wmv, .flv, .mkv, .m4v
  • Audio: .mp3, .wav, .aac, .ogg, .flac, .m4a
  • Archives: .zip, .tar, .gz, .rar, .7z, .iso
  • Fonts: .ttf, .otf, .woff, .woff2, .eot
  • Executables: .exe, .dmg, .apk, .deb, .rpm, .msi, .bin, .pkg
  • Other: .css, .torrent

Related Packages

  • crawler - Web crawler that uses these utilities
  • fetch - HTTP page fetching
  • htmlparse - HTML parsing and link extraction
  • htmltomd - HTML to Markdown conversion

Documentation

Overview

Package web provides URL manipulation, text normalization, media detection, binary file fetching, and web search abstractions for web crawling and content processing.

URL Operations

  • NormalizeURL: Parse and standardize URLs (add https://, remove query params)
  • ResolveLink: Resolve relative URLs against a base domain
  • AreSameHost: Compare if two URLs have identical hosts
  • AreRelatedHosts: Check if URLs share a common parent domain
  • SortURLs: Sort URLs alphabetically by their string representation

Text Processing

Media Detection

Binary File Fetching

Error Handling

  • FetchError: Structured error type for HTTP fetch failures with status codes

This package is particularly useful when building web crawlers, content extractors, or any application that needs to process URLs and text from web pages.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func AreRelatedHosts

func AreRelatedHosts(url1, url2 *url.URL) bool

AreRelatedHosts checks if two URLs share the same registrable domain (effective TLD + 1). Returns false if either URL is nil or cannot have its registrable domain determined.

This function uses the Public Suffix List to correctly handle multi-part TLDs like "co.uk", "com.au", etc. For example, "example.co.uk" and "other.co.uk" are NOT related because they have different registrable domains.

This function is useful for determining if URLs belong to the same website family, even if they use different subdomains.

Example:

url1, _ := url.Parse("https://www.example.com")
url2, _ := url.Parse("https://api.example.com")
web.AreRelatedHosts(url1, url2) // true (both share "example.com")

url3, _ := url.Parse("https://example.org")
web.AreRelatedHosts(url1, url3) // false (different base domains)

url4, _ := url.Parse("https://foo.example.co.uk")
url5, _ := url.Parse("https://bar.example.co.uk")
web.AreRelatedHosts(url4, url5) // true (both share "example.co.uk")

url6, _ := url.Parse("https://foo.other.co.uk")
web.AreRelatedHosts(url4, url6) // false (different registrable domains)
Example

Example demonstrates checking if URLs share a common domain.

url1, _ := url.Parse("https://www.example.com")
url2, _ := url.Parse("https://api.example.com")
url3, _ := url.Parse("https://other.com")

fmt.Println(AreRelatedHosts(url1, url2))
fmt.Println(AreRelatedHosts(url1, url3))
Output:
true
false

func AreSameHost

func AreSameHost(url1, url2 *url.URL) bool

AreSameHost checks if two URLs have the same host value. Returns false if either URL is nil.

This function performs an exact host comparison, meaning subdomains are considered different hosts. For example, "www.example.com" and "api.example.com" are not the same host. Use AreRelatedHosts if you need to check for shared parent domains.

Example:

url1, _ := url.Parse("https://example.com/page1")
url2, _ := url.Parse("https://example.com/page2")
web.AreSameHost(url1, url2) // true

url3, _ := url.Parse("https://sub.example.com/page")
web.AreSameHost(url1, url3) // false
Example

Example demonstrates comparing URL hosts.

url1, _ := url.Parse("https://example.com/page1")
url2, _ := url.Parse("https://example.com/page2")
url3, _ := url.Parse("https://sub.example.com/page")

fmt.Println(AreSameHost(url1, url2))
fmt.Println(AreSameHost(url1, url3))
Output:
true
false

func EndsWithPunctuation

func EndsWithPunctuation(s string) bool

EndsWithPunctuation checks if a string ends with a common punctuation mark.

The following punctuation characters are recognized: . , : ; ? ! " '

This function correctly handles Unicode strings by checking the last rune rather than the last byte. Returns false for empty strings.

Example:

web.EndsWithPunctuation("Hello.")  // true
web.EndsWithPunctuation("Hello?")  // true
web.EndsWithPunctuation("Hello")   // false
web.EndsWithPunctuation("")        // false
Example

Example demonstrates checking for punctuation at the end of strings.

fmt.Println(EndsWithPunctuation("Hello."))
fmt.Println(EndsWithPunctuation("Hello?"))
fmt.Println(EndsWithPunctuation("Hello"))
fmt.Println(EndsWithPunctuation(""))
Output:
true
true
false
false

func IsMediaExtension

func IsMediaExtension(ext string) bool

IsMediaExtension checks if a file extension is considered a media file extension. The extension should include the leading dot (e.g., ".jpg", ".mp4"). The check is case-insensitive.

Example:

web.IsMediaExtension(".jpg")  // true
web.IsMediaExtension(".JPG")  // true
web.IsMediaExtension(".html") // false
web.IsMediaExtension("jpg")   // false (missing dot)

func IsMediaURL

func IsMediaURL(u *url.URL) bool

IsMediaURL checks if a URL appears to point to a media file based on its file extension.

The function extracts the file extension from the URL's path and performs a case-insensitive lookup against the known media extensions. Returns true if the extension is recognized as a media file type.

This is useful for filtering out media files when crawling web pages or extracting links that point to HTML content.

Example:

url, _ := url.Parse("https://example.com/image.jpg")
web.IsMediaURL(url) // true

url, _ = url.Parse("https://example.com/page.html")
web.IsMediaURL(url) // false

url, _ = url.Parse("https://example.com/VIDEO.MP4")
web.IsMediaURL(url) // true (case-insensitive)
Example

Example demonstrates detecting media files from URLs.

imageURL, _ := url.Parse("https://example.com/photo.jpg")
fmt.Println(IsMediaURL(imageURL))

videoURL, _ := url.Parse("https://example.com/video.mp4")
fmt.Println(IsMediaURL(videoURL))

pageURL, _ := url.Parse("https://example.com/page.html")
fmt.Println(IsMediaURL(pageURL))
Output:
true
true
false

func NormalizeText

func NormalizeText(text string) string

NormalizeText applies transformations to clean up text extracted from web pages.

The following transformations are applied in order:

  • Trim leading and trailing whitespace
  • Unescape HTML entities (e.g., "&amp;" becomes "&", "&lt;" becomes "<")
  • Replace non-printable characters with spaces

Non-printable characters are any Unicode characters that are not printable according to unicode.IsPrint() and are not whitespace. These are replaced with spaces rather than removed to preserve word boundaries.

Returns the original text unchanged if it's empty after trimming.

Example:

text := web.NormalizeText("  Hello &amp; goodbye  ")
fmt.Println(text) // "Hello & goodbye"

text = web.NormalizeText("&lt;div&gt;content&lt;/div&gt;")
fmt.Println(text) // "<div>content</div>"
Example

Example demonstrates text normalization for web content.

// Trim whitespace
fmt.Println(NormalizeText("  Hello  "))

// Unescape HTML entities
fmt.Println(NormalizeText("Hello &amp; goodbye"))

// Convert HTML tags (entities)
fmt.Println(NormalizeText("&lt;div&gt;"))

// Remove non-printable characters
fmt.Println(NormalizeText("Hello\x00World"))
Output:
Hello
Hello & goodbye
<div>
Hello World

func NormalizeURL

func NormalizeURL(value string) (*url.URL, error)

NormalizeURL parses a URL string and returns a normalized URL.

The following transformations are applied:

  • Trim whitespace from the input
  • Add https:// prefix if the URL has no scheme
  • Convert http:// to https://
  • Remove query parameters and URL fragments
  • Remove trailing "/" if the path is only "/"

This function returns an error if the input is empty, has an invalid scheme (anything other than http/https), or cannot be parsed as a valid URL.

Example:

url, _ := web.NormalizeURL("example.com/path?q=1#frag")
fmt.Println(url.String()) // "https://example.com/path"

url, _ = web.NormalizeURL("http://example.com")
fmt.Println(url.String()) // "https://example.com"
Example

Example demonstrates basic URL normalization.

// Normalize a URL with query parameters and fragment
url, _ := NormalizeURL("example.com/path?query=1#fragment")
fmt.Println(url.String())

// Convert http to https
url, _ = NormalizeURL("http://example.com")
fmt.Println(url.String())

// Add https prefix when missing
url, _ = NormalizeURL("example.com")
fmt.Println(url.String())
Output:
https://example.com/path
https://example.com
https://example.com
func ResolveLink(domain, value string) (string, bool)

ResolveLink resolves a relative or absolute URL against a base domain and returns the normalized result.

For absolute URLs, this function validates the scheme (only http/https are accepted) and normalizes the URL. For relative URLs, it resolves them against the provided domain. URL fragments are always removed.

Returns the resolved URL string and true if successful, or an empty string and false if the URL is invalid (e.g., unsupported scheme, parse error).

The domain parameter can be specified with or without a scheme. If no scheme is provided, https:// is assumed.

Example:

// Resolve relative URL
resolved, ok := web.ResolveLink("example.com", "/about")
// resolved: "https://example.com/about", ok: true

// Validate absolute URL
resolved, ok = web.ResolveLink("example.com", "https://other.com/page")
// resolved: "https://other.com/page", ok: true

// Reject non-http schemes
resolved, ok = web.ResolveLink("example.com", "ftp://files.com")
// resolved: "", ok: false

func SortURLs

func SortURLs(urls []*url.URL)

SortURLs sorts a slice of URLs alphabetically by their string representation. The slice is sorted in place. Nil entries are sorted to the end of the slice.

Example:

urls := []*url.URL{
    mustParse("https://z.com"),
    mustParse("https://a.com"),
    mustParse("https://m.com"),
}
web.SortURLs(urls)
// urls is now ordered: a.com, m.com, z.com
Example

Example demonstrates sorting URLs alphabetically.

urls := []*url.URL{
	mustParse("https://z.com/page"),
	mustParse("https://a.com/page"),
	mustParse("https://m.com/page"),
}

SortURLs(urls)

for _, u := range urls {
	fmt.Println(u.String())
}
Output:
https://a.com/page
https://m.com/page
https://z.com/page

Types

type BinaryFetchInput added in v0.0.5

type BinaryFetchInput struct {
	// URL is the address to fetch the binary file from.
	URL string `json:"url"`

	// Headers contains additional HTTP headers to include in the request.
	Headers map[string]string `json:"headers,omitempty"`

	// OutputPath is the destination file path or directory. If it's a directory,
	// the filename is derived from the URL or Content-Disposition header.
	// If empty, the file content is returned in BinaryFetchResult.Data.
	OutputPath string `json:"output_path,omitempty"`

	// CreateDirs creates parent directories if they don't exist.
	CreateDirs bool `json:"create_dirs,omitempty"`

	// MaxSizeBytes limits the maximum file size to download. A value of 0
	// means no limit.
	MaxSizeBytes int64 `json:"max_size_bytes,omitempty"`

	// ExpectedType is the expected MIME type (e.g., "application/pdf", "image/jpeg").
	// Only the media type is compared; parameters like charset are ignored.
	ExpectedType string `json:"expected_type,omitempty"`

	// VerifyMimeType enables MIME type verification against ExpectedType.
	VerifyMimeType bool `json:"verify_mime_type,omitempty"`
}

BinaryFetchInput contains parameters for fetching binary files.

type BinaryFetchResult added in v0.0.5

type BinaryFetchResult struct {
	// Filename is the name of the downloaded file.
	Filename string

	// Size is the number of bytes downloaded.
	Size int64

	// ContentType is the MIME type reported by the server.
	ContentType string

	// DownloadPath is the file path where content was saved (only set if
	// OutputPath was specified in the input).
	DownloadPath string

	// Data contains the file content (only populated if OutputPath was not
	// specified in the input).
	Data []byte
}

BinaryFetchResult contains the result of a binary file fetch operation.

type BinaryFetcher added in v0.0.5

type BinaryFetcher interface {
	// FetchBinary downloads a binary file from the specified URL.
	// Returns the result containing either the file data or the path where
	// it was saved.
	FetchBinary(ctx context.Context, input *BinaryFetchInput) (*BinaryFetchResult, error)
}

BinaryFetcher defines the interface for fetching binary files from URLs.

Implementations should handle HTTP redirects, respect size limits, and sanitize filenames to prevent path traversal attacks.

type DefaultBinaryFetcher added in v0.0.5

type DefaultBinaryFetcher struct {
	// Client is the HTTP client used for requests. If nil, a default client
	// with a 30-second timeout is used.
	Client *http.Client
}

DefaultBinaryFetcher provides a standard implementation of BinaryFetcher with sensible defaults for production use.

func NewDefaultBinaryFetcher added in v0.0.5

func NewDefaultBinaryFetcher() *DefaultBinaryFetcher

NewDefaultBinaryFetcher creates a new binary fetcher with a default HTTP client configured with a 30-second timeout.

func (*DefaultBinaryFetcher) FetchBinary added in v0.0.5

FetchBinary downloads a binary file from the specified URL.

The function performs filename sanitization to prevent path traversal attacks when saving to disk. Filenames from Content-Disposition headers or URLs are cleaned to remove path separators and parent directory references.

type FetchError added in v0.0.5

type FetchError struct {
	// StatusCode is the HTTP status code returned by the server.
	StatusCode int

	// Err is the underlying error describing the failure.
	Err error
}

FetchError represents an HTTP fetch failure with status code information.

This error type wraps the underlying error while preserving the HTTP status code, enabling callers to make decisions based on the type of failure. It implements the standard error interface and supports error unwrapping via errors.Unwrap.

Example:

resp, err := http.Get(url)
if err != nil {
    return nil, err
}
if resp.StatusCode >= 400 {
    return nil, web.NewFetchError(resp.StatusCode, fmt.Errorf("request failed"))
}

func NewFetchError added in v0.0.5

func NewFetchError(statusCode int, err error) *FetchError

NewFetchError creates a new FetchError with the given HTTP status code and underlying error.

func (*FetchError) Error added in v0.0.5

func (e *FetchError) Error() string

Error returns a string representation of the fetch error, including the status code and underlying error message.

func (*FetchError) IsRecoverable added in v0.0.5

func (e *FetchError) IsRecoverable() bool

IsRecoverable returns true if the error represents a temporary failure that might succeed on retry.

The following status codes are considered recoverable:

  • 429 Too Many Requests: Rate limiting, retry after backoff
  • 500 Internal Server Error: Transient server issue
  • 502 Bad Gateway: Upstream server issue
  • 503 Service Unavailable: Temporary overload or maintenance
  • 504 Gateway Timeout: Upstream timeout

Client errors (4xx except 429) and permanent server errors are not considered recoverable.

func (*FetchError) Unwrap added in v0.0.5

func (e *FetchError) Unwrap() error

Unwrap returns the underlying error, enabling use with errors.Is and errors.As.

type SearchInput added in v0.0.5

type SearchInput struct {
	// Query is the search query string.
	Query string `json:"query"`

	// Limit is the maximum number of results to return.
	// A value of 0 uses the implementation's default limit.
	Limit int `json:"limit,omitempty"`
}

SearchInput contains parameters for a web search query.

type SearchItem added in v0.0.5

type SearchItem struct {
	// URL is the web address of the search result.
	URL string `json:"url"`

	// Title is the page title or headline.
	Title string `json:"title"`

	// Description is a summary or snippet from the page content.
	Description string `json:"description,omitempty"`

	// Icon is the URL of the site's favicon or icon.
	Icon string `json:"icon,omitempty"`

	// Image is the URL of a preview image, if available.
	Image string `json:"image,omitempty"`
}

SearchItem represents a single search result.

type SearchOutput added in v0.0.5

type SearchOutput struct {
	// Items contains the search results, ordered by relevance.
	Items []*SearchItem `json:"items"`
}

SearchOutput contains the results of a web search.

type Searcher added in v0.0.5

type Searcher interface {
	// Search performs a web search and returns matching results.
	// Returns an error if the search fails or the context is canceled.
	Search(ctx context.Context, input *SearchInput) (*SearchOutput, error)
}

Searcher defines the interface for web search implementations.

Implementations might include search engine APIs (Google, Bing, DuckDuckGo), site-specific search, or custom search indexes.

Example implementation:

type GoogleSearcher struct {
    APIKey string
    CX     string
}

func (s *GoogleSearcher) Search(ctx context.Context, input *web.SearchInput) (*web.SearchOutput, error) {
    // Call Google Custom Search API
    // ...
    return &web.SearchOutput{Items: items}, nil
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL