Documentation ¶
Index ¶
Constants ¶
View Source
const ( FolxUserAgent = "github.com/thecolngroup/folx" TextHTMLMimeType = "text/html; charset=utf-8" TextPlainMimeType = "text/plain; charset=utf-8" )
Variables ¶
This section is empty.
Functions ¶
func GetArticle ¶
func GetWebPageArticle ¶
Types ¶
type ArticleExtractor ¶
type ArticleExtractor interface { Start(string) error Wait() AttachOnError(ErrorCallback) AttachOnNewArticle(NewArticleCallback) }
func NewFileArticleExtractor ¶
func NewFileArticleExtractor() ArticleExtractor
func NewWebArticleExtractor ¶
func NewWebArticleExtractor(opts WebArticleExtractorOptions) ArticleExtractor
type ErrorCallback ¶
type ErrorCallback func(error)
type NewArticleCallback ¶
type WebArticleExtractor ¶
type WebArticleExtractor struct {
// contains filtered or unexported fields
}
func (*WebArticleExtractor) AttachOnError ¶
func (t *WebArticleExtractor) AttachOnError(callback ErrorCallback)
func (*WebArticleExtractor) AttachOnNewArticle ¶
func (t *WebArticleExtractor) AttachOnNewArticle(callback NewArticleCallback)
func (*WebArticleExtractor) Start ¶
func (t *WebArticleExtractor) Start(sourceURL string) error
func (*WebArticleExtractor) Wait ¶
func (t *WebArticleExtractor) Wait()
type WebArticleExtractorOptions ¶
type WebArticleExtractorOptions struct { UserAgent string // empty = random browser UA CacheDir string // empty = in-memory cache AllowAnyDomain bool // false = only the domain in the starting URL is allowed for crawling NoCrawl bool // false = crawl links found on pages AggressiveCrawl bool // false = throttled async (max 4 parallel goroutines) IgnoreRobotsTxt bool // false = good net citizen MaxDepth int // 0 = infinite recursion ArticleCharMinLen int // 0 = no min len }
Click to show internal directories.
Click to hide internal directories.