Documentation
¶
Index ¶
- func DownloadFile(uri string) (string, string, string, error)
- func DownloadHtml(uri string) (string, string, error)
- func FlattenHtmlDocument(body *html.Node) (doc *html.Node, article *html.Node)
- func GetInnerText(n *html.Node) string
- func NewHtmlDocument(localpath string) (*html.Node, error)
- func WriteHtmlFile2(doc *html.Node) (string, error)
- type Boilerpiper
- type Curl
- type Curler
- type HtmlCleaner
- type Readabilitier
- type SummaryScore
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DownloadFile ¶
Downloads to a tmp path; the file is already ungzipped. Returns filepath, media-type, charset, error.
func DownloadHtml ¶
The HTML has been converted to UTF-8. Returns local_filepath, media_type, error.
func FlattenHtmlDocument ¶
func GetInnerText ¶
func NewHtmlDocument ¶
Reads a UTF-8 HTML file.
Types ¶
type Boilerpiper ¶
type Boilerpiper struct {
// contains filtered or unexported fields
}
func NewBoilerpiper ¶
func NewBoilerpiper(article *html.Node) *Boilerpiper
func (*Boilerpiper) NumberWordsRulesFilter ¶
func (this *Boilerpiper) NumberWordsRulesFilter()
type Curl ¶
type Curl interface {
	Download(uri string) (filepath, media_type, charset string, err error)
	DownloadHtml(uri string) (filepath, media_type string, err error)
}
func DefaultCurl ¶
func DefaultCurl() Curl
type HtmlCleaner ¶
type HtmlCleaner struct {
	Article *html.Node // body or article or a table's body
	// contains filtered or unexported fields
}
func NewHtmlCleaner ¶
func NewHtmlCleaner(u string) *HtmlCleaner
func (*HtmlCleaner) CleanForm ¶
func (this *HtmlCleaner) CleanForm()
func (*HtmlCleaner) CleanHtml ¶
func (cleaner *HtmlCleaner) CleanHtml(root *html.Node)
CleanHtml strips all link/style/css elements, removes /html/head, converts all tags to lowercase, locates the body/article node, and finds the h1 or h2 nodes, setting body according to their count.
func (*HtmlCleaner) String ¶
func (this *HtmlCleaner) String() string
type Readabilitier ¶
type Readabilitier struct {
// contains filtered or unexported fields
}
func NewReadabilitier ¶
func NewReadabilitier(body *html.Node) *Readabilitier
func (*Readabilitier) CreateArticle ¶
func (this *Readabilitier) CreateArticle() (*html.Node, *html.Node)
func (*Readabilitier) String ¶
func (this *Readabilitier) String() string
type SummaryScore ¶
type SummaryScore struct {
	WordCount int `json:"word_count" bson:"word_count"`
	// ImageCount int `json:"image_count" bson:"image_count"`
	LinkCount int      `json:"link_count" bson:"link_count"`
	Images    []string `json:"image,omitempty" bson:"image,omitempty"`
}
func CleanFragment ¶
func CleanFragment(cont, uri string) (string, *SummaryScore)
return local_filepath, words, images
func ExtractHtml ¶
func ExtractHtml(url string) (string, *SummaryScore, error)
Produces a cleaned, UTF-8 encoded HTML document. Returns filepath, *SummaryScore, error.
func NewSummaryScore ¶
func NewSummaryScore(n *html.Node) *SummaryScore
Click to show internal directories.
Click to hide internal directories.