kraaler

package module
v0.0.0-...-f92c932 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 10, 2019 License: GPL-3.0 Imports: 33 Imported by: 0

README

Kraaler

Build Status

This is a Go implementation of the design covered in "Kraaler: A User Perspective Web Crawler" and presented at TMA 2019.

Building

Kraaler requires CGO_ENABLED=1 (C-support in Go), due to the use of sqlite. In order to compile the binary, a set of C libraries is needed. The official Golang Docker images come pre-bundled with these C dependencies, making them a convenient tool for compilation.

docker run \
	--rm \
	-v $(pwd):/go/src/github.com/aau-network-security/kraaler \
	-w /go/src/github.com/aau-network-security/kraaler/app/ \
	-e GO111MODULE=on \
	-e GOOS=linux \
	-e GOARCH=amd64 \
	-e CGO_ENABLED=1 \
	-e HOST_UID=`id -u` \
	golang:1.12.6 \
	bash build.sh

Remember to set GOOS and GOARCH according to your platform.

Running

$ krl run -n 3 \ # number of workers
  --provider-file urls.txt \ # provider for urls
  --sampler 'uni' \ # sampler for prioritization of urls
  --filter-resp-bodies-ct '^text/' # only text bodies

Contributors

Documentation

Index

Constants

View Source
const (
	CHROME_REQ_WILL_BE_SENT = "Network.requestWillBeSent"
	CHROME_RESP_RECEIVED    = "Network.responseReceived"
	CHROME_LOADING_FAILED   = "Network.loadingFailed"
	CHROME_LOADING_FINISHED = "Network.loadingFinished"
	CUSTOM_GOT_BODY         = "Custom.body"
)

Variables

View Source
var (
	ErrFuncTimeout = errors.New("timeout")
	ErrNameServer  = errors.New("unable to get name servers")
	ErrDockerConn  = errors.New("docker connection not responding")
	ErrTimeoutDOM  = errors.New("timeout loading document object model")
)
View Source
var DefaultResolution = &Resolution{
	Width:  1366,
	Height: 768,
}

Functions

func GetAvailablePort

func GetAvailablePort() uint

func LinksFromBodies

func LinksFromBodies(host *url.URL, bodies ...*ResponseBody) []*url.URL

func NewWorker

func NewWorker(conf WorkerConfig) (*worker, error)

func PullImage

func PullImage(c *docker.Client, img string) error

func ReadDomainsFromFile

func ReadDomainsFromFile(path string) (<-chan Domain, error)

func RetrieveLinks

func RetrieveLinks(host *url.URL, body []byte) ([]*url.URL, error)

func ScanForServers

func ScanForServers(ctx context.Context, domains <-chan Domain) <-chan *url.URL

func WaitForEndpoint

func WaitForEndpoint(ctx context.Context, endpoint string) error

Types

type BrowserEvents

type BrowserEvents struct {
	// contains filtered or unexported fields
}

type BrowserRequest

type BrowserRequest struct {
	URL      string
	Method   string
	Headers  map[string]string
	PostData *string
}

type BrowserResponse

type BrowserResponse struct {
	StatusCode         int
	Headers            map[string]string
	MimeType           string
	Body               []byte
	BodyChecksumSha256 string
}

func (*BrowserResponse) Read

func (br *BrowserResponse) Read(resp *network.Response) error

type BrowserScreenshot

type BrowserScreenshot struct {
	Screenshot []byte
	Resolution Resolution
	Kind       string
	Taken      time.Time
}

type BrowserTimes

type BrowserTimes struct {
	StartTime        float64
	EndTime          float64
	ConnectStartTime *float64
	ConnectEndTime   *float64
	SendStartTime    *float64
	SendEndTime      *float64
}

func (*BrowserTimes) Align

func (bt *BrowserTimes) Align()

type CTXLOGGER

type CTXLOGGER struct{}

type CallFrame

type CallFrame struct {
	Column     int
	LineNumber int
	Url        string
	Function   *string
}

type Causer

type Causer interface {
	Cause() error
}

type ChromeEventParam

type ChromeEventParam struct {
	// contains filtered or unexported fields
}

type CrawlAction

type CrawlAction struct {
	Parent    *CrawlAction
	Initiator Initiator

	Host     Host
	Request  network.Request
	Response *network.Response
	Error    *string
	Body     *ResponseBody

	Timings BrowserTimes
}

func ActionsFromEvents

func ActionsFromEvents(events *BrowserEvents) []*CrawlAction

func (*CrawlAction) Finished

func (ca *CrawlAction) Finished() bool

type CrawlRequest

type CrawlRequest struct {
	Url         *url.URL
	Screenshots []time.Duration
}

type CrawlResponse

type CrawlResponse struct {
	Primary     *CrawlAction
	Secondaries []CrawlAction
	Error       error
}

type Domain

type Domain string

func (Domain) HTTP

func (d Domain) HTTP() string

func (Domain) HTTPS

func (d Domain) HTTPS() string

type DomainFileProvider

type DomainFileProvider struct {
	// contains filtered or unexported fields
}

func NewDomainFileProvider

func NewDomainFileProvider(path string, conf *DomainFileProviderConfig) (*DomainFileProvider, error)

func (*DomainFileProvider) Close

func (dfp *DomainFileProvider) Close()

func (*DomainFileProvider) UrlsC

func (dfp *DomainFileProvider) UrlsC() <-chan *url.URL

type DomainFileProviderConfig

type DomainFileProviderConfig struct {
	Logger  *zap.Logger
	Timeout time.Duration
	Targets map[int]func(string) string
}

type Host

type Host struct {
	Domain      Domain
	IPAddr      string
	NameServers []string
}

func GetHostInfo

func GetHostInfo(domain Domain) (Host, error)

type Initiator

type Initiator struct {
	Kind  string
	Stack *CallFrame
}

type JavaScriptConsole

type JavaScriptConsole struct {
	Msg      string
	Line     int
	Column   int
	Function string
	URL      string
}

type NoParamErr

type NoParamErr struct {
	// contains filtered or unexported fields
}

func (*NoParamErr) Error

func (npe *NoParamErr) Error() string

type NotOfTypeErr

type NotOfTypeErr struct {
	// contains filtered or unexported fields
}

func (*NotOfTypeErr) Error

func (note *NotOfTypeErr) Error() string

type Page

type Page struct {
	InitialURL   *url.URL
	Actions      []*CrawlAction
	Resolution   string
	Console      []*JavaScriptConsole
	Screenshots  []*BrowserScreenshot
	Error        error
	DocumentURLs []*url.URL

	InitiatedTime  time.Time
	NavigateTime   time.Time
	LoadedTime     time.Time
	TerminatedTime time.Time
}

type PageHandleFunc

type PageHandleFunc func(Page)

type PageMiddleware

type PageMiddleware func(PageHandleFunc) PageHandleFunc

type PageStore

type PageStore interface {
	SaveSession(Page) error
}

type PhishTankProvider

type PhishTankProvider struct {
	// contains filtered or unexported fields
}

func NewPhishTankProvider

func NewPhishTankProvider() *PhishTankProvider

func NewPhishTankProviderWithConfig

func NewPhishTankProviderWithConfig(conf PhishTankProviderConfig) *PhishTankProvider

func (*PhishTankProvider) Close

func (ptr *PhishTankProvider) Close()

func (*PhishTankProvider) UrlsC

func (ptr *PhishTankProvider) UrlsC() <-chan *url.URL

type PhishTankProviderConfig

type PhishTankProviderConfig struct {
	Endpoint     string
	APIKey       string
	TickDuration time.Duration
}

type Resolution

type Resolution struct {
	Width  int
	Height int
}

func (Resolution) String

func (r Resolution) String() string

type ResponseBody

type ResponseBody struct {
	RequestID      network.RequestID
	Body           []byte
	Links          []*url.URL
	ChecksumSha256 string
}

type URLChanProvider

type URLChanProvider struct {
	C <-chan *url.URL
}

func (URLChanProvider) UrlsC

func (ucp URLChanProvider) UrlsC() <-chan *url.URL

type URLHandleFunc

type URLHandleFunc func(*url.URL)

func SkipURLsMiddleware

func SkipURLsMiddleware(URLHandleFunc) URLHandleFunc

type URLMiddleware

type URLMiddleware func(URLHandleFunc) URLHandleFunc

type URLProvider

type URLProvider interface {
	UrlsC() <-chan *url.URL
}

type URLStore

type URLStore interface {
	Sample() (*url.URL, error)
	Add(urls ...*url.URL) (int, error)
	Visit(u *url.URL, t time.Time) error
	Size() int
}

type Worker

type Worker interface {
	io.Closer
	Run(queue <-chan CrawlRequest, results chan<- Page) error
}

type WorkerConfig

type WorkerConfig struct {
	DockerClient *docker.Client
	UseInstance  string
	Resolution   *Resolution
	LoadTimeout  *time.Duration
	Logger       *zap.Logger
}

type WorkerController

type WorkerController struct {
	// contains filtered or unexported fields
}

func NewWorkerController

func NewWorkerController(ctx context.Context, conf WorkerControllerConfig) (*WorkerController, error)

func (*WorkerController) AddWorker

func (wc *WorkerController) AddWorker() error

func (*WorkerController) Close

func (wc *WorkerController) Close() error

type WorkerControllerConfig

type WorkerControllerConfig struct {
	URLStore       URLStore
	PageStore      PageStore
	Logger         *zap.Logger
	WorkerProducer func() (Worker, error)
	PageMiddleware []PageMiddleware
	URLMiddleware  []URLMiddleware
}

Directories

Path Synopsis
app
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL