hansard

package
v0.0.0-...-9034d1e Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 14, 2020 License: AGPL-3.0 Imports: 11 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// MaxLineProcessed is a fixed limit of 10 lines.
	// NOTE(review): presumably caps how many lines are examined per page or
	// document while parsing; its use is not visible here — confirm.
	MaxLineProcessed = 10
)

Variables

This section is empty.

Functions

func GetAbsoluteDataDir

func GetAbsoluteDataDir(workingDir, dataDir string) string

func GetAbsoluteSplitOutDir

func GetAbsoluteSplitOutDir(workingDir, splitOutDir string) string

func LoadAndSplit

func LoadAndSplit(conf Configuration) error

func NewHansardDocumentContent

func NewHansardDocumentContent(pdfDoc *PDFDocument, hansardDoc *HansardDocument) error

func NewHansardQuestions

func NewHansardQuestions(pdfDoc *PDFDocument, hansardQuestions *[]HansardQuestion) error

func NewSplitHansardDocumentPlanContent

func NewSplitHansardDocumentPlanContent(pdfDoc *PDFDocument, splitPlan *SplitHansardDocumentPlan) error

Types

type Configuration

// Configuration bundles the inputs for a load-and-split run: the session
// label, the working and data directories, the source PDF path, and the
// extraction options. It is the argument type accepted by LoadAndSplit.
type Configuration struct {
	// DUNSession is the DUN session label.
	DUNSession string

	// WorkingDir is the base directory; the ./raw and ./data folders are
	// assumed to be relative to it.
	WorkingDir string

	// DataDir is the data directory name.
	// NOTE(review): whether this may be absolute as well as relative is an
	// open question in the original comment — confirm.
	DataDir string

	// SourcePDFPath is the location of the source PDF; it can be anywhere.
	// TODO: consider accepting a Reader so the PDF can be read directly from S3.
	SourcePDFPath string

	// Options holds the PDF extraction options.
	// NOTE(review): behavior when Options is nil is not visible here — confirm.
	Options *ExtractPDFOptions
}

Configuration holds the settings for a Context, supplied from the outside in.

type ErrorQuestionsHasInvalid

// ErrorQuestionsHasInvalid is an error type; *ErrorQuestionsHasInvalid
// satisfies the error interface through its Error method.
// NOTE(review): judging by the name, it reports that a set of questions
// contains invalid entries — confirm where it is returned.
type ErrorQuestionsHasInvalid struct {
	// contains filtered or unexported fields
}

func (*ErrorQuestionsHasInvalid) Error

func (e *ErrorQuestionsHasInvalid) Error() string

type ExtractPDFOptions

// ExtractPDFOptions carries page-selection options for PDF extraction. It is
// passed (by pointer) to NewPDFDocument and NewSplitHansardDocumentPlan.
type ExtractPDFOptions struct {
	// StartPage is presumably the first page to extract.
	// NOTE(review): 0-based vs 1-based indexing is not visible here — confirm.
	StartPage int
	// NumPages is presumably the number of pages to extract from StartPage.
	// NOTE(review): meaning of a zero value is not visible here — confirm.
	NumPages  int
}

type HansardDocument

// HansardDocument describes one Hansard document: the assembly session it
// belongs to, its HansardType, and the questions found in it. Instances are
// created by NewHansardDocument from a session name and a PDF path.
type HansardDocument struct {
	// StateAssemblySession names the state assembly session.
	// NOTE(review): presumably set from NewHansardDocument's sessionName — confirm.
	StateAssemblySession string
	// HansardType classifies the document (see the HansardType constants).
	HansardType          HansardType
	// HansardQuestions lists the questions contained in the document.
	HansardQuestions     []HansardQuestion
}

func NewHansardDocument

func NewHansardDocument(sessionName string, pdfPath string) (*HansardDocument, error)

type HansardQuestion

// HansardQuestion records a single question and the page span it occupies.
type HansardQuestion struct {
	// QuestionNum is the question's number, kept as a string.
	QuestionNum  string
	// PageNumStart is the page on which the question starts.
	// NOTE(review): page-numbering base is not visible here — confirm.
	PageNumStart int
	// PageNumEnd is the page on which the question ends.
	// NOTE(review): whether the end page is inclusive is not visible here — confirm.
	PageNumEnd   int
}

type HansardType

// HansardType enumerates the kinds of Hansard document.
type HansardType int
const (
	// HANSARD_INVALID is the zero value of HansardType (iota starts at 0), so
	// an uninitialized HansardType reads as invalid.
	HANSARD_INVALID HansardType = iota
	// HANSARD_SPOKEN has value 1; presumably spoken (oral) questions — confirm.
	HANSARD_SPOKEN
	// HANSARD_WRITTEN has value 2; presumably written questions — confirm.
	HANSARD_WRITTEN
	// HANSARD_DEBATE has value 3; presumably debate transcripts — confirm.
	HANSARD_DEBATE
)

type PDFDocument

// PDFDocument holds the extracted content of a PDF: a page count, the
// per-page data, and the path it came from. Instances are created by
// NewPDFDocument from a PDF path and ExtractPDFOptions.
type PDFDocument struct {
	// NumPages is a page count.
	// NOTE(review): unclear whether this is the total in the source PDF or
	// only the number extracted into Pages — confirm.
	NumPages   int
	// Pages holds the per-page extracted content.
	Pages      []PDFPage
	// SourcePath is the path of the source PDF.
	// NOTE(review): presumably the pdfPath given to NewPDFDocument — confirm.
	SourcePath string
}

func NewPDFDocument

func NewPDFDocument(pdfPath string, options *ExtractPDFOptions) (*PDFDocument, error)

type PDFPage

// PDFPage holds the text extracted from one PDF page in three forms: plain
// text, content grouped by line, and content grouped by style.
type PDFPage struct {
	// PageNo is the page's number.
	// NOTE(review): 0-based vs 1-based numbering is not visible here — confirm.
	PageNo           int
	// PDFPlainText is the page's plain-text content.
	PDFPlainText     string
	PDFTxtSameLines  []string // combined content with same line .. proxy for changes
	PDFTxtSameStyles []string // combined content with same style .. proxy for changes
}

type SplitHansardDocumentPlan

// SplitHansardDocumentPlan pairs a HansardDocument with the directory in
// which its split plan lives. Its methods load the plan (LoadPlan), save it
// (SavePlan), and carry out the split (ExecuteSplit).
type SplitHansardDocumentPlan struct {
	// PlanDir is the directory associated with the plan.
	// NOTE(review): presumably where the plan file is saved and loaded — confirm.
	PlanDir         string
	// HansardDocument is the document the plan applies to.
	HansardDocument HansardDocument
	// contains filtered or unexported fields
}

func NewEmptySplitHansardDocumentPlan

func NewEmptySplitHansardDocumentPlan(absoluteDataDir, absolutePlanFile, sessionName string) *SplitHansardDocumentPlan

func NewSplitHansardDocumentPlan

func NewSplitHansardDocumentPlan(sourcePDFPath, workingDir, dataDir, dunSession string, options *ExtractPDFOptions) *SplitHansardDocumentPlan

func (*SplitHansardDocumentPlan) ExecuteSplit

func (s *SplitHansardDocumentPlan) ExecuteSplit(absoluteSrcPDF, absoluteSplitOutput string) error

func (*SplitHansardDocumentPlan) LoadPlan

func (s *SplitHansardDocumentPlan) LoadPlan() error

func (*SplitHansardDocumentPlan) SavePlan

func (s *SplitHansardDocumentPlan) SavePlan() error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL