Documentation ¶
Index ¶
- Constants
- Variables
- func ChangePathDir(inDir, inPath, outDir string) (string, error)
- func ChangePathDirExt(inDir, inPath, outDir, outExt string) (string, error)
- func ChangePathExt(inPath, outExt string) string
- func CleanCorpus(corpus []string) []string
- func CreateBleveIndex(indexPath string, forceCreate, allowAppend bool) (bleve.Index, error)
- func CreateBleveMemIndex() (bleve.Index, error)
- func Describe(pdfReader *pdf.PdfReader) (numPages int, width, height float64, err error)
- func DocPageSize(pageSizes [][2]float64) (w, h float64)
- func Exists(filename string) bool
- func ExpandUser(filename string) string
- func ExportBleveMem(index bleve.Index) ([]byte, error)
- func ExtractPageText(page *pdf.PdfPage) (string, error)
- func ExtractPageTextLocation(page *pdf.PdfPage) (string, []extractor.TextLocation, error)
- func ExtractPageTextObject(page *pdf.PdfPage) (*extractor.PageText, error)
- func FileHash(filename string) (string, error)
- func FileSize(filename string) (int64, error)
- func GetPosition(positions []serial.TextLocation, start, end uint32) serial.TextLocation
- func ImportBleveMem(data []byte) (bleve.Index, error)
- func IntRange(i0, i1 int) []int
- func IntSetIntersection(a, b map[int]bool) map[int]bool
- func IntSetToSlice(set map[int]bool) []int
- func IntSetUnion(a, b map[int]bool) map[int]bool
- func IntSliceDifference(a, b []int) []int
- func IntSliceIntersection(a, b []int) []int
- func IntSliceSymmetricDifference(a, b []int) []int
- func IntSliceToSet(arr []int) map[int]bool
- func IntSliceUnion(a, b []int) []int
- func MMToPoint(x float64) float64
- func MakeUsage(msg string)
- func MinMaxIntSlice(arr []int) (min, max int, valid bool)
- func MkDir(dir string) error
- func MkParentDir(filename string) error
- func PageSizeMm(page *pdf.PdfPage) (width, height float64, err error)
- func PageSizePt(page *pdf.PdfPage) (width, height float64, err error)
- func PatternsToPaths(patternList []string, sortSize bool) ([]string, error)
- func PdfOpenDescribe(inPath string) (numPages int, width, height float64, err error)
- func PdfOpenFile(inPath string, lazy bool) (*pdf.PdfReader, error)
- func PdfOpenReader(f io.ReadSeeker, lazy bool) (*pdf.PdfReader, error)
- func PointToMM(x float64) float64
- func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *pdf.PdfPage) error) error
- func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker, ...) error
- func ReaderSizeHash(rs io.ReadSeeker) (int64, string, error)
- func RegularFile(filename string) (bool, error)
- func RemoveDirectory(dir string) error
- func Reverse(arr []string) []string
- func SetLogging()
- func SortFileSize(pathList []string, minSize, maxSize int64) ([]string, error)
- func StringUniques(arr []string) []string
- func TestRoundtripMem(index bleve.Index) bleve.Index
- func ToSerialTextLocation(loc extractor.TextLocation) serial.TextLocation
- func WriteJsonSlice(filename string, vals []string) error
- type DocPageText
- type DocPositions
- func (lDoc *DocPositions) AddDocPage(pageNum uint32, dpl serial.DocPageLocations, text string) (uint32, error)
- func (lDoc *DocPositions) Close() error
- func (lDoc *DocPositions) GetTextPath(pageIdx uint32) string
- func (d DocPositions) Len() int
- func (lDoc *DocPositions) ReadPagePositions(pageIdx uint32) (uint32, serial.DocPageLocations, error)
- func (lDoc *DocPositions) ReadPageText(pageIdx uint32) (string, error)
- func (lDoc *DocPositions) Save() error
- func (d DocPositions) String() string
- type Extract
- type ExtractList
- type FileDesc
- type FileFinder
- type IDText
- type PdfMatch
- type PdfMatchSet
- type PositionsState
- func FromHIPDs(hipds []serial.HashIndexPathDoc) PositionsState
- func IndexPdfFiles(pathList []string, persistDir string, forceCreate, allowAppend bool, ...) (*PositionsState, bleve.Index, int, error)
- func IndexPdfReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, ...) (*PositionsState, bleve.Index, int, error)
- func OpenPositionsState(root string, forceCreate bool) (*PositionsState, error)
- func (l PositionsState) Check()
- func (lState *PositionsState) CreatePositionsDoc(fd FileDesc) (*DocPositions, error)
- func (lState *PositionsState) ExtractDocPagePositions(inPath string) ([]DocPageText, error)
- func (lState *PositionsState) ExtractDocPagePositionsReader(inPath string, rs io.ReadSeeker) ([]DocPageText, error)
- func (lState *PositionsState) Flush() error
- func (lState *PositionsState) GetHashPath(docIdx uint64) (hash, inPath string)
- func (l PositionsState) Len() int
- func (lState *PositionsState) OpenPositionsDoc(docIdx uint64) (*DocPositions, error)
- func (lState *PositionsState) ReadDocPagePositions(docIdx uint64, pageIdx uint32) (string, uint32, serial.DocPageLocations, error)
- func (lState *PositionsState) ReadDocPageText(docIdx uint64, pageIdx uint32) (string, error)
- func (l PositionsState) String() string
- func (l PositionsState) ToHIPDs() []serial.HashIndexPathDoc
Constants ¶
const ( Inch2MM = 25.4 Inch2Point = 72.0 MM2Point = Inch2Point / Inch2MM Point2MM = Inch2MM / Inch2Point )
const BorderWidth = 3.0 // !@#$ For testing.
const ShadowWidth = BorderWidth + 0.5 // !@#$ For testing.
Variables ¶
var ( Debug bool Trace bool // ExposeErrors can be set to true to not recover from errors in library functions. ExposeErrors bool )
var ErrNoMatch = errors.New("no match for hit")
var ErrRange = errors.New("out of range")
var FileHashSize = 10
Functions ¶
func ChangePathDir ¶
ChangePathDir returns `inPath` with its ancestor directory `inDir` replaced with `outDir`.
func ChangePathDirExt ¶
ChangePathDirExt returns `inPath` with its ancestor directory `inDir` replaced with `outDir` and its extension replaced with `outExt`.
func ChangePathExt ¶
ChangePathExt returns `inPath` with its extension replaced with `outExt`.
func CleanCorpus ¶
CleanCorpus returns `corpus` with known bad files removed.
func CreateBleveIndex ¶
CreateBleveIndex creates a new persistent Bleve index at `indexPath`. If `forceCreate` is true then an existing index will be deleted. If `allowAppend` is true then an existing index will be appended to. TODO: Remove `allowAppend` argument. Instead always append to an existing index if
`forceCreate` is false.
func CreateBleveMemIndex ¶
CreateBleveMemIndex creates a new in-memory (unpersisted) Bleve index.
func Describe ¶
Describe returns numPages, width, height for the PDF in `pdfReader`. Width and height are in mm.
func DocPageSize ¶
DocPageSize returns the width and height of a document whose page sizes are `pageSizes`. This is a single source of truth for our definition of document page size. Currently the document width is defined as the width of the widest page in the document.
func ExpandUser ¶
ExpandUser returns `filename` with ~ replaced with user's home directory.
func ExtractPageText ¶
ExtractPageText returns the text on page `page`.
func ExtractPageTextLocation ¶
ExtractPageTextLocation returns the locations of text on page `page`.
func ExtractPageTextObject ¶
ExtractPageTextObject returns the PageText on page `page`. PageText is an opaque UniDoc struct that describes the text marks on a PDF page. extractDocPages uses UniDoc to extract the text from all pages in PDF file `inPath` as a slice of PdfPage.
func FileHash ¶
FileHash returns a hex encoded string of the SHA-256 digest of the contents of file `filename`.
func GetPosition ¶
func GetPosition(positions []serial.TextLocation, start, end uint32) serial.TextLocation
func IntSetIntersection ¶
IntSetIntersection returns `a` ∩ `b`.
func IntSetToSlice ¶
IntSetToSlice returns keys of `set` as a slice.
func IntSliceDifference ¶
IntSliceDifference returns the elements in `a` that aren't in `b`.
func IntSliceIntersection ¶
IntSliceIntersection returns `a` ∩ `b`.
func IntSliceSymmetricDifference ¶
IntSliceSymmetricDifference returns the elements in `a` that aren't in `b` plus the elements in `b` that aren't in `a`.
func IntSliceToSet ¶
IntSliceToSet returns a map whose keys are the elements of `arr`.
func MakeUsage ¶
func MakeUsage(msg string)
MakeUsage updates flag.Usage to include usage message `msg`.
func MinMaxIntSlice ¶
MinMaxIntSlice returns min and max of `arr`. `valid` is true if `arr` contains values.
func MkParentDir ¶
MkParentDir creates the parent directory for `filename` if it doesn't already exist.
func PageSizeMm ¶
PageSizeMm returns the width and height of `page` in mm.
func PageSizePt ¶
PageSizePt returns the width and height of `page` in points.
func PatternsToPaths ¶
PatternsToPaths returns a list of files matching the patterns in `patternList`.
func PdfOpenDescribe ¶
PdfOpenDescribe returns numPages, width, height for PDF file `inPath`. Width and height are in mm.
func PdfOpenFile ¶
PdfOpenFile opens PDF file `inPath` and attempts to handle null encryption schemes.
func PdfOpenReader ¶
func ProcessPDFPagesFile ¶
func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *pdf.PdfPage) error) error
ProcessPDFPagesFile runs `processPage` on every page in PDF file `inPath`. It recovers from errors in the libraries it calls unless ExposeErrors is true.
func ProcessPDFPagesReader ¶
func ReaderSizeHash ¶
func ReaderSizeHash(rs io.ReadSeeker) (int64, string, error)
func RegularFile ¶
RegularFile returns true if file `filename` is a regular file.
func RemoveDirectory ¶
RemoveDirectory recursively removes directory `dir` and its contents from disk.
func SetLogging ¶
func SetLogging()
func SortFileSize ¶
SortFileSize returns the paths of the files in `pathList` sorted by ascending size. If `minSize` >= 0 then only files of this size or larger are returned. If `maxSize` >= 0 then only files of this size or smaller are returned.
func StringUniques ¶
StringUniques returns the unique strings in `arr`.
func ToSerialTextLocation ¶
func ToSerialTextLocation(loc extractor.TextLocation) serial.TextLocation
ToSerialTextLocation converts extractor.TextLocation `loc` to a more compact serial.TextLocation.
func WriteJsonSlice ¶
WriteJsonSlice writes slice `vals` to json file `filename`, one line per string. NOTE: We write this json file in a human readable way because we will be using it in development.
Types ¶
type DocPageText ¶
type DocPageText struct { DocIdx uint64 // Doc index (0-offset) into PositionsState.fileList . PageIdx uint32 // Page index (0-offset) into DocPositions.index . PageNum uint32 // Page number in PDF file (1-offset) Text string // Extracted page text. }
DocPageText contains doc:page indexes, the PDF page number and the text extracted from a PDF page.
type DocPositions ¶
type DocPositions struct {
// contains filtered or unexported fields
}
DocPositions tracks the data that is used to index a PDF file.
func (*DocPositions) AddDocPage ¶
func (lDoc *DocPositions) AddDocPage(pageNum uint32, dpl serial.DocPageLocations, text string) (uint32, error)
AddDocPage adds a page (with page number `pageNum` and contents `dpl`) to `lDoc`. !@#$ Remove `text` param.
func (*DocPositions) Close ¶
func (lDoc *DocPositions) Close() error
func (*DocPositions) GetTextPath ¶
func (lDoc *DocPositions) GetTextPath(pageIdx uint32) string
func (DocPositions) Len ¶
func (d DocPositions) Len() int
func (*DocPositions) ReadPagePositions ¶
func (lDoc *DocPositions) ReadPagePositions(pageIdx uint32) (uint32, serial.DocPageLocations, error)
ReadPagePositions returns the DocPageLocations of the text on the page with index `pageIdx` (0-offset) in document `lDoc`.
func (*DocPositions) ReadPageText ¶
func (lDoc *DocPositions) ReadPageText(pageIdx uint32) (string, error)
func (*DocPositions) Save ¶
func (lDoc *DocPositions) Save() error
func (DocPositions) String ¶
func (d DocPositions) String() string
type ExtractList ¶
type ExtractList struct {
// contains filtered or unexported fields
}
ExtractList is a list of document:page inputs that are to be combined in a specified order.
func CreateExtractList ¶
func CreateExtractList(maxPages int) *ExtractList
func (*ExtractList) AddRect ¶
func (l *ExtractList) AddRect(inPath string, pageNum uint32, llx, lly, urx, ury float32)
func (*ExtractList) NumPages ¶
func (l *ExtractList) NumPages() int
func (*ExtractList) SaveOutputPdf ¶
func (l *ExtractList) SaveOutputPdf(outPath string) error
SaveOutputPdf is called by position_search.go to markup a PDF file with the locations of text. `l` contains the input PDF names and the pages and coordinates to mark. The resulting PDF is written to `outPath`.
func (ExtractList) String ¶
func (l ExtractList) String() string
type FileDesc ¶
type FileDesc struct { InPath string // Full path to PDF file. Hash string // SHA-256 hash of file contents. SizeMB float64 // Size of PDF file on disk. }
FileDesc describes a PDF file.
func CreateFileDesc ¶
func CreateFileDesc(inPath string, rs io.ReadSeeker) (FileDesc, error)
type FileFinder ¶
type FileFinder struct {
// contains filtered or unexported fields
}
FileFinder is a group of file paths.
func NewFileFinder ¶
func NewFileFinder(pathList []string) FileFinder
NewFileFinder returns a FileFinder of all file paths in `pathList`.
func NewFileFinderFromCorpus ¶
func NewFileFinderFromCorpus() (FileFinder, error)
NewFileFinderFromCorpus returns a FileFinder for all files in our main corpus directory.
func (*FileFinder) Find ¶
func (ff *FileFinder) Find(fullpath string) string
Find finds the file path in `ff` that best matches `fullpath`.
type PdfMatch ¶
type PdfMatch struct { InPath string PageNum uint32 LineNum int Line string serial.DocPageLocations // contains filtered or unexported fields }
PdfMatch describes a single search match in a PDF document. It is the analog of a bleve search.DocumentMatch.
type PdfMatchSet ¶
func SearchIndex ¶
func SearchIndex(lState *PositionsState, index bleve.Index, term string, maxResults int) ( PdfMatchSet, error)
func SearchPdfIndex ¶
func SearchPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)
func (PdfMatchSet) Files ¶
func (s PdfMatchSet) Files() []string
Files returns the unique file names in `s`.
func (PdfMatchSet) Filter ¶
func (s PdfMatchSet) Filter(maxResultsPerFile int) PdfMatchSet
Filter returns a filtered list of results in `s` as a PdfMatchSet.
func (PdfMatchSet) String ¶
func (s PdfMatchSet) String() string
type PositionsState ¶
type PositionsState struct {
// contains filtered or unexported fields
}
PositionsState is the global state of a writer or reader to the position indexes saved to disk.
func FromHIPDs ¶
func FromHIPDs(hipds []serial.HashIndexPathDoc) PositionsState
func IndexPdfFiles ¶
func IndexPdfFiles(pathList []string, persistDir string, forceCreate, allowAppend bool, report func(string)) (*PositionsState, bleve.Index, int, error)
IndexPdfFiles creates a bleve+PositionsState index for `pathList`. If `persistDir` is not empty, the index is written to this directory. If `forceCreate` is true and `persistDir` is not empty, a new directory is always created. If `allowAppend` is true and `persistDir` is not empty and a bleve index already exists on disk then the bleve index will be appended to. `report` is a supplied function that is called to report progress. TODO: Remove `allowAppend` argument. Instead always append to a bleve index if it exists and
`forceCreate` is not set.
func IndexPdfReaders ¶
func IndexPdfReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, forceCreate, allowAppend bool, report func(string)) (*PositionsState, bleve.Index, int, error)
IndexPdfReaders returns a PositionsState and a bleve.Index over the PDF contents read by the io.ReadSeekers in `rsList`. The names of the PDFs are in the corresponding positions in `pathList`. If `persistDir` is empty, the index is stored in memory. If `persistDir` is not empty, the index is stored on disk in `persistDir`. `report` is a supplied function that is called to report progress.
func OpenPositionsState ¶
func OpenPositionsState(root string, forceCreate bool) (*PositionsState, error)
OpenPositionsState loads indexes from an existing locations directory `root` or creates one if it doesn't exist. When opening for writing, do this to ensure final index is written to disk:
lState, err := doclib.OpenPositionsState(persistDir, forceCreate) defer lState.Flush()
func (PositionsState) Check ¶
func (l PositionsState) Check()
func (*PositionsState) CreatePositionsDoc ¶
func (lState *PositionsState) CreatePositionsDoc(fd FileDesc) (*DocPositions, error)
CreatePositionsDoc creates a DocPositions for writing. CreatePositionsDoc always populates the DocPositions with base fields. In a persistent `lState`, necessary directories are created and files are opened.
func (*PositionsState) ExtractDocPagePositions ¶
func (lState *PositionsState) ExtractDocPagePositions(inPath string) ([]DocPageText, error)
func (*PositionsState) ExtractDocPagePositionsReader ¶
func (lState *PositionsState) ExtractDocPagePositionsReader(inPath string, rs io.ReadSeeker) ( []DocPageText, error)
ExtractDocPagePositionsReader extracts the text of the PDF file referenced by `rs`. It returns the text as a DocPageText per page. The []DocPageText refer to DocPositions which are stored in lState.hashDoc which is updated in this function.
func (*PositionsState) Flush ¶
func (lState *PositionsState) Flush() error
func (*PositionsState) GetHashPath ¶
func (lState *PositionsState) GetHashPath(docIdx uint64) (hash, inPath string)
func (PositionsState) Len ¶
func (l PositionsState) Len() int
func (*PositionsState) OpenPositionsDoc ¶
func (lState *PositionsState) OpenPositionsDoc(docIdx uint64) (*DocPositions, error)
OpenPositionsDoc opens a DocPositions for reading. In a persistent `lState`, necessary files are opened in lDoc.openDoc().
func (*PositionsState) ReadDocPagePositions ¶
func (lState *PositionsState) ReadDocPagePositions(docIdx uint64, pageIdx uint32) ( string, uint32, serial.DocPageLocations, error)
ReadDocPagePositions is inefficient. A DocPositions (a file) is opened and closed to read a page.
func (*PositionsState) ReadDocPageText ¶
func (lState *PositionsState) ReadDocPageText(docIdx uint64, pageIdx uint32) (string, error)
func (PositionsState) String ¶
func (l PositionsState) String() string
func (PositionsState) ToHIPDs ¶
func (l PositionsState) ToHIPDs() []serial.HashIndexPathDoc