2020-08-27 21:45:09 +00:00
//
// Copyright 2020 FoxyUtils ehf. All rights reserved.
//
// This is a commercial product and requires a license to operate.
// A trial license can be obtained at https://unidoc.io
//
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
//
// Use of this source code is governed by the UniDoc End User License Agreement
// terms that can be accessed at https://unidoc.io/eula/
2018-03-22 14:03:47 +00:00
2020-08-27 21:45:09 +00:00
// Package extractor is used for quickly extracting PDF content through a simple interface.
// Currently offers functionality for extracting textual content.
2024-05-29 17:04:37 +00:00
package extractor ; import ( _ae "bytes" ; _b "errors" ; _efc "fmt" ; _ag "github.com/unidoc/unipdf/v3/common" ; _aa "github.com/unidoc/unipdf/v3/contentstream" ; _gf "github.com/unidoc/unipdf/v3/core" ; _d "github.com/unidoc/unipdf/v3/internal/license" ; _cb "github.com/unidoc/unipdf/v3/internal/textencoding" ;
_aae "github.com/unidoc/unipdf/v3/internal/transform" ; _af "github.com/unidoc/unipdf/v3/model" ; _cc "golang.org/x/image/draw" ; _c "golang.org/x/text/unicode/norm" ; _ef "image" ; _fe "image/color" ; _fc "io" ; _ea "math" ; _ed "reflect" ; _g "regexp" ; _e "sort" ; _a "strings" ;
_fg "unicode" ; _bb "unicode/utf8" ; ) ;

// highestWord scans the words stored under key `_ffagg` of the bag's `_cdbc`
// index, in stored order, and returns the first one whose `_accb` value lies in
// the closed interval [_effd, _ffed]. It returns nil when no word qualifies.
func (_ccdd *wordBag) highestWord(_ffagg int, _effd, _ffed float64) *textWord {
	candidates := _ccdd._cdbc[_ffagg]
	for i := range candidates {
		word := candidates[i]
		if inRange := _effd <= word._accb && word._accb <= _ffed; inRange {
			return word
		}
	}
	return nil
}
// String renders the stack for diagnostics: a header line carrying the number
// of entries, followed by one tab-indented line per entry giving its index and
// its own String() output ("<nil>" for a nil entry). Lines are joined with
// newlines.
func (_ege *stateStack) String() string {
	stack := *_ege
	lines := make([]string, 1, len(stack)+1)
	lines[0] = _efc.Sprintf("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064", len(stack))
	for idx, entry := range stack {
		desc := "\u003c\u006e\u0069l\u003e"
		if entry != nil {
			desc = entry.String()
		}
		lines = append(lines, _efc.Sprintf("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073", idx, desc))
	}
	return _a.Join(lines, "\u000a")
}

// _ebdca decodes the image held by the stream object `_fffdb` and hands it,
// together with `_fbgda`, to _eccab, returning that result.
//
// When `_fffdb` is not a stream the function returns (nil, nil), so callers
// must check the image for nil as well as the error. Errors from building the
// XObject image or converting it are returned unchanged.
func _ebdca(_fffdb _gf.PdfObject, _fbgda _fe.Color) (_ef.Image, error) {
	stream, isStream := _gf.GetStream(_fffdb)
	if !isStream {
		return nil, nil
	}

	xobj, err := _af.NewXObjectImageFromStream(stream)
	if err != nil {
		return nil, err
	}

	decoded, err := xobj.ToImage()
	if err != nil {
		return nil, err
	}

	return _eccab(decoded, _fbgda), nil
}

// rulingList is a slice of ruling pointers.
type rulingList []*ruling
// comp reports whether the ruling at index `_ecea` orders before the ruling at
// index `_fgfc`.
//
// Rulings of different kind (`_ecfb`) order by descending kind. Two rulings of
// kind _ceag never order before one another. Otherwise the first differing key
// decides: `_aeef` (greater first), then `_ggdb` (smaller first), then `_gbca`
// (smaller first). That verdict is used as-is for kind _eeg and negated for any
// other kind.
func (_fabf rulingList) comp(_ecea, _fgfc int) bool {
	left, right := _fabf[_ecea], _fabf[_fgfc]
	kind := left._ecfb

	switch {
	case kind != right._ecfb:
		return kind > right._ecfb
	case kind == _ceag:
		return false
	}

	var before bool
	switch {
	case left._aeef != right._aeef:
		before = left._aeef > right._aeef
	case left._ggdb != right._ggdb:
		before = left._ggdb < right._ggdb
	default:
		before = left._gbca < right._gbca
	}

	// Every kind other than _eeg inverts the verdict.
	negate := kind != _eeg
	return before != negate
}
// The obfuscator emits several declarations per physical line, so the
// declarations in the lines below are described here, in order of appearance:
//
//   - TextTable: see its own doc comment directly above the type.
//   - _bebf builds a ruling of kind _gecdf from a rectangle, taking _aeef from
//     Urx, _ggdb from Lly and _gbca from Ury.
//   - (*shapesState).cubicTo logs "cubicTo:" when _cece is set and then calls
//     addPoint with its last two arguments only; the first four are not used.
//   - (*textTable).reduce collects the rows for which emptyCompositeRow is
//     false and the columns for which emptyCompositeColumn is false. If nothing
//     was dropped it returns the receiver itself; otherwise it returns a new
//     textTable (same _caagg) into which the surviving composites are copied
//     with getComposite/putComposite at their new, compacted indices. Extra
//     diagnostics are printed when _dedc is set.
//   - _eebe walks a slice of structElement values and returns one *list per
//     element, built by _abda from the element's text lines, its _dccda value
//     and the lists built recursively from its children (_befc). Text lines are
//     attached only when int(_fbge) is not -1, the element's _bffdf is a
//     *core.PdfObjectReference, `_abceg` has an entry for that integer, `_fbcd`
//     is a *core.PdfIndirectObject, and the two references are
//     reflect.DeepEqual. A failed reference cast is only logged at debug level.
//   - (*wordBag).applyRemovals rewrites each _cdbc entry named in `_cfgb`
//     without the words listed for it, and deletes the entry when the computed
//     remaining count is zero.
//     NOTE(review): the new slice is sized len(entry)-len(removals), which
//     presumably relies on every listed word being present in the entry —
//     confirm against the callers.
//   - _eaf returns the translation of a matrix as a Point.
//   - lineRuling is a plain struct embedding color.Color.
//   - paraList.addNeighbours fills in four neighbour links on every paragraph.
//     Among the candidates returned by yNeighbours(_gcaf) it considers those
//     with Urx <= p.Llx and those with Llx >= p.Urx; it picks the one with the
//     largest Urx (resp. smallest Llx), drops the pick if another candidate of
//     the same group overlaps it in x, and stores it in _caagd (resp. _aggd)
//     when _dfba accepts the pair of rectangles. The same procedure runs on
//     xNeighbours(_bcfa) with Ury/Lly and _gfb, storing into _cabda (candidates
//     with Ury <= p.Lly) and _ecdfc (candidates with Lly >= p.Ury). A final
//     pass clears every link that the other paragraph does not reciprocate.
//   - The const block holds numeric tuning constants; their meanings are not
//     visible in this part of the file.
//   - _afee groups text lines by an integer key: the _ffbdg of the first
//     element of the _ffcd of the first element of the line's _cfcb. Lines
//     rejected by _ccga are logged at debug level and skipped. Lines of the
//     paragraphs stored in a paragraph's _befe table (its _dgcf map) are
//     grouped in the same way.
//   - textObject is a plain struct.

// TextTable represents a table.
// Cells are ordered top-to-bottom, left-to-right.
// Cells[y] is the (0-offset) y'th row in the table.
// Cells[y][x] is the (0-offset) x'th column in the table.
type TextTable struct { _af . PdfRectangle ; W , H int ; Cells [ ] [ ] TableCell ; } ; func _bebf ( _bcfd _af . PdfRectangle ) * ruling { return & ruling { _ecfb : _gecdf , _aeef : _bcfd . Urx , _ggdb : _bcfd . Lly , _gbca : _bcfd . Ury } ; } ; func ( _aabc * shapesState ) cubicTo ( _adg , _ece , _dbee , _becb , _bece , _ccbcd float64 ) { if _cece { _ag . Log . Info ( "\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a" ) ;
} ; _aabc . addPoint ( _bece , _ccbcd ) ; } ; func ( _dbgdd * textTable ) reduce ( ) * textTable { _cecd := make ( [ ] int , 0 , _dbgdd . _cegga ) ; _cbccf := make ( [ ] int , 0 , _dbgdd . _aageb ) ; for _caba := 0 ; _caba < _dbgdd . _cegga ; _caba ++ { if ! _dbgdd . emptyCompositeRow ( _caba ) { _cecd = append ( _cecd , _caba ) ;
} ; } ; for _gbcb := 0 ; _gbcb < _dbgdd . _aageb ; _gbcb ++ { if ! _dbgdd . emptyCompositeColumn ( _gbcb ) { _cbccf = append ( _cbccf , _gbcb ) ; } ; } ; if len ( _cecd ) == _dbgdd . _cegga && len ( _cbccf ) == _dbgdd . _aageb { return _dbgdd ; } ; _fcfab := textTable { _caagg : _dbgdd . _caagg , _aageb : len ( _cbccf ) , _cegga : len ( _cecd ) , _dgcf : make ( map [ uint64 ] * textPara , len ( _cbccf ) * len ( _cecd ) ) } ;
if _dedc { _ag . Log . Info ( "\u0072\u0065\u0064\u0075ce\u003a\u0020\u0025\u0064\u0078\u0025\u0064\u0020\u002d\u003e\u0020\u0025\u0064\u0078%\u0064" , _dbgdd . _aageb , _dbgdd . _cegga , len ( _cbccf ) , len ( _cecd ) ) ; _ag . Log . Info ( "\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076" , _cbccf ) ;
_ag . Log . Info ( "\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076" , _cecd ) ; } ; for _cgagc , _fcgfd := range _cecd { for _ebbc , _cade := range _cbccf { _abcgc , _febfc := _dbgdd . getComposite ( _cade , _fcgfd ) ; if _abcgc == nil { continue ;
} ; if _dedc { _efc . Printf ( "\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n" , _ebbc , _cgagc , _cade , _fcgfd , _efcca ( _abcgc . merge ( ) . text ( ) , 50 ) ) ; } ; _fcfab . putComposite ( _ebbc , _cgagc , _abcgc , _febfc ) ;
} ; } ; return & _fcfab ; } ; func _eebe ( _cafe [ ] structElement , _abceg map [ int ] [ ] * textLine , _fbcd _gf . PdfObject ) [ ] * list { _bdfea := [ ] * list { } ; for _ , _aeee := range _cafe { _ffecg := _aeee . _befc ; _cfeaa := int ( _aeee . _fbge ) ; _cgbbf := _aeee . _dccda ; _ffac := [ ] * textLine { } ;
_bcbcg := [ ] * list { } ; _bcdea := _aeee . _bffdf ; _bbbaf , _cdeg := ( _bcdea . ( * _gf . PdfObjectReference ) ) ; if ! _cdeg { _ag . Log . Debug ( "\u0066\u0061\u0069l\u0065\u0064\u0020\u006f\u0074\u0020\u0063\u0061\u0073\u0074\u0020\u0074\u006f\u0020\u002a\u0063\u006f\u0072\u0065\u002e\u0050\u0064\u0066\u004f\u0062\u006a\u0065\u0063\u0074R\u0065\u0066\u0065\u0072\u0065\u006e\u0063\u0065" ) ;
} ; if _cfeaa != - 1 && _bbbaf != nil { if _cgcd , _cccce := _abceg [ _cfeaa ] ; _cccce { if _eadb , _aebdb := _fbcd . ( * _gf . PdfIndirectObject ) ; _aebdb { _aggc := _eadb . PdfObjectReference ; if _ed . DeepEqual ( * _bbbaf , _aggc ) { _ffac = _cgcd ; } ; } ; } ; } ; if _ffecg != nil { _bcbcg = _eebe ( _ffecg , _abceg , _fbcd ) ;
} ; _ddce := _abda ( _ffac , _cgbbf , _bcbcg ) ; _bdfea = append ( _bdfea , _ddce ) ; } ; return _bdfea ; } ; func ( _cgcb * wordBag ) applyRemovals ( _cfgb map [ int ] map [ * textWord ] struct { } ) { for _ddeb , _bad := range _cfgb { if len ( _bad ) == 0 { continue ; } ; _ggee := _cgcb . _cdbc [ _ddeb ] ;
_agdb := len ( _ggee ) - len ( _bad ) ; if _agdb == 0 { delete ( _cgcb . _cdbc , _ddeb ) ; continue ; } ; _ebaa := make ( [ ] * textWord , _agdb ) ; _ggdf := 0 ; for _ , _decc := range _ggee { if _ , _ebbb := _bad [ _decc ] ; ! _ebbb { _ebaa [ _ggdf ] = _decc ; _ggdf ++ ; } ; } ; _cgcb . _cdbc [ _ddeb ] = _ebaa ;
} ; } ; func _eaf ( _gbbd _aae . Matrix ) _aae . Point { _caeg , _ccca := _gbbd . Translation ( ) ; return _aae . Point { X : _caeg , Y : _ccca } ; } ; type lineRuling struct { _faab rulingKind ; _fffg markKind ; _fe . Color ; _bbee , _efge _aae . Point ; } ; func ( _faff paraList ) addNeighbours ( ) { _afbaf := func ( _edaf [ ] int , _faaba * textPara ) ( [ ] * textPara , [ ] * textPara ) { _dgbbf := make ( [ ] * textPara , 0 , len ( _edaf ) - 1 ) ;
_beaec := make ( [ ] * textPara , 0 , len ( _edaf ) - 1 ) ; for _ , _ggefg := range _edaf { _gcbea := _faff [ _ggefg ] ; if _gcbea . Urx <= _faaba . Llx { _dgbbf = append ( _dgbbf , _gcbea ) ; } else if _gcbea . Llx >= _faaba . Urx { _beaec = append ( _beaec , _gcbea ) ; } ; } ; return _dgbbf , _beaec ;
} ; _bgcae := func ( _dcbff [ ] int , _dfecc * textPara ) ( [ ] * textPara , [ ] * textPara ) { _cegab := make ( [ ] * textPara , 0 , len ( _dcbff ) - 1 ) ; _gdbcc := make ( [ ] * textPara , 0 , len ( _dcbff ) - 1 ) ; for _ , _fabd := range _dcbff { _cebdb := _faff [ _fabd ] ; if _cebdb . Ury <= _dfecc . Lly { _gdbcc = append ( _gdbcc , _cebdb ) ;
} else if _cebdb . Lly >= _dfecc . Ury { _cegab = append ( _cegab , _cebdb ) ; } ; } ; return _cegab , _gdbcc ; } ; _gdegc := _faff . yNeighbours ( _gcaf ) ; for _ , _fgdf := range _faff { _debcc := _gdegc [ _fgdf ] ; if len ( _debcc ) == 0 { continue ; } ; _efagg , _cggdf := _afbaf ( _debcc , _fgdf ) ;
if len ( _efagg ) == 0 && len ( _cggdf ) == 0 { continue ; } ; if len ( _efagg ) > 0 { _bacac := _efagg [ 0 ] ; for _ , _aebg := range _efagg [ 1 : ] { if _aebg . Urx >= _bacac . Urx { _bacac = _aebg ; } ; } ; for _ , _afgf := range _efagg { if _afgf != _bacac && _afgf . Urx > _bacac . Llx { _bacac = nil ;
break ; } ; } ; if _bacac != nil && _dfba ( _fgdf . PdfRectangle , _bacac . PdfRectangle ) { _fgdf . _caagd = _bacac ; } ; } ; if len ( _cggdf ) > 0 { _dafgg := _cggdf [ 0 ] ; for _ , _egge := range _cggdf [ 1 : ] { if _egge . Llx <= _dafgg . Llx { _dafgg = _egge ; } ; } ; for _ , _ffaag := range _cggdf { if _ffaag != _dafgg && _ffaag . Llx < _dafgg . Urx { _dafgg = nil ;
break ; } ; } ; if _dafgg != nil && _dfba ( _fgdf . PdfRectangle , _dafgg . PdfRectangle ) { _fgdf . _aggd = _dafgg ; } ; } ; } ; _gdegc = _faff . xNeighbours ( _bcfa ) ; for _ , _gbad := range _faff { _egdef := _gdegc [ _gbad ] ; if len ( _egdef ) == 0 { continue ; } ; _gecf , _aabfc := _bgcae ( _egdef , _gbad ) ;
if len ( _gecf ) == 0 && len ( _aabfc ) == 0 { continue ; } ; if len ( _aabfc ) > 0 { _fdga := _aabfc [ 0 ] ; for _ , _cefb := range _aabfc [ 1 : ] { if _cefb . Ury >= _fdga . Ury { _fdga = _cefb ; } ; } ; for _ , _decfg := range _aabfc { if _decfg != _fdga && _decfg . Ury > _fdga . Lly { _fdga = nil ;
break ; } ; } ; if _fdga != nil && _gfb ( _gbad . PdfRectangle , _fdga . PdfRectangle ) { _gbad . _cabda = _fdga ; } ; } ; if len ( _gecf ) > 0 { _ddca := _gecf [ 0 ] ; for _ , _fgcb := range _gecf [ 1 : ] { if _fgcb . Lly <= _ddca . Lly { _ddca = _fgcb ; } ; } ; for _ , _fgcbd := range _gecf { if _fgcbd != _ddca && _fgcbd . Lly < _ddca . Ury { _ddca = nil ;
break ; } ; } ; if _ddca != nil && _gfb ( _gbad . PdfRectangle , _ddca . PdfRectangle ) { _gbad . _ecdfc = _ddca ; } ; } ; } ; for _ , _eabb := range _faff { if _eabb . _caagd != nil && _eabb . _caagd . _aggd != _eabb { _eabb . _caagd = nil ; } ; if _eabb . _ecdfc != nil && _eabb . _ecdfc . _cabda != _eabb { _eabb . _ecdfc = nil ;
} ; if _eabb . _aggd != nil && _eabb . _aggd . _caagd != _eabb { _eabb . _aggd = nil ; } ; if _eabb . _cabda != nil && _eabb . _cabda . _ecdfc != _eabb { _eabb . _cabda = nil ; } ; } ; } ; const ( _gefg = 1.0e-6 ; _fbf = 1.0e-4 ; _dgbd = 10 ; _cdcb = 6 ; _egeb = 0.5 ; _fbbf = 0.12 ; _gegc = 0.19 ; _ceca = 0.04 ;
_cgdgc = 0.04 ; _cgbc = 1.0 ; _afdg = 0.04 ; _cecf = 0.4 ; _bea = 0.7 ; _daf = 1.0 ; _egdfc = 0.1 ; _geac = 1.4 ; _efbf = 0.46 ; _bgcc = 0.02 ; _fcfce = 0.2 ; _bbcc = 0.5 ; _eade = 4 ; _fcbe = 4.0 ; _gaba = 6 ; _ccfc = 0.3 ; _bcfa = 0.01 ; _gcaf = 0.02 ; _fcad = 2 ; _bce = 2 ; _ebdb = 500 ; _abca = 4.0 ; _gabg = 4.0 ; _ebffe = 0.05 ;
_gcffb = 0.1 ; _bcae = 2.0 ; _cggd = 2.0 ; _caf = 1.5 ; _bfad = 3.0 ; _bbce = 0.25 ; ) ; func _afee ( _fgfef * paraList ) map [ int ] [ ] * textLine { _ggfd := map [ int ] [ ] * textLine { } ; for _ , _gafd := range * _fgfef { for _ , _edbfb := range _gafd . _aage { if ! _ccga ( _edbfb ) { _ag . Log . Debug ( "g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e" ) ;
continue ; } ; _aded := _edbfb . _cfcb [ 0 ] . _ffcd [ 0 ] . _ffbdg ; _ggfd [ _aded ] = append ( _ggfd [ _aded ] , _edbfb ) ; } ; if _gafd . _befe != nil { _dadd := _gafd . _befe . _dgcf ; for _ , _abfe := range _dadd { for _ , _ecfe := range _abfe . _aage { if ! _ccga ( _ecfe ) { _ag . Log . Debug ( "g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e" ) ;
continue ; } ; _gcbfb := _ecfe . _cfcb [ 0 ] . _ffcd [ 0 ] . _ffbdg ; _ggfd [ _gcbfb ] = append ( _ggfd [ _gcbfb ] , _ecfe ) ; } ; } ; } ; } ; return _ggfd ; } ; type textObject struct { _dbe * Extractor ; _dae * _af . PdfPageResources ; _aef _aa . GraphicsState ; _ecff * textState ; _aega * stateStack ;
_dbc _aae . Matrix ; _ebc _aae . Matrix ; _afff [ ] * textMark ; _cdcc bool ; } ;
2024-02-11 21:29:32 +00:00
// Font represents the font properties on a PDF page.
2024-05-29 17:04:37 +00:00
type Font struct { PdfFont * _af . PdfFont ;
2024-02-11 21:29:32 +00:00
// FontName represents Font Name from font properties.
FontName string ;
// FontType represents Font Subtype entry in the font dictionary inside page resources.
// Examples : type0, Type1, MMType1, Type3, TrueType, CIDFont.
FontType string ;
// ToUnicode is true if font provides a `ToUnicode` mapping.
ToUnicode bool ;
// IsCID is true if underlying font is a composite font.
// Composite font is represented by a font dictionary whose Subtype is `Type0`
IsCID bool ;
// IsSimple is true if font is simple font.
// A simple font is limited to 8-bit character codes (0-255).
IsSimple bool ;
// FontData represents the raw data of the embedded font file.
// It can have format TrueType (TTF), PostScript Font (PFB) or Compact Font Format (CFF).
// The FontData value is taken from the `FontFile`, `FontFile2` or `FontFile3` entry inside the Font Descriptor.
// At most, only one of `FontFile`, `FontFile2` or `FontFile3` will be FontData value.
FontData [ ] byte ;
// FontFileName is a name representing the font. It has the format:
// (Font Name) + (Font Type Extension), example: helvetica.ttf.
FontFileName string ;
// FontDescriptor represents metrics and other attributes inside font properties from PDF Structure (Font Descriptor).
2024-05-29 17:04:37 +00:00
FontDescriptor * _af . PdfFontDescriptor ; } ; func _ebgc ( _babf [ ] * textLine , _aabcg string ) string { var _fdfce _a . Builder ; _bcac := 0.0 ; for _abcfe , _ebec := range _babf { _agbd := _ebec . text ( ) ; _acae := _ebec . _addd ; if _abcfe < len ( _babf ) - 1 { _bcac = _babf [ _abcfe + 1 ] . _addd ;
} else { _bcac = 0.0 ; } ; _fdfce . WriteString ( _aabcg ) ; _fdfce . WriteString ( _agbd ) ; if _bcac != _acae { _fdfce . WriteString ( "\u000a" ) ; } else { _fdfce . WriteString ( "\u0020" ) ; } ; } ; return _fdfce . String ( ) ; } ; func ( _deegd gridTile ) contains ( _bddc _af . PdfRectangle ) bool { if _deegd . numBorders ( ) < 3 { return false ;
} ; if _deegd . _gceeb && _bddc . Llx < _deegd . Llx - _caf { return false ; } ; if _deegd . _gdcbg && _bddc . Urx > _deegd . Urx + _caf { return false ; } ; if _deegd . _dbafa && _bddc . Lly < _deegd . Lly - _caf { return false ; } ; if _deegd . _ffdf && _bddc . Ury > _deegd . Ury + _caf { return false ;
} ; return true ; } ;
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// List returns all the list objects detected on the page.
// It detects all the bullet point Lists from a given pdf page and builds a slice of bullet list objects.
// A given bullet list object has a tree structure.
// Each bullet point list is extracted with the text content it contains and all the sub lists found under it as children in the tree.
// The remaining content of the PDF is ignored; only the text in the bullet point lists is extracted.
// The list extraction is done in two ways.
// 1. If the document is tagged then the lists are extracted using the tags provided in the document.
// 2. Otherwise the bullet lists are extracted from the raw text using regex matching.
// By default the document tag is used if available.
// However this can be disabled using `DisableDocumentTags` in the `Options` object.
// Sometimes disabling document tags option might give a better bullet list extraction if the document was tagged incorrectly.
//
// options := &Options{
// DisableDocumentTags: false, // this means use document tag if available
// }
// ex, err := NewWithOptions(page, options)
// // handle error
// pageText, _, _, err := ex.ExtractPageText()
// // handle error
// lists := pageText.List()
// txt := lists.Text()
func (_gaedc PageText) List() lists {
	useTags := !_gaedc._gbg._gcgg
	paras := _gaedc.getParagraphs()
	hasTree := _gaedc._cgad != nil && *_gaedc._cgad != nil

	// The text-matching result is always computed; it is the fallback.
	found := paras.list()
	if !(hasTree && useTags) {
		return found
	}

	linesByKey := _afee(&paras)
	root := &structTreeRoot{}
	root.parseStructTreeRoot(*_gaedc._cgad)
	if root._cfbfg == nil {
		_ag.Log.Debug("\u004c\u0069\u0073\u0074\u003a\u0020\u0073t\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e'\u0074\u0020\u0068\u0061\u0076e\u0020\u0061\u006e\u0079\u0020\u0063\u006f\u006e\u0074e\u006e\u0074\u002c\u0020\u0075\u0073\u0069\u006e\u0067\u0020\u0074\u0065\u0078\u0074\u0020\u006d\u0061\u0074\u0063\u0068\u0069\u006e\u0067\u0020\u006d\u0065\u0074\u0068\u006f\u0064\u0020\u0069\u006e\u0073\u0074\u0065\u0061\u0064\u002e")
		return found
	}
	found = root.buildList(linesByKey, _gaedc._dfc)
	return found
}

// exists reports whether the array already holds a mark whose DirectObject and
// BBox are deeply equal to those of `_gag` and whose Text is identical.
func (_bffb *TextMarkArray) exists(_gag TextMark) bool {
	for _, mark := range _bffb.Elements() {
		if !_ed.DeepEqual(_gag.DirectObject, mark.DirectObject) {
			continue
		}
		if !_ed.DeepEqual(_gag.BBox, mark.BBox) {
			continue
		}
		if mark.Text == _gag.Text {
			return true
		}
	}
	return false
}

// extractInlineImage decodes the inline image `_ddf`, converts it to RGB with
// the image's colorspace (DeviceGray when none is reported) and appends an
// ImageMark to the context. Size, angle and position are read from the CTM of
// `_bdd`. The context's `_ffa` counter is incremented on success; any decoding
// or conversion error is returned unchanged.
func (_aaa *imageExtractContext) extractInlineImage(_ddf *_aa.ContentStreamInlineImage, _bdd _aa.GraphicsState, _eac *_af.PdfPageResources) error {
	raw, err := _ddf.ToImage(_eac)
	if err != nil {
		return err
	}

	colorspace, err := _ddf.GetColorSpace(_eac)
	if err != nil {
		return err
	}
	if colorspace == nil {
		colorspace = _af.NewPdfColorspaceDeviceGray()
	}

	rgb, err := colorspace.ImageToRGB(*raw)
	if err != nil {
		return err
	}

	mark := ImageMark{
		Image:  &rgb,
		Width:  _bdd.CTM.ScalingFactorX(),
		Height: _bdd.CTM.ScalingFactorY(),
		Angle:  _bdd.CTM.Angle(),
	}
	mark.X, mark.Y = _bdd.CTM.Translation()

	_aaa._cgg = append(_aaa._cgg, mark)
	_aaa._ffa++
	return nil
}

// _fdag classifies the segment between two points by passing the absolute x
// and y distances, together with _abca, to _fcgef.
func _fdag(_bcfac, _acad _aae.Point) rulingKind {
	return _fcgef(_ea.Abs(_bcfac.X-_acad.X), _ea.Abs(_bcfac.Y-_acad.Y), _abca)
}

// textState is a plain record of text-related values.
type textState struct {
	_fdad  float64
	_febe  float64
	_dba   float64
	_cdc   float64
	_gbbgg float64
	_aaeb  RenderMode
	_dgef  float64
	_fgfgb *_af.PdfFont
	_bcbc  _af.PdfRectangle
	_cfg   int
	_dacb  int
}

// computeBbox returns the combination (via _cfab) of the rectangles of all
// non-nil cells of the table, visiting rows then columns. The zero rectangle
// is returned when every cell is nil.
func (_edbcc *textTable) computeBbox() _af.PdfRectangle {
	var bbox _af.PdfRectangle
	seeded := false
	for row := 0; row < _edbcc._cegga; row++ {
		for col := 0; col < _edbcc._aageb; col++ {
			cell := _edbcc.get(col, row)
			if cell == nil {
				continue
			}
			if seeded {
				bbox = _cfab(bbox, cell.PdfRectangle)
				continue
			}
			bbox, seeded = cell.PdfRectangle, true
		}
	}
	return bbox
}

// blocked reports whether a ruling separates `_ffd` from the bag. When the
// word and the bag do not overlap in x, the bag's `_dcaa` rulings are asked
// whether they block the pair of facing edges; independently, when they do not
// overlap in y, the `_bfcg` rulings are asked. A diagnostic is logged for a hit
// when _bgeg is set.
func (_becbe *wordBag) blocked(_ffd *textWord) bool {
	switch {
	case _ffd.Urx < _becbe.Llx:
		if _becbe._dcaa.blocks(_bebf(_ffd.PdfRectangle), _ggbea(_becbe.PdfRectangle)) {
			if _bgeg {
				_ag.Log.Info("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0078\u003a\u0020\u0025\u0073\u0020\u0025\u0073", _ffd, _becbe)
			}
			return true
		}
	case _becbe.Urx < _ffd.Llx:
		if _becbe._dcaa.blocks(_bebf(_becbe.PdfRectangle), _ggbea(_ffd.PdfRectangle)) {
			if _bgeg {
				_ag.Log.Info("b\u006co\u0063\u006b\u0065\u0064\u0020\u0078\u2192\u0020:\u0020\u0025\u0073\u0020%s", _ffd, _becbe)
			}
			return true
		}
	}

	switch {
	case _ffd.Ury < _becbe.Lly:
		if _becbe._bfcg.blocks(_bbaf(_ffd.PdfRectangle), _gdgf(_becbe.PdfRectangle)) {
			if _bgeg {
				_ag.Log.Info("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0079\u003a\u0020\u0025\u0073\u0020\u0025\u0073", _ffd, _becbe)
			}
			return true
		}
	case _becbe.Ury < _ffd.Lly:
		if _becbe._bfcg.blocks(_bbaf(_becbe.PdfRectangle), _gdgf(_ffd.PdfRectangle)) {
			if _bgeg {
				_ag.Log.Info("b\u006co\u0063\u006b\u0065\u0064\u0020\u0079\u2192\u0020:\u0020\u0025\u0073\u0020%s", _ffd, _becbe)
			}
			return true
		}
	}
	return false
}

// processOperand handles one content stream operation for image extraction:
//
//   - BI with one parameter: extracts the inline image, unless it is a stencil
//     mask and IncludeInlineStencilMasks is off.
//   - Do with one parameter: extracts an image XObject, or descends into a form
//     XObject; other XObject types are ignored.
//   - scn/SCN with one parameter, while the `_eed` flag is set: looks the
//     pattern up and, if it is a tiling pattern, extracts the images of its
//     content stream. A missing pattern is logged and ignored.
//   - cs/CS: sets `_eed` to whether the first parameter's String() is
//     "Pattern".
//
// A name parameter of the wrong type yields the error _ca.
func (_ecbe *imageExtractContext) processOperand(_dd *_aa.ContentStreamOperation, _feb _aa.GraphicsState, _cfd *_af.PdfPageResources) error {
	switch {
	case _dd.Operand == "\u0042\u0049" && len(_dd.Params) == 1:
		inline, isInline := _dd.Params[0].(*_aa.ContentStreamInlineImage)
		if !isInline {
			return nil
		}
		if isMask, known := _gf.GetBoolVal(inline.ImageMask); known {
			if isMask && !_ecbe._dac.IncludeInlineStencilMasks {
				return nil
			}
		}
		return _ecbe.extractInlineImage(inline, _feb, _cfd)

	case _dd.Operand == "\u0044\u006f" && len(_dd.Params) == 1:
		name, isName := _gf.GetName(_dd.Params[0])
		if !isName {
			_ag.Log.Debug("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065")
			return _ca
		}
		_, xobjType := _cfd.GetXObjectByName(*name)
		switch xobjType {
		case _af.XObjectTypeImage:
			return _ecbe.extractXObjectImage(name, _feb, _cfd)
		case _af.XObjectTypeForm:
			return _ecbe.extractFormImages(name, _feb, _cfd)
		}

	case _ecbe._eed && (_dd.Operand == "\u0073\u0063\u006e" || _dd.Operand == "\u0053\u0043\u004e") && len(_dd.Params) == 1:
		name, isName := _gf.GetName(_dd.Params[0])
		if !isName {
			_ag.Log.Debug("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065")
			return _ca
		}
		pattern, found := _cfd.GetPatternByName(*name)
		if !found {
			_ag.Log.Debug("\u0045R\u0052\u004f\u0052\u003a\u0020\u0050\u0061\u0074\u0074\u0065\u0072n\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075\u006e\u0064")
			return nil
		}
		if pattern.IsTiling() {
			tiling := pattern.GetAsTilingPattern()
			content, err := tiling.GetContentStream()
			if err != nil {
				return err
			}
			err = _ecbe.extractContentStreamImages(string(content), tiling.Resources)
			if err != nil {
				return err
			}
		}

	case (_dd.Operand == "\u0063\u0073" || _dd.Operand == "\u0043\u0053") && len(_dd.Params) >= 1:
		_ecbe._eed = _dd.Params[0].String() == "\u0050a\u0074\u0074\u0065\u0072\u006e"
	}
	return nil
}

// Error messages used when reading font properties.
const (
	_ebd = "\u0045\u0052R\u004f\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074\u002c\u0020\u0069\u006e\u0076\u0061\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065"
	_dfe = "\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0043a\u006e\u0027\u0074 g\u0065\u0074\u0020\u0066\u006f\u006et\u0020\u0070\u0072\u006f\u0070\u0065\u0072\u0074\u0069\u0065\u0073\u002c\u0020\u0066\u006fn\u0074\u0020\u006e\u006f\u0074\u0020\u0066\u006fu\u006e\u0064"
	_cbe = "\u0045\u0052\u0052O\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0067\u0065\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u0073\u0074\u0072\u0065\u0061\u006d\u002c\u0020\u0069\u006e\u0076a\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065"
)
// split partitions the composite cell into a grid and returns it as a
// textTable with len(_fcge)+1 rows and len(_efea)+1 columns.
//
// `_fcge` holds the row corridors and `_efea` the column corridors. Both are
// passed through _cdee together with the cell's own bounds (Ury/Lly for rows,
// Llx/Urx for columns) before use. Every line of the cell is assigned to the
// first grid rectangle, scanning rows then columns, for which _aeca accepts the
// rectangle and the line's bounds; lines accepted by no rectangle are dropped.
// Each non-empty grid cell becomes one paragraph built by _adbde.
//
// Side effect: the cell's paraList is sorted in place by Lly, then Llx.
// Extra diagnostics are printed when _dedc is set.
func (_fggb compositeCell) split(_fcge, _efea []float64) *textTable {
	numRows := len(_fcge) + 1
	numCols := len(_efea) + 1

	if _dedc {
		_ag.Log.Info("\u0063\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0043\u0065l\u006c\u002e\u0073\u0070l\u0069\u0074\u003a\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0009\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025\u0073\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073=\u0025\u0036\u002e\u0032\u0066\u000a\t\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d%\u0036\u002e\u0032\u0066", numCols, numRows, _fggb, _fcge, _efea)
		_efc.Printf("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073\u000a", len(_fggb.paraList))
		for idx, para := range _fggb.paraList {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", idx, para.String())
		}
		_efc.Printf("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a", len(_fggb.lines()))
		for idx, line := range _fggb.lines() {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", idx, line)
		}
	}

	_fcge = _cdee(_fcge, _fggb.Ury, _fggb.Lly)
	_efea = _cdee(_efea, _fggb.Llx, _fggb.Urx)

	cells := make(map[uint64]*textPara, numCols*numRows)
	table := textTable{_aageb: numCols, _cegga: numRows, _dgcf: cells}

	paras := _fggb.paraList
	_e.Slice(paras, func(i, j int) bool {
		a, b := paras[i], paras[j]
		if a.Lly != b.Lly {
			return a.Lly < b.Lly
		}
		return a.Llx < b.Llx
	})

	// One rectangle per grid cell, bounded by consecutive corridor values.
	rects := make(map[uint64]_af.PdfRectangle, numCols*numRows)
	for row, lower := range _fcge[1:] {
		upper := _fcge[row]
		for col, right := range _efea[1:] {
			left := _efea[col]
			rects[_cdgd(col, row)] = _af.PdfRectangle{Llx: left, Urx: right, Lly: lower, Ury: upper}
		}
	}

	if _dedc {
		_ag.Log.Info("\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0043\u0065l\u006c\u002e\u0073\u0070\u006c\u0069\u0074\u003a\u0020\u0072e\u0063\u0074\u0073")
		_efc.Printf("\u0020\u0020\u0020\u0020")
		for col := 0; col < numCols; col++ {
			_efc.Printf("\u0025\u0033\u0030\u0064\u002c\u0020", col)
		}
		_efc.Println()
		for row := 0; row < numRows; row++ {
			_efc.Printf("\u0020\u0020\u0025\u0032\u0064\u003a", row)
			for col := 0; col < numCols; col++ {
				_efc.Printf("\u00256\u002e\u0032\u0066\u002c\u0020", rects[_cdgd(col, row)])
			}
			_efc.Println()
		}
	}

	// locate returns the (column, row) of the first grid rectangle accepted by
	// _aeca for the line, or (-1, -1) when there is none.
	locate := func(line *textLine) (int, int) {
		for row := 0; row < numRows; row++ {
			for col := 0; col < numCols; col++ {
				if _aeca(rects[_cdgd(col, row)], line.PdfRectangle) {
					return col, row
				}
			}
		}
		return -1, -1
	}

	linesByCell := make(map[uint64][]*textLine, numCols*numRows)
	for _, line := range paras.lines() {
		col, row := locate(line)
		if col < 0 {
			continue
		}
		linesByCell[_cdgd(col, row)] = append(linesByCell[_cdgd(col, row)], line)
	}

	for row := 0; row < len(_fcge)-1; row++ {
		upper := _fcge[row]
		lower := _fcge[row+1]
		for col := 0; col < len(_efea)-1; col++ {
			left := _efea[col]
			right := _efea[col+1]
			bounds := _af.PdfRectangle{Llx: left, Urx: right, Lly: lower, Ury: upper}
			cellLines := linesByCell[_cdgd(col, row)]
			if len(cellLines) == 0 {
				continue
			}
			para := _adbde(bounds, cellLines)
			table.put(col, row, para)
		}
	}
	return &table
}
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// PageImages holds the images extracted from a PDF page. Each entry carries
// spatial information: where the image is displayed and at what size.
type PageImages struct {
	Images []ImageMark
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// TextMarkArray is a collection of TextMarks.
type TextMarkArray struct {
	_aec []TextMark
}

// snapToGroupsDirection groups rulings whose primary coordinate (`_aeef`) lies
// within _cggd of the ruling that opened the group, snaps every ruling in a
// group to the value returned by the group's mergePrimary, then splits each
// group with splitSec and merges every part into a single ruling. A merged
// ruling that aligns (alignsPrimary and alignsSec) with the one appended just
// before it is reported and dropped. The result is sorted with sortStrict.
//
// The receiver's rulings are modified in place (their `_aeef` is overwritten).
// An empty list yields nil.
func (_bbacd rulingList) snapToGroupsDirection() rulingList {
	// Guard: the grouping below indexes the first element unconditionally.
	if len(_bbacd) == 0 {
		return nil
	}
	_bbacd.sortStrict()

	groups := make(map[*ruling]rulingList, len(_bbacd))
	head := _bbacd[0]
	openGroup := func(r *ruling) {
		head = r
		groups[head] = rulingList{r}
	}
	openGroup(_bbacd[0])
	for _, r := range _bbacd[1:] {
		if r._aeef < head._aeef-_gefg {
			_ag.Log.Error("\u0073\u006e\u0061\u0070T\u006f\u0047\u0072\u006f\u0075\u0070\u0073\u0044\u0069r\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0057\u0072\u006f\u006e\u0067\u0020\u0070\u0072\u0069\u006da\u0072\u0079\u0020\u006f\u0072d\u0065\u0072\u002e\u000a\u0009\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0020\u0076\u003d\u0025\u0073", head, r)
		}
		if r._aeef > head._aeef+_cggd {
			openGroup(r)
		} else {
			groups[head] = append(groups[head], r)
		}
	}

	merged := make(map[*ruling]float64, len(groups))
	owner := make(map[*ruling]*ruling, len(_bbacd))
	for key, group := range groups {
		merged[key] = group.mergePrimary()
		for _, r := range group {
			owner[r] = key
		}
	}
	for _, r := range _bbacd {
		r._aeef = merged[owner[r]]
	}

	result := make(rulingList, 0, len(_bbacd))
	for _, group := range groups {
		parts := group.splitSec()
		for i, part := range parts {
			candidate := part.merge()
			if len(result) > 0 {
				previous := result[len(result)-1]
				if previous.alignsPrimary(candidate) && previous.alignsSec(candidate) {
					_ag.Log.Error("\u0073\u006e\u0061\u0070\u0054\u006fG\u0072\u006f\u0075\u0070\u0073\u0044\u0069\u0072\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0044\u0075\u0070\u006ci\u0063\u0061\u0074\u0065\u0020\u0069\u003d\u0025\u0064\u000a\u0009\u0077\u003d\u0025s\u000a\t\u0076\u003d\u0025\u0073", i, previous, candidate)
					continue
				}
			}
			result = append(result, candidate)
		}
	}
	result.sortStrict()
	return result
}

// _fcegf turns stroked path sections into rulings. After _abbaa has processed
// the sections, every pair of consecutive points of every sub-path with at
// least two points is offered to _gccad together with the section's colour;
// accepted pairs are collected. Diagnostics are logged when _gdeb is set.
func _fcegf(_adea []pathSection) rulingList {
	_abbaa(_adea)
	if _gdeb {
		_ag.Log.Info("\u006d\u0061k\u0065\u0053\u0074\u0072\u006f\u006b\u0065\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0064\u0020\u0073\u0074\u0072ok\u0065\u0073", len(_adea))
	}

	var rulings rulingList
	for _, section := range _adea {
		for _, path := range section._bgbeg {
			if len(path._aaebg) < 2 {
				continue
			}
			from := path._aaebg[0]
			for _, to := range path._aaebg[1:] {
				if r, ok := _gccad(from, to, section.Color); ok {
					rulings = append(rulings, r)
				}
				from = to
			}
		}
	}

	if _gdeb {
		_ag.Log.Info("m\u0061\u006b\u0065\u0053tr\u006fk\u0065\u0052\u0075\u006c\u0069n\u0067\u0073\u003a\u0020\u0025\u0073", rulings)
	}
	return rulings
}

// getParagraphs builds the paragraphs of the page. Rulings are gathered from
// the stroke sections (when _adef) and from `_afbg` via _aefg (when _bgce) and
// converted with toTilings. The text marks are then processed one orientation
// at a time (0, 90, 180, 270, matched against each mark's `_ddfdb`), and the
// paragraphs produced by _bdfg for each orientation are concatenated. The loop
// stops early once every mark has been consumed.
func (_cegc *PageText) getParagraphs() paraList {
	var rulings rulingList
	if _adef {
		strokes := _fcegf(_cegc._gggf)
		rulings = append(rulings, strokes...)
	}
	if _bgce {
		fills := _aefg(_cegc._afbg)
		rulings = append(rulings, fills...)
	}
	rulings, tilings := rulings.toTilings()

	var paras paraList
	remaining := len(_cegc._fecaa)
	for angle := 0; angle < 360 && remaining > 0; angle += 90 {
		// At most `remaining` marks can match this orientation, so that is the
		// right capacity hint (the previous hint was the number of marks
		// already consumed, i.e. zero on the first pass).
		selected := make([]*textMark, 0, remaining)
		for _, mark := range _cegc._fecaa {
			if mark._ddfdb == angle {
				selected = append(selected, mark)
			}
		}
		if len(selected) > 0 {
			built := _bdfg(selected, _cegc._cdf, rulings, tilings, _cegc._gbg._dbed)
			paras = append(paras, built...)
			remaining -= len(selected)
		}
	}
	return paras
}

// _gbef returns _fdbb(a, b) unless _cdaea reports true for that value, in
// which case it returns _gdfa(a, b) instead.
func _gbef(_cfbfc, _gffe bounded) float64 {
	primary := _fdbb(_cfbfc, _gffe)
	if !_cdaea(primary) {
		return primary
	}
	return _gdfa(_cfbfc, _gffe)
}

// _dbbb returns `_fcdd` without its final rune. The string is processed as a
// rune slice, so a multi-byte final character is removed whole. An empty
// string is returned unchanged (previously this case panicked).
func _dbbb(_fcdd string) string {
	runes := []rune(_fcdd)
	if len(runes) == 0 {
		return ""
	}
	return string(runes[:len(runes)-1])
}

// primMinMax returns the smallest and largest primary coordinate (`_aeef`) in
// the list. The list must not be empty.
func (_beef rulingList) primMinMax() (float64, float64) {
	lo, hi := _beef[0]._aeef, _beef[0]._aeef
	for _, r := range _beef[1:] {
		if r._aeef < lo {
			lo = r._aeef
		} else if r._aeef > hi {
			hi = r._aeef
		}
	}
	return lo, hi
}

// _dbeee groups text marks, in the order given, into words.
//
// A mark whose `_cgeb` makes _fcegd return true ends the word in progress and
// is itself discarded. Any other mark starts a new word when none is in
// progress; otherwise it is appended to the current word unless the gap
// reported by _egec or the depth difference (both divided by the word's
// `_abcc`) exceeds the thresholds _fbbf/_gegc/_ceca, in which case the current
// word is closed and a new one started. When _geaa is set, a mark recognised
// by _fcgg and lying in the diacritic area of its neighbour is folded into the
// word with addDiacritic instead of being appended as a mark of its own.
//
// A finished word is kept only if _fcegd is false for its computed text.
func _dbeee(_abef []*textMark, _egff _af.PdfRectangle) []*textWord {
	var words []*textWord
	var current *textWord
	if _aebe {
		_ag.Log.Info("\u006d\u0061\u006beT\u0065\u0078\u0074\u0057\u006f\u0072\u0064\u0073\u003a\u0020\u0025\u0064\u0020\u006d\u0061\u0072\u006b\u0073", len(_abef))
	}

	// flush closes the word in progress, if any, and resets `current`.
	flush := func() {
		if current != nil {
			text := current.computeText()
			if !_fcegd(text) {
				current._ccbcc = text
				words = append(words, current)
				if _aebe {
					_ag.Log.Info("\u0061\u0064\u0064Ne\u0077\u0057\u006f\u0072\u0064\u003a\u0020\u0025\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073", len(words)-1, current.String())
					for i, m := range current._ffcd {
						_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, m.String())
					}
				}
			}
			current = nil
		}
	}

	for _, mark := range _abef {
		if _geaa && current != nil && len(current._ffcd) > 0 {
			last := current._ffcd[len(current._ffcd)-1]
			markDia, markIsDia := _fcgg(mark._cgeb)
			lastDia, lastIsDia := _fcgg(last._cgeb)
			if markIsDia && !lastIsDia && last.inDiacriticArea(mark) {
				current.addDiacritic(markDia)
				continue
			}
			if lastIsDia && !markIsDia && mark.inDiacriticArea(last) {
				// The previous mark was the diacritic: swap it for this mark
				// and re-attach it as a diacritic.
				current._ffcd = current._ffcd[:len(current._ffcd)-1]
				current.appendMark(mark, _egff)
				current.addDiacritic(lastDia)
				continue
			}
		}

		isBreak := _fcegd(mark._cgeb)
		if isBreak {
			flush()
			continue
		}
		if current == nil && !isBreak {
			current = _eaae([]*textMark{mark}, _egff)
			continue
		}

		scale := current._abcc
		depthGap := _ea.Abs(_eeecd(_egff, mark)-current._accb) / scale
		readingGap := _egec(mark, current) / scale
		if readingGap >= _fbbf || !(-_gegc <= readingGap && depthGap <= _ceca) {
			flush()
			current = _eaae([]*textMark{mark}, _egff)
			continue
		}
		current.appendMark(mark, _egff)
	}
	flush()
	return words
}

// toCellTextMarks returns the text marks of all lines of the paragraph in
// order, threading `_dcgbc` through the helpers that produce them. When _dadc
// is set and a line other than the last ends in a hyphen, its marks are passed
// through _cgab and no separator follows; otherwise every line but the last is
// followed by what _dbce appends for the separator chosen by _gcccd from the
// `_addd` values of the line and its successor.
func (_gged *textPara) toCellTextMarks(_dcgbc *int) []TextMark {
	var marks []TextMark
	for i, line := range _gged._aage {
		lineMarks := line.toTextMarks(_dcgbc)
		joinHyphen := _dadc && line.endsInHyphen() && i != len(_gged._aage)-1
		if joinHyphen {
			lineMarks = _cgab(lineMarks, _dcgbc)
		}
		marks = append(marks, lineMarks...)
		if !(joinHyphen || i == len(_gged._aage)-1) {
			marks = _dbce(marks, _dcgbc, _gcccd(line._addd, _gged._aage[i+1]._addd))
		}
	}
	return marks
}
2024-03-27 22:34:33 +00:00
// ImageMark represents an image drawn on a page and its position in device coordinates.
// All coordinates are in device coordinates.
2024-05-29 17:04:37 +00:00
type ImageMark struct { Image * _af . Image ;
2024-03-27 22:34:33 +00:00
// Dimensions of the image as displayed in the PDF.
Width float64 ; Height float64 ;
// Position of the image in PDF coordinates (lower left corner).
X float64 ; Y float64 ;
// Angle in degrees, if rotated.
2024-05-29 17:04:37 +00:00
Angle float64 ; } ; func ( _gabc paraList ) sortReadingOrder ( ) { _ag . Log . Trace ( "\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d" , len ( _gabc ) ) ;
if len ( _gabc ) <= 1 { return ; } ; _gabc . computeEBBoxes ( ) ; _e . Slice ( _gabc , func ( _ebad , _fddd int ) bool { return _bcbe ( _gabc [ _ebad ] , _gabc [ _fddd ] ) <= 0 } ) ; } ; func ( _gcec * ruling ) intersects ( _gdda * ruling ) bool { _ddfa := ( _gcec . _ecfb == _gecdf && _gdda . _ecfb == _eeg ) || ( _gdda . _ecfb == _gecdf && _gcec . _ecfb == _eeg ) ;
_egbcf := func ( _eeeg , _defcdb * ruling ) bool { return _eeeg . _ggdb - _bcae <= _defcdb . _aeef && _defcdb . _aeef <= _eeeg . _gbca + _bcae ; } ; _gdcdd := _egbcf ( _gcec , _gdda ) ; _gecaa := _egbcf ( _gdda , _gcec ) ; if _gdeb { _efc . Printf ( "\u0020\u0020\u0020\u0020\u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006fr\u0074\u0068\u006f\u0067\u006f\u006e\u0061l\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u2192\u0020\u0025\u0074\u000a" + "\u0020\u0020\u0020 \u0020\u0020\u0020\u0076\u003d\u0025\u0073\u000a" + " \u0020\u0020\u0020\u0020\u0020\u0077\u003d\u0025\u0073\u000a" , _ddfa , _gdcdd , _gecaa , _ddfa && _gdcdd && _gecaa , _gcec , _gdda ) ;
} ; return _ddfa && _gdcdd && _gecaa ; } ; func ( _eff * shapesState ) closePath ( ) { if _eff . _gbee { _eff . _baca = append ( _eff . _baca , _ggda ( _eff . _faa ) ) ; _eff . _gbee = false ; } else if len ( _eff . _baca ) == 0 { if _cece { _ag . Log . Debug ( "\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068" ) ;
} ; _eff . _gbee = false ; return ; } ; _eff . _baca [ len ( _eff . _baca ) - 1 ] . close ( ) ; if _cece { _ag . Log . Info ( "\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073" , _eff ) ; } ; } ; func ( _gagb * shapesState ) moveTo ( _dcf , _defd float64 ) { _gagb . _gbee = true ;
_gagb . _faa = _gagb . devicePoint ( _dcf , _defd ) ; if _cece { _ag . Log . Info ( "\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066" , _dcf , _defd , _gagb . _faa ) ;
} ; } ; func _fabfa ( _cefd , _gcafg _ef . Image ) _ef . Image { _dfcac , _abbe := _gcafg . Bounds ( ) . Size ( ) , _cefd . Bounds ( ) . Size ( ) ; _bcgc , _dcad := _dfcac . X , _dfcac . Y ; if _abbe . X > _bcgc { _bcgc = _abbe . X ; } ; if _abbe . Y > _dcad { _dcad = _abbe . Y ; } ; _baegf := _ef . Rect ( 0 , 0 , _bcgc , _dcad ) ;
if _dfcac . X != _bcgc || _dfcac . Y != _dcad { _bdea := _ef . NewRGBA ( _baegf ) ; _cc . BiLinear . Scale ( _bdea , _baegf , _cefd , _gcafg . Bounds ( ) , _cc . Over , nil ) ; _gcafg = _bdea ; } ; if _abbe . X != _bcgc || _abbe . Y != _dcad { _ddee := _ef . NewRGBA ( _baegf ) ; _cc . BiLinear . Scale ( _ddee , _baegf , _cefd , _cefd . Bounds ( ) , _cc . Over , nil ) ;
_cefd = _ddee ; } ; _bgffe := _ef . NewRGBA ( _baegf ) ; _cc . DrawMask ( _bgffe , _baegf , _cefd , _ef . Point { } , _gcafg , _ef . Point { } , _cc . Over ) ; return _bgffe ; } ; func ( _cefg paraList ) tables ( ) [ ] TextTable { var _fbcb [ ] TextTable ; if _dedc { _ag . Log . Info ( "\u0070\u0061\u0072\u0061\u0073\u002e\u0074\u0061\u0062\u006c\u0065\u0073\u003a" ) ;
} ; for _ , _efae := range _cefg { _fgff := _efae . _befe ; if _fgff != nil && _fgff . isExportable ( ) { _fbcb = append ( _fbcb , _fgff . toTextTable ( ) ) ; } ; } ; return _fbcb ; } ; func ( _baa * PageFonts ) extractPageResourcesToFont ( _bc * _af . PdfPageResources ) error { _dg , _dbg := _gf . GetDict ( _bc . Font ) ;
if ! _dbg { return _b . New ( _ebd ) ; } ; for _ , _be := range _dg . Keys ( ) { var ( _bbf = true ; _ga [ ] byte ; _bag string ; ) ; _cfb , _ebde := _bc . GetFontByName ( _be ) ; if ! _ebde { return _b . New ( _dfe ) ; } ; _ceg , _cbd := _af . NewPdfFontFromPdfObject ( _cfb ) ; if _cbd != nil { return _cbd ;
} ; _abd := _ceg . FontDescriptor ( ) ; _ecb := _ceg . FontDescriptor ( ) . FontName . String ( ) ; _dbd := _ceg . Subtype ( ) ; if _dgg ( _baa . Fonts , _ecb ) { continue ; } ; if len ( _ceg . ToUnicode ( ) ) == 0 { _bbf = false ; } ; if _abd . FontFile != nil { if _bbg , _fb := _gf . GetStream ( _abd . FontFile ) ;
_fb { _ga , _cbd = _gf . DecodeStream ( _bbg ) ; if _cbd != nil { return _cbd ; } ; _bag = _ecb + "\u002e\u0070\u0066\u0062" ; } ; } else if _abd . FontFile2 != nil { if _de , _gg := _gf . GetStream ( _abd . FontFile2 ) ; _gg { _ga , _cbd = _gf . DecodeStream ( _de ) ; if _cbd != nil { return _cbd ;
} ; _bag = _ecb + "\u002e\u0074\u0074\u0066" ; } ; } else if _abd . FontFile3 != nil { if _da , _ccg := _gf . GetStream ( _abd . FontFile3 ) ; _ccg { _ga , _cbd = _gf . DecodeStream ( _da ) ; if _cbd != nil { return _cbd ; } ; _bag = _ecb + "\u002e\u0063\u0066\u0066" ; } ; } ; if len ( _bag ) < 1 { _ag . Log . Debug ( _cbe ) ;
} ; _ebab := Font { FontName : _ecb , PdfFont : _ceg , IsCID : _ceg . IsCID ( ) , IsSimple : _ceg . IsSimple ( ) , ToUnicode : _bbf , FontType : _dbd , FontData : _ga , FontFileName : _bag , FontDescriptor : _abd } ; _baa . Fonts = append ( _baa . Fonts , _ebab ) ; } ; return nil ; } ; func ( _ged * wordBag ) allWords ( ) [ ] * textWord { var _bdef [ ] * textWord ;
for _ , _fceg := range _ged . _cdbc { _bdef = append ( _bdef , _fceg ... ) ; } ; return _bdef ; } ; type subpath struct { _aaebg [ ] _aae . Point ; _cedc bool ; } ; func _dfga ( _bbefg [ ] * textLine ) map [ float64 ] [ ] * textLine { _e . Slice ( _bbefg , func ( _baac , _gbda int ) bool { return _bbefg [ _baac ] . _addd < _bbefg [ _gbda ] . _addd } ) ;
_bffec := map [ float64 ] [ ] * textLine { } ; for _ , _bgeac := range _bbefg { _cgfd := _bedc ( _bgeac ) ; _cgfd = _ea . Round ( _cgfd ) ; _bffec [ _cgfd ] = append ( _bffec [ _cgfd ] , _bgeac ) ; } ; return _bffec ; } ; func ( _fee * textObject ) getFontDirect ( _ebea string ) ( * _af . PdfFont , error ) { _cbeg , _gbfa := _fee . getFontDict ( _ebea ) ;
if _gbfa != nil { return nil , _gbfa ; } ; _ffcc , _gbfa := _af . NewPdfFontFromPdfObject ( _cbeg ) ; if _gbfa != nil { _ag . Log . Debug ( "\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076" , _ebea , _gbfa ) ;
} ; return _ffcc , _gbfa ; } ; func ( _ccea compositeCell ) parasBBox ( ) ( paraList , _af . PdfRectangle ) { return _ccea . paraList , _ccea . PdfRectangle ; } ; func ( _egf * wordBag ) scanBand ( _cfbe string , _eeec * wordBag , _eacf func ( _ebbee * wordBag , _bcba * textWord ) bool , _deb , _gccge , _beca float64 , _fdbe , _cdd bool ) int { _fcaf := _eeec . _cdac ;
var _bbcg map [ int ] map [ * textWord ] struct { } ; if ! _fdbe { _bbcg = _egf . makeRemovals ( ) ; } ; _cdfa := _egeb * _fcaf ; _egab := 0 ; for _ , _cfcg := range _egf . depthBand ( _deb - _cdfa , _gccge + _cdfa ) { if len ( _egf . _cdbc [ _cfcg ] ) == 0 { continue ; } ; for _ , _faebb := range _egf . _cdbc [ _cfcg ] { if ! ( _deb - _cdfa <= _faebb . _accb && _faebb . _accb <= _gccge + _cdfa ) { continue ;
} ; if ! _eacf ( _eeec , _faebb ) { continue ; } ; _cdba := 2.0 * _ea . Abs ( _faebb . _abcc - _eeec . _cdac ) / ( _faebb . _abcc + _eeec . _cdac ) ; _deag := _ea . Max ( _faebb . _abcc / _eeec . _cdac , _eeec . _cdac / _faebb . _abcc ) ; _cgdg := _ea . Min ( _cdba , _deag ) ; if _beca > 0 && _cgdg > _beca { continue ;
} ; if _eeec . blocked ( _faebb ) { continue ; } ; if ! _fdbe { _eeec . pullWord ( _faebb , _cfcg , _bbcg ) ; } ; _egab ++ ; if ! _cdd { if _faebb . _accb < _deb { _deb = _faebb . _accb ; } ; if _faebb . _accb > _gccge { _gccge = _faebb . _accb ; } ; } ; if _fdbe { break ; } ; } ; } ; if ! _fdbe { _egf . applyRemovals ( _bbcg ) ;
} ; return _egab ; } ; func _gfb ( _cfff , _bba _af . PdfRectangle ) bool { return _bba . Llx <= _cfff . Urx && _cfff . Llx <= _bba . Urx } ; func _eaaa ( _fdba float64 ) bool { return _ea . Abs ( _fdba ) < _cggd } ; func ( _adeaf * textTable ) putComposite ( _dfgafg , _egea int , _eedf paraList , _faaed _af . PdfRectangle ) { if len ( _eedf ) == 0 { _ag . Log . Error ( "\u0074\u0065xt\u0054\u0061\u0062l\u0065\u0029\u0020\u0070utC\u006fmp\u006f\u0073\u0069\u0074\u0065\u003a\u0020em\u0070\u0074\u0079\u0020\u0070\u0061\u0072a\u0073" ) ;
return ; } ; _bbdf := compositeCell { PdfRectangle : _faaed , paraList : _eedf } ; if _dedc { _efc . Printf ( "\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0070\u0075\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u003c\u002d\u0025\u0073\u000a" , _dfgafg , _egea , _bbdf . String ( ) ) ;
} ; _bbdf . updateBBox ( ) ; _adeaf . _becfc [ _cdgd ( _dfgafg , _egea ) ] = _bbdf ; } ; type rulingKind int ; func ( _bfce * textObject ) setWordSpacing ( _cbf float64 ) { if _bfce == nil { return ; } ; _bfce . _ecff . _febe = _cbf ; } ;
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// Append adds `mark` to the end of the mark array.
func (ma *TextMarkArray) Append(mark TextMark) {
	ma._aec = append(ma._aec, mark)
}

// reorder rearranges `paras` in place so that position i holds the paragraph that was at
// position order[i].
func (paras paraList) reorder(order []int) {
	// Build the permutation in a scratch list first: writing directly into `paras` would
	// overwrite elements that are still needed.
	shuffled := make(paraList, len(paras))
	for i := range order {
		shuffled[i] = paras[order[i]]
	}
	copy(paras, shuffled)
}

// tidied returns the rulings with duplicates removed, snapped to groups and sorted. It
// returns nil when snapToGroups yields nil. `title` only labels the debug output.
func (rl rulingList) tidied(title string) rulingList {
	uniques := rl.removeDuplicates()
	uniques.log("\u0075n\u0069\u0071\u0075\u0065\u0073")

	coalesced := uniques.snapToGroups()
	if coalesced == nil {
		return nil
	}
	coalesced.sort()

	if _gdeb {
		_ag.Log.Info("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064", title, len(rl), len(uniques), len(coalesced))
	}
	coalesced.log("\u0063o\u0061\u006c\u0065\u0073\u0063\u0065d")
	return coalesced
}

// empty reports whether the stack holds no states.
func (st *stateStack) empty() bool {
	return len(*st) == 0
}
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// Text returns the concatenation of the text of every list in the collection, in order.
func (l *lists) Text() string {
	var sb _a.Builder
	for _, item := range *l {
		sb.WriteString(item.Text())
	}
	return sb.String()
}
// String returns a human readable description of `path`.
func ( _dfg * subpath ) String ( ) string { _caad := _dfg . _aaebg ; _bgfa := len ( _caad ) ; if _bgfa <= 5 { return _efc . Sprintf ( "\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f" , _bgfa , _caad ) ; } ; return _efc . Sprintf ( "\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f" , _bgfa , _caad [ 0 ] , _caad [ 1 ] , _caad [ _bgfa - 1 ] ) ;
} ; func ( _efbdd * textTable ) growTable ( ) { _bcfea := func ( _fadbbb paraList ) { _efbdd . _cegga ++ ; for _beefb := 0 ; _beefb < _efbdd . _aageb ; _beefb ++ { _ddec := _fadbbb [ _beefb ] ; _efbdd . put ( _beefb , _efbdd . _cegga - 1 , _ddec ) ; } ; } ; _bacbe := func ( _ebaee paraList ) { _efbdd . _aageb ++ ;
for _bgaf := 0 ; _bgaf < _efbdd . _cegga ; _bgaf ++ { _fadgb := _ebaee [ _bgaf ] ; _efbdd . put ( _efbdd . _aageb - 1 , _bgaf , _fadgb ) ; } ; } ; if _gddf { _efbdd . log ( "\u0067r\u006f\u0077\u0054\u0061\u0062\u006ce" ) ; } ; for _bead := 0 ; ; _bead ++ { _fcgd := false ; _gedcg := _efbdd . getDown ( ) ;
_fadab := _efbdd . getRight ( ) ; if _gddf { _efc . Printf ( "\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a" , _bead , _efbdd ) ; _efc . Printf ( "\u0020\u0020 \u0020\u0020\u0020 \u0020\u0064\u006f\u0077\u006e\u003d\u0025\u0073\u000a" , _gedcg ) ; _efc . Printf ( "\u0020\u0020 \u0020\u0020\u0020 \u0072\u0069\u0067\u0068\u0074\u003d\u0025\u0073\u000a" , _fadab ) ;
} ; if _gedcg != nil && _fadab != nil { _eede := _gedcg [ len ( _gedcg ) - 1 ] ; if ! _eede . taken ( ) && _eede == _fadab [ len ( _fadab ) - 1 ] { _bcfea ( _gedcg ) ; if _fadab = _efbdd . getRight ( ) ; _fadab != nil { _bacbe ( _fadab ) ; _efbdd . put ( _efbdd . _aageb - 1 , _efbdd . _cegga - 1 , _eede ) ;
} ; _fcgd = true ; } ; } ; if ! _fcgd && _gedcg != nil { _bcfea ( _gedcg ) ; _fcgd = true ; } ; if ! _fcgd && _fadab != nil { _bacbe ( _fadab ) ; _fcgd = true ; } ; if ! _fcgd { break ; } ; } ; } ; type textTable struct { _af . PdfRectangle ; _aageb , _cegga int ; _caagg bool ; _dgcf map [ uint64 ] * textPara ;
_becfc map [ uint64 ] compositeCell ; } ; func _eccab ( _bfcfa * _af . Image , _aggbc _fe . Color ) _ef . Image { _feefg , _bcab := int ( _bfcfa . Width ) , int ( _bfcfa . Height ) ; _cacd := _ef . NewRGBA ( _ef . Rect ( 0 , 0 , _feefg , _bcab ) ) ; for _ddeca := 0 ; _ddeca < _bcab ; _ddeca ++ { for _dedd := 0 ;
_dedd < _feefg ; _dedd ++ { _dddf , _fegg := _bfcfa . ColorAt ( _dedd , _ddeca ) ; if _fegg != nil { _ag . Log . Debug ( "\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e" , _dedd , _ddeca ) ;
continue ; } ; _ecffg , _dfcb , _fcefb , _ := _dddf . RGBA ( ) ; var _aagbg _fe . Color ; if _ecffg + _dfcb + _fcefb == 0 { _aagbg = _fe . Transparent ; } else { _aagbg = _aggbc ; } ; _cacd . Set ( _dedd , _ddeca , _aagbg ) ; } ; } ; return _cacd ; } ; func ( _dcbb * textPara ) toTextMarks ( _aafee * int ) [ ] TextMark { if _dcbb . _befe == nil { return _dcbb . toCellTextMarks ( _aafee ) ;
} ; var _fgcd [ ] TextMark ; for _cbffg := 0 ; _cbffg < _dcbb . _befe . _cegga ; _cbffg ++ { for _fcdf := 0 ; _fcdf < _dcbb . _befe . _aageb ; _fcdf ++ { _aggg := _dcbb . _befe . get ( _fcdf , _cbffg ) ; if _aggg == nil { _fgcd = _dbce ( _fgcd , _aafee , "\u0009" ) ; } else { _gfca := _aggg . toCellTextMarks ( _aafee ) ;
_fgcd = append ( _fgcd , _gfca ... ) ; } ; _fgcd = _dbce ( _fgcd , _aafee , "\u0020" ) ; } ; if _cbffg < _dcbb . _befe . _cegga - 1 { _fgcd = _dbce ( _fgcd , _aafee , "\u000a" ) ; } ; } ; _fgbfd := _dcbb . _befe ; if _fgbfd . isExportable ( ) { _ccbbe := _fgbfd . toTextTable ( ) ; _fgcd = _egdfd ( _fgcd , & _ccbbe ) ;
} ; return _fgcd ; } ; func ( _ffgf paraList ) sortTopoOrder ( ) { _defcd := _ffgf . topoOrder ( ) ; _ffgf . reorder ( _defcd ) } ; func ( _agcd * subpath ) clear ( ) { * _agcd = subpath { } } ; func _aeca ( _bfcbf , _ccbae _af . PdfRectangle ) bool { return _bfcbf . Llx <= _ccbae . Llx && _ccbae . Urx <= _bfcbf . Urx && _bfcbf . Lly <= _ccbae . Lly && _ccbae . Ury <= _bfcbf . Ury ;
2024-04-16 11:40:43 +00:00
} ;
2024-02-11 21:29:32 +00:00
2024-04-30 12:24:05 +00:00
// Options extractor options.
type Options struct {
2024-03-27 22:34:33 +00:00
2024-04-30 12:24:05 +00:00
// DisableDocumentTags specifies whether to use the document tags during list extraction.
DisableDocumentTags bool ;
2024-04-16 11:40:43 +00:00
2024-04-30 12:24:05 +00:00
// ApplyCropBox will extract page text based on page cropbox if set to `true`.
ApplyCropBox bool ;
2024-04-16 11:40:43 +00:00
2024-04-30 12:24:05 +00:00
// UseSimplerExtractionProcess will skip topological text ordering and table processing.
//
// NOTE: While normally the extra processing is beneficial, it can also lead to problems when it does not work.
// Thus it is a flag to allow the user to control this process.
//
// Skipping some extraction processes would also lead to the reduced processing time.
UseSimplerExtractionProcess bool ;
2024-03-27 22:34:33 +00:00
2024-04-30 12:24:05 +00:00
// IncludeAnnotations specifies whether to include annotations in the extraction process, default value is `false`.
2024-05-29 17:04:37 +00:00
IncludeAnnotations bool ; } ; const ( _ceag rulingKind = iota ; _eeg ; _gecdf ; ) ; type paraList [ ] * textPara ; func ( _bbff paraList ) writeText ( _gdde _fc . Writer ) { for _ddegf , _fggf := range _bbff { if _fggf . _bdgc { continue ; } ; _fggf . writeText ( _gdde ) ; if _ddegf != len ( _bbff ) - 1 { if _adab ( _fggf , _bbff [ _ddegf + 1 ] ) { _gdde . Write ( [ ] byte ( "\u0020" ) ) ;
} else { _gdde . Write ( [ ] byte ( "\u000a" ) ) ; _gdde . Write ( [ ] byte ( "\u000a" ) ) ; } ; } ; } ; _gdde . Write ( [ ] byte ( "\u000a" ) ) ; _gdde . Write ( [ ] byte ( "\u000a" ) ) ; } ; func _feca ( _fcdc * _aa . ContentStreamOperation ) ( float64 , error ) { if len ( _fcdc . Params ) != 1 { _dbdd := _b . New ( "\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et" ) ;
_ag . Log . Debug ( "\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076" , _fcdc . Operand , 1 , len ( _fcdc . Params ) , _fcdc . Params ) ;
return 0.0 , _dbdd ; } ; return _gf . GetNumberAsFloat ( _fcdc . Params [ 0 ] ) ; } ; func ( _bdbfd paraList ) xNeighbours ( _gbcg float64 ) map [ * textPara ] [ ] int { _afabb := make ( [ ] event , 2 * len ( _bdbfd ) ) ; if _gbcg == 0 { for _ddga , _agbfb := range _bdbfd { _afabb [ 2 * _ddga ] = event { _agbfb . Llx , true , _ddga } ;
_afabb [ 2 * _ddga + 1 ] = event { _agbfb . Urx , false , _ddga } ; } ; } else { for _cgdb , _edcef := range _bdbfd { _afabb [ 2 * _cgdb ] = event { _edcef . Llx - _gbcg * _edcef . fontsize ( ) , true , _cgdb } ; _afabb [ 2 * _cgdb + 1 ] = event { _edcef . Urx + _gbcg * _edcef . fontsize ( ) , false , _cgdb } ;
} ; } ; return _bdbfd . eventNeighbours ( _afabb ) ; } ; func ( _fcgf paraList ) log ( _dccb string ) { if ! _efbd { return ; } ; _ag . Log . Info ( "%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d" , _dccb , len ( _fcgf ) ) ;
for _ggedc , _efad := range _fcgf { if _efad == nil { continue ; } ; _gaffd := _efad . text ( ) ; _feaf := "\u0020\u0020" ; if _efad . _befe != nil { _feaf = _efc . Sprintf ( "\u005b%\u0064\u0078\u0025\u0064\u005d" , _efad . _befe . _aageb , _efad . _befe . _cegga ) ; } ; _efc . Printf ( "\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a" , _ggedc , _efad . PdfRectangle , _feaf , _efcca ( _gaffd , 50 ) ) ;
} ; } ; func ( _aee * textObject ) getFontDict ( _ededf string ) ( _ecca _gf . PdfObject , _aeag error ) { _cgf := _aee . _dae ; if _cgf == nil { _ag . Log . Debug ( "g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071" , _ededf ) ;
return nil , nil ; } ; _ecca , _gbac := _cgf . GetFontByName ( _gf . PdfObjectName ( _ededf ) ) ; if ! _gbac { _ag . Log . Debug ( "\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071" , _ededf ) ;
return nil , _b . New ( "f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073" ) ; } ; return _ecca , nil ; } ; func _adbde ( _gbgg _af . PdfRectangle , _cgcc [ ] * textLine ) * textPara { return & textPara { PdfRectangle : _gbgg , _aage : _cgcc } ;
} ; const ( _dadc = true ; _deeb = true ; _geaa = true ; _adfb = false ; _deff = false ; _fcda = 6 ; _gefe = 3.0 ; _defc = 200 ; _eeab = true ; _dcdb = true ; _adef = true ; _bgce = true ; _bedg = false ; ) ; func _cgab ( _gggff [ ] TextMark , _bega * int ) [ ] TextMark { _ceced := _gggff [ len ( _gggff ) - 1 ] ;
_acee := [ ] rune ( _ceced . Text ) ; if len ( _acee ) == 1 { _gggff = _gggff [ : len ( _gggff ) - 1 ] ; _fbcg := _gggff [ len ( _gggff ) - 1 ] ; * _bega = _fbcg . Offset + len ( _fbcg . Text ) ; } else { _fccc := _dbbb ( _ceced . Text ) ; * _bega += len ( _fccc ) - len ( _ceced . Text ) ; _ceced . Text = _fccc ;
} ; return _gggff ; } ; func ( _gbacd * compositeCell ) updateBBox ( ) { for _ , _fgaff := range _gbacd . paraList { _gbacd . PdfRectangle = _cfab ( _gbacd . PdfRectangle , _fgaff . PdfRectangle ) ; } ; } ; var _defba string = "\u005e\u005b\u0061\u002d\u007a\u0041\u002dZ\u005d\u0028\u005c)\u007c\u005c\u002e)\u007c\u005e[\u005c\u0064\u005d\u002b\u0028\u005c)\u007c\\.\u0029\u007c\u005e\u005c\u0028\u005b\u0061\u002d\u007a\u0041\u002d\u005a\u005d\u005c\u0029\u007c\u005e\u005c\u0028\u005b\u005c\u0064\u005d\u002b\u005c\u0029" ;
// removeDuplicates removes words that repeat another word in the same depth bin: same text
// (_ccbcc) and all four bounding box edges within a tolerance of _fcfce times the _abcc of
// the first word of the bin being scanned. Bins that end up empty are deleted.
func (b *wordBag) removeDuplicates() {
	if _gbgbd {
		_ag.Log.Info("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071", b.text())
	}

	for _, depthIdx := range b.depthIndexes() {
		if len(b._cdbc[depthIdx]) == 0 {
			continue
		}
		anchor := b._cdbc[depthIdx][0]
		tol := _fcfce * anchor._abcc
		depth := anchor._accb

		for _, binIdx := range b.depthBand(depth, depth+tol) {
			dups := map[*textWord]struct{}{}
			bin := b._cdbc[binIdx]

			// Mark every later copy of a word; words already marked are neither used as a
			// reference nor marked again.
			for _, ref := range bin {
				if _, marked := dups[ref]; marked {
					continue
				}
				for _, cand := range bin {
					if _, marked := dups[cand]; marked {
						continue
					}
					if cand != ref && cand._ccbcc == ref._ccbcc && _ea.Abs(cand.Llx-ref.Llx) < tol && _ea.Abs(cand.Urx-ref.Urx) < tol && _ea.Abs(cand.Lly-ref.Lly) < tol && _ea.Abs(cand.Ury-ref.Ury) < tol {
						dups[cand] = struct{}{}
					}
				}
			}
			if len(dups) == 0 {
				continue
			}

			// Compact the survivors to the front of the bin, then truncate.
			kept := 0
			for _, word := range bin {
				if _, marked := dups[word]; !marked {
					bin[kept] = word
					kept++
				}
			}
			b._cdbc[binIdx] = bin[:len(bin)-len(dups)]
			if len(b._cdbc[binIdx]) == 0 {
				delete(b._cdbc, binIdx)
			}
		}
	}
}

// _baee matches a whole string consisting of a run of digits with an optional trailing dot,
// or a run of the characters I, i and v, optionally followed by ")", with optional
// surrounding whitespace.
var _baee = _g.MustCompile("\u005e\u005c\u0073\u002a\u0028\u005c\u0064\u002b\u005c\u002e\u003f|\u005b\u0049\u0069\u0076\u005d\u002b\u0029\u005c\u0073\u002a\\\u0029\u003f\u0024")

// llyRange returns the sub-slice of `order` whose paragraphs have Lly in [lo, hi], or nil if
// the range lies entirely outside the paragraphs' Lly values. The binary searches rely on
// `order` listing the paragraph indexes by ascending Lly.
// NOTE(review): indexes order[0] and order[len(paras)-1] without a length check — confirm
// callers never pass an empty list.
func (paras paraList) llyRange(order []int, lo, hi float64) []int {
	count := len(paras)
	if hi < paras[order[0]].Lly || lo > paras[order[count-1]].Lly {
		return nil
	}
	from := _e.Search(count, func(i int) bool { return paras[order[i]].Lly >= lo })
	to := _e.Search(count, func(i int) bool { return paras[order[i]].Lly > hi })
	return order[from:to]
}

// _gdgf returns a ruling of kind _eeg along the bottom edge of `rect`: primary coordinate
// Lly, extending from Llx to Urx.
func _gdgf(rect _af.PdfRectangle) *ruling {
	return &ruling{
		_ecfb: _eeg,
		_aeef: rect.Lly,
		_ggdb: rect.Llx,
		_gbca: rect.Urx,
	}
}

// isQuadrilateral reports whether the subpath has four points, or five points of which the
// first and the last coincide.
func (sp *subpath) isQuadrilateral() bool {
	points := sp._aaebg
	switch len(points) {
	case 4:
		return true
	case 5:
		first, last := points[0], points[4]
		return first.X == last.X && first.Y == last.Y
	default:
		return false
	}
}

// _ggbea returns a ruling of kind _gecdf along the left edge of `rect`: primary coordinate
// Llx, extending from Lly to Ury.
func _ggbea(rect _af.PdfRectangle) *ruling {
	return &ruling{
		_ecfb: _gecdf,
		_aeef: rect.Llx,
		_ggdb: rect.Lly,
		_gbca: rect.Ury,
	}
}

// readBefore reports whether paragraph `i` is read before paragraph `j`. That is the case
// when _eeed holds for the pair and `i` has the larger Lly, or when `i` lies entirely to the
// left of `j` (per their _gbgbb boxes) and no third paragraph with Lly between the two
// overlaps the x span bounded by the larger Llx and the smaller Urx of the pair. `order` is
// forwarded to llyRange.
func (paras paraList) readBefore(order []int, i, j int) bool {
	a, b := paras[i], paras[j]
	if _eeed(a, b) && a.Lly > b.Lly {
		return true
	}
	if !(a._gbgbb.Urx < b._gbgbb.Llx) {
		return false
	}

	lo, hi := a.Lly, b.Lly
	if lo > hi {
		hi, lo = lo, hi
	}
	left := _ea.Max(a._gbgbb.Llx, b._gbgbb.Llx)
	right := _ea.Min(a._gbgbb.Urx, b._gbgbb.Urx)

	for _, k := range paras.llyRange(order, lo, hi) {
		if k == i || k == j {
			continue
		}
		other := paras[k]
		if other._gbgbb.Llx <= right && left <= other._gbgbb.Urx {
			return false
		}
	}
	return true
}

// depth returns -1 for a paragraph flagged with _bdgc, the _addd of the first line if the
// paragraph has lines, and otherwise the depth of its table.
func (p *textPara) depth() float64 {
	switch {
	case p._bdgc:
		return -1.0
	case len(p._aage) > 0:
		return p._aage[0]._addd
	default:
		return p._befe.depth()
	}
}

// _bcfb partitions the words of `bag` into groups and returns them. Each group is seeded
// with the first word in reading order of a depth bin (via _cgd) and then grown by repeated
// scanBand passes until a pass adds nothing. `bag` is consumed in the process.
func _bcfb(bag *wordBag, _aeagc float64, _bbefb, _cfgega rulingList) []*wordBag {
	var groups []*wordBag
	for _, depthIdx := range bag.depthIndexes() {
		grew := false
		for !bag.empty(depthIdx) {
			readingIdx := bag.firstReadingIndex(depthIdx)
			seed := bag.firstWord(readingIdx)
			group := _cgd(seed, _aeagc, _bbefb, _cfgega)
			bag.removeWord(seed, readingIdx)
			if _cfgf {
				_ag.Log.Info("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073", seed.String())
			}

			for {
				grew = false
				// The gaps scale with the group's _cdac, which may change as words are added.
				otherGap := _daf * group._cdac
				maxIntraReadingGap := _cecf * group._cdac
				maxIntraDepthGap := _cgbc * group._cdac
				if _cfgf {
					_ag.Log.Info("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066", group.minDepth(), group.maxDepth(), maxIntraDepthGap, maxIntraReadingGap)
				}

				if bag.scanBand("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c", group, _cbdb(_gacff, 0), group.minDepth()-maxIntraDepthGap, group.maxDepth()+maxIntraDepthGap, _afdg, false, false) > 0 {
					grew = true
				}
				if bag.scanBand("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c", group, _cbdb(_gacff, maxIntraReadingGap), group.minDepth(), group.maxDepth(), _bea, false, false) > 0 {
					grew = true
				}
				if grew {
					continue
				}

				// Detect-only pass first; words are only pulled in when the number of hits
				// passes the check below.
				hits := bag.scanBand("", group, _cbdb(_ebgf, otherGap), group.minDepth(), group.maxDepth(), _egdfc, true, false)
				if hits > 0 {
					span := (group.maxDepth() - group.minDepth()) / group._cdac
					if (hits > 1 && float64(hits) > 0.3*span) || hits <= 10 {
						if bag.scanBand("\u006f\u0074\u0068e\u0072", group, _cbdb(_ebgf, otherGap), group.minDepth(), group.maxDepth(), _egdfc, false, true) > 0 {
							grew = true
						}
					}
				}
				if !grew {
					break
				}
			}
			groups = append(groups, group)
		}
	}
	return groups
}

// toTilings tidies the rulings and returns them together with one tiling per grid found by
// toGrids. An empty list yields (nil, nil).
func (rl rulingList) toTilings() (rulingList, []gridTiling) {
	rl.log("\u0074o\u0054\u0069\u006c\u0069\u006e\u0067s")
	if len(rl) == 0 {
		return nil, nil
	}
	rl = rl.tidied("\u0061\u006c\u006c")
	rl.log("\u0074\u0069\u0064\u0069\u0065\u0064")

	grids := rl.toGrids()
	tilings := make([]gridTiling, len(grids))
	for i := range grids {
		tilings[i] = grids[i].asTiling()
	}
	return rl, tilings
}

// _ggfca collects lines from `lines` that relate to `ref`, considering only lines with
// _addd >= lo.
//
// With hi != -1, lines with _addd < hi whose text differs from ref's are collected; the scan
// stops at the first such line whose rounded Llx is left of ref's. With hi == -1, lines with
// the same _addd as ref are collected if their text differs, and other lines are collected
// while their _addd does not exceed _egbgea(ref, lines, _dccga) (when that is not -1).
// The result is never nil.
func _ggfca(ref *textLine, lines []*textLine, _dccga []float64, lo, hi float64) []*textLine {
	matched := []*textLine{}
	for _, line := range lines {
		if !(line._addd >= lo) {
			continue
		}
		if hi != -1 && line._addd < hi {
			if line.text() != ref.text() {
				if _ea.Round(line.Llx) < _ea.Round(ref.Llx) {
					break
				}
				matched = append(matched, line)
			}
		} else if hi == -1 {
			if line._addd == ref._addd {
				if line.text() != ref.text() {
					matched = append(matched, line)
				}
				continue
			}
			limit := _egbgea(ref, lines, _dccga)
			if limit != -1 && line._addd <= limit {
				matched = append(matched, line)
			}
		}
	}
	return matched
}

// computeText returns the concatenated text (_cgeb) of the word's marks.
func (w *textWord) computeText() string {
	var sb _a.Builder
	for _, mark := range w._ffcd {
		sb.WriteString(mark._cgeb)
	}
	return sb.String()
}

// bbox returns the bounding rectangle of the rulings. If the first ruling is of kind _eeg
// the x range comes from secMinMax and the y range from primMinMax; otherwise the roles are
// swapped. An empty list is logged as an error and yields the zero rectangle.
func (rl rulingList) bbox() _af.PdfRectangle {
	if len(rl) == 0 {
		_ag.Log.Error("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0062\u0062\u006f\u0078\u003a\u0020n\u006f\u0020\u0072u\u006ci\u006e\u0067\u0073")
		return _af.PdfRectangle{}
	}

	var rect _af.PdfRectangle
	if rl[0]._ecfb == _eeg {
		rect.Llx, rect.Urx = rl.secMinMax()
		rect.Lly, rect.Ury = rl.primMinMax()
	} else {
		rect.Llx, rect.Urx = rl.primMinMax()
		rect.Lly, rect.Ury = rl.secMinMax()
	}
	return rect
}

// showTextAdjusted processes the elements of `args` in order: a number shifts the text
// matrix (_dbc) by -value*0.001*_gbbgg along x, a string is passed to renderText together
// with `level`. It returns an error for a malformed number or string and for an element of
// any other type.
func (to *textObject) showTextAdjusted(args *_gf.PdfObjectArray, level int) error {
	// The displacement is always applied along x; the swap below never triggers.
	vertical := false
	for _, elem := range args.Elements() {
		switch elem.(type) {
		case *_gf.PdfObjectFloat, *_gf.PdfObjectInteger:
			adjust, err := _gf.GetNumberAsFloat(elem)
			if err != nil {
				_ag.Log.Debug("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, args)
				return err
			}
			dx, dy := -adjust*0.001*to._ecff._gbbgg, 0.0
			if vertical {
				dy, dx = dx, dy
			}
			shift := _add(_aae.Point{X: dx, Y: dy})
			to._dbc.Concat(shift)
		case *_gf.PdfObjectString:
			direct := _gf.TraceToDirectObject(elem)
			data, ok := _gf.GetStringBytes(direct)
			if !ok {
				_ag.Log.Trace("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, args)
				return _gf.ErrTypeError
			}
			to.renderText(direct, data, level)
		default:
			_ag.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, args)
			return _gf.ErrTypeError
		}
	}
	return nil
}

// gridIntersecting reports whether `r` and `other` have matching start (_ggdb) and end
// (_gbca) coordinates according to _edddc.
func (r *ruling) gridIntersecting(other *ruling) bool {
	if !_edddc(r._ggdb, other._ggdb) {
		return false
	}
	return _edddc(r._gbca, other._gbca)
}
// String returns a human readable description of the rulings: the number of rulings in each
// of the two lists returned by vertsHorzs and, when both are non-empty, the rectangle
// spanned by the first and last ruling of each list.
func (rl rulingList) String() string {
	if len(rl) == 0 {
		return "\u007b \u0045\u004d\u0050\u0054\u0059\u0020}"
	}

	verts, horzs := rl.vertsHorzs()
	nVerts, nHorzs := len(verts), len(horzs)
	if nVerts == 0 || nHorzs == 0 {
		return _efc.Sprintf("\u007b%\u0064\u0020\u0078\u0020\u0025\u0064}", nVerts, nHorzs)
	}

	extent := _af.PdfRectangle{
		Llx: verts[0]._aeef,
		Urx: verts[nVerts-1]._aeef,
		Lly: horzs[nHorzs-1]._aeef,
		Ury: horzs[0]._aeef,
	}
	return _efc.Sprintf("\u007b\u0025d\u0020\u0078\u0020%\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u007d", nVerts, nHorzs, extent)
}
2024-03-27 22:34:33 +00:00
2024-04-16 11:40:43 +00:00
// NewWithOptions creates an Extractor instance for extracting content from the input PDF page, configured with the given options.
2024-05-29 17:04:37 +00:00
func NewWithOptions(page *_af.PdfPage, options *Options) (*Extractor, error) {
	const usageKey = "\u0065x\u0074\u0072\u0061\u0063\u0074\u006f\u0072\u002e\u004e\u0065\u0077W\u0069\u0074\u0068\u004f\u0070\u0074\u0069\u006f\u006e\u0073"

	contents, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	// A missing structure tree root is only reported, not treated as an error.
	structRoot, tagged := page.GetStructTreeRoot()
	if !tagged {
		_ag.Log.Info("T\u0068\u0065\u0020\u0070\u0064\u0066\u0020\u0064\u006f\u0063\u0075\u006d\u0065\u006e\u0074\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020\u0074\u0061\u0067g\u0065d\u002e\u0020\u0053\u0074r\u0075\u0063t\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e\u0027\u0074\u0020\u0065\u0078\u0069\u0073\u0074\u002e")
	}
	container := page.GetContainingPdfObject()

	mediaBox, err := page.GetMediaBox()
	if err != nil {
		return nil, _efc.Errorf("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076", err)
	}

	ext := &Extractor{
		_fgb: contents,
		_cf:  page.Resources,
		_fd:  *mediaBox,
		_fed: page.CropBox,
		_fgf: map[string]fontEntry{},
		_ec:  map[string]textResult{},
		_agf: map[string]textResult{},
		_efe: options,
		_ba:  structRoot,
		_ac:  container,
	}

	// Normalise the media box so that lower-left <= upper-right on both axes.
	if ext._fd.Llx > ext._fd.Urx {
		_ag.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", ext._fd)
		ext._fd.Llx, ext._fd.Urx = ext._fd.Urx, ext._fd.Llx
	}
	if ext._fd.Lly > ext._fd.Ury {
		_ag.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", ext._fd)
		ext._fd.Lly, ext._fd.Ury = ext._fd.Ury, ext._fd.Lly
	}

	// Annotations are optional: a failure to load them is logged and ignored.
	if ext._efe != nil && ext._efe.IncludeAnnotations {
		annots, annotErr := page.GetAnnotations()
		ext._cae = annots
		if annotErr != nil {
			_ag.Log.Debug("\u0045\u0072r\u006f\u0072\u0020\u0067\u0065\u0074\u0074\u0069\u006e\u0067\u0020\u0061\u006e\u006e\u006f\u0074\u0061\u0074\u0069\u006f\u006e\u0073: \u0025\u0076", annotErr)
		}
	}

	_d.TrackUse(usageKey)
	return ext, nil
}

// getTextMarkAtOffset returns a pointer to a copy of the first mark in `_aec`
// whose Offset equals `_dgde`, or nil when there is none.
func (_bbda *TextMarkArray) getTextMarkAtOffset(_dgde int) *TextMark {
	for i := range _bbda._aec {
		if _bbda._aec[i].Offset != _dgde {
			continue
		}
		found := _bbda._aec[i]
		return &found
	}
	return nil
}

// _abbaa replaces every point of every subpath in `_bccg` by the result of
// applying `_affg` to its X and Y. It does nothing when `_fbf` is negative.
// With `_gdeb` set, each point that `_bgfgg` reports as changed is printed.
func _abbaa(_bccg []pathSection) {
	if _fbf < 0.0 {
		return
	}
	if _gdeb {
		_ag.Log.Info("\u0067\u0072\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020\u0025\u0064 \u0073u\u0062\u0070\u0061\u0074\u0068\u0020\u0073\u0065\u0063\u0074\u0069\u006f\u006e\u0073", len(_bccg))
	}
	for secIdx, section := range _bccg {
		for pathIdx, path := range section._bgbeg {
			for ptIdx, before := range path._aaebg {
				path._aaebg[ptIdx] = _aae.Point{X: _affg(before.X), Y: _affg(before.Y)}
				if !_gdeb {
					continue
				}
				after := path._aaebg[ptIdx]
				if _bgfgg(before, after) {
					continue
				}
				delta := _aae.Point{X: after.X - before.X, Y: after.Y - before.Y}
				_efc.Printf("\u0025\u0034d \u002d\u0020\u00254\u0064\u0020\u002d\u0020%4d\u003a %\u002e\u0032\u0066\u0020\u2192\u0020\u0025.2\u0066\u0020\u0028\u0025\u0067\u0029\u000a", secIdx, pathIdx, ptIdx, before, after, delta)
			}
		}
	}
}

// moveText forwards to moveLP with the same offsets.
func (_gae *textObject) moveText(_ggd, _befg float64) {
	_gae.moveLP(_ggd, _befg)
}

// _gcccd returns a space when `_cdaea` accepts the difference of the two
// values, and a newline otherwise.
func _gcccd(_cafd, _gcbfd float64) string {
	if _cdaea(_cafd - _gcbfd) {
		return "\u0020"
	}
	return "\u000a"
}

// textWord is a rectangle with an associated string (`_ccbcc`), the marks it
// was built from (`_ffcd`) and a flag (`_dgeeg`) that makes textLine.text emit
// a space before the word.
type textWord struct {
	_af.PdfRectangle
	_accb  float64
	_ccbcc string
	_ffcd  []*textMark
	_abcc  float64
	_dgeeg bool
}

// _fcgef classifies a pair of extents: `_eeg` when the first is at least
// `_aggb` and `_gdcgf(second, first)` holds, `_gecdf` for the mirrored case,
// and `_ceag` otherwise.
func _fcgef(_egag, _dddab, _aggb float64) rulingKind {
	switch {
	case _egag >= _aggb && _gdcgf(_dddab, _egag):
		return _eeg
	case _dddab >= _aggb && _gdcgf(_egag, _dddab):
		return _gecdf
	default:
		return _ceag
	}
}

// put stores `_acegc` in `_dgcf` under the key `_cdgd` derives from the two
// indexes.
func (_fegb *textTable) put(_gdee, _ecbdc int, _acegc *textPara) {
	key := _cdgd(_gdee, _ecbdc)
	_fegb._dgcf[key] = _acegc
}

// _ggda returns a new subpath whose only point is `_abec`.
func _ggda(_abec _aae.Point) *subpath {
	points := []_aae.Point{_abec}
	return &subpath{_aaebg: points}
}

// emptyCompositeRow reports whether no entry of `_becfc` keyed by
// (0..`_aageb`-1, `_gggfc`) has a non-empty paraList.
func (_bafg *textTable) emptyCompositeRow(_gggfc int) bool {
	for col := 0; col < _bafg._aageb; col++ {
		cell, present := _bafg._becfc[_cdgd(col, _gggfc)]
		if present && len(cell.paraList) > 0 {
			return false
		}
	}
	return true
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// String returns the name registered for the kind in `_ecccgb`, or a
// "not a ruling" message with the numeric value when there is none.
func (_ccdca rulingKind) String() string {
	if name, known := _ecccgb[_ccdca]; known {
		return name
	}
	return _efc.Sprintf("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064", _ccdca)
}

// text concatenates the `_ccbcc` of every word in `_cfcb`, writing a single
// space before each word whose `_dgeeg` flag is set.
func (_ecdg *textLine) text() string {
	var sb _a.Builder
	for _, word := range _ecdg._cfcb {
		if word._dgeeg {
			sb.WriteString("\u0020")
		}
		sb.WriteString(word._ccbcc)
	}
	return sb.String()
}
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// Elements returns the slice of TextMarks held by the array (not a copy).
func (_fbg *TextMarkArray) Elements() []TextMark {
	return _fbg._aec
}

// _cbdb binds `_agdd` as the third argument of `_cdga` and returns the
// resulting two-argument predicate.
func _cbdb(_cdga func(*wordBag, *textWord, float64) bool, _agdd float64) func(*wordBag, *textWord) bool {
	bound := func(bag *wordBag, word *textWord) bool {
		return _cdga(bag, word, _agdd)
	}
	return bound
}

// logComposite prints two grids to stdout when `_dedc` is set: the number of
// items parasBBox returns for each composite cell, then the first 12
// characters (via `_efcca`) of each cell's merged text. `_cbad` labels both.
func (_ebcf *textTable) logComposite(_cbad string) {
	if !_dedc {
		return
	}

	// Grid 1: item counts per cell.
	_ag.Log.Info("\u007e~\u007eP\u0061\u0072\u0061\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073", _ebcf._aageb, _ebcf._cegga, _cbad)
	_efc.Printf("\u0025\u0035\u0073 \u007c", "")
	for col := 0; col < _ebcf._aageb; col++ {
		_efc.Printf("\u0025\u0033\u0064 \u007c", col)
	}
	_efc.Println("")
	_efc.Printf("\u0025\u0035\u0073 \u002b", "")
	for col := 0; col < _ebcf._aageb; col++ {
		_efc.Printf("\u0025\u0033\u0073 \u002b", "\u002d\u002d\u002d")
	}
	_efc.Println("")
	for row := 0; row < _ebcf._cegga; row++ {
		_efc.Printf("\u0025\u0035\u0064 \u007c", row)
		for col := 0; col < _ebcf._aageb; col++ {
			paras, _ := _ebcf._becfc[_cdgd(col, row)].parasBBox()
			_efc.Printf("\u0025\u0033\u0064 \u007c", len(paras))
		}
		_efc.Println("")
	}

	// Grid 2: truncated text per cell.
	_ag.Log.Info("\u007e~\u007eT\u0065\u0078\u0074\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073", _ebcf._aageb, _ebcf._cegga, _cbad)
	_efc.Printf("\u0025\u0035\u0073 \u007c", "")
	for col := 0; col < _ebcf._aageb; col++ {
		_efc.Printf("\u0025\u0031\u0032\u0064\u0020\u007c", col)
	}
	_efc.Println("")
	_efc.Printf("\u0025\u0035\u0073 \u002b", "")
	for col := 0; col < _ebcf._aageb; col++ {
		_efc.Print("\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d-\u002d\u002d\u002d\u002b")
	}
	_efc.Println("")
	for row := 0; row < _ebcf._cegga; row++ {
		_efc.Printf("\u0025\u0035\u0064 \u007c", row)
		for col := 0; col < _ebcf._aageb; col++ {
			paras, _ := _ebcf._becfc[_cdgd(col, row)].parasBBox()
			cellText := ""
			if merged := paras.merge(); merged != nil {
				cellText = merged.text()
			}
			// Quote to escape control characters, then strip the quotes.
			quoted := _efc.Sprintf("\u0025\u0071", _efcca(cellText, 12))
			cellText = quoted[1 : len(quoted)-1]
			_efc.Printf("\u0025\u0031\u0032\u0073\u0020\u007c", cellText)
		}
		_efc.Println("")
	}
}

// _ccfge is a fixed list of single-character strings.
var _ccfge = []string{
	"\u2756",
	"\u27a2",
	"\u2713",
	"\u2022",
	"\uf0a7",
	"\u25a1",
	"\u2212",
	"\u25a0",
	"\u25aa",
	"\u006f",
}
// subdivide splits each composite cell of the table along the row and column
// corridors and returns a new table holding the resulting cells. The receiver
// itself is returned when either corridor map is empty.
func (_ebbf *textTable) subdivide() *textTable {
	_ebbf.logComposite("\u0073u\u0062\u0064\u0069\u0076\u0069\u0064e")
	rowCorridors := _ebbf.compositeRowCorridors()
	colCorridors := _ebbf.compositeColCorridors()
	if _dedc {
		_ag.Log.Info("\u0073u\u0062\u0064i\u0076\u0069\u0064\u0065:\u000a\u0009\u0072o\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s=\u0025\u0073\u000a\t\u0063\u006fl\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073=\u0025\u0073", _gfae(rowCorridors), _gfae(colCorridors))
	}
	if len(rowCorridors) == 0 || len(colCorridors) == 0 {
		return _ebbf
	}
	_bcgac(rowCorridors)
	_bcgac(colCorridors)
	if _dedc {
		_ag.Log.Info("\u0073\u0075\u0062\u0064\u0069\u0076\u0069\u0064\u0065\u0020\u0066\u0069\u0078\u0065\u0064\u003a\u000a\u0009r\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073\u003d\u0025\u0073\u000a\u0009\u0063\u006f\u006cC\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d\u0025\u0073", _gfae(rowCorridors), _gfae(colCorridors))
	}

	// Offsets of each composite row/column in the new table, and its new size.
	yOffsets, newRows := _deadf(_ebbf._cegga, rowCorridors)
	xOffsets, newCols := _deadf(_ebbf._aageb, colCorridors)
	cells := make(map[uint64]*textPara, newCols*newRows)
	result := &textTable{PdfRectangle: _ebbf.PdfRectangle, _caagg: _ebbf._caagg, _cegga: newRows, _aageb: newCols, _dgcf: cells}
	if _dedc {
		_ag.Log.Info("\u0073\u0075b\u0064\u0069\u0076\u0069\u0064\u0065\u003a\u0020\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0020\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0063\u0065\u006c\u006c\u0073\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0079\u004f\u0066\u0066\u0073\u0065\u0074\u0073=\u0025\u002b\u0076\u000a"+"\u0009\u0078\u004f\u0066\u0066\u0073\u0065\u0074\u0073\u003d\u0025\u002b\u0076", _ebbf._aageb, _ebbf._cegga, newCols, newRows, _gfae(rowCorridors), _gfae(colCorridors), yOffsets, xOffsets)
	}

	for row := 0; row < _ebbf._cegga; row++ {
		y0 := yOffsets[row]
		for col := 0; col < _ebbf._aageb; col++ {
			x0 := xOffsets[col]
			if _dedc {
				_efc.Printf("\u0025\u0036\u0064\u002c %\u0032\u0064\u003a\u0020\u0078\u0030\u003d\u0025\u0064\u0020\u0079\u0030\u003d\u0025d\u000a", col, row, x0, y0)
			}
			composite, present := _ebbf._becfc[_cdgd(col, row)]
			if !present {
				continue
			}
			part := composite.split(rowCorridors[row], colCorridors[col])
			for dy := 0; dy < part._cegga; dy++ {
				for dx := 0; dx < part._aageb; dx++ {
					para := part.get(dx, dy)
					result.put(x0+dx, y0+dy, para)
					if _dedc {
						_efc.Printf("\u0025\u0038\u0064\u002c\u0020\u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a", x0+dx, y0+dy, para)
					}
				}
			}
		}
	}
	return result
}

// getRight collects the `_aggd` neighbour of every cell in the last column.
// It returns nil if any of them is taken, or if consecutive neighbours are not
// linked through `_cabda`.
func (_ggffg *textTable) getRight() paraList {
	neighbours := make(paraList, _ggffg._cegga)
	for row := 0; row < _ggffg._cegga; row++ {
		next := _ggffg.get(_ggffg._aageb-1, row)._aggd
		if next.taken() {
			return nil
		}
		neighbours[row] = next
	}
	for row := 0; row < _ggffg._cegga-1; row++ {
		if neighbours[row]._cabda != neighbours[row+1] {
			return nil
		}
	}
	return neighbours
}

// _abda returns a new list with the given lines, type string and children.
func _abda(_eagc []*textLine, _dfea string, _fbbe []*list) *list {
	return &list{_fged: _eagc, _fdgc: _dfea, _fbef: _fbbe}
}

// _eeed applies `_gfb` to the `_gbgbb` of the two paragraphs.
func _eeed(_fcdef, _egbgg *textPara) bool {
	return _gfb(_fcdef._gbgbb, _egbgg._gbgbb)
}

// _bggb builds one "bullet" list per line that `_dgbf` selects from the group
// keyed by `_cfgd[_ebaaf]`, recursing into the next key for the children of
// each list. The returned slice is never nil.
func _bggb(_egecb []*textLine, _fdfad map[float64][]*textLine, _cfgd []float64, _ebaaf int, _bccd, _bcfaf float64) []*list {
	result := []*list{}
	level := _ebaaf
	_ebaaf = _ebaaf + 1
	heads := _dgbf(_fdfad[_cfgd[level]], _bcfaf, _bccd)
	for i, head := range heads {
		children := []*list{}
		start := head._addd
		// The item ends at the next head on this level, or at the outer bound.
		end := _bcfaf
		if i < len(heads)-1 {
			end = heads[i+1]._addd
		}
		if _ebaaf < len(_cfgd) {
			children = _bggb(_egecb, _fdfad, _cfgd, _ebaaf, start, end)
		}
		// The item's own lines stop at the first line of its first child.
		bodyEnd := end
		if len(children) > 0 {
			if firstChild := children[0]; len(firstChild._fged) > 0 {
				bodyEnd = firstChild._fged[0]._addd
			}
		}
		lines := []*textLine{head}
		lines = append(lines, _ggfca(head, _egecb, _cfgd, start, bodyEnd)...)
		item := _abda(lines, "\u0062\u0075\u006c\u006c\u0065\u0074", children)
		item._cbda = _ebgc(lines, "")
		result = append(result, item)
	}
	return result
}

// toTextMarks appends the TextMark of every mark in the word through `_fgec`,
// threading `_gfaga` through each call.
func (_gacg *textWord) toTextMarks(_gfaga *int) []TextMark {
	var marks []TextMark
	for _, tm := range _gacg._ffcd {
		marks = _fgec(marks, _gfaga, tm.ToTextMark())
	}
	return marks
}

// close appends the first point when `_bgfgg` says it differs from the last
// one, marks the subpath closed and removes duplicate points.
func (_efa *subpath) close() {
	if !_bgfgg(_efa._aaebg[0], _efa.last()) {
		_efa.add(_efa._aaebg[0])
	}
	_efa._cedc = true
	_efa.removeDuplicates()
}

// toTextMarks returns the TextMarks of every word in the line, inserting a
// space mark (via `_dbce`) before each word whose `_dgeeg` flag is set.
func (_bcde *textLine) toTextMarks(_ebcg *int) []TextMark {
	var marks []TextMark
	for _, word := range _bcde._cfcb {
		if word._dgeeg {
			marks = _dbce(marks, _ebcg, "\u0020")
		}
		marks = append(marks, word.toTextMarks(_ebcg)...)
	}
	return marks
}

// _bffd is the entry count at which getFont evicts an entry from the font
// cache before inserting a new one.
const _bffd = 10

type structElement struct {
	_dccda string
	_befc  []structElement
	_fbge  int64
	_bffdf _gf.PdfObject
}

// _afc converts the children of `_ggcd` into "bullet" lists based on their
// type string: "LI" children become items, while the first "LBody" or "L"
// child ends the scan.
func _afc(_ggcd *list) []*list {
	var items []*list
	for _, child := range _ggcd._fbef {
		switch child._fdgc {
		case "\u004c\u0049":
			lines := _fcgc(child)
			nested := _afc(child)
			item := _abda(lines, "\u0062\u0075\u006c\u006c\u0065\u0074", nested)
			item._cbda = _ebgc(lines, "")
			items = append(items, item)
		case "\u004c\u0042\u006fd\u0079":
			return _afc(child)
		case "\u004c":
			items = append(items, _afc(child)...)
			return items
		}
	}
	return items
}

// _facf merges overlapping word bags. The bags are sorted in place by
// decreasing area, then decreasing height; each surviving bag absorbs every
// later bag whose rectangle `_aeca` matches against its own rectangle widened
// to the left by `_cdac`. Slices of length 0 or 1 are returned unchanged.
func _facf(_bgbf []*wordBag) []*wordBag {
	if len(_bgbf) <= 1 {
		return _bgbf
	}
	if _fbeb {
		_ag.Log.Info("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a")
	}
	_e.Slice(_bgbf, func(i, j int) bool {
		a, b := _bgbf[i], _bgbf[j]
		areaA := a.Width() * a.Height()
		areaB := b.Width() * b.Height()
		if areaA != areaB {
			return areaA > areaB
		}
		if a.Height() != b.Height() {
			return a.Height() > b.Height()
		}
		return i < j
	})

	var merged []*wordBag
	absorbed := make(intSet)
	for i := 0; i < len(_bgbf); i++ {
		if absorbed.has(i) {
			continue
		}
		host := _bgbf[i]
		for j := i + 1; j < len(_bgbf); j++ {
			// Skip bags that an earlier host has already absorbed. The
			// previous code tested the outer index here, which is never
			// absorbed at this point, so such bags were offered again.
			if absorbed.has(j) {
				continue
			}
			other := _bgbf[j]
			reach := host.PdfRectangle
			reach.Llx -= host._cdac
			if _aeca(reach, other.PdfRectangle) {
				host.absorb(other)
				absorbed.add(j)
			}
		}
		merged = append(merged, host)
	}
	// Every input bag must be either kept or absorbed.
	if len(_bgbf) != len(merged)+len(absorbed) {
		_ag.Log.Error("\u006d\u0065\u0072ge\u0057\u006f\u0072\u0064\u0042\u0061\u0067\u0073\u003a \u0025d\u2192%\u0064 \u0061\u0062\u0073\u006f\u0072\u0062\u0065\u0064\u003d\u0025\u0064", len(_bgbf), len(merged), len(absorbed))
	}
	return merged
}

// addDiacritic appends `_aedbf` to the text of the word's last mark and
// NFKC-normalises the result.
func (_cbbg *textWord) addDiacritic(_aedbf string) {
	last := _cbbg._ffcd[len(_cbbg._ffcd)-1]
	last._cgeb += _aedbf
	last._cgeb = _c.NFKC.String(last._cgeb)
}

// findTables sorts the paragraphs in place with `_gbef` and returns the grid
// tables (when `_eeab` is set) followed by the text tables (when `_dcdb` is
// set).
func (_dagbc paraList) findTables(_febc []gridTiling) []*textTable {
	_dagbc.addNeighbours()
	_e.Slice(_dagbc, func(i, j int) bool { return _gbef(_dagbc[i], _dagbc[j]) < 0 })
	var tables []*textTable
	if _eeab {
		tables = append(tables, _dagbc.findGridTables(_febc)...)
	}
	if _dcdb {
		tables = append(tables, _dagbc.findTextTables()...)
	}
	return tables
}

// _cdaea reports whether the magnitude of `_dgag` is below `_gefg`.
func _cdaea(_dgag float64) bool {
	return _ea.Abs(_dgag) < _gefg
}

// _fcgg looks up a single-rune string in `_cbba`. It returns ("", false) when
// the input is not exactly one rune.
func _fcgg(_cgebe string) (string, bool) {
	runes := []rune(_cgebe)
	if len(runes) != 1 {
		return "", false
	}
	mapped, ok := _cbba[runes[0]]
	return mapped, ok
}

const _bagb = 20

// checkOp validates the parameter count of `_bgc`. A nil receiver is only
// logged. When `_gcff` is non-negative and differs from the actual count the
// result is false, with an error only if `_bgd` is set.
func (_edgf *textObject) checkOp(_bgc *_aa.ContentStreamOperation, _gcff int, _bgd bool) (_bacb bool, _cfeb error) {
	if _edgf == nil {
		var shown []_gf.PdfObject
		if _gcff > 0 {
			shown = _bgc.Params
			if len(shown) > _gcff {
				shown = shown[:_gcff]
			}
		}
		_ag.Log.Debug("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076", _bgc.Operand, shown)
	}
	if _gcff >= 0 && len(_bgc.Params) != _gcff {
		if _bgd {
			_cfeb = _b.New("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et")
		}
		_ag.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076", _bgc.Operand, _gcff, len(_bgc.Params), _bgc.Params)
		return false, _cfeb
	}
	return true, nil
}

// _ebaad builds a gridTile for `_feccc`. Each flag is set when the matching
// ruling is non-nil and encloses the tile's Y range (first two rulings) or X
// range (last two rulings).
func _ebaad(_feccc _af.PdfRectangle, _gafb, _bbgge, _fbga, _abde *ruling) gridTile {
	x0, x1 := _feccc.Llx, _feccc.Urx
	y0, y1 := _feccc.Lly, _feccc.Ury
	return gridTile{
		PdfRectangle: _feccc,
		_gceeb:       _gafb != nil && _gafb.encloses(y0, y1),
		_gdcbg:       _bbgge != nil && _bbgge.encloses(y0, y1),
		_dbafa:       _fbga != nil && _fbga.encloses(x0, x1),
		_ffdf:        _abde != nil && _abde.encloses(x0, x1),
	}
}

// text joins the `_ccbcc` of all words in the bag with single spaces.
func (_bfea *wordBag) text() string {
	words := _bfea.allWords()
	parts := make([]string, len(words))
	for i, w := range words {
		parts[i] = w._ccbcc
	}
	return _a.Join(parts, "\u0020")
}

type lists []*list

// depthRange returns, in ascending order, the keys of `_cdbc` that lie in
// [`_aecd`, `_efdeg`], or nil when there are none.
func (_gbbf *wordBag) depthRange(_aecd, _efdeg int) []int {
	var keys []int
	for k := range _gbbf._cdbc {
		if _aecd <= k && k <= _efdeg {
			keys = append(keys, k)
		}
	}
	if len(keys) == 0 {
		return nil
	}
	_e.Ints(keys)
	return keys
}
// String returns a one-line description of the paragraph: its rectangle, the
// dimensions of `_befe` when set, the number of lines and the first 50
// characters (via `_efcca`) of its text. A paragraph with `_bdgc` set prints
// only its rectangle and an empty marker.
func (_cfebc *textPara) String() string {
	if _cfebc._bdgc {
		return _efc.Sprintf("\u0025\u0036\u002e\u0032\u0066\u0020\u005b\u0045\u004d\u0050\u0054\u0059\u005d", _cfebc.PdfRectangle)
	}
	var dims string
	if tbl := _cfebc._befe; tbl != nil {
		dims = _efc.Sprintf("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020", tbl._aageb, tbl._cegga)
	}
	return _efc.Sprintf("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071", _cfebc.PdfRectangle, dims, len(_cfebc._aage), _efcca(_cfebc.text(), 50))
}

// asRuling converts the rectangle into a ruling of kind `_beda`. For `_gecdf`
// the position is the mean X and the extent is Lly..Ury; for `_eeg` it is the
// mean Y and Llx..Urx. It returns (nil, false) when checkWidth rejects the
// thin dimension or the kind is neither of the two.
func (_babac rectRuling) asRuling() (*ruling, bool) {
	out := ruling{_ecfb: _babac._beda, Color: _babac.Color, _agff: _bddf}

	switch _babac._beda {
	case _gecdf:
		out._aeef = 0.5 * (_babac.Llx + _babac.Urx)
		out._ggdb = _babac.Lly
		out._gbca = _babac.Ury
		width, ok := _babac.checkWidth(_babac.Llx, _babac.Urx)
		if !ok {
			if _fccf {
				_ag.Log.Error("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067V\u0065\u0072\u0074\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076", _babac)
			}
			return nil, false
		}
		out._faba = width

	case _eeg:
		out._aeef = 0.5 * (_babac.Lly + _babac.Ury)
		out._ggdb = _babac.Llx
		out._gbca = _babac.Urx
		width, ok := _babac.checkWidth(_babac.Lly, _babac.Ury)
		if !ok {
			if _fccf {
				_ag.Log.Error("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067H\u006f\u0072\u007a\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076", _babac)
			}
			return nil, false
		}
		out._faba = width

	default:
		_ag.Log.Error("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064", _babac._beda)
		return nil, false
	}
	return &out, true
}

// get returns the entry of `_dgcf` keyed by `_cdgd` of the two indexes (nil
// when absent).
func (_abaa *textTable) get(_bbfeb, _bfbed int) *textPara {
	key := _cdgd(_bbfeb, _bfbed)
	return _abaa._dgcf[key]
}
// getFont returns the font registered under resource name `_dccd`.
//
// When the owner's font cache `_fgf` is non-nil, fonts are cached under the
// string form of their font dictionary. Each lookup increments the counter
// `_db` and stamps the entry with it; once the cache holds `_bffd` entries,
// the entry with the smallest stamp is evicted before a new one is inserted.
//
// Errors from getFontDict and getFontDirect are returned unchanged.
func (_decd *textObject) getFont(_dccd string) (*_af.PdfFont, error) {
	caching := _decd._dbe._fgf != nil

	// One dictionary lookup serves both the cache probe and the insert key.
	fontDict, err := _decd.getFontDict(_dccd)
	if err != nil {
		if caching {
			_ag.Log.Debug("\u0045\u0052\u0052OR\u003a\u0020\u0067\u0065\u0074\u0046\u006f\u006e\u0074:\u0020n\u0061m\u0065=\u0025\u0073\u002c\u0020\u0065\u0072\u0072\u006f\u0072\u003a\u0020\u0025\u0073", _dccd, err.Error())
		}
		return nil, err
	}

	var key string
	if caching {
		key = fontDict.String()
		_decd._dbe._db++
		if entry, ok := _decd._dbe._fgf[key]; ok {
			// The map stores fontEntry by value, so the refreshed stamp must
			// be written back; updating only the local copy loses it.
			entry._edeb = _decd._dbe._db
			_decd._dbe._fgf[key] = entry
			return entry._gggfe, nil
		}
	}

	font, err := _decd.getFontDirect(_dccd)
	if err != nil {
		return nil, err
	}

	if caching {
		cache := _decd._dbe._fgf
		if len(cache) >= _bffd {
			// Evict the entry with the smallest stamp.
			victim, found := "", false
			for k, e := range cache {
				if !found || e._edeb < cache[victim]._edeb {
					victim, found = k, true
				}
			}
			delete(cache, victim)
		}
		cache[key] = fontEntry{font, _decd._dbe._db}
	}
	return font, nil
}

// _bcbe returns `_gdfa` of the two values unless `_cdaea` treats that result
// as zero, in which case it returns `_fdbb` of them instead.
func _bcbe(_agfa, _gabec bounded) float64 {
	if primary := _gdfa(_agfa, _gabec); !_cdaea(primary) {
		return primary
	}
	return _fdbb(_agfa, _gabec)
}

// _afeec rounds `_cgfb` to the nearest multiple of `_eccg`. A step of 0 is
// treated as 1.
func _afeec(_cgfb float64, _eccg int) int {
	if _eccg == 0 {
		_eccg = 1
	}
	step := float64(_eccg)
	return int(_ea.Round(_cgfb/step) * step)
}

// asRuling converts the line into a ruling of kind `_faab`. For `_gecdf` the
// position is xMean and the extent is the Y range of the endpoints; for `_eeg`
// it is yMean and the X range. Any other kind yields (nil, false).
func (_edgc lineRuling) asRuling() (*ruling, bool) {
	out := ruling{_ecfb: _edgc._faab, Color: _edgc.Color, _agff: _cbeb}
	p, q := _edgc._bbee, _edgc._efge
	switch _edgc._faab {
	case _gecdf:
		out._aeef = _edgc.xMean()
		out._ggdb = _ea.Min(p.Y, q.Y)
		out._gbca = _ea.Max(p.Y, q.Y)
	case _eeg:
		out._aeef = _edgc.yMean()
		out._ggdb = _ea.Min(p.X, q.X)
		out._gbca = _ea.Max(p.X, q.X)
	default:
		_ag.Log.Error("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064", _edgc._faab)
		return nil, false
	}
	return &out, true
}

// setTextRenderMode stores `_fdda` as the RenderMode in `_ecff._aaeb`. It is a
// no-op on a nil receiver.
func (_cga *textObject) setTextRenderMode(_fdda int) {
	if _cga == nil {
		return
	}
	_cga._ecff._aaeb = RenderMode(_fdda)
}

// _cdaf is a regular-expression source string.
var _cdaf string = "\u0028\u003f\u0069\u0029\u005e\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028?\u003a\u0044\u007cM\u0029\u007c\u0044\u003f\u0043{\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028\u003f\u003a\u004c\u007c\u0043\u0029\u007cL\u003f\u0058\u007b\u0030\u002c\u0033}\u0029\u0028\u0049\u0028\u003f\u003a\u0056\u007c\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u005c\u0029\u007c\u005c\u002e\u0029\u007c\u005e\u005c\u0028\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028\u003f\u003aD\u007cM\u0029\u007c\u0044\u003f\u0043\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028?\u003a\u004c\u007c\u0043\u0029\u007c\u004c?\u0058\u007b0\u002c\u0033\u007d\u0029(\u0049\u0028\u003f\u003a\u0056|\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u005c\u0029"
// minDepth returns `_ecba` minus the distance between the bag's Ury and
// `_cdac`.
func (_dfge *wordBag) minDepth() float64 {
	offset := _dfge.Ury - _dfge._cdac
	return _dfge._ecba - offset
}

// extractFormImages extracts the images from the content stream of the form
// XObject named `_eddb` in `_bfe`, using the form's own resources or, when it
// has none, `_bfe`. On success the counter `_gga` is incremented. A missing
// form is not an error.
func (_fdfa *imageExtractContext) extractFormImages(_eddb *_gf.PdfObjectName, _afa _aa.GraphicsState, _bfe *_af.PdfPageResources) error {
	form, err := _bfe.GetXObjectFormByName(*_eddb)
	if err != nil {
		return err
	}
	if form == nil {
		return nil
	}

	stream, err := form.GetContentStream()
	if err != nil {
		return err
	}

	resources := form.Resources
	if resources == nil {
		resources = _bfe
	}
	if err = _fdfa.extractContentStreamImages(string(stream), resources); err != nil {
		return err
	}
	_fdfa._gga++
	return nil
}

// intersections maps the index of every ruling that intersects a ruling of the
// other kind to the set of indexes it intersects. It returns nil when there
// are fewer than `_fcad`+1 rulings of kind `_gecdf`, fewer than `_bce`+1 of
// kind `_eeg`, or more than `_ebdb` of both together.
func (_dbga rulingList) intersections() map[int]intSet {
	var firstKind, secondKind []int
	for i, r := range _dbga {
		switch r._ecfb {
		case _gecdf:
			firstKind = append(firstKind, i)
		case _eeg:
			secondKind = append(secondKind, i)
		}
	}
	if len(firstKind) < _fcad+1 || len(secondKind) < _bce+1 {
		return nil
	}
	if len(firstKind)+len(secondKind) > _ebdb {
		_ag.Log.Debug("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064", len(_dbga), len(firstKind), len(secondKind))
		return nil
	}

	crossings := make(map[int]intSet, len(firstKind)+len(secondKind))
	// link records that ruling `from` intersects ruling `to`.
	link := func(from, to int) {
		if _, ok := crossings[from]; !ok {
			crossings[from] = make(intSet)
		}
		crossings[from].add(to)
	}
	for _, a := range firstKind {
		for _, b := range secondKind {
			if !_dbga[a].intersects(_dbga[b]) {
				continue
			}
			link(a, b)
			link(b, a)
		}
	}
	return crossings
}

// _egec returns the Llx of the first bounding box minus the Urx of the second.
func _egec(_gega, _defb bounded) float64 {
	left := _gega.bbox().Llx
	right := _defb.bbox().Urx
	return left - right
}

// _bcgd classifies the segment between the two points by passing its absolute
// X and Y extents to `_fcgef` with threshold `_ebffe`.
func _bcgd(_bfab, _faee _aae.Point) rulingKind {
	dx := _ea.Abs(_bfab.X - _faee.X)
	dy := _ea.Abs(_bfab.Y - _faee.Y)
	return _fcgef(dx, dy, _ebffe)
}

type imageExtractContext struct {
	_cgg []ImageMark
	_ffa int
	_fcc int
	_gga int // incremented by extractFormImages after each processed form
	_ebf map[*_gf.PdfObjectStream]*cachedImage
	_dac *ImageExtractOptions
	_eed bool
}

// _cbdd maps each markKind to the name markKind.String returns for it.
var _cbdd = map[markKind]string{
	_cbeb:  "\u0073\u0074\u0072\u006f\u006b\u0065",
	_bddf:  "\u0066\u0069\u006c\u006c",
	_gcfgb: "\u0061u\u0067\u006d\u0065\u006e\u0074",
}
func _add ( _dcda _aae . Point ) _aae . Matrix { return _aae . TranslationMatrix ( _dcda . X , _dcda . Y ) } ; func ( _bddbd paraList ) findGridTables ( _bcggf [ ] gridTiling ) [ ] * textTable { if _dedc { _ag . Log . Info ( "\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073" , len ( _bddbd ) ) ;
for _edcee , _ccgb := range _bddbd { _efc . Printf ( "\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a" , _edcee , _ccgb ) ; } ; } ; var _cddg [ ] * textTable ; for _dfab , _ddfe := range _bcggf { _gaag , _adcgd := _bddbd . findTableGrid ( _ddfe ) ; if _gaag != nil { _gaag . log ( _efc . Sprintf ( "\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064" , _dfab ) ) ;
_cddg = append ( _cddg , _gaag ) ; _gaag . markCells ( ) ; } ; for _egfd := range _adcgd { _egfd . _fcdcf = true ; } ; } ; if _dedc { _ag . Log . Info ( "\u0066i\u006e\u0064\u0047\u0072i\u0064\u0054\u0061\u0062\u006ce\u0073:\u0020%\u0064\u0020\u0074\u0061\u0062\u006c\u0065s" , len ( _cddg ) ) ;
} ; return _cddg ; } ; var ( _cbba = map [ rune ] string { 0x0060 : "\u0300" , 0x02CB : "\u0300" , 0x0027 : "\u0301" , 0x00B4 : "\u0301" , 0x02B9 : "\u0301" , 0x02CA : "\u0301" , 0x005E : "\u0302" , 0x02C6 : "\u0302" , 0x007E : "\u0303" , 0x02DC : "\u0303" , 0x00AF : "\u0304" , 0x02C9 : "\u0304" , 0x02D8 : "\u0306" , 0x02D9 : "\u0307" , 0x00A8 : "\u0308" , 0x00B0 : "\u030a" , 0x02DA : "\u030a" , 0x02BA : "\u030b" , 0x02DD : "\u030b" , 0x02C7 : "\u030c" , 0x02C8 : "\u030d" , 0x0022 : "\u030e" , 0x02BB : "\u0312" , 0x02BC : "\u0313" , 0x0486 : "\u0313" , 0x055A : "\u0313" , 0x02BD : "\u0314" , 0x0485 : "\u0314" , 0x0559 : "\u0314" , 0x02D4 : "\u031d" , 0x02D5 : "\u031e" , 0x02D6 : "\u031f" , 0x02D7 : "\u0320" , 0x02B2 : "\u0321" , 0x00B8 : "\u0327" , 0x02CC : "\u0329" , 0x02B7 : "\u032b" , 0x02CD : "\u0331" , 0x005F : "\u0332" , 0x204E : "\u0359" } ;
) ; type intSet map [ int ] struct { } ; func ( _cecg * textTable ) log ( _cdag string ) { if ! _dedc { return ; } ; _ag . Log . Info ( "~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066" , _cdag , _cecg . _aageb , _cecg . _cegga , _cecg . _caagg , _cecg . PdfRectangle ) ;
for _gdgg := 0 ; _gdgg < _cecg . _cegga ; _gdgg ++ { for _ecfbc := 0 ; _ecfbc < _cecg . _aageb ; _ecfbc ++ { _fefg := _cecg . get ( _ecfbc , _gdgg ) ; if _fefg == nil { continue ; } ; _efc . Printf ( "%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a" , _ecfbc , _gdgg , _fefg . PdfRectangle , _efcca ( _fefg . text ( ) , 50 ) , _bb . RuneCountInString ( _fefg . text ( ) ) ) ;
} ; } ; } ; func ( _fad * shapesState ) newSubPath ( ) { _fad . clearPath ( ) ; if _cece { _ag . Log . Info ( "\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073" , _fad ) ; } ; } ; func ( _eddfg * wordBag ) getDepthIdx ( _ggb float64 ) int { _edba := _eddfg . depthIndexes ( ) ;
_bdff := _fdgf ( _ggb ) ; if _bdff < _edba [ 0 ] { return _edba [ 0 ] ; } ; if _bdff > _edba [ len ( _edba ) - 1 ] { return _edba [ len ( _edba ) - 1 ] ; } ; return _bdff ; } ; func ( _bgaa * textObject ) moveLP ( _fddf , _dgfa float64 ) { _bgaa . _ebc . Concat ( _aae . NewMatrix ( 1 , 0 , 0 , 1 , _fddf , _dgfa ) ) ;
_bgaa . _dbc = _bgaa . _ebc ; } ; func ( _ddge * textTable ) compositeColCorridors ( ) map [ int ] [ ] float64 { _dgbdb := make ( map [ int ] [ ] float64 , _ddge . _aageb ) ; if _dedc { _ag . Log . Info ( "\u0063\u006f\u006d\u0070o\u0073\u0069\u0074\u0065\u0043\u006f\u006c\u0043\u006f\u0072r\u0069d\u006f\u0072\u0073\u003a\u0020\u0077\u003d%\u0064\u0020" , _ddge . _aageb ) ;
} ; for _bcfge := 0 ; _bcfge < _ddge . _aageb ; _bcfge ++ { _dgbdb [ _bcfge ] = nil ; } ; return _dgbdb ; } ; func ( _cfc * stateStack ) size ( ) int { return len ( * _cfc ) } ; func ( _bcfde * textTable ) isExportable ( ) bool { if _bcfde . _caagg { return true ; } ; _gecbb := func ( _eecc int ) bool { _gcdc := _bcfde . get ( 0 , _eecc ) ;
if _gcdc == nil { return false ; } ; _baeb := _gcdc . text ( ) ; _gabef := _bb . RuneCountInString ( _baeb ) ; _ggbed := _baee . MatchString ( _baeb ) ; return _gabef <= 1 || _ggbed ; } ; for _bfcd := 0 ; _bfcd < _bcfde . _cegga ; _bfcd ++ { if ! _gecbb ( _bfcd ) { return true ; } ; } ; return false ;
2024-04-30 12:24:05 +00:00
} ;
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// Extractor stores and offers functionality for extracting content from PDF pages.
type Extractor struct {
	_fgb string
	_cf  *_af.PdfPageResources

	// _fd seeds the text state and is recorded on every PageText built by
	// extractPageText. NOTE(review): presumably the page media box; confirm.
	_fd  _af.PdfRectangle
	_fed *_af.PdfRectangle

	_fgf map[string]fontEntry

	// _ec caches extractPageText results for form XObjects, keyed by XObject name.
	_ec map[string]textResult
	// _agf caches extractPageText results for annotation appearance streams,
	// keyed by the stream object's String() value.
	_agf map[string]textResult

	_db  int64
	_ebe int

	// _efe holds the extraction options; may be nil. Its IncludeAnnotations
	// flag enables annotation text extraction.
	_efe *Options

	// _ba and _ac are copied verbatim onto every PageText built by extractPageText.
	_ba *_gf.PdfObject
	_ac _gf.PdfObject

	// _cae lists the annotations whose appearance streams are scanned when
	// annotation extraction is enabled.
	_cae []*_af.PdfAnnotation
}
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// String returns the name registered for the mark kind in _cbdd, or a
// "Not a mark: N" placeholder when the kind is not registered.
func (_dece markKind) String() string {
	if name, ok := _cbdd[_dece]; ok {
		return name
	}
	return _efc.Sprintf("Not a mark: %d", _dece)
}

// textResult is the cached outcome of one extractPageText call: the page text
// and the two counters returned alongside it. Values are built with unkeyed
// literals, so the field order must not change.
type textResult struct {
	_eae  PageText
	_cec  int
	_gabe int
}

// setTextLeading stores _cebc as the leading of the text state. It is a no-op
// on a nil receiver.
func (_bcb *textObject) setTextLeading(_cebc float64) {
	if _bcb == nil {
		return
	}
	_bcb._ecff._cdc = _cebc
}

// stateStack is a stack of text states.
type stateStack []*textState

// writeCellText writes the text of every line of the paragraph to _cccff.
// When _dadc is set, a line that ends in a hyphen and is not the last line is
// passed through _dbbb and written without a separator; otherwise consecutive
// lines are joined with the separator returned by _gcccd. Write errors are
// ignored because the method has no error result.
func (_gdef *textPara) writeCellText(_cccff _fc.Writer) {
	last := len(_gdef._aage) - 1
	for i, line := range _gdef._aage {
		text := line.text()
		joinHyphen := _dadc && line.endsInHyphen() && i != last
		if joinHyphen {
			text = _dbbb(text)
		}
		_cccff.Write([]byte(text))
		if !joinHyphen && i != last {
			_cccff.Write([]byte(_gcccd(line._addd, _gdef._aage[i+1]._addd)))
		}
	}
}

// quadraticTo records only the end point (_dcdag, _cfaa) of the curve; the
// control point (_bgbc, _acg) is not used.
func (_bbcbf *shapesState) quadraticTo(_bgbc, _acg, _dcdag, _cfaa float64) {
	if _cece {
		_ag.Log.Info("quadraticTo:")
	}
	_bbcbf.addPoint(_dcdag, _cfaa)
}

// _fecd appends the rendering of _eceg (produced by _ccgd) to _abgb and then
// recurses into its children with the prefix *_cdcbcc extended by three spaces.
func _fecd(_eceg *list, _abgb *_a.Builder, _cdcbcc *string) {
	_abgb.WriteString(_ccgd(_eceg, _cdcbcc))
	for _, child := range _eceg._fbef {
		childPrefix := *_cdcbcc + "   "
		_fecd(child, _abgb, &childPrefix)
	}
}

// _debe reports whether _edgeb equals the first byte of any entry of _ccfge.
func _debe(_edgeb byte) bool {
	for _, entry := range _ccfge {
		if []byte(entry)[0] == _edgeb {
			return true
		}
	}
	return false
}

// parseStructElement fills the receiver from the structure element dictionary
// _eeff: _dccda gets the string form of the "S" entry, _bffdf the "Pg" entry,
// and the "K" entry is interpreted as follows:
//   - integer: stored in _fbge;
//   - reference: treated as a one-element kid list;
//   - array: each integer kid is stored in _fbge (the last one wins) and every
//     other kid is parsed recursively into _befc. A single-integer array sets
//     _fbge only and leaves _befc untouched.
//
// Any other "K" value (including a missing one) leaves _fbge and _befc as they were.
func (_bffe *structElement) parseStructElement(_eeff _gf.PdfObject) {
	dict, ok := _gf.GetDict(_eeff)
	if !ok {
		_ag.Log.Debug("parseStructElement: dictionary object not found.")
		return
	}

	structType := ""
	if s := dict.Get("S"); s != nil {
		structType = s.String()
	}
	_bffe._dccda = structType
	_bffe._bffdf = dict.Get("Pg")

	var kids []_gf.PdfObject
	switch k := dict.Get("K").(type) {
	case *_gf.PdfObjectInteger:
		_bffe._fbge = int64(*k)
		return
	case *_gf.PdfObjectReference:
		kids = _gf.MakeArray(k).Elements()
	case *_gf.PdfObjectArray:
		kids = k.Elements()
	default:
		return
	}

	_bffe._fbge = -1
	if len(kids) == 1 {
		if only, isInt := kids[0].(*_gf.PdfObjectInteger); isInt {
			_bffe._fbge = int64(*only)
			return
		}
	}

	children := []structElement{}
	for _, kid := range kids {
		if num, isInt := kid.(*_gf.PdfObjectInteger); isInt {
			_bffe._fbge = int64(*num)
			continue
		}
		child := &structElement{}
		child.parseStructElement(kid)
		children = append(children, *child)
	}
	_bffe._befc = children
}

// endsInHyphen reports whether the line ends in a hyphen that qualifies under
// _edeg. The last word must end in a Unicode hyphen; then either that word is
// flagged _dgeeg and passes _edeg on its own, or the whole line text passes
// _edeg. A line without words never ends in a hyphen.
func (_fceb *textLine) endsInHyphen() bool {
	if len(_fceb._cfcb) == 0 {
		return false
	}
	lastWord := _fceb._cfcb[len(_fceb._cfcb)-1]
	wordText := lastWord._ccbcc
	r, size := _bb.DecodeLastRuneInString(wordText)
	if size <= 0 || !_fg.Is(_fg.Hyphen, r) {
		return false
	}
	if lastWord._dgeeg && _edeg(wordText) {
		return true
	}
	return _edeg(_fceb.text())
}

// _aecg returns the keys of _cacc in ascending order.
func _aecg(_cacc map[float64]gridTile) []float64 {
	keys := make([]float64, 0, len(_cacc))
	for key := range _cacc {
		keys = append(keys, key)
	}
	_e.Float64s(keys)
	return keys
}

// yNeighbours builds one enter event (at Lly) and one leave event (at Ury) for
// every paragraph and passes them to eventNeighbours. When _geef is non-zero
// each interval is widened on both sides by _geef times the paragraph font size.
func (_agcf paraList) yNeighbours(_geef float64) map[*textPara][]int {
	events := make([]event, 2*len(_agcf))
	if _geef == 0 {
		for i, para := range _agcf {
			events[2*i] = event{para.Lly, true, i}
			events[2*i+1] = event{para.Ury, false, i}
		}
	} else {
		for i, para := range _agcf {
			events[2*i] = event{para.Lly - _geef*para.fontsize(), true, i}
			events[2*i+1] = event{para.Ury + _geef*para.fontsize(), false, i}
		}
	}
	return _agcf.eventNeighbours(events)
}

// extractPageText parses the content stream _ecf and collects its text marks
// and its stroked and filled paths into a PageText.
//
// Parameters:
//   - _ecf:  content stream source.
//   - _bdb:  resources used to resolve fonts and XObjects.
//   - _ebfc: matrix pre-multiplied onto the CTM of every text object.
//   - _bgf:  nesting level; a level above _bagb aborts with a
//     "form stack overflow" error.
//   - _eadd: true when processing an annotation appearance stream. In that
//     mode BT starts from an empty graphics state carrying only the locally
//     tracked CTM, `cm` does not update the tracked CTM, and annotations are
//     not processed again.
//
// Form XObjects drawn with `Do` are extracted recursively and cached in _ec.
// After the stream is processed, and only when IncludeAnnotations is set, the
// normal ("N") appearance stream of every annotation is extracted as well and
// cached in _agf.
//
// It returns the page text (non-nil even on error), the two counters
// accumulated in the text state, and the first error encountered.
func (_bfa *Extractor) extractPageText(_ecf string, _bdb *_af.PdfPageResources, _ebfc _aae.Matrix, _bgf int, _eadd bool) (*PageText, int, int, error) {
	_ag.Log.Trace("extractPageText: level=%d", _bgf)
	pageText := &PageText{_cdf: _bfa._fd, _cgad: _bfa._ba, _dfc: _bfa._ac}
	state := _ecd(_bfa._fd)
	var saved stateStack
	to := _ffag(_bfa, _bdb, _aa.GraphicsState{}, &state, &saved)
	shapes := shapesState{_gdec: _ebfc, _cffc: _aae.IdentityMatrix(), _cegf: to}
	inTextObj := false
	mcid := -1

	if _bgf > _bagb {
		err := _b.New("form stack overflow")
		_ag.Log.Debug("ERROR: extractPageText. recursion level=%d err=%v", _bgf, err)
		return pageText, state._cfg, state._dacb, err
	}

	parser := _aa.NewContentStreamParser(_ecf)
	ops, err := parser.Parse()
	if err != nil {
		_ag.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
		return pageText, state._cfg, state._dacb, err
	}
	pageText._cfa = ops

	processor := _aa.NewContentStreamProcessor(*ops)
	processor.AddHandler(_aa.HandlerConditionEnumAllOperands, "", func(op *_aa.ContentStreamOperation, gs _aa.GraphicsState, res *_af.PdfPageResources) error {
		if _egbg {
			_ag.Log.Info("&&& op=%s", op)
		}

		switch op.Operand {
		// --- Graphics state stack -------------------------------------
		case "q":
			if _cece {
				_ag.Log.Info("ctm=%s", shapes._cffc)
			}
			saved.push(&state)
		case "Q":
			if !saved.empty() {
				state = *saved.pop()
			}
			shapes._cffc = gs.CTM
			if _cece {
				_ag.Log.Info("ctm=%s", shapes._cffc)
			}

		// --- Marked content -------------------------------------------
		case "BDC":
			// Malformed streams may omit the property operand; skip the
			// operator instead of indexing out of range.
			if len(op.Params) < 2 {
				_ag.Log.Debug("ERROR: BDC op=%s has %d operands, expected 2", op, len(op.Params))
				return nil
			}
			props, ok := _gf.GetDict(op.Params[1])
			if !ok {
				// The property operand is not a dictionary here: the
				// operator is skipped and the current MCID is kept.
				_ag.Log.Debug("ERROR: BDC op=%s GetDict failed", op)
				return nil
			}
			if mcidObj := props.Get("MCID"); mcidObj != nil {
				val, ok := _gf.GetIntVal(mcidObj)
				if !ok {
					_ag.Log.Debug("ERROR: BDC op=%s. Bad numerical object. o=%s", op, mcidObj)
				}
				mcid = val
			} else {
				mcid = -1
			}
		case "EMC":
			mcid = -1

		// --- Text objects ---------------------------------------------
		case "BT":
			if inTextObj {
				_ag.Log.Debug("BT called while in a text object")
				pageText._fecaa = append(pageText._fecaa, to._afff...)
			}
			inTextObj = true
			textGS := gs
			if _eadd {
				textGS = _aa.GraphicsState{}
				textGS.CTM = shapes._cffc
			}
			textGS.CTM = _ebfc.Mult(textGS.CTM)
			to = _ffag(_bfa, res, textGS, &state, &saved)
			shapes._cegf = to
		case "ET":
			if !inTextObj {
				_ag.Log.Debug("ET called outside of a text object")
			}
			inTextObj = false
			pageText._fecaa = append(pageText._fecaa, to._afff...)
			to.reset()

		// --- Text positioning -----------------------------------------
		case "T*":
			to.nextLine()
		case "Td":
			if ok, err := to.checkOp(op, 2, true); !ok {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			x, y, err := _bbgag(op.Params)
			if err != nil {
				return err
			}
			to.moveText(x, y)
		case "TD":
			if ok, err := to.checkOp(op, 2, true); !ok {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			x, y, err := _bbgag(op.Params)
			if err != nil {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			to.moveTextSetLeading(x, y)

		// --- Text showing ---------------------------------------------
		case "Tj":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
				return err
			}
			strObj := _gf.TraceToDirectObject(op.Params[0])
			data, ok := _gf.GetStringBytes(strObj)
			if !ok {
				_ag.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
				return _gf.ErrTypeError
			}
			return to.showText(strObj, data, mcid)
		case "TJ":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: TJ err=%v", err)
				return err
			}
			arr, ok := _gf.GetArray(op.Params[0])
			if !ok {
				// A non-array operand is logged and skipped.
				_ag.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
				return nil
			}
			return to.showTextAdjusted(arr, mcid)
		case "'":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: ' err=%v", err)
				return err
			}
			strObj := _gf.TraceToDirectObject(op.Params[0])
			data, ok := _gf.GetStringBytes(strObj)
			if !ok {
				_ag.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
				return _gf.ErrTypeError
			}
			to.nextLine()
			return to.showText(strObj, data, mcid)
		case "\"":
			if ok, err := to.checkOp(op, 3, true); !ok {
				_ag.Log.Debug("ERROR: \" err=%v", err)
				return err
			}
			// Operand order is `aw ac string "`: word spacing first,
			// character spacing second.
			wordSpacing, charSpacing, err := _bbgag(op.Params[:2])
			if err != nil {
				return err
			}
			strObj := _gf.TraceToDirectObject(op.Params[2])
			data, ok := _gf.GetStringBytes(strObj)
			if !ok {
				_ag.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
				return _gf.ErrTypeError
			}
			to.setWordSpacing(wordSpacing)
			to.setCharSpacing(charSpacing)
			to.nextLine()
			return to.showText(strObj, data, mcid)

		// --- Text state -----------------------------------------------
		case "TL":
			leading, err := _feca(op)
			if err != nil {
				_ag.Log.Debug("ERROR: TL err=%v", err)
				return err
			}
			to.setTextLeading(leading)
		case "Tc":
			spacing, err := _feca(op)
			if err != nil {
				_ag.Log.Debug("ERROR: Tc err=%v", err)
				return err
			}
			to.setCharSpacing(spacing)
		case "Tf":
			if ok, err := to.checkOp(op, 2, true); !ok {
				_ag.Log.Debug("ERROR: Tf err=%v", err)
				return err
			}
			fontName, ok := _gf.GetNameVal(op.Params[0])
			if !ok {
				_ag.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
				return _gf.ErrTypeError
			}
			fontSize, err := _gf.GetNumberAsFloat(op.Params[1])
			if err != nil {
				_ag.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
				return err
			}
			err = to.setFont(fontName, fontSize)
			// An unsupported font is remembered on the text object rather
			// than aborting the extraction.
			to._cdcc = _b.Is(err, _gf.ErrNotSupported)
			if err != nil && !to._cdcc {
				return err
			}
		case "Tm":
			if ok, err := to.checkOp(op, 6, true); !ok {
				_ag.Log.Debug("ERROR: Tm err=%v", err)
				return err
			}
			vals, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			to.setTextMatrix(vals)
		case "Tr":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: Tr err=%v", err)
				return err
			}
			mode, ok := _gf.GetIntVal(op.Params[0])
			if !ok {
				_ag.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
				return _gf.ErrTypeError
			}
			to.setTextRenderMode(mode)
		case "Ts":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: Ts err=%v", err)
				return err
			}
			rise, err := _gf.GetNumberAsFloat(op.Params[0])
			if err != nil {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			to.setTextRise(rise)
		case "Tw":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			spacing, err := _gf.GetNumberAsFloat(op.Params[0])
			if err != nil {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			to.setWordSpacing(spacing)
		case "Tz":
			if ok, err := to.checkOp(op, 1, true); !ok {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			scaling, err := _gf.GetNumberAsFloat(op.Params[0])
			if err != nil {
				_ag.Log.Debug("ERROR: err=%v", err)
				return err
			}
			to.setHorizScaling(scaling)

		// --- Current transformation matrix ------------------------------
		case "cm":
			if !_eadd {
				shapes._cffc = gs.CTM
			}
			if shapes._cffc.Singular() {
				// Replace a singular matrix by a pure translation.
				fixed := _aae.IdentityMatrix().Translate(shapes._cffc.Translation())
				_ag.Log.Debug("Singular ctm=%s\u2192%s", shapes._cffc, fixed)
				shapes._cffc = fixed
			}
			if _cece {
				_ag.Log.Info("ctm=%s", shapes._cffc)
			}

		// --- Path construction ------------------------------------------
		case "m":
			if len(op.Params) != 2 {
				_ag.Log.Debug("WARN: error while processing `m` operator: %v. Output may be incorrect.", _eb)
				return nil
			}
			pt, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				return err
			}
			shapes.moveTo(pt[0], pt[1])
		case "l":
			if len(op.Params) != 2 {
				_ag.Log.Debug("WARN: error while processing `l` operator: %v. Output may be incorrect.", _eb)
				return nil
			}
			pt, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				return err
			}
			shapes.lineTo(pt[0], pt[1])
		case "c":
			if len(op.Params) != 6 {
				return _eb
			}
			pts, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				return err
			}
			_ag.Log.Debug("Cubic bezier params: %.2f", pts)
			shapes.cubicTo(pts[0], pts[1], pts[2], pts[3], pts[4], pts[5])
		case "v", "y":
			if len(op.Params) != 4 {
				return _eb
			}
			pts, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				return err
			}
			_ag.Log.Debug("Cubic bezier params: %.2f", pts)
			shapes.quadraticTo(pts[0], pts[1], pts[2], pts[3])
		case "h":
			shapes.closePath()
		case "re":
			if len(op.Params) != 4 {
				return _eb
			}
			rect, err := _gf.GetNumbersAsFloat(op.Params)
			if err != nil {
				return err
			}
			shapes.drawRectangle(rect[0], rect[1], rect[2], rect[3])
			shapes.closePath()

		// --- Path painting ----------------------------------------------
		case "S":
			shapes.stroke(&pageText._gggf)
			shapes.clearPath()
		case "s":
			shapes.closePath()
			shapes.stroke(&pageText._gggf)
			shapes.clearPath()
		case "F":
			shapes.fill(&pageText._afbg)
			shapes.clearPath()
		case "f", "f*":
			shapes.closePath()
			shapes.fill(&pageText._afbg)
			shapes.clearPath()
		case "B", "B*":
			shapes.fill(&pageText._afbg)
			shapes.stroke(&pageText._gggf)
			shapes.clearPath()
		case "b", "b*":
			shapes.closePath()
			shapes.fill(&pageText._afbg)
			shapes.stroke(&pageText._gggf)
			shapes.clearPath()
		case "n":
			shapes.clearPath()

		// --- Form XObjects ----------------------------------------------
		case "Do":
			if len(op.Params) == 0 {
				_ag.Log.Debug("ERROR: expected XObject name operand for Do operator. Got %+v.", op.Params)
				return _gf.ErrRangeError
			}
			name, ok := _gf.GetName(op.Params[0])
			if !ok {
				_ag.Log.Debug("ERROR: invalid Do operator XObject name operand: %+v.", op.Params[0])
				return _gf.ErrTypeError
			}
			// Only form XObjects carry text.
			if _, xtype := res.GetXObjectByName(*name); xtype != _af.XObjectTypeForm {
				break
			}
			formResult, ok := _bfa._ec[name.String()]
			if !ok {
				form, err := res.GetXObjectFormByName(*name)
				if err != nil {
					_ag.Log.Debug("ERROR: %v", err)
					return err
				}
				formContent, err := form.GetContentStream()
				if err != nil {
					_ag.Log.Debug("ERROR: %v", err)
					return err
				}
				// A form without its own resources uses the current ones.
				formRes := form.Resources
				if formRes == nil {
					formRes = res
				}
				formCTM := gs.CTM
				if matArr, ok := _gf.GetArray(form.Matrix); ok {
					m, err := matArr.GetAsFloat64Slice()
					if err != nil {
						return err
					}
					if len(m) != 6 {
						return _eb
					}
					formCTM = gs.CTM.Mult(_aae.NewMatrix(m[0], m[1], m[2], m[3], m[4], m[5]))
				}
				formText, count1, count2, err := _bfa.extractPageText(string(formContent), formRes, _ebfc.Mult(formCTM), _bgf+1, false)
				if err != nil {
					_ag.Log.Debug("ERROR: %v", err)
					return err
				}
				formResult = textResult{*formText, count1, count2}
				_bfa._ec[name.String()] = formResult
			}
			shapes._cffc = gs.CTM
			if _cece {
				_ag.Log.Info("ctm=%s", shapes._cffc)
			}
			pageText._fecaa = append(pageText._fecaa, formResult._eae._fecaa...)
			pageText._gggf = append(pageText._gggf, formResult._eae._gggf...)
			pageText._afbg = append(pageText._afbg, formResult._eae._afbg...)
			state._cfg += formResult._cec
			state._dacb += formResult._gabe

		// --- Colours ----------------------------------------------------
		case "rg", "g", "k", "cs", "sc", "scn":
			to._aef.ColorspaceNonStroking = gs.ColorspaceNonStroking
			to._aef.ColorNonStroking = gs.ColorNonStroking
		case "RG", "G", "K", "CS", "SC", "SCN":
			to._aef.ColorspaceStroking = gs.ColorspaceStroking
			to._aef.ColorStroking = gs.ColorStroking
		}
		return nil
	})

	err = processor.Process(_bdb)

	if _bfa._efe != nil && _bfa._efe.IncludeAnnotations && !_eadd {
		for _, annot := range _bfa._cae {
			apDict, ok := _gf.GetDict(annot.AP)
			if !ok {
				continue
			}
			apStream, ok := apDict.Get("N").(*_gf.PdfObjectStream)
			if !ok {
				continue
			}
			decoded, decErr := _gf.DecodeStream(apStream)
			if decErr != nil {
				_ag.Log.Debug("Error on decode stream: %v", decErr)
				continue
			}
			// Resources may be absent or an indirect reference; resolve it
			// instead of asserting the concrete type.
			resDict, ok := _gf.GetDict(apStream.PdfObjectDictionary.Get("Resources"))
			if !ok {
				_ag.Log.Debug("Error on getting annotation resources: %v", _gf.ErrTypeError)
				continue
			}
			annotRes, resErr := _af.NewPdfPageResourcesFromDict(resDict)
			if resErr != nil {
				_ag.Log.Debug("Error on getting annotation resources: %v", resErr)
				continue
			}
			annotMatrix := _aae.IdentityMatrix()
			if matArr, ok := apStream.PdfObjectDictionary.Get("Matrix").(*_gf.PdfObjectArray); ok {
				m, mErr := matArr.GetAsFloat64Slice()
				if mErr != nil {
					_ag.Log.Debug("Error on getting float64 slice: %v", mErr)
					continue
				}
				if len(m) != 6 {
					_ag.Log.Debug("Invalid matrix slice length")
					continue
				}
				annotMatrix = _aae.NewMatrix(m[0], m[1], m[2], m[3], m[4], m[5])
			}
			cacheKey := apStream.String()
			annotResult, ok := _bfa._agf[cacheKey]
			if !ok {
				annotText, count1, count2, exErr := _bfa.extractPageText(string(decoded), annotRes, annotMatrix, _bgf+1, true)
				if exErr != nil {
					_ag.Log.Debug("ERROR extracting annotation texts: %v", exErr)
					continue
				}
				annotResult = textResult{*annotText, count1, count2}
				_bfa._agf[cacheKey] = annotResult
			}
			pageText._fecaa = append(pageText._fecaa, annotResult._eae._fecaa...)
			pageText._gggf = append(pageText._gggf, annotResult._eae._gggf...)
			pageText._afbg = append(pageText._afbg, annotResult._eae._afbg...)
			state._cfg += annotResult._cec
			state._dacb += annotResult._gabe
		}
	}
	return pageText, state._cfg, state._dacb, err
}

// checkWidth returns the distance from _eaccd to _cafg and whether that
// distance does not exceed _cggd.
func (_eafg rectRuling) checkWidth(_eaccd, _cafg float64) (float64, bool) {
	width := _cafg - _eaccd
	return width, width <= _cggd
}

// _egbgea scans _dgc in order and returns the _addd value of the last line that
// lies deeper than _dffg (greater _addd) and whose rounded Llx is not left of
// _dffg's rounded Llx. The scan stops at the first deeper line that starts
// further left. It returns -1 when no line qualifies. _ffga is unused.
func _egbgea(_dffg *textLine, _dgc []*textLine, _ffga []float64) float64 {
	var depth float64 = -1
	for _, line := range _dgc {
		if line._addd <= _dffg._addd {
			continue
		}
		if _ea.Round(line.Llx) < _ea.Round(_dffg.Llx) {
			break
		}
		depth = line._addd
	}
	return depth
}

// _fcgc returns the text lines of the first child of _gagg tagged "LBody",
// "Span" or "InlineShape". An "LBody" child without lines of its own is
// searched recursively. It returns nil when no such child exists.
func _fcgc(_gagg *list) []*textLine {
	for _, child := range _gagg._fbef {
		switch child._fdgc {
		case "LBody":
			if len(child._fged) != 0 {
				return child._fged
			}
			return _fcgc(child)
		case "Span", "InlineShape":
			return child._fged
		}
	}
	return nil
}

// computeEBBoxes computes the _gbgbb box of every paragraph. Each box starts as
// the paragraph rectangle and is widened horizontally to line up with
// paragraphs whose top edge is not above its bottom edge, without crossing the
// nearest y-overlapping neighbour on either side. When _adfb is set the
// computed boxes replace the paragraph rectangles.
func (_fgef paraList) computeEBBoxes() {
	if _cffe {
		_ag.Log.Info("computeEBBoxes:")
	}
	for _, para := range _fgef {
		para._gbgbb = para.PdfRectangle
	}

	neighbours := _fgef.yNeighbours(0)
	for i, para := range _fgef {
		box := para._gbgbb

		// Horizontal limits set by the paragraphs overlapping in y.
		leftLimit, rightLimit := -1.0e9, +1.0e9
		for _, j := range neighbours[para] {
			other := _fgef[j]._gbgbb
			if other.Urx < box.Llx {
				leftLimit = _ea.Max(leftLimit, other.Urx)
			} else if box.Urx < other.Llx {
				rightLimit = _ea.Min(rightLimit, other.Llx)
			}
		}

		for j, candidate := range _fgef {
			other := candidate._gbgbb
			if i == j || other.Ury > box.Lly {
				continue
			}
			if leftLimit <= other.Llx && other.Llx < box.Llx {
				box.Llx = other.Llx
			} else if other.Urx <= rightLimit && box.Urx < other.Urx {
				box.Urx = other.Urx
			}
		}

		if _cffe {
			_efc.Printf("%4d: %6.2f\u2192%6.2f %q\n", i, para._gbgbb, box, _efcca(para.text(), 50))
		}
		para._gbgbb = box
	}

	if _adfb {
		for _, para := range _fgef {
			para.PdfRectangle = para._gbgbb
		}
	}
}
2024-03-27 22:34:33 +00:00
2024-05-29 17:04:37 +00:00
// ExtractTextWithStats works like ExtractText but also returns the two counters
// reported by ExtractPageText: the number of characters in the output (second
// result) and the number of characters that were not decoded (third result).
// On error the text is empty and the counters are returned as reported.
func (_gff *Extractor) ExtractTextWithStats() (_fdd string, _ffe int, _bbe int, _efd error) {
	pageText, numChars, numMisses, err := _gff.ExtractPageText()
	if err != nil {
		return "", numChars, numMisses, err
	}
	return pageText.Text(), numChars, numMisses, nil
}

// buildList collects the list structure elements under the top element of the
// structure tree and converts them to lists.
//
// The top element is the single root kid when there is exactly one and its type
// is Document, Sect, Part, Div or Art; with any other number of kids a synthetic
// element wrapping all of them is used. Direct "L" kids are taken as they are
// and "Table" kids contribute whatever _dbddc returns for them. The collected
// elements are handed to _eebe together with _bgga and _bfdb, and each result
// is expanded with _afc. It returns nil when the receiver is nil or no top
// element can be determined.
func (_acaa *structTreeRoot) buildList(_bgga map[int][]*textLine, _bfdb _gf.PdfObject) []*list {
	if _acaa == nil {
		// Message decodes to "buildList: treeRoot is nil".
		_ag.Log.Debug("\u0062\u0075\u0069\u006c\u0064\u004c\u0069\u0073\u0074\u003a\u0020t\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0069\u0073 \u006e\u0069\u006c")
		return nil
	}

	var top *structElement
	if len(_acaa._cfbfg) != 1 {
		top = &structElement{_befc: _acaa._cfbfg, _dccda: _acaa._gfbe}
	} else {
		switch _acaa._cfbfg[0]._dccda {
		case "\u0044\u006f\u0063\u0075\u006d\u0065\u006e\u0074", "\u0053\u0065\u0063\u0074", "\u0050\u0061\u0072\u0074", "\u0044\u0069\u0076", "\u0041\u0072\u0074":
			top = &_acaa._cfbfg[0]
		}
	}
	if top == nil {
		// Message decodes to "buildList: topElement is nil".
		_ag.Log.Debug("\u0062\u0075\u0069\u006cd\u004c\u0069\u0073\u0074\u003a\u0020\u0074\u006f\u0070\u0045l\u0065m\u0065\u006e\u0074\u0020\u0069\u0073\u0020n\u0069\u006c")
		return nil
	}

	listElements := []structElement{}
	for _, kid := range top._befc {
		switch kid._dccda {
		case "\u004c":
			listElements = append(listElements, kid)
		case "\u0054\u0061\u0062l\u0065":
			listElements = append(listElements, _dbddc(kid)...)
		}
	}

	var result []*list
	for _, built := range _eebe(listElements, _bgga, _bfdb) {
		result = append(result, _afc(built)...)
	}
	return result
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
type RenderMode int

// stroke appends a path section made of the current subpaths and the current
// stroke colour to `*_eabc`. When the _gdeb debug switch is on it also prints
// the section, and with _edebg additionally the first subpaths (indices 0-10).
func (_aegc *shapesState) stroke(_eabc *[]pathSection) {
	section := pathSection{_bgbeg: _aegc._baca, Color: _aegc._cegf.getStrokeColor()}
	*_eabc = append(*_eabc, section)
	if !_gdeb {
		return
	}
	_efc.Printf("\u0020 \u0020\u0020S\u0054\u0052\u004fK\u0045\u003a\u0020\u0025\u0064\u0020\u0073t\u0072\u006f\u006b\u0065\u0073\u0020s\u0073\u003d\u0025\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d%\u002b\u0076\u0020\u0025\u0036\u002e\u0032\u0066\u000a", len(*_eabc), _aegc, _aegc._cegf.getStrokeColor(), section.bbox())
	if !_edebg {
		return
	}
	for idx, sp := range _aegc._baca {
		_efc.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", idx, sp)
		if idx == 10 {
			break
		}
	}
}

// maxDepth returns the bag's _ecba value minus its lower y coordinate.
func (_ddebg *wordBag) maxDepth() float64 {
	return _ddebg._ecba - _ddebg.Lly
}

// _edeg reports whether `_ebcb` ends in a hyphen that directly follows a
// non-space rune. Strings shorter than _eade runes are always rejected.
func _edeg(_ebcb string) bool {
	if _bb.RuneCountInString(_ebcb) < _eade {
		return false
	}
	last, lastSize := _bb.DecodeLastRuneInString(_ebcb)
	if lastSize <= 0 {
		return false
	}
	if !_fg.Is(_fg.Hyphen, last) {
		return false
	}
	// Look at the rune immediately before the trailing hyphen.
	prev, prevSize := _bb.DecodeLastRuneInString(_ebcb[:len(_ebcb)-lastSize])
	if prevSize <= 0 {
		return false
	}
	return !_fg.IsSpace(prev)
}

// emptyCompositeColumn reports whether no composite cell in column `_cdde`
// holds any paragraph. Missing cells count as empty.
func (_gaad *textTable) emptyCompositeColumn(_cdde int) bool {
	for row := 0; row < _gaad._cegga; row++ {
		cell, found := _gaad._becfc[_cdgd(_cdde, row)]
		if found && len(cell.paraList) > 0 {
			return false
		}
	}
	return true
}

// firstReadingIndex scans the depth band that starts at (_baf+1)*_cdcb and
// extends by _fcbe times the _abcc value of the first word at `_baf`, and
// returns the index whose first word compares lowest under _fdbb. `_baf`
// itself is returned when no index in the band compares lower.
func (_fafa *wordBag) firstReadingIndex(_baf int) int {
	size := _fafa.firstWord(_baf)._abcc
	bandStart := float64(_baf+1) * _cdcb
	bandEnd := bandStart + _fcbe*size
	best := _baf
	for _, candidate := range _fafa.depthBand(bandStart, bandEnd) {
		if _fdbb(_fafa.firstWord(candidate), _fafa.firstWord(best)) < 0 {
			best = candidate
		}
	}
	return best
}

// gridTile is a rectangle carrying four boolean flags.
// NOTE(review): the flags look like per-edge markers — confirm against _ebaad.
type gridTile struct {
	_af.PdfRectangle
	_ffdf, _gceeb, _dbafa, _gdcbg bool
}

// setFont records the font size `_faeb` in the text state and then resolves
// the font named `_fefa`. A nil receiver is a no-op. If the lookup fails the
// error is returned and the previously stored font is kept; the size has
// already been updated at that point.
func (_fgag *textObject) setFont(_fefa string, _faeb float64) error {
	if _fgag == nil {
		return nil
	}
	_fgag._ecff._gbbgg = _faeb
	font, err := _fgag.getFont(_fefa)
	if err == nil {
		_fgag._ecff._fgfgb = font
	}
	return err
}

// structTreeRoot holds the top-level structure elements and a type string.
type structTreeRoot struct {
	_cfbfg []structElement
	_gfbe  string
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// ExtractFonts returns all font information from the page extractor, including
// font name, font type, the raw data of the embedded font file (if embedded), font descriptor and more.
//
// The argument `previousPageFonts` is used when trying to build a complete font catalog for multiple pages or the entire document.
// The entries from `previousPageFonts` are added to the returned result unless already included in the page, i.e. no duplicate entries.
//
// NOTE: If previousPageFonts is nil, all fonts from the page will be returned. Use it when building up a full list of fonts for a document or page range.
func (_eba *Extractor) ExtractFonts(previousPageFonts *PageFonts) (*PageFonts, error) {
	// Collect this page's fonts first so that they take precedence over
	// same-named entries carried over from earlier pages.
	current := PageFonts{}
	if err := current.extractPageResourcesToFont(_eba._cf); err != nil {
		return nil, err
	}
	if previousPageFonts != nil {
		for _, prev := range previousPageFonts.Fonts {
			if _dgg(current.Fonts, prev.FontName) {
				continue
			}
			current.Fonts = append(current.Fonts, prev)
		}
	}
	return &PageFonts{Fonts: current.Fonts}, nil
}

// _deadf returns, for each i in [0, _cbdaf), the running start offset obtained
// when entry i occupies len(_cbfda[i])+1 slots, together with the total number
// of slots.
func _deadf(_cbdaf int, _cbfda map[int][]float64) ([]int, int) {
	offsets := make([]int, _cbdaf)
	total := 0
	for i := range offsets {
		offsets[i] = total
		total += len(_cbfda[i]) + 1
	}
	return offsets, total
}

// _aacg concatenates all ruling lists in `_bdbbd` and returns the result of
// vertsHorzs on the combined list.
func _aacg(_bdbbd []rulingList) (rulingList, rulingList) {
	var all rulingList
	for _, part := range _bdbbd {
		all = append(all, part...)
	}
	return all.vertsHorzs()
}

// asTiling converts the ruling list into a gridTiling.
//
// Steps, in order:
//  1. report adjacent rulings that align on both axes (logged, not removed);
//  2. sort the list and split it via vertsHorzs;
//  3. build one elementary tile per pair of neighbouring primary coordinates;
//  4. coalesce tiles left-to-right within each row;
//  5. coalesce the row results downwards and keep the tiles for which
//     complete() is true.
//
// An empty gridTiling is returned when the list is empty or when there are
// fewer than two distinct primaries on either axis. The original only tested
// the tile counts for `== 0`, so an empty list (or a list lacking one
// orientation) caused a slice/index panic.
func (_cagc rulingList) asTiling() gridTiling {
	if _fgge {
		_ag.Log.Info("r\u0075\u006ci\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0063s\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u002b\u002b\u002b\u0020\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d", len(_cagc))
	}
	// Guard: `_cagc[1:]` below is out of range for an empty list.
	if len(_cagc) == 0 {
		return gridTiling{}
	}
	for i, next := range _cagc[1:] {
		prev := _cagc[i]
		if prev.alignsPrimary(next) && prev.alignsSec(next) {
			_ag.Log.Error("a\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0044\u0075\u0070\u006c\u0069\u0063\u0061\u0074\u0065 \u0072\u0075\u006c\u0069\u006e\u0067\u0073\u002e\u000a\u0009v=\u0025\u0073\u000a\t\u0077=\u0025\u0073", next, prev)
		}
	}
	_cagc.sortStrict()
	_cagc.log("\u0073n\u0061\u0070\u0070\u0065\u0064")

	verts, horzs := _cagc.vertsHorzs()
	xs := verts.primaries()
	ys := horzs.primaries()
	numX := len(xs) - 1
	numY := len(ys) - 1
	// `<= 0` (not `== 0`): with no primaries at all the count is -1 and the
	// indexing below would panic.
	if numX <= 0 || numY <= 0 {
		return gridTiling{}
	}
	bounds := _af.PdfRectangle{Llx: xs[0], Urx: xs[numX], Lly: ys[0], Ury: ys[numY]}
	if _fgge {
		_ag.Log.Info("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0072\u0074s=\u0025\u0064", len(verts))
		for i, v := range verts {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, v)
		}
		_ag.Log.Info("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0068\u006f\u0072\u007as=\u0025\u0064", len(horzs))
		for i, v := range horzs {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, v)
		}
		_ag.Log.Info("\u0072\u0075\u006c\u0069\u006eg\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067:\u0020\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078\u0025\u0064\u000a\u0009\u006c\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a\u0009\u006c\u006c\u0079\u003d\u0025\u002e\u0032f", numX, numY, xs, ys)
	}

	// Elementary tiles, stored row-major: index = row*numX + column.
	tiles := make([]gridTile, numX*numY)
	for row := numY - 1; row >= 0; row-- {
		lly := ys[row]
		ury := ys[row+1]
		for col := 0; col < numX; col++ {
			llx := xs[col]
			urx := xs[col+1]
			vLeft := verts.findPrimSec(llx, lly)
			vRight := verts.findPrimSec(urx, lly)
			hBottom := horzs.findPrimSec(lly, llx)
			hTop := horzs.findPrimSec(ury, llx)
			rect := _af.PdfRectangle{Llx: llx, Urx: urx, Lly: lly, Ury: ury}
			tile := _ebaad(rect, vLeft, vRight, hBottom, hTop)
			tiles[row*numX+col] = tile
			if _fgge {
				_efc.Printf("\u0020\u0020\u0078\u003d\u0025\u0032\u0064\u0020\u0079\u003d\u0025\u0032\u0064\u003a\u0020%\u0073 \u0025\u0036\u002e\u0032\u0066\u0020\u0078\u0020\u0025\u0036\u002e\u0032\u0066\u000a", col, row, tile.String(), tile.Width(), tile.Height())
			}
		}
	}

	if _fgge {
		_ag.Log.Info("r\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074.\u0061\u0073\u0054\u0069\u006c\u0069\u006eg:\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0020\u0068\u006f\u0072\u0069zo\u006e\u0074a\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066", bounds)
	}
	// Horizontal pass: per row, merged tiles keyed by their Llx.
	// A run starts only on a tile whose _gceeb flag is set and extends to the
	// right until a tile with _gdcbg set has been absorbed.
	rowTiles := make([]map[float64]gridTile, numY)
	for row := numY - 1; row >= 0; row-- {
		if _fgge {
			_efc.Printf("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a", row)
		}
		rowTiles[row] = make(map[float64]gridTile, numX)
		for col := 0; col < numX; col++ {
			merged := tiles[row*numX+col]
			if _fgge {
				_efc.Printf("\u0020\u0020\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", col, merged)
			}
			if !merged._gceeb {
				continue
			}
			lastCol := col
			for k := col + 1; !merged._gdcbg && k < numX; k++ {
				right := tiles[row*numX+k]
				merged.Urx = right.Urx
				merged._ffdf = merged._ffdf || right._ffdf
				merged._dbafa = merged._dbafa || right._dbafa
				merged._gdcbg = right._gdcbg
				if _fgge {
					_efc.Printf("\u0020 \u0020%\u0034\u0064\u003a\u0020\u0025s\u0020\u2192 \u0025\u0073\u000a", k, right, merged)
				}
				lastCol = k
			}
			if _fgge {
				_efc.Printf(" \u0020 \u0025\u0032\u0064\u0020\u002d\u0020\u0025\u0032d\u0020\u2192\u0020\u0025s\n", col, lastCol, merged)
			}
			// Skip the columns that were absorbed into this run.
			col = lastCol
			rowTiles[row][merged.Llx] = merged
		}
	}

	// Result cells and the set of already-absorbed tiles, both keyed by
	// row Lly, then by tile Llx.
	cells := make(map[float64]map[float64]gridTile, numY)
	absorbed := make(map[float64]map[float64]struct{}, numY)
	for row := numY - 1; row >= 0; row-- {
		lly := tiles[row*numX].Lly
		cells[lly] = make(map[float64]gridTile, numX)
		absorbed[lly] = make(map[float64]struct{}, numX)
	}
	if _fgge {
		_ag.Log.Info("\u0072u\u006c\u0069n\u0067\u004c\u0069s\u0074\u002e\u0061\u0073\u0054\u0069\u006ci\u006e\u0067\u003a\u0020\u0063\u006fa\u006c\u0065\u0073\u0063\u0065\u0020\u0076\u0065\u0072\u0074\u0069c\u0061\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066", bounds)
	}
	// Vertical pass: extend each row tile downwards through lower rows that
	// have a tile with the same Llx and Urx, stopping once _dbafa is set.
	for row := numY - 1; row >= 0; row-- {
		rowLly := tiles[row*numX].Lly
		current := rowTiles[row]
		if _fgge {
			_efc.Printf("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a", row)
		}
		for _, llx := range _aecg(current) {
			if _, done := absorbed[rowLly][llx]; done {
				continue
			}
			cell := current[llx]
			if _fgge {
				_efc.Printf(" \u0020\u0020\u0020\u0020\u0076\u0030\u003d\u0025\u0073\u000a", cell.String())
			}
			for lower := row - 1; lower >= 0; lower-- {
				if cell._dbafa {
					break
				}
				below, ok := rowTiles[lower][llx]
				if !ok || below.Urx != cell.Urx {
					break
				}
				cell._dbafa = below._dbafa
				cell.Lly = below.Lly
				if _fgge {
					_efc.Printf("\u0020\u0020\u0020\u0020 \u0020\u0020\u0076\u003d\u0025\u0073\u0020\u0076\u0030\u003d\u0025\u0073\u000a", below.String(), cell.String())
				}
				absorbed[below.Lly][below.Llx] = struct{}{}
			}
			// The lowest row is always treated as closed at the bottom.
			if row == 0 {
				cell._dbafa = true
			}
			if cell.complete() {
				cells[rowLly][llx] = cell
			}
		}
	}

	tiling := gridTiling{PdfRectangle: bounds, _beefa: _eedcg(cells), _eeba: _dgca(cells), _agba: cells}
	tiling.log("\u0043r\u0065\u0061\u0074\u0065\u0064")
	return tiling
}

// augmentGrid splits the list via vertsHorzs and, where the bounding box of
// one orientation sticks out beyond the other by more than _bcae, adds a
// synthetic ruling on that side so that both sets span the same rectangle.
// If either set is empty, or nothing had to be added, the unmodified split is
// returned.
func (_feaab rulingList) augmentGrid() (rulingList, rulingList) {
	verts, horzs := _feaab.vertsHorzs()
	if len(verts) == 0 || len(horzs) == 0 {
		return verts, horzs
	}
	origVerts, origHorzs := verts, horzs
	boxV := verts.bbox()
	boxH := horzs.bbox()
	if _gdeb {
		_ag.Log.Info("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0056\u003d\u0025\u0036\u002e\u0032\u0066", boxV)
		_ag.Log.Info("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0048\u003d\u0025\u0036\u002e\u0032\u0066", boxH)
	}

	if boxH.Llx < boxV.Llx-_bcae {
		left := &ruling{_agff: _gcfgb, _ecfb: _gecdf, _aeef: boxH.Llx, _ggdb: boxV.Lly, _gbca: boxV.Ury}
		verts = append(rulingList{left}, verts...)
	}
	if boxH.Urx > boxV.Urx+_bcae {
		right := &ruling{_agff: _gcfgb, _ecfb: _gecdf, _aeef: boxH.Urx, _ggdb: boxV.Lly, _gbca: boxV.Ury}
		verts = append(verts, right)
	}
	if boxV.Lly < boxH.Lly-_bcae {
		bottom := &ruling{_agff: _gcfgb, _ecfb: _eeg, _aeef: boxV.Lly, _ggdb: boxH.Llx, _gbca: boxH.Urx}
		horzs = append(rulingList{bottom}, horzs...)
	}
	if boxV.Ury > boxH.Ury+_bcae {
		top := &ruling{_agff: _gcfgb, _ecfb: _eeg, _aeef: boxV.Ury, _ggdb: boxH.Llx, _gbca: boxH.Urx}
		horzs = append(horzs, top)
	}
	if len(verts)+len(horzs) == len(_feaab) {
		return origVerts, origHorzs
	}
	// Only used for the debug dump below.
	combined := append(verts, horzs...)
	_feaab.log("u\u006e\u0061\u0075\u0067\u006d\u0065\u006e\u0074\u0065\u0064")
	combined.log("\u0061u\u0067\u006d\u0065\u006e\u0074\u0065d")
	return verts, horzs
}

// log dumps the tiling under the title `_gfdf`. It does nothing unless the
// _fgge debug switch is on.
func (_eddc gridTiling) log(_gfdf string) {
	if !_fgge {
		return
	}
	_ag.Log.Info("\u0074i\u006ci\u006e\u0067\u003a\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0071", len(_eddc._beefa), len(_eddc._eeba), _gfdf)
	_efc.Printf("\u0020\u0020\u0020l\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a", _eddc._beefa)
	_efc.Printf("\u0020\u0020\u0020l\u006c\u0079\u003d\u0025\u002e\u0032\u0066\u000a", _eddc._eeba)
	for rowIdx, lly := range _eddc._eeba {
		row, ok := _eddc._agba[lly]
		if !ok {
			continue
		}
		_efc.Printf("%\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u000a", rowIdx, lly)
		for colIdx, llx := range _eddc._beefa {
			tile, ok := row[llx]
			if !ok {
				continue
			}
			_efc.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", colIdx, tile.String())
		}
	}
}

// establishSubpath returns the most recent subpath, first starting a new one
// via _ggda when lastpointEstablished reports that none is established. It
// clears the _gbee flag, and returns nil if there is still no subpath.
func (_aegf *shapesState) establishSubpath() *subpath {
	point, established := _aegf.lastpointEstablished()
	if !established {
		_aegf._baca = append(_aegf._baca, _ggda(point))
	}
	count := len(_aegf._baca)
	if count == 0 {
		return nil
	}
	_aegf._gbee = false
	return _aegf._baca[count-1]
}

// splitSec sorts the list in place by (_ggdb, _gbca) and partitions it into
// groups: each group is seeded with the first ruling not yet assigned and
// collects, in one pass over the list, every unassigned ruling for which
// alignsSec is true against a ruling already in the group.
//
// An empty list yields nil; the original indexed element 0 unconditionally
// and panicked in that case.
func (_dedcc rulingList) splitSec() []rulingList {
	if len(_dedcc) == 0 {
		return nil
	}
	_e.Slice(_dedcc, func(i, j int) bool {
		a, b := _dedcc[i], _dedcc[j]
		if a._ggdb == b._ggdb {
			return a._gbca < b._gbca
		}
		return a._ggdb < b._ggdb
	})

	assigned := make(map[*ruling]struct{}, len(_dedcc))
	collect := func(seed *ruling) rulingList {
		group := rulingList{seed}
		assigned[seed] = struct{}{}
		for _, cand := range _dedcc {
			if _, taken := assigned[cand]; taken {
				continue
			}
			for _, member := range group {
				if cand.alignsSec(member) {
					group = append(group, cand)
					assigned[cand] = struct{}{}
					break
				}
			}
		}
		return group
	}

	var groups []rulingList
	for _, r := range _dedcc {
		if _, taken := assigned[r]; taken {
			continue
		}
		groups = append(groups, collect(r))
	}
	return groups
}

// writeText writes the paragraph's text to `_decgc`. A paragraph without a
// table (_befe == nil) is written via writeCellText. Otherwise the table is
// emitted row by row: each cell is followed by a space, a missing cell is
// written as a tab, and rows are separated by a newline. Write errors are
// ignored because the method has no way to report them.
func (_acbb *textPara) writeText(_decgc _fc.Writer) {
	table := _acbb._befe
	if table == nil {
		_acbb.writeCellText(_decgc)
		return
	}
	for row := 0; row < table._cegga; row++ {
		for col := 0; col < table._aageb; col++ {
			if cell := table.get(col, row); cell != nil {
				cell.writeCellText(_decgc)
			} else {
				_decgc.Write([]byte("\u0009"))
			}
			_decgc.Write([]byte("\u0020"))
		}
		if row < table._cegga-1 {
			_decgc.Write([]byte("\u000a"))
		}
	}
}

// topoOrder runs a depth-first traversal over the paragraphs, following an
// edge i→j whenever readBefore(ordering, i, j) holds, records each index
// after its successors, and returns that post-order passed through _deeg.
func (_daffdg paraList) topoOrder() []int {
	if _efbd {
		_ag.Log.Info("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a")
	}
	count := len(_daffdg)
	seen := make([]bool, count)
	postOrder := make([]int, 0, count)
	ordering := _daffdg.llyOrdering()

	var visit func(int)
	visit = func(from int) {
		seen[from] = true
		for to := 0; to < count; to++ {
			if !seen[to] && _daffdg.readBefore(ordering, from, to) {
				visit(to)
			}
		}
		postOrder = append(postOrder, from)
	}
	for start := 0; start < count; start++ {
		if !seen[start] {
			visit(start)
		}
	}
	return _deeg(postOrder)
}

// _gdcgf reports whether `_becf` divided by max(_gcffb, `_fcbcf`) is below
// _ebffe.
func _gdcgf(_becf, _fcbcf float64) bool {
	ratio := _becf / _ea.Max(_gcffb, _fcbcf)
	return ratio < _ebffe
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
// `start` and `end` are offsets in the extracted text.
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
func (_cdb *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) {
	if _cdb == nil {
		return nil, _b.New("\u006da\u003d\u003d\u006e\u0069\u006c")
	}
	if end < start {
		return nil, _efc.Errorf("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020", start, end)
	}
	marks := _cdb._aec
	n := len(marks)
	if n == 0 {
		return _cdb, nil
	}

	// Clamp the requested range to the offsets covered by the marks.
	if first := marks[0].Offset; start < first {
		start = first
	}
	if limit := marks[n-1].Offset + 1; end > limit {
		end = limit
	}

	// First mark whose last byte is at or after `start`.
	iStart := _e.Search(n, func(i int) bool { return marks[i].Offset+len(marks[i].Text)-1 >= start })
	if !(0 <= iStart && iStart < n) {
		return nil, _efc.Errorf("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076", start, iStart, n, marks[0], marks[n-1])
	}

	// First mark that starts at or after `end`; this is the exclusive upper
	// bound of the result. sort.Search returns a value in [0, n], and n is a
	// valid slice bound meaning "up to and including the last mark". The
	// previous `iEnd < n` check rejected that case, so any range touching
	// the last mark failed with "Out of range".
	iEnd := _e.Search(n, func(i int) bool { return marks[i].Offset > end-1 })
	if iEnd <= iStart {
		return nil, _efc.Errorf("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064", start, end, iStart, iEnd)
	}
	return &TextMarkArray{_aec: marks[iStart:iEnd]}, nil
}
// Tables returns the tables extracted from the page.
func (_fag PageText) Tables() []TextTable {
	tables := _fag._gacf
	// Debug trace only; active when the _dedc switch is enabled.
	if _dedc {
		_ag.Log.Info("\u0054\u0061\u0062\u006c\u0065\u0073\u003a\u0020\u0025\u0064", len(tables))
	}
	return tables
}
// New returns an Extractor instance for extracting content from the input PDF page.
func New(page *_af.PdfPage) (*Extractor, error) {
	// Same as NewWithOptions with nil options.
	return NewWithOptions(page, nil)
}

// setTextMatrix builds a matrix from the six values in `_fdbg` and stores it
// in both _dbc and _ebc. Any other slice length is logged at debug level and
// leaves the text object unchanged.
func (_bbc *textObject) setTextMatrix(_fdbg []float64) {
	if n := len(_fdbg); n != 6 {
		_ag.Log.Debug("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029", n)
		return
	}
	matrix := _aae.NewMatrix(_fdbg[0], _fdbg[1], _fdbg[2], _fdbg[3], _fdbg[4], _fdbg[5])
	_bbc._dbc = matrix
	_bbc._ebc = matrix
}
// NewFromContents creates a new extractor from contents and page resources.
func NewFromContents(contents string, resources *_af.PdfPageResources) (*Extractor, error) {
	// Usage key reported to the license tracker.
	const usageKey = "\u0065x\u0074\u0072\u0061\u0063t\u006f\u0072\u002e\u004e\u0065w\u0046r\u006fm\u0043\u006f\u006e\u0074\u0065\u006e\u0074s"

	ex := new(Extractor)
	ex._fgb = contents
	ex._cf = resources
	// Both maps start out empty but non-nil.
	ex._fgf = map[string]fontEntry{}
	ex._ec = map[string]textResult{}

	_d.TrackUse(usageKey)
	return ex, nil
}
// String returns a human readable description of `ss`.
func (_dbdf *shapesState) String() string {
	// Reports the number of subpaths and the _gbee flag.
	subpaths := len(_dbdf._baca)
	return _efc.Sprintf("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d", subpaths, _dbdf._gbee)
}

// computeViews rebuilds the derived views of the page from its paragraphs:
// the text (_ffb), the text marks (_gdbf) and the tables (_gacf).
func (_gee *PageText) computeViews() {
	paras := _gee.getParagraphs()

	var buf _ae.Buffer
	paras.writeText(&buf)

	_gee._ffb = buf.String()
	_gee._gdbf = paras.toTextMarks()
	_gee._gacf = paras.tables()
	if _dedc {
		_ag.Log.Info("\u0063\u006f\u006dpu\u0074\u0065\u0056\u0069\u0065\u0077\u0073\u003a\u0020\u0074\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064", len(_gee._gacf))
	}
}

// taken reports whether the paragraph is nil or has its _fcdcf flag set.
func (_acda *textPara) taken() bool {
	if _acda == nil {
		return true
	}
	return _acda._fcdcf
}

// _gccad builds a line ruling from `_ccfa` to `_eabg` with colour `_accg`.
// It returns (nil, false) when _fdag classifies the segment as _ceag;
// otherwise it returns whatever asRuling yields.
func _gccad(_ccfa, _eabg _aae.Point, _accg _fe.Color) (*ruling, bool) {
	line := lineRuling{_bbee: _ccfa, _efge: _eabg, _faab: _fdag(_ccfa, _eabg), Color: _accg}
	if line._faab != _ceag {
		return line.asRuling()
	}
	return nil, false
}

// String describes the cell by its rectangle, its paragraph count and the
// merged paragraph text passed through _efcca with the argument 50. The text
// part is empty when the cell holds no paragraphs.
func (_gdad compositeCell) String() string {
	var preview string
	if n := len(_gdad.paraList); n > 0 {
		preview = _efcca(_gdad.paraList.merge().text(), 50)
	}
	return _efc.Sprintf("\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0070\u0061\u0072a\u0073\u0020\u0025\u0071", _gdad.PdfRectangle, len(_gdad.paraList), preview)
}
// ApplyArea processes the page text only within the specified area `bbox`.
// Each time ApplyArea is called, it updates the result set in `pt`.
// Can be called multiple times in a row with different bounding boxes.
func (_fgfe *PageText) ApplyArea(bbox _af.PdfRectangle) {
	// Keep only the marks that _bdcb accepts for `bbox`.
	inArea := make([]*textMark, 0, len(_fgfe._fecaa))
	for _, mark := range _fgfe._fecaa {
		if _bdcb(mark.bbox(), bbox) {
			inArea = append(inArea, mark)
		}
	}

	// Build paragraphs separately for each of the orientation values
	// 0, 90, 180 and 270, stopping early once every mark has been consumed.
	var paras paraList
	remaining := len(inArea)
	for angle := 0; angle < 360 && remaining > 0; angle += 90 {
		// At most `remaining` marks can still match. The original capacity
		// hint was len(inArea)-remaining, which is 0 on the first pass.
		oriented := make([]*textMark, 0, remaining)
		for _, mark := range inArea {
			if mark._ddfdb == angle {
				oriented = append(oriented, mark)
			}
		}
		if len(oriented) == 0 {
			continue
		}
		paras = append(paras, _bdfg(oriented, _fgfe._cdf, nil, nil, _fgfe._gbg._dbed)...)
		remaining -= len(oriented)
	}

	// Refresh the derived views from the filtered paragraphs.
	buf := new(_ae.Buffer)
	paras.writeText(buf)
	_fgfe._ffb = buf.String()
	_fgfe._gdbf = paras.toTextMarks()
	_fgfe._gacf = paras.tables()
}

// absorb merges `_baaac` into the receiver: the rectangle becomes the result
// of _cfab on both rectangles and the _ffcd elements of `_baaac` are appended.
func (_agdae *textWord) absorb(_baaac *textWord) {
	_agdae.PdfRectangle = _cfab(_agdae.PdfRectangle, _baaac.PdfRectangle)
	_agdae._ffcd = append(_agdae._ffcd, _baaac._ffcd...)
}

// findTableGrid tries to populate a table from the tiles of `_afbbf`, placing
// into every tile the paragraphs that inTile reports for it.
//
// It returns (nil, nil) when more tiles are empty than the share
// (1-_ccfc) of the grid allows, or when every cell in row 0 is non-nil and
// has its _bdgc flag set. Otherwise it returns the table after reduceTiling
// and subdivide, plus the set of paragraphs that were placed into tiles.
func (_efcg paraList) findTableGrid(_afbbf gridTiling) (*textTable, map[*textPara]struct{}) {
	width := len(_afbbf._beefa)
	height := len(_afbbf._eeba)
	table := textTable{_caagg: true, _aageb: width, _cegga: height, _dgcf: make(map[uint64]*textPara, width*height), _becfc: make(map[uint64]compositeCell, width*height)}
	table.PdfRectangle = _afbbf.PdfRectangle
	used := make(map[*textPara]struct{})
	maxEmpty := int((1.0 - _ccfc) * float64(width*height))
	numEmpty := 0
	if _fgge {
		_ag.Log.Info("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0064\u0020\u0078\u0020%\u0064", width, height)
	}

	for y, lly := range _afbbf._eeba {
		row, ok := _afbbf._agba[lly]
		if !ok {
			continue
		}
		for x, llx := range _afbbf._beefa {
			tile, ok := row[llx]
			if !ok {
				continue
			}
			contents := _efcg.inTile(tile)
			if len(contents) == 0 {
				numEmpty++
				if numEmpty > maxEmpty {
					if _fgge {
						_ag.Log.Info("\u0021\u006e\u0075m\u0045\u006d\u0070\u0074\u0079\u003d\u0025\u0064", numEmpty)
					}
					return nil, nil
				}
				continue
			}
			table.putComposite(x, y, contents, tile.PdfRectangle)
			for _, para := range contents {
				used[para] = struct{}{}
			}
		}
	}

	// Count the columns whose row-0 cell is missing or lacks the _bdgc flag.
	count := 0
	for x := 0; x < width; x++ {
		if cell := table.get(x, 0); cell == nil || !cell._bdgc {
			count++
		}
	}
	if count == 0 {
		if _fgge {
			_ag.Log.Info("\u0021\u006e\u0075m\u0048\u0065\u0061\u0064\u0065\u0072\u003d\u0030")
		}
		return nil, nil
	}

	result := table.reduceTiling(_afbbf, _bfad)
	result = result.subdivide()
	return result, used
}

// _dcbd builds a 2x2 table with `_cdae` at (0,0), `_gecb` at (1,0),
// `_fafeb` at (0,1) and `_cede` at (1,1).
func _dcbd(_cdae, _gecb, _fafeb, _cede *textPara) *textTable {
	table := &textTable{_aageb: 2, _cegga: 2, _dgcf: make(map[uint64]*textPara, 4)}
	table.put(0, 0, _cdae)
	table.put(1, 0, _gecb)
	table.put(0, 1, _fafeb)
	table.put(1, 1, _cede)
	return table
}

// _ebfg returns the larger of the two integers.
func _ebfg(_fead, _beced int) int {
	if _beced >= _fead {
		return _beced
	}
	return _fead
}

// _gded is a meta text mark with the text "[X]", a single space as original
// text and white fill and stroke colours.
var _gded = TextMark{Text: "\u005b\u0058\u005d", Original: "\u0020", Meta: true, FillColor: _fe.White, StrokeColor: _fe.White}
// newTablePara wraps the table in a new paragraph whose rectangle and _gbgbb
// field are both set to the table's computed bounding box.
func (_bgaeg *textTable) newTablePara() *textPara {
	bbox := _bgaeg.computeBbox()
	para := new(textPara)
	para.PdfRectangle = bbox
	para._gbgbb = bbox
	para._befe = _bgaeg
	if _dedc {
		_ag.Log.Info("\u006e\u0065w\u0054\u0061\u0062l\u0065\u0050\u0061\u0072\u0061\u003a\u0020\u0025\u0073", para)
	}
	return para
}
// PageText represents the layout of text on a device page.
type PageText struct {
	_fecaa []*textMark // marks filtered by ApplyArea
	_ffb   string      // text view, rebuilt by computeViews/ApplyArea
	_gdbf  []TextMark  // text-mark view
	_gacf  []TextTable // table view, returned by Tables
	_cdf   _af.PdfRectangle
	_gggf  []pathSection
	_afbg  []pathSection
	_cgad  *_gf.PdfObject
	_dfc   _gf.PdfObject
	_cfa   *_aa.ContentStreamOperations
	_gbg   PageTextOptions
}

// Compile-time debug switches. All are off; the derived ones follow their
// parent switch (_fbeb, _dedc or _gdeb).
const (
	_cffe  = false
	_aebe  = false
	_egbg  = false
	_gfgca = false
	_cece  = false
	_bgbd  = false
	_cfgf  = false
	_efbd  = false
	_fbeb  = false
	_fdfb  = _fbeb && true
	_aedg  = _fdfb && false
	_gbgbd = _fbeb && true
	_dedc  = false
	_gddf  = _dedc && false
	_edge  = _dedc && true
	_gdeb  = false
	_edebg = _gdeb && false
	_gcbf  = _gdeb && false
	_fgge  = _gdeb && true
	_fccf  = _gdeb && false
	_bgeg  = _gdeb && false
)

// compositeCell is a rectangle together with the paragraphs assigned to it.
type compositeCell struct {
	_af.PdfRectangle
	paraList
}

// toGrids groups the rulings into connected sets and returns those sets that
// isActualGrid accepts, each passed through snapToGroups.
//
// Rulings are visited in the order produced by _bebec (more connections
// first, ties broken by comp). Each ruling joins the first existing group
// containing a ruling it is connected to, or starts a new group. Groups are
// then ordered by size (stable, largest first) and sorted internally by comp.
//
// nil is returned when the ordering is empty; the original indexed its first
// element unconditionally and panicked in that case.
func (_aefa rulingList) toGrids() []rulingList {
	if _gdeb {
		_ag.Log.Info("t\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0073", _aefa)
	}
	intersects := _aefa.intersections()
	if _gdeb {
		_ag.Log.Info("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020", len(_aefa), len(intersects))
		for _, key := range _adca(intersects) {
			_efc.Printf("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n", key, intersects[key])
		}
	}

	// connects[i] is the set of rulings connected to ruling i; rulings
	// without connections have no entry.
	connects := make(map[int]intSet, len(_aefa))
	for i := range _aefa {
		if set := _aefa.connections(intersects, i); len(set) > 0 {
			connects[i] = set
		}
	}
	if _gdeb {
		_ag.Log.Info("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064", len(connects))
		for _, key := range _adca(connects) {
			_efc.Printf("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n", key, connects[key])
		}
	}

	ordering := _bebec(len(_aefa), func(a, b int) bool {
		na, nb := len(connects[a]), len(connects[b])
		if na == nb {
			return _aefa.comp(a, b)
		}
		return na > nb
	})
	if _gdeb {
		_ag.Log.Info("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076", ordering)
	}
	// Guard: nothing to group.
	if len(ordering) == 0 {
		return nil
	}

	groups := [][]int{{ordering[0]}}
nextRuling:
	for _, idx := range ordering[1:] {
		for g, members := range groups {
			for _, member := range members {
				if connects[member].has(idx) {
					groups[g] = append(members, idx)
					continue nextRuling
				}
			}
		}
		groups = append(groups, []int{idx})
	}
	if _gdeb {
		_ag.Log.Info("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076", groups)
	}

	_e.SliceStable(groups, func(i, j int) bool { return len(groups[i]) > len(groups[j]) })
	for _, members := range groups {
		_e.Slice(members, func(i, j int) bool { return _aefa.comp(members[i], members[j]) })
	}

	// Translate index groups back into ruling lists.
	candidates := make([]rulingList, len(groups))
	for g, members := range groups {
		list := make(rulingList, len(members))
		for k, idx := range members {
			list[k] = _aefa[idx]
		}
		candidates[g] = list
	}
	if _gdeb {
		_ag.Log.Info("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020g\u0072\u0069d\u0073\u003d\u0025\u002b\u0076", candidates)
	}

	var grids []rulingList
	for _, cand := range candidates {
		grid, ok := cand.isActualGrid()
		if !ok {
			continue
		}
		grids = append(grids, grid.snapToGroups())
	}
	if _gdeb {
		_acgbd("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0061\u0063t\u0075\u0061\u006c\u0047ri\u0064\u0073", grids)
		_ag.Log.Info("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064", len(candidates), len(grids))
	}
	return grids
}

// text returns everything writeText emits for the paragraph, as a string.
func (_degd *textPara) text() string {
	buf := new(_ae.Buffer)
	_degd.writeText(buf)
	return buf.String()
}

// merge combines the rulings into one: _aeef is the mean of all _aeef values,
// _ggdb the minimum and _gbca the maximum; _ecfb, _agff and Color are taken
// from the first ruling. The list must not be empty.
func (_abeb rulingList) merge() *ruling {
	head := _abeb[0]
	sum := head._aeef
	lo := head._ggdb
	hi := head._gbca
	for _, r := range _abeb[1:] {
		sum += r._aeef
		if r._ggdb < lo {
			lo = r._ggdb
		}
		if r._gbca > hi {
			hi = r._gbca
		}
	}
	merged := &ruling{_ecfb: head._ecfb, _agff: head._agff, Color: head.Color, _aeef: sum / float64(len(_abeb)), _ggdb: lo, _gbca: hi}
	if _gcbf {
		_ag.Log.Info("\u006de\u0072g\u0065\u003a\u0020\u0025\u0032d\u0020\u0076e\u0063\u0073\u0020\u0025\u0073", len(_abeb), merged)
		for i, r := range _abeb {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, r)
		}
	}
	return merged
}

// fontsize returns the _afeb value of the paragraph's first line. The
// paragraph must contain at least one line.
func (_gbae *textPara) fontsize() float64 {
	return _gbae._aage[0]._afeb
}

// pop removes the top state from the stack and returns a pointer to a copy
// of it, or nil when the stack is empty.
func (_fefd *stateStack) pop() *textState {
	if _fefd.empty() {
		return nil
	}
	stack := *_fefd
	top := len(stack) - 1
	state := *stack[top]
	*_fefd = stack[:top]
	return &state
}

// _bgfgg reports whether both points have identical X and Y coordinates.
func _bgfgg(_gbaa, _baaa _aae.Point) bool {
	return _gbaa.X == _baaa.X && _gbaa.Y == _baaa.Y
}

// last returns the final point of the subpath, which must not be empty.
func (_cdbg *subpath) last() _aae.Point {
	points := _cdbg._aaebg
	return points[len(points)-1]
}
// list builds the list structures for the paragraphs in `_gccc`.
// It gathers the list-item lines reported by each paragraph together with all of the
// paragraphs' lines and hands both collections to the list helpers.
func (_gccc paraList) list() []*list {
	var (
		itemLines []*textLine
		allLines  []*textLine
	)
	for _, para := range _gccc {
		itemLines = append(itemLines, para.getListLines()...)
		allLines = append(allLines, para._aage...)
	}
	grouped := _dfga(itemLines)
	return _caegg(allLines, grouped)
}

// extractXObjectImage records an ImageMark for the image XObject named `_ebg` in the resources
// `_aeg`, positioned according to the CTM of graphics state `_aeb`.
//
// Decoded images are cached per XObject stream. On a cache miss the image is decoded and, when
// an explicit mask (Mask) or otherwise a soft mask (SMask) can be obtained, combined with it.
// A mask that cannot be read is only logged; the unmasked image is then used.
// Nil is returned without recording anything when the XObject or its image cannot be found.
func (_ebfa *imageExtractContext) extractXObjectImage(_ebg *_gf.PdfObjectName, _aeb _aa.GraphicsState, _aeg *_af.PdfPageResources) error {
	stream, _ := _aeg.GetXObjectByName(*_ebg)
	if stream == nil {
		return nil
	}

	cached, found := _ebfa._ebf[stream]
	if !found {
		ximg, err := _aeg.GetXObjectImageByName(*_ebg)
		if err != nil {
			return err
		}
		if ximg == nil {
			return nil
		}
		img, err := ximg.ToImage()
		if err != nil {
			return err
		}

		// The explicit mask takes precedence; the soft mask is only consulted without one.
		var mask _ef.Image
		switch {
		case ximg.Mask != nil:
			mask, err = _edfff(ximg.Mask, _fe.Opaque)
			if err != nil {
				_ag.Log.Debug("\u0057\u0041\u0052\u004e\u003a \u0063\u006f\u0075\u006c\u0064 \u006eo\u0074\u0020\u0067\u0065\u0074\u0020\u0065\u0078\u0070\u006c\u0069\u0063\u0069\u0074\u0020\u0069\u006d\u0061\u0067e\u0020\u006d\u0061\u0073\u006b\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e")
			}
		case ximg.SMask != nil:
			mask, err = _ebdca(ximg.SMask, _fe.Opaque)
			if err != nil {
				_ag.Log.Debug("W\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0067\u0065\u0074\u0020\u0073\u006f\u0066\u0074\u0020\u0069\u006da\u0067e\u0020\u006d\u0061\u0073k\u002e\u0020O\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063\u0074\u002e")
			}
		}

		if mask != nil {
			goImg, err := img.ToGoImage()
			if err != nil {
				return err
			}
			goImg = _fabfa(goImg, mask)
			// Rebuild the model image from the masked Go image, keeping gray output for
			// gray and indexed colorspaces.
			switch ximg.ColorSpace.String() {
			case "\u0044\u0065\u0076\u0069\u0063\u0065\u0047\u0072\u0061\u0079", "\u0049n\u0064\u0065\u0078\u0065\u0064":
				img, err = _af.ImageHandling.NewGrayImageFromGoImage(goImg)
			default:
				img, err = _af.ImageHandling.NewImageFromGoImage(goImg)
			}
			if err != nil {
				return err
			}
		}

		cached = &cachedImage{_aag: img, _gbb: ximg.ColorSpace}
		_ebfa._ebf[stream] = cached
	}

	rgbImg, err := cached._gbb.ImageToRGB(*cached._aag)
	if err != nil {
		return err
	}
	_ag.Log.Debug("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073", _aeb.CTM.String())

	mark := ImageMark{
		Image:  &rgbImg,
		Width:  _aeb.CTM.ScalingFactorX(),
		Height: _aeb.CTM.ScalingFactorY(),
		Angle:  _aeb.CTM.Angle(),
	}
	mark.X, mark.Y = _aeb.CTM.Translation()
	_ebfa._cgg = append(_ebfa._cgg, mark)
	_ebfa._fcc++
	return nil
}
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// TextMark represents extracted text on a page with information regarding both textual content,
// formatting (font and size) and positioning.
// It is the smallest unit of text on a PDF page, typically a single character.
//
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
// `bbox` of substring `term` in `text`.
//
// ex, _ := New(page)
// // handle errors
// pageText, _, _, err := ex.ExtractPageText()
// // handle errors
// text := pageText.Text()
// textMarks := pageText.Marks()
//
// start := strings.Index(text, term)
// end := start + len(term)
// spanMarks, err := textMarks.RangeOffset(start, end)
// // handle errors
// bbox, ok := spanMarks.BBox()
// // handle errors
type TextMark struct {
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Text is the extracted text.
Text string ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Original is the text in the PDF. It has not been decoded like `Text`.
Original string ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// BBox is the bounding box of the text.
2024-05-29 17:04:37 +00:00
BBox _af . PdfRectangle ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Font is the font the text was drawn with.
2024-05-29 17:04:37 +00:00
Font * _af . PdfFont ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// FontSize is the font size the text was drawn with.
FontSize float64 ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
// text, textMarks := pageText.Text(), pageText.Marks()
// marks := textMarks.Elements()
// then marks[i].Offset is the offset of marks[i].Text in text.
Offset int ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
Meta bool ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// FillColor is the fill color of the text.
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
2024-05-29 17:04:37 +00:00
FillColor _fe . Color ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// StrokeColor is the stroke color of the text.
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
2024-05-29 17:04:37 +00:00
StrokeColor _fe . Color ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// Orientation is the text orientation.
Orientation int ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// DirectObject is the underlying PdfObject (Text Object) that represents the visible text. It is exposed to give
// simple access to the Text Object in case editing or replacement of some text is needed, e.g. during redaction.
2024-05-29 17:04:37 +00:00
DirectObject _gf . PdfObject ;
2023-09-07 17:40:17 +00:00
2023-10-07 13:58:01 +00:00
// ObjString is a decoded string operand of a text-showing operator. It has the same value as the `Text` attribute except
// when many glyphs are represented by the same Text Object, whose string operand holds several characters; in that case
// ObjString spans more than one character string, and those strings fall in different TextMark objects.
2024-05-29 17:04:37 +00:00
ObjString [ ] string ;
// Tw, Th and Tc are copied from the internal text mark by ToTextMark.
// NOTE(review): presumably the PDF text-state word spacing, horizontal scaling and character
// spacing in effect when the text was drawn — confirm against the code that creates text marks.
Tw float64 ; Th float64 ; Tc float64 ;
// Index is copied from the internal text mark by ToTextMark.
// NOTE(review): presumably the position of this mark's string within ObjString — confirm.
Index int ;
// _bacg gates TableInfo: when it is false TableInfo returns (nil, nil); when it is true
// TableInfo reports the table stored in _efce.
_bacg bool ; _efce * TextTable ; } ;
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// ToTextMark converts the internal text mark into the exported TextMark representation.
// Only the fields listed below are populated; every other TextMark field keeps its zero value.
func (_acba *textMark) ToTextMark() TextMark {
	var out TextMark

	// Textual content.
	out.Text = _acba._cgeb
	out.Original = _acba._faaf
	out.ObjString = _acba._afcb
	out.Index = _acba._feg

	// Geometry and font.
	out.BBox = _acba._fbaa
	out.Font = _acba._dbgfg
	out.FontSize = _acba._acaf
	out.Orientation = _acba._ddfdb

	// Colors and the originating PDF object.
	out.FillColor = _acba._ccbff
	out.StrokeColor = _acba._bcfc
	out.DirectObject = _acba._bae

	// Text-state values carried over from the mark.
	out.Tw = _acba.Tw
	out.Th = _acba.Th
	out.Tc = _acba._bdeb

	return out
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// TableInfo gets table information of the textmark `tm`.
// It returns (nil, nil) when the mark is not flagged as belonging to a table. Otherwise it
// returns the mark's TextTable together with the result of the table's getCellInfo for this mark.
// NOTE(review): the table pointer is used without a nil check once the flag is set — confirm
// that the flag is only ever set together with the table.
func ( _bgbe * TextMark ) TableInfo ( ) ( * TextTable , [ ] [ ] int ) { if ! _bgbe . _bacg { return nil , nil ; } ; _ebcc := _bgbe . _efce ; _edfa := _ebcc . getCellInfo ( * _bgbe ) ; return _ebcc , _edfa ; } ;
// makeRemovals returns a map holding one empty word set for every depth index currently
// present in the bag.
func ( _bgea * wordBag ) makeRemovals ( ) map [ int ] map [ * textWord ] struct { } { _agegg := make ( map [ int ] map [ * textWord ] struct { } , len ( _bgea . _cdbc ) ) ;
for _gec := range _bgea . _cdbc { _agegg [ _gec ] = make ( map [ * textWord ] struct { } ) ; } ; return _agegg ; } ;
// cachedImage pairs a decoded image with the colorspace it is expressed in.
type cachedImage struct { _aag * _af . Image ; _gbb _af . PdfColorspace ; } ;
// lines returns the lines of all the paragraphs in the list, concatenated in list order.
func ( _gafcd paraList ) lines ( ) [ ] * textLine { var _aebf [ ] * textLine ; for _ , _cfca := range _gafcd { _aebf = append ( _aebf , _cfca . _aage ... ) ;
} ; return _aebf ; } ;
// arrangeText moves the words of the bag into text lines and returns them as one paragraph.
// It returns nil when no line could be built.
//
// For every depth index, while that index still holds words, a new line is started from the
// first word in reading order and is then extended one word at a time with words taken from
// the depth band around that first word.
func ( _fdedg * wordBag ) arrangeText ( ) * textPara { _fdedg . sort ( ) ; if _deeb { _fdedg . removeDuplicates ( ) ; } ; var _fcef [ ] * textLine ; for _ , _afaf := range _fdedg . depthIndexes ( ) { for ! _fdedg . empty ( _afaf ) { _acc := _fdedg . firstReadingIndex ( _afaf ) ;
// The depth band and both gap limits scale with the font size of the line's first word.
_deafb := _fdedg . firstWord ( _acc ) ; _ccedc := _gfgde ( _fdedg , _acc ) ; _bbga := _deafb . _abcc ; _deebg := _deafb . _accb - _egeb * _bbga ; _agda := _deafb . _accb + _egeb * _bbga ; _adge := _geac * _bbga ; _aeeg := _efbf * _bbga ;
// Each pass of the labelled loop picks at most one word to add to the line.
_ffee : for { var _fdgbf * textWord ; _edaa := 0 ;
for _ , _eaec := range _fdedg . depthBand ( _deebg , _agda ) { _dbbd := _fdedg . highestWord ( _eaec , _deebg , _agda ) ; if _dbbd == nil { continue ; } ;
// Gap between the candidate and the current last word of the line: a value below the negative
// limit ends the whole line, a value above the positive limit only rejects this candidate.
_dfcc := _egec ( _dbbd , _ccedc . _cfcb [ len ( _ccedc . _cfcb ) - 1 ] ) ; if _dfcc < - _aeeg { break _ffee ; } ; if _dfcc > _adge { continue ;
} ;
// Keep the candidate only if it compares strictly before the best one found so far.
if _fdgbf != nil && _fdbb ( _dbbd , _fdgbf ) >= 0 { continue ; } ; _fdgbf = _dbbd ; _edaa = _eaec ; } ; if _fdgbf == nil { break ; } ; _ccedc . pullWord ( _fdedg , _fdgbf , _edaa ) ; } ; _ccedc . markWordBoundaries ( ) ; _fcef = append ( _fcef , _ccedc ) ; } ; } ; if len ( _fcef ) == 0 { return nil ;
} ;
// Order the lines with the package comparison function before building the paragraph.
_e . Slice ( _fcef , func ( _dbf , _dbec int ) bool { return _bcbe ( _fcef [ _dbf ] , _fcef [ _dbec ] ) < 0 } ) ; _gecd := _adbde ( _fdedg . PdfRectangle , _fcef ) ;
// Optional debug dump of the paragraph, its lines, their words and the words' marks.
if _fbeb { _ag . Log . Info ( "\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073" , _gecd . String ( ) ) ;
if _fdfb { for _bbac , _gabgd := range _gecd . _aage { _efc . Printf ( "\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a" , _bbac , _gabgd . String ( ) ) ; if _aedg { for _fefb , _fafe := range _gabgd . _cfcb { _efc . Printf ( "\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a" , _fefb , _fafe . String ( ) ) ;
for _gdbc , _cfcba := range _fafe . _ffcd { _efc . Printf ( "\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n" , _gdbc , _cfcba . String ( ) ) ; } ; } ; } ; } ; } ; } ; return _gecd ; } ;
// appendMark adds the mark `_agbbg` to the word. The word's bounding box is extended to cover
// the mark, its font size is raised to the mark's when that is larger, and its depth is
// recomputed as the top of `_cgaeb` minus the bottom of the word's bounding box.
func ( _abbcc * textWord ) appendMark ( _agbbg * textMark , _cgaeb _af . PdfRectangle ) { _abbcc . _ffcd = append ( _abbcc . _ffcd , _agbbg ) ;
_abbcc . PdfRectangle = _cfab ( _abbcc . PdfRectangle , _agbbg . PdfRectangle ) ; if _agbbg . _acaf > _abbcc . _abcc { _abbcc . _abcc = _agbbg . _acaf ; } ; _abbcc . _accb = _cgaeb . Ury - _abbcc . PdfRectangle . Lly ; } ;
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// PageTextOptions holds various options available in the extraction process.
// Both settings are unexported and are read and written elsewhere in the package.
type PageTextOptions struct {
	_gcgg bool
	_dbed bool
}

// _eaae creates a text word from the marks `_cegaa`, which must not be empty.
// The word's bounding box is the union of the marks' boxes, its font size is the largest mark
// font size, and its depth is the top of `_ccee` minus the bottom of the word's bounding box.
func _eaae(_cegaa []*textMark, _ccee _af.PdfRectangle) *textWord {
	first := _cegaa[0]
	bbox, size := first.PdfRectangle, first._acaf
	for i := 1; i < len(_cegaa); i++ {
		mark := _cegaa[i]
		bbox = _cfab(bbox, mark.PdfRectangle)
		if size < mark._acaf {
			size = mark._acaf
		}
	}
	word := &textWord{PdfRectangle: bbox, _ffcd: _cegaa, _abcc: size}
	word._accb = _ccee.Ury - bbox.Lly
	return word
}

// top returns the state at the top of the stack without removing it, or nil when the stack
// is empty.
func (_bgfd *stateStack) top() *textState {
	if _bgfd.empty() {
		return nil
	}
	last := _bgfd.size() - 1
	return (*_bgfd)[last]
}

// getDown returns the linked paragraphs directly below the last row of the table, one per
// column. It returns nil when any of them is already taken, or when two neighbouring ones are
// not linked to each other through their sideways link.
func (_afgee *textTable) getDown() paraList {
	below := make(paraList, _afgee._aageb)
	lastRow := _afgee._cegga - 1
	for col := range below {
		candidate := _afgee.get(col, lastRow)._cabda
		if candidate.taken() {
			return nil
		}
		below[col] = candidate
	}
	// Every paragraph must be linked to its successor in the row.
	for col := 1; col < len(below); col++ {
		if below[col-1]._aggd != below[col] {
			return nil
		}
	}
	return below
}

// markCells flags every non-nil cell of the table as being part of a table.
func (_dfag *textTable) markCells() {
	for row := 0; row < _dfag._cegga; row++ {
		for col := 0; col < _dfag._aageb; col++ {
			if cell := _dfag.get(col, row); cell != nil {
				cell._fcdcf = true
			}
		}
	}
}
2024-04-30 12:24:05 +00:00
// ImageExtractOptions contains options for controlling image extraction from
// PDF pages.
2024-05-29 17:04:37 +00:00
type ImageExtractOptions struct {
	// IncludeInlineStencilMasks is read by the image extraction code.
	// NOTE(review): presumably enables extraction of inline stencil masks — confirm at the use site.
	IncludeInlineStencilMasks bool
}

// complete reports whether every tile of the tiling is complete.
func (_agbea gridTiling) complete() bool {
	for _, column := range _agbea._agba {
		for _, tile := range column {
			if tile.complete() {
				continue
			}
			return false
		}
	}
	return true
}

// _ffadg returns the keys of `_afeg` in increasing order. The result is never nil.
func _ffadg(_afeg map[float64][]*textLine) []float64 {
	keys := make([]float64, 0, len(_afeg))
	for key := range _afeg {
		keys = append(keys, key)
	}
	_e.Float64s(keys)
	return keys
}

// llyOrdering returns the indexes of the paragraphs ordered by increasing Lly.
// Paragraphs with equal Lly keep their relative order.
func (_deab paraList) llyOrdering() []int {
	order := make([]int, len(_deab))
	for i := range order {
		order[i] = i
	}
	_e.SliceStable(order, func(a, b int) bool {
		return _deab[order[a]].Lly < _deab[order[b]].Lly
	})
	return order
}

// xMean returns the mean of the x coordinates of the ruling's two end points.
func (_bcdd lineRuling) xMean() float64 {
	sum := _bcdd._bbee.X + _bcdd._efge.X
	return 0.5 * sum
}

// _edac is the factor 1/1000 applied to glyph metrics of fonts other than Type3.
const _edac = 1.0 / 1000.0

// _adab reports whether either paragraph has its `_bdgc` flag set or, failing that, whether
// the difference between the two paragraphs' depths passes the `_cdaea` check.
func _adab(_feaag, _babd *textPara) bool {
	return _feaag._bdgc || _babd._bdgc || _cdaea(_feaag.depth()-_babd.depth())
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// ExtractText processes and extracts all text data in content streams and returns as a string.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20> ).
func (_ggg *Extractor) ExtractText() (string, error) {
	text, _, _, err := _ggg.ExtractTextWithStats()
	return text, err
}

// _cfeg converts the PDF color `_bbccb`, expressed in the colorspace `_cdeb`, to a Go color.
// Black is returned when either argument is nil, when the conversion to RGB fails, or when the
// converted color is not a *PdfColorDeviceRGB.
func _cfeg(_cdeb _af.PdfColorspace, _bbccb _af.PdfColor) _fe.Color {
	if _cdeb == nil || _bbccb == nil {
		return _fe.Black
	}
	converted, err := _cdeb.ColorToRGB(_bbccb)
	if err != nil {
		_ag.Log.Debug("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073", _bbccb, _cdeb, err)
		return _fe.Black
	}
	rgb, isRGB := converted.(*_af.PdfColorDeviceRGB)
	if !isRGB {
		_ag.Log.Debug("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076", converted)
		return _fe.Black
	}
	red, green, blue := rgb.R()*255, rgb.G()*255, rgb.B()*255
	return _fe.NRGBA{R: uint8(red), G: uint8(green), B: uint8(blue), A: uint8(255)}
}

// _dgbf returns the lines of `_bcaca` whose `_addd` value is strictly greater than `_eccd`
// and strictly less than `_gefd`. An upper bound of -1 means that there is no upper bound.
func _dgbf(_bcaca []*textLine, _gefd, _eccd float64) []*textLine {
	var kept []*textLine
	unbounded := _gefd == -1
	for _, line := range _bcaca {
		if !(line._addd > _eccd) {
			continue
		}
		if unbounded || line._addd < _gefd {
			kept = append(kept, line)
		}
	}
	return kept
}

// _dgfg returns the intersection of the two rectangles and true when `_bdcb` accepts the pair;
// otherwise it returns the zero rectangle and false.
func _dgfg(_bebg, _egee _af.PdfRectangle) (_af.PdfRectangle, bool) {
	var overlap _af.PdfRectangle
	if !_bdcb(_bebg, _egee) {
		return overlap, false
	}
	overlap.Llx = _ea.Max(_bebg.Llx, _egee.Llx)
	overlap.Urx = _ea.Min(_bebg.Urx, _egee.Urx)
	overlap.Lly = _ea.Max(_bebg.Lly, _egee.Lly)
	overlap.Ury = _ea.Min(_bebg.Ury, _egee.Ury)
	return overlap, true
}

// mergePrimary returns the mean primary coordinate of the rulings in the list, which must not
// be empty.
func (_dfbf rulingList) mergePrimary() float64 {
	total := _dfbf[0]._aeef
	for i := 1; i < len(_dfbf); i++ {
		total += _dfbf[i]._aeef
	}
	return total / float64(len(_dfbf))
}

// sort orders the list in place using the list's own comparison function.
func (_agdc rulingList) sort() {
	_e.Slice(_agdc, _agdc.comp)
}

// connections returns the set of ruling indexes that can be reached from `_ggec` by following
// the links in `_fdaeg`: an index i of the list is linked from j when `_fdaeg[i]` contains j.
func (_ffbc rulingList) connections(_fdaeg map[int]intSet, _ggec int) intSet {
	linked := make(intSet)
	seen := make(intSet)
	var walk func(int)
	walk = func(from int) {
		if seen.has(from) {
			return
		}
		seen.add(from)
		for i := range _ffbc {
			if _fdaeg[i].has(from) {
				linked.add(i)
			}
		}
		for i := range _ffbc {
			if linked.has(i) {
				walk(i)
			}
		}
	}
	walk(_ggec)
	return linked
}

// lineTo adds the point (`_bed`, `_gfcg`) to the shapes state, logging it first when the
// corresponding debug flag is set.
func (_deg *shapesState) lineTo(_bed, _gfcg float64) {
	if _cece {
		_ag.Log.Info("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066", _bed, _gfcg, _deg.devicePoint(_bed, _gfcg))
	}
	_deg.addPoint(_bed, _gfcg)
}

// findPrimSec returns the first ruling whose primary coordinate passes the `_cdaea` check
// against `_edfg` and whose secondary range, widened by `_bcae` at both ends, contains
// `_ebead`. It returns nil when there is no such ruling.
func (_adfeb rulingList) findPrimSec(_edfg, _ebead float64) *ruling {
	for _, candidate := range _adfeb {
		if !_cdaea(candidate._aeef - _edfg) {
			continue
		}
		if candidate._ggdb-_bcae <= _ebead && _ebead <= candidate._gbca+_bcae {
			return candidate
		}
	}
	return nil
}

// rectRuling is a colored rectangle tagged with a ruling kind and a mark kind.
type rectRuling struct {
	_beda  rulingKind
	_fdfdf markKind
	_fe.Color
	_af.PdfRectangle
}

// getCurrentFont returns the font of the current text state. When no font is set, the fact is
// logged and the default font is returned instead.
func (_gfe *textObject) getCurrentFont() *_af.PdfFont {
	if font := _gfe._ecff._fgfgb; font != nil {
		return font
	}
	_ag.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e")
	return _af.DefaultFont()
}

// isActualGrid checks whether the rulings in the list form a grid.
// The list is first augmented into two ruling lists. There must be at least `_fcad`+1 rulings
// in the first and `_bce`+1 in the second. Depending on `_bedg`, either the outermost rulings
// of the two lists must meet at their ends, or each list must be aligned.
// On success the two augmented lists are returned joined together, with true.
func (_geab rulingList) isActualGrid() (rulingList, bool) {
	vecs, horzs := _geab.augmentGrid()
	if len(vecs) < _fcad+1 || len(horzs) < _bce+1 {
		if _gdeb {
			_ag.Log.Info("\u0069s\u0041\u0063t\u0075\u0061\u006c\u0047r\u0069\u0064\u003a \u004e\u006f\u0074\u0020\u0061\u006c\u0069\u0067\u006eed\u002e\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u003c\u0020\u0025d\u0020\u0078 \u0025\u0064", len(vecs), len(horzs), _fcad+1, _bce+1)
		}
		return nil, false
	}

	if _gdeb {
		_ag.Log.Info("\u0069\u0073\u0041\u0063\u0074\u0075a\u006c\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0073\u0020\u003a\u0020\u0025t\u0020\u0026\u0020\u0025\u0074\u0020\u2192 \u0025\u0074", _geab, len(vecs) >= 2, len(horzs) >= 2, len(vecs) >= 2 && len(horzs) >= 2)
		for i, r := range _geab {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a", i, r)
		}
	}

	if _bedg {
		// Compare the outermost rulings of both lists with each other's end points.
		v0, v1 := vecs[0], vecs[len(vecs)-1]
		h0, h1 := horzs[0], horzs[len(horzs)-1]
		cornersMeet := _eaaa(v0._aeef-h0._ggdb) && _eaaa(v1._aeef-h0._gbca) &&
			_eaaa(h0._aeef-v0._gbca) && _eaaa(h1._aeef-v0._ggdb)
		if !cornersMeet {
			if _gdeb {
				_ag.Log.Info("\u0069\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069d\u003a\u0020\u0020N\u006f\u0074 \u0061\u006c\u0069\u0067\u006e\u0065d\u002e\n\t\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0076\u0031\u003d\u0025\u0073\u000a\u0009\u0068\u0030\u003d\u0025\u0073\u000a\u0009\u0068\u0031\u003d\u0025\u0073", v0, v1, h0, h1)
			}
			return nil, false
		}
	} else {
		if !vecs.aligned() {
			if _gcbf {
				_ag.Log.Info("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0076\u0065\u0072\u0074\u0073\u002e\u0020%\u0064", len(vecs))
			}
			return nil, false
		}
		if !horzs.aligned() {
			if _gdeb {
				_ag.Log.Info("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0068\u006f\u0072\u007a\u0073\u002e\u0020%\u0064", len(horzs))
			}
			return nil, false
		}
	}

	return append(vecs, horzs...), true
}

// pathSection is a colored group of subpaths.
type pathSection struct {
	_bgbeg []*subpath
	_fe.Color
}

// _fdfbc passes the absolute y distance and the absolute x distance between the two points,
// in that order, to `_gdcgf` and returns its verdict.
func _fdfbc(_acbgc, _gdada _aae.Point) bool {
	dx := _ea.Abs(_acbgc.X - _gdada.X)
	dy := _ea.Abs(_acbgc.Y - _gdada.Y)
	return _gdcgf(dy, dx)
}

// _ffag creates a text object for the given extractor, page resources, graphics state, text
// state and state stack. Both of its matrices start out as the identity matrix.
func _ffag(_egaf *Extractor, _dgf *_af.PdfPageResources, _bdfe _aa.GraphicsState, _fdbc *textState, _feae *stateStack) *textObject {
	obj := &textObject{
		_dbe:  _egaf,
		_dae:  _dgf,
		_aef:  _bdfe,
		_aega: _feae,
		_ecff: _fdbc,
		_dbc:  _aae.IdentityMatrix(),
		_ebc:  _aae.IdentityMatrix(),
	}
	return obj
}

// del removes `_efefb` from the set.
func (_afdb intSet) del(_efefb int) {
	delete(_afdb, _efefb)
}

// bbox returns the bounding box of the paragraph.
func (_gada *textPara) bbox() _af.PdfRectangle {
	return _gada.PdfRectangle
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a multi-line description of `pt`: a header line, one line per element, and a
// closing line that repeats the header.
func (_gege PageText) String() string {
	header := _efc.Sprintf("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073", len(_gege._fecaa))
	var sb _a.Builder
	sb.WriteString("\u002d" + header)
	for _, view := range _gege._fecaa {
		sb.WriteString("\u000a")
		sb.WriteString(view.String())
	}
	sb.WriteString("\u000a")
	sb.WriteString("\u002b" + header)
	return sb.String()
}

// _fdgf returns the depth index for the depth `_cfgg`: the depth divided by the bin size
// `_cdcb` and truncated, less one when the depth is not >= 0.
func _fdgf(_cfgg float64) int {
	if _cfgg >= 0 {
		return int(_cfgg / _cdcb)
	}
	return int(_cfgg/_cdcb) - 1
}

// _efg creates a word bag from the words `_ebge`, which must not be empty.
// The bag is created from the first word; the remaining words are filed under their depth
// index and the bag's bounding box grows to include them. The bag is sorted before returning.
func _efg(_ebge []*textWord, _ggfcd float64, _bgaag, _aaae rulingList) *wordBag {
	bag := _cgd(_ebge[0], _ggfcd, _bgaag, _aaae)
	for i := 1; i < len(_ebge); i++ {
		word := _ebge[i]
		depthIdx := _fdgf(word._accb)
		bag._cdbc[depthIdx] = append(bag._cdbc[depthIdx], word)
		bag.PdfRectangle = _cfab(bag.PdfRectangle, word.PdfRectangle)
	}
	bag.sort()
	return bag
}

// setCharSpacing stores `_feda` as the character spacing of the text state.
// It does nothing when the text object is nil.
func (_fdcc *textObject) setCharSpacing(_feda float64) {
	if _fdcc == nil {
		return
	}
	state := _fdcc._ecff
	state._fdad = _feda
	if _bgbd {
		_ag.Log.Info("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073", _feda, state.String())
	}
}

// _eedcg returns the distinct inner keys of `_caac` in increasing order.
func _eedcg(_caac map[float64]map[float64]gridTile) []float64 {
	keys := make([]float64, 0, len(_caac))
	seen := make(map[float64]struct{}, len(_caac))
	for _, inner := range _caac {
		for key := range inner {
			if _, dup := seen[key]; !dup {
				keys = append(keys, key)
				seen[key] = struct{}{}
			}
		}
	}
	_e.Float64s(keys)
	return keys
}

// extractTables finds the tables among the paragraphs, using the grid tilings `_bcaf`, and
// returns the paragraph list with the tables applied. Lists with fewer than `_gaba`
// paragraphs are returned unchanged.
func (_dcgea paraList) extractTables(_bcaf []gridTiling) paraList {
	if _dedc {
		_ag.Log.Debug("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d", len(_dcgea))
	}
	if len(_dcgea) < _gaba {
		return _dcgea
	}

	tables := _dcgea.findTables(_bcaf)
	if _dedc {
		_ag.Log.Info("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d", len(tables))
		for i, table := range tables {
			title := _efc.Sprintf("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064", i)
			table.log(title)
		}
	}
	return _dcgea.applyTables(tables)
}

// applyTables returns a new paragraph list that starts with one table paragraph per table in
// `_aafcf`, followed by the paragraphs of the list that are not flagged as table cells.
func (_aeaa paraList) applyTables(_aafcf []*textTable) paraList {
	var result paraList
	for _, table := range _aafcf {
		result = append(result, table.newTablePara())
	}
	for _, para := range _aeaa {
		if !para._fcdcf {
			result = append(result, para)
		}
	}
	return result
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of `ma`: a fixed marker when the array is empty, otherwise the
// number of elements together with the first and the last element.
func (_gace TextMarkArray) String() string {
	marks := _gace._aec
	if len(marks) == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	return _efc.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", len(marks), marks[0], marks[len(marks)-1])
}

// Render mode flags. Each constant is a distinct bit, so the values can be combined.
const (
	// RenderModeStroke has the value 1.
	RenderModeStroke RenderMode = 1 << iota
	// RenderModeFill has the value 2.
	RenderModeFill
	// RenderModeClip has the value 4.
	RenderModeClip
)
2024-04-30 12:24:05 +00:00
// TableCell is a cell in a TextTable.
2024-05-29 17:04:37 +00:00
type TableCell struct { _af . PdfRectangle ;
2024-04-30 12:24:05 +00:00
// Text is the extracted text.
Text string ;
// Marks holds the TextMarks corresponding to the text in Text.
2024-05-29 17:04:37 +00:00
Marks TextMarkArray ; } ;
// _bfeb returns the lines of `_dfbe` whose text contains a match of the package-level pattern
// `_fbgge`. The result is never nil.
func _bfeb ( _dfbe [ ] * textLine ) [ ] * textLine { _fdea := [ ] * textLine { } ; for _ , _fadbe := range _dfbe { _gcd := _fadbe . text ( ) ; _ddfd := _fbgge . Find ( [ ] byte ( _gcd ) ) ; if _ddfd != nil { _fdea = append ( _fdea , _fadbe ) ; } ; } ; return _fdea ; } ;
// Sentinel errors of the package: _ca reads "type check error" and _eb reads "range check error".
var ( _ca = _b . New ( "\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072" ) ;
_eb = _b . New ( "\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072" ) ; ) ;
// _ebgf reports whether the left edge of `_gedd` lies at or beyond the right edge of the bag
// `_ccdf` and less than `_cffd` past it.
func _ebgf ( _ccdf * wordBag , _gedd * textWord , _cffd float64 ) bool { return _ccdf . Urx <= _gedd . Llx && _gedd . Llx < _ccdf . Urx + _cffd ; } ;
// pullWord adds `_dffa` to the bag under the depth index `_fadc`. The bag's bounding box grows
// to cover the word and the bag's font size is raised to the word's when that is larger.
// The word is also recorded in `_cegad[_fadc]`, which must already exist.
func ( _afga * wordBag ) pullWord ( _dffa * textWord , _fadc int , _cegad map [ int ] map [ * textWord ] struct { } ) { _afga . PdfRectangle = _cfab ( _afga . PdfRectangle , _dffa . PdfRectangle ) ;
if _dffa . _abcc > _afga . _cdac { _afga . _cdac = _dffa . _abcc ; } ; _afga . _cdbc [ _fadc ] = append ( _afga . _cdbc [ _fadc ] , _dffa ) ; _cegad [ _fadc ] [ _dffa ] = struct { } { } ; } ;
// renderText processes the string operand `_bbdg` of a text-showing operator: it decodes the
// bytes with the current font, appends one text mark per decoded string to the text object and
// advances the text matrix after each mark.
//
// `_edbc` and `_fac` are passed through unchanged to newTextMark.
// Nil is returned without doing anything when the text object's `_cdcc` flag is set.
// An error is returned when the font has no metrics for one of the character codes.
func ( _cege * textObject ) renderText ( _edbc _gf . PdfObject , _bbdg [ ] byte , _fac int ) error { if _cege . _cdcc { _ag . Log . Debug ( "\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e" ) ;
return nil ; } ;
// Decode the operand into character codes and their strings.
_cfge := _cege . getCurrentFont ( ) ; _ffadc := _cfge . BytesToCharcodes ( _bbdg ) ; _bcd , _acb , _fgbf := _cfge . CharcodesToStrings ( _ffadc ) ; if _fgbf > 0 { _ag . Log . Debug ( "\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064" , _acb , _fgbf ) ;
} ;
// Accumulate the decoded and missed character counts on the text state.
_cege . _ecff . _cfg += _acb ; _cege . _ecff . _dacb += _fgbf ;
// Glyph metrics are scaled by 1/1000, except for fonts whose subtype is "Type3" (scale 1).
_afdf := _cege . _ecff ; _afaa := _afdf . _gbbgg ; _dee := _afdf . _dba / 100.0 ; _eded := _edac ; if _cfge . Subtype ( ) == "\u0054\u0079\u0070e\u0033" { _eded = 1 ; } ;
// Metrics of the space character: by rune, then by character code 32, then from the default font.
_fcfb , _cef := _cfge . GetRuneMetrics ( ' ' ) ; if ! _cef { _fcfb , _cef = _cfge . GetCharMetrics ( 32 ) ;
} ; if ! _cef { _fcfb , _ = _af . DefaultFont ( ) . GetRuneMetrics ( ' ' ) ; } ; _edcg := _fcfb . Wx * _eded ; _ag . Log . Trace ( "\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066" , _edcg , _bcd , _cfge , _afaa ) ;
// Matrix built from the text state: `_afaa`*`_dee` and `_afaa` on the diagonal, with
// `_afdf._dgef` as the vertical offset.
_fccb := _aae . NewMatrix ( _afaa * _dee , 0 , 0 , _afaa , 0 , _afdf . _dgef ) ; if _bgbd { _ag . Log . Info ( "\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071" , len ( _ffadc ) , _ffadc , _bcd ) ;
} ;
// NOTE(review): the last verb of this format is %q but the argument is len(_bcd), an int —
// confirm whether _bcd itself was intended.
_ag . Log . Trace ( "\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071" , len ( _ffadc ) , _ffadc , len ( _bcd ) ) ; _eeaf := _cege . getFillColor ( ) ;
_gge := _cege . getStrokeColor ( ) ;
// One iteration per decoded string; a string consisting of a single NUL rune is skipped.
for _bbb , _bbbe := range _bcd { _adfdf := [ ] rune ( _bbbe ) ; if len ( _adfdf ) == 1 && _adfdf [ 0 ] == '\x00' { continue ; } ; _caa := _ffadc [ _bbb ] ; _dcdg := _cege . _aef . CTM . Mult ( _cege . _dbc ) . Mult ( _fccb ) ;
// The extra spacing `_afdf._febe` is applied only when the string is a single space (rune 32).
_daad := 0.0 ; if len ( _adfdf ) == 1 && _adfdf [ 0 ] == 32 { _daad = _afdf . _febe ;
} ; _gdc , _fbe := _cfge . GetCharMetrics ( _caa ) ; if ! _fbe { _ag . Log . Debug ( "\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073" , _caa , _adfdf , _adfdf , _cfge ) ;
return _efc . Errorf ( "\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064" , _cfge . String ( ) , _caa ) ; } ; _ecfc := _aae . Point { X : _gdc . Wx * _eded , Y : _gdc . Wy * _eded } ;
// Two horizontal displacements: `_edbf` leaves out the character spacing `_afdf._fdad`,
// `_bbcb` includes it.
_edbf := _aae . Point { X : ( _ecfc . X * _afaa + _daad ) * _dee } ; _bbcb := _aae . Point { X : ( _ecfc . X * _afaa + _afdf . _fdad + _daad ) * _dee } ; if _bgbd { _ag . Log . Info ( "\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066" , _afaa , _afdf . _fdad , _afdf . _febe , _dee ) ;
_ag . Log . Info ( "\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f" , _ecfc , _edbf , _bbcb ) ; } ;
// NOTE(review): _add presumably turns a displacement into a translation matrix — confirm.
_abba := _add ( _edbf ) ; _bgdb := _add ( _bbcb ) ; _aaef := _cege . _aef . CTM . Mult ( _cege . _dbc ) . Mult ( _abba ) ;
if _gfgca { _ag . Log . Info ( "e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a" + "\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a" + "\u0009t\u0064\u0030\u003d\u0025s\u000a\u0009\u0020\u0020\u2192 \u0025s\u0020x\u006c\u0061\u0074\u003d\u0025\u0073" , _cege . _aef . CTM , _cege . _dbc , _bgdb , _eaf ( _cege . _aef . CTM . Mult ( _cege . _dbc ) . Mult ( _bgdb ) ) , _abba , _aaef , _eaf ( _aaef ) ) ;
} ;
// A mark that newTextMark rejects is skipped; the text matrix is not advanced in that case.
_cabfb , _cebd := _cege . newTextMark ( _cb . ExpandLigatures ( _adfdf ) , _dcdg , _eaf ( _aaef ) , _ea . Abs ( _edcg * _dcdg . ScalingFactorX ( ) ) , _cfge , _cege . _ecff . _fdad , _eeaf , _gge , _edbc , _bcd , _bbb , _fac ) ; if ! _cebd { _ag . Log . Debug ( "\u0054\u0065\u0078\u0074\u0020\u006d\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069d\u0065 \u0070\u0061\u0067\u0065\u002e\u0020\u0053\u006b\u0069\u0070\u0070\u0069\u006e\u0067" ) ;
continue ; } ;
// NOTE(review): _cfge has already been dereferenced above, so this nil check comes too late to
// guard anything — confirm whether it can be dropped.
if _cfge == nil { _ag . Log . Debug ( "\u0045R\u0052O\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u002e" ) ; } else if _cfge . Encoder ( ) == nil { _ag . Log . Debug ( "E\u0052\u0052\u004f\u0052\u003a\u0020N\u006f\u0020\u0065\u006e\u0063\u006f\u0064\u0069\u006eg\u002e\u0020\u0066o\u006et\u003d\u0025\u0073" , _cfge ) ;
} else {
// Record the rune that the font's encoder maps the character code to as the mark's original text.
if _gfac , _defe := _cfge . Encoder ( ) . CharcodeToRune ( _caa ) ; _defe { _cabfb . _faaf = string ( _gfac ) ; } ; } ; _ag . Log . Trace ( "i\u003d\u0025\u0064\u0020\u0063\u006fd\u0065\u003d\u0025\u0064\u0020\u006d\u0061\u0072\u006b=\u0025\u0073\u0020t\u0072m\u003d\u0025\u0073" , _bbb , _caa , _cabfb , _dcdg ) ;
// Keep the mark and advance the text matrix by the displacement that includes character spacing.
_cege . _afff = append ( _cege . _afff , & _cabfb ) ; _cege . _dbc . Concat ( _bgdb ) ; } ; return nil ; } ;
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of the word: its `_accb` value, its bounding
// rectangle, its font size (`_abcc`) and its quoted text (`_ccbcc`).
func (_bfeg *textWord) String() string {
	return _efc.Sprintf("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022", _bfeg._accb, _bfeg.PdfRectangle, _bfeg._abcc, _bfeg._ccbcc)
}

// _efcca returns `_afed` cut down to its first `_abefc` bytes. Strings that
// are already shorter than `_abefc` are returned unchanged.
func _efcca(_afed string, _abefc int) string {
	if len(_afed) >= _abefc {
		return _afed[:_abefc]
	}
	return _afed
}

// parseStructTreeRoot fills the receiver from the structure tree root object
// `_gced`: the "K" entry is parsed into struct elements and the "Type" entry
// is stored as a string.
//
// A nil `_gced`, or one that does not resolve to a dictionary, leaves the
// receiver untouched. A missing "Type" entry yields an empty type string.
func (_cagg *structTreeRoot) parseStructTreeRoot(_gced _gf.PdfObject) {
	if _gced == nil {
		return
	}
	dict, isDict := _gf.GetDict(_gced)
	if !isDict {
		_ag.Log.Debug("\u0070\u0061\u0072s\u0065\u0053\u0074\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u003a\u0020\u0064\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006eo\u0074\u0020\u0066\u006f\u0075\u006e\u0064\u002e")
		// There is no dictionary to read from; continuing would dereference
		// a nil dictionary.
		return
	}
	kids := dict.Get("\u004b")
	// "Type" may be absent; calling String() on a nil PdfObject would panic.
	var typeName string
	if typeObj := dict.Get("\u0054\u0079\u0070\u0065"); typeObj != nil {
		typeName = typeObj.String()
	}
	var kidArray *_gf.PdfObjectArray
	switch kid := kids.(type) {
	case *_gf.PdfObjectArray:
		kidArray = kid
	case *_gf.PdfObjectReference:
		// A single reference is treated as a one-element array.
		kidArray = _gf.MakeArray(kids)
	}
	// NOTE(review): for any other "K" type kidArray stays nil; this relies on
	// Elements() tolerating a nil receiver, as the original code did.
	elements := []structElement{}
	for _, kidObj := range kidArray.Elements() {
		elem := &structElement{}
		elem.parseStructElement(kidObj)
		elements = append(elements, *elem)
	}
	_cagg._cfbfg = elements
	_cagg._gfbe = typeName
}

// makeRectRuling tries to interpret the first four points of the subpath as
// an axis-aligned rectangle and convert it to a ruling drawn in `_egeda`.
//
// The four sides are classified with `_bcgd`; the path is accepted when it has
// two horizontal and two vertical sides, or two sides of one kind (and none of
// the other) that pass the `_fdfbc` / `_cbcee` check. The second return value
// is false when the path is rejected, when the rectangle's kind is `_ceag`, or
// when `asRuling` fails.
//
// NOTE(review): the subpath must hold at least four points, otherwise the
// `[:4]` slice panics — confirm callers guarantee this.
func (_geae *subpath) makeRectRuling(_egeda _fe.Color) (*ruling, bool) {
	if _fccf {
		_ag.Log.Info("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076", _geae)
	}
	pts := _geae._aaebg[:4]
	kinds := make(map[int]rulingKind, len(pts))
	for i, p := range pts {
		next := _geae._aaebg[(i+1)%4]
		kinds[i] = _bcgd(p, next)
		if _fccf {
			_efc.Printf("\u0025\u0034\u0064: \u0025\u0073\u0020\u003d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066", i, kinds[i], p, next)
		}
	}
	if _fccf {
		_efc.Printf("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a", kinds)
	}

	// Indexes of the sides classified as vertical and horizontal.
	var verts, horzs []int
	for i, kind := range kinds {
		switch kind {
		case _eeg:
			horzs = append(horzs, i)
		case _gecdf:
			verts = append(verts, i)
		}
	}
	if _fccf {
		_efc.Printf("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a", len(horzs), horzs)
		_efc.Printf("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a", len(verts), verts)
	}

	accepted := (len(horzs) == 2 && len(verts) == 2) ||
		(len(horzs) == 2 && len(verts) == 0 && _fdfbc(pts[horzs[0]], pts[horzs[1]])) ||
		(len(verts) == 2 && len(horzs) == 0 && _cbcee(pts[verts[0]], pts[verts[1]]))
	if _fccf {
		_efc.Printf(" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a", len(horzs), len(verts), accepted)
	}
	if !accepted {
		if _fccf {
			_ag.Log.Error("\u0021!\u006d\u0061\u006b\u0065R\u0065\u0063\u0074\u0052\u0075l\u0069n\u0067:\u0020\u0070\u0061\u0074\u0068\u003d\u0025v", _geae)
			_efc.Printf(" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a", len(horzs), len(verts), accepted)
		}
		return &ruling{}, false
	}

	// When only one kind was found, the remaining sides stand in for the other.
	if len(verts) == 0 {
		for i, kind := range kinds {
			if kind != _eeg {
				verts = append(verts, i)
			}
		}
	}
	if len(horzs) == 0 {
		for i, kind := range kinds {
			if kind != _gecdf {
				horzs = append(horzs, i)
			}
		}
	}
	if _fccf {
		_ag.Log.Info("\u006da\u006b\u0065R\u0065\u0063\u0074\u0052u\u006c\u0069\u006eg\u003a\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025d \u0076\u0065\u0072t\u0073\u003d%\u0064\u0020\u0070\u006f\u0069\u006et\u0073\u003d%\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076", len(horzs), len(verts), len(pts), horzs, verts, pts)
	}

	// Pick the points with the larger/smaller Y among the horizontal sides and
	// the larger/smaller X among the vertical sides.
	var xHigh, xLow, yHigh, yLow _aae.Point
	if pts[horzs[0]].Y > pts[horzs[1]].Y {
		yHigh, yLow = pts[horzs[0]], pts[horzs[1]]
	} else {
		yHigh, yLow = pts[horzs[1]], pts[horzs[0]]
	}
	if pts[verts[0]].X > pts[verts[1]].X {
		xHigh, xLow = pts[verts[0]], pts[verts[1]]
	} else {
		xHigh, xLow = pts[verts[1]], pts[verts[0]]
	}
	rect := _af.PdfRectangle{Llx: xHigh.X, Urx: xLow.X, Lly: yLow.Y, Ury: yHigh.Y}
	// Normalise so that Llx <= Urx and Lly <= Ury.
	if rect.Llx > rect.Urx {
		rect.Llx, rect.Urx = rect.Urx, rect.Llx
	}
	if rect.Lly > rect.Ury {
		rect.Lly, rect.Ury = rect.Ury, rect.Lly
	}

	rr := rectRuling{PdfRectangle: rect, _beda: _bbbee(rect), Color: _egeda}
	if rr._beda == _ceag {
		if _fccf {
			_ag.Log.Error("\u006da\u006b\u0065\u0052\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006eg\u003a\u0020\u006b\u0069\u006e\u0064\u003d\u006e\u0069\u006c")
		}
		return nil, false
	}
	result, isRuling := rr.asRuling()
	if !isRuling {
		if _fccf {
			_ag.Log.Error("\u006da\u006b\u0065\u0052\u0065c\u0074\u0052\u0075\u006c\u0069n\u0067:\u0020!\u0069\u0073\u0052\u0075\u006c\u0069\u006eg")
		}
		return nil, false
	}
	if _gdeb {
		_efc.Printf("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a", result.String())
	}
	return result, true
}

// add appends the points `_bccf` to the subpath.
func (_faccf *subpath) add(_bccf ..._aae.Point) {
	_faccf._aaebg = append(_faccf._aaebg, _bccf...)
}

// fill appends a path section built from the current subpaths (`_baca`) and
// the current fill colour to `*_dfd`. When `_gdeb` is set it also prints a
// summary, and with `_edebg` the subpaths up to index 10.
func (_aabb *shapesState) fill(_dfd *[]pathSection) {
	section := pathSection{_bgbeg: _aabb._baca, Color: _aabb._cegf.getFillColor()}
	*_dfd = append(*_dfd, section)
	if !_gdeb {
		return
	}
	box := section.bbox()
	_efc.Printf("\u0020 \u0020\u0020\u0046\u0049\u004c\u004c\u003a %\u0032\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006ee\u0077\u0029 \u0073\u0073\u003d%\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d\u0025\u0033\u0076\u0020\u0025\u0036\u002e\u0032f\u003d\u00256.\u0032\u0066\u0078%\u0036\u002e\u0032\u0066\u000a", len(*_dfd), len(section._bgbeg), _aabb, section.Color, box, box.Width(), box.Height())
	if _edebg {
		for i, sp := range section._bgbeg {
			_efc.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", i, sp)
			if i == 10 {
				break
			}
		}
	}
}

// _adca returns the keys of `_accgf` in ascending order.
func _adca(_accgf map[int]intSet) []int {
	keys := make([]int, len(_accgf))
	n := 0
	for key := range _accgf {
		keys[n] = key
		n++
	}
	_e.Ints(keys)
	return keys
}

// _edddc reports whether `_abcg` and `_cgfdb` differ by at most `_bcae`.
func _edddc(_abcg, _cgfdb float64) bool {
	diff := _ea.Abs(_abcg - _cgfdb)
	return diff <= _bcae
}

// findTextTables returns the tables grown from the paragraphs in the list.
// Paragraphs that are already taken, have zero width or yield no table from
// `isAtom` are skipped, as are grown tables whose `_aageb * _cegga` product is
// below `_gaba`.
func (_fafcf paraList) findTextTables() []*textTable {
	var tables []*textTable
	for _, para := range _fafcf {
		if para.taken() || para.Width() == 0 {
			continue
		}
		table := para.isAtom()
		if table == nil {
			continue
		}
		table.growTable()
		if table._aageb*table._cegga < _gaba {
			continue
		}
		table.markCells()
		table.log("\u0067\u0072\u006fw\u006e")
		tables = append(tables, table)
	}
	return tables
}

// log writes the ruling list, labelled with `_cdfad`, to the log and standard
// output. It does nothing unless `_gdeb` is set.
func (_gccca rulingList) log(_cdfad string) {
	if !_gdeb {
		return
	}
	_ag.Log.Info("\u0023\u0023\u0023\u0020\u0025\u0031\u0030\u0073\u003a\u0020\u0076\u0065c\u0073\u003d\u0025\u0073", _cdfad, _gccca.String())
	for i, r := range _gccca {
		_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, r.String())
	}
}

// firstWord returns the first word stored under index `_ceae`.
// NOTE(review): panics if that entry is missing or empty — confirm callers.
func (_dgged *wordBag) firstWord(_ceae int) *textWord {
	return _dgged._cdbc[_ceae][0]
}

// _cdee returns `_egdc`, followed by the values of `_deafe` that lie strictly
// between the smaller and larger of `_egdc` and `_fbcab`, followed by
// `_fbcab`. Scanning stops at the first value at or beyond the larger bound.
func _cdee(_deafe []float64, _egdc, _fbcab float64) []float64 {
	lo, hi := _egdc, _fbcab
	if hi < lo {
		lo, hi = hi, lo
	}
	out := make([]float64, 0, len(_deafe)+2)
	out = append(out, _egdc)
	for _, v := range _deafe {
		if v <= lo {
			continue
		}
		if v >= hi {
			break
		}
		out = append(out, v)
	}
	return append(out, _fbcab)
}

// _acgbd logs the number of ruling lists in `_cbddd` under the title `_dgbe`
// and prints each list.
func _acgbd(_dgbe string, _cbddd []rulingList) {
	_ag.Log.Info("\u0024\u0024 \u0025\u0064\u0020g\u0072\u0069\u0064\u0073\u0020\u002d\u0020\u0025\u0073", len(_cbddd), _dgbe)
	for i, grid := range _cbddd {
		_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, grid.String())
	}
}

// toTextMarks returns the text marks of every paragraph whose `_bdgc` flag is
// unset. Between paragraphs it inserts a single space mark when `_adab`
// reports true for the pair, and two newline marks otherwise; two newline
// marks are always appended at the end.
func (_gece paraList) toTextMarks() []TextMark {
	offset := 0
	var marks []TextMark
	last := len(_gece) - 1
	for i, para := range _gece {
		if para._bdgc {
			continue
		}
		marks = append(marks, para.toTextMarks(&offset)...)
		if i == last {
			continue
		}
		if _adab(para, _gece[i+1]) {
			marks = _dbce(marks, &offset, "\u0020")
		} else {
			marks = _dbce(marks, &offset, "\u000a")
			marks = _dbce(marks, &offset, "\u000a")
		}
	}
	marks = _dbce(marks, &offset, "\u000a")
	marks = _dbce(marks, &offset, "\u000a")
	return marks
}

// list holds text lines, two string attributes and nested child lists.
type list struct {
	_fged []*textLine
	_fdgc string
	_fbef []*list
	_cbda string
}

// moveTextSetLeading stores `-_agbf` in the text state's `_cdc` field and then
// moves the line position by (`_cabdc`, `_agbf`).
func (_cbac *textObject) moveTextSetLeading(_cabdc, _agbf float64) {
	_cbac._ecff._cdc = -_agbf
	_cbac.moveLP(_cabdc, _agbf)
}

// sort orders the words of every entry in the bag using `_fdbb` as the
// comparison function.
func (_dad *wordBag) sort() {
	for _, words := range _dad._cdbc {
		words := words
		less := func(i, j int) bool { return _fdbb(words[i], words[j]) < 0 }
		_e.Slice(words, less)
	}
}

// _gdfa returns `_fbce(_ddae) - _fbce(_gfgd)`.
func _gdfa(_ddae, _gfgd bounded) float64 {
	return _fbce(_ddae) - _fbce(_gfgd)
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a string descibing `i`.
func ( _cbbc gridTile ) String ( ) string { _bcdg := func ( _eccdf bool , _bfgag string ) string { if _eccdf { return _bfgag ; } ; return "\u005f" ; } ; return _efc . Sprintf ( "\u00256\u002e2\u0066\u0020\u0025\u0031\u0073%\u0031\u0073%\u0031\u0073\u0025\u0031\u0073" , _cbbc . PdfRectangle , _bcdg ( _cbbc . _gceeb , "\u004c" ) , _bcdg ( _cbbc . _gdcbg , "\u0052" ) , _bcdg ( _cbbc . _dbafa , "\u0042" ) , _bcdg ( _cbbc . _ffdf , "\u0054" ) ) ;
} ; func _cfab ( _fbaf , _edee _af . PdfRectangle ) _af . PdfRectangle { return _af . PdfRectangle { Llx : _ea . Min ( _fbaf . Llx , _edee . Llx ) , Lly : _ea . Min ( _fbaf . Lly , _edee . Lly ) , Urx : _ea . Max ( _fbaf . Urx , _edee . Urx ) , Ury : _ea . Max ( _fbaf . Ury , _edee . Ury ) } ;
} ; func ( _bbega * ruling ) alignsSec ( _bgaef * ruling ) bool { const _caab = _cggd + 1.0 ; return _bbega . _ggdb - _caab <= _bgaef . _gbca && _bgaef . _ggdb - _caab <= _bbega . _gbca ; } ; func ( _abe * shapesState ) addPoint ( _bfg , _bebb float64 ) { _edbb := _abe . establishSubpath ( ) ;
_gafeg := _abe . devicePoint ( _bfg , _bebb ) ; if _edbb == nil { _abe . _gbee = true ; _abe . _faa = _gafeg ; } else { _edbb . add ( _gafeg ) ; } ; } ; func _dbce ( _bced [ ] TextMark , _bffg * int , _dbba string ) [ ] TextMark { _cegfg := _gded ; _cegfg . Text = _dbba ; return _fgec ( _bced , _bffg , _cegfg ) ;
} ; func ( _ccac lineRuling ) yMean ( ) float64 { return 0.5 * ( _ccac . _bbee . Y + _ccac . _efge . Y ) } ; func ( _acdc * textObject ) nextLine ( ) { _acdc . moveLP ( 0 , - _acdc . _ecff . _cdc ) } ; const ( _afdgf markKind = iota ; _cbeb ; _bddf ; _gcfgb ; ) ; func ( _ggfdd rulingList ) vertsHorzs ( ) ( rulingList , rulingList ) { var _geagb , _ecbc rulingList ;
for _ , _edefe := range _ggfdd { switch _edefe . _ecfb { case _gecdf : _geagb = append ( _geagb , _edefe ) ; case _eeg : _ecbc = append ( _ecbc , _edefe ) ; } ; } ; return _geagb , _ecbc ; } ; func ( _cbaa * ruling ) alignsPrimary ( _bfgd * ruling ) bool { return _cbaa . _ecfb == _bfgd . _ecfb && _ea . Abs ( _cbaa . _aeef - _bfgd . _aeef ) < _cggd * 0.5 ;
2024-04-30 12:24:05 +00:00
} ;
2024-05-29 17:04:37 +00:00
// String returns a human readable description of the set: its members in
// ascending order.
func (_gfaa intSet) String() string {
	var members []int
	for key := range _gfaa {
		if !_gfaa.has(key) {
			continue
		}
		members = append(members, key)
	}
	_e.Ints(members)
	return _efc.Sprintf("\u0025\u002b\u0076", members)
}

// markWordBoundaries sets `_dgeeg` on every word (after the first) for which
// `_egec(word, previous word)` is at least `_bgcc` times the line's `_afeb`.
func (_aagb *textLine) markWordBoundaries() {
	threshold := _bgcc * _aagb._afeb
	// i indexes the word preceding `word`, because the range starts at [1:].
	for i, word := range _aagb._cfcb[1:] {
		prev := _aagb._cfcb[i]
		if _egec(word, prev) >= threshold {
			word._dgeeg = true
		}
	}
}

// _gaccd returns the keys of `_aedb` in ascending order.
func _gaccd(_aedb map[int][]float64) []int {
	keys := make([]int, 0, len(_aedb))
	for key := range _aedb {
		keys = append(keys, key)
	}
	_e.Ints(keys)
	return keys
}

// eventNeighbours sorts `_fbfb` in place by `_gafa` (events with `_ccbeb` set
// first on ties) and sweeps through them, tracking which `_ggfab` indexes are
// currently active. Two indexes are recorded as neighbours when one becomes
// active while the other still is. The result maps each paragraph to the
// indexes of its neighbours, or to nil when it has none.
func (_cbcb paraList) eventNeighbours(_fbfb []event) map[*textPara][]int {
	_e.Slice(_fbfb, func(_bdefe, _ceef int) bool {
		a, b := _fbfb[_bdefe], _fbfb[_ceef]
		if a._gafa != b._gafa {
			return a._gafa < b._gafa
		}
		if a._ccbeb != b._ccbeb {
			return a._ccbeb
		}
		return _bdefe < _ceef
	})

	neighbours := make(map[int]intSet)
	active := make(intSet)
	for _, ev := range _fbfb {
		if !ev._ccbeb {
			active.del(ev._ggfab)
			continue
		}
		neighbours[ev._ggfab] = make(intSet)
		for other := range active {
			if other == ev._ggfab {
				continue
			}
			neighbours[ev._ggfab].add(other)
			neighbours[other].add(ev._ggfab)
		}
		active.add(ev._ggfab)
	}

	result := map[*textPara][]int{}
	for idx, set := range neighbours {
		para := _cbcb[idx]
		if len(set) == 0 {
			result[para] = nil
			continue
		}
		indexes := make([]int, 0, len(set))
		for other := range set {
			indexes = append(indexes, other)
		}
		result[para] = indexes
	}
	return result
}

// primaries returns the distinct `_aeef` values of the rulings in ascending
// order.
func (_dbeca rulingList) primaries() []float64 {
	seen := make(map[float64]struct{}, len(_dbeca))
	for _, r := range _dbeca {
		seen[r._aeef] = struct{}{}
	}
	values := make([]float64, 0, len(seen))
	for v := range seen {
		values = append(values, v)
	}
	_e.Float64s(values)
	return values
}

// _dgg reports whether `_ead` contains a font whose FontName equals `_edd`.
func _dgg(_ead []Font, _edd string) bool {
	for i := range _ead {
		if _ead[i].FontName == _edd {
			return true
		}
	}
	return false
}

// removeWord removes `_bfbe` from the words stored under index `_dbaf` and
// deletes the entry altogether when no words remain.
func (_ddb *wordBag) removeWord(_bfbe *textWord, _dbaf int) {
	remaining := _fgcbe(_ddb._cdbc[_dbaf], _bfbe)
	if len(remaining) > 0 {
		_ddb._cdbc[_dbaf] = remaining
		return
	}
	delete(_ddb._cdbc, _dbaf)
}

// extractContentStreamImages parses the content stream `_fca` and runs it
// through a processor that calls `processOperand` for every operand, using
// the resources `_bg`. The image cache (`_ebf`) and options (`_dac`) are
// initialised first when they are nil. Parse and processing errors are
// returned.
func (_gcg *imageExtractContext) extractContentStreamImages(_fca string, _bg *_af.PdfPageResources) error {
	ops, err := _aa.NewContentStreamParser(_fca).Parse()
	if err != nil {
		return err
	}
	if _gcg._ebf == nil {
		_gcg._ebf = map[*_gf.PdfObjectStream]*cachedImage{}
	}
	if _gcg._dac == nil {
		_gcg._dac = &ImageExtractOptions{}
	}
	processor := _aa.NewContentStreamProcessor(*ops)
	processor.AddHandler(_aa.HandlerConditionEnumAllOperands, "", _gcg.processOperand)
	return processor.Process(_bg)
}

// getStrokeColor returns the colour obtained from the graphics state's
// stroking colourspace and stroking colour.
func (_adfd *textObject) getStrokeColor() _fe.Color {
	gs := _adfd._aef
	return _cfeg(gs.ColorspaceStroking, gs.ColorStroking)
}

// newTextMark builds a textMark for the text `_dcgb` rendered with matrix
// `_bcacb` and ending at `_egde`.
//
// The orientation is derived from the matrix angle via `_afeec`; the mark's
// height is the matrix Y scaling factor, or the X scaling factor when the
// orientation modulo 180 is 90. The box spanned by the start and end points is
// extended by that height, normalised and, when the page box `_fd` has a
// positive width, replaced by the result of `_dgfg` against it. A second
// rectangle is derived from it according to the orientation (90/180/270).
// The returned bool is false when `_dgfg` reports failure.
func (_aaac *textObject) newTextMark(_dcgb string, _bcacb _aae.Matrix, _egde _aae.Point, _egbc float64, _dgac *_af.PdfFont, _afde float64, _bagc, _dbgdg _fe.Color, _eeca _gf.PdfObject, _acdb []string, _cdbb int, _bagce int) (textMark, bool) {
	angle := _bcacb.Angle()
	orient := _afeec(angle, _dgbd)

	var height float64
	if orient%180 == 90 {
		height = _bcacb.ScalingFactorX()
	} else {
		height = _bcacb.ScalingFactorY()
	}

	start := _eaf(_bcacb)
	box := _af.PdfRectangle{Llx: start.X, Lly: start.Y, Urx: _egde.X, Ury: _egde.Y}
	switch orient % 360 {
	case 90:
		box.Urx -= height
	case 180:
		box.Ury -= height
	case 270:
		box.Urx += height
	case 0:
		box.Ury += height
	default:
		// Any other orientation is treated as 0.
		orient = 0
		box.Ury += height
	}
	if box.Llx > box.Urx {
		box.Llx, box.Urx = box.Urx, box.Llx
	}
	if box.Lly > box.Ury {
		box.Lly, box.Ury = box.Ury, box.Lly
	}

	onPage := true
	if _aaac._dbe._fd.Width() > 0 {
		clipped, inside := _dgfg(box, _aaac._dbe._fd)
		if !inside {
			onPage = false
			_ag.Log.Debug("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q", box, _aaac._dbe._fd, _dcgb)
		}
		box = clipped
	}

	rotated := box
	page := _aaac._dbe._fd
	switch orient % 360 {
	case 90:
		page.Urx, page.Ury = page.Ury, page.Urx
		rotated = _af.PdfRectangle{Llx: page.Urx - box.Ury, Urx: page.Urx - box.Lly, Lly: box.Llx, Ury: box.Urx}
	case 180:
		rotated = _af.PdfRectangle{Llx: page.Urx - box.Llx, Urx: page.Urx - box.Urx, Lly: page.Ury - box.Lly, Ury: page.Ury - box.Ury}
	case 270:
		page.Urx, page.Ury = page.Ury, page.Urx
		rotated = _af.PdfRectangle{Llx: box.Ury, Urx: box.Lly, Lly: page.Ury - box.Llx, Ury: page.Ury - box.Urx}
	}
	if rotated.Llx > rotated.Urx {
		rotated.Llx, rotated.Urx = rotated.Urx, rotated.Llx
	}
	if rotated.Lly > rotated.Ury {
		rotated.Lly, rotated.Ury = rotated.Ury, rotated.Lly
	}

	mark := textMark{
		_cgeb:        _dcgb,
		PdfRectangle: rotated,
		_fbaa:        box,
		_dbgfg:       _dgac,
		_acaf:        height,
		_bdeb:        _afde,
		_acgb:        _bcacb,
		_efab:        _egde,
		_ddfdb:       orient,
		_ccbff:       _bagc,
		_bcfc:        _dbgdg,
		_bae:         _eeca,
		_afcb:        _acdb,
		Th:           _aaac._ecff._dba,
		Tw:           _aaac._ecff._febe,
		_ffbdg:       _bagce,
		_feg:         _cdbb,
	}
	if _aebe {
		_ag.Log.Info("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073", start, _egde, mark.String())
	}
	return mark, onPage
}

// gridTiling is a rectangle together with two coordinate lists and a
// two-level map of grid tiles keyed by float64 coordinates.
type gridTiling struct {
	_af.PdfRectangle
	_beefa []float64
	_eeba  []float64
	_agba  map[float64]map[float64]gridTile
}

// _egdfd returns copies of the marks in `_acdd` with `_bacg` set to true and
// `_efce` set to `_dcaad`. The input slice is not modified.
func _egdfd(_acdd []TextMark, _dcaad *TextTable) []TextMark {
	var tagged []TextMark
	for _, mark := range _acdd {
		mark._bacg, mark._efce = true, _dcaad
		tagged = append(tagged, mark)
	}
	return tagged
}

// _edfff decodes the image XObject stream `_acdfa` and returns the result of
// passing the decoded image and `_fbgab` to `_gaeb`. It returns (nil, nil)
// when `_acdfa` is not a stream, and the error when decoding fails.
func _edfff(_acdfa _gf.PdfObject, _fbgab _fe.Color) (_ef.Image, error) {
	stream, isStream := _gf.GetStream(_acdfa)
	if !isStream {
		return nil, nil
	}
	ximg, err := _af.NewXObjectImageFromStream(stream)
	if err != nil {
		return nil, err
	}
	img, err := ximg.ToImage()
	if err != nil {
		return nil, err
	}
	return _gaeb(img, _fbgab), nil
}

// snapToGroups splits the list with `vertsHorzs`, applies
// `snapToGroupsDirection` to each non-empty half and returns the first half
// followed by the second.
func (_fecga rulingList) snapToGroups() rulingList {
	first, second := _fecga.vertsHorzs()
	if len(first) > 0 {
		first = first.snapToGroupsDirection()
	}
	if len(second) > 0 {
		second = second.snapToGroupsDirection()
	}
	snapped := append(first, second...)
	snapped.log("\u0073\u006e\u0061p\u0054\u006f\u0047\u0072\u006f\u0075\u0070\u0073")
	return snapped
}

// pullWord appends `_ccfb` to the line and removes it from index `_bgcd` of
// the bag `_cbde`.
func (_bfeaa *textLine) pullWord(_cbde *wordBag, _ccfb *textWord, _bgcd int) {
	_bfeaa.appendWord(_ccfb)
	_cbde.removeWord(_ccfb, _bgcd)
}

// getFillColor returns the colour obtained from the graphics state's
// non-stroking colourspace and non-stroking colour.
func (_dfb *textObject) getFillColor() _fe.Color {
	gs := _dfb._aef
	return _cfeg(gs.ColorspaceNonStroking, gs.ColorNonStroking)
}

// _affg rounds `_cagbd` to the nearest multiple of `_fbf`.
func _affg(_cagbd float64) float64 {
	steps := _ea.Round(_cagbd / _fbf)
	return _fbf * steps
}

// _ecd returns a textState with `_dba` set to 100, render mode
// RenderModeFill and `_bcbc` set to `_gbc`.
func _ecd(_gbc _af.PdfRectangle) textState {
	return textState{
		_dba:  100,
		_aaeb: RenderModeFill,
		_bcbc: _gbc,
	}
}

// depthBand returns the depth indexes between the indexes of `_gaed` and
// `_dab`, or nil when the bag holds no words.
func (_bgdbe *wordBag) depthBand(_gaed, _dab float64) []int {
	if len(_bgdbe._cdbc) == 0 {
		return nil
	}
	return _bgdbe.depthRange(_bgdbe.getDepthIdx(_gaed), _bgdbe.getDepthIdx(_dab))
}
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// BBox returns the smallest axis-aligned rectangle that encloses all the
// TextMarks in the array. Marks flagged as Meta, and marks whose text makes
// `_fcegd` return true, are ignored. The bool is false when no mark
// contributed to the rectangle.
func (_facc *TextMarkArray) BBox() (_af.PdfRectangle, bool) {
	var (
		bounds _af.PdfRectangle
		found  bool
	)
	for _, mark := range _facc._aec {
		if mark.Meta || _fcegd(mark.Text) {
			continue
		}
		if !found {
			// The first contributing mark seeds the rectangle.
			bounds = mark.BBox
			found = true
			continue
		}
		bounds = _cfab(bounds, mark.BBox)
	}
	return bounds, found
}

// _deeg returns a new slice holding the elements of `_deba` in reverse order.
func _deeg(_deba []int) []int {
	n := len(_deba)
	reversed := make([]int, n)
	for i := 0; i < n; i++ {
		reversed[i] = _deba[n-1-i]
	}
	return reversed
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// Text returns the extracted page text.
func (_gcb PageText) Text() string {
	return _gcb._ffb
}

var _gd = false

// lastpointEstablished returns the stored point `_faa` when `_gbee` is set, or
// else the last point of the final subpath when that subpath's `_cedc` flag is
// set; in both cases the bool is false. Otherwise it returns the zero point
// and true.
func (_fcfc *shapesState) lastpointEstablished() (_aae.Point, bool) {
	if _fcfc._gbee {
		return _fcfc._faa, false
	}
	if n := len(_fcfc._baca); n > 0 && _fcfc._baca[n-1]._cedc {
		return _fcfc._baca[n-1].last(), false
	}
	return _aae.Point{}, true
}

// _bdfg builds the list of paragraphs for a page from the text marks `_bcgg`.
//
// The marks are turned into words (`_dbeee`), partitioned using the rulings
// `_acdcb` split into their two kinds (`_efg`, `_bcfb`, `_facf`), and each
// partition is arranged into a paragraph. Unless `_agac` is set, tables are
// extracted (when there are at least `_gaba` paragraphs) using the tilings
// `_acbae`, and `sortTopoOrder` is applied after `sortReadingOrder`.
// It returns nil when there are no marks or no words.
func _bdfg(_bcgg []*textMark, _adcg _af.PdfRectangle, _acdcb rulingList, _acbae []gridTiling, _agac bool) paraList {
	_ag.Log.Trace("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066", len(_bcgg), _adcg)
	if len(_bcgg) == 0 {
		return nil
	}
	words := _dbeee(_bcgg, _adcg)
	if len(words) == 0 {
		return nil
	}
	_acdcb.log("\u006d\u0061\u006be\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065")
	first, second := _acdcb.vertsHorzs()
	bag := _efg(words, _adcg.Ury, first, second)
	parts := _facf(_bcfb(bag, _adcg.Ury, first, second))

	paras := make(paraList, 0, len(parts))
	for _, part := range parts {
		if para := part.arrangeText(); para != nil {
			paras = append(paras, para)
		}
	}
	wantTables := !_agac
	if wantTables && len(paras) >= _gaba {
		paras = paras.extractTables(_acbae)
	}
	paras.sortReadingOrder()
	if wantTables {
		paras.sortTopoOrder()
	}
	paras.log("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072")
	return paras
}

// _cdgd packs two ints into one uint64: `_bdcba` multiplied by 0x1000000 plus
// `_ecddg`.
func _cdgd(_bdcba, _ecddg int) uint64 {
	high := uint64(_bdcba) * 0x1000000
	return high + uint64(_ecddg)
}

// reduceTiling returns a table without the rows and columns that are both
// empty and closer than `_gebdf` to their neighbour in the tiling `_acfbb`
// (the previous row coordinate, or the next column coordinate). The receiver
// itself is returned when nothing is dropped.
func (_ebdcb *textTable) reduceTiling(_acfbb gridTiling, _gebdf float64) *textTable {
	keptRows := make([]int, 0, _ebdcb._cegga)
	keptCols := make([]int, 0, _ebdcb._aageb)
	colCoords := _acfbb._beefa
	rowCoords := _acfbb._eeba

	for row := 0; row < _ebdcb._cegga; row++ {
		drop := row > 0 && _ea.Abs(rowCoords[row-1]-rowCoords[row]) < _gebdf && _ebdcb.emptyCompositeRow(row)
		if drop {
			continue
		}
		keptRows = append(keptRows, row)
	}
	for col := 0; col < _ebdcb._aageb; col++ {
		drop := col < _ebdcb._aageb-1 && _ea.Abs(colCoords[col+1]-colCoords[col]) < _gebdf && _ebdcb.emptyCompositeColumn(col)
		if drop {
			continue
		}
		keptCols = append(keptCols, col)
	}
	if len(keptRows) == _ebdcb._cegga && len(keptCols) == _ebdcb._aageb {
		return _ebdcb
	}

	reduced := textTable{
		_caagg: _ebdcb._caagg,
		_aageb: len(keptCols),
		_cegga: len(keptRows),
		_becfc: make(map[uint64]compositeCell, len(keptCols)*len(keptRows)),
	}
	if _dedc {
		_ag.Log.Info("\u0072\u0065\u0064\u0075c\u0065\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0025d\u0078%\u0064\u0020\u002d\u003e\u0020\u0025\u0064x\u0025\u0064", _ebdcb._aageb, _ebdcb._cegga, len(keptCols), len(keptRows))
		_ag.Log.Info("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076", keptCols)
		_ag.Log.Info("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076", keptRows)
	}
	// Copy the surviving cells to their new (column, row) positions.
	for newRow, oldRow := range keptRows {
		for newCol, oldCol := range keptCols {
			cell, cellBox := _ebdcb.getComposite(oldCol, oldRow)
			if len(cell) == 0 {
				continue
			}
			if _dedc {
				_efc.Printf("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n", newCol, newRow, oldCol, oldRow, _efcca(cell.merge().text(), 50))
			}
			reduced.putComposite(newCol, newRow, cell, cellBox)
		}
	}
	return &reduced
}

// _eeecd returns the distance from the top of `_bbeg` (Ury) down to the
// bottom of `_bbbd`'s bounding box (Lly).
func _eeecd(_bbeg _af.PdfRectangle, _bbbd bounded) float64 {
	box := _bbbd.bbox()
	return _bbeg.Ury - box.Lly
}

// isAtom tries to build a table from the paragraph and three linked
// paragraphs: its `_aggd` and `_cabda` neighbours and the `_cabda` neighbour
// of the former. It returns nil when any of those three is taken or when the
// links do not close (the fourth paragraph must also be the `_aggd` neighbour
// of the paragraph's `_cabda` neighbour); otherwise it returns the result of
// `_dcbd`.
func (_edfe *textPara) isAtom() *textTable {
	origin := _edfe
	viaAggd := _edfe._aggd
	viaCabda := _edfe._cabda
	if viaAggd.taken() || viaCabda.taken() {
		return nil
	}
	corner := viaAggd._cabda
	if corner.taken() || corner != viaCabda._aggd {
		return nil
	}
	return _dcbd(origin, viaAggd, viaCabda, corner)
}

// push appends a copy of `*_efcc` to the stack, so later changes to the
// caller's state do not affect the stored entry.
func (_cdg *stateStack) push(_efcc *textState) {
	saved := *_efcc
	*_cdg = append(*_cdg, &saved)
}

// textLine is a rectangle holding a list of words plus two float64
// attributes.
type textLine struct {
	_af.PdfRectangle
	_addd float64
	_cfcb []*textWord
	_afeb float64
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of the table: its `_aageb` and `_cegga`
// dimensions and its `_caagg` flag.
func (_gdcdc *textTable) String() string {
	return _efc.Sprintf("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074", _gdcdc._aageb, _gdcdc._cegga, _gdcdc._caagg)
}

// bbox returns the table's rectangle.
func (_ebba *textTable) bbox() _af.PdfRectangle {
	return _ebba.PdfRectangle
}

// depth returns the smallest depth among the cells at row index 0, skipping
// cells that are nil or have `_bdgc` set. It returns 1e10 when no cell
// qualifies.
func (_aeagcb *textTable) depth() float64 {
	minDepth := 1e10
	for col := 0; col < _aeagcb._aageb; col++ {
		cell := _aeagcb.get(col, 0)
		if cell == nil || cell._bdgc {
			continue
		}
		minDepth = _ea.Min(minDepth, cell.depth())
	}
	return minDepth
}

// fontEntry pairs a font with an int64 value.
type fontEntry struct {
	_gggfe *_af.PdfFont
	_edeb  int64
}

// _afgg returns the Y coordinates of the horizontal corridors that separate
// groups of lines across the cells `_caebb`.
//
// All lines of all cells are sorted by `_addd` (then by Llx) and scanned; a
// corridor is recorded midway between a line and the lowest line seen so far
// whenever the line's top lies below that line's bottom. The corridors are
// discarded (nil is returned) unless every cell has lines in every resulting
// group, as reported by `hasLines`. nil is also returned when the cells
// contain no lines at all.
func _afgg(_caebb []compositeCell) []float64 {
	var lines []*textLine
	paraCount := 0
	for _, cell := range _caebb {
		paraCount += len(cell.paraList)
		lines = append(lines, cell.lines()...)
	}
	_e.Slice(lines, func(_deada, _defa int) bool {
		a, b := lines[_deada], lines[_defa]
		if !_cdaea(a._addd - b._addd) {
			return a._addd < b._addd
		}
		return a.Llx < b.Llx
	})
	if _dedc {
		_efc.Printf("\u0020\u0020\u0020 r\u006f\u0077\u0042\u006f\u0072\u0064\u0065\u0072\u0073:\u0020%\u0064 \u0070a\u0072\u0061\u0073\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a", paraCount, len(lines))
		for i, line := range lines {
			_efc.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", i, line)
		}
	}

	// With no lines there is nothing to separate; indexing lines[0] below
	// would panic.
	if len(lines) == 0 {
		return nil
	}

	var corridors []float64
	lowest := lines[0]
	var groups [][]*textLine
	current := []*textLine{lowest}
	for i, line := range lines[1:] {
		if line.Ury < lowest.Lly {
			// The line sits entirely below everything seen so far: close the
			// current group and place a corridor in the gap.
			border := 0.5 * (line.Ury + lowest.Lly)
			if _dedc {
				_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u003c\u0020\u0025\u0036.\u0032f\u0020\u0062\u006f\u0072\u0064\u0065\u0072\u003d\u0025\u0036\u002e\u0032\u0066\u000a"+"\u0009\u0020\u0071\u003d\u0025\u0073\u000a\u0009\u0020p\u003d\u0025\u0073\u000a", i, line.Ury, lowest.Lly, border, lowest, line)
			}
			corridors = append(corridors, border)
			groups = append(groups, current)
			current = nil
		}
		current = append(current, line)
		if line.Lly < lowest.Lly {
			lowest = line
		}
	}
	if len(current) > 0 {
		groups = append(groups, current)
	}
	if _dedc {
		_efc.Printf(" \u0020\u0020\u0020\u0020\u0020\u0020 \u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073\u003d%\u0036.\u0032\u0066\u000a", corridors)
	}
	if _dedc {
		_ag.Log.Info("\u0072\u006f\u0077\u003d\u0025\u0064", len(_caebb))
		for i, cell := range _caebb {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, cell)
		}
		_ag.Log.Info("\u0067r\u006f\u0075\u0070\u0073\u003d\u0025d", len(groups))
		for i, group := range groups {
			_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0064\u000a", i, len(group))
			for j, line := range group {
				_efc.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", j, line)
			}
		}
	}

	// Every group must be represented in every cell for the corridors to count.
	allSpan := true
	for gi, group := range groups {
		spans := true
		for ci, cell := range _caebb {
			if _dedc {
				_efc.Printf("\u0020\u0020\u0020\u007e\u007e\u007e\u0067\u0072\u006f\u0075\u0070\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u0020\u0063\u0065\u006cl\u0020\u0025\u0064\u0020\u006ff\u0020\u0025d\u0020\u0025\u0073\u000a", gi, len(groups), ci, len(_caebb), cell)
			}
			if !cell.hasLines(group) {
				if _dedc {
					_efc.Printf("\u0020\u0020\u0020\u0021\u0021\u0021\u0067\u0072\u006f\u0075\u0070\u0020\u0025d\u0020\u006f\u0066\u0020\u0025\u0064 \u0063\u0065\u006c\u006c\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064 \u004f\u0055\u0054\u000a", gi, len(groups), ci, len(_caebb))
				}
				spans = false
				break
			}
		}
		if !spans {
			allSpan = false
			break
		}
	}
	if !allSpan {
		if _dedc {
			_ag.Log.Info("\u0072\u006f\u0077\u0020\u0063o\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u0020\u0064\u006f\u006e\u0027\u0074 \u0073\u0070\u0061\u006e\u0020\u0061\u006c\u006c\u0020\u0063\u0065\u006c\u006c\u0073\u0020\u0069\u006e\u0020\u0072\u006f\u0077\u002e\u0020\u0069\u0067\u006e\u006f\u0072\u0069\u006eg")
		}
		corridors = nil
	}
	if _dedc && corridors != nil {
		_efc.Printf("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u002a\u002a*\u0072\u006f\u0077\u0043\u006f\u0072\u0072i\u0064\u006f\u0072\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u000a", corridors)
	}
	return corridors
}

// _dfba reports whether the vertical extents [Lly, Ury] of the two rectangles
// overlap (touching counts as overlapping).
func _dfba(_ecg, _bddb _af.PdfRectangle) bool {
	return _ecg.Lly <= _bddb.Ury && _bddb.Lly <= _ecg.Ury
}

// setTextRise stores `_gdgc` in the text state's `_dgef` field. It is a no-op
// on a nil receiver.
func (_dccfe *textObject) setTextRise(_gdgc float64) {
	if _dccfe == nil {
		return
	}
	_dccfe._ecff._dgef = _gdgc
}

// markKind is an integer enumeration of mark kinds.
type markKind int
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a string describing the text mark: its offset, its text
// (quoted and as hex code points), its bounding box corners, its font
// description truncated to 50 bytes, and a " *M*" suffix when Meta is set.
func (_egbe TextMark) String() string {
	box := _egbe.BBox

	fontDesc := ""
	if _egbe.Font != nil {
		fontDesc = _egbe.Font.String()
		if len(fontDesc) > 50 {
			fontDesc = fontDesc[:50] + "\u002e\u002e\u002e"
		}
	}

	metaTag := ""
	if _egbe.Meta {
		metaTag = "\u0020\u002a\u004d\u002a"
	}

	return _efc.Sprintf("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d", _egbe.Offset, _egbe.Text, []rune(_egbe.Text), box.Llx, box.Lly, box.Urx, box.Ury, fontDesc, metaTag)
}

// add inserts `_ddedb` into the set.
func (_fgfbd intSet) add(_ddedb int) {
	_fgfbd[_ddedb] = struct{}{}
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of the ruling. A ruling of kind `_ceag` is
// reported as "NOT RULING". Otherwise the output lists the kind, the `_aeef`
// value, the `_ggdb` - `_gbca` range with its length, `_agff`, the colour and,
// when `_faba` is non-zero, the width. The axis labels are "x" and "y",
// swapped for rulings of kind `_eeg`.
func (_eada *ruling) String() string {
	if _eada._ecfb == _ceag {
		return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047"
	}

	primaryAxis, secondaryAxis := "\u0078", "\u0079"
	if _eada._ecfb == _eeg {
		primaryAxis, secondaryAxis = "\u0079", "\u0078"
	}

	var widthDesc string
	if _eada._faba != 0.0 {
		widthDesc = _efc.Sprintf(" \u0077\u0069\u0064\u0074\u0068\u003d\u0025\u002e\u0032\u0066", _eada._faba)
	}

	length := _eada._gbca - _eada._ggdb
	return _efc.Sprintf("\u0025\u00310\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0020\u0025\u0076\u0025\u0073", _eada._ecfb, primaryAxis, _eada._aeef, secondaryAxis, _eada._ggdb, _eada._gbca, length, _eada._agff, _eada.Color, widthDesc)
}

// _fbgge matches either of the patterns `_cdaf` or `_defba`. It is compiled
// once at package initialisation.
var _fbgge = _g.MustCompile(_cdaf + "\u007c" + _defba)
2024-04-16 11:40:43 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of `b`.
//
// The decoded format is `%.2f fontsize=%.2f %d %q`: the bag's rectangle, its
// _cdac value, the number of words and the word texts. Words are visited in
// the order of the keys returned by depthIndexes.
func (b *wordBag) String() string {
	var texts []string
	for _, depthIdx := range b.depthIndexes() {
		for _, word := range b._cdbc[depthIdx] {
			texts = append(texts, word._ccbcc)
		}
	}
	return _efc.Sprintf("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071", b.PdfRectangle, b._cdac, len(texts), texts)
}

// inTile returns the paras whose rectangle `tile.contains` accepts, in their
// original order. When the _dedc debug flag is set the selection is also
// printed to standard output.
func (paras paraList) inTile(tile gridTile) paraList {
	var inside paraList
	for _, para := range paras {
		if !tile.contains(para.PdfRectangle) {
			continue
		}
		inside = append(inside, para)
	}
	if !_dedc {
		return inside
	}
	_efc.Printf("\u0020 \u0020\u0069\u006e\u0054i\u006c\u0065\u003a\u0020\u0020%\u0073 \u0069n\u0073\u0069\u0064\u0065\u003d\u0025\u0064\n", tile, len(inside))
	for i, para := range inside {
		_efc.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, para)
	}
	_efc.Println("")
	return inside
}

// _fgec stamps `mark` with the running offset `*offset`, appends it to `marks`
// and advances `*offset` by the byte length of the mark's text.
func _fgec(marks []TextMark, offset *int, mark TextMark) []TextMark {
	mark.Offset = *offset
	*offset += len(mark.Text)
	return append(marks, mark)
}

// _ccgd re-emits every non-empty line of `l._cbda`, each prefixed with
// `*prefix` and terminated by a newline. `prefix` is only dereferenced when at
// least one non-empty line exists.
func _ccgd(l *list, prefix *string) string {
	var sb _a.Builder
	for _, line := range _a.Split(l._cbda, "\u000a") {
		if line == "" {
			continue
		}
		sb.WriteString(*prefix)
		sb.WriteString(line)
		sb.WriteString("\u000a")
	}
	return sb.String()
}

// _dgca returns the keys of `tiles` in descending order (ascending sort
// followed by an in-place reversal).
func _dgca(tiles map[float64]map[float64]gridTile) []float64 {
	keys := make([]float64, 0, len(tiles))
	for key := range tiles {
		keys = append(keys, key)
	}
	_e.Float64s(keys)
	for lo, hi := 0, len(keys)-1; lo < hi; lo, hi = lo+1, hi-1 {
		keys[lo], keys[hi] = keys[hi], keys[lo]
	}
	return keys
}

// _bedc returns the Llx of the first word of `line`. It panics if the line
// holds no words.
func _bedc(line *textLine) float64 {
	first := line._cfcb[0]
	return first.Llx
}

// shapesState is the path-construction state used while processing drawing
// operators.
type shapesState struct {
	// _cffc and _gdec are combined as _gdec.Mult(_cffc) by devicePoint.
	_cffc _aae.Matrix
	_gdec _aae.Matrix
	// _baca and _gbee are reset (nil / false) by clearPath.
	_baca []*subpath
	_gbee bool
	_faa  _aae.Point
	_cegf *textObject
}
2024-04-30 12:24:05 +00:00
// GetContentStreamOps returns the contentStreamOps field of `pt`.
2024-05-29 17:04:37 +00:00
// The pointer handed back is the one stored in the _cfa field; no copy is made.
func (pt *PageText) GetContentStreamOps() *_aa.ContentStreamOperations {
	ops := pt._cfa
	return ops
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
//
// Replace with a function like Extract() (*PageText, error)
//
// The two int results are passed through unchanged from extractPageText. An
// ErrColorOutOfRange error from extractPageText is tolerated; any other error, or a
// failure of the license check _ecbg, yields (nil, 0, 0, err). When extractor options
// are set they control the simpler-extraction flag, optional crop-box clipping via
// ApplyArea, and the DisableDocumentTags flag of the returned PageText.
func ( _bdg * Extractor ) ExtractPageText ( ) ( * PageText , int , int , error ) { _eedd , _dbgc , _deaf , _beb := _bdg . extractPageText ( _bdg . _fgb , _bdg . _cf , _aae . IdentityMatrix ( ) , 0 , false ) ; if _beb != nil && _beb != _af . ErrColorOutOfRange { return nil , 0 , 0 , _beb ; } ; if _bdg . _efe != nil { _eedd . _gbg . _dbed = _bdg . _efe . UseSimplerExtractionProcess ;
} ; _eedd . computeViews ( ) ; _beb = _ecbg ( _eedd ) ; if _beb != nil { return nil , 0 , 0 , _beb ; } ; if _bdg . _efe != nil { if _bdg . _efe . ApplyCropBox && _bdg . _fed != nil { _eedd . ApplyArea ( * _bdg . _fed ) ; } ; _eedd . _gbg . _gcgg = _bdg . _efe . DisableDocumentTags ; } ; return _eedd , _dbgc , _deaf , nil ;
// Next line: (*textObject).reset sets _dbc and _ebc to the identity matrix and clears _afff.
// _gacff reports whether the x-extent of the word overlaps the x-extent of the bag after the
// bag has been widened by _dbeb on both sides.
} ; func ( _fcga * textObject ) reset ( ) { _fcga . _dbc = _aae . IdentityMatrix ( ) ; _fcga . _ebc = _aae . IdentityMatrix ( ) ; _fcga . _afff = nil ; } ; func _gacff ( _fcde * wordBag , _ddeg * textWord , _dbeb float64 ) bool { return _ddeg . Llx < _fcde . Urx + _dbeb && _fcde . Llx - _dbeb < _ddeg . Urx ;
// Next line: _ecccgb maps each rulingKind to its name ("none", "horizontal", "vertical").
// _bbgag converts exactly two PDF objects to floats; any other argument count is rejected with
// "invalid number of params: %d", and conversion errors from GetNumbersAsFloat are returned as is.
} ; var _ecccgb = map [ rulingKind ] string { _ceag : "\u006e\u006f\u006e\u0065" , _eeg : "\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c" , _gecdf : "\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c" } ; func _bbgag ( _daec [ ] _gf . PdfObject ) ( _ccfgc , _gfbgc float64 , _aabeg error ) { if len ( _daec ) != 2 { return 0 , 0 , _efc . Errorf ( "\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064" , len ( _daec ) ) ;
// Next line: wordBag groups *textWord values in _cdbc, keyed by an int index (the keys are what
// depthIndexes returns), together with a bounding rectangle, the _cdac value printed as
// "fontsize" by String, and two ruling lists. gridTile.complete reports numBorders() == 4.
} ; _fgac , _aabeg := _gf . GetNumbersAsFloat ( _daec ) ; if _aabeg != nil { return 0 , 0 , _aabeg ; } ; return _fgac [ 0 ] , _fgac [ 1 ] , nil ; } ; type wordBag struct { _af . PdfRectangle ; _cdac float64 ; _dcaa , _bfcg rulingList ; _ecba float64 ; _cdbc map [ int ] [ ] * textWord ; } ; func ( _dcbg gridTile ) complete ( ) bool { return _dcbg . numBorders ( ) == 4 } ;
// Next line: (*wordBag).empty reports whether _cdbc has no entry for the given key. _fdbb returns
// the difference of the two arguments' bbox().Llx values. _caaa returns the smaller of two ints.
type event struct { _gafa float64 ; _ccbeb bool ; _ggfab int ; } ; func ( _gdfe * wordBag ) empty ( _badd int ) bool { _ , _bbca := _gdfe . _cdbc [ _badd ] ; return ! _bbca } ; func _fdbb ( _bbfe , _bfff bounded ) float64 { return _bbfe . bbox ( ) . Llx - _bfff . bbox ( ) . Llx } ; func _caaa ( _cggf , _dgbg int ) int { if _cggf < _dgbg { return _cggf ;
// Next line: (*textTable).getComposite looks up the composite cell stored in _becfc under
// _cdgd(x, y). A missing entry yields (nil, empty rectangle); otherwise parasBBox() is returned.
} ; return _dgbg ; } ; func ( _dfefa * textTable ) getComposite ( _dgfb , _gddcc int ) ( paraList , _af . PdfRectangle ) { _eabfgb , _dbcf := _dfefa . _becfc [ _cdgd ( _dgfb , _gddcc ) ] ; if _dedc { _efc . Printf ( "\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0067\u0065\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u002d\u003e\u0025\u0073\u000a" , _dgfb , _gddcc , _eabfgb . String ( ) ) ;
// Next line: (*ruling).encloses reports whether [lo, hi] lies inside [_ggdb, _gbca] with a
// tolerance of _bcae at both ends. _bcgac (logged as "fixCells") edits the given map in place and
// is a no-op for maps with at most one entry; keys are visited in the order returned by _gaccd.
} ; if ! _dbcf { return nil , _af . PdfRectangle { } ; } ; return _eabfgb . parasBBox ( ) ; } ; func ( _dbbc * ruling ) encloses ( _eecb , _bdcbd float64 ) bool { return _dbbc . _ggdb - _bcae <= _eecb && _bdcbd <= _dbbc . _gbca + _bcae ; } ; func _bcgac ( _dgedb map [ int ] [ ] float64 ) { if len ( _dgedb ) <= 1 { return ;
} ; _cgfc := _gaccd ( _dgedb ) ; if _dedc { _ag . Log . Info ( "\u0066i\u0078C\u0065\u006c\u006c\u0073\u003a \u006b\u0065y\u0073\u003d\u0025\u002b\u0076" , _cgfc ) ; } ; var _bace , _dfaf int ; for _bace , _dfaf = range _cgfc { if _dgedb [ _dfaf ] != nil { break ; } ; } ; for _aebfc , _gcgdad := range _cgfc [ _bace : ] { _fcbea := _dgedb [ _gcgdad ] ;
// NOTE(review): _ebfadb below is read from the same key (_gcgdad) as _fcbea, so the comparison is
// between the last and the first element of one slice, and that slice is then stored under the
// previous key _dfaf. The debug output labels the two keys k0/k1, which suggests the previous
// key's slice (_dgedb[_dfaf]) may have been intended - confirm against the un-obfuscated source.
// A non-nil but empty slice would panic on the index expressions.
if _fcbea == nil { continue ; } ; if _dedc { _efc . Printf ( "\u0025\u0034\u0064\u003a\u0020\u006b\u0030\u003d\u0025\u0064\u0020\u006b1\u003d\u0025\u0064\u000a" , _bace + _aebfc , _dfaf , _gcgdad ) ; } ; _ebfadb := _dgedb [ _gcgdad ] ; if _ebfadb [ len ( _ebfadb ) - 1 ] > _fcbea [ 0 ] { _ebfadb [ len ( _ebfadb ) - 1 ] = _fcbea [ 0 ] ;
// Next line: (*textTable).compositeRowCorridors builds a map from row index to the result of
// _afgg over the composite cells found in that row. Iteration starts at row 1 (row 0 is never
// visited) and rows without any composite cell get no entry.
_dgedb [ _dfaf ] = _ebfadb ; } ; _dfaf = _gcgdad ; } ; } ; func ( _geeb * textTable ) compositeRowCorridors ( ) map [ int ] [ ] float64 { _bfageb := make ( map [ int ] [ ] float64 , _geeb . _cegga ) ; if _dedc { _ag . Log . Info ( "c\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0052\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073:\u0020h\u003d\u0025\u0064" , _geeb . _cegga ) ;
} ; for _dfda := 1 ; _dfda < _geeb . _cegga ; _dfda ++ { var _dfgd [ ] compositeCell ; for _eggd := 0 ; _eggd < _geeb . _aageb ; _eggd ++ { if _ddgc , _fdbbb := _geeb . _becfc [ _cdgd ( _eggd , _dfda ) ] ; _fdbbb { _dfgd = append ( _dfgd , _ddgc ) ; } ; } ; if len ( _dfgd ) == 0 { continue ; } ; _bcef := _afgg ( _dfgd ) ;
// Next line: _cgd creates a wordBag holding the single given word, filed under the key
// _fdgf(word._accb); the bag takes the word's rectangle and _abcc value, plus the supplied
// float and the two ruling lists.
_bfageb [ _dfda ] = _bcef ; if _dedc { _efc . Printf ( "\u0020\u0020\u0020\u0025\u0032\u0064\u003a\u0020\u00256\u002e\u0032\u0066\u000a" , _dfda , _bcef ) ; } ; } ; return _bfageb ; } ; func _cgd ( _gcaa * textWord , _aegaf float64 , _decb , _abgc rulingList ) * wordBag { _fbgg := _fdgf ( _gcaa . _accb ) ;
// Next line: (*wordBag).absorb pulls every word of the argument bag into the receiver via pullWord,
// recording removals obtained from makeRemovals and applying them to the argument bag afterwards.
_ecdf := [ ] * textWord { _gcaa } ; _fggd := wordBag { _cdbc : map [ int ] [ ] * textWord { _fbgg : _ecdf } , PdfRectangle : _gcaa . PdfRectangle , _cdac : _gcaa . _abcc , _ecba : _aegaf , _dcaa : _decb , _bfcg : _abgc } ; return & _fggd ; } ; func ( _ddbb * wordBag ) absorb ( _fdde * wordBag ) { _bbggb := _fdde . makeRemovals ( ) ;
// Next line: _aefg (logged as "makeFillRulings") first passes the sections to _abbaa, then turns
// every quadrilateral subpath into a ruling with makeRectRuling(section colour). Non-quadrilateral
// subpaths and subpaths for which makeRectRuling reports failure are dropped. Note that the
// failure branch logs under the _fccf flag while the rest of the function logs under _gdeb.
for _cbff , _aabce := range _fdde . _cdbc { for _ , _edbd := range _aabce { _ddbb . pullWord ( _edbd , _cbff , _bbggb ) ; } ; } ; _fdde . applyRemovals ( _bbggb ) ; } ; func _aefg ( _beab [ ] pathSection ) rulingList { _abbaa ( _beab ) ; if _gdeb { _ag . Log . Info ( "\u006da\u006b\u0065\u0046\u0069l\u006c\u0052\u0075\u006c\u0069n\u0067s\u003a \u0025\u0064\u0020\u0066\u0069\u006c\u006cs" , len ( _beab ) ) ;
} ; var _dfeef rulingList ; for _ , _bbdcg := range _beab { for _ , _ceedd := range _bbdcg . _bgbeg { if ! _ceedd . isQuadrilateral ( ) { if _gdeb { _ag . Log . Error ( "!\u0069s\u0051\u0075\u0061\u0064\u0072\u0069\u006c\u0061t\u0065\u0072\u0061\u006c: \u0025\u0073" , _ceedd ) ;
} ; continue ; } ; if _aeabg , _cegbe := _ceedd . makeRectRuling ( _bbdcg . Color ) ; _cegbe { _dfeef = append ( _dfeef , _aeabg ) ; } else { if _fccf { _ag . Log . Error ( "\u0021\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0073" , _ceedd ) ;
} ; } ; } ; } ; if _gdeb { _ag . Log . Info ( "\u006d\u0061\u006b\u0065Fi\u006c\u006c\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0073" , _dfeef . String ( ) ) ; } ; return _dfeef ; } ;
2024-04-30 12:24:05 +00:00
// Marks returns the TextMark collection for a page. It represents all the text on the page.
2024-05-29 17:04:37 +00:00
// The returned TextMarkArray wraps the marks slice held in the _gdbf field; the
// slice is shared with the page text, not copied.
func (pt PageText) Marks() *TextMarkArray {
	marks := TextMarkArray{_aec: pt._gdbf}
	return &marks
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// ExtractPageImages returns the image contents of the page extractor, including data
// and position, size information for each image.
// A set of options to control page image extraction can be passed in. The options
// parameter can be nil for the default options. By default, inline stencil masks
// are not extracted.
//
// Any error reported while walking the content stream is returned with a nil result.
func (e *Extractor) ExtractPageImages(options *ImageExtractOptions) (*PageImages, error) {
	ctx := &imageExtractContext{_dac: options}
	if err := ctx.extractContentStreamImages(e._fgb, e._cf); err != nil {
		return nil, err
	}
	return &PageImages{Images: ctx._cgg}, nil
}

// _bbaf builds a ruling of kind _eeg from the top edge of `rect`: its _aeef
// coordinate is rect.Ury and its range runs from rect.Llx to rect.Urx.
func _bbaf(rect _af.PdfRectangle) *ruling {
	top := ruling{_ecfb: _eeg, _aeef: rect.Ury, _ggdb: rect.Llx, _gbca: rect.Urx}
	return &top
}

// getListLines returns the lines of the para for which _debe accepts the first
// byte of the first word's text, followed by the lines returned by _bfeb for
// the whole para. It panics on a line without words or with an empty first word.
func (p *textPara) getListLines() []*textLine {
	var marked []*textLine
	rest := _bfeb(p._aage)
	for _, line := range p._aage {
		if lead := line._cfcb[0]._ccbcc[0]; _debe(lead) {
			marked = append(marked, line)
		}
	}
	return append(marked, rest...)
}

// inDiacriticArea reports whether `m` sits close enough to `other`: the summed
// offsets of the left and right edges must be below Width()*_bbcc and the
// offset of the bottom edges below Height()*_bbcc, both measured on `m`.
func (m *textMark) inDiacriticArea(other *textMark) bool {
	dLeft := m.Llx - other.Llx
	dRight := m.Urx - other.Urx
	dBottom := m.Lly - other.Lly
	if _ea.Abs(dLeft+dRight) >= m.Width()*_bbcc {
		return false
	}
	return _ea.Abs(dBottom) < m.Height()*_bbcc
}

// _bbbee classifies `rect` as a ruling: _eeg when it is wider than tall and at
// least _abca wide, _gecdf when it is not wider than tall and at least _abca
// tall, and _ceag otherwise.
func _bbbee(rect _af.PdfRectangle) rulingKind {
	width := rect.Width()
	height := rect.Height()
	switch wider := width > height; {
	case wider && width >= _abca:
		return _eeg
	case !wider && height >= _abca:
		return _gecdf
	}
	return _ceag
}

// bounded is implemented by anything that can report its bounding box.
type bounded interface {
	bbox() _af.PdfRectangle
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// ToText returns the page text as a single string.
//
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
// Text() instead.
func (pt PageText) ToText() string {
	text := pt.Text()
	return text
}

// _ecbg is the license gate used by ExtractPageText. It returns nil when a
// license key is present and reports IsLicensed(), or when the _gd flag is set.
// Otherwise it prints an "Unlicensed copy of UniPDF" notice and a pointer to
// the trial license page on standard output and returns the error
// "unipdf license code required". The PageText argument is not used.
func _ecbg(_ *PageText) error {
	key := _d.GetLicenseKey()
	licensed := key != nil && key.IsLicensed()
	if licensed || _gd {
		return nil
	}
	_efc.Printf("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a")
	_efc.Println("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f")
	return _b.New("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064")
}

// clearPath drops the accumulated subpaths (_baca) and clears the _gbee flag.
// With the _cece debug flag set, the resulting state is logged as "CLEAR: ss=%s".
func (ss *shapesState) clearPath() {
	ss._baca, ss._gbee = nil, false
	if !_cece {
		return
	}
	_ag.Log.Info("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073", ss)
}
2024-04-30 12:24:05 +00:00
// String returns a description of `state`.
2024-05-29 17:04:37 +00:00
// The decoded format is `tc=%.2f tw=%.2f tfs=%.2f font=%q`, filled from the
// _fdad, _febe and _gbbgg fields and the base font name of _fgfgb, or
// "[NOT SET]" when no font is attached.
func (state *textState) String() string {
	fontName := "\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]"
	if font := state._fgfgb; font != nil {
		fontName = font.BaseFont()
	}
	return _efc.Sprintf("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071", state._fdad, state._febe, state._gbbgg, fontName)
}

// secMinMax returns the smallest _ggdb and the largest _gbca over all rulings
// in the list. The list must not be empty; an empty list panics.
func (rl rulingList) secMinMax() (float64, float64) {
	lo, hi := rl[0]._ggdb, rl[0]._gbca
	for i := 1; i < len(rl); i++ {
		r := rl[i]
		if r._ggdb < lo {
			lo = r._ggdb
		}
		if r._gbca > hi {
			hi = r._gbca
		}
	}
	return lo, hi
}

// _fgcbe removes the first occurrence of `target` (compared by pointer) from
// `words` via _adfc. If `target` is not present an error is logged and nil is
// returned, i.e. the caller receives no words at all in that case.
func _fgcbe(words []*textWord, target *textWord) []*textWord {
	for i := range words {
		if words[i] == target {
			return _adfc(words, i)
		}
	}
	_ag.Log.Error("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073", target)
	return nil
}

// showText forwards its arguments unchanged to renderText.
func (to *textObject) showText(obj _gf.PdfObject, data []byte, n int) error {
	err := to.renderText(obj, data, n)
	return err
}

// bbox returns the smallest rectangle covering every point of every subpath in
// the section. The first point of the first subpath seeds the rectangle, so the
// section must contain at least one subpath with at least one point.
func (ps pathSection) bbox() _af.PdfRectangle {
	seed := ps._bgbeg[0]._aaebg[0]
	box := _af.PdfRectangle{Llx: seed.X, Urx: seed.X, Lly: seed.Y, Ury: seed.Y}
	for i, sp := range ps._bgbeg {
		points := sp._aaebg
		if i == 0 {
			// The seed point is already accounted for.
			points = points[1:]
		}
		for _, p := range points {
			switch {
			case p.X < box.Llx:
				box.Llx = p.X
			case p.X > box.Urx:
				box.Urx = p.X
			}
			switch {
			case p.Y < box.Lly:
				box.Lly = p.Y
			case p.Y > box.Ury:
				box.Ury = p.Y
			}
		}
	}
	return box
}

// appendWord adds `word` to the end of the line, merges the word's rectangle
// into the line's rectangle with _cfab, and raises the line's _afeb and _addd
// to the word's _abcc and _accb when those are larger.
func (line *textLine) appendWord(word *textWord) {
	line._cfcb = append(line._cfcb, word)
	line.PdfRectangle = _cfab(line.PdfRectangle, word.PdfRectangle)
	if line._afeb < word._abcc {
		line._afeb = word._abcc
	}
	if line._addd < word._accb {
		line._addd = word._accb
	}
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of `tm`.
//
// The decoded format is `%.2f fontsize=%.2f "%s"`: the mark's rectangle, its
// _acaf value and its text (_cgeb) in double quotes.
func (tm *textMark) String() string {
	return _efc.Sprintf("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022", tm.PdfRectangle, tm._acaf, tm._cgeb)
}

// _ccga reports whether every mark of every word in `line` carries the same
// _ffbdg value. A line without marks yields true.
//
// The value -1 doubles as the "nothing seen yet" sentinel, so a mark whose
// _ffbdg is -1 never establishes the reference value; this matches the
// previous behaviour.
//
// The function returns as soon as the first mismatch is found. The earlier
// version only broke out of the inner loop and kept scanning the remaining
// words even though the result could no longer change.
func _ccga(line *textLine) bool {
	reference := -1
	for _, word := range line._cfcb {
		for _, mark := range word._ffcd {
			current := mark._ffbdg
			if reference == -1 {
				reference = current
				continue
			}
			if reference != current {
				return false
			}
		}
	}
	return true
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// String returns a description of `l`.
//
// The decoded format is `%.2f %6.2f fontsize=%.2f "%s"`: the line's _addd
// value, its rectangle, its _afeb value and its text.
func (l *textLine) String() string {
	return _efc.Sprintf("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022", l._addd, l.PdfRectangle, l._afeb, l.text())
}

// textPara is a paragraph: a rectangle plus the text lines it contains
// (_aage), an optional table (_befe) and links to other paragraphs.
type textPara struct {
	_af.PdfRectangle
	_gbgbb                         _af.PdfRectangle
	_aage                          []*textLine
	_befe                          *textTable
	_fcdcf                         bool
	_bdgc                          bool
	_caagd, _aggd, _ecdfc, _cabda *textPara
	_fadbb                         []list
}

// _gaeb renders an image mask as an RGBA image of the same dimensions: pixels
// whose R+G+B sum is zero are painted with `fill`, all others are transparent.
// Pixels whose colour cannot be read are logged at debug level and left at the
// RGBA zero value.
func _gaeb(mask *_af.Image, fill _fe.Color) _ef.Image {
	width, height := int(mask.Width), int(mask.Height)
	out := _ef.NewRGBA(_ef.Rect(0, 0, width, height))
	for y := 0; y < height; y++ {
		for x := 0; x < width; x++ {
			sample, err := mask.ColorAt(x, y)
			if err != nil {
				_ag.Log.Debug("\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e", x, y)
				continue
			}
			var pixel _fe.Color = _fe.Transparent
			if r, g, b, _ := sample.RGBA(); r+g+b == 0 {
				pixel = fill
			}
			out.Set(x, y, pixel)
		}
	}
	return out
}

// numBorders counts how many of the tile's four border flags are set.
func (tile gridTile) numBorders() int {
	count := 0
	for _, present := range [...]bool{tile._gceeb, tile._gdcbg, tile._dbafa, tile._ffdf} {
		if present {
			count++
		}
	}
	return count
}

// _cbcee passes the absolute x and y distances between the two points to
// _gdcgf and returns its verdict.
func _cbcee(a, b _aae.Point) bool {
	dx, dy := _ea.Abs(a.X-b.X), _ea.Abs(a.Y-b.Y)
	return _gdcgf(dx, dy)
}

// getCellInfo locates the first cell whose Marks contain `mark` and returns
// [][]int{{i}, {j}}, where i indexes Cells and j indexes Cells[i]. It returns
// nil when no cell holds the mark.
func (tt TextTable) getCellInfo(mark TextMark) [][]int {
	for i, cells := range tt.Cells {
		for j := range cells {
			if marks := &cells[j].Marks; marks.exists(mark) {
				return [][]int{{i}, {j}}
			}
		}
	}
	return nil
}

// _bdcb reports whether both _gfb and _dfba hold for the two rectangles.
// _gfb is only consulted first; _dfba is skipped when it fails.
func _bdcb(a, b _af.PdfRectangle) bool {
	if !_gfb(a, b) {
		return false
	}
	return _dfba(a, b)
}

// _adfc removes the element at `idx` by shifting the tail left in place; the
// returned slice shares the backing array of `words` and is one element shorter.
func _adfc(words []*textWord, idx int) []*textWord {
	last := len(words) - 1
	copy(words[idx:], words[idx+1:])
	return words[:last]
}

// bbox returns the mark's rectangle.
func (tm *textMark) bbox() _af.PdfRectangle {
	return tm.PdfRectangle
}

// _fcegd reports whether `s` consists solely of Unicode white space. The empty
// string counts as all-space.
func _fcegd(s string) bool {
	for _, r := range s {
		if _fg.IsSpace(r) {
			continue
		}
		return false
	}
	return true
}

// removeDuplicates collapses runs of consecutive points that _bgfgg considers
// equal, keeping the first point of each run. An empty subpath is left as is.
func (sp *subpath) removeDuplicates() {
	points := sp._aaebg
	if len(points) == 0 {
		return
	}
	kept := []_aae.Point{points[0]}
	for _, p := range points[1:] {
		if prev := kept[len(kept)-1]; _bgfgg(p, prev) {
			continue
		}
		kept = append(kept, p)
	}
	sp._aaebg = kept
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// Len returns the number of TextMarks in `ma`.
// A nil receiver yields 0.
//
// Also on the next line: _fbce returns the negated Lly of its argument's bounding box.
// _bebec returns the indexes 0..n-1 ordered by the supplied comparison function, which is
// called with index values (not positions in the slice being sorted).
func ( _fbb * TextMarkArray ) Len ( ) int { if _fbb == nil { return 0 ; } ; return len ( _fbb . _aec ) ; } ; func _fbce ( _ebeab bounded ) float64 { return - _ebeab . bbox ( ) . Lly } ; func _bebec ( _acca int , _aagfc func ( int , int ) bool ) [ ] int { _abfeb := make ( [ ] int , _acca ) ; for _ecbbc := range _abfeb { _abfeb [ _ecbbc ] = _ecbbc ;
// Next line: rulingList.aligned is false for fewer than two rulings. Otherwise each ruling after
// the first either bumps the counter of one already-registered ruling it gridIntersects, or is
// registered itself with a zero counter. The result is true when the share of rulings left with
// a zero counter is at most 1 - _bbce.
} ; _e . Slice ( _abfeb , func ( _fefaf , _bcbgc int ) bool { return _aagfc ( _abfeb [ _fefaf ] , _abfeb [ _bcbgc ] ) } ) ; return _abfeb ; } ; func ( _cdgb rulingList ) aligned ( ) bool { if len ( _cdgb ) < 2 { return false ; } ; _cgcff := make ( map [ * ruling ] int ) ; _cgcff [ _cdgb [ 0 ] ] = 0 ;
// NOTE(review): the inner loop ranges over a map, so when a ruling intersects more than one
// registered ruling, which counter is incremented depends on Go's randomized map iteration
// order; the unmatched count below may therefore differ between runs - confirm whether that matters.
for _ , _bgbb := range _cdgb [ 1 : ] { _gdcd := false ; for _afdfc := range _cgcff { if _bgbb . gridIntersecting ( _afdfc ) { _cgcff [ _afdfc ] ++ ; _gdcd = true ; break ; } ; } ; if ! _gdcd { _cgcff [ _bgbb ] = 0 ; } ; } ; _adgca := 0 ; for _ , _cfbef := range _cgcff { if _cfbef == 0 { _adgca ++ ;
} ; } ; _ffacb := float64 ( _adgca ) / float64 ( len ( _cdgb ) ) ; _ecgcd := _ffacb <= 1.0 - _bbce ; if _gdeb { _ag . Log . Info ( "\u0061\u006c\u0069\u0067\u006e\u0065\u0064\u003d\u0025\u0074\u0020\u0075\u006em\u0061\u0074\u0063\u0068\u0065\u0064=\u0025\u002e\u0032\u0066\u003d\u0025\u0064\u002f\u0025\u0064\u0020\u0076\u0065c\u0073\u003d\u0025\u0073" , _ecgcd , _ffacb , _adgca , len ( _cdgb ) , _cdgb . String ( ) ) ;
// Next line: intSet.has reports membership. paraList.merge sorts the paras with sortReadingOrder
// and combines them into one para built by _adbde from the union of their rectangles (_cfab) and
// the concatenation of their lines; an empty list yields nil.
} ; return _ecgcd ; } ; func ( _afcf intSet ) has ( _cgbf int ) bool { _ , _ffdb := _afcf [ _cgbf ] ; return _ffdb } ; func ( _cbed paraList ) merge ( ) * textPara { _ag . Log . Trace ( "\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d" , len ( _cbed ) ) ;
// NOTE(review): _gcbad starts as the first para's own _aage slice and is then appended to, so
// spare capacity in that slice's backing array (if any) is overwritten - confirm no other slice
// shares that array.
if len ( _cbed ) == 0 { return nil ; } ; _cbed . sortReadingOrder ( ) ; _bgcg := _cbed [ 0 ] . PdfRectangle ; _gcbad := _cbed [ 0 ] . _aage ; for _ , _cbdbg := range _cbed [ 1 : ] { _bgcg = _cfab ( _bgcg , _cbdbg . PdfRectangle ) ; _gcbad = append ( _gcbad , _cbdbg . _aage ... ) ; } ; return _adbde ( _bgcg , _gcbad ) ;
// Next line: compositeCell.hasLines reports whether _bdcb holds between the cell's rectangle and
// the rectangle of at least one of the given lines; with _dedc set, each test is printed.
} ; func ( _dgbc compositeCell ) hasLines ( _ccffd [ ] * textLine ) bool { for _agdab , _eaba := range _ccffd { _agdabf := _bdcb ( _dgbc . PdfRectangle , _eaba . PdfRectangle ) ; if _dedc { _efc . Printf ( "\u0020\u0020\u0020\u0020\u0020\u0020\u005e\u005e\u005e\u0069\u006e\u0074\u0065\u0072\u0073e\u0063t\u0073\u003d\u0025\u0074\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u000a" , _agdabf , _agdab , len ( _ccffd ) ) ;
_efc . Printf ( "\u0020\u0020\u0020\u0020 \u005e\u005e\u005e\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025s\u000a" , _dgbc ) ; _efc . Printf ( "\u0020 \u0020 \u0020\u0020\u0020\u006c\u0069\u006e\u0065\u003d\u0025\u0073\u000a" , _eaba ) ; } ; if _agdabf { return true ;
// Next line: (*textWord).bbox returns the word's rectangle. _dbddc collects, from the elements
// three levels below the given structElement, those whose _dccda field equals "L"; the result is
// an empty (non-nil) slice when none match.
} ; } ; return false ; } ; func ( _ffae * textWord ) bbox ( ) _af . PdfRectangle { return _ffae . PdfRectangle } ; func _dbddc ( _effff structElement ) [ ] structElement { _cfba := [ ] structElement { } ; for _ , _gbdg := range _effff . _befc { for _ , _dbae := range _gbdg . _befc { for _ , _cced := range _dbae . _befc { if _cced . _dccda == "\u004c" { _cfba = append ( _cfba , _cced ) ;
} ; } ; } ; } ; return _cfba ; } ;
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// Text gets the extracted text contained in `l`.
//
// The text is assembled by _fecd into a fresh builder, starting with an empty
// prefix string.
func (l *list) Text() string {
	var prefix string
	sb := &_a.Builder{}
	_fecd(l, sb, &prefix)
	return sb.String()
}

// _gfae formats `corridors` as "{k: v, k: v, ...}" with keys in the order
// returned by _gaccd and values printed with two decimals.
func _gfae(corridors map[int][]float64) string {
	keys := _gaccd(corridors)
	parts := make([]string, len(corridors))
	for i, key := range keys {
		parts[i] = _efc.Sprintf("\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066", key, corridors[key])
	}
	joined := _a.Join(parts, "\u002c\u0020")
	return _efc.Sprintf("\u007b\u0025\u0073\u007d", joined)
}

// _gfgde starts a new text line from the word returned by bag.firstWord for
// the given index: the line takes that word's rectangle, _abcc and _accb, and
// the word is then moved out of the bag with pullWord.
func _gfgde(bag *wordBag, depthIdx int) *textLine {
	first := bag.firstWord(depthIdx)
	line := &textLine{
		PdfRectangle: first.PdfRectangle,
		_afeb:        first._abcc,
		_addd:        first._accb,
	}
	line.pullWord(bag, first, depthIdx)
	return line
}

// toTextTable converts the internal table into the exported TextTable: a
// _cegga x _aageb grid (H rows of W cells) where every populated cell gets its
// text and its text marks. Mark offsets restart at 0 for every cell. Cells for
// which get returns nil are left at their zero value.
func (t *textTable) toTextTable() TextTable {
	if _dedc {
		_ag.Log.Info("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064", t._aageb, t._cegga)
	}
	cells := make([][]TableCell, t._cegga)
	for y := 0; y < t._cegga; y++ {
		row := make([]TableCell, t._aageb)
		cells[y] = row
		for x := 0; x < t._aageb; x++ {
			cell := t.get(x, y)
			if cell == nil {
				continue
			}
			if _dedc {
				_efc.Printf("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a", x, y, cell)
			}
			row[x].Text = cell.text()
			offset := 0
			row[x].Marks._aec = cell.toTextMarks(&offset)
		}
	}
	out := TextTable{W: t._aageb, H: t._cegga, Cells: cells}
	out.PdfRectangle = t.bbox()
	return out
}

// ruling is a line segment described by its kind (_ecfb), the kind of mark it
// came from (_agff), a colour, a primary coordinate (_aeef), a secondary range
// (_ggdb to _gbca) and a width (_faba; String omits it when zero).
type ruling struct {
	_ecfb rulingKind
	_agff markKind
	_fe.Color
	_aeef, _ggdb, _gbca, _faba float64
}

// depthIndexes returns the keys of the bag's word map in ascending order, or
// nil when the bag holds no words.
func (b *wordBag) depthIndexes() []int {
	count := len(b._cdbc)
	if count == 0 {
		return nil
	}
	indexes := make([]int, 0, count)
	for idx := range b._cdbc {
		indexes = append(indexes, idx)
	}
	_e.Ints(indexes)
	return indexes
}
2024-04-30 12:24:05 +00:00
2024-05-29 17:04:37 +00:00
// PageFonts represents extracted fonts on a PDF page.
//
// Also on the next line: (*textLine).bbox returns the line's rectangle. rulingList.blocks reports
// whether some ruling of the list lies between two given rulings: it is false when the two
// rulings' [_ggdb, _gbca] ranges do not overlap; otherwise the two are ordered by _aeef and the
// result is true if a list member has its _aeef between them (tolerance _cggd) and its range
// overlaps the range common to both.
type PageFonts struct { Fonts [ ] Font ; } ; func ( _cgbb * textLine ) bbox ( ) _af . PdfRectangle { return _cgbb . PdfRectangle } ; func ( _aegbf rulingList ) blocks ( _gdcbb , _fgcf * ruling ) bool { if _gdcbb . _ggdb > _fgcf . _gbca || _fgcf . _ggdb > _gdcbb . _gbca { return false ;
} ; _cbebf := _ea . Max ( _gdcbb . _ggdb , _fgcf . _ggdb ) ; _ffafc := _ea . Min ( _gdcbb . _gbca , _fgcf . _gbca ) ; if _gdcbb . _aeef > _fgcf . _aeef { _gdcbb , _fgcf = _fgcf , _gdcbb ; } ; for _ , _fggc := range _aegbf { if _gdcbb . _aeef <= _fggc . _aeef + _cggd && _fggc . _aeef <= _fgcf . _aeef + _cggd && _fggc . _ggdb <= _ffafc && _cbebf <= _fggc . _gbca { return true ;
// Next line: (*ruling).equals compares kind exactly and _aeef, _ggdb, _gbca through _edddc.
// (*shapesState).devicePoint transforms (x, y) with the matrix _gdec.Mult(_cffc).
} ; } ; return false ; } ; func ( _fddae * ruling ) equals ( _cddc * ruling ) bool { return _fddae . _ecfb == _cddc . _ecfb && _edddc ( _fddae . _aeef , _cddc . _aeef ) && _edddc ( _fddae . _ggdb , _cddc . _ggdb ) && _edddc ( _fddae . _gbca , _cddc . _gbca ) ; } ; func ( _ccce * shapesState ) devicePoint ( _gcbb , _gcea float64 ) _aae . Point { _gcc := _ccce . _gdec . Mult ( _ccce . _cffc ) ;
// Next line: (*textObject).setHorizScaling stores the value in _ecff._dba and is a no-op on a nil
// receiver. (*shapesState).drawRectangle appends a closed four-corner subpath starting at (x, y)
// with the given width and height; with _cece set, the device-space rectangle is logged first.
_gcbb , _gcea = _gcc . Transform ( _gcbb , _gcea ) ; return _aae . NewPoint ( _gcbb , _gcea ) ; } ; func ( _aabg * textObject ) setHorizScaling ( _bbcd float64 ) { if _aabg == nil { return ; } ; _aabg . _ecff . _dba = _bbcd ; } ; func ( _agea * shapesState ) drawRectangle ( _adbd , _acdcf , _cfga , _adbe float64 ) { if _cece { _eaa := _agea . devicePoint ( _adbd , _acdcf ) ;
_gcee := _agea . devicePoint ( _adbd + _cfga , _acdcf + _adbe ) ; _fcfa := _af . PdfRectangle { Llx : _eaa . X , Lly : _eaa . Y , Urx : _gcee . X , Ury : _gcee . Y } ; _ag . Log . Info ( "d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066" , _fcfa ) ;
// Next line: _caegg builds one *list of kind "bullet" per line stored under the first key returned
// by _ffadg for the map. For each such line it gathers nested lists via _bggb (only when more keys
// exist) and continuation lines via _ggfca, bounded by the _addd of the next line of the same
// group or of the first nested list's first line; -1.0 is passed when neither exists.
// NOTE(review): statement order matters here (the bound is overwritten after the nested lists
// are built), so the body is documented rather than restructured.
} ; _agea . newSubPath ( ) ; _agea . moveTo ( _adbd , _acdcf ) ; _agea . lineTo ( _adbd + _cfga , _acdcf ) ; _agea . lineTo ( _adbd + _cfga , _acdcf + _adbe ) ; _agea . lineTo ( _adbd , _acdcf + _adbe ) ; _agea . closePath ( ) ; } ; func _caegg ( _fbbd [ ] * textLine , _cagb map [ float64 ] [ ] * textLine ) [ ] * list { _fdgg := _ffadg ( _cagb ) ;
_caec := [ ] * list { } ; if len ( _fdgg ) == 0 { return _caec ; } ; _fafc := _fdgg [ 0 ] ; _agfe := 1 ; _baba := _cagb [ _fafc ] ; for _gcfg , _cgfa := range _baba { var _bdgd float64 ; _abgg := [ ] * list { } ; _gcfgc := _cgfa . _addd ; _agdgc := - 1.0 ; if _gcfg < len ( _baba ) - 1 { _agdgc = _baba [ _gcfg + 1 ] . _addd ;
} ; if _agfe < len ( _fdgg ) { _abgg = _bggb ( _fbbd , _cagb , _fdgg , _agfe , _gcfgc , _agdgc ) ; } ; _bdgd = _agdgc ; if len ( _abgg ) > 0 { _fcfad := _abgg [ 0 ] ; if len ( _fcfad . _fged ) > 0 { _bdgd = _fcfad . _fged [ 0 ] . _addd ; } ; } ; _fcfcb := [ ] * textLine { _cgfa } ; _bgcca := _ggfca ( _cgfa , _fbbd , _fdgg , _gcfgc , _bdgd ) ;
// Next line: textMark is a single piece of text on the page: its rectangle, text (_cgeb, printed by
// String), font, the _acaf value printed as "fontsize", and the _ffbdg value compared by _ccga.
_fcfcb = append ( _fcfcb , _bgcca ... ) ; _dddb := _abda ( _fcfcb , "\u0062\u0075\u006c\u006c\u0065\u0074" , _abgg ) ; _dddb . _cbda = _ebgc ( _fcfcb , "" ) ; _caec = append ( _caec , _dddb ) ; } ; return _caec ; } ; type textMark struct { _af . PdfRectangle ; _ddfdb int ; _cgeb string ;
_faaf string ; _dbgfg * _af . PdfFont ; _acaf float64 ; _bdeb float64 ; _acgb _aae . Matrix ; _efab _aae . Point ; _fbaa _af . PdfRectangle ; _ccbff _fe . Color ; _bcfc _fe . Color ; _bae _gf . PdfObject ; _afcb [ ] string ; Tw float64 ; Th float64 ; _ffbdg int ; _feg int ; } ;
// removeDuplicates sorts the list in place with sort() and returns a new list in which each
// ruling that equals() the previously kept one is dropped. An empty list yields nil.
func ( _begad rulingList ) removeDuplicates ( ) rulingList { if len ( _begad ) == 0 { return nil ; } ; _begad . sort ( ) ; _accd := rulingList { _begad [ 0 ] } ; for _ , _ffadd := range _begad [ 1 : ] { if _ffadd . equals ( _accd [ len ( _accd ) - 1 ] ) { continue ; } ; _accd = append ( _accd , _ffadd ) ;
// Next line: rulingList.sortStrict sorts in place by kind (descending), then by _aeef (ascending,
// skipped when _cdaea accepts the difference), then by _ggdb, then by _gbca.
} ; return _accd ; } ; func ( _cbce rulingList ) sortStrict ( ) { _e . Slice ( _cbce , func ( _cdcf , _gbce int ) bool { _addg , _gcbe := _cbce [ _cdcf ] , _cbce [ _gbce ] ; _fdeaf , _gbag := _addg . _ecfb , _gcbe . _ecfb ; if _fdeaf != _gbag { return _fdeaf > _gbag ; } ; _eccge , _ebfca := _addg . _aeef , _gcbe . _aeef ;
if ! _cdaea ( _eccge - _ebfca ) { return _eccge < _ebfca ; } ; _eccge , _ebfca = _addg . _ggdb , _gcbe . _ggdb ; if _eccge != _ebfca { return _eccge < _ebfca ; } ; return _addg . _gbca < _gcbe . _gbca ; } ) ; } ;