2020-08-27 21:45:09 +00:00
//
// Copyright 2020 FoxyUtils ehf. All rights reserved.
//
// This is a commercial product and requires a license to operate.
// A trial license can be obtained at https://unidoc.io
//
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
//
// Use of this source code is governed by the UniDoc End User License Agreement
// terms that can be accessed at https://unidoc.io/eula/
2018-03-22 14:03:47 +00:00
2020-08-27 21:45:09 +00:00
//
// Package extractor is used for quickly extracting PDF content through a simple interface.
// Currently offers functionality for extracting textual content and images.
//
2023-07-28 12:14:31 +00:00
package extractor ; import ( _dfe "bytes" ; _d "errors" ; _ce "fmt" ; _b "github.com/unidoc/unipdf/v3/common" ; _fb "github.com/unidoc/unipdf/v3/contentstream" ; _ea "github.com/unidoc/unipdf/v3/core" ; _gb "github.com/unidoc/unipdf/v3/internal/license" ; _aa "github.com/unidoc/unipdf/v3/internal/textencoding" ;
_gab "github.com/unidoc/unipdf/v3/internal/transform" ; _bg "github.com/unidoc/unipdf/v3/model" ; _ec "golang.org/x/image/draw" ; _dg "golang.org/x/text/unicode/norm" ; _ae "golang.org/x/xerrors" ; _fa "image" ; _ag "image/color" ; _ga "io" ; _ef "math" ; _e "reflect" ; _gg "regexp" ;
_df "sort" ; _c "strings" ; _f "unicode" ; _a "unicode/utf8" ; ) ; func ( _fedg * wordBag ) getDepthIdx ( _eebg float64 ) int { _aaca := _fedg . depthIndexes ( ) ; _edf := _ebfc ( _eebg ) ; if _edf < _aaca [ 0 ] { return _aaca [ 0 ] ; } ; if _edf > _aaca [ len ( _aaca ) - 1 ] { return _aaca [ len ( _aaca ) - 1 ] ;
} ; return _edf ; } ; func _cffd ( _bfedd , _acdbg int ) int { if _bfedd < _acdbg { return _bfedd ; } ; return _acdbg ; } ; func ( _fabee * subpath ) last ( ) _gab . Point { return _fabee . _fbcgf [ len ( _fabee . _fbcgf ) - 1 ] } ; var _daed = map [ markKind ] string { _bafga : "\u0073\u0074\u0072\u006f\u006b\u0065" , _ceag : "\u0066\u0069\u006c\u006c" , _abgg : "\u0061u\u0067\u006d\u0065\u006e\u0074" } ;
2023-02-07 17:17:49 +00:00
2023-04-06 19:57:40 +00:00
2023-07-28 12:14:31 +00:00
// String returns a human readable description of `path`.
// Subpaths with 5 or fewer points are formatted in full ("%d: %6.2f" with the whole point
// slice); longer ones show the point count, the first two points and the last point.
func ( _fcg * subpath ) String ( ) string { _gbgb := _fcg . _fbcgf ; _edbb := len ( _gbgb ) ; if _edbb <= 5 { return _ce . Sprintf ( "\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f" , _edbb , _gbgb ) ; } ; return _ce . Sprintf ( "\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f" , _edbb , _gbgb [ 0 ] , _gbgb [ 1 ] , _gbgb [ _edbb - 1 ] ) ;
// scanBand (its definition starts on the next line, right after subpath.String closes)
// walks the words of the receiver bag and returns how many of them were accepted.
//   - _bcgb is not referenced in the body.
//   - _daec is the bag the accepted words are tested against and, unless _ccece is set,
//     pulled into via _daec.pullWord.
//   - _cff is a caller-supplied predicate; a word is skipped when _cff(_daec, word) is false.
//   - _cgcb and _bfdf are the lower and upper limits that a word's _baebb value is tested against.
//   - _cgce, when > 0, is an upper bound on the _ebgb mismatch measure computed below.
//   - _ccece: when true nothing is moved (no removals map, no pullWord) and the inner loop
//     stops after the first accepted word of each depth bin.
//   - _bbfd: when false, _cgcb/_bfdf are widened to include every accepted word's _baebb.
// The tolerance (_aeba, next line) is _gdab times _daec._ecdf as read once on entry.
func ( _fedc * wordBag ) scanBand ( _bcgb string , _daec * wordBag , _cff func ( _bdbd * wordBag , _gaage * textWord ) bool , _cgcb , _bfdf , _cgce float64 , _ccece , _bbfd bool ) int { _effd := _daec . _ecdf ; var _egdg map [ int ] map [ * textWord ] struct { } ; if ! _ccece { _egdg = _fedc . makeRemovals ( ) ;
// The depth bins to visit are taken from depthBand() using the limits as they are at this
// point; the per-word range test inside the loop uses the current (possibly widened) limits.
} ; _aeba := _gdab * _effd ; _aff := 0 ; for _ , _ffgc := range _fedc . depthBand ( _cgcb - _aeba , _bfdf + _aeba ) { if len ( _fedc . _cgdg [ _ffgc ] ) == 0 { continue ; } ; for _ , _fcgc := range _fedc . _cgdg [ _ffgc ] { if ! ( _cgcb - _aeba <= _fcgc . _baebb && _fcgc . _baebb <= _bfdf + _aeba ) { continue ;
// _gdaa is the difference of the two _ebgb values relative to their mean, _ccfc is the larger
// of the two ratios; the smaller of these two measures is compared against _cgce.
// _daec._ecdf is re-read here for every word rather than taken from the entry snapshot.
} ; if ! _cff ( _daec , _fcgc ) { continue ; } ; _gdaa := 2.0 * _ef . Abs ( _fcgc . _ebgb - _daec . _ecdf ) / ( _fcgc . _ebgb + _daec . _ecdf ) ; _ccfc := _ef . Max ( _fcgc . _ebgb / _daec . _ecdf , _daec . _ecdf / _fcgc . _ebgb ) ; _gfdd := _ef . Min ( _gdaa , _ccfc ) ; if _cgce > 0 && _gfdd > _cgce { continue ;
// Words rejected by _daec.blocked are skipped. Pulled words are recorded in the removals map
// (_egdg) and only dropped from the receiver after both loops finish, via applyRemovals.
// The `break` leaves the inner loop only; the remaining depth bins are still visited.
} ; if _daec . blocked ( _fcgc ) { continue ; } ; if ! _ccece { _daec . pullWord ( _fcgc , _ffgc , _egdg ) ; } ; _aff ++ ; if ! _bbfd { if _fcgc . _baebb < _cgcb { _cgcb = _fcgc . _baebb ; } ; if _fcgc . _baebb > _bfdf { _bfdf = _fcgc . _baebb ; } ; } ; if _ccece { break ; } ; } ; } ; if ! _ccece { _fedc . applyRemovals ( _egdg ) ;
// _aeef (defined on the next line, after scanBand returns) builds a gridTile for rectangle
// _bgcf. Each of the four flags is true only when the corresponding ruling is non-nil and its
// encloses() accepts the rectangle's extent: _fdce and _bdaecd are tested against the
// vertical extent (Lly, Ury), _ffge and _ggdc against the horizontal extent (Llx, Urx).
} ; return _aff ; } ; func _aeef ( _bgcf _bg . PdfRectangle , _fdce , _bdaecd , _ffge , _ggdc * ruling ) gridTile { _ebbe := _bgcf . Llx ; _fgfb := _bgcf . Urx ; _egcgc := _bgcf . Lly ; _gdad := _bgcf . Ury ; return gridTile { PdfRectangle : _bgcf , _afdge : _fdce != nil && _fdce . encloses ( _egcgc , _gdad ) , _bfecb : _bdaecd != nil && _bdaecd . encloses ( _egcgc , _gdad ) , _eaed : _ffge != nil && _ffge . encloses ( _ebbe , _fgfb ) , _fdbd : _ggdc != nil && _ggdc . encloses ( _ebbe , _fgfb ) } ;
// structElement is a recursive node type: _efce holds its child structElements and _fgagg
// holds a PDF object. NOTE(review): the meaning of _bfeg and _aeae is not visible in this
// part of the file - confirm against parseStructElement.
} ; type structElement struct { _bfeg string ; _efce [ ] structElement ; _aeae int64 ; _fgagg _ea . PdfObject ; } ;
2023-04-06 19:57:40 +00:00
2023-07-28 12:14:31 +00:00
// String returns a string describing `ma`.
// An array without marks is reported as "EMPTY"; otherwise the output contains the number
// of marks followed by the first and the last mark.
func (_eafg TextMarkArray) String() string {
	marks := _eafg._bca
	count := len(marks)
	if count == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	first, last := marks[0], marks[count-1]
	return _ce.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", count, first, last)
}

// reset restores both matrices of the text object (_fda and _cfec) to the identity matrix
// and clears _fecd.
func (_gbb *textObject) reset() {
	_gbb._fda = _gab.IdentityMatrix()
	_gbb._cfec = _gab.IdentityMatrix()
	_gbb._fecd = nil
}

// _dgdfb maps each rulingKind to its printable name ("none", "horizontal", "vertical").
var _dgdfb = map[rulingKind]string{
	_bgbdg: "\u006e\u006f\u006e\u0065",
	_cefaa: "\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",
	_acgee: "\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",
}
// depthRange returns, in ascending order, the keys of the bag's _cgdg map that lie in the
// closed interval [_dgab, _ggab]. It returns nil when no key qualifies.
func (_ccfe *wordBag) depthRange(_dgab, _ggab int) []int {
	var keys []int
	for key := range _ccfe._cgdg {
		if key < _dgab || key > _ggab {
			continue
		}
		keys = append(keys, key)
	}
	if len(keys) == 0 {
		return nil
	}
	_df.Ints(keys)
	return keys
}

// lineRuling describes a line by its ruling kind, mark kind, colour and two points.
type lineRuling struct {
	_cbfb  rulingKind
	_abeac markKind
	_ag.Color
	_egaf, _eaebf _gab.Point
}

// yNeighbours builds two events per paragraph, one at its lower and one at its upper y limit,
// and passes them to eventNeighbours. When `_daab` is non-zero each limit is moved outwards by
// `_daab` times the paragraph's fontsize().
func (_gedeb paraList) yNeighbours(_daab float64) map[*textPara][]int {
	events := make([]event, 2*len(_gedeb))
	emit := func(i int, low, high float64) {
		events[2*i] = event{low, true, i}
		events[2*i+1] = event{high, false, i}
	}
	for i, para := range _gedeb {
		if _daab == 0 {
			emit(i, para.Lly, para.Ury)
			continue
		}
		emit(i, para.Lly-_daab*para.fontsize(), para.Ury+_daab*para.fontsize())
	}
	return _gedeb.eventNeighbours(events)
}

// aligned reports whether enough of the rulings in the list are matched by another ruling.
// Each ruling is either counted against the first already-seen ruling it gridIntersecting()s
// (in map iteration order) or registered as a new entry. The list is aligned when the share
// of entries that were never matched is at most 1 - _eadb. Lists shorter than 2 are never aligned.
func (_fdcdg rulingList) aligned() bool {
	if len(_fdcdg) < 2 {
		return false
	}
	matches := make(map[*ruling]int)
	matches[_fdcdg[0]] = 0
	for _, candidate := range _fdcdg[1:] {
		found := false
		for seen := range matches {
			if candidate.gridIntersecting(seen) {
				matches[seen]++
				found = true
				break
			}
		}
		if !found {
			matches[candidate] = 0
		}
	}
	unmatched := 0
	for _, hits := range matches {
		if hits == 0 {
			unmatched++
		}
	}
	fraction := float64(unmatched) / float64(len(_fdcdg))
	isAligned := fraction <= 1.0-_eadb
	if _bccgb {
		_b.Log.Info("\u0061\u006c\u0069\u0067\u006e\u0065\u0064\u003d\u0025\u0074\u0020\u0075\u006em\u0061\u0074\u0063\u0068\u0065\u0064=\u0025\u002e\u0032\u0066\u003d\u0025\u0064\u002f\u0025\u0064\u0020\u0076\u0065c\u0073\u003d\u0025\u0073", isAligned, fraction, unmatched, len(_fdcdg), _fdcdg.String())
	}
	return isAligned
}

// subpath is a sequence of points (_fbcgf) plus a flag (_bbdg) that close() sets.
type subpath struct {
	_fbcgf []_gab.Point
	_bbdg  bool
}

// computeText concatenates the _ebgd strings of the word's marks (_dggf) in order.
func (_feea *textWord) computeText() string {
	pieces := make([]string, 0, len(_feea._dggf))
	for _, mark := range _feea._dggf {
		pieces = append(pieces, mark._ebgd)
	}
	return _c.Join(pieces, "")
}

// getStrokeColor converts the stroking colour of the text object's graphics state (_agbf)
// with _badcf, passing the stroking colourspace and the stroking colour.
func (_fefd *textObject) getStrokeColor() _ag.Color {
	colorspace := _fefd._agbf.ColorspaceStroking
	color := _fefd._agbf.ColorStroking
	return _badcf(colorspace, color)
}

// _fbdd is a meta TextMark with text "[X]", original text " " and white fill and stroke colours.
var _fbdd = TextMark{
	Text:        "\u005b\u0058\u005d",
	Original:    "\u0020",
	Meta:        true,
	FillColor:   _ag.White,
	StrokeColor: _ag.White,
}
// log prints a summary of the paragraphs in the list when the _ecdg debug flag is set.
// `_dcaf` is a title used in the header line. For every non-nil paragraph it prints the index,
// the bounding rectangle, the dimensions of an attached table (if any) and the paragraph text
// passed through _dfcggd with an argument of 50.
func (_fcee paraList) log(_dcaf string) {
	if !_ecdg {
		return
	}
	_b.Log.Info("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d", _dcaf, len(_fcee))
	for idx, para := range _fcee {
		if para == nil {
			continue
		}
		text := para.text()
		tableDims := "\u0020\u0020"
		if para._bgba != nil {
			tableDims = _ce.Sprintf("\u005b%\u0064\u0078\u0025\u0064\u005d", para._bgba._ddfc, para._bgba._gcbge)
		}
		_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a", idx, para.PdfRectangle, tableDims, _dfcggd(text, 50))
	}
}

// parseStructTreeRoot fills the receiver from the structure tree root object `_dfgd`.
//
// It reads the "K" entry (an array, or a single reference which is wrapped in a one-element
// array), parses every element with structElement.parseStructElement and stores the results
// in _cegf. The string form of the "Type" entry is stored in _baeb.
//
// A nil `_dfgd` is ignored. If `_dfgd` is not a dictionary the problem is logged and the
// receiver is left untouched; previously execution continued and dereferenced the missing
// dictionary. A missing "Type" entry now yields an empty _baeb instead of calling String()
// on a nil object.
func (_eeffg *structTreeRoot) parseStructTreeRoot(_dfgd _ea.PdfObject) {
	if _dfgd == nil {
		return
	}
	dict, isDict := _ea.GetDict(_dfgd)
	if !isDict {
		_b.Log.Debug("\u0070\u0061\u0072s\u0065\u0053\u0074\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u003a\u0020\u0064\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006eo\u0074\u0020\u0066\u006f\u0075\u006e\u0064\u002e")
		return
	}

	kidsObj := dict.Get("\u004b")
	var typeName string
	if typeObj := dict.Get("\u0054\u0079\u0070\u0065"); typeObj != nil {
		typeName = typeObj.String()
	}

	var kids *_ea.PdfObjectArray
	switch k := kidsObj.(type) {
	case *_ea.PdfObjectArray:
		kids = k
	case *_ea.PdfObjectReference:
		kids = _ea.MakeArray(kidsObj)
	}

	// Kept as a non-nil empty slice when there are no children, as before.
	children := []structElement{}
	if kids != nil {
		for _, kidObj := range kids.Elements() {
			var child structElement
			child.parseStructElement(kidObj)
			children = append(children, child)
		}
	}
	_eeffg._cegf = children
	_eeffg._baeb = typeName
}

// _fgad turns the text marks of a page into a list of paragraphs.
//
// The marks are grouped by _gbeaf, _gaaff, _bccb and _bacb, each resulting group is converted
// with arrangeText (nil results are dropped), and the paragraphs are put into reading order.
// Unless `_cgba` is set, tables are extracted using the tilings `_ccfa` (only when there are at
// least _abda paragraphs) and sortTopoOrder is applied afterwards.
// It returns nil when there are no marks or _gbeaf yields nothing.
func _fgad(_dgcb []*textMark, _eedeg _bg.PdfRectangle, _eeda rulingList, _ccfa []gridTiling, _cgba bool) paraList {
	_b.Log.Trace("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066", len(_dgcb), _eedeg)
	if len(_dgcb) == 0 {
		return nil
	}
	grouped := _gbeaf(_dgcb, _eedeg)
	if len(grouped) == 0 {
		return nil
	}

	_eeda.log("\u006d\u0061\u006be\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065")
	verts, horzs := _eeda.vertsHorzs()
	pageContent := _gaaff(grouped, _eedeg.Ury, verts, horzs)
	parts := _bccb(pageContent, _eedeg.Ury, verts, horzs)
	parts = _bacb(parts)

	paras := make(paraList, 0, len(parts))
	for _, part := range parts {
		if para := part.arrangeText(); para != nil {
			paras = append(paras, para)
		}
	}

	if !_cgba && len(paras) >= _abda {
		paras = paras.extractTables(_ccfa)
	}
	paras.sortReadingOrder()
	if !_cgba {
		paras.sortTopoOrder()
	}
	paras.log("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072")
	return paras
}

// highestWord returns the first word stored under depth index `_cecb` whose _baebb value lies
// in the closed interval [_efc, _cdbb], or nil when there is none.
func (_befe *wordBag) highestWord(_cecb int, _efc, _cdbb float64) *textWord {
	for _, word := range _befe._cgdg[_cecb] {
		if _efc <= word._baebb && word._baebb <= _cdbb {
			return word
		}
	}
	return nil
}

// _afbd returns a textState with _dgc set to 100, the render mode set to RenderModeFill and
// _fagg set to `_bfac`.
func _afbd(_bfac _bg.PdfRectangle) textState {
	return textState{_dgc: 100, _gd: RenderModeFill, _fagg: _bfac}
}

// getListLines returns the paragraph's lines whose first element (the first word's _ggaef[0])
// is accepted by _dbae, followed by the lines returned by _dgga for the same paragraph.
// Lines without words, or whose first word has an empty _ggaef, are skipped instead of
// triggering an index-out-of-range panic.
func (_cbda *textPara) getListLines() []*textLine {
	var listLines []*textLine
	extraLines := _dgga(_cbda._gfbb)
	for _, line := range _cbda._gfbb {
		if len(line._aafd) == 0 || len(line._aafd[0]._ggaef) == 0 {
			continue
		}
		if _dbae(line._aafd[0]._ggaef[0]) {
			listLines = append(listLines, line)
		}
	}
	return append(listLines, extraLines...)
}

// extractXObjectImage records an ImageMark for the image XObject named `_dcg`.
//
// The decoded image and its colourspace are cached in _agf, keyed by the XObject returned by
// GetXObjectByName, so each XObject is decoded only once. If the XObject has a soft mask that
// can be loaded, the mask is combined with the image via _aaeg before caching. The cached
// image is converted to RGB and appended to _dae together with the size, angle and position
// taken from the current transformation matrix of `_egd`; _edc is incremented.
//
// It returns nil without recording anything when the named XObject or image does not exist,
// and an error when the image cannot be loaded or converted. A failure to load the soft mask
// is only logged.
func (_eee *imageExtractContext) extractXObjectImage(_dcg *_ea.PdfObjectName, _egd _fb.GraphicsState, _bfa *_bg.PdfPageResources) error {
	stream, _ := _bfa.GetXObjectByName(*_dcg)
	if stream == nil {
		return nil
	}

	cached, isCached := _eee._agf[stream]
	if !isCached {
		ximg, err := _bfa.GetXObjectImageByName(*_dcg)
		if err != nil {
			return err
		}
		if ximg == nil {
			return nil
		}
		img, err := ximg.ToImage()
		if err != nil {
			return err
		}

		var mask _fa.Image
		if ximg.SMask != nil {
			mask, err = _acgbed(ximg.SMask, _ag.Opaque)
			if err != nil {
				// Best effort: the image is still extracted, just without its mask.
				_b.Log.Debug("W\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0067\u0065\u0074\u0020\u0073\u006f\u0066\u0074\u0020\u0069\u006da\u0067e\u0020\u006d\u0061\u0073k\u002e\u0020O\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063\u0074\u002e")
			}
		}
		if mask != nil {
			goImg, err := img.ToGoImage()
			if err != nil {
				return err
			}
			goImg = _aaeg(goImg, mask)
			switch ximg.ColorSpace.String() {
			case "\u0044\u0065\u0076\u0069\u0063\u0065\u0047\u0072\u0061\u0079", "\u0049n\u0064\u0065\u0078\u0065\u0064":
				img, err = _bg.ImageHandling.NewGrayImageFromGoImage(goImg)
			default:
				img, err = _bg.ImageHandling.NewImageFromGoImage(goImg)
			}
			if err != nil {
				return err
			}
		}

		cached = &cachedImage{_dba: img, _gca: ximg.ColorSpace}
		_eee._agf[stream] = cached
	}

	rgbImg, err := cached._gca.ImageToRGB(*cached._dba)
	if err != nil {
		return err
	}
	_b.Log.Debug("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073", _egd.CTM.String())

	mark := ImageMark{
		Image:  &rgbImg,
		Width:  _egd.CTM.ScalingFactorX(),
		Height: _egd.CTM.ScalingFactorY(),
		Angle:  _egd.CTM.Angle(),
	}
	mark.X, mark.Y = _egd.CTM.Translation()
	_eee._dae = append(_eee._dae, mark)
	_eee._edc++
	return nil
}

// rulingKind identifies the kind of a ruling; its printable names are listed in _dgdfb.
type rulingKind int

// _ggfa concatenates all ruling lists in `_fdbb` and returns the result of vertsHorzs() on
// the combined list.
func _ggfa(_fdbb []rulingList) (rulingList, rulingList) {
	var combined rulingList
	for _, rulings := range _fdbb {
		combined = append(combined, rulings...)
	}
	return combined.vertsHorzs()
}
2023-04-06 19:57:40 +00:00
2023-06-30 13:19:48 +00:00
// String returns a description of `state`.
2023-07-28 12:14:31 +00:00
// The font is shown as "[NOT SET]" while the state has no font (_dgdf is nil); otherwise the
// font's BaseFont() is used. The three numeric fields _fdf, _cagc and _fgca are printed with
// two decimals under the labels tc, tw and tfs.
func (_bccg *textState) String() string {
	fontName := "\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]"
	if font := _bccg._dgdf; font != nil {
		fontName = font.BaseFont()
	}
	return _ce.Sprintf("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071", _bccg._fdf, _bccg._cagc, _bccg._fgca, fontName)
}

// findTableGrid tries to build a table from the tiling `_dgggf` and the paragraphs in the list.
//
// Every tile of the tiling is looked up with inTile; tiles that contain paragraphs become
// composite cells of the table and those paragraphs are collected in the returned set.
// The attempt is abandoned (nil, nil) when more than (1 - _dfecd) of the tiles are empty, or
// when every cell at second index 0 exists and has its _cfga flag set.
// On success the table is passed through reduceTiling and subdivide before being returned.
func (_dfgcaa paraList) findTableGrid(_dgggf gridTiling) (*textTable, map[*textPara]struct{}) {
	// w and h are the lengths of the tiling's two key lists.
	w := len(_dgggf._eaafd)
	h := len(_dgggf._dade)
	table := textTable{
		_edgac: true,
		_ddfc:  w,
		_gcbge: h,
		_efeac: make(map[uint64]*textPara, w*h),
		_dadcc: make(map[uint64]compositeCell, w*h),
	}
	table.PdfRectangle = _dgggf.PdfRectangle
	used := make(map[*textPara]struct{})
	maxEmpty := int((1.0 - _dfecd) * float64(w*h))
	numEmpty := 0
	if _agd {
		_b.Log.Info("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0064\u0020\u0078\u0020%\u0064", w, h)
	}

	for j, outerKey := range _dgggf._dade {
		tiles, haveTiles := _dgggf._cbec[outerKey]
		if !haveTiles {
			continue
		}
		for i, innerKey := range _dgggf._eaafd {
			tile, haveTile := tiles[innerKey]
			if !haveTile {
				continue
			}
			inside := _dfgcaa.inTile(tile)
			if len(inside) != 0 {
				table.putComposite(i, j, inside, tile.PdfRectangle)
				for _, para := range inside {
					used[para] = struct{}{}
				}
				continue
			}
			numEmpty++
			if numEmpty > maxEmpty {
				if _agd {
					_b.Log.Info("\u0021\u006e\u0075m\u0045\u006d\u0070\u0074\u0079\u003d\u0025\u0064", numEmpty)
				}
				return nil, nil
			}
		}
	}

	numHeader := 0
	for i := 0; i < w; i++ {
		if cell := table.get(i, 0); cell == nil || !cell._cfga {
			numHeader++
		}
	}
	if numHeader == 0 {
		if _agd {
			_b.Log.Info("\u0021\u006e\u0075m\u0048\u0065\u0061\u0064\u0065\u0072\u003d\u0030")
		}
		return nil, nil
	}

	result := table.reduceTiling(_dgggf, _feaa)
	result = result.subdivide()
	return result, used
}

// _ggegd converts filled path sections into rulings. After preprocessing `_adff` with _bbcaa,
// every subpath that is a quadrilateral and for which makeRectRuling succeeds contributes one
// ruling; all other subpaths are skipped (and reported when the debug flags are set).
func _ggegd(_adff []pathSection) rulingList {
	_bbcaa(_adff)
	if _bccgb {
		_b.Log.Info("\u006da\u006b\u0065\u0046\u0069l\u006c\u0052\u0075\u006c\u0069n\u0067s\u003a \u0025\u0064\u0020\u0066\u0069\u006c\u006cs", len(_adff))
	}
	var rulings rulingList
	for _, section := range _adff {
		for _, sp := range section._dgfc {
			if !sp.isQuadrilateral() {
				if _bccgb {
					_b.Log.Error("!\u0069s\u0051\u0075\u0061\u0064\u0072\u0069\u006c\u0061t\u0065\u0072\u0061\u006c: \u0025\u0073", sp)
				}
				continue
			}
			r, ok := sp.makeRectRuling(section.Color)
			if !ok {
				if _cfde {
					_b.Log.Error("\u0021\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0073", sp)
				}
				continue
			}
			rulings = append(rulings, r)
		}
	}
	if _bccgb {
		_b.Log.Info("\u006d\u0061\u006b\u0065Fi\u006c\u006c\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0073", rulings.String())
	}
	return rulings
}

// depth returns the smallest depth() among the cells at second index 0 that exist and do not
// have their _cfga flag set. It returns 1e10 when there is no such cell.
func (_bdag *textTable) depth() float64 {
	smallest := 1e10
	for i := 0; i < _bdag._ddfc; i++ {
		cell := _bdag.get(i, 0)
		if cell != nil && !cell._cfga {
			smallest = _ef.Min(smallest, cell.depth())
		}
	}
	return smallest
}

// _ggbg returns a 2x2 table with `_gfbbb` at (0,0), `_daecbe` at (1,0), `_dbdg` at (0,1) and
// `_fcfge` at (1,1).
func _ggbg(_gfbbb, _daecbe, _dbdg, _fcfge *textPara) *textTable {
	table := &textTable{
		_ddfc:  2,
		_gcbge: 2,
		_efeac: make(map[uint64]*textPara, 4),
	}
	table.put(0, 0, _gfbbb)
	table.put(1, 0, _daecbe)
	table.put(0, 1, _dbdg)
	table.put(1, 1, _fcfge)
	return table
}

// showTextAdjusted processes an array of numbers and strings.
//
// A number n moves the matrix _fda by -n * 0.001 * _gacd._fgca along X (by concatenating the
// matrix returned by _ebba); a string is passed to renderText together with `_bcg`.
// It returns the conversion error for a bad number and core.ErrTypeError for a string whose
// bytes cannot be read or for an element of any other type.
func (_gec *textObject) showTextAdjusted(_bccc *_ea.PdfObjectArray, _bcg int) error {
	// Never set to true in this function, so the X/Y swap below is currently inactive.
	vertical := false
	for _, elem := range _bccc.Elements() {
		switch elem.(type) {
		case *_ea.PdfObjectFloat, *_ea.PdfObjectInteger:
			adjust, err := _ea.GetNumberAsFloat(elem)
			if err != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, _bccc)
				return err
			}
			dx, dy := -adjust*0.001*_gec._gacd._fgca, 0.0
			if vertical {
				dy, dx = dx, dy
			}
			shift := _ebba(_gab.Point{X: dx, Y: dy})
			_gec._fda.Concat(shift)
		case *_ea.PdfObjectString:
			strObj := _ea.TraceToDirectObject(elem)
			data, ok := _ea.GetStringBytes(strObj)
			if !ok {
				_b.Log.Trace("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, _bccc)
				return _ea.ErrTypeError
			}
			_gec.renderText(strObj, data, _bcg)
		default:
			_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076", elem, _bccc)
			return _ea.ErrTypeError
		}
	}
	return nil
}

// getFontDict looks up the font named `_bdeac` in the text object's resources (_edef).
// Without resources it returns (nil, nil); when the font is not found it returns an error.
func (_bbccf *textObject) getFontDict(_bdeac string) (_abec _ea.PdfObject, _gcae error) {
	resources := _bbccf._edef
	if resources == nil {
		_b.Log.Debug("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071", _bdeac)
		return nil, nil
	}
	fontObj, found := resources.GetFontByName(_ea.PdfObjectName(_bdeac))
	if !found {
		_b.Log.Debug("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071", _bdeac)
		return nil, _d.New("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073")
	}
	return fontObj, nil
}

// removeDuplicates drops every point that _eaca reports as equal to the point kept just
// before it. An empty subpath is left unchanged.
func (_cbbb *subpath) removeDuplicates() {
	points := _cbbb._fbcgf
	if len(points) == 0 {
		return
	}
	previous := points[0]
	kept := []_gab.Point{previous}
	for _, point := range points[1:] {
		if _eaca(point, previous) {
			continue
		}
		kept = append(kept, point)
		previous = point
	}
	_cbbb._fbcgf = kept
}
2023-06-30 13:19:48 +00:00
// String returns a description of `k`.
2023-07-28 12:14:31 +00:00
// Kinds that are missing from _dgdfb are rendered as "Not a ruling: <number>".
func (_ccgb rulingKind) String() string {
	if name, known := _dgdfb[_ccgb]; known {
		return name
	}
	return _ce.Sprintf("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064", _ccgb)
}

// textResult bundles a PageText with two integer values.
type textResult struct {
	_ddd  PageText
	_aede int
	_bega int
}

// _egbg returns a new subpath whose only point is `_dedg`.
func _egbg(_dedg _gab.Point) *subpath {
	points := []_gab.Point{_dedg}
	return &subpath{_fbcgf: points}
}

// intersects reports whether the two rulings cross: one must be vertical and the other
// horizontal, and each ruling's _befee must lie within the other's [_agbc, _gffgd] range
// widened by _cfgg on both sides.
func (_afaa *ruling) intersects(_cdgca *ruling) bool {
	orthogonal := (_afaa._eabdg == _acgee && _cdgca._eabdg == _cefaa) ||
		(_cdgca._eabdg == _acgee && _afaa._eabdg == _cefaa)
	spans := func(span, other *ruling) bool {
		return span._agbc-_cfgg <= other._befee && other._befee <= span._gffgd+_cfgg
	}
	first := spans(_afaa, _cdgca)
	second := spans(_cdgca, _afaa)
	if _bccgb {
		_ce.Printf("\u0020\u0020\u0020\u0020\u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006fr\u0074\u0068\u006f\u0067\u006f\u006e\u0061l\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u2192\u0020\u0025\u0074\u000a"+"\u0020\u0020\u0020 \u0020\u0020\u0020\u0076\u003d\u0025\u0073\u000a"+" \u0020\u0020\u0020\u0020\u0020\u0077\u003d\u0025\u0073\u000a", orthogonal, first, second, orthogonal && first && second, _afaa, _cdgca)
	}
	return orthogonal && first && second
}

// applyRemovals deletes from the bag the words listed in `_gcea`, which maps a depth index
// to the set of words to remove from that index. An index that loses all of its words is
// deleted from the bag's map. The listed words are expected to be present under that index.
func (_eagee *wordBag) applyRemovals(_gcea map[int]map[*textWord]struct{}) {
	for depthIdx, removed := range _gcea {
		if len(removed) == 0 {
			continue
		}
		current := _eagee._cgdg[depthIdx]
		remaining := len(current) - len(removed)
		if remaining == 0 {
			delete(_eagee._cgdg, depthIdx)
			continue
		}
		survivors := make([]*textWord, remaining)
		next := 0
		for _, word := range current {
			if _, gone := removed[word]; gone {
				continue
			}
			survivors[next] = word
			next++
		}
		_eagee._cgdg[depthIdx] = survivors
	}
}

// _cgdcf appends `_cgdag` to `_gada` after setting its Offset to the current value of
// `*_fage`, then advances `*_fage` by the byte length of the mark's Text.
func _cgdcf(_gada []TextMark, _fage *int, _cgdag TextMark) []TextMark {
	_cgdag.Offset = *_fage
	*_fage += len(_cgdag.Text)
	return append(_gada, _cgdag)
}

// _aee reports whether any font in `_ba` has the FontName `_abb`.
func _aee(_ba []Font, _abb string) bool {
	for i := range _ba {
		if _ba[i].FontName == _abb {
			return true
		}
	}
	return false
}

// _dcdgd joins the text of the given lines. Each line's text is preceded by `_fbfgg` and
// followed by a space when the next line has the same _cbbd value, otherwise by a newline.
// For the last line the "next" value is taken to be 0.
func _dcdgd(_cfdg []*textLine, _fbfgg string) string {
	var sb _c.Builder
	lastIdx := len(_cfdg) - 1
	for i, line := range _cfdg {
		text := line.text()
		depth := line._cbbd
		nextDepth := 0.0
		if i < lastIdx {
			nextDepth = _cfdg[i+1]._cbbd
		}
		sb.WriteString(_fbfgg)
		sb.WriteString(text)
		separator := "\u0020"
		if nextDepth != depth {
			separator = "\u000a"
		}
		sb.WriteString(separator)
	}
	return sb.String()
}
2023-02-07 17:17:49 +00:00
2023-07-28 12:14:31 +00:00
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
// Marks that are Meta, or whose Text is accepted by _dfcc, do not contribute. The boolean
// result is false when no mark contributed, in which case the rectangle is the zero value.
func (_gfae *TextMarkArray) BBox() (_bg.PdfRectangle, bool) {
	var bbox _bg.PdfRectangle
	found := false
	for _, mark := range _gfae._bca {
		if mark.Meta || _dfcc(mark.Text) {
			continue
		}
		if !found {
			bbox = mark.BBox
			found = true
			continue
		}
		bbox = _egbga(bbox, mark.BBox)
	}
	return bbox, found
}

// _ddda classifies the segment between two points by passing the absolute X and Y
// differences, together with _gbca, to _bffg.
func _ddda(_ddbg, _dbac _gab.Point) rulingKind {
	dx := _ef.Abs(_ddbg.X - _dbac.X)
	dy := _ef.Abs(_ddbg.Y - _dbac.Y)
	return _bffg(dx, dy, _gbca)
}

// _gdgf recursively builds a list hierarchy.
//
// `_gbee[_aecg]` selects the current level's key into `_fcge`; the lines stored under that key
// are filtered by _ddba using the bounds `_dabd` and `_dgbc`. For every selected line the
// function recurses into the next level (while one exists), bounded by this line's _cbbd and
// the next selected line's _cbbd (or `_dabd` for the last one), collects further lines with
// _cbcbe, and creates a "bullet" list via _facb whose text is produced by _dcdgd.
// The result is always a non-nil slice.
func _gdgf(_dcef []*textLine, _fcge map[float64][]*textLine, _gbee []float64, _aecg int, _dgbc, _dabd float64) []*list {
	lists := []*list{}
	level := _aecg
	_aecg = _aecg + 1
	levelKey := _gbee[level]
	levelLines := _fcge[levelKey]
	selected := _ddba(levelLines, _dabd, _dgbc)
	for i, line := range selected {
		children := []*list{}
		start := line._cbbd
		end := _dabd
		if i < len(selected)-1 {
			end = selected[i+1]._cbbd
		}
		if _aecg < len(_gbee) {
			children = _gdgf(_dcef, _fcge, _gbee, _aecg, start, end)
		}

		// The item's own lines stop where its first child list starts, if there is one.
		limit := end
		if len(children) > 0 {
			if firstChild := children[0]; len(firstChild._ecdee) > 0 {
				limit = firstChild._ecdee[0]._cbbd
			}
		}

		itemLines := []*textLine{line}
		itemLines = append(itemLines, _cbcbe(line, _dcef, _gbee, start, limit)...)
		item := _facb(itemLines, "\u0062\u0075\u006c\u006c\u0065\u0074", children)
		item._bfcg = _dcdgd(itemLines, "")
		lists = append(lists, item)
	}
	return lists
}

// arrangeText consumes the words of the bag and arranges them into the lines of a paragraph.
//
// For each depth index, while words remain there, a new line is started with _ffff at the
// first reading index and then grown: among the words returned by highestWord for the depth
// band around the starting word, the one that _aea orders first and whose gap to the line's
// last word (as measured by _efbc) is at most _dcdc * size is pulled into the line. Growing
// stops when no such word exists or when a candidate's gap is below -_efdcb * size.
// The lines are sorted with _dag and wrapped into a paragraph by _gdae.
// It returns nil when no line was produced.
func (_degc *wordBag) arrangeText() *textPara {
	_degc.sort()
	if _aaade {
		_degc.removeDuplicates()
	}

	var lines []*textLine
	for _, depthIdx := range _degc.depthIndexes() {
		for !_degc.empty(depthIdx) {
			startIdx := _degc.firstReadingIndex(depthIdx)
			startWord := _degc.firstWord(startIdx)
			line := _ffff(_degc, startIdx)
			size := startWord._ebgb
			minDepth := startWord._baebb - _gdab*size
			maxDepth := startWord._baebb + _gdab*size
			maxGap := _dcdc * size
			maxOverlap := _efdcb * size

		grow:
			for {
				var best *textWord
				bestIdx := 0
				for _, bandIdx := range _degc.depthBand(minDepth, maxDepth) {
					candidate := _degc.highestWord(bandIdx, minDepth, maxDepth)
					if candidate == nil {
						continue
					}
					gap := _efbc(candidate, line._aafd[len(line._aafd)-1])
					if gap < -maxOverlap {
						break grow
					}
					if gap > maxGap {
						continue
					}
					if best != nil && _aea(candidate, best) >= 0 {
						continue
					}
					best = candidate
					bestIdx = bandIdx
				}
				if best == nil {
					break
				}
				line.pullWord(_degc, best, bestIdx)
			}

			line.markWordBoundaries()
			lines = append(lines, line)
		}
	}
	if len(lines) == 0 {
		return nil
	}

	_df.Slice(lines, func(i, j int) bool { return _dag(lines[i], lines[j]) < 0 })
	para := _gdae(_degc.PdfRectangle, lines)

	if _gde {
		_b.Log.Info("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073", para.String())
		if _cdbf {
			for i, paraLine := range para._gfbb {
				_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, paraLine.String())
				if _baeef {
					for j, word := range paraLine._aafd {
						_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", j, word.String())
						for k, mark := range word._dggf {
							_ce.Printf("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n", k, mark.String())
						}
					}
				}
			}
		}
	}
	return para
}

// get returns the cell stored at the key _fgcce(_cfcff, _egcgd), or nil when there is none.
func (_eaad *textTable) get(_cfcff, _egcgd int) *textPara {
	return _eaad._efeac[_fgcce(_cfcff, _egcgd)]
}

// merge combines all paragraphs of the list into one: the list is put into reading order,
// the bounding rectangles are united with _egbga and the lines are concatenated.
// It returns nil for an empty list.
//
// The lines are gathered in a freshly allocated slice. Appending to the first paragraph's own
// _gfbb slice, as was done before, could write into that slice's spare capacity and left the
// merged paragraph sharing its backing array with the first input paragraph.
func (_egea paraList) merge() *textPara {
	_b.Log.Trace("\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d", len(_egea))
	if len(_egea) == 0 {
		return nil
	}
	_egea.sortReadingOrder()

	total := 0
	for _, para := range _egea {
		total += len(para._gfbb)
	}
	bbox := _egea[0].PdfRectangle
	lines := make([]*textLine, 0, total)
	lines = append(lines, _egea[0]._gfbb...)
	for _, para := range _egea[1:] {
		bbox = _egbga(bbox, para.PdfRectangle)
		lines = append(lines, para._gfbb...)
	}
	return _gdae(bbox, lines)
}

// textPara is a paragraph: a bounding rectangle and the lines (_gfbb) it contains.
// _bgba refers to an attached table, if any. NOTE(review): the purpose of the remaining
// rectangle, flag and paragraph-pointer fields is not visible in this part of the file.
type textPara struct {
	_bg.PdfRectangle
	_gfbgd _bg.PdfRectangle
	_gfbb  []*textLine
	_bgba  *textTable
	_abeg  bool
	_cfga  bool
	_dbaed *textPara
	_eabac *textPara
	_ffeg  *textPara
	_fgdg  *textPara
	_gbff  []list
}

// _ggdg reports whether `_gacb` divided by the larger of _gfad and `_eagab` is below _ceecf.
func _ggdg(_gacb, _eagab float64) bool {
	return _gacb/_ef.Max(_gfad, _eagab) < _ceecf
}
// bounded is implemented by types that can report a bounding rectangle.
type bounded interface {
	bbox() _bg.PdfRectangle
}

// findPrimSec returns the first ruling in the list for which _acbc accepts the difference
// between its _befee and `_eaacd`, and whose [_agbc, _gffgd] range, widened by _cfgg on both
// sides, contains `_afcdb`. It returns nil when no ruling qualifies.
func (_cbebd rulingList) findPrimSec(_eaacd, _afcdb float64) *ruling {
	for _, r := range _cbebd {
		if !_acbc(r._befee - _eaacd) {
			continue
		}
		if r._agbc-_cfgg <= _afcdb && _afcdb <= r._gffgd+_cfgg {
			return r
		}
	}
	return nil
}

// comp is a "less" function over the elements at indexes `_egcg` and `_baga`.
// Rulings are ordered by kind first (larger kind value first). Rulings of kind _bgbdg are
// never ordered relative to each other. Within the same kind the order is decided by _befee,
// then _agbc, then _gffgd; each comparison is used as is for kind _cefaa and negated for any
// other kind.
func (_gabg rulingList) comp(_egcg, _baga int) bool {
	a, b := _gabg[_egcg], _gabg[_baga]
	kindA, kindB := a._eabdg, b._eabdg
	if kindA != kindB {
		return kindA > kindB
	}
	if kindA == _bgbdg {
		return false
	}
	// For horizontal rulings the comparison result is returned unchanged, otherwise inverted.
	horizontal := kindA == _cefaa
	orient := func(result bool) bool { return result == horizontal }

	if a._befee != b._befee {
		return orient(a._befee > b._befee)
	}
	if a._agbc != b._agbc {
		return orient(a._agbc < b._agbc)
	}
	return orient(a._gffgd < b._gffgd)
}

// firstWord returns the first word stored under depth index `_fcb`. That index must hold at
// least one word.
func (_fdfd *wordBag) firstWord(_fcb int) *textWord {
	words := _fdfd._cgdg[_fcb]
	return words[0]
}

// close appends the first point to the subpath unless _eaca reports that it already equals
// the last point, sets the _bbdg flag and then removes consecutive duplicate points.
// The subpath must contain at least one point.
func (_cdd *subpath) close() {
	first := _cdd._fbcgf[0]
	if !_eaca(first, _cdd.last()) {
		_cdd.add(_cdd._fbcgf[0])
	}
	_cdd._bbdg = true
	_cdd.removeDuplicates()
}

// toTilings tidies the rulings, groups them with toGrids and converts every group with
// asTiling. It returns the tidied rulings together with the tilings, or (nil, nil) for an
// empty list.
func (_aeaab rulingList) toTilings() (rulingList, []gridTiling) {
	_aeaab.log("\u0074o\u0054\u0069\u006c\u0069\u006e\u0067s")
	if len(_aeaab) == 0 {
		return nil, nil
	}
	_aeaab = _aeaab.tidied("\u0061\u006c\u006c")
	_aeaab.log("\u0074\u0069\u0064\u0069\u0065\u0064")

	grids := _aeaab.toGrids()
	tilings := make([]gridTiling, 0, len(grids))
	for _, grid := range grids {
		tilings = append(tilings, grid.asTiling())
	}
	return _aeaab, tilings
}

// Error messages for font handling:
//   - _deg: "ERROR: Can't convert font object, invalid type"
//   - _egf: "ERROR: Can't get font properties, font not found"
//   - _cf:  "ERROR: Can't get font stream, invalid type"
const (
	_deg = "\u0045\u0052R\u004f\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074\u002c\u0020\u0069\u006e\u0076\u0061\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065"
	_egf = "\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0043a\u006e\u0027\u0074 g\u0065\u0074\u0020\u0066\u006f\u006et\u0020\u0070\u0072\u006f\u0070\u0065\u0072\u0074\u0069\u0065\u0073\u002c\u0020\u0066\u006fn\u0074\u0020\u006e\u006f\u0074\u0020\u0066\u006fu\u006e\u0064"
	_cf  = "\u0045\u0052\u0052O\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0067\u0065\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u0073\u0074\u0072\u0065\u0061\u006d\u002c\u0020\u0069\u006e\u0076a\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065"
)
// _dgbe writes the text produced by _ggcc for `_eged` to `_dfdf` and then does the same for
// every entry of `_eged._cdfc`, each with its own indent string that is `*_cgfea` extended by
// three spaces.
func _dgbe(_eged *list, _dfdf *_c.Builder, _cgfea *string) {
	_dfdf.WriteString(_ggcc(_eged, _cgfea))
	for _, child := range _eged._cdfc {
		// A new variable per child: the callee receives a pointer to it.
		childIndent := *_cgfea + "\u0020\u0020\u0020"
		_dgbe(child, _dfdf, &childIndent)
	}
}

// empty reports whether the stack holds no entries.
func (_edb *stateStack) empty() bool {
	stack := *_edb
	return len(stack) == 0
}
// checkOp validates the number of params of content stream operation `_cagf`.
// `_gaac` is the expected param count (a negative value disables the count check) and `_cac`
// selects whether a mismatch also produces an error value.
// With a nil receiver it logs that the operator occurred outside a text object (showing at most
// `_gaac` params) and then still performs the count check.
// Returns (true, nil) when the count is acceptable, otherwise (false, err), where err is non-nil
// only if `_cac` is set.
func ( _befc * textObject ) checkOp ( _cagf * _fb . ContentStreamOperation , _gaac int , _cac bool ) ( _ada bool , _ffce error ) { if _befc == nil { var _cae [ ] _ea . PdfObject ; if _gaac > 0 { _cae = _cagf . Params ; if len ( _cae ) > _gaac { _cae = _cae [ : _gaac ] ; } ; } ; _b . Log . Debug ( "\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076" , _cagf . Operand , _cae ) ;
} ; if _gaac >= 0 { if len ( _cagf . Params ) != _gaac { if _cac { _ffce = _d . New ( "\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et" ) ; } ; _b . Log . Debug ( "\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076" , _cagf . Operand , _gaac , len ( _cagf . Params ) , _cagf . Params ) ;
// End of checkOp, then rulingList.secMinMax: returns the smallest `_agbc` and the largest `_gffgd`
// over the list. It reads element 0 unconditionally, so it must not be called on an empty list.
return false , _ffce ; } ; } ; return true , nil ; } ; func ( _fbeeb rulingList ) secMinMax ( ) ( float64 , float64 ) { _cfedd , _dfecf := _fbeeb [ 0 ] . _agbc , _fbeeb [ 0 ] . _gffgd ; for _ , _geae := range _fbeeb [ 1 : ] { if _geae . _agbc < _cfedd { _cfedd = _geae . _agbc ; } ; if _geae . _gffgd > _dfecf { _dfecf = _geae . _gffgd ;
// (*textWord).addDiacritic appends `_cdcb` to the `_ebgd` string of the last entry of `_dggf` and
// re-normalises that string to NFKC.
// NOTE(review): the update is only retained if `_dggf` holds pointers; confirm the element type.
//
// (*textObject).newTextMark (runs through the next several lines) builds a textMark for text
// `_baagd` drawn with matrix `_cdbd` and ending at point `_bbae`. The second result is false when
// clipping against the page box failed (see below).
} ; } ; return _cfedd , _dfecf ; } ; func ( _dcfgf * textWord ) addDiacritic ( _cdcb string ) { _ebafb := _dcfgf . _dggf [ len ( _dcfgf . _dggf ) - 1 ] ; _ebafb . _ebgd += _cdcb ; _ebafb . _ebgd = _dg . NFKC . String ( _ebafb . _ebgd ) ; } ; func ( _dfdb * textObject ) newTextMark ( _baagd string , _cdbd _gab . Matrix , _bbae _gab . Point , _gdbd float64 , _bfef * _bg . PdfFont , _dgcf float64 , _cdcd , _gecb _ag . Color , _edfg _ea . PdfObject , _cgecc [ ] string , _fbcca int , _gafc int ) ( textMark , bool ) { _fdge := _cdbd . Angle ( ) ;
// newTextMark: `_cdca` is the orientation derived from the matrix angle by `_fdfb(angle, _fcbd)`.
// The height `_eabe` is the Y scaling factor, except when `_cdca % 180 == 90`, where the X scaling
// factor is used. `_eega` spans from the translation of `_cdbd` to `_bbae`.
_cdca := _fdfb ( _fdge , _fcbd ) ; var _eabe float64 ; if _cdca % 180 != 90 { _eabe = _cdbd . ScalingFactorY ( ) ; } else { _eabe = _cdbd . ScalingFactorX ( ) ; } ; _bcbgcd := _bbccc ( _cdbd ) ; _eega := _bg . PdfRectangle { Llx : _bcbgcd . X , Lly : _bcbgcd . Y , Urx : _bbae . X , Ury : _bbae . Y } ;
// Grow the box by the height in the direction selected by the orientation (any value other than
// 0/90/180/270 is treated as 0), then order the corners so that Llx <= Urx and Lly <= Ury.
switch _cdca % 360 { case 90 : _eega . Urx -= _eabe ; case 180 : _eega . Ury -= _eabe ; case 270 : _eega . Urx += _eabe ; case 0 : _eega . Ury += _eabe ; default : _cdca = 0 ; _eega . Ury += _eabe ; } ; if _eega . Llx > _eega . Urx { _eega . Llx , _eega . Urx = _eega . Urx , _eega . Llx ; } ; if _eega . Lly > _eega . Ury { _eega . Lly , _eega . Ury = _eega . Ury , _eega . Lly ;
// If the page box `_dfdb._dcdg._de` has positive width, replace `_eega` by the result of `_fgdf`
// against that box; `_gegb` becomes false (and a debug message is logged) when `_fgdf` reports failure.
} ; _gegb := true ; if _dfdb . _dcdg . _de . Width ( ) > 0 { _gaad , _ebefe := _fgdf ( _eega , _dfdb . _dcdg . _de ) ; if ! _ebefe { _gegb = false ; _b . Log . Debug ( "\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q" , _eega , _dfdb . _dcdg . _de , _baagd ) ;
// `_ffd` is `_eega` re-expressed for orientations 90, 180 and 270 relative to the page box; for 90
// and 270 the page box's Urx and Ury are swapped first. For orientation 0 `_ffd` equals `_eega`.
} ; _eega = _gaad ; } ; _ffd := _eega ; _ggeac := _dfdb . _dcdg . _de ; switch _cdca % 360 { case 90 : _ggeac . Urx , _ggeac . Ury = _ggeac . Ury , _ggeac . Urx ; _ffd = _bg . PdfRectangle { Llx : _ggeac . Urx - _eega . Ury , Urx : _ggeac . Urx - _eega . Lly , Lly : _eega . Llx , Ury : _eega . Urx } ;
case 180 : _ffd = _bg . PdfRectangle { Llx : _ggeac . Urx - _eega . Llx , Urx : _ggeac . Urx - _eega . Urx , Lly : _ggeac . Ury - _eega . Lly , Ury : _ggeac . Ury - _eega . Ury } ; case 270 : _ggeac . Urx , _ggeac . Ury = _ggeac . Ury , _ggeac . Urx ; _ffd = _bg . PdfRectangle { Llx : _eega . Ury , Urx : _eega . Lly , Lly : _ggeac . Ury - _eega . Llx , Ury : _ggeac . Ury - _eega . Urx } ;
// Order the corners of `_ffd` and assemble the textMark: `_ffd` becomes its embedded rectangle
// while the unrotated `_eega` is kept in `_bcfd`.
} ; if _ffd . Llx > _ffd . Urx { _ffd . Llx , _ffd . Urx = _ffd . Urx , _ffd . Llx ; } ; if _ffd . Lly > _ffd . Ury { _ffd . Lly , _ffd . Ury = _ffd . Ury , _ffd . Lly ; } ; _bdbg := textMark { _ebgd : _baagd , PdfRectangle : _ffd , _bcfd : _eega , _ecbeg : _bfef , _gceb : _eabe , _abac : _dgcf , _acddd : _cdbd , _efgg : _bbae , _acec : _cdca , _bdaff : _cdcd , _bfdb : _gecb , _dcbd : _edfg , _babd : _cgecc , Th : _dfdb . _gacd . _dgc , Tw : _dfdb . _gacd . _cagc , _adbb : _gafc , _fbcc : _fbcca } ;
if _efe { _b . Log . Info ( "n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073" , _bcbgcd , _bbae , _bdbg . String ( ) ) ; } ; return _bdbg , _gegb ;
// End of newTextMark. (*textWord).bbox returns the word's embedded PdfRectangle.
// (*shapesState).closePath: when a point is pending (`_afge`), a new subpath made by `_egbg(_bfd)`
// is appended and the flag cleared. When nothing is pending and there are no subpaths it returns
// early. In every other case the last subpath is closed.
} ; func ( _fecgd * textWord ) bbox ( ) _bg . PdfRectangle { return _fecgd . PdfRectangle } ; func ( _ecad * shapesState ) closePath ( ) { if _ecad . _afge { _ecad . _cbfc = append ( _ecad . _cbfc , _egbg ( _ecad . _bfd ) ) ; _ecad . _afge = false ; } else if len ( _ecad . _cbfc ) == 0 { if _bdaae { _b . Log . Debug ( "\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068" ) ;
// _gaf reports whether word `_egbe` overlaps bag `_cece` horizontally once the bag's x-extent is
// widened by `_caae` on both sides.
} ; _ecad . _afge = false ; return ; } ; _ecad . _cbfc [ len ( _ecad . _cbfc ) - 1 ] . close ( ) ; if _bdaae { _b . Log . Info ( "\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073" , _ecad ) ; } ; } ; func _gaf ( _cece * wordBag , _egbe * textWord , _caae float64 ) bool { return _egbe . Llx < _cece . Urx + _caae && _cece . Llx - _caae < _egbe . Urx ;
// (*textObject).setTextRenderMode stores RenderMode(`_bda`) in `_gacd._gd`; a nil receiver is a no-op.
// (*textLine).appendWord appends the word, grows the line rectangle with `_egbga` and (next line)
// raises the line's `_bfbb` / `_cbbd` to the word's `_ebgb` / `_baebb` when those are larger.
} ; func ( _baab * textObject ) setTextRenderMode ( _bda int ) { if _baab == nil { return ; } ; _baab . _gacd . _gd = RenderMode ( _bda ) ; } ; func ( _efee * textLine ) appendWord ( _bgf * textWord ) { _efee . _aafd = append ( _efee . _aafd , _bgf ) ; _efee . PdfRectangle = _egbga ( _efee . PdfRectangle , _bgf . PdfRectangle ) ;
// (*textLine).text concatenates each word's `_ggaef`, inserting one space before every word whose
// `_gagaf` flag is set.
if _bgf . _ebgb > _efee . _bfbb { _efee . _bfbb = _bgf . _ebgb ; } ; if _bgf . _baebb > _efee . _cbbd { _efee . _cbbd = _bgf . _baebb ; } ; } ; func ( _efdd * textLine ) text ( ) string { var _afde [ ] string ; for _ , _eaaf := range _efdd . _aafd { if _eaaf . _gagaf { _afde = append ( _afde , "\u0020" ) ;
// `_gbf` and `_gc` are package-level error values (type check error / range check error).
} ; _afde = append ( _afde , _eaaf . _ggaef ) ; } ; return _c . Join ( _afde , "" ) ; } ; var ( _gbf = _d . New ( "\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072" ) ; _gc = _d . New ( "\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072" ) ;
// _cbcbe returns the lines of `_gffg` whose `_cbbd` is at least `_eaegb` and that qualify relative
// to `_bab`. The result is never nil.
//   - With an upper bound (`_gceg != -1`) it keeps lines below the bound whose text differs from
//     `_bab`'s, and stops scanning at the first such line whose rounded Llx is left of `_bab`'s.
//   - With `_gceg == -1` it keeps differing lines that share `_bab`'s `_cbbd`, plus other lines up
//     to the limit returned by `_cfad` (ignored when that limit is -1).
) ; func _cbcbe ( _bab * textLine , _gffg [ ] * textLine , _cfacd [ ] float64 , _eaegb , _gceg float64 ) [ ] * textLine { _bggb := [ ] * textLine { } ; for _ , _cbca := range _gffg { if _cbca . _cbbd >= _eaegb { if _gceg != - 1 && _cbca . _cbbd < _gceg { if _cbca . text ( ) != _bab . text ( ) { if _ef . Round ( _cbca . Llx ) < _ef . Round ( _bab . Llx ) { break ;
} ; _bggb = append ( _bggb , _cbca ) ; } ; } else if _gceg == - 1 { if _cbca . _cbbd == _bab . _cbbd { if _cbca . text ( ) != _bab . text ( ) { _bggb = append ( _bggb , _cbca ) ; } ; continue ; } ; _ecge := _cfad ( _bab , _gffg , _cfacd ) ; if _ecge != - 1 && _cbca . _cbbd <= _ecge { _bggb = append ( _bggb , _cbca ) ;
// (*textPara).isAtom combines this paragraph with its neighbours `_eabac`, `_fgdg` and
// `_eabac._fgdg` through `_ggbg`. It returns nil when any of the three neighbours reports taken()
// or when `_eabac._fgdg` is not the same paragraph as `_fgdg._eabac`.
} ; } ; } ; } ; return _bggb ; } ; func ( _bedgb * textPara ) isAtom ( ) * textTable { _eagcg := _bedgb ; _dfgda := _bedgb . _eabac ; _acdc := _bedgb . _fgdg ; if _dfgda . taken ( ) || _acdc . taken ( ) { return nil ; } ; _ffecc := _dfgda . _fgdg ; if _ffecc . taken ( ) || _ffecc != _acdc . _eabac { return nil ;
// gridTiling embeds a PdfRectangle and holds two float64 slices plus a two-level float64-keyed map
// of gridTile values. cachedImage pairs a *model.Image with a PdfColorspace.
// _gdbc appends, via `_cgdcf`, a copy of the template mark `_fbdd` whose Text is set to `_bebd`.
} ; return _ggbg ( _eagcg , _dfgda , _acdc , _ffecc ) ; } ; type gridTiling struct { _bg . PdfRectangle ; _eaafd [ ] float64 ; _dade [ ] float64 ; _cbec map [ float64 ] map [ float64 ] gridTile ; } ; type cachedImage struct { _dba * _bg . Image ; _gca _bg . PdfColorspace ; } ; func _gdbc ( _fbcfc [ ] TextMark , _cecf * int , _bebd string ) [ ] TextMark { _eced := _fbdd ;
// (*textTable).putComposite stores a compositeCell built from `_gdff` and `_bcef` in `_dadcc` under
// the key `_fgcce(_gecc, _aegc)`, after calling updateBBox on it. An empty `_gdff` is logged as an
// error and nothing is stored.
_eced . Text = _bebd ; return _cgdcf ( _fbcfc , _cecf , _eced ) ; } ; func ( _gggaa * textTable ) putComposite ( _gecc , _aegc int , _gdff paraList , _bcef _bg . PdfRectangle ) { if len ( _gdff ) == 0 { _b . Log . Error ( "\u0074\u0065xt\u0054\u0061\u0062l\u0065\u0029\u0020\u0070utC\u006fmp\u006f\u0073\u0069\u0074\u0065\u003a\u0020em\u0070\u0074\u0079\u0020\u0070\u0061\u0072a\u0073" ) ;
return ; } ; _egefg := compositeCell { PdfRectangle : _bcef , paraList : _gdff } ; if _cgafg { _ce . Printf ( "\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0070\u0075\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u003c\u002d\u0025\u0073\u000a" , _gecc , _aegc , _egefg . String ( ) ) ;
// rulingList.removeDuplicates sorts the receiver in place and returns a new list that omits every
// entry equal (per equals) to the entry kept just before it; it returns nil for an empty list.
} ; _egefg . updateBBox ( ) ; _gggaa . _dadcc [ _fgcce ( _gecc , _aegc ) ] = _egefg ; } ; func ( _gbaa rulingList ) removeDuplicates ( ) rulingList { if len ( _gbaa ) == 0 { return nil ; } ; _gbaa . sort ( ) ; _eeee := rulingList { _gbaa [ 0 ] } ; for _ , _agec := range _gbaa [ 1 : ] { if _agec . equals ( _eeee [ len ( _eeee ) - 1 ] ) { continue ;
// (*subpath).isQuadrilateral accepts subpaths of 4 points, or of 5 points whose first and fifth
// points have identical X and Y.
} ; _eeee = append ( _eeee , _agec ) ; } ; return _eeee ; } ; func ( _gedd * subpath ) isQuadrilateral ( ) bool { if len ( _gedd . _fbcgf ) < 4 || len ( _gedd . _fbcgf ) > 5 { return false ; } ; if len ( _gedd . _fbcgf ) == 5 { _abgc := _gedd . _fbcgf [ 0 ] ; _abaa := _gedd . _fbcgf [ 4 ] ; if _abgc . X != _abaa . X || _abgc . Y != _abaa . Y { return false ;
// (*PageText).computeViews fills the cached views from getParagraphs(): the text (`_bdf`), the
// text marks (`_fccf`) and the tables (`_ecege`).
} ; } ; return true ; } ; func ( _bdaf * PageText ) computeViews ( ) { _fbga := _bdaf . getParagraphs ( ) ; _bbda := new ( _dfe . Buffer ) ; _fbga . writeText ( _bbda ) ; _bdaf . _bdf = _bbda . String ( ) ; _bdaf . _fccf = _fbga . toTextMarks ( ) ; _bdaf . _ecege = _fbga . tables ( ) ; if _cgafg { _b . Log . Info ( "\u0063\u006f\u006dpu\u0074\u0065\u0056\u0069\u0065\u0077\u0073\u003a\u0020\u0074\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064" , len ( _bdaf . _ecege ) ) ;
// _egagc rounds `_abga` to the nearest multiple of `_dgeb`.
// _ebfc maps a value to a bucket index: int(value / _fefe) for non-negative values and
// int(value / _fefe) - 1 for negative ones.
// _dddaa (to the end of this block) does nothing for maps with at most one entry. Otherwise it
// walks the keys returned by `_ecdb`, starting at the first key whose slice is non-nil and
// skipping nil slices. For each visited slice it clamps the last element of `_gbccg` to that
// slice's first element and stores `_gbccg` under the previously visited key `_ebbg`.
// NOTE(review): `_gbccg` is read from the current key `_fegb`, not from the previous key `_ebbg`.
// This looks as if `_gddc[_ebbg]` may have been intended; confirm against the unobfuscated source.
} ; } ; func _egagc ( _abga float64 ) float64 { return _dgeb * _ef . Round ( _abga / _dgeb ) } ; func _ebfc ( _cagb float64 ) int { var _eace int ; if _cagb >= 0 { _eace = int ( _cagb / _fefe ) ; } else { _eace = int ( _cagb / _fefe ) - 1 ; } ; return _eace ; } ; func _dddaa ( _gddc map [ int ] [ ] float64 ) { if len ( _gddc ) <= 1 { return ;
} ; _cdggb := _ecdb ( _gddc ) ; if _cgafg { _b . Log . Info ( "\u0066i\u0078C\u0065\u006c\u006c\u0073\u003a \u006b\u0065y\u0073\u003d\u0025\u002b\u0076" , _cdggb ) ; } ; var _edead , _ebbg int ; for _edead , _ebbg = range _cdggb { if _gddc [ _ebbg ] != nil { break ; } ; } ; for _ecfe , _fegb := range _cdggb [ _edead : ] { _bdeb := _gddc [ _fegb ] ;
if _bdeb == nil { continue ; } ; if _cgafg { _ce . Printf ( "\u0025\u0034\u0064\u003a\u0020\u006b\u0030\u003d\u0025\u0064\u0020\u006b1\u003d\u0025\u0064\u000a" , _edead + _ecfe , _ebbg , _fegb ) ; } ; _gbccg := _gddc [ _fegb ] ; if _gbccg [ len ( _gbccg ) - 1 ] > _bdeb [ 0 ] { _gbccg [ len ( _gbccg ) - 1 ] = _bdeb [ 0 ] ;
_gddc [ _ebbg ] = _gbccg ; } ; _ebbg = _fegb ; } ; } ;
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// String returns a string describing `tm`.
// The description holds the text offset, the quoted text followed by its runes in hex, the
// lower-left and upper-right corners of the bounding box, the font description (cut to 50 bytes
// plus "...") and a " *M*" suffix when the mark is a Meta mark.
func (_fba TextMark) String() string {
	fontDesc := ""
	if _fba.Font != nil {
		if fontDesc = _fba.Font.String(); len(fontDesc) > 50 {
			fontDesc = fontDesc[:50] + "\u002e\u002e\u002e"
		}
	}
	metaTag := ""
	if _fba.Meta {
		metaTag = "\u0020\u002a\u004d\u002a"
	}
	box := _fba.BBox
	return _ce.Sprintf("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d", _fba.Offset, _fba.Text, []rune(_fba.Text), box.Llx, box.Lly, box.Urx, box.Ury, fontDesc, metaTag)
}

// moveTo flags a point as pending (`_afge`) and records in `_bfd` the point that devicePoint
// returns for (`_becea`, `_aad`). The move is logged when `_bdaae` is enabled.
func (_gef *shapesState) moveTo(_becea, _aad float64) {
	_gef._afge = true
	_gef._bfd = _gef.devicePoint(_becea, _aad)
	if !_bdaae {
		return
	}
	_b.Log.Info("\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066", _becea, _aad, _gef._bfd)
}

// _eafb converts the struct elements in `_aagf` into list nodes, one per element, recursing into
// each element's children (`_efce`).
// An element receives the lines stored in `_ffae` under its `_aeae` key only when that key is not
// -1, its `_fgagg` is a non-nil *core.PdfObjectReference, and `_bgeg` is a *core.PdfIndirectObject
// whose embedded reference is deeply equal to it. In every other case it receives no lines.
// The returned slice is never nil.
func _eafb(_aagf []structElement, _ffae map[int][]*textLine, _bgeg _ea.PdfObject) []*list {
	result := make([]*list, 0)
	for i := range _aagf {
		elem := _aagf[i]

		ref, isRef := elem._fgagg.(*_ea.PdfObjectReference)
		if !isRef {
			_b.Log.Debug("\u0066\u0061\u0069l\u0065\u0064\u0020\u006f\u0074\u0020\u0063\u0061\u0073\u0074\u0020\u0074\u006f\u0020\u002a\u0063\u006f\u0072\u0065\u002e\u0050\u0064\u0066\u004f\u0062\u006a\u0065\u0063\u0074R\u0065\u0066\u0065\u0072\u0065\u006e\u0063\u0065")
		}

		matched := []*textLine{}
		if key := int(elem._aeae); key != -1 && ref != nil {
			if stored, found := _ffae[key]; found {
				indirect, isIndirect := _bgeg.(*_ea.PdfIndirectObject)
				if isIndirect && _e.DeepEqual(*ref, indirect.PdfObjectReference) {
					matched = stored
				}
			}
		}

		children := []*list{}
		if elem._efce != nil {
			children = _eafb(elem._efce, _ffae, _bgeg)
		}

		result = append(result, _facb(matched, elem._bfeg, children))
	}
	return result
}
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// Options extractor options.
type Options struct {
2023-02-07 17:17:49 +00:00
2023-07-28 12:14:31 +00:00
// DisableDocumentTags specifies whether to use the document tags during list extraction.
DisableDocumentTags bool ;
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// ApplyCropBox will extract page text based on page cropbox if set to `true`.
ApplyCropBox bool ;
2023-02-07 17:17:49 +00:00
2023-07-28 12:14:31 +00:00
// UseSimplerExtractionProcess will skip topological text ordering and table processing.
//
// NOTE: While normally the extra processing is beneficial, it can also lead to problems when it does not work.
// Thus it is a flag to allow the user to control this process.
//
// Skipping some extraction processes would also lead to the reduced processing time.
UseSimplerExtractionProcess bool ; } ; func ( _bebge * textTable ) reduce ( ) * textTable { _agbbg := make ( [ ] int , 0 , _bebge . _gcbge ) ; _gbae := make ( [ ] int , 0 , _bebge . _ddfc ) ; for _begcf := 0 ; _begcf < _bebge . _gcbge ; _begcf ++ { if ! _bebge . emptyCompositeRow ( _begcf ) { _agbbg = append ( _agbbg , _begcf ) ;
// (*textTable).reduce (begun on the previous line) collects the rows and columns that are not
// reported empty by emptyCompositeRow / emptyCompositeColumn. If none are empty the receiver is
// returned as is. Otherwise a new table sized to the kept rows and columns is built and every
// non-nil composite is copied to its compacted position with putComposite.
// NOTE(review): the new table literal initialises `_efeac` but not `_dadcc`, which putComposite
// writes to. If `_dadcc` is a map that write would panic; confirm how it gets initialised.
} ; } ; for _cgafa := 0 ; _cgafa < _bebge . _ddfc ; _cgafa ++ { if ! _bebge . emptyCompositeColumn ( _cgafa ) { _gbae = append ( _gbae , _cgafa ) ; } ; } ; if len ( _agbbg ) == _bebge . _gcbge && len ( _gbae ) == _bebge . _ddfc { return _bebge ; } ; _edbdg := textTable { _edgac : _bebge . _edgac , _ddfc : len ( _gbae ) , _gcbge : len ( _agbbg ) , _efeac : make ( map [ uint64 ] * textPara , len ( _gbae ) * len ( _agbbg ) ) } ;
if _cgafg { _b . Log . Info ( "\u0072\u0065\u0064\u0075ce\u003a\u0020\u0025\u0064\u0078\u0025\u0064\u0020\u002d\u003e\u0020\u0025\u0064\u0078%\u0064" , _bebge . _ddfc , _bebge . _gcbge , len ( _gbae ) , len ( _agbbg ) ) ; _b . Log . Info ( "\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076" , _gbae ) ;
_b . Log . Info ( "\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076" , _agbbg ) ; } ; for _bgdgd , _beggc := range _agbbg { for _ffde , _ccbg := range _gbae { _dcgfb , _efdcf := _bebge . getComposite ( _ccbg , _beggc ) ; if _dcgfb == nil { continue ;
} ; if _cgafg { _ce . Printf ( "\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n" , _ffde , _bgdgd , _ccbg , _beggc , _dfcggd ( _dcgfb . merge ( ) . text ( ) , 50 ) ) ; } ; _edbdg . putComposite ( _ffde , _bgdgd , _dcgfb , _efdcf ) ;
// End of reduce. paraList.extractTables returns the receiver unchanged when it has fewer than
// `_abda` paragraphs; otherwise it finds tables using the tilings `_ffgge` and returns the result
// of applyTables.
} ; } ; return & _edbdg ; } ; func ( _fdcf paraList ) extractTables ( _ffgge [ ] gridTiling ) paraList { if _cgafg { _b . Log . Debug ( "\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d" , len ( _fdcf ) ) ;
} ; if len ( _fdcf ) < _abda { return _fdcf ; } ; _dbdd := _fdcf . findTables ( _ffgge ) ; if _cgafg { _b . Log . Info ( "c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d" , len ( _dbdd ) ) ;
// wordBag (declared at the end of the next line) embeds a PdfRectangle and keeps its words in
// `_cgdg`, a map from int key to word slice.
for _faff , _egbgae := range _dbdd { _egbgae . log ( _ce . Sprintf ( "c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064" , _faff ) ) ; } ; } ; return _fdcf . applyTables ( _dbdd ) ; } ; type wordBag struct { _bg . PdfRectangle ; _ecdf float64 ; _debag , _gfe rulingList ;
// _fffd compares the `_gfbgd` rectangles of two paragraphs with `_gefa`.
// (*textObject).setWordSpacing stores `_bdbc` in `_gacd._cagc`; a nil receiver is a no-op.
// _fecc classifies the segment between two points from |dx| and |dy| via `_bffg` with tolerance `_ceecf`.
_dgec float64 ; _cgdg map [ int ] [ ] * textWord ; } ; func _fffd ( _dbbc , _gbdf * textPara ) bool { return _gefa ( _dbbc . _gfbgd , _gbdf . _gfbgd ) } ; func ( _gcfb * textObject ) setWordSpacing ( _bdbc float64 ) { if _gcfb == nil { return ; } ; _gcfb . _gacd . _cagc = _bdbc ; } ; func _fecc ( _aacad , _adcf _gab . Point ) rulingKind { _geaf := _ef . Abs ( _aacad . X - _adcf . X ) ;
// (*textPara).writeText writes the paragraph to `_gcbd`. Without a table (`_bgba == nil`) it
// delegates to writeCellText. With a table it walks rows then columns, writing a tab for a missing
// cell or the cell text otherwise, a space after every cell and a newline between rows.
// Errors returned by Write are ignored.
_fbeba := _ef . Abs ( _aacad . Y - _adcf . Y ) ; return _bffg ( _geaf , _fbeba , _ceecf ) ; } ; func ( _cdgd * textPara ) writeText ( _gcbd _ga . Writer ) { if _cdgd . _bgba == nil { _cdgd . writeCellText ( _gcbd ) ; return ; } ; for _accd := 0 ; _accd < _cdgd . _bgba . _gcbge ; _accd ++ { for _faadc := 0 ;
_faadc < _cdgd . _bgba . _ddfc ; _faadc ++ { _eagg := _cdgd . _bgba . get ( _faadc , _accd ) ; if _eagg == nil { _gcbd . Write ( [ ] byte ( "\u0009" ) ) ; } else { _eagg . writeCellText ( _gcbd ) ; } ; _gcbd . Write ( [ ] byte ( "\u0020" ) ) ; } ; if _accd < _cdgd . _bgba . _gcbge - 1 { _gcbd . Write ( [ ] byte ( "\u000a" ) ) ;
// _ddcc returns the negated Lly of the bounding box of `_adcdg`.
// (*TextMarkArray).exists reports whether the array already holds a mark with the same Text and
// with DirectObject and BBox deeply equal to those of `_becg`.
} ; } ; } ; func _ddcc ( _adcdg bounded ) float64 { return - _adcdg . bbox ( ) . Lly } ; func ( _gaga * TextMarkArray ) exists ( _becg TextMark ) bool { for _ , _cbef := range _gaga . Elements ( ) { if _e . DeepEqual ( _becg . DirectObject , _cbef . DirectObject ) && _e . DeepEqual ( _becg . BBox , _cbef . BBox ) && _cbef . Text == _becg . Text { return true ;
// (*wordBag).makeRemovals returns a map holding one empty word set for every key of `_cgdg`.
} ; } ; return false ; } ; func ( _agae * wordBag ) makeRemovals ( ) map [ int ] map [ * textWord ] struct { } { _ggaf := make ( map [ int ] map [ * textWord ] struct { } , len ( _agae . _cgdg ) ) ; for _fdba := range _agae . _cgdg { _ggaf [ _fdba ] = make ( map [ * textWord ] struct { } ) ; } ; return _ggaf ;
// _gfea reports whether rectangle `_cbdea` contains rectangle `_cddab` (edges may coincide).
// rulingList.sort sorts the list in place with sort.Slice, using the list's comp method.
} ; func _gfea ( _cbdea , _cddab _bg . PdfRectangle ) bool { return _cbdea . Llx <= _cddab . Llx && _cddab . Urx <= _cbdea . Urx && _cbdea . Lly <= _cddab . Lly && _cddab . Ury <= _cbdea . Ury ; } ; func ( _ggbe rulingList ) sort ( ) { _df . Slice ( _ggbe , _ggbe . comp ) } ;
2023-02-07 17:17:49 +00:00
2023-07-28 12:14:31 +00:00
// String returns a description of `l`.
// It contains the line's `_cbbd` value, its bounding rectangle, its `_bfbb` value (labelled
// "fontsize") and the quoted line text.
func (_ddce *textLine) String() string {
	return _ce.Sprintf(
		"\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",
		_ddce._cbbd,
		_ddce.PdfRectangle,
		_ddce._bfbb,
		_ddce.text(),
	)
}

// asRuling converts the rectangle into a ruling whose mark kind is `_ceag`.
// For kind `_acgee` the primary coordinate is the mean of Llx and Urx and the extent runs from
// Lly to Ury; for kind `_cefaa` the roles of x and y are exchanged. The ruling width comes from
// checkWidth over the primary axis.
// It returns (nil, false) when checkWidth rejects the rectangle or when the kind is neither of the two.
func (_acce rectRuling) asRuling() (*ruling, bool) {
	out := &ruling{_eabdg: _acce._fbad, Color: _acce.Color, _gggfe: _ceag}

	switch _acce._fbad {
	case _acgee:
		out._befee = 0.5 * (_acce.Llx + _acce.Urx)
		out._agbc, out._gffgd = _acce.Lly, _acce.Ury
		width, accepted := _acce.checkWidth(_acce.Llx, _acce.Urx)
		if !accepted {
			if _cfde {
				_b.Log.Error("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067V\u0065\u0072\u0074\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076", _acce)
			}
			return nil, false
		}
		out._fadae = width

	case _cefaa:
		out._befee = 0.5 * (_acce.Lly + _acce.Ury)
		out._agbc, out._gffgd = _acce.Llx, _acce.Urx
		width, accepted := _acce.checkWidth(_acce.Lly, _acce.Ury)
		if !accepted {
			if _cfde {
				_b.Log.Error("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067H\u006f\u0072\u007a\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076", _acce)
			}
			return nil, false
		}
		out._fadae = width

	default:
		_b.Log.Error("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064", _acce._fbad)
		return nil, false
	}

	return out, true
}

// _egbga returns the smallest rectangle that covers both `_gafe` and `_cebb`.
func _egbga(_gafe, _cebb _bg.PdfRectangle) _bg.PdfRectangle {
	var covering _bg.PdfRectangle
	covering.Llx = _ef.Min(_gafe.Llx, _cebb.Llx)
	covering.Lly = _ef.Min(_gafe.Lly, _cebb.Lly)
	covering.Urx = _ef.Max(_gafe.Urx, _cebb.Urx)
	covering.Ury = _ef.Max(_gafe.Ury, _cebb.Ury)
	return covering
}

// llyOrdering returns the indexes of the paragraphs ordered by increasing Lly. The sort is stable,
// so paragraphs with equal Lly keep their relative order.
func (_cded paraList) llyOrdering() []int {
	order := make([]int, len(_cded))
	for pos := range order {
		order[pos] = pos
	}
	_df.SliceStable(order, func(a, b int) bool {
		return _cded[order[a]].Lly < _cded[order[b]].Lly
	})
	return order
}

// _bgbdd is a pattern that matches a leading label of the form `a)`, `a.`, `12)`, `12.`, `(a)` or
// `(12)`. It is presumably compiled as a regular expression elsewhere; confirm at the use site.
var _bgbdd = "\u005e\u005b\u0061\u002d\u007a\u0041\u002dZ\u005d\u0028\u005c)\u007c\u005c\u002e)\u007c\u005e[\u005c\u0064\u005d\u002b\u0028\u005c)\u007c\\.\u0029\u007c\u005e\u005c\u0028\u005b\u0061\u002d\u007a\u0041\u002d\u005a\u005d\u005c\u0029\u007c\u005e\u005c\u0028\u005b\u005c\u0064\u005d\u002b\u005c\u0029"
// getTextMarkAtOffset returns a pointer to a copy of the first mark in `_bca` whose Offset equals
// `_gcda`, or nil when there is no such mark. Because the result points at a copy, writing through
// it does not modify the array.
func (_cadb *TextMarkArray) getTextMarkAtOffset(_gcda int) *TextMark {
	for _, candidate := range _cadb._bca {
		if candidate.Offset != _gcda {
			continue
		}
		match := candidate
		return &match
	}
	return nil
}

// _cd is a package-level boolean that starts out false.
var _cd bool
2023-03-01 18:45:57 +00:00
2023-07-28 12:14:31 +00:00
// Text gets the extracted text contained in `l`.
// The text is assembled by `_dgbe`, starting from an empty indent string.
func (_cgb *list) Text() string {
	var (
		out    _c.Builder
		indent string
	)
	_dgbe(_cgb, &out, &indent)
	return out.String()
}

// _gcgc builds a ruling for the segment from `_gcfe` to `_dbec` with colour `_bfedg`.
// It returns (nil, false) when `_ddda` classifies the segment as `_bgbdg`; otherwise it returns
// whatever the line ruling's asRuling reports.
func _gcgc(_gcfe, _dbec _gab.Point, _bfedg _ag.Color) (*ruling, bool) {
	segment := lineRuling{
		_egaf:  _gcfe,
		_eaebf: _dbec,
		_cbfb:  _ddda(_gcfe, _dbec),
		Color:  _bfedg,
	}
	if segment._cbfb != _bgbdg {
		return segment.asRuling()
	}
	return nil, false
}
2023-03-01 18:45:57 +00:00
2023-07-28 12:14:31 +00:00
// String returns a string describing the grid tile: its rectangle followed by four one-character
// flags, "L", "R", "B" and "T", for the fields `_afdge`, `_bfecb`, `_eaed` and `_fdbd`. A flag
// that is not set is shown as "_".
func (_aecdb gridTile) String() string {
	flags := [4]string{"\u005f", "\u005f", "\u005f", "\u005f"}
	if _aecdb._afdge {
		flags[0] = "\u004c"
	}
	if _aecdb._bfecb {
		flags[1] = "\u0052"
	}
	if _aecdb._eaed {
		flags[2] = "\u0042"
	}
	if _aecdb._fdbd {
		flags[3] = "\u0054"
	}
	return _ce.Sprintf("\u00256\u002e2\u0066\u0020\u0025\u0031\u0073%\u0031\u0073%\u0031\u0073\u0025\u0031\u0073", _aecdb.PdfRectangle, flags[0], flags[1], flags[2], flags[3])
}

// equals reports whether two rulings have the same kind (`_eabdg`) and whether `_ffcaf` accepts
// each pair of their `_befee`, `_agbc` and `_gffgd` values.
func (_gcdda *ruling) equals(_bdaec *ruling) bool {
	if _gcdda._eabdg != _bdaec._eabdg {
		return false
	}
	if !_ffcaf(_gcdda._befee, _bdaec._befee) {
		return false
	}
	if !_ffcaf(_gcdda._agbc, _bdaec._agbc) {
		return false
	}
	return _ffcaf(_gcdda._gffgd, _bdaec._gffgd)
}
2023-03-01 18:45:57 +00:00
2023-07-28 12:14:31 +00:00
// String returns a description of `t` in the form "<columns> x <rows> <flag>", where the flag is
// the table's `_edgac` boolean.
func (_fgge *textTable) String() string {
	cols, rows := _fgge._ddfc, _fgge._gcbge
	return _ce.Sprintf("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074", cols, rows, _fgge._edgac)
}

// Compile-time switches. Every one of them evaluates to false as written; the derived switches
// only turn on when their parent switch does. Where they are used in this part of the file they
// guard diagnostic logging.
const (
	_eebec = false
	_efe = false
	_cfab = false
	_adbf = false
	_bdaae = false
	_beff = false
	_fcea = false
	_ecdg = false

	// Switches derived from _gde.
	_gde = false
	_cdbf = _gde && true
	_baeef = _cdbf && false
	_aebb = _gde && true

	// Switches derived from _cgafg.
	_cgafg = false
	_efed = _cgafg && false
	_eecc = _cgafg && true

	// Switches derived from _bccgb.
	_bccgb = false
	_fegd = _bccgb && false
	_dbdb = _bccgb && false
	_agd = _bccgb && true
	_cfde = _bccgb && false
	_geg = _bccgb && false
)
2023-03-01 18:45:57 +00:00
2023-07-28 12:14:31 +00:00
// ExtractText processes and extracts all text data in content streams and returns it as a string.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd', the Unicode
// replacement character).
// It is a thin wrapper around ExtractTextWithStats that discards the two middle results.
func (_fea *Extractor) ExtractText() (string, error) {
	text, _, _, err := _fea.ExtractTextWithStats()
	return text, err
}
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// Extractor stores and offers functionality for extracting content from PDF pages.
type Extractor struct {
	_bc string
	_gga *_bg.PdfPageResources
	_de _bg.PdfRectangle
	_fc *_bg.PdfRectangle
	_eg map[string]fontEntry
	_gbfd map[string]textResult
	_ge int64
	_ead int
	_ad *Options
	_fbb *_ea.PdfObject
	_ca _ea.PdfObject
}

// depth returns -1 when the paragraph's `_cfga` flag is set, otherwise the `_cbbd` of its first
// line when it has lines, otherwise the depth of its table (`_bgba`).
func (_fbfd *textPara) depth() float64 {
	switch {
	case _fbfd._cfga:
		return -1.0
	case len(_fbfd._gfbb) > 0:
		return _fbfd._gfbb[0]._cbbd
	default:
		return _fbfd._bgba.depth()
	}
}

// fontEntry pairs a font with an int64 value.
type fontEntry struct {
	_bcae *_bg.PdfFont
	_edcg int64
}

// Numeric tuning constants used throughout the package.
const (
	_efea = 1.0e-6
	_dgeb = 1.0e-4
	_fcbd = 10
	_fefe = 6
	_gdab = 0.5
	_ffef = 0.12
	_ccfg = 0.19
	_ccfb = 0.04
	_gbcf = 0.04
	_dega = 1.0
	_fbbf = 0.04
	_bcbge = 0.4
	_abg = 0.7
	_begg = 1.0
	_dcdf = 0.1
	_dcdc = 1.4
	_efdcb = 0.46
	_cfbb = 0.02
	_ccee = 0.2
	_cfdf = 0.5
	_ecg = 4
	_dggb = 4.0
	_abda = 6
	_dfecd = 0.3
	_bbe = 0.01
	_baeeb = 0.02
	_gfda = 2
	_cafad = 2
	_cfae = 500
	_gbca = 4.0
	_gfdaf = 4.0
	_ceecf = 0.05
	_gfad = 0.1
	_cfgg = 2.0
	_bddeb = 2.0
	_cddf = 1.5
	_feaa = 3.0
	_eadb = 0.25
)

// _gdae creates a paragraph with bounding rectangle `_cdag` and lines `_bgdf`.
func _gdae(_cdag _bg.PdfRectangle, _bgdf []*textLine) *textPara {
	para := new(textPara)
	para.PdfRectangle = _cdag
	para._gfbb = _bgdf
	return para
}

// inDiacriticArea reports whether the receiver lies close enough to `_fcfe`: the sum of the
// left-edge and right-edge offsets must be smaller in magnitude than `_cfdf` times the receiver's
// width, and the bottom-edge offset smaller in magnitude than `_cfdf` times its height.
func (_gbef *textMark) inDiacriticArea(_fcfe *textMark) bool {
	leftOffset := _gbef.Llx - _fcfe.Llx
	rightOffset := _gbef.Urx - _fcfe.Urx
	bottomOffset := _gbef.Lly - _fcfe.Lly
	if !(_ef.Abs(leftOffset+rightOffset) < _gbef.Width()*_cfdf) {
		return false
	}
	return _ef.Abs(bottomOffset) < _gbef.Height()*_cfdf
}

// list gathers, over all paragraphs, the lines returned by getListLines and all lines in `_gfbb`,
// and builds the result with `_ggeb` from all lines and `_dceg` of the list lines.
func (_eafa paraList) list() []*list {
	var listLines, allLines []*textLine
	for _, para := range _eafa {
		listLines = append(listLines, para.getListLines()...)
		allLines = append(allLines, para._gfbb...)
	}
	grouped := _dceg(listLines)
	return _ggeb(allLines, grouped)
}

// _ebba returns the translation matrix that moves the origin to `_bebga`.
func _ebba(_bebga _gab.Point) _gab.Matrix {
	tx, ty := _bebga.X, _bebga.Y
	return _gab.TranslationMatrix(tx, ty)
}
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// Tables returns the tables extracted from the page.
// (*shapesState).clearPath, which starts later on the next line, discards all subpaths and clears
// the pending-point flag `_afge`.
func ( _gba PageText ) Tables ( ) [ ] TextTable { if _cgafg { _b . Log . Info ( "\u0054\u0061\u0062\u006c\u0065\u0073\u003a\u0020\u0025\u0064" , len ( _gba . _ecege ) ) ; } ; return _gba . _ecege ; } ; func ( _ebbb * shapesState ) clearPath ( ) { _ebbb . _cbfc = nil ; _ebbb . _afge = false ;
// ruling holds a ruling kind, a mark kind, an embedded colour and four float64 values: `_befee`,
// `_agbc`, `_gffgd` and `_fadae`.
// (*wordBag).depthBand returns nil for an empty bag; otherwise the depthRange between the depth
// indexes of `_eeff` and `_bafg`.
if _bdaae { _b . Log . Info ( "\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073" , _ebbb ) ; } ; } ; type ruling struct { _eabdg rulingKind ; _gggfe markKind ; _ag . Color ; _befee float64 ; _agbc float64 ; _gffgd float64 ; _fadae float64 ; } ; func ( _adgb * wordBag ) depthBand ( _eeff , _bafg float64 ) [ ] int { if len ( _adgb . _cgdg ) == 0 { return nil ;
// _egfd turns stroked path sections into rulings. It first passes the sections to `_bbcaa`, then,
// for every subpath with at least two points, tries `_gcgc` on each pair of consecutive points and
// keeps the rulings for which it reports success.
} ; return _adgb . depthRange ( _adgb . getDepthIdx ( _eeff ) , _adgb . getDepthIdx ( _bafg ) ) ; } ; func _egfd ( _fabc [ ] pathSection ) rulingList { _bbcaa ( _fabc ) ; if _bccgb { _b . Log . Info ( "\u006d\u0061k\u0065\u0053\u0074\u0072\u006f\u006b\u0065\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0064\u0020\u0073\u0074\u0072ok\u0065\u0073" , len ( _fabc ) ) ;
} ; var _fffcg rulingList ; for _ , _baagf := range _fabc { for _ , _cdcca := range _baagf . _dgfc { if len ( _cdcca . _fbcgf ) < 2 { continue ; } ; _gaca := _cdcca . _fbcgf [ 0 ] ; for _ , _edgg := range _cdcca . _fbcgf [ 1 : ] { if _fgbg , _dabc := _gcgc ( _gaca , _edgg , _baagf . Color ) ;
// _bbccc (starts at the end of the next line) returns the translation part of a matrix as a Point.
_dabc { _fffcg = append ( _fffcg , _fgbg ) ; } ; _gaca = _edgg ; } ; } ; } ; if _bccgb { _b . Log . Info ( "m\u0061\u006b\u0065\u0053tr\u006fk\u0065\u0052\u0075\u006c\u0069n\u0067\u0073\u003a\u0020\u0025\u0073" , _fffcg ) ; } ; return _fffcg ; } ; func _bbccc ( _eage _gab . Matrix ) _gab . Point { _ecbe , _fgbe := _eage . Translation ( ) ;
// `_ebef` is a list of single-character strings, presumably bullet glyphs; confirm at its users.
// _eabga reports whether |`_dgeeg`| is below `_bddeb`.
// (*textTable).newTablePara wraps the table in a paragraph whose rectangle and `_gfbgd` are both
// the table's computeBbox() result.
return _gab . Point { X : _ecbe , Y : _fgbe } ; } ; var _ebef = [ ] string { "\u2756" , "\u27a2" , "\u2713" , "\u2022" , "\uf0a7" , "\u25a1" , "\u2212" , "\u25a0" , "\u25aa" , "\u006f" } ; func _eabga ( _dgeeg float64 ) bool { return _ef . Abs ( _dgeeg ) < _bddeb } ; func ( _gaabg * textTable ) newTablePara ( ) * textPara { _baabc := _gaabg . computeBbox ( ) ;
// (*wordBag).depthIndexes returns the keys of `_cgdg` in ascending order, or nil for an empty bag.
_fbdda := & textPara { PdfRectangle : _baabc , _gfbgd : _baabc , _bgba : _gaabg } ; if _cgafg { _b . Log . Info ( "\u006e\u0065w\u0054\u0061\u0062l\u0065\u0050\u0061\u0072\u0061\u003a\u0020\u0025\u0073" , _fbdda ) ; } ; return _fbdda ; } ; func ( _dfdae * wordBag ) depthIndexes ( ) [ ] int { if len ( _dfdae . _cgdg ) == 0 { return nil ;
// paraList.lines concatenates the `_gfbb` lines of all paragraphs, in paragraph order.
} ; _cgfa := make ( [ ] int , len ( _dfdae . _cgdg ) ) ; _feg := 0 ; for _adef := range _dfdae . _cgdg { _cgfa [ _feg ] = _adef ; _feg ++ ; } ; _df . Ints ( _cgfa ) ; return _cgfa ; } ; func ( _fdaa paraList ) lines ( ) [ ] * textLine { var _cadf [ ] * textLine ; for _ , _defcg := range _fdaa { _cadf = append ( _cadf , _defcg . _gfbb ... ) ;
// _dbcdg returns the keys of `_acgbd` in ascending order.
// (*ruling).encloses reports whether the interval [`_efcf`, `_bebc`] lies inside the ruling's
// [`_agbc`, `_gffgd`] extent widened by `_cfgg` at both ends.
} ; return _cadf ; } ; func _dbcdg ( _acgbd map [ int ] intSet ) [ ] int { _ecfcf := make ( [ ] int , 0 , len ( _acgbd ) ) ; for _eebgg := range _acgbd { _ecfcf = append ( _ecfcf , _eebgg ) ; } ; _df . Ints ( _ecfcf ) ; return _ecfcf ; } ; func ( _fbgd * ruling ) encloses ( _efcf , _bebc float64 ) bool { return _fbgd . _agbc - _cfgg <= _efcf && _bebc <= _fbgd . _gffgd + _cfgg ;
// _ggff reports whether `_bedg` has at least `_ecg` runes, ends in a rune of the unicode.Hyphen
// class, and has a non-space rune directly before that final rune.
} ; func _ggff ( _bedg string ) bool { if _a . RuneCountInString ( _bedg ) < _ecg { return false ; } ; _daa , _efgfa := _a . DecodeLastRuneInString ( _bedg ) ; if _efgfa <= 0 || ! _f . Is ( _f . Hyphen , _daa ) { return false ; } ; _daa , _efgfa = _a . DecodeLastRuneInString ( _bedg [ : len ( _bedg ) - _efgfa ] ) ;
// rulingList.augmentGrid splits the list with vertsHorzs and returns the two lists unchanged when
// either is empty. Otherwise it compares the bounding boxes of the verticals and the horizontals
// and adds rulings of mark kind `_abgg` on the sides where one box extends past the other by more
// than `_cfgg` (see below).
return _efgfa > 0 && ! _f . IsSpace ( _daa ) ; } ; func ( _dbbfc rulingList ) augmentGrid ( ) ( rulingList , rulingList ) { _faged , _fedbb := _dbbfc . vertsHorzs ( ) ; if len ( _faged ) == 0 || len ( _fedbb ) == 0 { return _faged , _fedbb ; } ; _aade , _ggbf := _faged , _fedbb ; _dbaac := _faged . bbox ( ) ;
_gedbg := _fedbb . bbox ( ) ; if _bccgb { _b . Log . Info ( "\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0056\u003d\u0025\u0036\u002e\u0032\u0066" , _dbaac ) ; _b . Log . Info ( "\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0048\u003d\u0025\u0036\u002e\u0032\u0066" , _gedbg ) ;
// A vertical ruling is added at the left and/or right edge of the horizontals' box when that edge
// lies outside the verticals' box. A horizontal ruling is added at the bottom and/or top edge of
// the verticals' box when that edge lies outside the horizontals' box.
} ; var _bgae , _fcgd , _eceaeb , _cbafd * ruling ; if _gedbg . Llx < _dbaac . Llx - _cfgg { _bgae = & ruling { _gggfe : _abgg , _eabdg : _acgee , _befee : _gedbg . Llx , _agbc : _dbaac . Lly , _gffgd : _dbaac . Ury } ; _faged = append ( rulingList { _bgae } , _faged ... ) ; } ; if _gedbg . Urx > _dbaac . Urx + _cfgg { _fcgd = & ruling { _gggfe : _abgg , _eabdg : _acgee , _befee : _gedbg . Urx , _agbc : _dbaac . Lly , _gffgd : _dbaac . Ury } ;
_faged = append ( _faged , _fcgd ) ; } ; if _dbaac . Lly < _gedbg . Lly - _cfgg { _eceaeb = & ruling { _gggfe : _abgg , _eabdg : _cefaa , _befee : _dbaac . Lly , _agbc : _gedbg . Llx , _gffgd : _gedbg . Urx } ; _fedbb = append ( rulingList { _eceaeb } , _fedbb ... ) ; } ; if _dbaac . Ury > _gedbg . Ury + _cfgg { _cbafd = & ruling { _gggfe : _abgg , _eabdg : _cefaa , _befee : _dbaac . Ury , _agbc : _gedbg . Llx , _gffgd : _gedbg . Urx } ;
// If the combined length still equals the length of the receiver, the lists from vertsHorzs are
// returned; otherwise both versions are logged and the augmented lists are returned.
_fedbb = append ( _fedbb , _cbafd ) ; } ; if len ( _faged ) + len ( _fedbb ) == len ( _dbbfc ) { return _aade , _ggbf ; } ; _eaebe := append ( _faged , _fedbb ... ) ; _dbbfc . log ( "u\u006e\u0061\u0075\u0067\u006d\u0065\u006e\u0074\u0065\u0064" ) ; _eaebe . log ( "\u0061u\u0067\u006d\u0065\u006e\u0074\u0065d" ) ;
// (*textObject).getFillColor converts the non-stroking colorspace and colour of `_agbf` with `_badcf`.
// _gfdaa returns a reversed copy of `_eaaa`.
return _faged , _fedbb ; } ; func ( _dgfd * textObject ) getFillColor ( ) _ag . Color { return _badcf ( _dgfd . _agbf . ColorspaceNonStroking , _dgfd . _agbf . ColorNonStroking ) ; } ; func _gfdaa ( _eaaa [ ] int ) [ ] int { _eeede := make ( [ ] int , len ( _eaaa ) ) ; for _dbbfg , _gcef := range _eaaa { _eeede [ len ( _eaaa ) - 1 - _dbbfg ] = _gcef ;
// _ggcc renders the text `_bfcg` of list node `_egcd`: the text is split on newlines and every
// non-empty line is emitted prefixed with `*_abee` and terminated by a newline.
} ; return _eeede ; } ; func _ggcc ( _egcd * list , _abee * string ) string { _fbgf := _c . Split ( _egcd . _bfcg , "\u000a" ) ; _cdcc := & _c . Builder { } ; for _ , _dedc := range _fbgf { if _dedc != "" { _cdcc . WriteString ( * _abee ) ; _cdcc . WriteString ( _dedc ) ; _cdcc . WriteString ( "\u000a" ) ;
} ; } ; return _cdcc . String ( ) ; } ;
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
// TextMark represents extracted text on a page with information regarding both textual content,
// formatting (font and size) and positioning.
// It is the smallest unit of text on a PDF page, typically a single character.
//
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
// `bbox` of substring `term` in `text`.
//
2023-01-08 22:34:27 +00:00
// ex, _ := New(page)
// // handle errors
// pageText, _, _, err := ex.ExtractPageText()
// // handle errors
// text := pageText.Text()
// textMarks := pageText.Marks()
2022-07-13 21:28:43 +00:00
//
2023-01-08 22:34:27 +00:00
// start := strings.Index(text, term)
// end := start + len(term)
// spanMarks, err := textMarks.RangeOffset(start, end)
// // handle errors
// bbox, ok := spanMarks.BBox()
// // handle errors
2022-07-13 21:28:43 +00:00
type TextMark struct {
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Text holds the extracted (decoded) text of this mark.
	Text string
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Original holds the text as it is stored in the PDF, without the decoding applied to `Text`.
	Original string
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// BBox is the bounding box of the text.
2023-07-28 12:14:31 +00:00
	BBox _bg.PdfRectangle
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Font is the font the text was drawn with.
2023-07-28 12:14:31 +00:00
	Font *_bg.PdfFont
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// FontSize is the font size the text was drawn with.
	FontSize float64
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Offset is the position at which TextMark.Text starts within the extracted text. After
	//      text, textMarks := pageText.Text(), pageText.Marks()
	//      marks := textMarks.Elements()
	// marks[i].Offset is the offset of marks[i].Text in text.
	Offset int
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Meta is true for the spaces and line breaks that the extractor itself inserts into the
	// extracted text. Spaces (line breaks) are inserted between characters whose horizontal
	// (vertical) distance exceeds a threshold. See wordJoiner (lineJoiner) in
	// PageText.computeViews().
	Meta bool
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// FillColor is the fill color of the text.
	// It is nil for spaces and line breaks (i.e. when the Meta field is true).
2023-07-28 12:14:31 +00:00
	FillColor _ag.Color
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// StrokeColor is the stroke color of the text.
	// It is nil for spaces and line breaks (i.e. when the Meta field is true).
2023-07-28 12:14:31 +00:00
	StrokeColor _ag.Color
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// Orientation is the text orientation.
	Orientation int
2022-06-27 19:58:38 +00:00
2022-07-13 21:28:43 +00:00
	// DirectObject is the underlying PdfObject (text object) that represents the visible text.
	// It gives simple access to the text object when text has to be edited or replaced,
	// e.g. during redaction.
2023-07-28 12:14:31 +00:00
	DirectObject _ea.PdfObject
2022-07-13 21:28:43 +00:00
	// ObjString is the decoded string operand of a text-showing operator. It has the same value
	// as `Text`, except when one text object with a multi-character string operand produces
	// several glyphs; ObjString then spans characters that fall into different TextMark objects.
2023-07-28 12:14:31 +00:00
	ObjString []string

	// NOTE(review): Tw, Th and Tc are undocumented in the original source. They presumably carry
	// the word spacing, horizontal scaling and character spacing in effect for this mark —
	// TODO confirm against the code that populates them.
	Tw float64
	Th float64
	Tc float64

	// NOTE(review): Index is undocumented in the original source; its meaning is not visible
	// from this declaration — TODO confirm.
	Index int

	// Unexported bookkeeping fields; their use is not visible from this declaration.
	_fgfd bool
	_ggea *TextTable
}

// _begbe converts an operand list of exactly two numeric objects into a pair of float64 values.
// It returns an error when `_eaeeg` does not have exactly two elements or when
// GetNumbersAsFloat rejects them.
func _begbe(_eaeeg []_ea.PdfObject) (_cggde, _bfda float64, _bfaf error) {
	if count := len(_eaeeg); count != 2 {
		return 0, 0, _ce.Errorf("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064", count)
	}
	values, convErr := _ea.GetNumbersAsFloat(_eaeeg)
	if convErr != nil {
		return 0, 0, convErr
	}
	return values[0], values[1], nil
}
2023-02-07 17:17:49 +00:00
2023-07-28 12:14:31 +00:00
// String returns a multi-line description of the page text: a header line prefixed with "-"
// that reports the number of elements in `_ccf`, one line per element, and a closing line that
// repeats the header prefixed with "+".
func (_cfcb PageText) String() string {
	header := _ce.Sprintf("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073", len(_cfcb._ccf))
	rows := make([]string, 0, len(_cfcb._ccf)+2)
	rows = append(rows, "\u002d"+header)
	for _, mark := range _cfcb._ccf {
		rows = append(rows, mark.String())
	}
	rows = append(rows, "\u002b"+header)
	return _c.Join(rows, "\u000a")
}

// setTextMatrix builds a matrix from the six values in `_cebf` and stores it in both the `_fda`
// and `_cfec` fields. When `_cebf` does not have exactly six elements the call is logged and
// the text object is left untouched.
func (_bef *textObject) setTextMatrix(_cebf []float64) {
	if count := len(_cebf); count != 6 {
		_b.Log.Debug("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029", count)
		return
	}
	matrix := _gab.NewMatrix(_cebf[0], _cebf[1], _cebf[2], _cebf[3], _cebf[4], _cebf[5])
	_bef._fda = matrix
	_bef._cfec = matrix
}

// bbox returns the rectangle embedded in the paragraph.
func (_babe *textPara) bbox() _bg.PdfRectangle {
	return _babe.PdfRectangle
}
// buildList collects the list structure elements (tag "L") found directly under the top
// structure element, plus whatever `_eabd` extracts from "Table" elements, and converts them
// into `*list` values through `_eafb` and `_bgabc`.
//
// The top element is the single child of the root when that child is tagged "Document",
// "Sect", "Part", "Div" or "Art"; when the root does not have exactly one child, a synthetic
// element wrapping all of the root's children is used instead.
//
// It returns nil when the receiver is nil or when no top element could be determined.
func (_aef *structTreeRoot) buildList(_egdgb map[int][]*textLine, _ggc _ea.PdfObject) []*list {
	if _aef == nil {
		_b.Log.Debug("\u0062\u0075\u0069\u006c\u0064\u004c\u0069\u0073\u0074\u003a\u0020t\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0069\u0073 \u006e\u0069\u006c")
		return nil
	}
	var _gdeb *structElement
	_cddg := []structElement{}
	if len(_aef._cegf) == 1 {
		_cbgd := _aef._cegf[0]._bfeg
		if _cbgd == "\u0044\u006f\u0063\u0075\u006d\u0065\u006e\u0074" || _cbgd == "\u0053\u0065\u0063\u0074" || _cbgd == "\u0050\u0061\u0072\u0074" || _cbgd == "\u0044\u0069\u0076" || _cbgd == "\u0041\u0072\u0074" {
			_gdeb = &_aef._cegf[0]
		}
	} else {
		_gdeb = &structElement{_efce: _aef._cegf, _bfeg: _aef._baeb}
	}
	if _gdeb == nil {
		_b.Log.Debug("\u0062\u0075\u0069\u006cd\u004c\u0069\u0073\u0074\u003a\u0020\u0074\u006f\u0070\u0045l\u0065m\u0065\u006e\u0074\u0020\u0069\u0073\u0020n\u0069\u006c")
		return nil
	}
	for _, _bcbgc := range _gdeb._efce {
		if _bcbgc._bfeg == "\u004c" {
			_cddg = append(_cddg, _bcbgc)
		} else if _bcbgc._bfeg == "\u0054\u0061\u0062l\u0065" {
			_ccdc := _eabd(_bcbgc)
			_cddg = append(_cddg, _ccdc...)
		}
	}
	_bdgb := _eafb(_cddg, _egdgb, _ggc)
	var _dadfb []*list
	for _, _gfadc := range _bdgb {
		_cbaf := _bgabc(_gfadc)
		_dadfb = append(_dadfb, _cbaf...)
	}
	return _dadfb
}

// _fdfb rounds `_afed` to the nearest multiple of `_fdec`. A `_fdec` of 0 is treated as 1.
func _fdfb(_afed float64, _fdec int) int {
	if _fdec == 0 {
		_fdec = 1
	}
	_gfba := float64(_fdec)
	return int(_ef.Round(_afed/_gfba) * _gfba)
}

// _fgdf returns the intersection of rectangles `_faebb` and `_faagd` and true, or a zero
// rectangle and false when `_geec` reports that the two do not qualify.
func _fgdf(_faebb, _faagd _bg.PdfRectangle) (_bg.PdfRectangle, bool) {
	if !_geec(_faebb, _faagd) {
		return _bg.PdfRectangle{}, false
	}
	return _bg.PdfRectangle{
		Llx: _ef.Max(_faebb.Llx, _faagd.Llx),
		Urx: _ef.Min(_faebb.Urx, _faagd.Urx),
		Lly: _ef.Max(_faebb.Lly, _faagd.Lly),
		Ury: _ef.Min(_faebb.Ury, _faagd.Ury),
	}, true
}

// extractPageText parses the content stream `_aga` and walks its operators, collecting text
// marks and stroke/fill paths into a PageText.
//
// `_cge` holds the resources used to process the stream, `_fac` is the matrix that is
// pre-multiplied onto the CTM, and `_ece` is the form XObject recursion level (the function
// calls itself with `_ece+1` for every form XObject it descends into and fails once the level
// exceeds `_bb`).
//
// It returns the PageText, the two counters `_feab` and `_fefg` of the text state, and an
// error. The PageText is returned (possibly partially filled) even when the error is non-nil.
func (_ffe *Extractor) extractPageText(_aga string, _cge *_bg.PdfPageResources, _fac _gab.Matrix, _ece int) (*PageText, int, int, error) {
	_b.Log.Trace("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d", _ece)
	_dbc := &PageText{_ebbd: _ffe._de, _cbc: _ffe._fbb, _fabg: _ffe._ca}
	_gccg := _afbd(_ffe._de)
	var _afeg stateStack
	_cab := _aedc(_ffe, _cge, _fb.GraphicsState{}, &_gccg, &_afeg)
	_cgd := shapesState{_afee: _fac, _dfgb: _gab.IdentityMatrix(), _eeadb: _cab}
	var _abe bool // true while between BT and ET
	_bec := -1    // MCID of the enclosing marked-content sequence, -1 when there is none
	if _ece > _bb {
		_gcg := _d.New("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077")
		_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076", _ece, _gcg)
		return _dbc, _gccg._feab, _gccg._fefg, _gcg
	}
	_ddb := _fb.NewContentStreamParser(_aga)
	_cec, _bcbg := _ddb.Parse()
	if _bcbg != nil {
		_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bcbg)
		return _dbc, _gccg._feab, _gccg._fefg, _bcbg
	}
	_dbc._gagg = _cec
	_dbbf := _fb.NewContentStreamProcessor(*_cec)
	// NOTE: inside the handler `_bcbg` is always nil. It was checked above and is only
	// reassigned after Process returns, so `return _bcbg` in the handler skips the operator
	// without aborting the extraction.
	_dbbf.AddHandler(_fb.HandlerConditionEnumAllOperands, "", func(_cbf *_fb.ContentStreamOperation, _eaec _fb.GraphicsState, _acb *_bg.PdfPageResources) error {
		_dgdb := _cbf.Operand
		if _cfab {
			_b.Log.Info("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s", _cbf)
		}
		switch _dgdb {
		case "\u0071": // q
			if _bdaae {
				_b.Log.Info("\u0063\u0074\u006d\u003d\u0025\u0073", _cgd._dfgb)
			}
			_afeg.push(&_gccg)
		case "\u0051": // Q
			if !_afeg.empty() {
				_gccg = *_afeg.pop()
			}
			_cgd._dfgb = _eaec.CTM
			if _bdaae {
				_b.Log.Info("\u0063\u0074\u006d\u003d\u0025\u0073", _cgd._dfgb)
			}
		case "\u0042\u0044\u0043": // BDC
			// The property operand is read from Params[1]. Guard the index so that a malformed
			// content stream with fewer operands is skipped instead of causing a panic.
			if len(_cbf.Params) < 2 {
				_b.Log.Debug("ERROR: BDC op=%s. Expected 2 operands, got %d", _cbf, len(_cbf.Params))
				return nil
			}
			_ecea, _eec := _ea.GetDict(_cbf.Params[1])
			if !_eec {
				_b.Log.Debug("\u0045\u0052\u0052O\u0052\u003a\u0020\u0042D\u0043\u0020\u006f\u0070\u003d\u0025\u0073 \u0047\u0065\u0074\u0044\u0069\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064", _cbf)
				return _bcbg
			}
			_ega := _ecea.Get("\u004d\u0043\u0049\u0044")
			if _ega != nil {
				_bgcg, _aba := _ea.GetIntVal(_ega)
				if !_aba {
					_b.Log.Debug("\u0045R\u0052\u004fR\u003a\u0020\u0042\u0044C\u0020\u006f\u0070=\u0025\u0073\u002e\u0020\u0042\u0061\u0064\u0020\u006eum\u0065\u0072\u0069c\u0061\u006c \u006f\u0062\u006a\u0065\u0063\u0074.\u0020\u006f=\u0025\u0073", _cbf, _ega)
				}
				_bec = _bgcg
			} else {
				_bec = -1
			}
		case "\u0045\u004d\u0043": // EMC
			_bec = -1
		case "\u0042\u0054": // BT
			if _abe {
				// A nested BT: keep the marks gathered so far before starting a new text object.
				_b.Log.Debug("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074")
				_dbc._ccf = append(_dbc._ccf, _cab._fecd...)
			}
			_abe = true
			_ffa := _eaec
			_ffa.CTM = _fac.Mult(_ffa.CTM)
			_cab = _aedc(_ffe, _acb, _ffa, &_gccg, &_afeg)
			_cgd._eeadb = _cab
		case "\u0045\u0054": // ET
			if !_abe {
				_b.Log.Debug("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074")
			}
			_abe = false
			_dbc._ccf = append(_dbc._ccf, _cab._fecd...)
			_cab.reset()
		case "\u0054\u002a": // T*
			_cab.nextLine()
		case "\u0054\u0064": // Td
			if _dcf, _def := _cab.checkOp(_cbf, 2, true); !_dcf {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _def)
				return _def
			}
			_bbd, _daeb, _ecfg := _begbe(_cbf.Params)
			if _ecfg != nil {
				return _ecfg
			}
			_cab.moveText(_bbd, _daeb)
		case "\u0054\u0044": // TD
			if _egb, _gcac := _cab.checkOp(_cbf, 2, true); !_egb {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _gcac)
				return _gcac
			}
			_eaee, _cfe, _eed := _begbe(_cbf.Params)
			if _eed != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _eed)
				return _eed
			}
			_cab.moveTextSetLeading(_eaee, _cfe)
		case "\u0054\u006a": // Tj
			if _afg, _beb := _cab.checkOp(_cbf, 1, true); !_afg {
				_b.Log.Debug("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076", _cbf, _beb)
				return _beb
			}
			_aeg := _ea.TraceToDirectObject(_cbf.Params[0])
			_decc, _dea := _ea.GetStringBytes(_aeg)
			if !_dea {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a\u0020T\u006a\u0020o\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074S\u0074\u0072\u0069\u006e\u0067\u0042\u0079\u0074\u0065\u0073\u0020\u0066a\u0069\u006c\u0065\u0064", _cbf)
				return _ea.ErrTypeError
			}
			return _cab.showText(_aeg, _decc, _bec)
		case "\u0054\u004a": // TJ
			if _eeb, _eafe := _cab.checkOp(_cbf, 1, true); !_eeb {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u004a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _eafe)
				return _eafe
			}
			_abbd, _aab := _ea.GetArray(_cbf.Params[0])
			if !_aab {
				_b.Log.Debug("\u0045\u0052\u0052OR\u003a\u0020\u0054\u004a\u0020\u006f\u0070\u003d\u0025s\u0020G\u0065t\u0041r\u0072\u0061\u0079\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064", _cbf)
				return _bcbg
			}
			return _cab.showTextAdjusted(_abbd, _bec)
		case "\u0027": // '
			if _cdc, _cefa := _cab.checkOp(_cbf, 1, true); !_cdc {
				_b.Log.Debug("\u0045R\u0052O\u0052\u003a\u0020\u0027\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _cefa)
				return _cefa
			}
			_cfd := _ea.TraceToDirectObject(_cbf.Params[0])
			_eea, _efd := _ea.GetStringBytes(_cfd)
			if !_efd {
				_b.Log.Debug("\u0045\u0052RO\u0052\u003a\u0020'\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064", _cbf)
				return _ea.ErrTypeError
			}
			_cab.nextLine()
			return _cab.showText(_cfd, _eea, _bec)
		case "\u0022": // "
			if _fbg, _bdc := _cab.checkOp(_cbf, 3, true); !_fbg {
				_b.Log.Debug("\u0045R\u0052O\u0052\u003a\u0020\u0022\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bdc)
				return _bdc
			}
			_cebe, _fbf, _aabc := _begbe(_cbf.Params[:2])
			if _aabc != nil {
				return _aabc
			}
			_cfeb := _ea.TraceToDirectObject(_cbf.Params[2])
			_fec, _cebd := _ea.GetStringBytes(_cfeb)
			if !_cebd {
				_b.Log.Debug("\u0045\u0052RO\u0052\u003a\u0020\"\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064", _cbf)
				return _ea.ErrTypeError
			}
			// The operands of `"` are, in order, the word spacing, the character spacing and
			// the string (equivalent to `aw Tw ac Tc string '`), so the first number is the
			// word spacing and the second one the character spacing.
			_cab.setWordSpacing(_cebe)
			_cab.setCharSpacing(_fbf)
			_cab.nextLine()
			return _cab.showText(_cfeb, _fec, _bec)
		case "\u0054\u004c": // TL
			_gge, _dfg := _cdb(_cbf)
			if _dfg != nil {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u004c\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _dfg)
				return _dfg
			}
			_cab.setTextLeading(_gge)
		case "\u0054\u0063": // Tc
			_aeeg, _bebg := _cdb(_cbf)
			if _bebg != nil {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u0063\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bebg)
				return _bebg
			}
			_cab.setCharSpacing(_aeeg)
		case "\u0054\u0066": // Tf
			if _face, _fgb := _cab.checkOp(_cbf, 2, true); !_face {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u0066\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _fgb)
				return _fgb
			}
			_fbd, _eaeb := _ea.GetNameVal(_cbf.Params[0])
			if !_eaeb {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a \u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u004ea\u006d\u0065\u0056\u0061\u006c\u0020\u0066a\u0069\u006c\u0065\u0064", _cbf)
				return _ea.ErrTypeError
			}
			_ecfa, _dff := _ea.GetNumberAsFloat(_cbf.Params[1])
			// Check the error of the size conversion itself. Testing the `ok` flag of the
			// font-name lookup here (as was done before) can never fire, because that flag is
			// known to be true at this point.
			if _dff != nil {
				_b.Log.Debug("\u0045\u0052\u0052O\u0052\u003a\u0020\u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u0046\u006c\u006f\u0061\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065d\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _cbf, _dff)
				return _dff
			}
			_dff = _cab.setFont(_fbd, _ecfa)
			// An unsupported font is remembered on the text object but does not abort extraction.
			_cab._cbde = _ae.Is(_dff, _ea.ErrNotSupported)
			if _dff != nil && !_cab._cbde {
				return _dff
			}
		case "\u0054\u006d": // Tm
			if _aae, _cag := _cab.checkOp(_cbf, 6, true); !_aae {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u006d\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _cag)
				return _cag
			}
			_ebdf, _cgdc := _ea.GetNumbersAsFloat(_cbf.Params)
			if _cgdc != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _cgdc)
				return _cgdc
			}
			_cab.setTextMatrix(_ebdf)
		case "\u0054\u0072": // Tr
			if _fge, _eab := _cab.checkOp(_cbf, 1, true); !_fge {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u0072\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _eab)
				return _eab
			}
			_add, _eecf := _ea.GetIntVal(_cbf.Params[0])
			if !_eecf {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0072\u0020\u006f\u0070\u003d\u0025\u0073 \u0047e\u0074\u0049\u006e\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064", _cbf)
				return _ea.ErrTypeError
			}
			_cab.setTextRenderMode(_add)
		case "\u0054\u0073": // Ts
			if _fef, _cfc := _cab.checkOp(_cbf, 1, true); !_fef {
				_b.Log.Debug("\u0045\u0052R\u004f\u0052\u003a \u0054\u0073\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _cfc)
				return _cfc
			}
			_dad, _gfc := _ea.GetNumberAsFloat(_cbf.Params[0])
			if _gfc != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _gfc)
				return _gfc
			}
			_cab.setTextRise(_dad)
		case "\u0054\u0077": // Tw
			if _dce, _bfe := _cab.checkOp(_cbf, 1, true); !_dce {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bfe)
				return _bfe
			}
			_aedg, _bfb := _ea.GetNumberAsFloat(_cbf.Params[0])
			if _bfb != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bfb)
				return _bfb
			}
			_cab.setWordSpacing(_aedg)
		case "\u0054\u007a": // Tz
			if _adf, _ggd := _cab.checkOp(_cbf, 1, true); !_adf {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _ggd)
				return _ggd
			}
			_ggeg, _aac := _ea.GetNumberAsFloat(_cbf.Params[0])
			if _aac != nil {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _aac)
				return _aac
			}
			_cab.setHorizScaling(_ggeg)
		case "\u0063\u006d": // cm
			_cgd._dfgb = _eaec.CTM
			if _cgd._dfgb.Singular() {
				// Replace a singular CTM by a pure translation so that it stays usable.
				_egcf := _gab.IdentityMatrix().Translate(_cgd._dfgb.Translation())
				_b.Log.Debug("S\u0069n\u0067\u0075\u006c\u0061\u0072\u0020\u0063\u0074m\u003d\u0025\u0073\u2192%s", _cgd._dfgb, _egcf)
				_cgd._dfgb = _egcf
			}
			if _bdaae {
				_b.Log.Info("\u0063\u0074\u006d\u003d\u0025\u0073", _cgd._dfgb)
			}
		case "\u006d": // m
			if len(_cbf.Params) != 2 {
				// Wrong operand count is only logged; the operator is skipped.
				_b.Log.Debug("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006d\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e", _gc)
				return nil
			}
			_cgc, _gaaf := _ea.GetNumbersAsFloat(_cbf.Params)
			if _gaaf != nil {
				return _gaaf
			}
			_cgd.moveTo(_cgc[0], _cgc[1])
		case "\u006c": // l
			if len(_cbf.Params) != 2 {
				// Wrong operand count is only logged; the operator is skipped.
				_b.Log.Debug("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006c\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e", _gc)
				return nil
			}
			_bdea, _eag := _ea.GetNumbersAsFloat(_cbf.Params)
			if _eag != nil {
				return _eag
			}
			_cgd.lineTo(_bdea[0], _bdea[1])
		case "\u0063": // c
			if len(_cbf.Params) != 6 {
				return _gc
			}
			_ecd, _gabd := _ea.GetNumbersAsFloat(_cbf.Params)
			if _gabd != nil {
				return _gabd
			}
			_b.Log.Debug("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f", _ecd)
			_cgd.cubicTo(_ecd[0], _ecd[1], _ecd[2], _ecd[3], _ecd[4], _ecd[5])
		case "\u0076", "\u0079": // v, y
			if len(_cbf.Params) != 4 {
				return _gc
			}
			_cgca, _fde := _ea.GetNumbersAsFloat(_cbf.Params)
			if _fde != nil {
				return _fde
			}
			_b.Log.Debug("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f", _cgca)
			_cgd.quadraticTo(_cgca[0], _cgca[1], _cgca[2], _cgca[3])
		case "\u0068": // h
			_cgd.closePath()
		case "\u0072\u0065": // re
			if len(_cbf.Params) != 4 {
				return _gc
			}
			_deba, _cga := _ea.GetNumbersAsFloat(_cbf.Params)
			if _cga != nil {
				return _cga
			}
			_cgd.drawRectangle(_deba[0], _deba[1], _deba[2], _deba[3])
			_cgd.closePath()
		case "\u0053": // S
			_cgd.stroke(&_dbc._bgac)
			_cgd.clearPath()
		case "\u0073": // s
			_cgd.closePath()
			_cgd.stroke(&_dbc._bgac)
			_cgd.clearPath()
		case "\u0046": // F
			_cgd.fill(&_dbc._ffb)
			_cgd.clearPath()
		case "\u0066", "\u0066\u002a": // f, f*
			_cgd.closePath()
			_cgd.fill(&_dbc._ffb)
			_cgd.clearPath()
		case "\u0042", "\u0042\u002a": // B, B*
			_cgd.fill(&_dbc._ffb)
			_cgd.stroke(&_dbc._bgac)
			_cgd.clearPath()
		case "\u0062", "\u0062\u002a": // b, b*
			_cgd.closePath()
			_cgd.fill(&_dbc._ffb)
			_cgd.stroke(&_dbc._bgac)
			_cgd.clearPath()
		case "\u006e": // n
			_cgd.clearPath()
		case "\u0044\u006f": // Do
			if len(_cbf.Params) == 0 {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0078\u0070\u0065\u0063\u0074\u0065\u0064\u0020\u0058\u004fbj\u0065c\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006f\u0070\u0065\u0072\u0061n\u0064\u0020\u0066\u006f\u0072\u0020\u0044\u006f\u0020\u006f\u0070\u0065\u0072\u0061\u0074\u006f\u0072.\u0020\u0047\u006f\u0074\u0020\u0025\u002b\u0076\u002e", _cbf.Params)
				return _ea.ErrRangeError
			}
			_dgf, _bece := _ea.GetName(_cbf.Params[0])
			if !_bece {
				_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u0044\u006f\u0020\u006f\u0070e\u0072a\u0074\u006f\u0072\u0020\u0058\u004f\u0062\u006a\u0065\u0063\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006fp\u0065\u0072\u0061\u006e\u0064\u003a\u0020\u0025\u002b\u0076\u002e", _cbf.Params[0])
				return _ea.ErrTypeError
			}
			_, _agb := _acb.GetXObjectByName(*_dgf)
			if _agb != _bg.XObjectTypeForm {
				// Only form XObjects are processed here.
				break
			}
			// Results of already processed forms are cached on the extractor by XObject name.
			_beca, _bece := _ffe._gbfd[_dgf.String()]
			if !_bece {
				_cfgf, _ade := _acb.GetXObjectFormByName(*_dgf)
				if _ade != nil {
					_b.Log.Debug("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v", _ade)
					return _ade
				}
				_cde, _ade := _cfgf.GetContentStream()
				if _ade != nil {
					_b.Log.Debug("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v", _ade)
					return _ade
				}
				// A form without its own resources uses the resources of its parent.
				_fbbb := _cfgf.Resources
				if _fbbb == nil {
					_fbbb = _acb
				}
				_ggg := _eaec.CTM
				if _ceg, _cea := _ea.GetArray(_cfgf.Matrix); _cea {
					_dge, _dde := _ceg.GetAsFloat64Slice()
					if _dde != nil {
						return _dde
					}
					if len(_dge) != 6 {
						return _gc
					}
					_bgab := _gab.NewMatrix(_dge[0], _dge[1], _dge[2], _dge[3], _dge[4], _dge[5])
					_ggg = _eaec.CTM.Mult(_bgab)
				}
				_bgdd, _gbcc, _cecc, _ade := _ffe.extractPageText(string(_cde), _fbbb, _fac.Mult(_ggg), _ece+1)
				if _ade != nil {
					_b.Log.Debug("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v", _ade)
					return _ade
				}
				_beca = textResult{*_bgdd, _gbcc, _cecc}
				_ffe._gbfd[_dgf.String()] = _beca
			}
			_cgd._dfgb = _eaec.CTM
			if _bdaae {
				_b.Log.Info("\u0063\u0074\u006d\u003d\u0025\u0073", _cgd._dfgb)
			}
			_dbc._ccf = append(_dbc._ccf, _beca._ddd._ccf...)
			_dbc._bgac = append(_dbc._bgac, _beca._ddd._bgac...)
			_dbc._ffb = append(_dbc._ffb, _beca._ddd._ffb...)
			_gccg._feab += _beca._aede
			_gccg._fefg += _beca._bega
		case "\u0072\u0067", "\u0067", "\u006b", "\u0063\u0073", "\u0073\u0063", "\u0073\u0063\u006e": // rg, g, k, cs, sc, scn
			_cab._agbf.ColorspaceNonStroking = _eaec.ColorspaceNonStroking
			_cab._agbf.ColorNonStroking = _eaec.ColorNonStroking
		case "\u0052\u0047", "\u0047", "\u004b", "\u0043\u0053", "\u0053\u0043", "\u0053\u0043\u004e": // RG, G, K, CS, SC, SCN
			_cab._agbf.ColorspaceStroking = _eaec.ColorspaceStroking
			_cab._agbf.ColorStroking = _eaec.ColorStroking
		}
		return nil
	})
	_bcbg = _dbbf.Process(_cge)
	return _dbc, _gccg._feab, _gccg._fefg, _bcbg
}

// _cbba returns the Llx of the first element of the line's `_aafd` slice. Element 0 is
// indexed without a length check, so the slice must not be empty.
func _cbba(_dcag *textLine) float64 {
	return _dcag._aafd[0].Llx
}

// _effad returns the distinct keys of all inner maps of `_efad`, sorted in increasing order.
func _effad(_efad map[float64]map[float64]gridTile) []float64 {
	_feee := make([]float64, 0, len(_efad))
	_fbbefg := make(map[float64]struct{}, len(_efad))
	for _, _becee := range _efad {
		for _aadc := range _becee {
			if _, _gdac := _fbbefg[_aadc]; _gdac {
				continue
			}
			_feee = append(_feee, _aadc)
			_fbbefg[_aadc] = struct{}{}
		}
	}
	_df.Float64s(_feee)
	return _feee
}

// newSubPath clears the current path and, when `_bdaae` is set, logs the shapes state.
func (_gabb *shapesState) newSubPath() {
	_gabb.clearPath()
	if _bdaae {
		_b.Log.Info("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073", _gabb)
	}
}
2023-05-29 17:26:33 +00:00
// ImageMark represents an image drawn on a page and its position in device coordinates.
// All coordinates are in device coordinates.
2023-07-28 12:14:31 +00:00
type ImageMark struct {
	// Image is the image drawn on the page.
	Image *_bg.Image
2023-05-29 17:26:33 +00:00
	// Width and Height are the dimensions of the image as displayed in the PDF.
	Width, Height float64
	// X and Y give the position of the image in PDF coordinates (lower left corner).
	X, Y float64
	// Angle is the rotation angle in degrees, if the image is rotated.
2023-07-28 12:14:31 +00:00
	Angle float64
}

// _bffg picks a ruling kind from the two extents `_dcce` and `_bbea` and the minimum extent
// `_gggc`:
//   - `_cefaa` when `_dcce` reaches the minimum and `_ggdg(_bbea, _dcce)` holds,
//   - `_acgee` when `_bbea` reaches the minimum and `_ggdg(_dcce, _bbea)` holds,
//   - `_bgbdg` otherwise.
//
// NOTE(review): presumably `_dcce`/`_bbea` are a width/height pair and the results are the
// horizontal/vertical/none kinds — confirm against the callers.
func _bffg(_dcce, _bbea, _gggc float64) rulingKind {
	switch {
	case _dcce >= _gggc && _ggdg(_bbea, _dcce):
		return _cefaa
	case _bbea >= _gggc && _ggdg(_dcce, _bbea):
		return _acgee
	default:
		return _bgbdg
	}
}

// _befef returns the result of `_dddc` for the first position at which `_edfbc` holds the
// pointer `_gbfdb`. When the word is not present an error is logged and nil is returned.
func _befef(_edfbc []*textWord, _gbfdb *textWord) []*textWord {
	for pos := range _edfbc {
		if _edfbc[pos] != _gbfdb {
			continue
		}
		return _dddc(_edfbc, pos)
	}
	_b.Log.Error("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073", _gbfdb)
	return nil
}

// _ggeb builds one "bullet" list per line stored under the first key returned by
// `_cecd(_edeb)`. For every such line it
//   - asks `_gdgf` for the nested lists between this line and the next one (only when there
//     is more than one key),
//   - gathers the continuation lines of the item with `_cbcbe`, bounded by the first line of
//     the first nested list when there is one, otherwise by the next top-level line,
//   - creates the list with `_facb` and sets its text with `_dcdgd`.
//
// An empty, non-nil slice is returned when `_cecd` yields no keys.
func _ggeb(_bfca []*textLine, _edeb map[float64][]*textLine) []*list {
	keys := _cecd(_edeb)
	result := []*list{}
	if len(keys) == 0 {
		return result
	}
	nextLevel := 1
	topLines := _edeb[keys[0]]
	lastIdx := len(topLines) - 1
	for idx, head := range topLines {
		headPos := head._cbbd

		// Position of the following top-level line, or -1 for the last one.
		nextPos := -1.0
		if idx < lastIdx {
			nextPos = topLines[idx+1]._cbbd
		}

		children := []*list{}
		if nextLevel < len(keys) {
			children = _gdgf(_bfca, _edeb, keys, nextLevel, headPos, nextPos)
		}

		limit := nextPos
		if len(children) > 0 {
			if first := children[0]; len(first._ecdee) > 0 {
				limit = first._ecdee[0]._cbbd
			}
		}

		itemLines := []*textLine{head}
		itemLines = append(itemLines, _cbcbe(head, _bfca, keys, headPos, limit)...)

		item := _facb(itemLines, "\u0062\u0075\u006c\u006c\u0065\u0074", children)
		item._bfcg = _dcdgd(itemLines, "")
		result = append(result, item)
	}
	return result
}
2022-07-13 21:28:43 +00:00
2023-06-30 13:19:48 +00:00
// TextTable represents a table.
// Cells are ordered top-to-bottom, left-to-right.
// Cells[y] is the (0-offset) y'th row in the table.
// Cells[y][x] is the cell in the (0-offset) x'th column of the y'th row of the table.
2023-07-28 12:14:31 +00:00
type TextTable struct {
	// PdfRectangle is the embedded rectangle of the table.
	_bg.PdfRectangle
	// W and H are the table dimensions; presumably the number of columns and rows — TODO confirm.
	W, H int
	// Cells holds the table cells, indexed as Cells[y][x].
	Cells [][]TableCell
}

// add appends the points `_aaba` to the subpath.
func (_fbff *subpath) add(_aaba ..._gab.Point) {
	_fbff._fbcgf = append(_fbff._fbcgf, _aaba...)
}

// vertsHorzs partitions the rulings by kind: rulings of kind `_acgee` go into the first
// result and rulings of kind `_cefaa` into the second. Rulings of any other kind are dropped.
// The relative order of the rulings is kept.
func (_baba rulingList) vertsHorzs() (rulingList, rulingList) {
	var verts, horzs rulingList
	for _, r := range _baba {
		if r._eabdg == _acgee {
			verts = append(verts, r)
		} else if r._eabdg == _cefaa {
			horzs = append(horzs, r)
		}
	}
	return verts, horzs
}

// Values of markKind.
const (
	_fgab markKind = iota
	_bafga
	_ceag
	_abgg
)

// _dfcggd returns `_fdcbd` cut down to its first `_agfb` bytes. Strings shorter than `_agfb`
// bytes are returned unchanged.
func _dfcggd(_fdcbd string, _agfb int) string {
	if len(_fdcbd) >= _agfb {
		return _fdcbd[:_agfb]
	}
	return _fdcbd
}

// _ceac returns the keys of `_beaeg` sorted in increasing order.
func _ceac(_beaeg map[float64]gridTile) []float64 {
	keys := make([]float64, len(_beaeg))
	next := 0
	for k := range _beaeg {
		keys[next] = k
		next++
	}
	_df.Float64s(keys)
	return keys
}

// extractFormImages extracts the images of the form XObject named `_cbg` in resources `_eeed`
// by running extractContentStreamImages on the form's content stream. The form's own
// resources are used when it has any, `_eeed` otherwise. On success the `_ddc` counter is
// incremented. Nothing is done (and nil is returned) when the lookup yields no form.
func (_fee *imageExtractContext) extractFormImages(_cbg *_ea.PdfObjectName, _cef _fb.GraphicsState, _eeed *_bg.PdfPageResources) error {
	form, err := _eeed.GetXObjectFormByName(*_cbg)
	if err != nil {
		return err
	}
	if form == nil {
		return nil
	}
	content, err := form.GetContentStream()
	if err != nil {
		return err
	}
	resources := form.Resources
	if resources == nil {
		resources = _eeed
	}
	if err = _fee.extractContentStreamImages(string(content), resources); err != nil {
		return err
	}
	_fee._ddc++
	return nil
}

// _gceee returns nil when a license key is present and licensed, or when `_cd` is set.
// Otherwise it prints a notice to standard output and returns an error.
func _gceee(_efcb *PageText) error {
	key := _gb.GetLicenseKey()
	licensed := key != nil && key.IsLicensed()
	if licensed || _cd {
		return nil
	}
	_ce.Printf("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a")
	_ce.Println("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f")
	return _d.New("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064")
}

// isActualGrid reports whether the rulings form a grid. The rulings are first split into two
// lists by augmentGrid. The check fails when either list is shorter than required
// (`_gfda+1` and `_cafad+1`), or when the alignment test fails: with `_caec` set, the outermost
// rulings of both lists must meet according to `_eabga`; otherwise both lists must report
// aligned(). On success the concatenation of both lists is returned together with true; on
// failure nil and false.
func (_bacbb rulingList) isActualGrid() (rulingList, bool) {
	verts, horzs := _bacbb.augmentGrid()
	if len(verts) < _gfda+1 || len(horzs) < _cafad+1 {
		if _bccgb {
			_b.Log.Info("\u0069s\u0041\u0063t\u0075\u0061\u006c\u0047r\u0069\u0064\u003a \u004e\u006f\u0074\u0020\u0061\u006c\u0069\u0067\u006eed\u002e\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u003c\u0020\u0025d\u0020\u0078 \u0025\u0064", len(verts), len(horzs), _gfda+1, _cafad+1)
		}
		return nil, false
	}
	if _bccgb {
		_b.Log.Info("\u0069\u0073\u0041\u0063\u0074\u0075a\u006c\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0073\u0020\u003a\u0020\u0025t\u0020\u0026\u0020\u0025\u0074\u0020\u2192 \u0025\u0074", _bacbb, len(verts) >= 2, len(horzs) >= 2, len(verts) >= 2 && len(horzs) >= 2)
		for pos, r := range _bacbb {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a", pos, r)
		}
	}
	if _caec {
		v0, v1 := verts[0], verts[len(verts)-1]
		h0, h1 := horzs[0], horzs[len(horzs)-1]
		meets := _eabga(v0._befee-h0._agbc) && _eabga(v1._befee-h0._gffgd) && _eabga(h0._befee-v0._gffgd) && _eabga(h1._befee-v0._agbc)
		if !meets {
			if _bccgb {
				_b.Log.Info("\u0069\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069d\u003a\u0020\u0020N\u006f\u0074 \u0061\u006c\u0069\u0067\u006e\u0065d\u002e\n\t\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0076\u0031\u003d\u0025\u0073\u000a\u0009\u0068\u0030\u003d\u0025\u0073\u000a\u0009\u0068\u0031\u003d\u0025\u0073", v0, v1, h0, h1)
			}
			return nil, false
		}
	} else {
		if !verts.aligned() {
			if _dbdb {
				_b.Log.Info("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0076\u0065\u0072\u0074\u0073\u002e\u0020%\u0064", len(verts))
			}
			return nil, false
		}
		if !horzs.aligned() {
			if _bccgb {
				_b.Log.Info("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0068\u006f\u0072\u007a\u0073\u002e\u0020%\u0064", len(horzs))
			}
			return nil, false
		}
	}
	grid := append(verts, horzs...)
	return grid, true
}
2023-01-08 22:34:27 +00:00
2023-07-28 12:14:31 +00:00
// PageFonts represents extracted fonts on a PDF page.
type PageFonts struct {
	Fonts []Font
}

// lineTo appends the point (`_dfecb`, `_dcde`) to the state via addPoint,
// logging it first when the `_bdaae` debug flag is set.
func (_faec *shapesState) lineTo(_dfecb, _dcde float64) {
	if _bdaae {
		_b.Log.Info("lineTo(%.2f,%.2f p=%.2f", _dfecb, _dcde, _faec.devicePoint(_dfecb, _dcde))
	}
	_faec.addPoint(_dfecb, _dcde)
}

// reduceTiling returns a table with removable rows and columns dropped.
//
// A row is dropped when it is not the first row, its tiling coordinate is
// within `_bbdfc` of the previous row's, and emptyCompositeRow reports true.
// A column is dropped when it is not the last column, its coordinate is within
// `_bbdfc` of the next column's, and emptyCompositeColumn reports true.
// When nothing is dropped the receiver itself is returned.
func (_dcfdb *textTable) reduceTiling(_cdcg gridTiling, _bbdfc float64) *textTable {
	keptRows := make([]int, 0, _dcfdb._gcbge)
	keptCols := make([]int, 0, _dcfdb._ddfc)
	colCoords := _cdcg._eaafd
	rowCoords := _cdcg._dade

	for y := 0; y < _dcfdb._gcbge; y++ {
		drop := y > 0 && _ef.Abs(rowCoords[y-1]-rowCoords[y]) < _bbdfc && _dcfdb.emptyCompositeRow(y)
		if !drop {
			keptRows = append(keptRows, y)
		}
	}
	for x := 0; x < _dcfdb._ddfc; x++ {
		drop := x < _dcfdb._ddfc-1 && _ef.Abs(colCoords[x+1]-colCoords[x]) < _bbdfc && _dcfdb.emptyCompositeColumn(x)
		if !drop {
			keptCols = append(keptCols, x)
		}
	}
	if len(keptRows) == _dcfdb._gcbge && len(keptCols) == _dcfdb._ddfc {
		return _dcfdb
	}

	reduced := textTable{
		_edgac: _dcfdb._edgac,
		_ddfc:  len(keptCols),
		_gcbge: len(keptRows),
		_dadcc: make(map[uint64]compositeCell, len(keptCols)*len(keptRows)),
	}
	if _cgafg {
		_b.Log.Info("reduceTiling: %dx%d -> %dx%d", _dcfdb._ddfc, _dcfdb._gcbge, len(keptCols), len(keptRows))
		_b.Log.Info("reducedCols: %+v", keptCols)
		_b.Log.Info("reducedRows: %+v", keptRows)
	}
	// Copy the surviving non-empty cells to their new (compacted) coordinates.
	for newY, oldY := range keptRows {
		for newX, oldX := range keptCols {
			paras, overflow := _dcfdb.getComposite(oldX, oldY)
			if len(paras) == 0 {
				continue
			}
			if _cgafg {
				_ce.Printf("  %2d, %2d (%2d, %2d) %q\n", newX, newY, oldX, oldY, _dfcggd(paras.merge().text(), 50))
			}
			reduced.putComposite(newX, newY, paras, overflow)
		}
	}
	return &reduced
}

// _fbabg builds a textWord from the marks `_ebdb`. The word's rectangle is the
// combination (via `_egbga`) of all mark rectangles, `_ebgb` is the largest
// `_gceb` among the marks and `_baebb` is `_fggc.Ury` minus the word's Lly.
// `_ebdb` must not be empty.
func _fbabg(_ebdb []*textMark, _fggc _bg.PdfRectangle) *textWord {
	bbox := _ebdb[0].PdfRectangle
	largest := _ebdb[0]._gceb
	for _, mark := range _ebdb[1:] {
		bbox = _egbga(bbox, mark.PdfRectangle)
		if mark._gceb > largest {
			largest = mark._gceb
		}
	}
	return &textWord{PdfRectangle: bbox, _dggf: _ebdb, _baebb: _fggc.Ury - bbox.Lly, _ebgb: largest}
}

// growTable repeatedly extends the table by one row (from getDown) and/or one
// column (from getRight) until neither call yields a candidate.
func (_acegag *textTable) growTable() {
	addRow := func(row paraList) {
		_acegag._gcbge++
		for x := 0; x < _acegag._ddfc; x++ {
			_acegag.put(x, _acegag._gcbge-1, row[x])
		}
	}
	addCol := func(col paraList) {
		_acegag._ddfc++
		for y := 0; y < _acegag._gcbge; y++ {
			_acegag.put(_acegag._ddfc-1, y, col[y])
		}
	}

	if _efed {
		_acegag.log("growTable")
	}
	for i := 0; ; i++ {
		grown := false
		down := _acegag.getDown()
		right := _acegag.getRight()
		if _efed {
			_ce.Printf("%4d: %s\n", i, _acegag)
			_ce.Printf("        down=%s\n", down)
			_ce.Printf("       right=%s\n", right)
		}
		if down != nil && right != nil {
			// Grow in both directions only when the last element of the new
			// row and the new column is the same, untaken, paragraph.
			corner := down[len(down)-1]
			if !corner.taken() && corner == right[len(right)-1] {
				addRow(down)
				// The column candidate is recomputed after the row was added.
				if right = _acegag.getRight(); right != nil {
					addCol(right)
					_acegag.put(_acegag._ddfc-1, _acegag._gcbge-1, corner)
				}
				grown = true
			}
		}
		if !grown && down != nil {
			addRow(down)
			grown = true
		}
		if !grown && right != nil {
			addCol(right)
			grown = true
		}
		if !grown {
			break
		}
	}
}

// _dgga returns the lines of `_cdbge` whose text matches `_bgaf`.
// The result is never nil.
func _dgga(_cdbge []*textLine) []*textLine {
	matched := []*textLine{}
	for _, line := range _cdbge {
		if _bgaf.Find([]byte(line.text())) != nil {
			matched = append(matched, line)
		}
	}
	return matched
}

// toTextMarks returns the TextMarks of all words in the line. A " " mark is
// inserted (via `_gdbc`) before every word whose `_gagaf` flag is set.
// `_agbfc` is passed through to the per-word conversion.
func (_aaed *textLine) toTextMarks(_agbfc *int) []TextMark {
	var marks []TextMark
	for _, word := range _aaed._aafd {
		if word._gagaf {
			marks = _gdbc(marks, _agbfc, " ")
		}
		marks = append(marks, word.toTextMarks(_agbfc)...)
	}
	return marks
}

// llyRange returns the sub-slice of `_bbgg` whose paragraphs have
// `_eaeca` <= Lly <= `_beae`.
//
// `_bbgg` holds indexes into the receiver; the binary searches require it to
// order the paragraphs by increasing Lly (presumably built that way by the
// caller — confirm). Returns nil when the receiver is empty, when `_bbgg` has
// fewer entries than the receiver, or when the range misses every paragraph.
func (_ageg paraList) llyRange(_bbgg []int, _eaeca, _beae float64) []int {
	n := len(_ageg)
	// Guard the unconditional _bbgg[0] / _bbgg[n-1] accesses below.
	if n == 0 || len(_bbgg) < n {
		return nil
	}
	if _beae < _ageg[_bbgg[0]].Lly || _eaeca > _ageg[_bbgg[n-1]].Lly {
		return nil
	}
	first := _df.Search(n, func(i int) bool { return _ageg[_bbgg[i]].Lly >= _eaeca })
	end := _df.Search(n, func(i int) bool { return _ageg[_bbgg[i]].Lly > _beae })
	return _bbgg[first:end]
}

// minDepth returns `_dgec - (Ury - _ecdf)` for the bag.
func (_aacg *wordBag) minDepth() float64 {
	return _aacg._dgec - (_aacg.Ury - _aacg._ecdf)
}

// asRuling converts the line to a ruling of kind `_bafga`.
//
// For kind `_acgee` the primary coordinate is xMean and the secondary range is
// the Y extent of the two end points; for kind `_cefaa` it is yMean and the X
// extent. Any other kind is logged as an error and (nil, false) is returned.
func (_gadbd lineRuling) asRuling() (*ruling, bool) {
	r := ruling{_eabdg: _gadbd._cbfb, Color: _gadbd.Color, _gggfe: _bafga}
	switch _gadbd._cbfb {
	case _acgee:
		r._befee = _gadbd.xMean()
		r._agbc = _ef.Min(_gadbd._egaf.Y, _gadbd._eaebf.Y)
		r._gffgd = _ef.Max(_gadbd._egaf.Y, _gadbd._eaebf.Y)
	case _cefaa:
		r._befee = _gadbd.yMean()
		r._agbc = _ef.Min(_gadbd._egaf.X, _gadbd._eaebf.X)
		r._gffgd = _ef.Max(_gadbd._egaf.X, _gadbd._eaebf.X)
	default:
		_b.Log.Error("bad primary kind=%d", _gadbd._cbfb)
		return nil, false
	}
	return &r, true
}

// firstReadingIndex returns the depth index, among `_ggde` and the indexes
// returned by depthBand for the band starting at `(_ggde+1)*_fefe`, whose
// first word compares lowest under `_aea`.
func (_fff *wordBag) firstReadingIndex(_ggde int) int {
	size := _fff.firstWord(_ggde)._ebgb
	bandMin := float64(_ggde+1) * _fefe
	bandMax := bandMin + _dggb*size
	best := _ggde
	for _, idx := range _fff.depthBand(bandMin, bandMax) {
		if _aea(_fff.firstWord(idx), _fff.firstWord(best)) < 0 {
			best = idx
		}
	}
	return best
}
2023-01-08 22:34:27 +00:00
2023-07-28 12:14:31 +00:00
// ExtractFonts returns all font information from the page extractor, including
// font name, font type, the raw data of the embedded font file (if embedded), font descriptor and more.
//
// The argument `previousPageFonts` is used when trying to build a complete font catalog for multiple pages or the entire document.
// The entries from `previousPageFonts` are added to the returned result unless already included in the page, i.e. no duplicate entries.
//
// NOTE: If previousPageFonts is nil, all fonts from the page will be returned. Use it when building up a full list of fonts for a document or page range.
func (_gbe *Extractor) ExtractFonts(previousPageFonts *PageFonts) (*PageFonts, error) {
	// Collect the fonts referenced by this page's resources first.
	var pageFonts PageFonts
	if err := pageFonts.extractPageResourcesToFont(_gbe._gga); err != nil {
		return nil, err
	}

	// Then carry over earlier fonts whose name is not already present.
	if previousPageFonts != nil {
		for _, prev := range previousPageFonts.Fonts {
			if _aee(pageFonts.Fonts, prev.FontName) {
				continue
			}
			pageFonts.Fonts = append(pageFonts.Fonts, prev)
		}
	}
	return &pageFonts, nil
}
2023-01-08 22:34:27 +00:00
2023-07-28 12:14:31 +00:00
// TableInfo gets table information of the textmark `tm`.
func (_bgcc *TextMark) TableInfo() (*TextTable, [][]int) {
	// Only marks flagged via `_fgfd` carry a table reference.
	if _bgcc._fgfd {
		table := _bgcc._ggea
		cellInfo := table.getCellInfo(*_bgcc)
		return table, cellInfo
	}
	return nil, nil
}
2023-01-08 22:34:27 +00:00
2023-07-28 12:14:31 +00:00
// PageText represents the layout of text on a device page.
type PageText struct {
	// _ccf holds the page's text marks; getParagraphs groups them by `_acec`.
	_ccf []*textMark
	// _bdf, _fccf and _ecege are presumably the extracted text, its marks and
	// the detected tables — TODO confirm against the code that fills them.
	_bdf   string
	_fccf  []TextMark
	_ecege []TextTable
	// _ebbd is the rectangle handed to `_fgad` together with the marks.
	_ebbd _bg.PdfRectangle
	// _bgac and _ffb are path sections that getParagraphs turns into rulings
	// with `_egfd` and `_ggegd` respectively.
	_bgac []pathSection
	_ffb  []pathSection
	// _cbc is the object List passes to parseStructTreeRoot.
	_cbc  *_ea.PdfObject
	_fabg _ea.PdfObject
	// _gagg is returned by GetContentStreamOps.
	_gagg *_fb.ContentStreamOperations
	// _fca holds the options consulted by getParagraphs and List.
	_fca PageTextOptions
}
2023-05-29 17:26:33 +00:00
2023-06-30 13:19:48 +00:00
// TableCell is a cell in a TextTable.
2023-07-28 12:14:31 +00:00
type TableCell struct { _bg . PdfRectangle ;
2023-05-29 17:26:33 +00:00
2023-06-30 13:19:48 +00:00
// Text is the extracted text.
Text string ;
2023-01-08 22:34:27 +00:00
2023-06-30 13:19:48 +00:00
// Marks returns the TextMarks corresponding to the text in Text.
2023-07-28 12:14:31 +00:00
Marks TextMarkArray ; } ; func _acfc ( _becd func ( * wordBag , * textWord , float64 ) bool , _aagb float64 ) func ( * wordBag , * textWord ) bool { return func ( _cadd * wordBag , _ddccb * textWord ) bool { return _becd ( _cadd , _ddccb , _aagb ) } ; } ; type lists [ ] * list ; func ( _gfdc * PageText ) getParagraphs ( ) paraList { var _ccc rulingList ;
if _bbdab { _adg := _egfd ( _gfdc . _bgac ) ; _ccc = append ( _ccc , _adg ... ) ; } ; if _bcac { _bgad := _ggegd ( _gfdc . _ffb ) ; _ccc = append ( _ccc , _bgad ... ) ; } ; _ccc , _ged := _ccc . toTilings ( ) ; var _ddf paraList ; _fedf := len ( _gfdc . _ccf ) ; for _adbg := 0 ; _adbg < 360 && _fedf > 0 ;
_adbg += 90 { _bce := make ( [ ] * textMark , 0 , len ( _gfdc . _ccf ) - _fedf ) ; for _ , _aagd := range _gfdc . _ccf { if _aagd . _acec == _adbg { _bce = append ( _bce , _aagd ) ; } ; } ; if len ( _bce ) > 0 { _fgde := _fgad ( _bce , _gfdc . _ebbd , _ccc , _ged , _gfdc . _fca . _bbg ) ; _ddf = append ( _ddf , _fgde ... ) ;
_fedf -= len ( _bce ) ; } ; } ; return _ddf ; } ;
2022-12-15 21:59:56 +00:00
2023-07-28 12:14:31 +00:00
// GetContentStreamOps returns the contentStreamOps field of `pt`.
func (_acc *PageText) GetContentStreamOps() *_fb.ContentStreamOperations {
	return _acc._gagg
}

// parseStructElement fills the receiver from the structure element dictionary
// `_gcgb`.
//
// `_bfeg` is set to the string form of the "S" entry (empty when absent) and
// `_fgagg` to the "Pg" entry. The "K" entry decides the rest:
//   - an integer is stored in `_aeae`;
//   - a reference is handled like a one-element array;
//   - an array stores its last integer element in `_aeae` (-1 when it has
//     none) and parses every non-integer element recursively into `_efce`.
//     An array consisting of exactly one integer leaves `_efce` untouched.
//
// Any other "K" value leaves `_aeae` and `_efce` untouched. When `_gcgb` is
// not a dictionary a debug message is logged and nothing is changed.
func (_dfac *structElement) parseStructElement(_gcgb _ea.PdfObject) {
	dict, isDict := _ea.GetDict(_gcgb)
	if !isDict {
		_b.Log.Debug("parseStructElement: dictionary object not found.")
		return
	}

	structType := ""
	if s := dict.Get("S"); s != nil {
		structType = s.String()
	}
	_dfac._bfeg = structType
	_dfac._fgagg = dict.Get("Pg")

	var kids []_ea.PdfObject
	switch k := dict.Get("K").(type) {
	case *_ea.PdfObjectInteger:
		_dfac._aeae = int64(*k)
		return
	case *_ea.PdfObjectReference:
		kids = _ea.MakeArray(k).Elements()
	case *_ea.PdfObjectArray:
		kids = k.Elements()
	default:
		return
	}

	_dfac._aeae = -1
	if len(kids) == 1 {
		if id, isInt := kids[0].(*_ea.PdfObjectInteger); isInt {
			_dfac._aeae = int64(*id)
			return
		}
	}
	// Kept as a non-nil empty slice when there are no child elements.
	children := []structElement{}
	for _, kid := range kids {
		if id, isInt := kid.(*_ea.PdfObjectInteger); isInt {
			_dfac._aeae = int64(*id)
			continue
		}
		child := &structElement{}
		child.parseStructElement(kid)
		children = append(children, *child)
	}
	_dfac._efce = children
}

// extractPageResourcesToFont appends one Font entry to `_ed.Fonts` for every
// font in the resources' Font dictionary whose name is not already present.
//
// The embedded font program, if any, is decoded from FontFile, FontFile2 or
// FontFile3 (first non-nil wins) and given a ".pfb", ".ttf" or ".cff" file
// name respectively. Returns an error when the resources or their Font
// dictionary are missing, when a listed font cannot be loaded, or when a font
// stream fails to decode. Fonts that expose no descriptor or no FontName are
// logged and skipped instead of causing a nil dereference.
func (_ed *PageFonts) extractPageResourcesToFont(_af *_bg.PdfPageResources) error {
	// A page without resources is treated like one without a Font dictionary.
	if _af == nil {
		return _d.New(_deg)
	}
	fontDict, isDict := _ea.GetDict(_af.Font)
	if !isDict {
		return _d.New(_deg)
	}

	for _, key := range fontDict.Keys() {
		var (
			hasToUnicode = true
			fontData     []byte
			fileName     string
		)
		fontObj, found := _af.GetFontByName(key)
		if !found {
			return _d.New(_egf)
		}
		font, err := _bg.NewPdfFontFromPdfObject(fontObj)
		if err != nil {
			return err
		}

		desc := font.FontDescriptor()
		if desc == nil || desc.FontName == nil {
			_b.Log.Debug("extractPageResourcesToFont: font %q has no descriptor or font name. Skipping.", key)
			continue
		}
		name := desc.FontName.String()
		subtype := font.Subtype()
		if _aee(_ed.Fonts, name) {
			continue
		}
		if len(font.ToUnicode()) == 0 {
			hasToUnicode = false
		}

		if desc.FontFile != nil {
			if stream, isStream := _ea.GetStream(desc.FontFile); isStream {
				fontData, err = _ea.DecodeStream(stream)
				if err != nil {
					return err
				}
				fileName = name + ".pfb"
			}
		} else if desc.FontFile2 != nil {
			if stream, isStream := _ea.GetStream(desc.FontFile2); isStream {
				fontData, err = _ea.DecodeStream(stream)
				if err != nil {
					return err
				}
				fileName = name + ".ttf"
			}
		} else if desc.FontFile3 != nil {
			if stream, isStream := _ea.GetStream(desc.FontFile3); isStream {
				fontData, err = _ea.DecodeStream(stream)
				if err != nil {
					return err
				}
				fileName = name + ".cff"
			}
		}
		if len(fileName) < 1 {
			_b.Log.Debug(_cf)
		}

		_ed.Fonts = append(_ed.Fonts, Font{
			FontName:       name,
			PdfFont:        font,
			IsCID:          font.IsCID(),
			IsSimple:       font.IsSimple(),
			ToUnicode:      hasToUnicode,
			FontType:       subtype,
			FontData:       fontData,
			FontFileName:   fileName,
			FontDescriptor: desc,
		})
	}
	return nil
}

// _ffff starts a new textLine from the first word of `_aaef` at depth index
// `_dcbb` and moves that word from the bag into the line via pullWord.
func _ffff(_aaef *wordBag, _dcbb int) *textLine {
	first := _aaef.firstWord(_dcbb)
	line := textLine{PdfRectangle: first.PdfRectangle, _bfbb: first._ebgb, _cbbd: first._baebb}
	line.pullWord(_aaef, first, _dcbb)
	return &line
}

// primMinMax returns the smallest and largest `_befee` in the list.
// The list must not be empty.
func (_aecgb rulingList) primMinMax() (float64, float64) {
	lo, hi := _aecgb[0]._befee, _aecgb[0]._befee
	for _, r := range _aecgb[1:] {
		if r._befee < lo {
			lo = r._befee
		} else if r._befee > hi {
			hi = r._befee
		}
	}
	return lo, hi
}

// snapToGroupsDirection groups rulings whose `_befee` values lie within
// `_bddeb` of their group's first ruling, sets every ruling's `_befee` to its
// group's mergePrimary value, then splits each group with splitSec and merges
// each part into a single ruling. The result is sorted with sortStrict.
//
// The receiver is sorted in place and its rulings are modified. It must not
// be empty.
// NOTE(review): groups are visited in map iteration order, so the
// "Duplicate" check against the previously appended ruling depends on that
// order — confirm this is acceptable.
func (_cadgg rulingList) snapToGroupsDirection() rulingList {
	_cadgg.sortStrict()

	groups := make(map[*ruling]rulingList, len(_cadgg))
	head := _cadgg[0]
	startGroup := func(r *ruling) {
		head = r
		groups[head] = rulingList{r}
	}
	startGroup(_cadgg[0])
	for _, r := range _cadgg[1:] {
		if r._befee < head._befee-_efea {
			_b.Log.Error("snapToGroupsDirection: Wrong primary order.\n\tv0=%s\n\t v=%s", head, r)
		}
		if r._befee > head._befee+_bddeb {
			startGroup(r)
		} else {
			groups[head] = append(groups[head], r)
		}
	}

	// Snap every ruling to the merged primary coordinate of its group.
	mergedPrimary := make(map[*ruling]float64, len(groups))
	headOf := make(map[*ruling]*ruling, len(_cadgg))
	for groupHead, members := range groups {
		mergedPrimary[groupHead] = members.mergePrimary()
		for _, member := range members {
			headOf[member] = groupHead
		}
	}
	for _, r := range _cadgg {
		r._befee = mergedPrimary[headOf[r]]
	}

	snapped := make(rulingList, 0, len(_cadgg))
	for _, members := range groups {
		for i, part := range members.splitSec() {
			merged := part.merge()
			if len(snapped) > 0 {
				last := snapped[len(snapped)-1]
				if last.alignsPrimary(merged) && last.alignsSec(merged) {
					_b.Log.Error("snapToGroupsDirection: Duplicate i=%d\n\tw=%s\n\tv=%s", i, last, merged)
					continue
				}
			}
			snapped = append(snapped, merged)
		}
	}
	snapped.sortStrict()
	return snapped
}
2022-10-27 19:04:58 +00:00
2023-07-28 12:14:31 +00:00
// Elements returns the TextMarks in `ma`.
func (_cdbc *TextMarkArray) Elements() []TextMark {
	// The backing slice is returned as-is, not copied.
	marks := _cdbc._bca
	return marks
}
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// NewWithOptions returns an Extractor instance for extracting content from the input PDF page, configured with `options`.
func NewWithOptions(page *_bg.PdfPage, options *Options) (*Extractor, error) {
	const _be = "extractor.NewWithOptions"

	contents, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	// A missing structure tree is not an error; it is only reported.
	structTreeRoot, tagged := page.GetStructTreeRoot()
	if !tagged {
		_b.Log.Info("The pdf document is not tagged. StructTreeRoot doesn't exist.")
	}
	container := page.GetContainingPdfObject()

	mediaBox, err := page.GetMediaBox()
	if err != nil {
		return nil, _ce.Errorf("extractor requires mediaBox. %v", err)
	}

	extractor := &Extractor{
		_bc:   contents,
		_gga:  page.Resources,
		_de:   *mediaBox,
		_fc:   page.CropBox,
		_eg:   map[string]fontEntry{},
		_gbfd: map[string]textResult{},
		_ad:   options,
		_fbb:  structTreeRoot,
		_ca:   container,
	}

	// Normalise the copied media box so that Llx <= Urx and Lly <= Ury.
	box := &extractor._de
	if box.Llx > box.Urx {
		_b.Log.Info("MediaBox has X coordinates reversed. %.2f Fixing.", extractor._de)
		box.Llx, box.Urx = box.Urx, box.Llx
	}
	if box.Lly > box.Ury {
		_b.Log.Info("MediaBox has Y coordinates reversed. %.2f Fixing.", extractor._de)
		box.Lly, box.Ury = box.Ury, box.Lly
	}

	_gb.TrackUse(_be)
	return extractor, nil
}

// textObject is the state used while rendering text (see renderText).
type textObject struct {
	_dcdg *Extractor
	_edef *_bg.PdfPageResources
	// _agbf is the graphics state; its CTM is used by renderText.
	_agbf _fb.GraphicsState
	// _gacd is the current text state.
	_gacd *textState
	_bffa *stateStack
	// _fda is the matrix renderText multiplies with the CTM and advances
	// after every glyph ("tm" in its debug output).
	_fda  _gab.Matrix
	_cfec _gab.Matrix
	// _fecd collects the text marks produced by renderText.
	_fecd []*textMark
	// _cbde makes renderText a no-op when set ("Invalid font").
	_cbde bool
}
2023-06-30 13:19:48 +00:00
2023-07-28 12:14:31 +00:00
// String returns a string describing the current state of the textState stack.
func (_acdd *stateStack) String() string {
	// One header line followed by one line per stack entry.
	lines := make([]string, 0, len(*_acdd)+1)
	lines = append(lines, _ce.Sprintf("---- font stack: %d", len(*_acdd)))
	for idx, state := range *_acdd {
		var desc string
		if state == nil {
			desc = "<nil>"
		} else {
			desc = state.String()
		}
		lines = append(lines, _ce.Sprintf("\t%2d: %s", idx, desc))
	}
	return _c.Join(lines, "\n")
}
2023-05-29 17:26:33 +00:00
// ExtractPageImages returns the image contents of the page extractor, including data
// and position, size information for each image.
// A set of options to control page image extraction can be passed in. The options
// parameter can be nil for the default options. By default, inline stencil masks
// are not extracted.
2023-07-28 12:14:31 +00:00
func (_ggf *Extractor) ExtractPageImages(options *ImageExtractOptions) (*PageImages, error) {
	ctx := &imageExtractContext{_fae: options}
	if err := ctx.extractContentStreamImages(_ggf._bc, _ggf._gga); err != nil {
		return nil, err
	}
	return &PageImages{Images: ctx._dae}, nil
}

// renderText converts the string operand `_cfb` into text marks appended to
// `_ddeg._fecd`, advancing the text matrix `_ddeg._fda` after every glyph.
//
// `_dbg` and `_afd` are passed through to newTextMark unchanged. Nothing is
// done when the `_cbde` flag is set. Glyphs that decode to a single NUL rune
// and marks that newTextMark reports as off-page are skipped. An error is
// returned when the font has no metrics for a character code.
func (_ddeg *textObject) renderText(_dbg _ea.PdfObject, _cfb []byte, _afd int) error {
	if _ddeg._cbde {
		_b.Log.Debug("renderText: Invalid font. Not processing.")
		return nil
	}

	font := _ddeg.getCurrentFont()
	charcodes := font.BytesToCharcodes(_cfb)
	texts, numChars, numMisses := font.CharcodesToStrings(charcodes)
	if numMisses > 0 {
		_b.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
	}
	_ddeg._gacd._feab += numChars
	_ddeg._gacd._fefg += numMisses

	state := _ddeg._gacd
	tfs := state._fgca
	th := state._dgc / 100.0
	// Glyph metrics are scaled by `_aceg`, except for Type3 fonts.
	glyphScale := _aceg
	if font.Subtype() == "Type3" {
		glyphScale = 1
	}

	// Width of a space: rune metrics, then charcode 32, then the default font.
	spaceMetrics, haveSpace := font.GetRuneMetrics(' ')
	if !haveSpace {
		spaceMetrics, haveSpace = font.GetCharMetrics(32)
	}
	if !haveSpace {
		spaceMetrics, _ = _bg.DefaultFont().GetRuneMetrics(' ')
	}
	spaceWidth := spaceMetrics.Wx * glyphScale
	_b.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.2f", spaceWidth, texts, font, tfs)

	stateMatrix := _gab.NewMatrix(tfs*th, 0, 0, tfs, 0, state._fdea)
	if _beff {
		_b.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts)
	}
	// The last argument is a count, so it is formatted with %d (was %q, which
	// printed it as a quoted character).
	_b.Log.Trace("renderText: %d codes=%+v runes=%d", len(charcodes), charcodes, len(texts))

	fillColor := _ddeg.getFillColor()
	strokeColor := _ddeg.getStrokeColor()

	for i, text := range texts {
		runes := []rune(text)
		if len(runes) == 1 && runes[0] == '\x00' {
			continue
		}
		code := charcodes[i]
		trm := _ddeg._agbf.CTM.Mult(_ddeg._fda).Mult(stateMatrix)

		// `_cagc` ("tw" in the log below) only applies to a single space.
		tw := 0.0
		if len(runes) == 1 && runes[0] == 32 {
			tw = state._cagc
		}

		metrics, haveMetrics := font.GetCharMetrics(code)
		if !haveMetrics {
			_b.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, runes, runes, font)
			return _ce.Errorf("no char metrics: font=%s code=%d", font.String(), code)
		}

		glyph := _gab.Point{X: metrics.Wx * glyphScale, Y: metrics.Wy * glyphScale}
		// t0 excludes `_fdf` ("tc"); t includes it. t0 positions the end of
		// the mark, t advances the text matrix.
		t0 := _gab.Point{X: (glyph.X*tfs + tw) * th}
		t := _gab.Point{X: (glyph.X*tfs + state._fdf + tw) * th}
		if _beff {
			_b.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state._fdf, state._cagc, th)
			_b.Log.Info("dx,dy=%.3f t0=%.3f t=%.3f", glyph, t0, t)
		}

		td0 := _ebba(t0)
		td := _ebba(t)
		end := _ddeg._agbf.CTM.Mult(_ddeg._fda).Mult(td0)
		if _adbf {
			_b.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
				"\t td=%s xlat=%s\n"+
				"\ttd0=%s\n\t  \u2192 %s xlat=%s",
				_ddeg._agbf.CTM, _ddeg._fda, td, _bbccc(_ddeg._agbf.CTM.Mult(_ddeg._fda).Mult(td)), td0, end, _bbccc(end))
		}

		mark, onPage := _ddeg.newTextMark(_aa.ExpandLigatures(runes), trm, _bbccc(end), _ef.Abs(spaceWidth*trm.ScalingFactorX()), font, _ddeg._gacd._fdf, fillColor, strokeColor, _dbg, texts, i, _afd)
		if !onPage {
			_b.Log.Debug("Text mark outside page. Skipping")
			continue
		}

		// NOTE(review): `font` has already been used above, so the nil branch
		// below looks unreachable in practice; it is kept unchanged.
		if font == nil {
			_b.Log.Debug("ERROR: No font.")
		} else if font.Encoder() == nil {
			_b.Log.Debug("ERROR: No encoding. font=%s", font)
		} else {
			if r, found := font.Encoder().CharcodeToRune(code); found {
				mark._ffbg = string(r)
			}
		}
		_b.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
		_ddeg._fecd = append(_ddeg._fecd, &mark)
		_ddeg._fda.Concat(td)
	}
	return nil
}

// _cecd returns the keys of `_aeaa` in increasing order. The result is never
// nil.
func _cecd(_aeaa map[float64][]*textLine) []float64 {
	keys := make([]float64, 0, len(_aeaa))
	for key := range _aeaa {
		keys = append(keys, key)
	}
	_df.Float64s(keys)
	return keys
}

// computeBbox returns the combination (via `_egbga`) of the rectangles of all
// non-nil cells, or the zero rectangle when the table has none.
func (_cgfg *textTable) computeBbox() _bg.PdfRectangle {
	var bbox _bg.PdfRectangle
	seeded := false
	for y := 0; y < _cgfg._gcbge; y++ {
		for x := 0; x < _cgfg._ddfc; x++ {
			cell := _cgfg.get(x, y)
			if cell == nil {
				continue
			}
			if seeded {
				bbox = _egbga(bbox, cell.PdfRectangle)
			} else {
				bbox = cell.PdfRectangle
				seeded = true
			}
		}
	}
	return bbox
}

// setHorizScaling stores `_egfb` in the text state's `_dgc`. It is a no-op on
// a nil receiver.
func (_dgdc *textObject) setHorizScaling(_egfb float64) {
	if _dgdc == nil {
		return
	}
	_dgdc._gacd._dgc = _egfb
}

type markKind int

// _cdb returns the single numeric parameter of `_ffec` as a float64. It fails
// when the operation does not have exactly one parameter or the parameter is
// not a number.
func _cdb(_ffec *_fb.ContentStreamOperation) (float64, error) {
	if len(_ffec.Params) != 1 {
		err := _d.New("incorrect parameter count")
		_b.Log.Debug("ERROR: %#q should have %d input params, got %d %+v", _ffec.Operand, 1, len(_ffec.Params), _ffec.Params)
		return 0.0, err
	}
	return _ea.GetNumberAsFloat(_ffec.Params[0])
}

// Compile-time switches and tuning values. Their meaning is not visible from
// the names; within this part of the file `_bbdab` and `_bcac` gate the two
// ruling sources in getParagraphs and `_gbcd` gates the write-back at the end
// of computeEBBoxes.
const (
	_fgcg  = true
	_aaade = true
	_ccdf  = true
	_gbcd  = false
	_cafa  = false
	_fddc  = 6
	_fddb  = 3.0
	_ceec  = 200
	_ceed  = true
	_adfg  = true
	_bbdab = true
	_bcac  = true
	_caec  = false
)

// computeEBBoxes computes `_gfbgd` for every paragraph.
//
// Each `_gfbgd` starts as the paragraph's own rectangle. Its left/right edges
// are then widened to the edges of other paragraphs that do not extend above
// its bottom (Ury <= Lly), limited by the nearest yNeighbours on either side.
// Paragraphs are processed in list order, so later ones see the already
// widened boxes of earlier ones. When `_gbcd` is set the result is copied
// back into PdfRectangle.
func (_ggfb paraList) computeEBBoxes() {
	if _eebec {
		_b.Log.Info("computeEBBoxes:")
	}
	for _, para := range _ggfb {
		para._gfbgd = para.PdfRectangle
	}
	neighbours := _ggfb.yNeighbours(0)

	for i, para := range _ggfb {
		box := para._gfbgd
		// Nearest neighbour edges to the left and right limit the widening.
		leftLimit, rightLimit := -1.0e9, +1.0e9
		for _, j := range neighbours[para] {
			nb := _ggfb[j]._gfbgd
			if nb.Urx < box.Llx {
				leftLimit = _ef.Max(leftLimit, nb.Urx)
			} else if box.Urx < nb.Llx {
				rightLimit = _ef.Min(rightLimit, nb.Llx)
			}
		}
		for j, other := range _ggfb {
			ob := other._gfbgd
			if i == j || ob.Ury > box.Lly {
				continue
			}
			if leftLimit <= ob.Llx && ob.Llx < box.Llx {
				box.Llx = ob.Llx
			} else if ob.Urx <= rightLimit && box.Urx < ob.Urx {
				box.Urx = ob.Urx
			}
		}
		if _eebec {
			_ce.Printf("%4d: %6.2f\u2192%6.2f %q\n", i, para._gfbgd, box, _dfcggd(para.text(), 50))
		}
		para._gfbgd = box
	}

	if _gbcd {
		for _, para := range _ggfb {
			para.PdfRectangle = para._gfbgd
		}
	}
}

// textTable is a grid of cells with `_ddfc` columns and `_gcbge` rows
// (see the loop bounds in computeBbox).
type textTable struct {
	_bg.PdfRectangle
	_ddfc, _gcbge int
	_edgac        bool
	_efeac        map[uint64]*textPara
	_dadcc        map[uint64]compositeCell
}

// text returns the `_ggaef` strings of allWords joined by single spaces.
func (_bcaf *wordBag) text() string {
	words := _bcaf.allWords()
	parts := make([]string, len(words))
	for i, word := range words {
		parts[i] = word._ggaef
	}
	return _c.Join(parts, " ")
}

// xNeighbours builds two events per paragraph — one at Llx and one at Urx,
// each moved outwards by `_beac` times the paragraph's fontsize() when
// `_beac` is non-zero — and returns the result of eventNeighbours on them.
func (_gccgdc paraList) xNeighbours(_beac float64) map[*textPara][]int {
	events := make([]event, 2*len(_gccgdc))
	if _beac == 0 {
		for i, para := range _gccgdc {
			events[2*i] = event{para.Llx, true, i}
			events[2*i+1] = event{para.Urx, false, i}
		}
	} else {
		for i, para := range _gccgdc {
			events[2*i] = event{para.Llx - _beac*para.fontsize(), true, i}
			events[2*i+1] = event{para.Urx + _beac*para.fontsize(), false, i}
		}
	}
	return _gccgdc.eventNeighbours(events)
}

// _bccb empties `_adbff` into a list of word bags.
//
// For every depth index it repeatedly seeds a new bag (via `_fcfg`) with the
// first word in reading order and grows it with scanBand until a full pass
// adds nothing: first a "vertical" and a "horizontal" scan, and only when both
// add nothing, a counting scan followed — if the count passes the threshold
// test — by an "other" scan. `_dadb`, `_gcfg` and `_gfbaf` are passed to
// `_fcfg` unchanged.
func _bccb(_adbff *wordBag, _dadb float64, _gcfg, _gfbaf rulingList) []*wordBag {
	var bags []*wordBag
	for _, depthIdx := range _adbff.depthIndexes() {
		grown := false
		for !_adbff.empty(depthIdx) {
			firstIdx := _adbff.firstReadingIndex(depthIdx)
			firstWord := _adbff.firstWord(firstIdx)
			bag := _fcfg(firstWord, _dadb, _gcfg, _gfbaf)
			_adbff.removeWord(firstWord, firstIdx)
			if _fcea {
				_b.Log.Info("firstWord ^^^^ %s", firstWord.String())
			}

			// Runs at least once and repeats for as long as a pass grew the bag.
			for again := true; again; again = grown {
				grown = false
				otherGap := _begg * bag._ecdf
				maxIntraReadingGap := _bcbge * bag._ecdf
				maxIntraDepthGap := _dega * bag._ecdf
				if _fcea {
					_b.Log.Info("paraWords depth %.2f - %.2f maxIntraDepthGap=%.2f maxIntraReadingGap=%.2f", bag.minDepth(), bag.maxDepth(), maxIntraDepthGap, maxIntraReadingGap)
				}
				if _adbff.scanBand("vertical", bag, _acfc(_gaf, 0), bag.minDepth()-maxIntraDepthGap, bag.maxDepth()+maxIntraDepthGap, _fbbf, false, false) > 0 {
					grown = true
				}
				if _adbff.scanBand("horizontal", bag, _acfc(_gaf, maxIntraReadingGap), bag.minDepth(), bag.maxDepth(), _abg, false, false) > 0 {
					grown = true
				}
				if grown {
					continue
				}
				count := _adbff.scanBand("", bag, _acfc(_bbdf, otherGap), bag.minDepth(), bag.maxDepth(), _dcdf, true, false)
				if count > 0 {
					depthSpan := (bag.maxDepth() - bag.minDepth()) / bag._ecdf
					if (count > 1 && float64(count) > 0.3*depthSpan) || count <= 10 {
						if _adbff.scanBand("other", bag, _acfc(_bbdf, otherGap), bag.minDepth(), bag.maxDepth(), _dcdf, false, true) > 0 {
							grown = true
						}
					}
				}
			}
			bags = append(bags, bag)
		}
	}
	return bags
}

// _geec reports whether both `_gefa` and `_cfda` hold for the two rectangles.
func _geec(_ffac, _bbgf _bg.PdfRectangle) bool {
	return _gefa(_ffac, _bbgf) && _cfda(_ffac, _bbgf)
}

// textState holds the text parameters read by renderText. The short names in
// the comments are the labels renderText uses in its debug output.
type textState struct {
	_fdf  float64 // "tc"
	_cagc float64 // "tw"
	_dgc  float64 // divided by 100 to give "th"; set by setHorizScaling
	_ced  float64
	_fgca float64 // "tfs"
	_gd   RenderMode
	_fdea float64 // last element of the state matrix built in renderText
	_dgdf *_bg.PdfFont
	_fagg _bg.PdfRectangle
	_feab int // running total of "numChars"
	_fefg int // running total of "numMisses"
}

// _cfda reports whether the [Lly, Ury] ranges of the two rectangles overlap
// (touching counts as overlapping).
func _cfda(_bgca, _bffb _bg.PdfRectangle) bool {
	return _bgca.Lly <= _bffb.Ury && _bffb.Lly <= _bgca.Ury
}

// _ddba returns the lines whose `_cbbd` is greater than `_cgda` and, unless
// `_cgdcc` is -1, also less than `_cgdcc`. Returns nil when no line matches.
func _ddba(_bgeb []*textLine, _cgdcc, _cgda float64) []*textLine {
	var selected []*textLine
	unbounded := _cgdcc == -1
	for _, line := range _bgeb {
		if !(line._cbbd > _cgda) {
			continue
		}
		if unbounded || line._cbbd < _cgdcc {
			selected = append(selected, line)
		}
	}
	return selected
}
2023-06-30 13:19:48 +00:00
// List returns all the list objects detected on the page.
// It detects all the bullet point lists on a given PDF page and builds a slice of bullet list objects.
// Each bullet list object has a tree structure: a bullet point list is extracted with the text
// content it contains, and all the sub-lists found under it become its children in the tree.
// The remaining content of the PDF is ignored; only text in the bullet point lists is extracted.
// The list extraction is done in one of two ways:
//  1. If the document is tagged, the lists are extracted using the tags provided in the document.
//  2. Otherwise the bullet lists are extracted from the raw text using regex matching.
//
// By default the document tags are used if available.
// However, this can be disabled using `DisableDocumentTags` in the `Options` object.
// Disabling the document tags option can sometimes give a better bullet list extraction
// if the document was tagged incorrectly.
//
//	options := &Options{
//		DisableDocumentTags: false, // this means use document tags if available
//	}
//	ex, err := NewWithOptions(page, options)
//	// handle error
//	pageText, _, _, err := ex.ExtractPageText()
//	// handle error
//	lists := pageText.List()
//	txt := lists.Text()
2023-07-28 12:14:31 +00:00
func (_bgdg PageText) List() lists {
	// Document tags are only consulted when they have not been disabled.
	useTags := !_bgdg._fca._cbff
	paras := _bgdg.getParagraphs()
	// The text-matching result is always computed; it is the fallback for
	// every path that does not end up using the structure tree.
	matched := paras.list()

	hasTree := _bgdg._cbc != nil && *_bgdg._cbc != nil
	if !hasTree || !useTags {
		return matched
	}

	groupedLines := _dfba(&paras)
	root := &structTreeRoot{}
	root.parseStructTreeRoot(*_bgdg._cbc)
	if root._cegf == nil {
		_b.Log.Debug("\u004c\u0069\u0073\u0074\u003a\u0020\u0073t\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e'\u0074\u0020\u0068\u0061\u0076e\u0020\u0061\u006e\u0079\u0020\u0063\u006f\u006e\u0074e\u006e\u0074\u002c\u0020\u0075\u0073\u0069\u006e\u0067\u0020\u0074\u0065\u0078\u0074\u0020\u006d\u0061\u0074\u0063\u0068\u0069\u006e\u0067\u0020\u006d\u0065\u0074\u0068\u006f\u0064\u0020\u0069\u006e\u0073\u0074\u0065\u0061\u0064\u002e")
		return matched
	}
	return root.buildList(groupedLines, _bgdg._fabg)
}

// primaries returns the distinct `_befee` values of the rulings in the list,
// sorted in increasing order.
func (_fgee rulingList) primaries() []float64 {
	distinct := make(map[float64]struct{}, len(_fgee))
	for i := range _fgee {
		distinct[_fgee[i]._befee] = struct{}{}
	}
	values := make([]float64, 0, len(distinct))
	for v := range distinct {
		values = append(values, v)
	}
	_df.Float64s(values)
	return values
}

// addPoint converts (`_gcab`, `_feb`) to a device point and either appends it
// to the subpath returned by establishSubpath or, when there is none, records
// it on the state (`_bfd`) and raises the `_afge` flag.
func (_gbbf *shapesState) addPoint(_gcab, _feb float64) {
	current := _gbbf.establishSubpath()
	pt := _gbbf.devicePoint(_gcab, _feb)
	if current != nil {
		current.add(pt)
		return
	}
	_gbbf._afge = true
	_gbbf._bfd = pt
}

// isExportable reports whether the table should be exported.
// Tables with the `_edgac` flag set are always exportable. Otherwise the
// table is exportable as soon as one first-column cell is missing, or holds
// more than one rune of text that does not match `_gefeb`.
func (_cddec *textTable) isExportable() bool {
	if _cddec._edgac {
		return true
	}
	for row := 0; row < _cddec._gcbge; row++ {
		cell := _cddec.get(0, row)
		if cell == nil {
			return true
		}
		txt := cell.text()
		if _a.RuneCountInString(txt) > 1 && !_gefeb.MatchString(txt) {
			return true
		}
	}
	return false
}
// Font represents the font properties on a PDF page.
type Font struct {
	// PdfFont is the model-level font object these properties belong to.
	PdfFont *_bg.PdfFont

	// FontName is the font name taken from the font properties.
	FontName string

	// FontType is the Subtype entry of the font dictionary inside the page resources.
	// Examples: Type0, Type1, MMType1, Type3, TrueType, CIDFont.
	FontType string

	// ToUnicode is true if the font provides a `ToUnicode` mapping.
	ToUnicode bool

	// IsCID is true if the underlying font is a composite font.
	// A composite font is represented by a font dictionary whose Subtype is `Type0`.
	IsCID bool

	// IsSimple is true if the font is a simple font.
	// A simple font is limited to 8-bit character codes.
	IsSimple bool

	// FontData is the raw data of the embedded font file.
	// Its format can be TrueType (TTF), PostScript Font (PFB) or Compact Font Format (CFF).
	// The value comes from `FontFile`, `FontFile2` or `FontFile3` inside the font descriptor;
	// at most one of those entries supplies FontData.
	FontData []byte

	// FontFileName is a file name representing the font. It has the format
	// (Font Name) + (Font Type Extension), for example: helvetica.ttf.
	FontFileName string

	// FontDescriptor holds the metrics and other attributes from the font
	// descriptor of the font in the PDF structure.
	FontDescriptor *_bg.PdfFontDescriptor
}
// ImageExtractOptions contains options for controlling image extraction from
// PDF pages.
type ImageExtractOptions struct {
	IncludeInlineStencilMasks bool
}

// _bbdf reports whether the left edge of word `_dcc` lies at or to the right
// of the right edge of bag `_ecee`, and less than `_eaae` beyond it.
func _bbdf(_ecee *wordBag, _dcc *textWord, _eaae float64) bool {
	bagRight, wordLeft := _ecee.Urx, _dcc.Llx
	return bagRight <= wordLeft && wordLeft < bagRight+_eaae
}
// String returns a description of `p`.
//
// A paragraph flagged `_cfga` is rendered as its bounding box followed by
// "[EMPTY]". Otherwise the output holds the bounding box, a "[WxH] " prefix
// when the paragraph carries a table, the number of lines and the quoted
// result of `_dfcggd(text, 50)`.
func (_gbbcc *textPara) String() string {
	if _gbbcc._cfga {
		return _ce.Sprintf("\u0025\u0036\u002e\u0032\u0066\u0020\u005b\u0045\u004d\u0050\u0054\u0059\u005d", _gbbcc.PdfRectangle)
	}
	var tablePrefix string
	if table := _gbbcc._bgba; table != nil {
		tablePrefix = _ce.Sprintf("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020", table._ddfc, table._gcbge)
	}
	return _ce.Sprintf("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071", _gbbcc.PdfRectangle, tablePrefix, len(_gbbcc._gfbb), _dfcggd(_gbbcc.text(), 50))
}

// _bdab removes the last rune from the text of `_gfcf` and updates the running
// text offset `*_bfga` accordingly. It returns the (possibly shortened) slice.
//
//   - If the last mark holds exactly one rune, the whole mark is dropped and
//     the offset is reset to the end of the new last mark, or to the dropped
//     mark's own Offset when no marks remain.
//   - If the last mark holds several runes, its Text is trimmed in place and
//     the offset is reduced by the number of bytes removed.
//   - An empty slice, or a last mark with empty text, is returned unchanged.
//
// The previous implementation assigned the trimmed text to a copy of the last
// mark, so the trim was lost while the offset was still adjusted; it also
// panicked on an empty slice, on a single one-rune mark and on an empty last
// mark.
func _bdab(_gfcf []TextMark, _bfga *int) []TextMark {
	if len(_gfcf) == 0 {
		return _gfcf
	}
	lastIdx := len(_gfcf) - 1
	last := _gfcf[lastIdx]
	switch runes := []rune(last.Text); len(runes) {
	case 0:
		// Nothing to remove.
	case 1:
		_gfcf = _gfcf[:lastIdx]
		if lastIdx > 0 {
			prev := _gfcf[lastIdx-1]
			*_bfga = prev.Offset + len(prev.Text)
		} else {
			*_bfga = last.Offset
		}
	default:
		trimmed := _cdbbf(last.Text)
		*_bfga += len(trimmed) - len(last.Text)
		// Write through the slice so the caller sees the trimmed text.
		_gfcf[lastIdx].Text = trimmed
	}
	return _gfcf
}

// contains reports whether rectangle `_cbcbc` fits inside the tile.
// Tiles with fewer than three borders never contain anything. For each border
// that is present, the rectangle may overshoot that side by at most `_cddf`.
func (_eebddb gridTile) contains(_cbcbc _bg.PdfRectangle) bool {
	switch {
	case _eebddb.numBorders() < 3:
		return false
	case _eebddb._afdge && _cbcbc.Llx < _eebddb.Llx-_cddf:
		return false
	case _eebddb._bfecb && _cbcbc.Urx > _eebddb.Urx+_cddf:
		return false
	case _eebddb._eaed && _cbcbc.Lly < _eebddb.Lly-_cddf:
		return false
	case _eebddb._fdbd && _cbcbc.Ury > _eebddb.Ury+_cddf:
		return false
	}
	return true
}

// toCellTextMarks returns the text marks of every line of the paragraph,
// threading the running text offset through `_cfggf`.
// When `_fgcg` is enabled and a non-final line ends in a hyphen, the last rune
// of that line's marks is removed via `_bdab` and no separator follows it.
// Every other non-final line is followed by the separator chosen by `_dgbg`
// from the `_cbbd` values of the line and its successor.
func (_gbbca *textPara) toCellTextMarks(_cfggf *int) []TextMark {
	var marks []TextMark
	lastLine := len(_gbbca._gfbb) - 1
	for i, line := range _gbbca._gfbb {
		lineMarks := line.toTextMarks(_cfggf)
		isLast := i == lastLine
		dehyphenate := _fgcg && line.endsInHyphen() && !isLast
		if dehyphenate {
			lineMarks = _bdab(lineMarks, _cfggf)
		}
		marks = append(marks, lineMarks...)
		if !dehyphenate && !isLast {
			separator := _dgbg(line._cbbd, _gbbca._gfbb[i+1]._cbbd)
			marks = _gdbc(marks, _cfggf, separator)
		}
	}
	return marks
}

// _dfcc reports whether `_dbbb` consists solely of Unicode white space.
// The empty string yields true.
func _dfcc(_dbbb string) bool {
	for _, r := range _dbbb {
		if _f.IsSpace(r) {
			continue
		}
		return false
	}
	return true
}

// drawRectangle adds a closed rectangular subpath with corner
// (`_fgcc`, `_geee`), width `_gbbb` and height `_aaag`.
// When `_bdaae` is set, the rectangle is also logged in device coordinates.
func (_ebe *shapesState) drawRectangle(_fgcc, _geee, _gbbb, _aaag float64) {
	if _bdaae {
		ll := _ebe.devicePoint(_fgcc, _geee)
		ur := _ebe.devicePoint(_fgcc+_gbbb, _geee+_aaag)
		deviceRect := _bg.PdfRectangle{Llx: ll.X, Lly: ll.Y, Urx: ur.X, Ury: ur.Y}
		_b.Log.Info("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066", deviceRect)
	}
	_ebe.newSubPath()
	_ebe.moveTo(_fgcc, _geee)
	_ebe.lineTo(_fgcc+_gbbb, _geee)
	_ebe.lineTo(_fgcc+_gbbb, _geee+_aaag)
	_ebe.lineTo(_fgcc, _geee+_aaag)
	_ebe.closePath()
}

// findTables links neighbouring paragraphs, sorts the list in place with
// `_fcd`, and returns the tables found by the grid-based search (when `_ceed`
// is set) followed by those found by the text-based search (when `_adfg` is
// set).
func (_bfbcg paraList) findTables(_ffcd []gridTiling) []*textTable {
	_bfbcg.addNeighbours()
	_df.Slice(_bfbcg, func(i, j int) bool {
		return _fcd(_bfbcg[i], _bfbcg[j]) < 0
	})

	var tables []*textTable
	if _ceed {
		tables = append(tables, _bfbcg.findGridTables(_ffcd)...)
	}
	if _adfg {
		tables = append(tables, _bfbcg.findTextTables()...)
	}
	return tables
}

// _ccab computes horizontal corridor positions that separate the text lines
// of the composite cells `_eagd` into groups.
//
// All lines of all cells are sorted by `_cbbd` (ties, as decided by `_acbc`,
// are broken by Llx). Walking the sorted lines, a corridor is placed halfway
// between a line's Ury and the lowest Lly seen so far whenever the line lies
// entirely below that Lly; each corridor also closes the current group.
// The corridors are discarded (nil is returned) unless every cell reports
// `hasLines` for every group. nil is also returned when the cells contain no
// lines at all (the previous implementation panicked in that case).
// When `_cgafg` is set, progress is printed for debugging.
func _ccab(_eagd []compositeCell) []float64 {
	var lines []*textLine
	numParas := 0
	for _, cell := range _eagd {
		numParas += len(cell.paraList)
		lines = append(lines, cell.lines()...)
	}
	_df.Slice(lines, func(i, j int) bool {
		a, b := lines[i], lines[j]
		if da, db := a._cbbd, b._cbbd; !_acbc(da-db) {
			return da < db
		}
		return a.Llx < b.Llx
	})
	if _cgafg {
		_ce.Printf("\u0020\u0020\u0020 r\u006f\u0077\u0042\u006f\u0072\u0064\u0065\u0072\u0073:\u0020%\u0064 \u0070a\u0072\u0061\u0073\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a", numParas, len(lines))
		for i, line := range lines {
			_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", i, line)
		}
	}
	if len(lines) == 0 {
		// No lines means no corridors; avoid indexing lines[0] below.
		return nil
	}

	var corridors []float64
	var groups [][]*textLine
	lowest := lines[0]
	group := []*textLine{lowest}
	for i, line := range lines[1:] {
		if line.Ury < lowest.Lly {
			border := 0.5 * (line.Ury + lowest.Lly)
			if _cgafg {
				_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u003c\u0020\u0025\u0036.\u0032f\u0020\u0062\u006f\u0072\u0064\u0065\u0072\u003d\u0025\u0036\u002e\u0032\u0066\u000a"+"\u0009\u0020\u0071\u003d\u0025\u0073\u000a\u0009\u0020p\u003d\u0025\u0073\u000a", i, line.Ury, lowest.Lly, border, lowest, line)
			}
			corridors = append(corridors, border)
			groups = append(groups, group)
			group = nil
		}
		group = append(group, line)
		if line.Lly < lowest.Lly {
			lowest = line
		}
	}
	if len(group) > 0 {
		groups = append(groups, group)
	}

	if _cgafg {
		_ce.Printf(" \u0020\u0020\u0020\u0020\u0020\u0020 \u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073\u003d%\u0036.\u0032\u0066\u000a", corridors)
		_b.Log.Info("\u0072\u006f\u0077\u003d\u0025\u0064", len(_eagd))
		for i, cell := range _eagd {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, cell)
		}
		_b.Log.Info("\u0067r\u006f\u0075\u0070\u0073\u003d\u0025d", len(groups))
		for i, g := range groups {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0064\u000a", i, len(g))
			for j, line := range g {
				_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", j, line)
			}
		}
	}

	// Every cell must have lines in every group, otherwise the corridors do
	// not span the whole row and are dropped.
	spansAllCells := true
groupLoop:
	for gi, g := range groups {
		for ci, cell := range _eagd {
			if _cgafg {
				_ce.Printf("\u0020\u0020\u0020\u007e\u007e\u007e\u0067\u0072\u006f\u0075\u0070\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u0020\u0063\u0065\u006cl\u0020\u0025\u0064\u0020\u006ff\u0020\u0025d\u0020\u0025\u0073\u000a", gi, len(groups), ci, len(_eagd), cell)
			}
			if cell.hasLines(g) {
				continue
			}
			if _cgafg {
				_ce.Printf("\u0020\u0020\u0020\u0021\u0021\u0021\u0067\u0072\u006f\u0075\u0070\u0020\u0025d\u0020\u006f\u0066\u0020\u0025\u0064 \u0063\u0065\u006c\u006c\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064 \u004f\u0055\u0054\u000a", gi, len(groups), ci, len(_eagd))
			}
			spansAllCells = false
			break groupLoop
		}
	}
	if !spansAllCells {
		if _cgafg {
			_b.Log.Info("\u0072\u006f\u0077\u0020\u0063o\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u0020\u0064\u006f\u006e\u0027\u0074 \u0073\u0070\u0061\u006e\u0020\u0061\u006c\u006c\u0020\u0063\u0065\u006c\u006c\u0073\u0020\u0069\u006e\u0020\u0072\u006f\u0077\u002e\u0020\u0069\u0067\u006e\u006f\u0072\u0069\u006eg")
		}
		corridors = nil
	}
	if _cgafg && corridors != nil {
		_ce.Printf("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u002a\u002a*\u0072\u006f\u0077\u0043\u006f\u0072\u0072i\u0064\u006f\u0072\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u000a", corridors)
	}
	return corridors
}

// paraList is a sequence of text paragraphs.
type paraList []*textPara

// checkWidth returns the extent `_bfea - _cfba` and whether that extent is no
// larger than `_bddeb`.
func (_dgace rectRuling) checkWidth(_cfba, _bfea float64) (float64, bool) {
	width := _bfea - _cfba
	return width, width <= _bddeb
}

// intersections returns, for each ruling index, the set of indexes of the
// rulings of the other kind (`_acgee` versus `_cefaa`) that it intersects.
// Rulings without any intersection have no entry. nil is returned when there
// are too few rulings of either kind (`_gfda`, `_cafad`) or more than `_cfae`
// rulings of both kinds together.
func (_gaafa rulingList) intersections() map[int]intSet {
	var firstKind, secondKind []int
	for i, r := range _gaafa {
		if kind := r._eabdg; kind == _acgee {
			firstKind = append(firstKind, i)
		} else if kind == _cefaa {
			secondKind = append(secondKind, i)
		}
	}
	if len(firstKind) < _gfda+1 || len(secondKind) < _cafad+1 {
		return nil
	}
	if len(firstKind)+len(secondKind) > _cfae {
		_b.Log.Debug("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064", len(_gaafa), len(firstKind), len(secondKind))
		return nil
	}

	crossings := make(map[int]intSet, len(firstKind)+len(secondKind))
	// link records `to` in the set of `from`, creating the set on first use.
	link := func(from, to int) {
		set, ok := crossings[from]
		if !ok {
			set = make(intSet)
			crossings[from] = set
		}
		set.add(to)
	}
	for _, a := range firstKind {
		for _, b := range secondKind {
			if !_gaafa[a].intersects(_gaafa[b]) {
				continue
			}
			link(a, b)
			link(b, a)
		}
	}
	return crossings
}

// devicePoint maps (`_feag`, `_bea`) through the product of the state's
// `_afee` and `_dfgb` matrices and returns the resulting point.
func (_beced *shapesState) devicePoint(_feag, _bea float64) _gab.Point {
	combined := _beced._afee.Mult(_beced._dfgb)
	x, y := combined.Transform(_feag, _bea)
	return _gab.NewPoint(x, y)
}

// setFont stores the size `_ggaa` in the text state and then looks up the
// font named `_bdd`, storing it on success. A lookup error is returned as is;
// the size has already been updated at that point. A nil receiver is a no-op.
func (_bge *textObject) setFont(_bdd string, _ggaa float64) error {
	if _bge == nil {
		return nil
	}
	_bge._gacd._fgca = _ggaa
	font, err := _bge.getFont(_bdd)
	if err == nil {
		_bge._gacd._dgdf = font
		return nil
	}
	return err
}

// _cadc returns copies of the marks in `_bdfd` with the `_fgfd` flag set and
// `_ggea` pointing at `_acddc`. The input slice is not modified; an empty
// input yields a nil slice.
func _cadc(_bdfd []TextMark, _acddc *TextTable) []TextMark {
	var tagged []TextMark
	for i := range _bdfd {
		mark := _bdfd[i]
		mark._fgfd = true
		mark._ggea = _acddc
		tagged = append(tagged, mark)
	}
	return tagged
}

// Ruling kinds, numbered from zero in declaration order.
const (
	_bgbdg rulingKind = iota
	_cefaa
	_acgee
)

// gridTile is a rectangle plus one flag per side recording whether a border
// is present there. As used by contains: `_afdge` guards Llx, `_bfecb` Urx,
// `_eaed` Lly and `_fdbd` Ury.
type gridTile struct {
	_bg.PdfRectangle
	_fdbd, _afdge, _eaed, _bfecb bool
}

// getDown returns, for every column, the `_fgdg` paragraph of the cell in the
// table's last row. It returns nil if any of those paragraphs is taken, or if
// the `_eabac` link of one of them is not the paragraph of the next column.
func (_fgaf *textTable) getDown() paraList {
	width := _fgaf._ddfc
	row := make(paraList, width)
	for x := 0; x < width; x++ {
		candidate := _fgaf.get(x, _fgaf._gcbge-1)._fgdg
		if candidate.taken() {
			return nil
		}
		row[x] = candidate
	}
	for x := 0; x+1 < width; x++ {
		if row[x]._eabac != row[x+1] {
			return nil
		}
	}
	return row
}

// toTextTable converts the internal table into the exported TextTable form.
// Each present cell contributes its text and its text marks (with offsets
// starting at zero per cell); absent cells are left as zero-valued TableCells.
func (_fdcdgd *textTable) toTextTable() TextTable {
	width, height := _fdcdgd._ddfc, _fdcdgd._gcbge
	if _cgafg {
		_b.Log.Info("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064", width, height)
	}
	cells := make([][]TableCell, height)
	for y := range cells {
		row := make([]TableCell, width)
		cells[y] = row
		for x := range row {
			para := _fdcdgd.get(x, y)
			if para == nil {
				continue
			}
			if _cgafg {
				_ce.Printf("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a", x, y, para)
			}
			row[x].Text = para.text()
			offset := 0
			row[x].Marks._bca = para.toTextMarks(&offset)
		}
	}
	table := TextTable{W: width, H: height, Cells: cells}
	table.PdfRectangle = _fdcdgd.bbox()
	return table
}
// PageTextOptions holds various options available in extraction process.
type PageTextOptions struct {
	// _cbff is copied from Options.DisableDocumentTags by ExtractPageText.
	_cbff bool
	// _bbg is copied from Options.UseSimplerExtractionProcess by ExtractPageText.
	_bbg bool
}

// _gefa reports whether the x-ranges of the two rectangles overlap or touch.
func _gefa(_geb, _fgcaf _bg.PdfRectangle) bool {
	startsBeforeEnd := _fgcaf.Llx <= _geb.Urx
	endsAfterStart := _geb.Llx <= _fgcaf.Urx
	return startsBeforeEnd && endsAfterStart
}

// _ecdb returns the keys of `_bbgc` in increasing order.
func _ecdb(_bbgc map[int][]float64) []int {
	keys := make([]int, 0, len(_bbgc))
	for key := range _bbgc {
		keys = append(keys, key)
	}
	_df.Ints(keys)
	return keys
}

// _aage builds a ruling of kind `_cefaa` along the top edge of `_fdcb`:
// positioned at Ury and extending from Llx to Urx.
func _aage(_fdcb _bg.PdfRectangle) *ruling {
	top := ruling{
		_eabdg: _cefaa,
		_befee: _fdcb.Ury,
		_agbc:  _fdcb.Llx,
		_gffgd: _fdcb.Urx,
	}
	return &top
}

// bbox returns the bounding rectangle of the text mark.
func (_egdf *textMark) bbox() _bg.PdfRectangle {
	return _egdf.PdfRectangle
}
// log writes the ruling list summary, titled `_bgbf`, followed by one line per
// ruling. It does nothing unless `_bccgb` is set.
func (_cdae rulingList) log(_bgbf string) {
	if !_bccgb {
		return
	}
	_b.Log.Info("\u0023\u0023\u0023\u0020\u0025\u0031\u0030\u0073\u003a\u0020\u0076\u0065c\u0073\u003d\u0025\u0073", _bgbf, _cdae.String())
	for idx := range _cdae {
		_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", idx, _cdae[idx].String())
	}
}

// _fegfa maps stand-alone accent and modifier runes to the string holding the
// corresponding combining mark.
var _fegfa = map[rune]string{
	0x0060: "\u0300",
	0x02CB: "\u0300",
	0x0027: "\u0301",
	0x00B4: "\u0301",
	0x02B9: "\u0301",
	0x02CA: "\u0301",
	0x005E: "\u0302",
	0x02C6: "\u0302",
	0x007E: "\u0303",
	0x02DC: "\u0303",
	0x00AF: "\u0304",
	0x02C9: "\u0304",
	0x02D8: "\u0306",
	0x02D9: "\u0307",
	0x00A8: "\u0308",
	0x00B0: "\u030a",
	0x02DA: "\u030a",
	0x02BA: "\u030b",
	0x02DD: "\u030b",
	0x02C7: "\u030c",
	0x02C8: "\u030d",
	0x0022: "\u030e",
	0x02BB: "\u0312",
	0x02BC: "\u0313",
	0x0486: "\u0313",
	0x055A: "\u0313",
	0x02BD: "\u0314",
	0x0485: "\u0314",
	0x0559: "\u0314",
	0x02D4: "\u031d",
	0x02D5: "\u031e",
	0x02D6: "\u031f",
	0x02D7: "\u0320",
	0x02B2: "\u0321",
	0x00B8: "\u0327",
	0x02CC: "\u0329",
	0x02B7: "\u032b",
	0x02CD: "\u0331",
	0x005F: "\u0332",
	0x204E: "\u0359",
}

// topoOrder returns paragraph indexes produced by a depth-first traversal of
// the `readBefore` relation: a paragraph is emitted after every not yet
// visited paragraph it is read before, and the resulting post-order is handed
// to `_gfdaa` to produce the final sequence.
func (_ggddb paraList) topoOrder() []int {
	if _ecdg {
		_b.Log.Info("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a")
	}
	count := len(_ggddb)
	visited := make([]bool, count)
	postOrder := make([]int, 0, count)
	llyOrder := _ggddb.llyOrdering()

	var visit func(idx int)
	visit = func(idx int) {
		visited[idx] = true
		for next := 0; next < count; next++ {
			if !visited[next] && _ggddb.readBefore(llyOrder, idx, next) {
				visit(next)
			}
		}
		postOrder = append(postOrder, idx)
	}
	for idx := 0; idx < count; idx++ {
		if visited[idx] {
			continue
		}
		visit(idx)
	}
	return _gfdaa(postOrder)
}

// _fgcce packs two cell coordinates into one map key: `_aged` occupies the
// bits above the low 24, `_afecb` is added on top.
func _fgcce(_aged, _afecb int) uint64 {
	return uint64(_aged)<<24 + uint64(_afecb)
}

// _dfba groups the text lines of the paragraphs in `_cbeb`, including the
// lines of the cells of any table a paragraph carries, by the `_adbb` value of
// the first mark of the line's first word. Lines rejected by `_gegg` are
// logged and skipped.
func _dfba(_cbeb *paraList) map[int][]*textLine {
	grouped := map[int][]*textLine{}
	collect := func(lines []*textLine) {
		for _, line := range lines {
			if !_gegg(line) {
				_b.Log.Debug("g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e")
				continue
			}
			key := line._aafd[0]._dggf[0]._adbb
			grouped[key] = append(grouped[key], line)
		}
	}
	for _, para := range *_cbeb {
		collect(para._gfbb)
		if para._bgba == nil {
			continue
		}
		for _, cellPara := range para._bgba._efeac {
			collect(cellPara._gfbb)
		}
	}
	return grouped
}

// compositeRowCorridors returns, keyed by row index, the corridors computed by
// `_ccab` for the composite cells of that row. Iteration starts at row 1, and
// rows without any composite cell get no entry.
func (_dfdag *textTable) compositeRowCorridors() map[int][]float64 {
	height := _dfdag._gcbge
	corridors := make(map[int][]float64, height)
	if _cgafg {
		_b.Log.Info("c\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0052\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073:\u0020h\u003d\u0025\u0064", height)
	}
	for y := 1; y < height; y++ {
		var rowCells []compositeCell
		for x := 0; x < _dfdag._ddfc; x++ {
			cell, found := _dfdag._dadcc[_fgcce(x, y)]
			if found {
				rowCells = append(rowCells, cell)
			}
		}
		if len(rowCells) == 0 {
			continue
		}
		rowCorridors := _ccab(rowCells)
		corridors[y] = rowCorridors
		if _cgafg {
			_ce.Printf("\u0020\u0020\u0020\u0025\u0032\u0064\u003a\u0020\u00256\u002e\u0032\u0066\u000a", y, rowCorridors)
		}
	}
	return corridors
}

// _dabgf is a case-insensitive regular expression source matching a Roman
// numeral at the start of the text, either followed by ")" or "." or wrapped
// in parentheses.
var _dabgf string = "\u0028\u003f\u0069\u0029\u005e\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028?\u003a\u0044\u007cM\u0029\u007c\u0044\u003f\u0043{\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028\u003f\u003a\u004c\u007c\u0043\u0029\u007cL\u003f\u0058\u007b\u0030\u002c\u0033}\u0029\u0028\u0049\u0028\u003f\u003a\u0056\u007c\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u005c\u0029\u007c\u005c\u002e\u0029\u007c\u005e\u005c\u0028\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028\u003f\u003aD\u007cM\u0029\u007c\u0044\u003f\u0043\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028?\u003a\u004c\u007c\u0043\u0029\u007c\u004c?\u0058\u007b0\u002c\u0033\u007d\u0029(\u0049\u0028\u003f\u003a\u0056|\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u005c\u0029"
// nextLine moves the line position by (0, -`_ced`), where `_ced` is read from
// the current text state.
func (_abd *textObject) nextLine() {
	drop := -_abd._gacd._ced
	_abd.moveLP(0, drop)
}

// bbox returns the bounding rectangle of the text line.
func (_eecff *textLine) bbox() _bg.PdfRectangle {
	return _eecff.PdfRectangle
}

// numBorders returns how many of the tile's four border flags are set.
func (_faebf gridTile) numBorders() int {
	count := 0
	for _, present := range [...]bool{_faebf._afdge, _faebf._bfecb, _faebf._eaed, _faebf._fdbd} {
		if present {
			count++
		}
	}
	return count
}

// textWord is a word on a text line: its bounding rectangle, the text marks it
// is made of (`_dggf`) and per-word attributes. `_gagaf` is the flag set by
// textLine.markWordBoundaries.
type textWord struct {
	_bg.PdfRectangle
	_baebb float64
	_ggaef string
	_dggf  []*textMark
	_ebgb  float64
	_gagaf bool
}

// blocks reports whether some ruling in the list separates `_gabdf` from
// `_cedb`. The two rulings must overlap in their [`_agbc`, `_gffgd`] extents;
// a list ruling separates them when its `_befee` lies between theirs (within
// the `_bddeb` tolerance) and its extent overlaps the shared part.
func (_dcac rulingList) blocks(_gabdf, _cedb *ruling) bool {
	if _gabdf._agbc > _cedb._gffgd || _cedb._agbc > _gabdf._gffgd {
		return false
	}
	sharedStart := _ef.Max(_gabdf._agbc, _cedb._agbc)
	sharedEnd := _ef.Min(_gabdf._gffgd, _cedb._gffgd)

	// Order the pair so that `lower` has the smaller `_befee`.
	lower, upper := _gabdf, _cedb
	if lower._befee > upper._befee {
		lower, upper = upper, lower
	}
	for _, candidate := range _dcac {
		between := lower._befee <= candidate._befee+_bddeb && candidate._befee <= upper._befee+_bddeb
		if between && candidate._agbc <= sharedEnd && sharedStart <= candidate._gffgd {
			return true
		}
	}
	return false
}

// _fgeb computes, for indexes 0..`_bbdd`-1, a running start offset in which
// index i occupies len(`_faeec`[i])+1 slots. It returns the offsets and the
// total number of slots.
func _fgeb(_bbdd int, _faeec map[int][]float64) ([]int, int) {
	offsets := make([]int, _bbdd)
	total := 0
	for i := range offsets {
		offsets[i] = total
		total += 1 + len(_faeec[i])
	}
	return offsets, total
}
// _dgbg returns the separator to put between two lines with depths `_fbeb`
// and `_bedb`: a newline when `_acbc` rejects their difference, otherwise a
// single space.
func _dgbg(_fbeb, _bedb float64) string {
	if _acbc(_fbeb - _bedb) {
		return "\u0020"
	}
	return "\u000a"
}

// _eabd returns the elements found exactly three levels below `_fbee` whose
// `_bfeg` is "L". The result is never nil.
func _eabd(_fbee structElement) []structElement {
	found := []structElement{}
	for _, child := range _fbee._efce {
		for _, grandChild := range child._efce {
			for _, candidate := range grandChild._efce {
				if candidate._bfeg != "\u004c" {
					continue
				}
				found = append(found, candidate)
			}
		}
	}
	return found
}

// _cdbbf returns `_gbagd` without its last rune. The empty string is returned
// unchanged (the previous implementation panicked on it).
func _cdbbf(_gbagd string) string {
	runes := []rune(_gbagd)
	if len(runes) == 0 {
		return ""
	}
	return string(runes[:len(runes)-1])
}

// log prints the table dimensions, its `_edgac` flag and bounding box under
// the title `_dcbf`, followed by one line per present cell. It does nothing
// unless `_cgafg` is set.
func (_fggb *textTable) log(_dcbf string) {
	if !_cgafg {
		return
	}
	_b.Log.Info("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066", _dcbf, _fggb._ddfc, _fggb._gcbge, _fggb._edgac, _fggb.PdfRectangle)
	for y := 0; y < _fggb._gcbge; y++ {
		for x := 0; x < _fggb._ddfc; x++ {
			cell := _fggb.get(x, y)
			if cell == nil {
				continue
			}
			_ce.Printf("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a", x, y, cell.PdfRectangle, _dfcggd(cell.text(), 50), _a.RuneCountInString(cell.text()))
		}
	}
}

// sort orders every word slice stored in the bag's `_cgdg` collection with the
// comparison function `_aea`.
func (_gbda *wordBag) sort() {
	for _, words := range _gbda._cgdg {
		words := words
		_df.Slice(words, func(i, j int) bool {
			return _aea(words[i], words[j]) < 0
		})
	}
}
// Len returns the number of TextMarks in `ma`.
// A nil array has length zero.
func (_fddf *TextMarkArray) Len() int {
	if _fddf != nil {
		return len(_fddf._bca)
	}
	return 0
}

// setTextRise stores `_cad` in the `_fdea` field of the text state.
// A nil receiver is a no-op.
func (_debg *textObject) setTextRise(_cad float64) {
	if _debg != nil {
		_debg._gacd._fdea = _cad
	}
}

// emptyCompositeColumn reports whether none of the composite cells recorded
// for column `_cfada` contains any paragraph.
func (_bbfca *textTable) emptyCompositeColumn(_cfada int) bool {
	for y := 0; y < _bbfca._gcbge; y++ {
		cell, found := _bbfca._dadcc[_fgcce(_cfada, y)]
		if found && len(cell.paraList) > 0 {
			return false
		}
	}
	return true
}

// markWordBoundaries sets the `_gagaf` flag on every word, other than the
// first, for which `_efbc(word, previous word)` is at least `_cfbb` times the
// line's `_bfbb`.
func (_cdac *textLine) markWordBoundaries() {
	threshold := _cfbb * _cdac._bfbb
	words := _cdac._aafd
	following := words[1:]
	for i := range following {
		previous, current := words[i], following[i]
		if _efbc(current, previous) >= threshold {
			current._gagaf = true
		}
	}
}
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
// The two int return values are extraction stats.
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
// Replace with a function like Extract() (*PageText, error).
func ( _egg * Extractor ) ExtractPageText ( ) ( * PageText , int , int , error ) { _bbc , _bde , _abbe , _acd := _egg . extractPageText ( _egg . _bc , _egg . _gga , _gab . IdentityMatrix ( ) , 0 ) ; if _acd != nil && _acd != _bg . ErrColorOutOfRange { return nil , 0 , 0 , _acd ; } ; if _egg . _ad != nil { _bbc . _fca . _bbg = _egg . _ad . UseSimplerExtractionProcess ;
} ; _bbc . computeViews ( ) ; _acd = _gceee ( _bbc ) ; if _acd != nil { return nil , 0 , 0 , _acd ; } ; if _egg . _ad != nil { if _egg . _ad . ApplyCropBox && _egg . _fc != nil { _bbc . ApplyArea ( * _egg . _fc ) ; } ; _bbc . _fca . _cbff = _egg . _ad . DisableDocumentTags ; } ; return _bbc , _bde , _abbe , nil ;
} ; func ( _fcfgd paraList ) addNeighbours ( ) { _bafdf := func ( _gbcda [ ] int , _efgfd * textPara ) ( [ ] * textPara , [ ] * textPara ) { _fdde := make ( [ ] * textPara , 0 , len ( _gbcda ) - 1 ) ; _bfbd := make ( [ ] * textPara , 0 , len ( _gbcda ) - 1 ) ; for _ , _cced := range _gbcda { _gdabd := _fcfgd [ _cced ] ;
if _gdabd . Urx <= _efgfd . Llx { _fdde = append ( _fdde , _gdabd ) ; } else if _gdabd . Llx >= _efgfd . Urx { _bfbd = append ( _bfbd , _gdabd ) ; } ; } ; return _fdde , _bfbd ; } ; _befcb := func ( _faed [ ] int , _eebf * textPara ) ( [ ] * textPara , [ ] * textPara ) { _gfcdc := make ( [ ] * textPara , 0 , len ( _faed ) - 1 ) ;
_fgeg := make ( [ ] * textPara , 0 , len ( _faed ) - 1 ) ; for _ , _babg := range _faed { _egfg := _fcfgd [ _babg ] ; if _egfg . Ury <= _eebf . Lly { _fgeg = append ( _fgeg , _egfg ) ; } else if _egfg . Lly >= _eebf . Ury { _gfcdc = append ( _gfcdc , _egfg ) ; } ; } ; return _gfcdc , _fgeg ; } ; _bcebd := _fcfgd . yNeighbours ( _baeeb ) ;
for _ , _afab := range _fcfgd { _gfece := _bcebd [ _afab ] ; if len ( _gfece ) == 0 { continue ; } ; _dabfg , _gfac := _bafdf ( _gfece , _afab ) ; if len ( _dabfg ) == 0 && len ( _gfac ) == 0 { continue ; } ; if len ( _dabfg ) > 0 { _egbc := _dabfg [ 0 ] ; for _ , _bebbb := range _dabfg [ 1 : ] { if _bebbb . Urx >= _egbc . Urx { _egbc = _bebbb ;
} ; } ; for _ , _defe := range _dabfg { if _defe != _egbc && _defe . Urx > _egbc . Llx { _egbc = nil ; break ; } ; } ; if _egbc != nil && _cfda ( _afab . PdfRectangle , _egbc . PdfRectangle ) { _afab . _dbaed = _egbc ; } ; } ; if len ( _gfac ) > 0 { _gffe := _gfac [ 0 ] ; for _ , _dcfa := range _gfac [ 1 : ] { if _dcfa . Llx <= _gffe . Llx { _gffe = _dcfa ;
} ; } ; for _ , _abegb := range _gfac { if _abegb != _gffe && _abegb . Llx < _gffe . Urx { _gffe = nil ; break ; } ; } ; if _gffe != nil && _cfda ( _afab . PdfRectangle , _gffe . PdfRectangle ) { _afab . _eabac = _gffe ; } ; } ; } ; _bcebd = _fcfgd . xNeighbours ( _bbe ) ; for _ , _fccc := range _fcfgd { _dfgf := _bcebd [ _fccc ] ;
if len ( _dfgf ) == 0 { continue ; } ; _dcfgd , _baaf := _befcb ( _dfgf , _fccc ) ; if len ( _dcfgd ) == 0 && len ( _baaf ) == 0 { continue ; } ; if len ( _baaf ) > 0 { _afgbf := _baaf [ 0 ] ; for _ , _gggg := range _baaf [ 1 : ] { if _gggg . Ury >= _afgbf . Ury { _afgbf = _gggg ; } ; } ; for _ , _gedfg := range _baaf { if _gedfg != _afgbf && _gedfg . Ury > _afgbf . Lly { _afgbf = nil ;
break ; } ; } ; if _afgbf != nil && _gefa ( _fccc . PdfRectangle , _afgbf . PdfRectangle ) { _fccc . _fgdg = _afgbf ; } ; } ; if len ( _dcfgd ) > 0 { _gbeb := _dcfgd [ 0 ] ; for _ , _dfab := range _dcfgd [ 1 : ] { if _dfab . Lly <= _gbeb . Lly { _gbeb = _dfab ; } ; } ; for _ , _fdcfd := range _dcfgd { if _fdcfd != _gbeb && _fdcfd . Lly < _gbeb . Ury { _gbeb = nil ;
break ; } ; } ; if _gbeb != nil && _gefa ( _fccc . PdfRectangle , _gbeb . PdfRectangle ) { _fccc . _ffeg = _gbeb ; } ; } ; } ; for _ , _bddcbe := range _fcfgd { if _bddcbe . _dbaed != nil && _bddcbe . _dbaed . _eabac != _bddcbe { _bddcbe . _dbaed = nil ; } ; if _bddcbe . _ffeg != nil && _bddcbe . _ffeg . _fgdg != _bddcbe { _bddcbe . _ffeg = nil ;
} ; if _bddcbe . _eabac != nil && _bddcbe . _eabac . _dbaed != _bddcbe { _bddcbe . _eabac = nil ; } ; if _bddcbe . _fgdg != nil && _bddcbe . _fgdg . _ffeg != _bddcbe { _bddcbe . _fgdg = nil ; } ; } ; } ; func _acgbed ( _cgaae _ea . PdfObject , _abdef _ag . Color ) ( _fa . Image , error ) { _ebff , _aaaf := _ea . GetStream ( _cgaae ) ;
if ! _aaaf { return nil , nil ; } ; _cbfcf , _aabac := _bg . NewXObjectImageFromStream ( _ebff ) ; if _aabac != nil { return nil , _aabac ; } ; _bbfdb , _aabac := _cbfcf . ToImage ( ) ; if _aabac != nil { return nil , _aabac ; } ; return _ffda ( _bbfdb , _abdef ) , nil ; } ; const ( RenderModeStroke RenderMode = 1 << iota ;
RenderModeFill ; RenderModeClip ; ) ; func _dadbd ( _gdced int , _adffg func ( int , int ) bool ) [ ] int { _gagbd := make ( [ ] int , _gdced ) ; for _aggg := range _gagbd { _gagbd [ _aggg ] = _aggg ; } ; _df . Slice ( _gagbd , func ( _accf , _fdffb int ) bool { return _adffg ( _gagbd [ _accf ] , _gagbd [ _fdffb ] ) } ) ;
return _gagbd ; } ; func ( _abde intSet ) add ( _gfcag int ) { _abde [ _gfcag ] = struct { } { } } ; func ( _cfcac paraList ) inTile ( _ebcbg gridTile ) paraList { var _dcge paraList ; for _ , _ddgf := range _cfcac { if _ebcbg . contains ( _ddgf . PdfRectangle ) { _dcge = append ( _dcge , _ddgf ) ;
} ; } ; if _cgafg { _ce . Printf ( "\u0020 \u0020\u0069\u006e\u0054i\u006c\u0065\u003a\u0020\u0020%\u0073 \u0069n\u0073\u0069\u0064\u0065\u003d\u0025\u0064\n" , _ebcbg , len ( _dcge ) ) ; for _ccfba , _eebgbb := range _dcge { _ce . Printf ( "\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a" , _ccfba , _eebgbb ) ;
} ; _ce . Println ( "" ) ; } ; return _dcge ; } ; func _ffda ( _acfcg * _bg . Image , _gbecd _ag . Color ) _fa . Image { _ddge , _eegb := int ( _acfcg . Width ) , int ( _acfcg . Height ) ; _bffbd := _fa . NewRGBA ( _fa . Rect ( 0 , 0 , _ddge , _eegb ) ) ; for _aceb := 0 ; _aceb < _eegb ; _aceb ++ { for _dedf := 0 ;
_dedf < _ddge ; _dedf ++ { _bgfe , _dfaa := _acfcg . ColorAt ( _dedf , _aceb ) ; if _dfaa != nil { _b . Log . Debug ( "\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e" , _dedf , _aceb ) ;
continue ; } ; _cafg , _fggbe , _ccaf , _ := _bgfe . RGBA ( ) ; var _cfcef _ag . Color ; if _cafg + _fggbe + _ccaf == 0 { _cfcef = _ag . Transparent ; } else { _cfcef = _gbecd ; } ; _bffbd . Set ( _dedf , _aceb , _cfcef ) ; } ; } ; return _bffbd ; } ; func ( _dbddf paraList ) eventNeighbours ( _ebda [ ] event ) map [ * textPara ] [ ] int { _df . Slice ( _ebda , func ( _ebedf , _abgd int ) bool { _dbafb , _dbddd := _ebda [ _ebedf ] , _ebda [ _abgd ] ;
_dcdfbc , _cbgg := _dbafb . _faccf , _dbddd . _faccf ; if _dcdfbc != _cbgg { return _dcdfbc < _cbgg ; } ; if _dbafb . _ecacd != _dbddd . _ecacd { return _dbafb . _ecacd ; } ; return _ebedf < _abgd ; } ) ; _dbbd := make ( map [ int ] intSet ) ; _eade := make ( intSet ) ; for _ , _fbfgd := range _ebda { if _fbfgd . _ecacd { _dbbd [ _fbfgd . _gbfff ] = make ( intSet ) ;
for _gbba := range _eade { if _gbba != _fbfgd . _gbfff { _dbbd [ _fbfgd . _gbfff ] . add ( _gbba ) ; _dbbd [ _gbba ] . add ( _fbfgd . _gbfff ) ; } ; } ; _eade . add ( _fbfgd . _gbfff ) ; } else { _eade . del ( _fbfgd . _gbfff ) ; } ; } ; _eadd := map [ * textPara ] [ ] int { } ; for _bbgde , _becbc := range _dbbd { _dcga := _dbddf [ _bbgde ] ;
if len ( _becbc ) == 0 { _eadd [ _dcga ] = nil ; continue ; } ; _gbbde := make ( [ ] int , len ( _becbc ) ) ; _gdbb := 0 ; for _gaee := range _becbc { _gbbde [ _gdbb ] = _gaee ; _gdbb ++ ; } ; _eadd [ _dcga ] = _gbbde ; } ; return _eadd ; } ; func ( _dcgfd * wordBag ) removeDuplicates ( ) { if _aebb { _b . Log . Info ( "r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071" , _dcgfd . text ( ) ) ;
} ; for _ , _gcgae := range _dcgfd . depthIndexes ( ) { if len ( _dcgfd . _cgdg [ _gcgae ] ) == 0 { continue ; } ; _ecfb := _dcgfd . _cgdg [ _gcgae ] [ 0 ] ; _acecf := _ccee * _ecfb . _ebgb ; _ceaa := _ecfb . _baebb ; for _ , _edgb := range _dcgfd . depthBand ( _ceaa , _ceaa + _acecf ) { _dcdgc := map [ * textWord ] struct { } { } ;
_adad := _dcgfd . _cgdg [ _edgb ] ; for _ , _baad := range _adad { if _ , _gcegc := _dcdgc [ _baad ] ; _gcegc { continue ; } ; for _ , _abbcd := range _adad { if _ , _aegg := _dcdgc [ _abbcd ] ; _aegg { continue ; } ; if _abbcd != _baad && _abbcd . _ggaef == _baad . _ggaef && _ef . Abs ( _abbcd . Llx - _baad . Llx ) < _acecf && _ef . Abs ( _abbcd . Urx - _baad . Urx ) < _acecf && _ef . Abs ( _abbcd . Lly - _baad . Lly ) < _acecf && _ef . Abs ( _abbcd . Ury - _baad . Ury ) < _acecf { _dcdgc [ _abbcd ] = struct { } { } ;
} ; } ; } ; if len ( _dcdgc ) > 0 { _fcbf := 0 ; for _ , _gebf := range _adad { if _ , _dbfgc := _dcdgc [ _gebf ] ; ! _dbfgc { _adad [ _fcbf ] = _gebf ; _fcbf ++ ; } ; } ; _dcgfd . _cgdg [ _edgb ] = _adad [ : len ( _adad ) - len ( _dcdgc ) ] ; if len ( _dcgfd . _cgdg [ _edgb ] ) == 0 { delete ( _dcgfd . _cgdg , _edgb ) ;
} ; } ; } ; } ; } ; func _gdfa ( _dadec map [ int ] [ ] float64 ) string { _edgba := _ecdb ( _dadec ) ; _afgdc := make ( [ ] string , len ( _dadec ) ) ; for _cbgef , _ggfag := range _edgba { _afgdc [ _cbgef ] = _ce . Sprintf ( "\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066" , _ggfag , _dadec [ _ggfag ] ) ;
} ; return _ce . Sprintf ( "\u007b\u0025\u0073\u007d" , _c . Join ( _afgdc , "\u002c\u0020" ) ) ; } ; func ( _gcce paraList ) sortTopoOrder ( ) { _abdd := _gcce . topoOrder ( ) ; _gcce . reorder ( _abdd ) } ; func ( _egafc rulingList ) tidied ( _gcfeb string ) rulingList { _bdgg := _egafc . removeDuplicates ( ) ;
_bdgg . log ( "\u0075n\u0069\u0071\u0075\u0065\u0073" ) ; _dgff := _bdgg . snapToGroups ( ) ; if _dgff == nil { return nil ; } ; _dgff . sort ( ) ; if _bccgb { _b . Log . Info ( "\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064" , _gcfeb , len ( _egafc ) , len ( _bdgg ) , len ( _dgff ) ) ;
} ; _dgff . log ( "\u0063o\u0061\u006c\u0065\u0073\u0063\u0065d" ) ; return _dgff ; } ; func ( _ffca * textObject ) showText ( _cdg _ea . PdfObject , _dcb [ ] byte , _gaag int ) error { return _ffca . renderText ( _cdg , _dcb , _gaag ) ; } ; const _aceg = 1.0 / 1000.0 ; type structTreeRoot struct { _cegf [ ] structElement ;
_baeb string ; } ; func _dag ( _cddag , _edga bounded ) float64 { _eedb := _fdcc ( _cddag , _edga ) ; if ! _acbc ( _eedb ) { return _eedb ; } ; return _aea ( _cddag , _edga ) ; } ; func _bacb ( _cbge [ ] * wordBag ) [ ] * wordBag { if len ( _cbge ) <= 1 { return _cbge ; } ; if _gde { _b . Log . Info ( "\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a" ) ;
} ; _df . Slice ( _cbge , func ( _efga , _fagge int ) bool { _fgbba , _bfdfe := _cbge [ _efga ] , _cbge [ _fagge ] ; _eaeg := _fgbba . Width ( ) * _fgbba . Height ( ) ; _edg := _bfdfe . Width ( ) * _bfdfe . Height ( ) ; if _eaeg != _edg { return _eaeg > _edg ; } ; if _fgbba . Height ( ) != _bfdfe . Height ( ) { return _fgbba . Height ( ) > _bfdfe . Height ( ) ;
} ; return _efga < _fagge ; } ) ; var _ddfb [ ] * wordBag ; _cfag := make ( intSet ) ; for _efgf := 0 ; _efgf < len ( _cbge ) ; _efgf ++ { if _cfag . has ( _efgf ) { continue ; } ; _begag := _cbge [ _efgf ] ; for _fbbdc := _efgf + 1 ; _fbbdc < len ( _cbge ) ; _fbbdc ++ { if _cfag . has ( _efgf ) { continue ;
} ; _cgafc := _cbge [ _fbbdc ] ; _agba := _begag . PdfRectangle ; _agba . Llx -= _begag . _ecdf ; if _gfea ( _agba , _cgafc . PdfRectangle ) { _begag . absorb ( _cgafc ) ; _cfag . add ( _fbbdc ) ; } ; } ; _ddfb = append ( _ddfb , _begag ) ; } ; if len ( _cbge ) != len ( _ddfb ) + len ( _cfag ) { _b . Log . Error ( "\u006d\u0065\u0072ge\u0057\u006f\u0072\u0064\u0042\u0061\u0067\u0073\u003a \u0025d\u2192%\u0064 \u0061\u0062\u0073\u006f\u0072\u0062\u0065\u0064\u003d\u0025\u0064" , len ( _cbge ) , len ( _ddfb ) , len ( _cfag ) ) ;
} ; return _ddfb ; } ; func ( _cgadg * textWord ) toTextMarks ( _ccdfa * int ) [ ] TextMark { var _cggd [ ] TextMark ; for _ , _adbfg := range _cgadg . _dggf { _cggd = _cgdcf ( _cggd , _ccdfa , _adbfg . ToTextMark ( ) ) ; } ; return _cggd ; } ; func _gfeg ( _ffefc [ ] float64 , _bgdfd , _gacfg float64 ) [ ] float64 { _ebfe , _fged := _bgdfd , _gacfg ;
if _fged < _ebfe { _ebfe , _fged = _fged , _ebfe ; } ; _bdfdc := make ( [ ] float64 , 0 , len ( _ffefc ) + 2 ) ; _bdfdc = append ( _bdfdc , _bgdfd ) ; for _ , _cdbca := range _ffefc { if _cdbca <= _ebfe { continue ; } else if _cdbca >= _fged { break ; } ; _bdfdc = append ( _bdfdc , _cdbca ) ;
} ; _bdfdc = append ( _bdfdc , _gacfg ) ; return _bdfdc ; } ; func ( _cdcef * textPara ) taken ( ) bool { return _cdcef == nil || _cdcef . _abeg } ; func _badcf ( _fefga _bg . PdfColorspace , _geafd _bg . PdfColor ) _ag . Color { if _fefga == nil || _geafd == nil { return _ag . Black ;
} ; _beaac , _afgba := _fefga . ColorToRGB ( _geafd ) ; if _afgba != nil { _b . Log . Debug ( "\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073" , _geafd , _fefga , _afgba ) ;
return _ag . Black ; } ; _ggfbb , _eafcg := _beaac . ( * _bg . PdfColorDeviceRGB ) ; if ! _eafcg { _b . Log . Debug ( "\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076" , _beaac ) ;
return _ag . Black ; } ; return _ag . NRGBA { R : uint8 ( _ggfbb . R ( ) * 255 ) , G : uint8 ( _ggfbb . G ( ) * 255 ) , B : uint8 ( _ggfbb . B ( ) * 255 ) , A : uint8 ( 255 ) } ; } ; func ( _cfcae * textTable ) getComposite ( _fadgb , _gfcfg int ) ( paraList , _bg . PdfRectangle ) { _eebgb , _ggef := _cfcae . _dadcc [ _fgcce ( _fadgb , _gfcfg ) ] ;
if _cgafg { _ce . Printf ( "\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0067\u0065\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u002d\u003e\u0025\u0073\u000a" , _fadgb , _gfcfg , _eebgb . String ( ) ) ; } ; if ! _ggef { return nil , _bg . PdfRectangle { } ;
} ; return _eebgb . parasBBox ( ) ; } ; func ( _cbgga * textWord ) absorb ( _dgbf * textWord ) { _cbgga . PdfRectangle = _egbga ( _cbgga . PdfRectangle , _dgbf . PdfRectangle ) ; _cbgga . _dggf = append ( _cbgga . _dggf , _dgbf . _dggf ... ) ; } ; func ( _gead * imageExtractContext ) extractContentStreamImages ( _bd string , _baa * _bg . PdfPageResources ) error { _ccg := _fb . NewContentStreamParser ( _bd ) ;
_ee , _bae := _ccg . Parse ( ) ; if _bae != nil { return _bae ; } ; if _gead . _agf == nil { _gead . _agf = map [ * _ea . PdfObjectStream ] * cachedImage { } ; } ; if _gead . _fae == nil { _gead . _fae = & ImageExtractOptions { } ; } ; _eae := _fb . NewContentStreamProcessor ( * _ee ) ; _eae . AddHandler ( _fb . HandlerConditionEnumAllOperands , "" , _gead . processOperand ) ;
return _eae . Process ( _baa ) ; } ; func _bbcaa ( _dadbf [ ] pathSection ) { if _dgeb < 0.0 { return ; } ; if _bccgb { _b . Log . Info ( "\u0067\u0072\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020\u0025\u0064 \u0073u\u0062\u0070\u0061\u0074\u0068\u0020\u0073\u0065\u0063\u0074\u0069\u006f\u006e\u0073" , len ( _dadbf ) ) ;
} ; for _fafdd , _defg := range _dadbf { for _gdfaf , _abed := range _defg . _dgfc { for _cccfc , _ccfga := range _abed . _fbcgf { _abed . _fbcgf [ _cccfc ] = _gab . Point { X : _egagc ( _ccfga . X ) , Y : _egagc ( _ccfga . Y ) } ; if _bccgb { _bgdgb := _abed . _fbcgf [ _cccfc ] ; if ! _eaca ( _ccfga , _bgdgb ) { _cgcf := _gab . Point { X : _bgdgb . X - _ccfga . X , Y : _bgdgb . Y - _ccfga . Y } ;
_ce . Printf ( "\u0025\u0034d \u002d\u0020\u00254\u0064\u0020\u002d\u0020%4d\u003a %\u002e\u0032\u0066\u0020\u2192\u0020\u0025.2\u0066\u0020\u0028\u0025\u0067\u0029\u000a" , _fafdd , _gdfaf , _cccfc , _ccfga , _bgdgb , _cgcf ) ; } ; } ; } ; } ; } ; } ; func _gaaff ( _dcbc [ ] * textWord , _gda float64 , _bfc , _dgaac rulingList ) * wordBag { _cabc := _fcfg ( _dcbc [ 0 ] , _gda , _bfc , _dgaac ) ;
for _ , _dcfb := range _dcbc [ 1 : ] { _efa := _ebfc ( _dcfb . _baebb ) ; _cabc . _cgdg [ _efa ] = append ( _cabc . _cgdg [ _efa ] , _dcfb ) ; _cabc . PdfRectangle = _egbga ( _cabc . PdfRectangle , _dcfb . PdfRectangle ) ; } ; _cabc . sort ( ) ; return _cabc ; } ; func _eba ( _bdcg _bg . PdfRectangle , _eadg bounded ) float64 { return _bdcg . Ury - _eadg . bbox ( ) . Lly } ;
// _gegg reports whether every mark of every word on line `_ffcc` carries the
// same `_adbb` value. A line without marks yields true.
func _gegg(_ffcc *textLine) bool {
	reference := -1
	for _, word := range _ffcc._aafd {
		for _, mark := range word._dggf {
			value := mark._adbb
			switch {
			case reference == -1:
				// First mark seen (or all marks so far had the value -1).
				reference = value
			case reference != value:
				return false
			}
		}
	}
	return true
}
// pullWord adds word `_agg` to bin `_bcab` of bag `b` and records it in
// `_bfba[_bcab]`. The bag's bounding box is extended to include the word and
// the bag's `_ecdf` value is raised to the word's `_ebgb` if that is larger.
// `_bfba[_bcab]` must already be a non-nil map.
func (b *wordBag) pullWord(_agg *textWord, _bcab int, _bfba map[int]map[*textWord]struct{}) {
	b.PdfRectangle = _egbga(b.PdfRectangle, _agg.PdfRectangle)
	if b._ecdf < _agg._ebgb {
		b._ecdf = _agg._ebgb
	}
	bin := append(b._cgdg[_bcab], _agg)
	b._cgdg[_bcab] = bin
	_bfba[_bcab][_agg] = struct{}{}
}

// writeCellText writes the text of all lines of `p` to `_acgd`.
// When the package flag `_fgcg` is set and a line other than the last ends in
// a hyphen, its final rune is removed and nothing is written between it and
// the next line. Otherwise consecutive lines are joined with the separator
// returned by `_dgbg` for their `_cbbd` values. Write errors are ignored.
func (p *textPara) writeCellText(_acgd _ga.Writer) {
	lastIdx := len(p._gfbb) - 1
	for i, line := range p._gfbb {
		content := line.text()
		joinHyphen := _fgcg && line.endsInHyphen() && i != lastIdx
		if joinHyphen {
			content = _cdbbf(content)
		}
		_acgd.Write([]byte(content))
		if !joinHyphen && i != lastIdx {
			separator := _dgbg(line._cbbd, p._gfbb[i+1]._cbbd)
			_acgd.Write([]byte(separator))
		}
	}
}

// moveText forwards the offsets `_bdb`, `_cfa` to moveLP.
func (to *textObject) moveText(_bdb, _cfa float64) {
	to.moveLP(_bdb, _cfa)
}

// bbox returns the bounding rectangle of the table.
func (t *textTable) bbox() _bg.PdfRectangle {
	return t.PdfRectangle
}
// _efdc is the size limit of the font cache: once the cache already holds this
// many entries, getFont deletes the entry with the lowest access stamp before
// inserting a new one.
const _efdc = 10 ;
2023-01-08 22:34:27 +00:00
2023-07-28 12:14:31 +00:00
// String returns a human readable description of `ss`: the number of subpaths
// it holds and the value of its `_afge` flag.
func (ss *shapesState) String() string {
	subpathCount := len(ss._cbfc)
	return _ce.Sprintf("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d", subpathCount, ss._afge)
}
// String returns a description of `tm`: its bounding box, its `_gceb` value
// (labelled fontsize in the output) and its text.
func (tm *textMark) String() string {
	return _ce.Sprintf("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022", tm.PdfRectangle, tm._gceb, tm._ebgd)
}

// _afega returns a ruling of kind `_acgee` built from the left edge of
// `_ecgef`: it is positioned at Llx and spans from Lly to Ury.
func _afega(_ecgef _bg.PdfRectangle) *ruling {
	return &ruling{
		_eabdg: _acgee,
		_befee: _ecgef.Llx,
		_agbc:  _ecgef.Lly,
		_gffgd: _ecgef.Ury,
	}
}

// pullWord appends `_afcb` to line `l` and removes it from bin `_bgdc` of bag
// `_fgag`.
func (l *textLine) pullWord(_fgag *wordBag, _afcb *textWord, _bgdc int) {
	l.appendWord(_afcb)
	_fgag.removeWord(_afcb, _bgdc)
}

// sortStrict sorts `vecs` in place by decreasing `_eabdg`, then increasing
// `_befee` (values whose difference satisfies `_acbc` are treated as equal),
// then increasing `_agbc`, then increasing `_gffgd`.
func (vecs rulingList) sortStrict() {
	_df.Slice(vecs, func(i, j int) bool {
		a, b := vecs[i], vecs[j]
		if a._eabdg != b._eabdg {
			return a._eabdg > b._eabdg
		}
		if !_acbc(a._befee - b._befee) {
			return a._befee < b._befee
		}
		if a._agbc != b._agbc {
			return a._agbc < b._agbc
		}
		return a._gffgd < b._gffgd
	})
}

// getFont returns the font named `_ddff`.
//
// When the owner's font cache (`_dcdg._eg`) is non-nil, fonts are cached under
// the string form of their font dictionary. Each call advances the access
// counter `_dcdg._ge`; a cache hit stamps the entry with the current counter,
// and when the cache already holds `_efdc` entries the entry with the lowest
// stamp is evicted before a new one is inserted.
//
// An error is returned when the font dictionary or the font itself cannot be
// obtained.
func (to *textObject) getFont(_ddff string) (*_bg.PdfFont, error) {
	cache := to._dcdg._eg

	fontDict, err := to.getFontDict(_ddff)
	if err != nil {
		if cache != nil {
			_b.Log.Debug("\u0045\u0052\u0052OR\u003a\u0020\u0067\u0065\u0074\u0046\u006f\u006e\u0074:\u0020n\u0061m\u0065=\u0025\u0073\u002c\u0020\u0065\u0072\u0072\u006f\u0072\u003a\u0020\u0025\u0073", _ddff, err.Error())
		}
		return nil, err
	}

	if cache != nil {
		to._dcdg._ge++
		key := fontDict.String()
		if entry, hit := cache[key]; hit {
			// Map values are copies: the refreshed stamp has to be written
			// back, otherwise eviction would never see it.
			entry._edcg = to._dcdg._ge
			cache[key] = entry
			return entry._bcae, nil
		}
	}

	font, err := to.getFontDirect(_ddff)
	if err != nil {
		return nil, err
	}

	if cache != nil {
		fresh := fontEntry{font, to._dcdg._ge}
		if len(cache) >= _efdc {
			// Evict the entry with the lowest access stamp.
			names := make([]string, 0, len(cache))
			for name := range cache {
				names = append(names, name)
			}
			_df.Slice(names, func(i, j int) bool {
				return cache[names[i]]._edcg < cache[names[j]]._edcg
			})
			delete(cache, names[0])
		}
		cache[fontDict.String()] = fresh
	}
	return font, nil
}

// maxDepth returns the difference between the bag's `_dgec` value and the
// bottom edge of its bounding box.
func (b *wordBag) maxDepth() float64 {
	return b._dgec - b.Lly
}

// _dbae reports whether `_gfec` equals the first byte of any entry of the
// package list `_ebef`.
func _dbae(_gfec byte) bool {
	for _, entry := range _ebef {
		if entry[0] == _gfec {
			return true
		}
	}
	return false
}

// text returns everything writeText produces for `p`, as a string.
func (p *textPara) text() string {
	var buf _dfe.Buffer
	p.writeText(&buf)
	return buf.String()
}

// parasBBox returns the paragraphs and the bounding box of the cell.
func (c compositeCell) parasBBox() (paraList, _bg.PdfRectangle) {
	return c.paraList, c.PdfRectangle
}

// _ffcaf reports whether `_eabg` and `_adaab` differ by at most `_cfgg`.
func _ffcaf(_eabg, _adaab float64) bool {
	diff := _ef.Abs(_eabg - _adaab)
	return diff <= _cfgg
}

// _cfad scans `_bcee` in order and returns the `_cbbd` value of the last line
// that has a larger `_cbbd` than `_gadd` and whose rounded Llx is not left of
// the rounded Llx of `_gadd`. Scanning stops at the first line with a larger
// `_cbbd` that starts further left. It returns -1 when no line qualifies.
// `_ffeb` is not used.
func _cfad(_gadd *textLine, _bcee []*textLine, _ffeb []float64) float64 {
	var found float64 = -1
	for _, line := range _bcee {
		if !(line._cbbd > _gadd._cbbd) {
			continue
		}
		if !(_ef.Round(line.Llx) >= _ef.Round(_gadd.Llx)) {
			break
		}
		found = line._cbbd
	}
	return found
}
2023-03-01 18:45:57 +00:00
2023-05-29 17:26:33 +00:00
// PageImages represents extracted images on a PDF page with spatial information:
// display position and size.
2023-07-28 12:14:31 +00:00
type PageImages struct {
	// Images is the list of image marks.
	Images []ImageMark
}

// _bb is the constant 20.
// NOTE(review): its use is outside this part of the file.
const _bb = 20

// del removes `_gdda` from the set; removing a value that is not present is a
// no-op.
func (s intSet) del(_gdda int) {
	delete(s, _gdda)
}
// NewFromContents creates a new extractor from contents and page resources.
// The extractor starts with empty `_eg` and `_gbfd` maps, and the call is
// reported to the license package via TrackUse. The returned error is always
// nil.
func NewFromContents(contents string, resources *_bg.PdfPageResources) (*Extractor, error) {
	const usageKey = "\u0065x\u0074\u0072\u0061\u0063t\u006f\u0072\u002e\u004e\u0065w\u0046r\u006fm\u0043\u006f\u006e\u0074\u0065\u006e\u0074s"
	extractor := &Extractor{
		_bc:   contents,
		_gga:  resources,
		_eg:   map[string]fontEntry{},
		_gbfd: map[string]textResult{},
	}
	_gb.TrackUse(usageKey)
	return extractor, nil
}
// String returns a description of `b`: its bounding box, its `_ecdf` value
// (labelled fontsize in the output), the number of words and the `_ggaef`
// value of each word, visited in depthIndexes order.
func (b *wordBag) String() string {
	var texts []string
	for _, depthIdx := range b.depthIndexes() {
		for _, word := range b._cgdg[depthIdx] {
			texts = append(texts, word._ggaef)
		}
	}
	return _ce.Sprintf("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071", b.PdfRectangle, b._ecdf, len(texts), texts)
}
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
// These are tm: `start` <= tm.Offset + len(tm.Text) - 1 && tm.Offset < `end` where
// `start` and `end` are offsets in the extracted text.
// `start` is clamped to the offset of the first mark and `end` to one past the
// offset of the last mark. An empty `ma` is returned as is.
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
// NOTE(review): the end search rejects an end index equal to the number of
// marks, so a range that reaches the last mark appears to produce an
// "Out of range" error — confirm whether that is intended.
func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) {
	if ma == nil {
		return nil, _d.New("\u006da\u003d\u003d\u006e\u0069\u006c")
	}
	if end < start {
		return nil, _ce.Errorf("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020", start, end)
	}
	marks := ma._bca
	count := len(marks)
	if count == 0 {
		return ma, nil
	}
	first, last := marks[0], marks[count-1]
	if start < first.Offset {
		start = first.Offset
	}
	if end > last.Offset+1 {
		end = last.Offset + 1
	}

	// First mark whose last character is at or after `start`.
	iStart := _df.Search(count, func(i int) bool {
		return marks[i].Offset+len(marks[i].Text)-1 >= start
	})
	if iStart < 0 || iStart >= count {
		return nil, _ce.Errorf("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076", start, iStart, count, first, last)
	}

	// First mark that starts at or after `end`.
	iEnd := _df.Search(count, func(i int) bool {
		return marks[i].Offset > end-1
	})
	if iEnd < 0 || iEnd >= count {
		return nil, _ce.Errorf("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076", end, iEnd, count, first, last)
	}
	if iEnd <= iStart {
		return nil, _ce.Errorf("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064", start, end, iStart, iEnd)
	}
	return &TextMarkArray{_bca: marks[iStart:iEnd]}, nil
}

// rectRuling combines a ruling kind, a mark kind, a colour and a rectangle.
type rectRuling struct {
	_fbad rulingKind
	_gadb markKind
	_ag.Color
	_bg.PdfRectangle
}

// _bgaf matches either of the patterns `_dabgf` and `_bgbdd`.
var _bgaf *_gg.Regexp = _gg.MustCompile(_dabgf + "\u007c" + _bgbdd)

// subdivide expands a table of composite cells into a table of plain cells.
//
// Row and column corridors are obtained from compositeRowCorridors and
// compositeColCorridors; when either is empty the table itself is returned.
// Otherwise both are passed through `_dddaa`, `_fgeb` supplies the offset of
// every composite row/column together with the new table height/width, and
// each composite cell is split along its corridors with the resulting cells
// placed at the corresponding offsets in the new table.
func (t *textTable) subdivide() *textTable {
	t.logComposite("\u0073u\u0062\u0064\u0069\u0076\u0069\u0064e")
	rowCorridors := t.compositeRowCorridors()
	colCorridors := t.compositeColCorridors()
	if _cgafg {
		_b.Log.Info("\u0073u\u0062\u0064i\u0076\u0069\u0064\u0065:\u000a\u0009\u0072o\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s=\u0025\u0073\u000a\t\u0063\u006fl\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073=\u0025\u0073", _gdfa(rowCorridors), _gdfa(colCorridors))
	}
	if len(rowCorridors) == 0 || len(colCorridors) == 0 {
		return t
	}
	_dddaa(rowCorridors)
	_dddaa(colCorridors)
	if _cgafg {
		_b.Log.Info("\u0073\u0075\u0062\u0064\u0069\u0076\u0069\u0064\u0065\u0020\u0066\u0069\u0078\u0065\u0064\u003a\u000a\u0009r\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073\u003d\u0025\u0073\u000a\u0009\u0063\u006f\u006cC\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d\u0025\u0073", _gdfa(rowCorridors), _gdfa(colCorridors))
	}

	yOffsets, newHeight := _fgeb(t._gcbge, rowCorridors)
	xOffsets, newWidth := _fgeb(t._ddfc, colCorridors)
	cells := make(map[uint64]*textPara, newWidth*newHeight)
	result := &textTable{
		PdfRectangle: t.PdfRectangle,
		_edgac:       t._edgac,
		_gcbge:       newHeight,
		_ddfc:        newWidth,
		_efeac:       cells,
	}
	if _cgafg {
		_b.Log.Info("\u0073\u0075b\u0064\u0069\u0076\u0069\u0064\u0065\u003a\u0020\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0020\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0063\u0065\u006c\u006c\u0073\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0079\u004f\u0066\u0066\u0073\u0065\u0074\u0073=\u0025\u002b\u0076\u000a"+"\u0009\u0078\u004f\u0066\u0066\u0073\u0065\u0074\u0073\u003d\u0025\u002b\u0076", t._ddfc, t._gcbge, newWidth, newHeight, _gdfa(rowCorridors), _gdfa(colCorridors), yOffsets, xOffsets)
	}

	for row := 0; row < t._gcbge; row++ {
		y0 := yOffsets[row]
		for col := 0; col < t._ddfc; col++ {
			x0 := xOffsets[col]
			if _cgafg {
				_ce.Printf("\u0025\u0036\u0064\u002c %\u0032\u0064\u003a\u0020\u0078\u0030\u003d\u0025\u0064\u0020\u0079\u0030\u003d\u0025d\u000a", col, row, x0, y0)
			}
			composite, found := t._dadcc[_fgcce(col, row)]
			if !found {
				continue
			}
			sub := composite.split(rowCorridors[row], colCorridors[col])
			for subRow := 0; subRow < sub._gcbge; subRow++ {
				for subCol := 0; subCol < sub._ddfc; subCol++ {
					para := sub.get(subCol, subRow)
					result.put(x0+subCol, y0+subRow, para)
					if _cgafg {
						_ce.Printf("\u0025\u0038\u0064\u002c\u0020\u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a", x0+subCol, y0+subRow, para)
					}
				}
			}
		}
	}
	return result
}

// log prints a debug dump of the tiling, titled `_abadc`: the `_eaafd` (llx)
// and `_dade` (lly) lists followed by every tile stored in `_cbec` under those
// coordinates. Nothing is printed unless the package flag `_agd` is set.
func (g gridTiling) log(_abadc string) {
	if !_agd {
		return
	}
	_b.Log.Info("\u0074i\u006ci\u006e\u0067\u003a\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0071", len(g._eaafd), len(g._dade), _abadc)
	_ce.Printf("\u0020\u0020\u0020l\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a", g._eaafd)
	_ce.Printf("\u0020\u0020\u0020l\u006c\u0079\u003d\u0025\u002e\u0032\u0066\u000a", g._dade)
	for yi, lly := range g._dade {
		row, hasRow := g._cbec[lly]
		if !hasRow {
			continue
		}
		_ce.Printf("%\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u000a", yi, lly)
		for xi, llx := range g._eaafd {
			tile, hasTile := row[llx]
			if !hasTile {
				continue
			}
			_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", xi, tile.String())
		}
	}
}
// New returns an Extractor instance for extracting content from the input PDF page.
// It is shorthand for NewWithOptions(page, nil).
func New ( page * _bg . PdfPage ) ( * Extractor , error ) { return NewWithOptions ( page , nil ) } ;
2023-05-29 17:26:33 +00:00
2023-06-30 13:19:48 +00:00
// String returns a human readable description of `vecs`.
2023-07-28 12:14:31 +00:00
// String returns a human readable description of the ruling list.
//
// An empty list yields a fixed "empty" marker. If either group returned by
// vertsHorzs is empty only the two group sizes are printed; otherwise the
// sizes are followed by a rectangle assembled from the first and last
// `_befee` values of each group.
func (rl rulingList) String() string {
	if len(rl) == 0 {
		return "\u007b \u0045\u004d\u0050\u0054\u0059\u0020}"
	}
	verts, horzs := rl.vertsHorzs()
	nVerts, nHorzs := len(verts), len(horzs)
	if nVerts == 0 || nHorzs == 0 {
		return _ce.Sprintf("\u007b%\u0064\u0020\u0078\u0020\u0025\u0064}", nVerts, nHorzs)
	}
	rect := _bg.PdfRectangle{
		Llx: verts[0]._befee,
		Urx: verts[nVerts-1]._befee,
		Lly: horzs[nHorzs-1]._befee,
		Ury: horzs[0]._befee,
	}
	return _ce.Sprintf("\u007b\u0025d\u0020\u0078\u0020%\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u007d", nVerts, nHorzs, rect)
}

// updateBBox folds the rectangle of every para in the cell into the cell's own
// rectangle using _egbga.
func (cc *compositeCell) updateBBox() {
	for _, para := range cc.paraList {
		cc.PdfRectangle = _egbga(cc.PdfRectangle, para.PdfRectangle)
	}
}

// _dceg sorts `_acga` in place by `_cbbd` and then groups the lines in a map
// keyed by the rounded value of _cbba(line).
func _dceg(_acga []*textLine) map[float64][]*textLine {
	_df.Slice(_acga, func(i, j int) bool { return _acga[i]._cbbd < _acga[j]._cbbd })
	groups := map[float64][]*textLine{}
	for _, line := range _acga {
		key := _ef.Round(_cbba(line))
		groups[key] = append(groups[key], line)
	}
	return groups
}

// split breaks the composite cell into a textTable with len(_ceede)+1 columns
// and len(_fbaa)+1 rows.
//
// `_fbaa` holds the row corridors and `_ceede` the column corridors (per the
// debug output below). Both are first passed through _gfeg together with the
// cell's own bounds. Each text line of the cell is then assigned to the first
// grid rectangle for which _gfea reports a match, and every non-empty
// rectangle becomes one table cell built by _gdae.
//
// Note: the cell's paraList is sorted in place.
func (cc compositeCell) split(_fbaa, _ceede []float64) *textTable {
	h := len(_fbaa) + 1
	w := len(_ceede) + 1
	if _cgafg {
		_b.Log.Info("\u0063\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0043\u0065l\u006c\u002e\u0073\u0070l\u0069\u0074\u003a\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0009\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025\u0073\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073=\u0025\u0036\u002e\u0032\u0066\u000a\t\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d%\u0036\u002e\u0032\u0066", w, h, cc, _fbaa, _ceede)
		_ce.Printf("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073\u000a", len(cc.paraList))
		for i, para := range cc.paraList {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, para.String())
		}
		_ce.Printf("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a", len(cc.lines()))
		for i, line := range cc.lines() {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, line)
		}
	}

	_fbaa = _gfeg(_fbaa, cc.Ury, cc.Lly)
	_ceede = _gfeg(_ceede, cc.Llx, cc.Urx)

	cells := make(map[uint64]*textPara, w*h)
	table := textTable{_ddfc: w, _gcbge: h, _efeac: cells}

	paras := cc.paraList
	_df.Slice(paras, func(i, j int) bool {
		a, b := paras[i], paras[j]
		ya, yb := a.Lly, b.Lly
		if ya != yb {
			return ya < yb
		}
		return a.Llx < b.Llx
	})

	// One rectangle per pair of consecutive corridor values.
	rects := make(map[uint64]_bg.PdfRectangle, w*h)
	for y, y1 := range _fbaa[1:] {
		y0 := _fbaa[y]
		for x, x1 := range _ceede[1:] {
			x0 := _ceede[x]
			rects[_fgcce(x, y)] = _bg.PdfRectangle{Llx: x0, Urx: x1, Lly: y1, Ury: y0}
		}
	}

	if _cgafg {
		_b.Log.Info("\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0043\u0065l\u006c\u002e\u0073\u0070\u006c\u0069\u0074\u003a\u0020\u0072e\u0063\u0074\u0073")
		_ce.Printf("\u0020\u0020\u0020\u0020")
		for x := 0; x < w; x++ {
			_ce.Printf("\u0025\u0033\u0030\u0064\u002c\u0020", x)
		}
		_ce.Println()
		for y := 0; y < h; y++ {
			_ce.Printf("\u0020\u0020\u0025\u0032\u0064\u003a", y)
			for x := 0; x < w; x++ {
				_ce.Printf("\u00256\u002e\u0032\u0066\u002c\u0020", rects[_fgcce(x, y)])
			}
			_ce.Println()
		}
	}

	// locate returns the grid position of the first rectangle matching `line`,
	// or (-1, -1) when none matches.
	locate := func(line *textLine) (int, int) {
		for y := 0; y < h; y++ {
			for x := 0; x < w; x++ {
				if _gfea(rects[_fgcce(x, y)], line.PdfRectangle) {
					return x, y
				}
			}
		}
		return -1, -1
	}

	cellLines := make(map[uint64][]*textLine, w*h)
	for _, line := range paras.lines() {
		x, y := locate(line)
		if x < 0 {
			continue
		}
		cellLines[_fgcce(x, y)] = append(cellLines[_fgcce(x, y)], line)
	}

	for y := 0; y < len(_fbaa)-1; y++ {
		y0 := _fbaa[y]
		y1 := _fbaa[y+1]
		for x := 0; x < len(_ceede)-1; x++ {
			x0 := _ceede[x]
			x1 := _ceede[x+1]
			rect := _bg.PdfRectangle{Llx: x0, Urx: x1, Lly: y1, Ury: y0}
			lines := cellLines[_fgcce(x, y)]
			if len(lines) == 0 {
				continue
			}
			table.put(x, y, _gdae(rect, lines))
		}
	}
	return &table
}

// absorb pulls every word of `_fddg` into the receiver via pullWord and then
// applies the collected removals to `_fddg`.
func (wb *wordBag) absorb(_fddg *wordBag) {
	removals := _fddg.makeRemovals()
	for depthIdx, words := range _fddg._cgdg {
		for _, word := range words {
			wb.pullWord(word, depthIdx, removals)
		}
	}
	_fddg.applyRemovals(removals)
}

// writeText writes the text of every para whose `_cfga` flag is unset to
// `_ebce`. Between consecutive entries it writes a single space when _bbgff
// reports true for the pair and two newlines otherwise; the output always ends
// with two newlines. Write errors are not checked (best effort).
func (pl paraList) writeText(_ebce _ga.Writer) {
	for i, para := range pl {
		if para._cfga {
			continue
		}
		para.writeText(_ebce)
		if i != len(pl)-1 {
			if _bbgff(para, pl[i+1]) {
				_ebce.Write([]byte("\u0020"))
			} else {
				_ebce.Write([]byte("\u000a"))
				_ebce.Write([]byte("\u000a"))
			}
		}
	}
	_ebce.Write([]byte("\u000a"))
	_ebce.Write([]byte("\u000a"))
}

// removeWord removes `_bged` from the bucket stored under key `_afdf`, deleting
// the bucket altogether once it becomes empty.
func (wb *wordBag) removeWord(_bged *textWord, _afdf int) {
	words := _befef(wb._cgdg[_afdf], _bged)
	if len(words) == 0 {
		delete(wb._cgdg, _afdf)
	} else {
		wb._cgdg[_afdf] = words
	}
}

// empty reports whether the bag has no bucket for key `_faad`.
func (wb *wordBag) empty(_faad int) bool {
	_, found := wb._cgdg[_faad]
	return !found
}

// cubicTo handles a cubic curve segment. Only the final point (`_cdee`,
// `_aaad`) is recorded; the first four coordinates are ignored.
func (ss *shapesState) cubicTo(_faef, _cfed, _caeg, _efff, _cdee, _aaad float64) {
	if _bdaae {
		_b.Log.Info("\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a")
	}
	ss.addPoint(_cdee, _aaad)
}

// establishSubpath returns the current (last) subpath. A new subpath seeded
// with the point from lastpointEstablished is appended first when that call
// returns false as its second value. Returns nil when there is no subpath.
func (ss *shapesState) establishSubpath() *subpath {
	pt, established := ss.lastpointEstablished()
	if !established {
		ss._cbfc = append(ss._cbfc, _egbg(pt))
	}
	if len(ss._cbfc) == 0 {
		return nil
	}
	ss._afge = false
	return ss._cbfc[len(ss._cbfc)-1]
}

// push appends a copy of `_cfdc` to the stack, so later changes to the
// caller's state do not affect the stored entry.
func (stk *stateStack) push(_cfdc *textState) {
	saved := *_cfdc
	*stk = append(*stk, &saved)
}

// _aaeg composites `_dded` through the mask `_egede` and returns the result.
//
// The output canvas is the larger of the two sizes in each dimension. Whichever
// input does not already have that size is bilinearly scaled up to it before
// _ec.DrawMask is applied.
func _aaeg(_dded, _egede _fa.Image) _fa.Image {
	maskSize, imgSize := _egede.Bounds().Size(), _dded.Bounds().Size()
	width, height := maskSize.X, maskSize.Y
	if imgSize.X > width {
		width = imgSize.X
	}
	if imgSize.Y > height {
		height = imgSize.Y
	}
	canvas := _fa.Rect(0, 0, width, height)

	if maskSize.X != width || maskSize.Y != height {
		scaledMask := _fa.NewRGBA(canvas)
		// Fix: scale the mask itself. The original passed the image as the
		// source together with the mask's bounds, which replaced the mask with
		// image pixels.
		_ec.BiLinear.Scale(scaledMask, canvas, _egede, _egede.Bounds(), _ec.Over, nil)
		_egede = scaledMask
	}
	if imgSize.X != width || imgSize.Y != height {
		scaledImg := _fa.NewRGBA(canvas)
		_ec.BiLinear.Scale(scaledImg, canvas, _dded, _dded.Bounds(), _ec.Over, nil)
		_dded = scaledImg
	}

	out := _fa.NewRGBA(canvas)
	_ec.DrawMask(out, canvas, _dded, _fa.Point{}, _egede, _fa.Point{}, _ec.Over)
	return out
}

// _cgae returns _ggdg(|dy|, |dx|) for the two points.
func _cgae(_gfgc, _efbaef _gab.Point) bool {
	dx := _ef.Abs(_gfgc.X - _efbaef.X)
	dy := _ef.Abs(_gfgc.Y - _efbaef.Y)
	return _ggdg(dy, dx)
}
// Append appends `mark` to the mark array.
func (ma *TextMarkArray) Append(mark TextMark) {
	ma._bca = append(ma._bca, mark)
}

// _fcfg returns a new wordBag that contains only `word`, stored under the key
// _ebfc(word._baebb). The bag takes its rectangle and `_ecdf` value from the
// word; the remaining arguments are stored unchanged.
func _fcfg(word *textWord, _gacdd float64, _caff, _ceea rulingList) *wordBag {
	return &wordBag{
		_cgdg:        map[int][]*textWord{_ebfc(word._baebb): {word}},
		PdfRectangle: word.PdfRectangle,
		_ecdf:        word._ebgb,
		_dgec:        _gacdd,
		_debag:       _caff,
		_gfe:         _ceea,
	}
}

// _eeg returns the text lines of the first child of `node` whose tag is one of
// the recognised kinds. For an "LBody" child without lines of its own the
// search recurses into that child. Returns nil when no child matches.
func _eeg(node *list) []*textLine {
	for _, child := range node._cdfc {
		switch child._ebed {
		case "\u004c\u0042\u006fd\u0079":
			if len(child._ecdee) == 0 {
				return _eeg(child)
			}
			return child._ecdee
		case "\u0053\u0070\u0061\u006e", "I\u006e\u006c\u0069\u006e\u0065\u0053\u0068\u0061\u0070\u0065":
			return child._ecdee
		}
	}
	return nil
}

// String returns a description of the cell: its rectangle, the number of paras
// and the merged para text shortened by _dfcggd with a limit of 50.
func (cc compositeCell) String() string {
	var text string
	if len(cc.paraList) > 0 {
		text = _dfcggd(cc.paraList.merge().text(), 50)
	}
	return _ce.Sprintf("\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0070\u0061\u0072a\u0073\u0020\u0025\u0071", cc.PdfRectangle, len(cc.paraList), text)
}

// splitSec partitions the rulings into groups in which every member satisfies
// alignsSec with at least one member added to the group before it.
//
// The receiver is sorted in place by (`_agbc`, `_gffgd`) first. The list must
// not be empty: the first element is accessed unconditionally.
func (rl rulingList) splitSec() []rulingList {
	_df.Slice(rl, func(i, j int) bool {
		a, b := rl[i], rl[j]
		if a._agbc == b._agbc {
			return a._gffgd < b._gffgd
		}
		return a._agbc < b._agbc
	})

	used := make(map[*ruling]struct{}, len(rl))

	// grow collects `seed` plus every unused ruling that aligns with a ruling
	// already in the group.
	grow := func(seed *ruling) rulingList {
		group := rulingList{seed}
		used[seed] = struct{}{}
		for _, candidate := range rl {
			if _, taken := used[candidate]; taken {
				continue
			}
			for _, member := range group {
				if candidate.alignsSec(member) {
					group = append(group, candidate)
					used[candidate] = struct{}{}
					break
				}
			}
		}
		return group
	}

	groups := []rulingList{grow(rl[0])}
	for _, candidate := range rl[1:] {
		if _, taken := used[candidate]; !taken {
			groups = append(groups, grow(candidate))
		}
	}
	return groups
}

// moveTextSetLeading sets the state's `_ced` value to -`_gfa` and then moves
// the line position by (`_fdeb`, `_gfa`).
func (to *textObject) moveTextSetLeading(_fdeb, _gfa float64) {
	to._gacd._ced = -_gfa
	to.moveLP(_fdeb, _gfa)
}

// applyTables returns a new list holding one table para per entry of `_edea`
// followed by every para of the receiver whose `_abeg` flag is unset.
func (pl paraList) applyTables(_edea []*textTable) paraList {
	var out paraList
	for _, table := range _edea {
		out = append(out, table.newTablePara())
	}
	for _, para := range pl {
		if !para._abeg {
			out = append(out, para)
		}
	}
	return out
}

// _caed returns a ruling of kind _acgee positioned at the rectangle's Urx and
// extending from its Lly to its Ury.
func _caed(_ggfe _bg.PdfRectangle) *ruling {
	r := ruling{
		_eabdg: _acgee,
		_befee: _ggfe.Urx,
		_agbc:  _ggfe.Lly,
		_gffgd: _ggfe.Ury,
	}
	return &r
}

// list is a node in a tree of list elements.
type list struct {
	_ecdee []*textLine // text lines attached to this node
	_ebed  string      // tag of the node; compared against "LBody", "Span", ... by _eeg
	_cdfc  []*list     // child nodes
	_bfcg  string
}

// imageExtractContext holds the state of an image extraction run.
type imageExtractContext struct {
	_dae  []ImageMark
	_egfe int
	_edc  int
	_ddc  int
	_agf  map[*_ea.PdfObjectStream]*cachedImage
	_fae  *ImageExtractOptions
	_gea  bool
}

// stateStack is a stack of text states.
type stateStack []*textState
// ToTextMark returns the public view of `tm`.
func ( _fcfa * textMark ) ToTextMark ( ) TextMark { return TextMark { Text : _fcfa . _ebgd , Original : _fcfa . _ffbg , BBox : _fcfa . _bcfd , Font : _fcfa . _ecbeg , FontSize : _fcfa . _gceb , FillColor : _fcfa . _bdaff , StrokeColor : _fcfa . _bfdb , Orientation : _fcfa . _acec , DirectObject : _fcfa . _dcbd , ObjString : _fcfa . _babd , Tw : _fcfa . Tw , Th : _fcfa . Th , Tc : _fcfa . _abac , Index : _fcfa . _fbcc } ;
2023-06-30 13:19:48 +00:00
} ;
2023-05-29 17:26:33 +00:00
2023-07-28 12:14:31 +00:00
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
// On error the text is empty and the counts from ExtractPageText are passed through.
func (e *Extractor) ExtractTextWithStats() (_ace string, _dec int, _ffc int, _afb error) {
	pageText, _dec, _ffc, _afb := e.ExtractPageText()
	if _afb == nil {
		return pageText.Text(), _dec, _ffc, nil
	}
	return "", _dec, _ffc, _afb
}

// _bgabc walks the children of `node` and returns the list items found.
//
//   - An "LI" child becomes one item built by _facb from its lines (see _eeg)
//     and its own nested items; the item's `_bfcg` is set from _dcdgd.
//   - An "LBody" child ends the walk with the items found inside it.
//   - An "L" child ends the walk with the items collected so far plus the
//     items found inside it.
func _bgabc(node *list) []*list {
	var items []*list
	for _, child := range node._cdfc {
		switch child._ebed {
		case "\u004c\u0049":
			lines := _eeg(child)
			nested := _bgabc(child)
			item := _facb(lines, "\u0062\u0075\u006c\u006c\u0065\u0074", nested)
			item._bfcg = _dcdgd(lines, "")
			items = append(items, item)
		case "\u004c\u0042\u006fd\u0079":
			return _bgabc(child)
		case "\u004c":
			return append(items, _bgabc(child)...)
		}
	}
	return items
}

// _fgdeg returns _ggdg(|dx|, |dy|) for the two points.
func _fgdeg(_ggbd, _dcded _gab.Point) bool {
	dx := _ef.Abs(_ggbd.X - _dcded.X)
	dy := _ef.Abs(_ggbd.Y - _dcded.Y)
	return _ggdg(dx, dy)
}

// _aedc returns a textObject wired to the given extractor, resources, graphics
// state, text state and state stack, with both matrices set to identity.
func _aedc(_bcbb *Extractor, _dgac *_bg.PdfPageResources, _aace _fb.GraphicsState, _fgf *textState, _dcfd *stateStack) *textObject {
	obj := textObject{
		_dcdg: _bcbb,
		_edef: _dgac,
		_agbf: _aace,
		_bffa: _dcfd,
		_gacd: _fgf,
		_fda:  _gab.IdentityMatrix(),
		_cfec: _gab.IdentityMatrix(),
	}
	return &obj
}

// _gdc logs the number of ruling lists under the title `_gbbbf` and prints one
// line per list.
func _gdc(_gbbbf string, _acbdfb []rulingList) {
	_b.Log.Info("\u0024\u0024 \u0025\u0064\u0020g\u0072\u0069\u0064\u0073\u0020\u002d\u0020\u0025\u0073", len(_acbdfb), _gbbbf)
	for i := range _acbdfb {
		_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, _acbdfb[i].String())
	}
}

// mergePrimary returns the arithmetic mean of the `_befee` values of the
// rulings. The list must not be empty.
func (rl rulingList) mergePrimary() float64 {
	sum := rl[0]._befee
	for i := 1; i < len(rl); i++ {
		sum += rl[i]._befee
	}
	return sum / float64(len(rl))
}
// ApplyArea processes the page text only within the specified area `bbox`.
// Each time ApplyArea is called, it updates the result set in `pt`.
// Can be called multiple times in a row with different bounding boxes.
//
// Marks are selected with _geec, grouped by orientation (0, 90, 180, 270) and
// converted to paras per orientation; the text, text marks and tables of `pt`
// are then replaced with the result.
func (pt *PageText) ApplyArea(bbox _bg.PdfRectangle) {
	selected := make([]*textMark, 0, len(pt._ccf))
	for _, mark := range pt._ccf {
		if _geec(mark.bbox(), bbox) {
			selected = append(selected, mark)
		}
	}

	var paras paraList
	remaining := len(selected)
	// Stop early once every selected mark has been consumed.
	for orient := 0; orient < 360 && remaining > 0; orient += 90 {
		oriented := make([]*textMark, 0, len(selected)-remaining)
		for _, mark := range selected {
			if mark._acec == orient {
				oriented = append(oriented, mark)
			}
		}
		if len(oriented) == 0 {
			continue
		}
		paras = append(paras, _fgad(oriented, pt._ebbd, nil, nil, pt._fca._bbg)...)
		remaining -= len(oriented)
	}

	buf := new(_dfe.Buffer)
	paras.writeText(buf)
	pt._bdf = buf.String()
	pt._fccf = paras.toTextMarks()
	pt._ecege = paras.tables()
}

// tables returns the exported form of every para table that is non-nil and
// reports itself as exportable.
func (pl paraList) tables() []TextTable {
	var out []TextTable
	if _cgafg {
		_b.Log.Info("\u0070\u0061\u0072\u0061\u0073\u002e\u0074\u0061\u0062\u006c\u0065\u0073\u003a")
	}
	for _, para := range pl {
		if table := para._bgba; table != nil && table.isExportable() {
			out = append(out, table.toTextTable())
		}
	}
	return out
}

// compositeColCorridors returns a map with one nil entry for each column index
// in [0, `_ddfc`).
func (tt *textTable) compositeColCorridors() map[int][]float64 {
	corridors := make(map[int][]float64, tt._ddfc)
	if _cgafg {
		_b.Log.Info("\u0063\u006f\u006d\u0070o\u0073\u0069\u0074\u0065\u0043\u006f\u006c\u0043\u006f\u0072r\u0069d\u006f\u0072\u0073\u003a\u0020\u0077\u003d%\u0064\u0020", tt._ddfc)
	}
	for x := 0; x < tt._ddfc; x++ {
		corridors[x] = nil
	}
	return corridors
}

// _acbc reports whether the magnitude of `_egfeb` is below the tolerance _efea.
func _acbc(_egfeb float64) bool {
	magnitude := _ef.Abs(_egfeb)
	return magnitude < _efea
}
2023-05-29 17:26:33 +00:00
2023-06-30 13:19:48 +00:00
// Text returns the extracted page text.
2023-07-28 12:14:31 +00:00
// Text returns the extracted page text.
func (pt PageText) Text() string {
	return pt._bdf
}

// getCurrentFont returns the font stored in the text state, falling back to
// the model's default font (with a debug log entry) when none is set.
func (to *textObject) getCurrentFont() *_bg.PdfFont {
	if font := to._gacd._dgdf; font != nil {
		return font
	}
	_b.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e")
	return _bg.DefaultFont()
}

// logComposite prints two debug grids for the table's composite cells: first
// the number of paras in each cell, then the start of each cell's merged text.
// It does nothing unless the _cgafg debug flag is set.
func (tt *textTable) logComposite(_gcfa string) {
	if !_cgafg {
		return
	}
	w, h := tt._ddfc, tt._gcbge

	// Grid 1: para counts.
	_b.Log.Info("\u007e~\u007eP\u0061\u0072\u0061\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073", w, h, _gcfa)
	_ce.Printf("\u0025\u0035\u0073 \u007c", "")
	for x := 0; x < w; x++ {
		_ce.Printf("\u0025\u0033\u0064 \u007c", x)
	}
	_ce.Println("")
	_ce.Printf("\u0025\u0035\u0073 \u002b", "")
	for x := 0; x < w; x++ {
		_ce.Printf("\u0025\u0033\u0073 \u002b", "\u002d\u002d\u002d")
	}
	_ce.Println("")
	for y := 0; y < h; y++ {
		_ce.Printf("\u0025\u0035\u0064 \u007c", y)
		for x := 0; x < w; x++ {
			paras, _ := tt._dadcc[_fgcce(x, y)].parasBBox()
			_ce.Printf("\u0025\u0033\u0064 \u007c", len(paras))
		}
		_ce.Println("")
	}

	// Grid 2: cell text, shortened by _dfcggd with a limit of 12.
	_b.Log.Info("\u007e~\u007eT\u0065\u0078\u0074\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073", w, h, _gcfa)
	_ce.Printf("\u0025\u0035\u0073 \u007c", "")
	for x := 0; x < w; x++ {
		_ce.Printf("\u0025\u0031\u0032\u0064\u0020\u007c", x)
	}
	_ce.Println("")
	_ce.Printf("\u0025\u0035\u0073 \u002b", "")
	for x := 0; x < w; x++ {
		_ce.Print("\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d-\u002d\u002d\u002d\u002b")
	}
	_ce.Println("")
	for y := 0; y < h; y++ {
		_ce.Printf("\u0025\u0035\u0064 \u007c", y)
		for x := 0; x < w; x++ {
			paras, _ := tt._dadcc[_fgcce(x, y)].parasBBox()
			text := ""
			if merged := paras.merge(); merged != nil {
				text = merged.text()
			}
			// %q adds surrounding quotes; strip them again after escaping.
			quoted := _ce.Sprintf("\u0025\u0071", _dfcggd(text, 12))
			_ce.Printf("\u0025\u0031\u0032\u0073\u0020\u007c", quoted[1:len(quoted)-1])
		}
		_ce.Println("")
	}
}

// readBefore reports whether para `_fffc` should be read before para `_cddea`.
//
// It returns true when _fffd holds for the pair and the first para has the
// larger Lly. Otherwise the first para's `_gfbgd` must end (Urx) before the
// second's begins (Llx), and no other para returned by llyRange for the Lly
// range spanned by the two may overlap, in x, the interval
// [max of the two Llx, min of the two Urx].
func (pl paraList) readBefore(_fdbc []int, _fffc, _cddea int) bool {
	a, b := pl[_fffc], pl[_cddea]
	if _fffd(a, b) && a.Lly > b.Lly {
		return true
	}
	if !(a._gfbgd.Urx < b._gfbgd.Llx) {
		return false
	}

	lo, hi := a.Lly, b.Lly
	if lo > hi {
		hi, lo = lo, hi
	}
	left := _ef.Max(a._gfbgd.Llx, b._gfbgd.Llx)
	right := _ef.Min(a._gfbgd.Urx, b._gfbgd.Urx)

	for _, k := range pl.llyRange(_fdbc, lo, hi) {
		if k == _fffc || k == _cddea {
			continue
		}
		other := pl[k]
		if other._gfbgd.Llx <= right && left <= other._gfbgd.Urx {
			return false
		}
	}
	return true
}

// getCellInfo returns {{row}, {col}} for the first cell whose marks contain
// `_gcgf`, or nil when no cell contains it.
func (t TextTable) getCellInfo(_gcgf TextMark) [][]int {
	for row, cells := range t.Cells {
		for col, cell := range cells {
			marks := &cell.Marks
			if !marks.exists(_gcgf) {
				continue
			}
			return [][]int{{row}, {col}}
		}
	}
	return nil
}

// setTextLeading stores `_cgafd` in the text state's `_ced` field. It is a
// no-op on a nil receiver.
func (to *textObject) setTextLeading(_cgafd float64) {
	if to != nil {
		to._gacd._ced = _cgafd
	}
}

// fontsize returns the `_bfbb` value of the para's first `_gfbb` entry.
func (p *textPara) fontsize() float64 {
	first := p._gfbb[0]
	return first._bfbb
}
// _aefc classifies a rectangle by its longer side: _cefaa when it is wider
// than tall and at least _gbca wide, _acgee when it is at least as tall as
// wide and at least _gbca tall, and _bgbdg otherwise.
func _aefc(_cafeb _bg.PdfRectangle) rulingKind {
	width, height := _cafeb.Width(), _cafeb.Height()
	if width > height {
		if width >= _gbca {
			return _cefaa
		}
		return _bgbdg
	}
	if height >= _gbca {
		return _acgee
	}
	return _bgbdg
}

// alignsSec reports whether the [`_agbc`, `_gffgd`] ranges of the two rulings
// overlap, allowing a slack of _bddeb + 1.0.
func (r *ruling) alignsSec(_edbd *ruling) bool {
	const slack = _bddeb + 1.0
	if r._agbc-slack > _edbd._gffgd {
		return false
	}
	return _edbd._agbc-slack <= r._gffgd
}

// complete reports whether every tile in the tiling is complete.
func (gt gridTiling) complete() bool {
	for _, row := range gt._cbec {
		for _, tile := range row {
			if tile.complete() {
				continue
			}
			return false
		}
	}
	return true
}

// size returns the number of states on the stack.
func (stk *stateStack) size() int {
	states := *stk
	return len(states)
}

// bbox returns the smallest rectangle containing every point of every subpath
// in the section. The first subpath must contain at least one point.
func (ps pathSection) bbox() _bg.PdfRectangle {
	first := ps._dgfc[0]._fbcgf[0]
	rect := _bg.PdfRectangle{Llx: first.X, Urx: first.X, Lly: first.Y, Ury: first.Y}

	extend := func(p _gab.Point) {
		switch {
		case p.X < rect.Llx:
			rect.Llx = p.X
		case p.X > rect.Urx:
			rect.Urx = p.X
		}
		switch {
		case p.Y < rect.Lly:
			rect.Lly = p.Y
		case p.Y > rect.Ury:
			rect.Ury = p.Y
		}
	}

	for i, path := range ps._dgfc {
		points := path._fbcgf
		if i == 0 {
			// The first point already seeded the rectangle.
			points = points[1:]
		}
		for _, p := range points {
			extend(p)
		}
	}
	return rect
}

// _gefeb matches a whole string consisting of optional whitespace, then either
// digits with an optional trailing '.' or a run of the letters I, i, v, then
// optional whitespace and an optional ')'.
var _gefeb = _gg.MustCompile("\u005e\u005c\u0073\u002a\u0028\u005c\u0064\u002b\u005c\u002e\u003f|\u005b\u0049\u0069\u0076\u005d\u002b\u0029\u005c\u0073\u002a\\\u0029\u003f\u0024")

// xMean returns the mean of the X coordinates of the line's two end points.
func (lr lineRuling) xMean() float64 {
	sum := lr._egaf.X + lr._eaebf.X
	return 0.5 * sum
}
// shapesState tracks the subpaths built up while processing path operators.
type shapesState struct {
	_dfgb  _gab.Matrix
	_afee  _gab.Matrix
	_cbfc  []*subpath // subpaths collected so far
	_afge  bool
	_bfd   _gab.Point
	_eeadb *textObject // used to look up the current stroke colour
}

// _eaca reports whether the two points have identical coordinates.
func _eaca(_eeab, _ecbef _gab.Point) bool {
	if _eeab.X != _ecbef.X {
		return false
	}
	return _eeab.Y == _ecbef.Y
}

// blocked reports whether one of the bag's rulings separates `_fada` from the
// bag.
//
// When the word lies entirely to one side of the bag horizontally, the facing
// edges of the two rectangles are tested against the bag's `_debag` rulings;
// when it lies entirely above or below, the facing edges are tested against
// the `_gfe` rulings.
func (wb *wordBag) blocked(_fada *textWord) bool {
	switch {
	case _fada.Urx < wb.Llx:
		if wb._debag.blocks(_caed(_fada.PdfRectangle), _afega(wb.PdfRectangle)) {
			if _geg {
				_b.Log.Info("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0078\u003a\u0020\u0025\u0073\u0020\u0025\u0073", _fada, wb)
			}
			return true
		}
	case wb.Urx < _fada.Llx:
		if wb._debag.blocks(_caed(wb.PdfRectangle), _afega(_fada.PdfRectangle)) {
			if _geg {
				_b.Log.Info("b\u006co\u0063\u006b\u0065\u0064\u0020\u0078\u2192\u0020:\u0020\u0025\u0073\u0020%s", _fada, wb)
			}
			return true
		}
	}

	switch {
	case _fada.Ury < wb.Lly:
		if wb._gfe.blocks(_aage(_fada.PdfRectangle), _cfaf(wb.PdfRectangle)) {
			if _geg {
				_b.Log.Info("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0079\u003a\u0020\u0025\u0073\u0020\u0025\u0073", _fada, wb)
			}
			return true
		}
	case wb.Ury < _fada.Lly:
		if wb._gfe.blocks(_aage(wb.PdfRectangle), _cfaf(_fada.PdfRectangle)) {
			if _geg {
				_b.Log.Info("b\u006co\u0063\u006b\u0065\u0064\u0020\u0079\u2192\u0020:\u0020\u0025\u0073\u0020%s", _fada, wb)
			}
			return true
		}
	}
	return false
}

// _fdcc returns the difference _ddcc(_ccd) - _ddcc(_afba).
func _fdcc(_ccd, _afba bounded) float64 {
	first := _ddcc(_ccd)
	second := _ddcc(_afba)
	return first - second
}
// alignsPrimary reports whether both rulings are of the same kind and their
// `_befee` values differ by less than half of _bddeb.
func (r *ruling) alignsPrimary(_gdbg *ruling) bool {
	if r._eabdg != _gdbg._eabdg {
		return false
	}
	return _ef.Abs(r._befee-_gdbg._befee) < _bddeb*0.5
}
2023-06-30 13:19:48 +00:00
2023-07-28 12:14:31 +00:00
// String returns a description of `w`: its `_baebb` value, rectangle, font
// size and quoted text.
func (w *textWord) String() string {
	const format = "\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022"
	return _ce.Sprintf(format, w._baebb, w.PdfRectangle, w._ebgb, w._ggaef)
}

// endsInHyphen reports whether the line ends in a hyphen.
//
// The last rune of the last word must be in the Unicode Hyphen range. The
// final decision is then delegated to _ggff: on the last word's text alone
// when its `_gagaf` flag is set and that check succeeds, otherwise on the text
// of the whole line.
func (l *textLine) endsInHyphen() bool {
	lastWord := l._aafd[len(l._aafd)-1]
	text := lastWord._ggaef
	r, size := _a.DecodeLastRuneInString(text)
	if size <= 0 {
		return false
	}
	if !_f.Is(_f.Hyphen, r) {
		return false
	}
	if lastWord._gagaf && _ggff(text) {
		return true
	}
	return _ggff(l.text())
}

// snapToGroups applies snapToGroupsDirection separately to each of the two
// groups returned by vertsHorzs and returns their concatenation.
func (rl rulingList) snapToGroups() rulingList {
	verts, horzs := rl.vertsHorzs()
	if len(verts) > 0 {
		verts = verts.snapToGroupsDirection()
	}
	if len(horzs) > 0 {
		horzs = horzs.snapToGroupsDirection()
	}
	snapped := append(verts, horzs...)
	snapped.log("\u0073\u006e\u0061p\u0054\u006f\u0047\u0072\u006f\u0075\u0070\u0073")
	return snapped
}

// stroke appends the current subpaths, tagged with the current stroke colour,
// to `*_cabe` as a new path section.
func (ss *shapesState) stroke(_cabe *[]pathSection) {
	section := pathSection{_dgfc: ss._cbfc, Color: ss._eeadb.getStrokeColor()}
	*_cabe = append(*_cabe, section)
	if !_bccgb {
		return
	}
	_ce.Printf("\u0020 \u0020\u0020S\u0054\u0052\u004fK\u0045\u003a\u0020\u0025\u0064\u0020\u0073t\u0072\u006f\u006b\u0065\u0073\u0020s\u0073\u003d\u0025\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d%\u002b\u0076\u0020\u0025\u0036\u002e\u0032\u0066\u000a", len(*_cabe), ss, ss._eeadb.getStrokeColor(), section.bbox())
	if !_fegd {
		return
	}
	// Dump at most the first eleven subpaths.
	for i, path := range ss._cbfc {
		_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", i, path)
		if i == 10 {
			break
		}
	}
}
2023-06-30 13:19:48 +00:00
2023-07-28 12:14:31 +00:00
// String returns a description of `k`: its name from the _daed table, or a
// "not a mark" message carrying the numeric value when `k` is not listed.
func (k markKind) String() string {
	if name, known := _daed[k]; known {
		return name
	}
	return _ce.Sprintf("\u004e\u006f\u0074\u0020\u0061\u0020\u006d\u0061\u0072k\u003a\u0020\u0025\u0064", k)
}

// put stores `_eacg` in the table cell addressed by (`_aaagb`, `_bcafa`).
func (tt *textTable) put(_aaagb, _bcafa int, _eacg *textPara) {
	key := _fgcce(_aaagb, _bcafa)
	tt._efeac[key] = _eacg
}
2023-06-30 13:19:48 +00:00
// String returns a description of `v`.
2023-07-28 12:14:31 +00:00
// String returns a description of the ruling: its kind, primary coordinate,
// secondary range and length, followed by `_gggfe`, the colour and, when
// non-zero, the width. The axis labels are swapped for rulings of kind _cefaa.
func (r *ruling) String() string {
	if r._eabdg == _bgbdg {
		return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047"
	}
	primaryAxis, secondaryAxis := "\u0078", "\u0079"
	if r._eabdg == _cefaa {
		primaryAxis, secondaryAxis = "\u0079", "\u0078"
	}
	var width string
	if r._fadae != 0.0 {
		width = _ce.Sprintf(" \u0077\u0069\u0064\u0074\u0068\u003d\u0025\u002e\u0032\u0066", r._fadae)
	}
	return _ce.Sprintf("\u0025\u00310\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0020\u0025\u0076\u0025\u0073", r._eabdg, primaryAxis, r._befee, secondaryAxis, r._agbc, r._gffgd, r._gffgd-r._agbc, r._gggfe, r.Color, width)
}

// lastpointEstablished returns a point and a flag. The flag is false when a
// point is available: the stored `_bfd` if `_afge` is set, else the last point
// of the final subpath if that subpath's `_bbdg` flag is set. Otherwise it
// returns the zero point and true.
func (ss *shapesState) lastpointEstablished() (_gab.Point, bool) {
	if ss._afge {
		return ss._bfd, false
	}
	if n := len(ss._cbfc); n > 0 && ss._cbfc[n-1]._bbdg {
		return ss._cbfc[n-1].last(), false
	}
	return _gab.Point{}, true
}

// _cfaf returns a ruling of kind _cefaa positioned at the rectangle's Lly and
// extending from its Llx to its Urx.
func _cfaf(_dbede _bg.PdfRectangle) *ruling {
	r := ruling{
		_eabdg: _cefaa,
		_befee: _dbede.Lly,
		_agbc:  _dbede.Llx,
		_gffgd: _dbede.Urx,
	}
	return &r
}

// makeRectRuling tries to interpret the first four points of the subpath as a
// rectangle that represents a ruling and returns that ruling coloured `_fgbd`.
//
// The four edges are classified with _fecc. The path is accepted when it has
// two edges of each kind, or two edges of one kind and none of the other with
// the corresponding point check (_cgae / _fgdeg) succeeding. The second return
// value is false when the path is rejected, when the rectangle's kind is
// _bgbdg, or when asRuling fails.
func (sp *subpath) makeRectRuling(_fgbd _ag.Color) (*ruling, bool) {
	if _cfde {
		_b.Log.Info("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076", sp)
	}

	points := sp._fbcgf[:4]
	kinds := make(map[int]rulingKind, len(points))
	for i, from := range points {
		to := sp._fbcgf[(i+1)%4]
		kinds[i] = _fecc(from, to)
		if _cfde {
			_ce.Printf("\u0025\u0034\u0064: \u0025\u0073\u0020\u003d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066", i, kinds[i], from, to)
		}
	}
	if _cfde {
		_ce.Printf("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a", kinds)
	}

	// Indexes of the edges of each kind.
	var verts, horzs []int
	for i, kind := range kinds {
		if kind == _cefaa {
			horzs = append(horzs, i)
		} else if kind == _acgee {
			verts = append(verts, i)
		}
	}
	if _cfde {
		_ce.Printf("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a", len(horzs), horzs)
		_ce.Printf("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a", len(verts), verts)
	}

	var ok bool
	switch {
	case len(horzs) == 2 && len(verts) == 2:
		ok = true
	case len(horzs) == 2 && len(verts) == 0:
		ok = _cgae(points[horzs[0]], points[horzs[1]])
	case len(verts) == 2 && len(horzs) == 0:
		ok = _fgdeg(points[verts[0]], points[verts[1]])
	}
	if _cfde {
		_ce.Printf(" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a", len(horzs), len(verts), ok)
	}
	if !ok {
		if _cfde {
			_b.Log.Error("\u0021!\u006d\u0061\u006b\u0065R\u0065\u0063\u0074\u0052\u0075l\u0069n\u0067:\u0020\u0070\u0061\u0074\u0068\u003d\u0025v", sp)
			_ce.Printf(" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a", len(horzs), len(verts), ok)
		}
		return &ruling{}, false
	}

	// When one kind is missing, use the edges that are not of the other kind.
	if len(verts) == 0 {
		for i, kind := range kinds {
			if kind != _cefaa {
				verts = append(verts, i)
			}
		}
	}
	if len(horzs) == 0 {
		for i, kind := range kinds {
			if kind != _acgee {
				horzs = append(horzs, i)
			}
		}
	}
	if _cfde {
		_b.Log.Info("\u006da\u006b\u0065R\u0065\u0063\u0074\u0052u\u006c\u0069\u006eg\u003a\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025d \u0076\u0065\u0072t\u0073\u003d%\u0064\u0020\u0070\u006f\u0069\u006et\u0073\u003d%\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076", len(horzs), len(verts), len(points), horzs, verts, points)
	}

	var hiX, loX, hiY, loY _gab.Point
	if points[horzs[0]].Y > points[horzs[1]].Y {
		hiY, loY = points[horzs[0]], points[horzs[1]]
	} else {
		hiY, loY = points[horzs[1]], points[horzs[0]]
	}
	if points[verts[0]].X > points[verts[1]].X {
		hiX, loX = points[verts[0]], points[verts[1]]
	} else {
		hiX, loX = points[verts[1]], points[verts[0]]
	}

	rect := _bg.PdfRectangle{Llx: hiX.X, Urx: loX.X, Lly: loY.Y, Ury: hiY.Y}
	// Normalise so that Llx <= Urx and Lly <= Ury.
	if rect.Llx > rect.Urx {
		rect.Llx, rect.Urx = rect.Urx, rect.Llx
	}
	if rect.Lly > rect.Ury {
		rect.Lly, rect.Ury = rect.Ury, rect.Lly
	}

	rr := rectRuling{PdfRectangle: rect, _fbad: _aefc(rect), Color: _fgbd}
	if rr._fbad == _bgbdg {
		if _cfde {
			_b.Log.Error("\u006da\u006b\u0065\u0052\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006eg\u003a\u0020\u006b\u0069\u006e\u0064\u003d\u006e\u0069\u006c")
		}
		return nil, false
	}
	result, isRuling := rr.asRuling()
	if !isRuling {
		if _cfde {
			_b.Log.Error("\u006da\u006b\u0065\u0052\u0065c\u0074\u0052\u0075\u006c\u0069n\u0067:\u0020!\u0069\u0073\u0052\u0075\u006c\u0069\u006eg")
		}
		return nil, false
	}
	if _bccgb {
		_ce.Printf("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a", result.String())
	}
	return result, true
}

// complete reports whether the tile has all four borders.
func (tile gridTile) complete() bool {
	borders := tile.numBorders()
	return borders == 4
}

// textMark is the internal representation of a piece of extracted text; see
// ToTextMark for how its fields map onto the public TextMark.
type textMark struct {
	_bg.PdfRectangle
	_acec  int          // exported as TextMark.Orientation
	_ebgd  string       // exported as TextMark.Text
	_ffbg  string       // exported as TextMark.Original
	_ecbeg *_bg.PdfFont // exported as TextMark.Font
	_gceb  float64      // exported as TextMark.FontSize
	_abac  float64      // exported as TextMark.Tc
	_acddd _gab.Matrix
	_efgg  _gab.Point
	_bcfd  _bg.PdfRectangle // exported as TextMark.BBox
	_bdaff _ag.Color        // exported as TextMark.FillColor
	_bfdb  _ag.Color        // exported as TextMark.StrokeColor
	_dcbd  _ea.PdfObject    // exported as TextMark.DirectObject
	_babd  []string         // exported as TextMark.ObjString
	Tw     float64
	Th     float64
	_adbb  int
	_fbcc  int // exported as TextMark.Index
}

// _aea returns the difference between the Llx values of the bounding boxes of
// `_ecde` and `_bgbef`.
func _aea(_ecde, _bgbef bounded) float64 {
	first := _ecde.bbox().Llx
	second := _bgbef.bbox().Llx
	return first - second
}

// toTextMarks returns the text marks of the para, advancing `*_gecg` through
// the helpers it calls.
//
// A para without a table yields its own cell marks. For a table para the cells
// are visited row by row: a missing cell contributes a tab mark, every cell is
// followed by a space mark, and rows are separated by a newline mark. When the
// table is exportable the marks are finally passed through _cadc together with
// the exported table.
func (p *textPara) toTextMarks(_gecg *int) []TextMark {
	table := p._bgba
	if table == nil {
		return p.toCellTextMarks(_gecg)
	}

	var marks []TextMark
	for y := 0; y < p._bgba._gcbge; y++ {
		for x := 0; x < p._bgba._ddfc; x++ {
			if cell := p._bgba.get(x, y); cell != nil {
				marks = append(marks, cell.toCellTextMarks(_gecg)...)
			} else {
				marks = _gdbc(marks, _gecg, "\u0009")
			}
			marks = _gdbc(marks, _gecg, "\u0020")
		}
		if y < p._bgba._gcbge-1 {
			marks = _gdbc(marks, _gecg, "\u000a")
		}
	}

	if exportable := p._bgba; exportable.isExportable() {
		exported := exportable.toTextTable()
		marks = _cadc(marks, &exported)
	}
	return marks
}
2023-06-30 13:19:48 +00:00
2023-07-28 12:14:31 +00:00
// String returns a human readable description of `s`: its members in ascending order,
// formatted as an int slice.
func (s intSet) String() string {
	members := make([]int, 0, len(s))
	for member := range s {
		members = append(members, member)
	}
	_df.Ints(members)
	return _ce.Sprintf("\u0025\u002b\u0076", members)
}

// intSet is a set of ints, represented as the keys of a map with empty-struct values.
type intSet map[int]struct{}

// bbox returns a rectangle built from the primMinMax() and secMinMax() ranges of `vecs`.
// Which range supplies the x extent and which the y extent depends on whether the first
// ruling's `_eabdg` equals `_cefaa`.
// An empty list is reported through the error log and yields the zero rectangle.
func (vecs rulingList) bbox() _bg.PdfRectangle {
	if len(vecs) == 0 {
		_b.Log.Error("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0062\u0062\u006f\u0078\u003a\u0020n\u006f\u0020\u0072u\u006ci\u006e\u0067\u0073")
		return _bg.PdfRectangle{}
	}

	var box _bg.PdfRectangle
	if vecs[0]._eabdg != _cefaa {
		box.Llx, box.Urx = vecs.primMinMax()
		box.Lly, box.Ury = vecs.secMinMax()
	} else {
		box.Llx, box.Urx = vecs.secMinMax()
		box.Lly, box.Ury = vecs.primMinMax()
	}
	return box
}

// getFontDirect looks up the font dictionary registered under `name` and builds a PdfFont
// from it. A lookup failure returns (nil, err). A construction failure is written to the
// debug log and then returned together with whatever NewPdfFontFromPdfObject produced.
func (to *textObject) getFontDirect(name string) (*_bg.PdfFont, error) {
	fontObj, lookupErr := to.getFontDict(name)
	if lookupErr != nil {
		return nil, lookupErr
	}

	font, buildErr := _bg.NewPdfFontFromPdfObject(fontObj)
	if buildErr == nil {
		return font, nil
	}
	_b.Log.Debug("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076", name, buildErr)
	return font, buildErr
}
2023-06-30 13:19:48 +00:00
2023-07-28 12:14:31 +00:00
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
type RenderMode int

// _bbbdbd returns the larger of `a` and `b`.
func _bbbdbd(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// merge builds one new ruling out of all the rulings in `vecs`. The new ruling copies
// `_eabdg`, `_gggfe` and Color from the first element, takes the arithmetic mean of the
// `_befee` values, the smallest `_agbc` and the largest `_gffgd`.
// `vecs` must not be empty.
func (vecs rulingList) merge() *ruling {
	first := vecs[0]
	sum, lowest, highest := first._befee, first._agbc, first._gffgd
	for _, v := range vecs[1:] {
		sum += v._befee
		if v._agbc < lowest {
			lowest = v._agbc
		}
		if v._gffgd > highest {
			highest = v._gffgd
		}
	}

	merged := &ruling{
		_eabdg: first._eabdg,
		_gggfe: first._gggfe,
		Color:  first.Color,
		_befee: sum / float64(len(vecs)),
		_agbc:  lowest,
		_gffgd: highest,
	}
	if _dbdb {
		_b.Log.Info("\u006de\u0072g\u0065\u003a\u0020\u0025\u0032d\u0020\u0076e\u0063\u0073\u0020\u0025\u0073", len(vecs), merged)
		for i, v := range vecs {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, v)
		}
	}
	return merged
}

// appendMark adds `mark` to the word: the mark is appended to `_dggf`, the word's rectangle
// becomes the result of _egbga on the word's and the mark's rectangles, `_ebgb` is raised to
// the mark's `_gceb` if that is larger, and `_baebb` is recomputed as `rect.Ury` minus the
// word's updated Lly.
func (w *textWord) appendMark(mark *textMark, rect _bg.PdfRectangle) {
	w._dggf = append(w._dggf, mark)
	w.PdfRectangle = _egbga(w.PdfRectangle, mark.PdfRectangle)
	if w._ebgb < mark._gceb {
		w._ebgb = mark._gceb
	}
	w._baebb = rect.Ury - w.PdfRectangle.Lly
}

// findGridTables calls findTableGrid for every tiling in `tilings` and collects the non-nil
// tables, calling markCells() on each. Every key of the second findTableGrid result has its
// `_abeg` flag set, whether or not a table was found for that tiling.
func (paras paraList) findGridTables(tilings []gridTiling) []*textTable {
	if _cgafg {
		_b.Log.Info("\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073", len(paras))
		for i, para := range paras {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, para)
		}
	}

	var tables []*textTable
	for i, tiling := range tilings {
		table, used := paras.findTableGrid(tiling)
		if table != nil {
			table.log(_ce.Sprintf("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064", i))
			tables = append(tables, table)
			table.markCells()
		}
		for para := range used {
			para._abeg = true
		}
	}

	if _cgafg {
		_b.Log.Info("\u0066i\u006e\u0064\u0047\u0072i\u0064\u0054\u0061\u0062\u006ce\u0073:\u0020%\u0064\u0020\u0074\u0061\u0062\u006c\u0065s", len(tables))
	}
	return tables
}

// allWords returns every word held in the bag's `_cgdg` collection, flattened into a single
// slice in the iteration order of that collection.
func (bag *wordBag) allWords() []*textWord {
	var words []*textWord
	for _, bucket := range bag._cgdg {
		words = append(words, bucket...)
	}
	return words
}

// pop removes the last state from the stack and returns a pointer to a copy of it.
// It returns nil when the stack is empty.
func (stack *stateStack) pop() *textState {
	if stack.empty() {
		return nil
	}
	states := *stack
	last := len(states) - 1
	top := *states[last]
	*stack = states[:last]
	return &top
}

// sortReadingOrder sorts `paras` in place with _dag as the comparison, after calling
// computeEBBoxes(). Lists of zero or one paragraph are left untouched.
func (paras paraList) sortReadingOrder() {
	_b.Log.Trace("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d", len(paras))
	if len(paras) <= 1 {
		return
	}
	paras.computeEBBoxes()
	before := func(i, j int) bool {
		return _dag(paras[i], paras[j]) <= 0
	}
	_df.Slice(paras, before)
}

// emptyCompositeRow reports whether no entry of `_dadcc` keyed by _fgcce(x, y), for x in
// [0, _ddfc), holds a non-empty paraList.
func (t *textTable) emptyCompositeRow(y int) bool {
	for x := 0; x < t._ddfc; x++ {
		if cell, ok := t._dadcc[_fgcce(x, y)]; ok && len(cell.paraList) > 0 {
			return false
		}
	}
	return true
}
// Text returns the text content of the lists: the Text() of every element, concatenated in order.
func (l *lists) Text() string {
	var sb _c.Builder
	for _, item := range *l {
		sb.WriteString(item.Text())
	}
	return sb.String()
}

// clear resets the subpath to its zero value.
func (sp *subpath) clear() {
	*sp = subpath{}
}

// quadraticTo handles a quadratic curve segment by adding only its end point (x, y) to the
// current shape; the first two arguments are not used.
func (ss *shapesState) quadraticTo(_, _, x, y float64) {
	if _bdaae {
		_b.Log.Info("\u0071\u0075\u0061d\u0072\u0061\u0074\u0069\u0063\u0054\u006f\u003a")
	}
	ss.addPoint(x, y)
}

// has reports whether `i` is a member of the set.
func (s intSet) has(i int) bool {
	_, present := s[i]
	return present
}

// _gbeaf groups `marks` into words.
//
// Marks are consumed in order. A mark whose text satisfies _dfcc closes the current word.
// Any other mark either starts a new word, is appended to the current one, or closes the
// current word and starts a new one, depending on two distances that are divided by the
// current word's `_ebgb` and compared with `_ffef`, `_ccfg` and `_ccfb`.
// When `_ccdf` is set, a mark that _egeag maps to a replacement and that lies in the
// diacritic area of its neighbour is merged into the word via addDiacritic instead.
// Words whose computed text satisfies _dfcc are dropped.
//
// `rect` is forwarded to _fbabg, appendMark and _eba.
// NOTE(review): presumably the page rectangle — confirm.
func _gbeaf(marks []*textMark, rect _bg.PdfRectangle) []*textWord {
	var (
		words   []*textWord
		current *textWord
	)
	if _efe {
		_b.Log.Info("\u006d\u0061\u006beT\u0065\u0078\u0074\u0057\u006f\u0072\u0064\u0073\u003a\u0020\u0025\u0064\u0020\u006d\u0061\u0072\u006b\u0073", len(marks))
	}

	// flush finalises the word under construction (if any) and resets `current`.
	flush := func() {
		if current == nil {
			return
		}
		text := current.computeText()
		if !_dfcc(text) {
			current._ggaef = text
			words = append(words, current)
			if _efe {
				_b.Log.Info("\u0061\u0064\u0064Ne\u0077\u0057\u006f\u0072\u0064\u003a\u0020\u0025\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073", len(words)-1, current.String())
				for i, m := range current._dggf {
					_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, m.String())
				}
			}
		}
		current = nil
	}

	for _, mark := range marks {
		if _ccdf && current != nil && len(current._dggf) > 0 {
			prev := current._dggf[len(current._dggf)-1]
			markRepl, markIsDia := _egeag(mark._ebgd)
			prevRepl, prevIsDia := _egeag(prev._ebgd)
			// The new mark is a diacritic for the previous mark.
			if markIsDia && !prevIsDia && prev.inDiacriticArea(mark) {
				current.addDiacritic(markRepl)
				continue
			}
			// The previous mark is a diacritic for the new mark: swap it out.
			if prevIsDia && !markIsDia && mark.inDiacriticArea(prev) {
				current._dggf = current._dggf[:len(current._dggf)-1]
				current.appendMark(mark, rect)
				current.addDiacritic(prevRepl)
				continue
			}
		}

		if _dfcc(mark._ebgd) {
			flush()
			continue
		}
		if current == nil {
			current = _fbabg([]*textMark{mark}, rect)
			continue
		}

		scale := current._ebgb
		depthGap := _ef.Abs(_eba(rect, mark)-current._baebb) / scale
		readingGap := _efbc(mark, current) / scale
		// The negated conjunction is kept as written so NaN operands behave as before.
		if readingGap >= _ffef || !(-_ccfg <= readingGap && depthGap <= _ccfb) {
			flush()
			current = _fbabg([]*textMark{mark}, rect)
			continue
		}
		current.appendMark(mark, rect)
	}
	flush()
	return words
}

// getRight collects, for each y in [0, _gcbge), the `_eabac` neighbour of the cell at
// get(_ddfc-1, y). It returns nil if any of those neighbours reports taken(), or if the
// `_fgdg` link of one collected paragraph is not the next collected paragraph.
func (t *textTable) getRight() paraList {
	cells := make(paraList, t._gcbge)
	for y := 0; y < t._gcbge; y++ {
		neighbour := t.get(t._ddfc-1, y)._eabac
		if neighbour.taken() {
			return nil
		}
		cells[y] = neighbour
	}
	for y := 0; y < t._gcbge-1; y++ {
		if cells[y]._fgdg != cells[y+1] {
			return nil
		}
	}
	return cells
}

// _efbc returns the Llx of the first argument's bounding box minus the Urx of the second's.
func _efbc(a, b bounded) float64 {
	return a.bbox().Llx - b.bbox().Urx
}
// Marks returns the TextMark collection for a page. It represents all the text on the page.
func (pt PageText) Marks() *TextMarkArray {
	return &TextMarkArray{_bca: pt._fccf}
}

// toTextMarks returns the TextMarks of every paragraph in `paras` whose `_cfga` flag is not
// set, with separators inserted through _gdbc: between one paragraph and the next a single
// " " when _bbgff reports true for the pair, otherwise two "\n". Two "\n" are always appended
// at the end.
func (paras paraList) toTextMarks() []TextMark {
	offset := 0
	var marks []TextMark
	last := len(paras) - 1
	for i, para := range paras {
		if para._cfga {
			continue
		}
		paraMarks := para.toTextMarks(&offset)
		marks = append(marks, paraMarks...)
		if i == last {
			continue
		}
		if _bbgff(para, paras[i+1]) {
			marks = _gdbc(marks, &offset, "\u0020")
			continue
		}
		marks = _gdbc(marks, &offset, "\u000a")
		marks = _gdbc(marks, &offset, "\u000a")
	}
	marks = _gdbc(marks, &offset, "\u000a")
	marks = _gdbc(marks, &offset, "\u000a")
	return marks
}

// toGrids partitions `vecs` into groups of connected rulings and returns those groups that
// isActualGrid() accepts, each passed through snapToGroups().
//
// Steps:
//  1. Compute intersections() and, per ruling index, its connections().
//  2. Order the indexes by descending number of connections, ties broken by comp().
//  3. Walk that ordering, adding each index to the first existing group containing a member
//     connected to it, or starting a new group.
//  4. Sort groups by descending size (stable), sort each group with comp(), and map the
//     indexes back to rulings.
//
// An empty `vecs` returns nil; previously it panicked on the first element of the ordering.
func (vecs rulingList) toGrids() []rulingList {
	if _bccgb {
		_b.Log.Info("t\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0073", vecs)
	}
	// Nothing to group, and ordering[0] below would be out of range.
	if len(vecs) == 0 {
		return nil
	}

	intersects := vecs.intersections()
	if _bccgb {
		_b.Log.Info("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020", len(vecs), len(intersects))
		for _, key := range _dbcdg(intersects) {
			_ce.Printf("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n", key, intersects[key])
		}
	}

	connects := make(map[int]intSet, len(vecs))
	for i := range vecs {
		if reach := vecs.connections(intersects, i); len(reach) > 0 {
			connects[i] = reach
		}
	}
	if _bccgb {
		_b.Log.Info("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064", len(connects))
		for _, key := range _dbcdg(connects) {
			_ce.Printf("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n", key, connects[key])
		}
	}

	ordering := _dadbd(len(vecs), func(i, j int) bool {
		ni, nj := len(connects[i]), len(connects[j])
		if ni != nj {
			return ni > nj
		}
		return vecs.comp(i, j)
	})
	if _bccgb {
		_b.Log.Info("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076", ordering)
	}

	// groups holds lists of indexes into `vecs`.
	groups := [][]int{{ordering[0]}}
nextIndex:
	for _, idx := range ordering[1:] {
		for g, members := range groups {
			for _, member := range members {
				if connects[member].has(idx) {
					groups[g] = append(members, idx)
					continue nextIndex
				}
			}
		}
		groups = append(groups, []int{idx})
	}
	if _bccgb {
		_b.Log.Info("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076", groups)
	}

	_df.SliceStable(groups, func(i, j int) bool { return len(groups[i]) > len(groups[j]) })
	for _, members := range groups {
		_df.Slice(members, func(i, j int) bool { return vecs.comp(members[i], members[j]) })
	}

	grids := make([]rulingList, len(groups))
	for g, members := range groups {
		grid := make(rulingList, len(members))
		for k, idx := range members {
			grid[k] = vecs[idx]
		}
		grids[g] = grid
	}
	if _bccgb {
		_b.Log.Info("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020g\u0072\u0069d\u0073\u003d\u0025\u002b\u0076", grids)
	}

	var actual []rulingList
	for _, grid := range grids {
		refined, ok := grid.isActualGrid()
		if !ok {
			continue
		}
		actual = append(actual, refined.snapToGroups())
	}
	if _bccgb {
		_gdc("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0061\u0063t\u0075\u0061\u006c\u0047ri\u0064\u0073", actual)
		_b.Log.Info("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064", len(grids), len(actual))
	}
	return actual
}

// gridIntersecting reports whether _ffcaf holds both for the two rulings' `_agbc` values and
// for their `_gffgd` values.
func (v *ruling) gridIntersecting(other *ruling) bool {
	if !_ffcaf(v._agbc, other._agbc) {
		return false
	}
	return _ffcaf(v._gffgd, other._gffgd)
}

// top returns the last state on the stack without removing it, or nil when the stack is empty.
func (stack *stateStack) top() *textState {
	if stack.empty() {
		return nil
	}
	return (*stack)[stack.size()-1]
}

// setCharSpacing stores `spacing` in the `_fdf` field of the text object's `_gacd` state.
// It is a no-op on a nil receiver.
func (to *textObject) setCharSpacing(spacing float64) {
	if to == nil {
		return
	}
	to._gacd._fdf = spacing
	if _beff {
		_b.Log.Info("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073", spacing, to._gacd.String())
	}
}

// markCells sets the `_abeg` flag on every non-nil cell returned by get(x, y) for
// x in [0, _ddfc) and y in [0, _gcbge).
func (t *textTable) markCells() {
	for y := 0; y < t._gcbge; y++ {
		for x := 0; x < t._ddfc; x++ {
			if cell := t.get(x, y); cell != nil {
				cell._abeg = true
			}
		}
	}
}

// extractInlineImage converts the inline image to RGB and appends an ImageMark for it to the
// context's `_dae` list, incrementing the `_egfe` counter. The mark's size, angle and
// position come from the CTM of `gs`. When the image declares no colorspace, DeviceGray is
// used. Any error from decoding, colorspace lookup or conversion is returned unchanged.
func (ctx *imageExtractContext) extractInlineImage(inline *_fb.ContentStreamInlineImage, gs _fb.GraphicsState, resources *_bg.PdfPageResources) error {
	img, err := inline.ToImage(resources)
	if err != nil {
		return err
	}

	cs, err := inline.GetColorSpace(resources)
	if err != nil {
		return err
	}
	if cs == nil {
		cs = _bg.NewPdfColorspaceDeviceGray()
	}

	rgb, err := cs.ImageToRGB(*img)
	if err != nil {
		return err
	}

	mark := ImageMark{
		Image:  &rgb,
		Width:  gs.CTM.ScalingFactorX(),
		Height: gs.CTM.ScalingFactorY(),
		Angle:  gs.CTM.Angle(),
	}
	mark.X, mark.Y = gs.CTM.Translation()
	ctx._dae = append(ctx._dae, mark)
	ctx._egfe++
	return nil
}

// _fcd returns _aea(a, b) unless _acbc reports true for that value, in which case it returns
// _fdcc(a, b).
func _fcd(a, b bounded) float64 {
	if d := _aea(a, b); !_acbc(d) {
		return d
	}
	return _fdcc(a, b)
}

// rulingList is a list of rulings.
type rulingList []*ruling
// moveLP concatenates a translation by (tx, ty) onto the text object's `_cfec` matrix and
// then copies the result into `_fda`.
func (to *textObject) moveLP(tx, ty float64) {
	translation := _gab.NewMatrix(1, 0, 0, 1, tx, ty)
	to._cfec.Concat(translation)
	to._fda = to._cfec
}

// _bbgff reports true when either paragraph has its `_cfga` flag set; otherwise it returns
// the result of _acbc applied to the difference of the two paragraphs' depth() values.
func _bbgff(a, b *textPara) bool {
	if a._cfga || b._cfga {
		return true
	}
	return _acbc(a.depth() - b.depth())
}

// _facb returns a new list whose `_ecdee`, `_ebed` and `_cdfc` fields are set to the three
// arguments, in that order.
func _facb(lines []*textLine, marker string, children []*list) *list {
	return &list{
		_ecdee: lines,
		_ebed:  marker,
		_cdfc:  children,
	}
}

// _bdee returns the keys of `tiles` sorted in descending order.
func _bdee(tiles map[float64]map[float64]gridTile) []float64 {
	keys := make([]float64, 0, len(tiles))
	for key := range tiles {
		keys = append(keys, key)
	}
	_df.Float64s(keys)
	// Reverse the ascending result in place.
	for lo, hi := 0, len(keys)-1; lo < hi; lo, hi = lo+1, hi-1 {
		keys[lo], keys[hi] = keys[hi], keys[lo]
	}
	return keys
}
// TextMarkArray is a collection of TextMarks.
type TextMarkArray struct {
	_bca []TextMark
}

// _egeag looks `text` up in the `_fegfa` table. The lookup is only attempted when `text`
// consists of exactly one rune; otherwise, or when the rune has no entry, the second result
// is false.
func _egeag(text string) (string, bool) {
	runes := []rune(text)
	if len(runes) == 1 {
		mapped, found := _fegfa[runes[0]]
		return mapped, found
	}
	return "", false
}
// ToText returns the page text as a single string.
//
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
// Text() instead.
func (pt PageText) ToText() string {
	return pt.Text()
}

// hasLines reports whether _geec returns true for the cell's rectangle paired with the
// rectangle of any line in `lines`.
func (cell compositeCell) hasLines(lines []*textLine) bool {
	for i, line := range lines {
		hit := _geec(cell.PdfRectangle, line.PdfRectangle)
		if _cgafg {
			_ce.Printf("\u0020\u0020\u0020\u0020\u0020\u0020\u005e\u005e\u005e\u0069\u006e\u0074\u0065\u0072\u0073e\u0063t\u0073\u003d\u0025\u0074\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u000a", hit, i, len(lines))
			_ce.Printf("\u0020\u0020\u0020\u0020 \u005e\u005e\u005e\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025s\u000a", cell)
			_ce.Printf("\u0020 \u0020 \u0020\u0020\u0020\u006c\u0069\u006e\u0065\u003d\u0025\u0073\u000a", line)
		}
		if hit {
			return true
		}
	}
	return false
}

// compositeCell is a rectangle together with a list of paragraphs.
type compositeCell struct {
	_bg.PdfRectangle
	paraList
}

// reorder rearranges `paras` in place so that position i holds the element that was at
// position order[i].
func (paras paraList) reorder(order []int) {
	sorted := make(paraList, len(paras))
	for dst, src := range order {
		sorted[dst] = paras[src]
	}
	copy(paras, sorted)
}

// yMean returns the mean of the Y coordinates of the ruling's `_egaf` and `_eaebf` points.
func (r lineRuling) yMean() float64 {
	return 0.5 * (r._egaf.Y + r._eaebf.Y)
}

// textLine is a rectangle with a list of words and two float64 values.
type textLine struct {
	_bg.PdfRectangle
	_cbbd float64
	_aafd []*textWord
	_bfbb float64
}

// _dddc removes the element at index `idx` from `words`, shifting later elements down
// within the same backing array, and returns the shortened slice.
func _dddc(words []*textWord, idx int) []*textWord {
	return append(words[:idx], words[idx+1:]...)
}

// connections returns the set of ruling indexes reachable from index `start` by repeatedly
// following `intersects`: an index j is added whenever intersects[j] contains an index that
// has already been visited. Each index is expanded at most once.
func (vecs rulingList) connections(intersects map[int]intSet, start int) intSet {
	reached := make(intSet)
	visited := make(intSet)

	var visit func(int)
	visit = func(i int) {
		if visited.has(i) {
			return
		}
		visited.add(i)
		for j := range vecs {
			if intersects[j].has(i) {
				reached.add(j)
			}
		}
		for j := range vecs {
			if reached.has(j) {
				visit(j)
			}
		}
	}

	visit(start)
	return reached
}

// event is a record of a float64, a bool and an int.
type event struct {
	_faccf float64
	_ecacd bool
	_gbfff int
}

// processOperand inspects one content stream operation and extracts any images it refers to:
//   - "BI" with one parameter: an inline image. Images whose ImageMask is true are skipped
//     unless the IncludeInlineStencilMasks option is set.
//   - "Do" with one parameter: an XObject, dispatched to image or form extraction by type.
//   - "scn"/"SCN" with one parameter, only while the `_gea` flag is set: a named pattern;
//     the content stream of a tiling pattern is scanned for images.
//   - "cs"/"CS" with at least one parameter: sets `_gea` to whether the first parameter's
//     String() is "Pattern".
//
// Any other operation is ignored. `_gbf` is returned when a name parameter has the wrong type.
func (ctx *imageExtractContext) processOperand(op *_fb.ContentStreamOperation, gs _fb.GraphicsState, resources *_bg.PdfPageResources) error {
	switch {
	case op.Operand == "\u0042\u0049" && len(op.Params) == 1:
		inline, isInline := op.Params[0].(*_fb.ContentStreamInlineImage)
		if !isInline {
			return nil
		}
		if isMask, found := _ea.GetBoolVal(inline.ImageMask); found && isMask && !ctx._fae.IncludeInlineStencilMasks {
			return nil
		}
		return ctx.extractInlineImage(inline, gs, resources)

	case op.Operand == "\u0044\u006f" && len(op.Params) == 1:
		name, isName := _ea.GetName(op.Params[0])
		if !isName {
			_b.Log.Debug("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065")
			return _gbf
		}
		_, xtype := resources.GetXObjectByName(*name)
		switch xtype {
		case _bg.XObjectTypeImage:
			return ctx.extractXObjectImage(name, gs, resources)
		case _bg.XObjectTypeForm:
			return ctx.extractFormImages(name, gs, resources)
		}

	case ctx._gea && (op.Operand == "\u0073\u0063\u006e" || op.Operand == "\u0053\u0043\u004e") && len(op.Params) == 1:
		name, isName := _ea.GetName(op.Params[0])
		if !isName {
			_b.Log.Debug("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065")
			return _gbf
		}
		pattern, found := resources.GetPatternByName(*name)
		if !found {
			_b.Log.Debug("\u0045R\u0052\u004f\u0052\u003a\u0020\u0050\u0061\u0074\u0074\u0065\u0072n\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075\u006e\u0064")
			return nil
		}
		if pattern.IsTiling() {
			tiling := pattern.GetAsTilingPattern()
			content, err := tiling.GetContentStream()
			if err != nil {
				return err
			}
			if err = ctx.extractContentStreamImages(string(content), tiling.Resources); err != nil {
				return err
			}
		}

	case (op.Operand == "\u0063\u0073" || op.Operand == "\u0043\u0053") && len(op.Params) >= 1:
		ctx._gea = op.Params[0].String() == "\u0050a\u0074\u0074\u0065\u0072\u006e"
	}
	return nil
}

// asTiling converts the rulings in `vecs` into a gridTiling.
//
// The primaries() of the vertical and horizontal rulings (from vertsHorzs()) define a lattice
// of w x h elementary tiles. Each tile is built by _aeef from its rectangle and the rulings
// found on its four sides. Tiles are then coalesced in two passes:
//  1. per row, a tile with `_afdge` set absorbs its successors until one with `_bfecb` set
//     is reached;
//  2. per column position, a tile absorbs the tiles in lower-indexed rows that have the same
//     Llx and Urx until `_eaed` is set. Only tiles for which complete() holds are kept.
//
// NOTE(review): `_afdge`/`_bfecb`/`_fdbd`/`_eaed` look like the left/right/top/bottom border
// flags of a tile — confirm against the gridTile definition.
//
// An empty gridTiling is returned when `vecs` is empty or when there are fewer than two
// distinct vertical or horizontal primaries. The empty list and the zero-primaries cases
// previously caused an out-of-range panic.
func (vecs rulingList) asTiling() gridTiling {
	if _agd {
		_b.Log.Info("r\u0075\u006ci\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0063s\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u002b\u002b\u002b\u0020\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d", len(vecs))
	}
	// vecs[1:] below is out of range for an empty list.
	if len(vecs) == 0 {
		return gridTiling{}
	}

	// Report adjacent rulings that align on both axes.
	for i, v := range vecs[1:] {
		prev := vecs[i]
		if prev.alignsPrimary(v) && prev.alignsSec(v) {
			_b.Log.Error("a\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0044\u0075\u0070\u006c\u0069\u0063\u0061\u0074\u0065 \u0072\u0075\u006c\u0069\u006e\u0067\u0073\u002e\u000a\u0009v=\u0025\u0073\u000a\t\u0077=\u0025\u0073", v, prev)
		}
	}
	vecs.sortStrict()
	vecs.log("\u0073n\u0061\u0070\u0070\u0065\u0064")

	verts, horzs := vecs.vertsHorzs()
	xs := verts.primaries()
	ys := horzs.primaries()
	w := len(xs) - 1
	h := len(ys) - 1
	// <= 0 also covers the case of no primaries at all (w or h == -1).
	if w <= 0 || h <= 0 {
		return gridTiling{}
	}

	bounds := _bg.PdfRectangle{Llx: xs[0], Urx: xs[w], Lly: ys[0], Ury: ys[h]}
	if _agd {
		_b.Log.Info("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0072\u0074s=\u0025\u0064", len(verts))
		for i, v := range verts {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, v)
		}
		_b.Log.Info("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0068\u006f\u0072\u007as=\u0025\u0064", len(horzs))
		for i, v := range horzs {
			_ce.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, v)
		}
		_b.Log.Info("\u0072\u0075\u006c\u0069\u006eg\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067:\u0020\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078\u0025\u0064\u000a\u0009\u006c\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a\u0009\u006c\u006c\u0079\u003d\u0025\u002e\u0032f", w, h, xs, ys)
	}

	// Elementary tiles, stored row-major: tiles[y*w+x].
	tiles := make([]gridTile, w*h)
	for y := h - 1; y >= 0; y-- {
		lly := ys[y]
		ury := ys[y+1]
		for x := 0; x < w; x++ {
			llx := xs[x]
			urx := xs[x+1]
			vLo := verts.findPrimSec(llx, lly)
			vHi := verts.findPrimSec(urx, lly)
			hLo := horzs.findPrimSec(lly, llx)
			hHi := horzs.findPrimSec(ury, llx)
			rect := _bg.PdfRectangle{Llx: llx, Urx: urx, Lly: lly, Ury: ury}
			tile := _aeef(rect, vLo, vHi, hLo, hHi)
			tiles[y*w+x] = tile
			if _agd {
				_ce.Printf("\u0020\u0020\u0078\u003d\u0025\u0032\u0064\u0020\u0079\u003d\u0025\u0032\u0064\u003a\u0020%\u0073 \u0025\u0036\u002e\u0032\u0066\u0020\u0078\u0020\u0025\u0036\u002e\u0032\u0066\u000a", x, y, tile.String(), tile.Width(), tile.Height())
			}
		}
	}

	if _agd {
		_b.Log.Info("r\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074.\u0061\u0073\u0054\u0069\u006c\u0069\u006eg:\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0020\u0068\u006f\u0072\u0069zo\u006e\u0074a\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066", bounds)
	}

	// Pass 1: coalesce within each row. rows[y] maps a tile's Llx to the coalesced tile.
	rows := make([]map[float64]gridTile, h)
	for y := h - 1; y >= 0; y-- {
		if _agd {
			_ce.Printf("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a", y)
		}
		rows[y] = make(map[float64]gridTile, w)
		for x := 0; x < w; x++ {
			tile := tiles[y*w+x]
			if _agd {
				_ce.Printf("\u0020\u0020\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", x, tile)
			}
			if !tile._afdge {
				continue
			}
			end := x
			for x2 := x + 1; !tile._bfecb && x2 < w; x2++ {
				next := tiles[y*w+x2]
				tile.Urx = next.Urx
				tile._fdbd = tile._fdbd || next._fdbd
				tile._eaed = tile._eaed || next._eaed
				tile._bfecb = next._bfecb
				if _agd {
					_ce.Printf("\u0020 \u0020%\u0034\u0064\u003a\u0020\u0025s\u0020\u2192 \u0025\u0073\u000a", x2, next, tile)
				}
				end = x2
			}
			if _agd {
				_ce.Printf(" \u0020 \u0025\u0032\u0064\u0020\u002d\u0020\u0025\u0032d\u0020\u2192\u0020\u0025s\n", x, end, tile)
			}
			// Skip the tiles that were just absorbed.
			x = end
			rows[y][tile.Llx] = tile
		}
	}

	// Pass 2: coalesce across rows. Both maps are keyed by a row's Lly, then by a tile's Llx.
	merged := make(map[float64]map[float64]gridTile, h)
	absorbed := make(map[float64]map[float64]struct{}, h)
	for y := h - 1; y >= 0; y-- {
		rowKey := tiles[y*w].Lly
		merged[rowKey] = make(map[float64]gridTile, w)
		absorbed[rowKey] = make(map[float64]struct{}, w)
	}
	if _agd {
		_b.Log.Info("\u0072u\u006c\u0069n\u0067\u004c\u0069s\u0074\u002e\u0061\u0073\u0054\u0069\u006ci\u006e\u0067\u003a\u0020\u0063\u006fa\u006c\u0065\u0073\u0063\u0065\u0020\u0076\u0065\u0072\u0074\u0069c\u0061\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066", bounds)
	}
	for y := h - 1; y >= 0; y-- {
		rowKey := tiles[y*w].Lly
		row := rows[y]
		if _agd {
			_ce.Printf("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a", y)
		}
		for _, llx := range _ceac(row) {
			// Tiles already absorbed by a tile from a higher-indexed row are skipped.
			if _, done := absorbed[rowKey][llx]; done {
				continue
			}
			tile := row[llx]
			if _agd {
				_ce.Printf(" \u0020\u0020\u0020\u0020\u0076\u0030\u003d\u0025\u0073\u000a", tile.String())
			}
			for y2 := y - 1; y2 >= 0; y2-- {
				if tile._eaed {
					break
				}
				below, ok := rows[y2][llx]
				if !ok {
					break
				}
				if below.Urx != tile.Urx {
					break
				}
				tile._eaed = below._eaed
				tile.Lly = below.Lly
				if _agd {
					_ce.Printf("\u0020\u0020\u0020\u0020 \u0020\u0020\u0076\u003d\u0025\u0073\u0020\u0076\u0030\u003d\u0025\u0073\u000a", below.String(), tile.String())
				}
				absorbed[below.Lly][below.Llx] = struct{}{}
			}
			if y == 0 {
				tile._eaed = true
			}
			if tile.complete() {
				merged[rowKey][llx] = tile
			}
		}
	}

	tiling := gridTiling{
		PdfRectangle: bounds,
		_eaafd:       _effad(merged),
		_dade:        _bdee(merged),
		_cbec:        merged,
	}
	tiling.log("\u0043r\u0065\u0061\u0074\u0065\u0064")
	return tiling
}

// findTextTables tries to grow a table from each paragraph that is not taken() and has
// non-zero width. A candidate from isAtom() is grown with growTable() and kept only if its
// `_ddfc` * `_gcbge` is at least `_abda`; kept tables have markCells() applied.
func (paras paraList) findTextTables() []*textTable {
	var tables []*textTable
	for _, para := range paras {
		if para.taken() || para.Width() == 0 {
			continue
		}
		table := para.isAtom()
		if table == nil {
			continue
		}
		table.growTable()
		if table._ddfc*table._gcbge >= _abda {
			table.markCells()
			table.log("\u0067\u0072\u006fw\u006e")
			tables = append(tables, table)
		}
	}
	return tables
}

// fill appends a pathSection built from the state's current subpaths (`_cbfc`) and fill
// color to `*fills`.
func (ss *shapesState) fill(fills *[]pathSection) {
	section := pathSection{_dgfc: ss._cbfc, Color: ss._eeadb.getFillColor()}
	*fills = append(*fills, section)
	if !_bccgb {
		return
	}

	box := section.bbox()
	_ce.Printf("\u0020 \u0020\u0020\u0046\u0049\u004c\u004c\u003a %\u0032\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006ee\u0077\u0029 \u0073\u0073\u003d%\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d\u0025\u0033\u0076\u0020\u0025\u0036\u002e\u0032f\u003d\u00256.\u0032\u0066\u0078%\u0036\u002e\u0032\u0066\u000a", len(*fills), len(section._dgfc), ss, section.Color, box, box.Width(), box.Height())
	if !_fegd {
		return
	}
	// Print at most the first eleven subpaths.
	for i, sp := range section._dgfc {
		_ce.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", i, sp)
		if i == 10 {
			break
		}
	}
}

// pathSection is a list of subpaths that share one color.
type pathSection struct {
	_dgfc []*subpath
	_ag.Color
}