mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
990 lines
214 KiB
Go
990 lines
214 KiB
Go
//
// Copyright 2020 FoxyUtils ehf. All rights reserved.
//
// This is a commercial product and requires a license to operate.
// A trial license can be obtained at https://unidoc.io
//
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
//
// Use of this source code is governed by the UniDoc End User License Agreement
// terms that can be accessed at https://unidoc.io/eula/

//
// Package extractor is used for quickly extracting PDF content through a simple interface.
// Currently offers functionality for extracting textual content.
//
package extractor ;import (_db "bytes";_b "errors";_eg "fmt";_bb "github.com/unidoc/unipdf/v3/common";_de "github.com/unidoc/unipdf/v3/contentstream";_ga "github.com/unidoc/unipdf/v3/core";_ge "github.com/unidoc/unipdf/v3/internal/license";_fa "github.com/unidoc/unipdf/v3/internal/textencoding";
|
||
_cb "github.com/unidoc/unipdf/v3/internal/transform";_bg "github.com/unidoc/unipdf/v3/model";_gd "golang.org/x/image/draw";_a "golang.org/x/text/unicode/norm";_egd "golang.org/x/xerrors";_be "image";_g "image/color";_c "io";_ef "math";_dc "reflect";_bf "regexp";
|
||
_ff "sort";_f "strings";_e "unicode";_ba "unicode/utf8";);func (_ffdc *subpath )isQuadrilateral ()bool {if len (_ffdc ._egfb )< 4||len (_ffdc ._egfb )> 5{return false ;};if len (_ffdc ._egfb )==5{_dgag :=_ffdc ._egfb [0];_ecfb :=_ffdc ._egfb [4];if _dgag .X !=_ecfb .X ||_dgag .Y !=_ecfb .Y {return false ;
|
||
};};return true ;};func (_gaggg paraList )writeText (_cgae _c .Writer ){for _cefe ,_efeb :=range _gaggg {if _efeb ._fcag {continue ;};_efeb .writeText (_cgae );if _cefe !=len (_gaggg )-1{if _dgee (_efeb ,_gaggg [_cefe +1]){_cgae .Write ([]byte ("\u0020"));
|
||
}else {_cgae .Write ([]byte ("\u000a"));_cgae .Write ([]byte ("\u000a"));};};};_cgae .Write ([]byte ("\u000a"));_cgae .Write ([]byte ("\u000a"));};func (_dgdg *stateStack )empty ()bool {return len (*_dgdg )==0};func _gbbd (_feef *textLine ,_cefc []*textLine ,_ggfdc []float64 ,_gdff ,_ddab float64 )[]*textLine {_cdee :=[]*textLine {};
|
||
for _ ,_aefa :=range _cefc {if _aefa ._ffbb >=_gdff {if _ddab !=-1&&_aefa ._ffbb < _ddab {if _aefa .text ()!=_feef .text (){if _ef .Round (_aefa .Llx )< _ef .Round (_feef .Llx ){break ;};_cdee =append (_cdee ,_aefa );};}else if _ddab ==-1{if _aefa ._ffbb ==_feef ._ffbb {if _aefa .text ()!=_feef .text (){_cdee =append (_cdee ,_aefa );
|
||
};continue ;};_bec :=_debb (_feef ,_cefc ,_ggfdc );if _bec !=-1&&_aefa ._ffbb <=_bec {_cdee =append (_cdee ,_aefa );};};};};return _cdee ;};func (_efee rectRuling )asRuling ()(*ruling ,bool ){_egfbb :=ruling {_ffdb :_efee ._fbgd ,Color :_efee .Color ,_cbbcc :_cede };
|
||
switch _efee ._fbgd {case _gfbd :_egfbb ._ecdge =0.5*(_efee .Llx +_efee .Urx );_egfbb ._bafb =_efee .Lly ;_egfbb ._cbdc =_efee .Ury ;_dggf ,_deefe :=_efee .checkWidth (_efee .Llx ,_efee .Urx );if !_deefe {if _dfdg {_bb .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067V\u0065\u0072\u0074\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_efee );
|
||
};return nil ,false ;};_egfbb ._gafcd =_dggf ;case _fcec :_egfbb ._ecdge =0.5*(_efee .Lly +_efee .Ury );_egfbb ._bafb =_efee .Llx ;_egfbb ._cbdc =_efee .Urx ;_gbea ,_agfc :=_efee .checkWidth (_efee .Lly ,_efee .Ury );if !_agfc {if _dfdg {_bb .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067H\u006f\u0072\u007a\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_efee );
|
||
};return nil ,false ;};_egfbb ._gafcd =_gbea ;default:_bb .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_efee ._fbgd );return nil ,false ;};return &_egfbb ,true ;};func (_ddbd *textObject )moveLP (_bgf ,_beg float64 ){_ddbd ._bbde .Concat (_cb .NewMatrix (1,0,0,1,_bgf ,_beg ));
|
||
_ddbd ._eaaf =_ddbd ._bbde ;};func (_ccca *wordBag )depthBand (_dafb ,_ffaf float64 )[]int {if len (_ccca ._dceb )==0{return nil ;};return _ccca .depthRange (_ccca .getDepthIdx (_dafb ),_ccca .getDepthIdx (_ffaf ));};func _dfcgg (_dbeg []*textLine )[]*textLine {_bgda :=[]*textLine {};
|
||
for _ ,_cadf :=range _dbeg {_bddc :=_cadf .text ();_ebf :=_fabd .Find ([]byte (_bddc ));if _ebf !=nil {_bgda =append (_bgda ,_cadf );};};return _bgda ;};
|
||
|
||
// Text returns the text content of the `bulletLists`.
|
||
func (_acae *lists )Text ()string {_bceb :=&_f .Builder {};for _ ,_dbga :=range *_acae {_bdgc :=_dbga .Text ();_bceb .WriteString (_bdgc );};return _bceb .String ();};func _efde (_bdcfd []*textMark ,_geeg _bg .PdfRectangle ,_daacg rulingList ,_afac []gridTiling ,_ffde bool )paraList {_bb .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_bdcfd ),_geeg );
|
||
if len (_bdcfd )==0{return nil ;};_eaff :=_fbag (_bdcfd ,_geeg );if len (_eaff )==0{return nil ;};_daacg .log ("\u006d\u0061\u006be\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065");_cegfc ,_gbaf :=_daacg .vertsHorzs ();_bfbe :=_ccdg (_eaff ,_geeg .Ury ,_cegfc ,_gbaf );
|
||
_cgdg :=_aged (_bfbe ,_geeg .Ury ,_cegfc ,_gbaf );_cgdg =_cgdb (_cgdg );_abbbe :=make (paraList ,0,len (_cgdg ));for _ ,_cdeb :=range _cgdg {_bgbgb :=_cdeb .arrangeText ();if _bgbgb !=nil {_abbbe =append (_abbbe ,_bgbgb );};};if !_ffde &&len (_abbbe )>=_bbgee {_abbbe =_abbbe .extractTables (_afac );
|
||
};_abbbe .sortReadingOrder ();if !_ffde {_abbbe .sortTopoOrder ();};_abbbe .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _abbbe ;};func (_bdaea *subpath )makeRectRuling (_gbed _g .Color )(*ruling ,bool ){if _dfdg {_bb .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_bdaea );
|
||
};_afgad :=_bdaea ._egfb [:4];_aafa :=make (map[int ]rulingKind ,len (_afgad ));for _cbdd ,_bdgcc :=range _afgad {_ggda :=_bdaea ._egfb [(_cbdd +1)%4];_aafa [_cbdd ]=_bggg (_bdgcc ,_ggda );if _dfdg {_eg .Printf ("\u0025\u0034\u0064: \u0025\u0073\u0020\u003d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066",_cbdd ,_aafa [_cbdd ],_bdgcc ,_ggda );
|
||
};};if _dfdg {_eg .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_aafa );};var _eeaa ,_afbdc []int ;for _bfbdd ,_gaef :=range _aafa {switch _gaef {case _fcec :_afbdc =append (_afbdc ,_bfbdd );case _gfbd :_eeaa =append (_eeaa ,_bfbdd );
|
||
};};if _dfdg {_eg .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_afbdc ),_afbdc );_eg .Printf ("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_eeaa ),_eeaa );
|
||
};_edef :=(len (_afbdc )==2&&len (_eeaa )==2)||(len (_afbdc )==2&&len (_eeaa )==0&&_gdcee (_afgad [_afbdc [0]],_afgad [_afbdc [1]]))||(len (_eeaa )==2&&len (_afbdc )==0&&_debfd (_afgad [_eeaa [0]],_afgad [_eeaa [1]]));if _dfdg {_eg .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_afbdc ),len (_eeaa ),_edef );
|
||
};if !_edef {if _dfdg {_bb .Log .Error ("\u0021!\u006d\u0061\u006b\u0065R\u0065\u0063\u0074\u0052\u0075l\u0069n\u0067:\u0020\u0070\u0061\u0074\u0068\u003d\u0025v",_bdaea );_eg .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_afbdc ),len (_eeaa ),_edef );
|
||
};return &ruling {},false ;};if len (_eeaa )==0{for _faega ,_afaf :=range _aafa {if _afaf !=_fcec {_eeaa =append (_eeaa ,_faega );};};};if len (_afbdc )==0{for _acbd ,_gabff :=range _aafa {if _gabff !=_gfbd {_afbdc =append (_afbdc ,_acbd );};};};if _dfdg {_bb .Log .Info ("\u006da\u006b\u0065R\u0065\u0063\u0074\u0052u\u006c\u0069\u006eg\u003a\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025d \u0076\u0065\u0072t\u0073\u003d%\u0064\u0020\u0070\u006f\u0069\u006et\u0073\u003d%\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076",len (_afbdc ),len (_eeaa ),len (_afgad ),_afbdc ,_eeaa ,_afgad );
|
||
};var _aecad ,_defff ,_bdbb ,_cbgadg _cb .Point ;if _afgad [_afbdc [0]].Y > _afgad [_afbdc [1]].Y {_bdbb ,_cbgadg =_afgad [_afbdc [0]],_afgad [_afbdc [1]];}else {_bdbb ,_cbgadg =_afgad [_afbdc [1]],_afgad [_afbdc [0]];};if _afgad [_eeaa [0]].X > _afgad [_eeaa [1]].X {_aecad ,_defff =_afgad [_eeaa [0]],_afgad [_eeaa [1]];
|
||
}else {_aecad ,_defff =_afgad [_eeaa [1]],_afgad [_eeaa [0]];};_bbdeae :=_bg .PdfRectangle {Llx :_aecad .X ,Urx :_defff .X ,Lly :_cbgadg .Y ,Ury :_bdbb .Y };if _bbdeae .Llx > _bbdeae .Urx {_bbdeae .Llx ,_bbdeae .Urx =_bbdeae .Urx ,_bbdeae .Llx ;};if _bbdeae .Lly > _bbdeae .Ury {_bbdeae .Lly ,_bbdeae .Ury =_bbdeae .Ury ,_bbdeae .Lly ;
|
||
};_adgbe :=rectRuling {PdfRectangle :_bbdeae ,_fbgd :_ddbc (_bbdeae ),Color :_gbed };if _adgbe ._fbgd ==_dgeec {if _dfdg {_bb .Log .Error ("\u006da\u006b\u0065\u0052\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006eg\u003a\u0020\u006b\u0069\u006e\u0064\u003d\u006e\u0069\u006c");
|
||
};return nil ,false ;};_fffde ,_ceba :=_adgbe .asRuling ();if !_ceba {if _dfdg {_bb .Log .Error ("\u006da\u006b\u0065\u0052\u0065c\u0074\u0052\u0075\u006c\u0069n\u0067:\u0020!\u0069\u0073\u0052\u0075\u006c\u0069\u006eg");};return nil ,false ;};if _bffg {_eg .Printf ("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a",_fffde .String ());
|
||
};return _fffde ,true ;};func _aecaf (_defbf _bg .PdfRectangle )*ruling {return &ruling {_ffdb :_fcec ,_ecdge :_defbf .Lly ,_bafb :_defbf .Llx ,_cbdc :_defbf .Urx };};func _eaefd (_gcbc string )(string ,bool ){_adad :=[]rune (_gcbc );if len (_adad )!=1{return "",false ;
|
||
};_febbe ,_eddc :=_edcb [_adad [0]];return _febbe ,_eddc ;};
|
||
|
||
// NewFromContents creates a new extractor from contents and page resources.
|
||
func NewFromContents (contents string ,resources *_bg .PdfPageResources )(*Extractor ,error ){const _bgb ="\u0065x\u0074\u0072\u0061\u0063t\u006f\u0072\u002e\u004e\u0065w\u0046r\u006fm\u0043\u006f\u006e\u0074\u0065\u006e\u0074s";_cfc :=&Extractor {_cd :contents ,_ee :resources ,_cf :map[string ]fontEntry {},_ce :map[string ]textResult {}};
|
||
_ge .TrackUse (_bgb );return _cfc ,nil ;};func _bdfa (_afggf map[float64 ]gridTile )[]float64 {_fgae :=make ([]float64 ,0,len (_afggf ));for _caeef :=range _afggf {_fgae =append (_fgae ,_caeef );};_ff .Float64s (_fgae );return _fgae ;};func (_cceb TextTable )getCellInfo (_bfcec TextMark )[][]int {for _aaff ,_ecfc :=range _cceb .Cells {for _cdc ,_gde :=range _ecfc {_dbe :=&_gde .Marks ;
|
||
if _dbe .exists (_bfcec ){return [][]int {{_aaff },{_cdc }};};};};return nil ;};func (_gecd *shapesState )cubicTo (_dgec ,_cefd ,_gdb ,_ebba ,_cfed ,_cdbc float64 ){if _ggbc {_bb .Log .Info ("\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a");};_gecd .addPoint (_cfed ,_cdbc );
|
||
};func _cgfc (_cdbb structElement )[]structElement {_gggd :=[]structElement {};for _ ,_caac :=range _cdbb ._acdgb {for _ ,_dcbcg :=range _caac ._acdgb {for _ ,_ccfd :=range _dcbcg ._acdgb {if _ccfd ._ddag =="\u004c"{_gggd =append (_gggd ,_ccfd );};};};
|
||
};return _gggd ;};func (_ddca rulingList )blocks (_bdggf ,_cdfeg *ruling )bool {if _bdggf ._bafb > _cdfeg ._cbdc ||_cdfeg ._bafb > _bdggf ._cbdc {return false ;};_cfecc :=_ef .Max (_bdggf ._bafb ,_cdfeg ._bafb );_gcbg :=_ef .Min (_bdggf ._cbdc ,_cdfeg ._cbdc );
|
||
if _bdggf ._ecdge > _cdfeg ._ecdge {_bdggf ,_cdfeg =_cdfeg ,_bdggf ;};for _ ,_bdeb :=range _ddca {if _bdggf ._ecdge <=_bdeb ._ecdge +_ceaf &&_bdeb ._ecdge <=_cdfeg ._ecdge +_ceaf &&_bdeb ._bafb <=_gcbg &&_cfecc <=_bdeb ._cbdc {return true ;};};return false ;
|
||
};func (_gadf *textTable )newTablePara ()*textPara {_ccffce :=_gadf .computeBbox ();_fgeaf :=&textPara {PdfRectangle :_ccffce ,_aaga :_ccffce ,_gbgg :_gadf };if _aeff {_bb .Log .Info ("\u006e\u0065w\u0054\u0061\u0062l\u0065\u0050\u0061\u0072\u0061\u003a\u0020\u0025\u0073",_fgeaf );
|
||
};return _fgeaf ;};
|
||
|
||
// String returns a description of `p`.
|
||
func (_fbga *textPara )String ()string {if _fbga ._fcag {return _eg .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u005b\u0045\u004d\u0050\u0054\u0059\u005d",_fbga .PdfRectangle );};_fdagg :="";if _fbga ._gbgg !=nil {_fdagg =_eg .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_fbga ._gbgg ._afaff ,_fbga ._gbgg ._gaaa );
|
||
};return _eg .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_fbga .PdfRectangle ,_fdagg ,len (_fbga ._bbagc ),_egde (_fbga .text (),50));};func (_fggd paraList )findTableGrid (_degg gridTiling )(*textTable ,map[*textPara ]struct{}){_fggec :=len (_degg ._dbdc );
|
||
_cbddg :=len (_degg ._eaag );_cbafg :=textTable {_cfabb :true ,_afaff :_fggec ,_gaaa :_cbddg ,_feeba :make (map[uint64 ]*textPara ,_fggec *_cbddg ),_bgag :make (map[uint64 ]compositeCell ,_fggec *_cbddg )};_cbafg .PdfRectangle =_degg .PdfRectangle ;_eaefe :=make (map[*textPara ]struct{});
|
||
_adcf :=int ((1.0-_abfgg )*float64 (_fggec *_cbddg ));_gcfg :=0;if _dcdd {_bb .Log .Info ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0064\u0020\u0078\u0020%\u0064",_fggec ,_cbddg );};for _caebc ,_gdfb :=range _degg ._eaag {_cdbe ,_dfff :=_degg ._eagfa [_gdfb ];
|
||
if !_dfff {continue ;};for _bacce ,_gdcbc :=range _degg ._dbdc {_daad ,_bagcd :=_cdbe [_gdcbc ];if !_bagcd {continue ;};_feafc :=_fggd .inTile (_daad );if len (_feafc )==0{_gcfg ++;if _gcfg > _adcf {if _dcdd {_bb .Log .Info ("\u0021\u006e\u0075m\u0045\u006d\u0070\u0074\u0079\u003d\u0025\u0064",_gcfg );
|
||
};return nil ,nil ;};}else {_cbafg .putComposite (_bacce ,_caebc ,_feafc ,_daad .PdfRectangle );for _ ,_fcaec :=range _feafc {_eaefe [_fcaec ]=struct{}{};};};};};_cddfc :=0;for _dbfd :=0;_dbfd < _fggec ;_dbfd ++{_agbea :=_cbafg .get (_dbfd ,0);if _agbea ==nil ||!_agbea ._fcag {_cddfc ++;
|
||
};};if _cddfc ==0{if _dcdd {_bb .Log .Info ("\u0021\u006e\u0075m\u0048\u0065\u0061\u0064\u0065\u0072\u003d\u0030");};return nil ,nil ;};_dfaee :=_cbafg .reduceTiling (_degg ,_ddbac );_dfaee =_dfaee .subdivide ();return _dfaee ,_eaefe ;};const (_bcc ="\u0045\u0052R\u004f\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074\u002c\u0020\u0069\u006e\u0076\u0061\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065";
|
||
_ag ="\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0043a\u006e\u0027\u0074 g\u0065\u0074\u0020\u0066\u006f\u006et\u0020\u0070\u0072\u006f\u0070\u0065\u0072\u0074\u0069\u0065\u0073\u002c\u0020\u0066\u006fn\u0074\u0020\u006e\u006f\u0074\u0020\u0066\u006fu\u006e\u0064";
|
||
_gad ="\u0045\u0052\u0052O\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0067\u0065\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u0073\u0074\u0072\u0065\u0061\u006d\u002c\u0020\u0069\u006e\u0076a\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065";);
|
||
func _cegd (_dcea ,_cbgf float64 )string {_ebge :=!_cadfe (_dcea -_cbgf );if _ebge {return "\u000a";};return "\u0020";};func (_ecef *shapesState )quadraticTo (_bgde ,_aebb ,_caga ,_fgef float64 ){if _ggbc {_bb .Log .Info ("\u0071\u0075\u0061d\u0072\u0061\u0074\u0069\u0063\u0054\u006f\u003a");
|
||
};_ecef .addPoint (_caga ,_fgef );};func (_afa *stateStack )push (_gcfc *textState ){_geb :=*_gcfc ;*_afa =append (*_afa ,&_geb )};func _dgebc (_bdfbg []pathSection ){if _bagd < 0.0{return ;};if _bffg {_bb .Log .Info ("\u0067\u0072\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020\u0025\u0064 \u0073u\u0062\u0070\u0061\u0074\u0068\u0020\u0073\u0065\u0063\u0074\u0069\u006f\u006e\u0073",len (_bdfbg ));
|
||
};for _ffaea ,_babdd :=range _bdfbg {for _dcaae ,_bcgbd :=range _babdd ._aadg {for _gede ,_ccdbd :=range _bcgbd ._egfb {_bcgbd ._egfb [_gede ]=_cb .Point {X :_fdae (_ccdbd .X ),Y :_fdae (_ccdbd .Y )};if _bffg {_egdbe :=_bcgbd ._egfb [_gede ];if !_babg (_ccdbd ,_egdbe ){_ggaf :=_cb .Point {X :_egdbe .X -_ccdbd .X ,Y :_egdbe .Y -_ccdbd .Y };
|
||
_eg .Printf ("\u0025\u0034d \u002d\u0020\u00254\u0064\u0020\u002d\u0020%4d\u003a %\u002e\u0032\u0066\u0020\u2192\u0020\u0025.2\u0066\u0020\u0028\u0025\u0067\u0029\u000a",_ffaea ,_dcaae ,_gede ,_ccdbd ,_egdbe ,_ggaf );};};};};};};type intSet map[int ]struct{};
|
||
func (_bdga *wordBag )allWords ()[]*textWord {var _fccg []*textWord ;for _ ,_agca :=range _bdga ._dceb {_fccg =append (_fccg ,_agca ...);};return _fccg ;};const (_dgeec rulingKind =iota ;_fcec ;_gfbd ;);func _cgdb (_gge []*wordBag )[]*wordBag {if len (_gge )<=1{return _gge ;
|
||
};if _cdgd {_bb .Log .Info ("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a");};_ff .Slice (_gge ,func (_ccbf ,_caggb int )bool {_aedgc ,_dbab :=_gge [_ccbf ],_gge [_caggb ];_ddeb :=_aedgc .Width ()*_aedgc .Height ();_efgb :=_dbab .Width ()*_dbab .Height ();
|
||
if _ddeb !=_efgb {return _ddeb > _efgb ;};if _aedgc .Height ()!=_dbab .Height (){return _aedgc .Height ()> _dbab .Height ();};return _ccbf < _caggb ;});var _gdc []*wordBag ;_bccac :=make (intSet );for _afee :=0;_afee < len (_gge );_afee ++{if _bccac .has (_afee ){continue ;
|
||
};_gaga :=_gge [_afee ];for _cabd :=_afee +1;_cabd < len (_gge );_cabd ++{if _bccac .has (_afee ){continue ;};_bgfe :=_gge [_cabd ];_bgc :=_gaga .PdfRectangle ;_bgc .Llx -=_gaga ._ddgcf ;if _adfd (_bgc ,_bgfe .PdfRectangle ){_gaga .absorb (_bgfe );_bccac .add (_cabd );
|
||
};};_gdc =append (_gdc ,_gaga );};if len (_gge )!=len (_gdc )+len (_bccac ){_bb .Log .Error ("\u006d\u0065\u0072ge\u0057\u006f\u0072\u0064\u0042\u0061\u0067\u0073\u003a \u0025d\u2192%\u0064 \u0061\u0062\u0073\u006f\u0072\u0062\u0065\u0064\u003d\u0025\u0064",len (_gge ),len (_gdc ),len (_bccac ));
|
||
};return _gdc ;};
|
||
|
||
// String returns a human readable description of `path`.
|
||
func (_dag *subpath )String ()string {_cbbc :=_dag ._egfb ;_agcg :=len (_cbbc );if _agcg <=5{return _eg .Sprintf ("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f",_agcg ,_cbbc );};return _eg .Sprintf ("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f",_agcg ,_cbbc [0],_cbbc [1],_cbbc [_agcg -1]);
|
||
};func _cggec (_bgge *list ,_bdgd *string )string {_gaca :=_f .Split (_bgge ._feed ,"\u000a");_eca :=&_f .Builder {};for _ ,_afde :=range _gaca {if _afde !=""{_eca .WriteString (*_bdgd );_eca .WriteString (_afde );_eca .WriteString ("\u000a");};};return _eca .String ();
|
||
};func _cebc (_ebga float64 )bool {return _ef .Abs (_ebga )< _ceaf };func (_dafae paraList )list ()[]*list {var _abca []*textLine ;var _eagd []*textLine ;for _ ,_daed :=range _dafae {_gdffg :=_daed .getListLines ();_abca =append (_abca ,_gdffg ...);_eagd =append (_eagd ,_daed ._bbagc ...);
|
||
};_cbegg :=_ccfde (_abca );_fdde :=_gdba (_eagd ,_cbegg );return _fdde ;};
|
||
|
||
// New returns an Extractor instance for extracting content from the input PDF page.
|
||
func New (page *_bg .PdfPage )(*Extractor ,error ){return NewWithOptions (page ,nil )};func _dgee (_agag ,_fecbd *textPara )bool {if _agag ._fcag ||_fecbd ._fcag {return true ;};return _cadfe (_agag .depth ()-_fecbd .depth ());};func (_bgdfc *shapesState )clearPath (){_bgdfc ._bfeb =nil ;
|
||
_bgdfc ._abac =false ;if _ggbc {_bb .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_bgdfc );};};func (_adaaf *textPara )fontsize ()float64 {return _adaaf ._bbagc [0]._bbeb };func (_bbee *textLine )appendWord (_agfg *textWord ){_bbee ._ggdd =append (_bbee ._ggdd ,_agfg );
|
||
_bbee .PdfRectangle =_dfba (_bbee .PdfRectangle ,_agfg .PdfRectangle );if _agfg ._beaeg > _bbee ._bbeb {_bbee ._bbeb =_agfg ._beaeg ;};if _agfg ._gdfbg > _bbee ._ffbb {_bbee ._ffbb =_agfg ._gdfbg ;};};func _gdba (_fgdg []*textLine ,_ccee map[float64 ][]*textLine )[]*list {_efbc :=_gbbc (_ccee );
|
||
_cfba :=[]*list {};if len (_efbc )==0{return _cfba ;};_edgf :=_efbc [0];_afbe :=1;_gdfgf :=_ccee [_edgf ];for _gdag ,_fbdc :=range _gdfgf {var _gdbdf float64 ;_aff :=[]*list {};_bgdc :=_fbdc ._ffbb ;_dcf :=-1.0;if _gdag < len (_gdfgf )-1{_dcf =_gdfgf [_gdag +1]._ffbb ;
|
||
};if _afbe < len (_efbc ){_aff =_gafc (_fgdg ,_ccee ,_efbc ,_afbe ,_bgdc ,_dcf );};_gdbdf =_dcf ;if len (_aff )> 0{_effb :=_aff [0];if len (_effb ._dgef )> 0{_gdbdf =_effb ._dgef [0]._ffbb ;};};_bcdg :=[]*textLine {_fbdc };_fbae :=_gbbd (_fbdc ,_fgdg ,_efbc ,_bgdc ,_gdbdf );
|
||
_bcdg =append (_bcdg ,_fbae ...);_bcdge :=_gfea (_bcdg ,"\u0062\u0075\u006c\u006c\u0065\u0074",_aff );_bcdge ._feed =_abfbf (_bcdg ,"");_cfba =append (_cfba ,_bcdge );};return _cfba ;};func (_gfb *subpath )close (){if !_babg (_gfb ._egfb [0],_gfb .last ()){_gfb .add (_gfb ._egfb [0]);
|
||
};_gfb ._ffff =true ;_gfb .removeDuplicates ();};func _cadfe (_bgaga float64 )bool {return _ef .Abs (_bgaga )< _adcc };
|
||
|
||
// ToTextMark returns the public view of `tm`.
|
||
func (_cddf *textMark )ToTextMark ()TextMark {return TextMark {Text :_cddf ._gcgb ,Original :_cddf ._ebaa ,BBox :_cddf ._cedgd ,Font :_cddf ._gcded ,FontSize :_cddf ._cffb ,FillColor :_cddf ._dfbf ,StrokeColor :_cddf ._ffdf ,Orientation :_cddf ._eabg ,DirectObject :_cddf ._eggb ,ObjString :_cddf ._fcbgg ,Tw :_cddf .Tw ,Th :_cddf .Th ,Tc :_cddf ._gaac ,Index :_cddf ._fcadf };
|
||
};func (_gca *textObject )showTextAdjusted (_bgbd *_ga .PdfObjectArray ,_gdgab int )error {_ffd :=false ;for _ ,_abcb :=range _bgbd .Elements (){switch _abcb .(type ){case *_ga .PdfObjectFloat ,*_ga .PdfObjectInteger :_gfa ,_facc :=_ga .GetNumberAsFloat (_abcb );
|
||
if _facc !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_abcb ,_bgbd );
|
||
return _facc ;};_agg ,_gfac :=-_gfa *0.001*_gca ._aed ._fbda ,0.0;if _ffd {_gfac ,_agg =_agg ,_gfac ;};_fgc :=_cgge (_cb .Point {X :_agg ,Y :_gfac });_gca ._eaaf .Concat (_fgc );case *_ga .PdfObjectString :_cbb :=_ga .TraceToDirectObject (_abcb );_aebe ,_bcab :=_ga .GetStringBytes (_cbb );
|
||
if !_bcab {_bb .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_abcb ,_bgbd );
|
||
return _ga .ErrTypeError ;};_gca .renderText (_cbb ,_aebe ,_gdgab );default:_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_abcb ,_bgbd );
|
||
return _ga .ErrTypeError ;};};return nil ;};func _caccb (_fgba int ,_afdg map[int ][]float64 )([]int ,int ){_eedd :=make ([]int ,_fgba );_bedf :=0;for _gbddf :=0;_gbddf < _fgba ;_gbddf ++{_eedd [_gbddf ]=_bedf ;_bedf +=len (_afdg [_gbddf ])+1;};return _eedd ,_bedf ;
|
||
};func _gbbce (_cfgcge ,_aeebf int )uint64 {return uint64 (_cfgcge )*0x1000000+uint64 (_aeebf )};func (_dcdf paraList )log (_gggce string ){if !_bdcc {return ;};_bb .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_gggce ,len (_dcdf ));
|
||
for _gbdc ,_eaef :=range _dcdf {if _eaef ==nil {continue ;};_adef :=_eaef .text ();_gecde :="\u0020\u0020";if _eaef ._gbgg !=nil {_gecde =_eg .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_eaef ._gbgg ._afaff ,_eaef ._gbgg ._gaaa );};_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_gbdc ,_eaef .PdfRectangle ,_gecde ,_egde (_adef ,50));
|
||
};};const _egdf =1.0/1000.0;func (_abbe *imageExtractContext )extractFormImages (_bfe *_ga .PdfObjectName ,_cdbf _de .GraphicsState ,_bbe *_bg .PdfPageResources )error {_cbd ,_bgbb :=_bbe .GetXObjectFormByName (*_bfe );if _bgbb !=nil {return _bgbb ;};if _cbd ==nil {return nil ;
|
||
};_fcf ,_bgbb :=_cbd .GetContentStream ();if _bgbb !=nil {return _bgbb ;};_ged :=_cbd .Resources ;if _ged ==nil {_ged =_bbe ;};_bgbb =_abbe .extractContentStreamImages (string (_fcf ),_ged );if _bgbb !=nil {return _bgbb ;};_abbe ._bac ++;return nil ;};
|
||
func (_bffe paraList )yNeighbours (_ggee float64 )map[*textPara ][]int {_bbfed :=make ([]event ,2*len (_bffe ));if _ggee ==0{for _fbgab ,_afeg :=range _bffe {_bbfed [2*_fbgab ]=event {_afeg .Lly ,true ,_fbgab };_bbfed [2*_fbgab +1]=event {_afeg .Ury ,false ,_fbgab };
|
||
};}else {for _faafg ,_aagbc :=range _bffe {_bbfed [2*_faafg ]=event {_aagbc .Lly -_ggee *_aagbc .fontsize (),true ,_faafg };_bbfed [2*_faafg +1]=event {_aagbc .Ury +_ggee *_aagbc .fontsize (),false ,_faafg };};};return _bffe .eventNeighbours (_bbfed );
|
||
};
|
||
|
||
// String returns a description of `state`.
|
||
func (_cdfe *textState )String ()string {_babb :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _cdfe ._ecd !=nil {_babb =_cdfe ._ecd .BaseFont ();};return _eg .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_cdfe ._edf ,_cdfe ._dedcd ,_cdfe ._fbda ,_babb );
|
||
};
|
||
|
||
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
||
// All coordinates are in device coordinates.
|
||
type ImageMark struct{Image *_bg .Image ;
|
||
|
||
// Dimensions of the image as displayed in the PDF.
|
||
Width float64 ;Height float64 ;
|
||
|
||
// Position of the image in PDF coordinates (lower left corner).
|
||
X float64 ;Y float64 ;
|
||
|
||
// Angle in degrees, if rotated.
|
||
Angle float64 ;};func _gdcbd (_cbcd ,_eebg _cb .Point ,_gabg _g .Color )(*ruling ,bool ){_bfgba :=lineRuling {_gabf :_cbcd ,_fccf :_eebg ,_fcfg :_bgfgg (_cbcd ,_eebg ),Color :_gabg };if _bfgba ._fcfg ==_dgeec {return nil ,false ;};return _bfgba .asRuling ();
|
||
};func (_dfg *textObject )moveTextSetLeading (_fgeg ,_ege float64 ){_dfg ._aed ._ddba =-_ege ;_dfg .moveLP (_fgeg ,_ege );};func _dfea (_ebbf ,_acef bounded )float64 {return _ebbf .bbox ().Llx -_acef .bbox ().Urx };func _caggg (_fafcb _bg .PdfRectangle )*ruling {return &ruling {_ffdb :_gfbd ,_ecdge :_fafcb .Urx ,_bafb :_fafcb .Lly ,_cbdc :_fafcb .Ury };
|
||
};
|
||
|
||
// TextMarkArray is a collection of TextMarks.
|
||
type TextMarkArray struct{_dafa []TextMark };func _aedac (_afec *paraList )map[int ][]*textLine {_efc :=map[int ][]*textLine {};for _ ,_aacc :=range *_afec {for _ ,_eba :=range _aacc ._bbagc {if !_egba (_eba ){_bb .Log .Debug ("g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e");
|
||
continue ;};_agcfb :=_eba ._ggdd [0]._dbff [0]._egaa ;_efc [_agcfb ]=append (_efc [_agcfb ],_eba );};if _aacc ._gbgg !=nil {_agec :=_aacc ._gbgg ._feeba ;for _ ,_gggc :=range _agec {for _ ,_cbec :=range _gggc ._bbagc {if !_egba (_cbec ){_bb .Log .Debug ("g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e");
|
||
continue ;};_gecf :=_cbec ._ggdd [0]._dbff [0]._egaa ;_efc [_gecf ]=append (_efc [_gecf ],_cbec );};};};};return _efc ;};type bounded interface{bbox ()_bg .PdfRectangle };func _fecd (_faffg []pathSection )rulingList {_dgebc (_faffg );if _bffg {_bb .Log .Info ("\u006da\u006b\u0065\u0046\u0069l\u006c\u0052\u0075\u006c\u0069n\u0067s\u003a \u0025\u0064\u0020\u0066\u0069\u006c\u006cs",len (_faffg ));
|
||
};var _edfd rulingList ;for _ ,_debd :=range _faffg {for _ ,_babdb :=range _debd ._aadg {if !_babdb .isQuadrilateral (){if _bffg {_bb .Log .Error ("!\u0069s\u0051\u0075\u0061\u0064\u0072\u0069\u006c\u0061t\u0065\u0072\u0061\u006c: \u0025\u0073",_babdb );
|
||
};continue ;};if _gbab ,_gfcd :=_babdb .makeRectRuling (_debd .Color );_gfcd {_edfd =append (_edfd ,_gbab );}else {if _dfdg {_bb .Log .Error ("\u0021\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0073",_babdb );
|
||
};};};};if _bffg {_bb .Log .Info ("\u006d\u0061\u006b\u0065Fi\u006c\u006c\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0073",_edfd .String ());};return _edfd ;};func (_bea *textObject )getFontDirect (_agaf string )(*_bg .PdfFont ,error ){_fee ,_ecgg :=_bea .getFontDict (_agaf );
|
||
if _ecgg !=nil {return nil ,_ecgg ;};_bdgg ,_ecgg :=_bg .NewPdfFontFromPdfObject (_fee );if _ecgg !=nil {_bb .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_agaf ,_ecgg );
|
||
};return _bdgg ,_ecgg ;};func (_gcec paraList )llyOrdering ()[]int {_cgb :=make ([]int ,len (_gcec ));for _dbebb :=range _gcec {_cgb [_dbebb ]=_dbebb ;};_ff .SliceStable (_cgb ,func (_bcdc ,_egfac int )bool {_cbab ,_dbed :=_cgb [_bcdc ],_cgb [_egfac ];
|
||
return _gcec [_cbab ].Lly < _gcec [_dbed ].Lly ;});return _cgb ;};func _deda (_ffgf func (*wordBag ,*textWord ,float64 )bool ,_ddce float64 )func (*wordBag ,*textWord )bool {return func (_afefg *wordBag ,_ecde *textWord )bool {return _ffgf (_afefg ,_ecde ,_ddce )};
|
||
};func (_afbac paraList )addNeighbours (){_fgag :=func (_cded []int ,_aedd *textPara )([]*textPara ,[]*textPara ){_bggef :=make ([]*textPara ,0,len (_cded )-1);_geebca :=make ([]*textPara ,0,len (_cded )-1);for _ ,_ecdb :=range _cded {_acbfa :=_afbac [_ecdb ];
|
||
if _acbfa .Urx <=_aedd .Llx {_bggef =append (_bggef ,_acbfa );}else if _acbfa .Llx >=_aedd .Urx {_geebca =append (_geebca ,_acbfa );};};return _bggef ,_geebca ;};_dfbgd :=func (_gbcgc []int ,_efffe *textPara )([]*textPara ,[]*textPara ){_cdgaa :=make ([]*textPara ,0,len (_gbcgc )-1);
|
||
_ffcda :=make ([]*textPara ,0,len (_gbcgc )-1);for _ ,_cffeg :=range _gbcgc {_deabe :=_afbac [_cffeg ];if _deabe .Ury <=_efffe .Lly {_ffcda =append (_ffcda ,_deabe );}else if _deabe .Lly >=_efffe .Ury {_cdgaa =append (_cdgaa ,_deabe );};};return _cdgaa ,_ffcda ;
|
||
};_fefac :=_afbac .yNeighbours (_fbc );for _ ,_bagf :=range _afbac {_gegb :=_fefac [_bagf ];if len (_gegb )==0{continue ;};_eaaffc ,_bdbdf :=_fgag (_gegb ,_bagf );if len (_eaaffc )==0&&len (_bdbdf )==0{continue ;};if len (_eaaffc )> 0{_gbgd :=_eaaffc [0];
|
||
for _ ,_bedc :=range _eaaffc [1:]{if _bedc .Urx >=_gbgd .Urx {_gbgd =_bedc ;};};for _ ,_aeafd :=range _eaaffc {if _aeafd !=_gbgd &&_aeafd .Urx > _gbgd .Llx {_gbgd =nil ;break ;};};if _gbgd !=nil &&_cfa (_bagf .PdfRectangle ,_gbgd .PdfRectangle ){_bagf ._dgaf =_gbgd ;
|
||
};};if len (_bdbdf )> 0{_abacad :=_bdbdf [0];for _ ,_adgad :=range _bdbdf [1:]{if _adgad .Llx <=_abacad .Llx {_abacad =_adgad ;};};for _ ,_gacb :=range _bdbdf {if _gacb !=_abacad &&_gacb .Llx < _abacad .Urx {_abacad =nil ;break ;};};if _abacad !=nil &&_cfa (_bagf .PdfRectangle ,_abacad .PdfRectangle ){_bagf ._gdfgd =_abacad ;
|
||
};};};_fefac =_afbac .xNeighbours (_afdc );for _ ,_fege :=range _afbac {_gcfcg :=_fefac [_fege ];if len (_gcfcg )==0{continue ;};_deege ,_dfgb :=_dfbgd (_gcfcg ,_fege );if len (_deege )==0&&len (_dfgb )==0{continue ;};if len (_dfgb )> 0{_dgggc :=_dfgb [0];
|
||
for _ ,_ebefc :=range _dfgb [1:]{if _ebefc .Ury >=_dgggc .Ury {_dgggc =_ebefc ;};};for _ ,_aaceg :=range _dfgb {if _aaceg !=_dgggc &&_aaceg .Ury > _dgggc .Lly {_dgggc =nil ;break ;};};if _dgggc !=nil &&_gfdb (_fege .PdfRectangle ,_dgggc .PdfRectangle ){_fege ._daag =_dgggc ;
|
||
};};if len (_deege )> 0{_dgfa :=_deege [0];for _ ,_egge :=range _deege [1:]{if _egge .Lly <=_dgfa .Lly {_dgfa =_egge ;};};for _ ,_bdbdb :=range _deege {if _bdbdb !=_dgfa &&_bdbdb .Lly < _dgfa .Ury {_dgfa =nil ;break ;};};if _dgfa !=nil &&_gfdb (_fege .PdfRectangle ,_dgfa .PdfRectangle ){_fege ._eeba =_dgfa ;
|
||
};};};for _ ,_gbbgg :=range _afbac {if _gbbgg ._dgaf !=nil &&_gbbgg ._dgaf ._gdfgd !=_gbbgg {_gbbgg ._dgaf =nil ;};if _gbbgg ._eeba !=nil &&_gbbgg ._eeba ._daag !=_gbbgg {_gbbgg ._eeba =nil ;};if _gbbgg ._gdfgd !=nil &&_gbbgg ._gdfgd ._dgaf !=_gbbgg {_gbbgg ._gdfgd =nil ;
|
||
};if _gbbgg ._daag !=nil &&_gbbgg ._daag ._eeba !=_gbbgg {_gbbgg ._daag =nil ;};};};func _bccb (_fgbe ,_dbbfd float64 )bool {return _ef .Abs (_fgbe -_dbbfd )<=_adca };type ruling struct{_ffdb rulingKind ;_cbbcc markKind ;_g .Color ;_ecdge float64 ;_bafb float64 ;
|
||
_cbdc float64 ;_gafcd float64 ;};func _efdd (_gbge *textWord ,_gcde float64 ,_fffc ,_eae rulingList )*wordBag {_accg :=_adbg (_gbge ._gdfbg );_abfc :=[]*textWord {_gbge };_fdce :=wordBag {_dceb :map[int ][]*textWord {_accg :_abfc },PdfRectangle :_gbge .PdfRectangle ,_ddgcf :_gbge ._beaeg ,_gcdf :_gcde ,_ecgb :_fffc ,_bfadd :_eae };
|
||
return &_fdce ;};func (_fdfd *wordBag )minDepth ()float64 {return _fdfd ._gcdf -(_fdfd .Ury -_fdfd ._ddgcf )};func _bgfgg (_bcbg ,_cfdcg _cb .Point )rulingKind {_bdbf :=_ef .Abs (_bcbg .X -_cfdcg .X );_ggae :=_ef .Abs (_bcbg .Y -_cfdcg .Y );return _abaa (_bdbf ,_ggae ,_afdf );
|
||
};
|
||
|
||
// Text returns the extracted page text.
|
||
func (_fcbg PageText )Text ()string {return _fcbg ._bffc };func (_cffad rulingList )merge ()*ruling {_gbceb :=_cffad [0]._ecdge ;_fdgac :=_cffad [0]._bafb ;_ccddbd :=_cffad [0]._cbdc ;for _ ,_fcecg :=range _cffad [1:]{_gbceb +=_fcecg ._ecdge ;if _fcecg ._bafb < _fdgac {_fdgac =_fcecg ._bafb ;
|
||
};if _fcecg ._cbdc > _ccddbd {_ccddbd =_fcecg ._cbdc ;};};_gbga :=&ruling {_ffdb :_cffad [0]._ffdb ,_cbbcc :_cffad [0]._cbbcc ,Color :_cffad [0].Color ,_ecdge :_gbceb /float64 (len (_cffad )),_bafb :_fdgac ,_cbdc :_ccddbd };if _cfec {_bb .Log .Info ("\u006de\u0072g\u0065\u003a\u0020\u0025\u0032d\u0020\u0076e\u0063\u0073\u0020\u0025\u0073",len (_cffad ),_gbga );
|
||
for _abef ,_bdaf :=range _cffad {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_abef ,_bdaf );};};return _gbga ;};func _dddd (_aadc _bg .PdfRectangle ,_geae bounded )float64 {return _aadc .Ury -_geae .bbox ().Lly };func (_gedg rulingList )sort (){_ff .Slice (_gedg ,_gedg .comp )};
|
||
func _dbcgg (_cfde map[int ]intSet )[]int {_aacg :=make ([]int ,0,len (_cfde ));for _deafd :=range _cfde {_aacg =append (_aacg ,_deafd );};_ff .Ints (_aacg );return _aacg ;};type lists []*list ;func _fbbe (_fddg map[int ][]float64 ){if len (_fddg )<=1{return ;
|
||
};_gcddba :=_cfbe (_fddg );if _aeff {_bb .Log .Info ("\u0066i\u0078C\u0065\u006c\u006c\u0073\u003a \u006b\u0065y\u0073\u003d\u0025\u002b\u0076",_gcddba );};var _fccce ,_ggcd int ;for _fccce ,_ggcd =range _gcddba {if _fddg [_ggcd ]!=nil {break ;};};for _aagb ,_deega :=range _gcddba [_fccce :]{_eecb :=_fddg [_deega ];
|
||
if _eecb ==nil {continue ;};if _aeff {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u006b\u0030\u003d\u0025\u0064\u0020\u006b1\u003d\u0025\u0064\u000a",_fccce +_aagb ,_ggcd ,_deega );};_abeb :=_fddg [_deega ];if _abeb [len (_abeb )-1]> _eecb [0]{_abeb [len (_abeb )-1]=_eecb [0];
|
||
_fddg [_ggcd ]=_abeb ;};_ggcd =_deega ;};};func (_adafb *textObject )setHorizScaling (_gee float64 ){if _adafb ==nil {return ;};_adafb ._aed ._dcc =_gee ;};func (_fefc *textObject )setTextMatrix (_bfba []float64 ){if len (_bfba )!=6{_bb .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_bfba ));
|
||
return ;};_cbf ,_dee ,_ddc ,_bcf ,_gdgad ,_acc :=_bfba [0],_bfba [1],_bfba [2],_bfba [3],_bfba [4],_bfba [5];_fefc ._eaaf =_cb .NewMatrix (_cbf ,_dee ,_ddc ,_bcf ,_gdgad ,_acc );_fefc ._bbde =_fefc ._eaaf ;};func (_gdgbf *textTable )logComposite (_fbcb string ){if !_aeff {return ;
|
||
};_bb .Log .Info ("\u007e~\u007eP\u0061\u0072\u0061\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_gdgbf ._afaff ,_gdgbf ._gaaa ,_fbcb );_eg .Printf ("\u0025\u0035\u0073 \u007c","");for _acdb :=0;_acdb < _gdgbf ._afaff ;_acdb ++{_eg .Printf ("\u0025\u0033\u0064 \u007c",_acdb );
|
||
};_eg .Println ("");_eg .Printf ("\u0025\u0035\u0073 \u002b","");for _cggaf :=0;_cggaf < _gdgbf ._afaff ;_cggaf ++{_eg .Printf ("\u0025\u0033\u0073 \u002b","\u002d\u002d\u002d");};_eg .Println ("");for _bgaf :=0;_bgaf < _gdgbf ._gaaa ;_bgaf ++{_eg .Printf ("\u0025\u0035\u0064 \u007c",_bgaf );
|
||
for _cfdg :=0;_cfdg < _gdgbf ._afaff ;_cfdg ++{_agedb ,_ :=_gdgbf ._bgag [_gbbce (_cfdg ,_bgaf )].parasBBox ();_eg .Printf ("\u0025\u0033\u0064 \u007c",len (_agedb ));};_eg .Println ("");};_bb .Log .Info ("\u007e~\u007eT\u0065\u0078\u0074\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_gdgbf ._afaff ,_gdgbf ._gaaa ,_fbcb );
|
||
_eg .Printf ("\u0025\u0035\u0073 \u007c","");for _cbce :=0;_cbce < _gdgbf ._afaff ;_cbce ++{_eg .Printf ("\u0025\u0031\u0032\u0064\u0020\u007c",_cbce );};_eg .Println ("");_eg .Printf ("\u0025\u0035\u0073 \u002b","");for _cfafd :=0;_cfafd < _gdgbf ._afaff ;
|
||
_cfafd ++{_eg .Print ("\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d-\u002d\u002d\u002d\u002b");};_eg .Println ("");for _ffba :=0;_ffba < _gdgbf ._gaaa ;_ffba ++{_eg .Printf ("\u0025\u0035\u0064 \u007c",_ffba );for _cbffb :=0;_cbffb < _gdgbf ._afaff ;
|
||
_cbffb ++{_bgaad ,_ :=_gdgbf ._bgag [_gbbce (_cbffb ,_ffba )].parasBBox ();_ccefe :="";_efbcd :=_bgaad .merge ();if _efbcd !=nil {_ccefe =_efbcd .text ();};_ccefe =_eg .Sprintf ("\u0025\u0071",_egde (_ccefe ,12));_ccefe =_ccefe [1:len (_ccefe )-1];_eg .Printf ("\u0025\u0031\u0032\u0073\u0020\u007c",_ccefe );
|
||
};_eg .Println ("");};};func (_efeea *textTable )bbox ()_bg .PdfRectangle {return _efeea .PdfRectangle };func _adfd (_egbf ,_ddaa _bg .PdfRectangle )bool {return _egbf .Llx <=_ddaa .Llx &&_ddaa .Urx <=_egbf .Urx &&_egbf .Lly <=_ddaa .Lly &&_ddaa .Ury <=_egbf .Ury ;
|
||
};func _gafc (_aace []*textLine ,_deef map[float64 ][]*textLine ,_fcbad []float64 ,_bgfg int ,_daffa ,_dbea float64 )[]*list {_bcebd :=[]*list {};_aeec :=_bgfg ;_bgfg =_bgfg +1;_cafb :=_fcbad [_aeec ];_bgacf :=_deef [_cafb ];_fcfdef :=_bbbe (_bgacf ,_dbea ,_daffa );
|
||
for _cccb ,_edfa :=range _fcfdef {var _bbgc float64 ;_dcaa :=[]*list {};_gcc :=_edfa ._ffbb ;_gece :=_dbea ;if _cccb < len (_fcfdef )-1{_gece =_fcfdef [_cccb +1]._ffbb ;};if _bgfg < len (_fcbad ){_dcaa =_gafc (_aace ,_deef ,_fcbad ,_bgfg ,_gcc ,_gece );
|
||
};_bbgc =_gece ;if len (_dcaa )> 0{_defe :=_dcaa [0];if len (_defe ._dgef )> 0{_bbgc =_defe ._dgef [0]._ffbb ;};};_ffdg :=[]*textLine {_edfa };_acdf :=_gbbd (_edfa ,_aace ,_fcbad ,_gcc ,_bbgc );_ffdg =append (_ffdg ,_acdf ...);_ggfae :=_gfea (_ffdg ,"\u0062\u0075\u006c\u006c\u0065\u0074",_dcaa );
|
||
_ggfae ._feed =_abfbf (_ffdg ,"");_bcebd =append (_bcebd ,_ggfae );};return _bcebd ;};func (_febb rulingList )snapToGroups ()rulingList {_eadfd ,_cgdfd :=_febb .vertsHorzs ();if len (_eadfd )> 0{_eadfd =_eadfd .snapToGroupsDirection ();};if len (_cgdfd )> 0{_cgdfd =_cgdfd .snapToGroupsDirection ();
|
||
};_bgbe :=append (_eadfd ,_cgdfd ...);_bgbe .log ("\u0073\u006e\u0061p\u0054\u006f\u0047\u0072\u006f\u0075\u0070\u0073");return _bgbe ;};var _aaefc string ="\u0028\u003f\u0069\u0029\u005e\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028?\u003a\u0044\u007cM\u0029\u007c\u0044\u003f\u0043{\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028\u003f\u003a\u004c\u007c\u0043\u0029\u007cL\u003f\u0058\u007b\u0030\u002c\u0033}\u0029\u0028\u0049\u0028\u003f\u003a\u0056\u007c\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u005c\u0029\u007c\u005c\u002e\u0029\u007c\u005e\u005c\u0028\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028\u003f\u003aD\u007cM\u0029\u007c\u0044\u003f\u0043\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028?\u003a\u004c\u007c\u0043\u0029\u007c\u004c?\u0058\u007b0\u002c\u0033\u007d\u0029(\u0049\u0028\u003f\u003a\u0056|\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u005c\u0029";
|
||
func (_ega *imageExtractContext )processOperand (_dg *_de .ContentStreamOperation ,_aeb _de .GraphicsState ,_abf *_bg .PdfPageResources )error {if _dg .Operand =="\u0042\u0049"&&len (_dg .Params )==1{_eeg ,_ccg :=_dg .Params [0].(*_de .ContentStreamInlineImage );
|
||
if !_ccg {return nil ;};if _dge ,_gdf :=_ga .GetBoolVal (_eeg .ImageMask );_gdf {if _dge &&!_ega ._caa .IncludeInlineStencilMasks {return nil ;};};return _ega .extractInlineImage (_eeg ,_aeb ,_abf );}else if _dg .Operand =="\u0044\u006f"&&len (_dg .Params )==1{_edd ,_dcb :=_ga .GetName (_dg .Params [0]);
|
||
if !_dcb {_bb .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _da ;};_ ,_cfdc :=_abf .GetXObjectByName (*_edd );switch _cfdc {case _bg .XObjectTypeImage :return _ega .extractXObjectImage (_edd ,_aeb ,_abf );case _bg .XObjectTypeForm :return _ega .extractFormImages (_edd ,_aeb ,_abf );
|
||
};}else if _ega ._fca &&(_dg .Operand =="\u0073\u0063\u006e"||_dg .Operand =="\u0053\u0043\u004e")&&len (_dg .Params )==1{_dbb ,_defc :=_ga .GetName (_dg .Params [0]);if !_defc {_bb .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");
|
||
return _da ;};_ccge ,_defc :=_abf .GetPatternByName (*_dbb );if !_defc {_bb .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0050\u0061\u0074\u0074\u0065\u0072n\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075\u006e\u0064");return nil ;};if _ccge .IsTiling (){_ded :=_ccge .GetAsTilingPattern ();
|
||
_aaa ,_fg :=_ded .GetContentStream ();if _fg !=nil {return _fg ;};_fg =_ega .extractContentStreamImages (string (_aaa ),_ded .Resources );if _fg !=nil {return _fg ;};};}else if (_dg .Operand =="\u0063\u0073"||_dg .Operand =="\u0043\u0053")&&len (_dg .Params )>=1{_ega ._fca =_dg .Params [0].String ()=="\u0050a\u0074\u0074\u0065\u0072\u006e";
|
||
};return nil ;};type textMark struct{_bg .PdfRectangle ;_eabg int ;_gcgb string ;_ebaa string ;_gcded *_bg .PdfFont ;_cffb float64 ;_gaac float64 ;_ddfg _cb .Matrix ;_ebcb _cb .Point ;_cedgd _bg .PdfRectangle ;_dfbf _g .Color ;_ffdf _g .Color ;_eggb _ga .PdfObject ;
|
||
_fcbgg []string ;Tw float64 ;Th float64 ;_egaa int ;_fcadf int ;};func (_ddaae *textLine )endsInHyphen ()bool {_aecfc :=_ddaae ._ggdd [len (_ddaae ._ggdd )-1];_bcbfc :=_aecfc ._bdbgb ;_aafff ,_cdea :=_ba .DecodeLastRuneInString (_bcbfc );if _cdea <=0||!_e .Is (_e .Hyphen ,_aafff ){return false ;
|
||
};if _aecfc ._bbdga &&_eaaad (_bcbfc ){return true ;};return _eaaad (_ddaae .text ());};func (_aec *Extractor )extractPageText (_ccc string ,_fd *_bg .PdfPageResources ,_cde _cb .Matrix ,_dga int )(*PageText ,int ,int ,error ){_bb .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_dga );
|
||
_gefd :=&PageText {_abfb :_aec ._ea ,_abba :_aec ._ed ,_ccff :_aec ._eee };_agce :=_bda (_aec ._ea );var _dbcg stateStack ;_ceg :=_aac (_aec ,_fd ,_de .GraphicsState {},&_agce ,&_dbcg );_fag :=shapesState {_bfac :_cde ,_dea :_cb .IdentityMatrix (),_fdc :_ceg };
|
||
var _gce bool ;_cec :=-1;if _dga > _dde {_eceb :=_b .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_dga ,_eceb );
|
||
return _gefd ,_agce ._ggaa ,_agce ._gba ,_eceb ;};_cbe :=_de .NewContentStreamParser (_ccc );_fce ,_eaf :=_cbe .Parse ();if _eaf !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eaf );
|
||
return _gefd ,_agce ._ggaa ,_agce ._gba ,_eaf ;};_gefd ._fad =_fce ;_eaa :=_de .NewContentStreamProcessor (*_fce );_eaa .AddHandler (_de .HandlerConditionEnumAllOperands ,"",func (_bbcc *_de .ContentStreamOperation ,_dedc _de .GraphicsState ,_eeb *_bg .PdfPageResources )error {_ggd :=_bbcc .Operand ;
|
||
if _gfda {_bb .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_bbcc );};switch _ggd {case "\u0071":if _ggbc {_bb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_fag ._dea );};_dbcg .push (&_agce );case "\u0051":if !_dbcg .empty (){_agce =*_dbcg .pop ();
|
||
};_fag ._dea =_dedc .CTM ;if _ggbc {_bb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_fag ._dea );};case "\u0042\u0044\u0043":_fcc ,_gbc :=_ga .GetDict (_bbcc .Params [1]);if !_gbc {_bb .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0042D\u0043\u0020\u006f\u0070\u003d\u0025\u0073 \u0047\u0065\u0074\u0044\u0069\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_bbcc );
|
||
return _eaf ;};_ebg :=_fcc .Get ("\u004d\u0043\u0049\u0044");if _ebg !=nil {_ad ,_egac :=_ga .GetIntVal (_ebg );if !_egac {_bb .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0042\u0044C\u0020\u006f\u0070=\u0025\u0073\u002e\u0020\u0042\u0061\u0064\u0020\u006eum\u0065\u0072\u0069c\u0061\u006c \u006f\u0062\u006a\u0065\u0063\u0074.\u0020\u006f=\u0025\u0073",_bbcc ,_ebg );
|
||
};_cec =_ad ;}else {_cec =-1;};case "\u0045\u004d\u0043":_cec =-1;case "\u0042\u0054":if _gce {_bb .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
||
_gefd ._bae =append (_gefd ._bae ,_ceg ._dfge ...);};_gce =true ;_geg :=_dedc ;_geg .CTM =_cde .Mult (_geg .CTM );_ceg =_aac (_aec ,_eeb ,_geg ,&_agce ,&_dbcg );_fag ._fdc =_ceg ;case "\u0045\u0054":if !_gce {_bb .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
||
};_gce =false ;_gefd ._bae =append (_gefd ._bae ,_ceg ._dfge ...);_ceg .reset ();case "\u0054\u002a":_ceg .nextLine ();case "\u0054\u0064":if _afg ,_cdga :=_ceg .checkOp (_bbcc ,2,true );!_afg {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cdga );
|
||
return _cdga ;};_cfg ,_cae ,_bgec :=_eecbf (_bbcc .Params );if _bgec !=nil {return _bgec ;};_ceg .moveText (_cfg ,_cae );case "\u0054\u0044":if _dbf ,_bfed :=_ceg .checkOp (_bbcc ,2,true );!_dbf {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bfed );
|
||
return _bfed ;};_ggb ,_gedf ,_cad :=_eecbf (_bbcc .Params );if _cad !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cad );return _cad ;};_ceg .moveTextSetLeading (_ggb ,_gedf );case "\u0054\u006a":if _bage ,_fga :=_ceg .checkOp (_bbcc ,1,true );
|
||
!_bage {_bb .Log .Debug ("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076",_bbcc ,_fga );return _fga ;};_fgbg :=_ga .TraceToDirectObject (_bbcc .Params [0]);_faa ,_fec :=_ga .GetStringBytes (_fgbg );
|
||
if !_fec {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a\u0020T\u006a\u0020o\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074S\u0074\u0072\u0069\u006e\u0067\u0042\u0079\u0074\u0065\u0073\u0020\u0066a\u0069\u006c\u0065\u0064",_bbcc );return _ga .ErrTypeError ;
|
||
};return _ceg .showText (_fgbg ,_faa ,_cec );case "\u0054\u004a":if _ccd ,_eaac :=_ceg .checkOp (_bbcc ,1,true );!_ccd {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eaac );return _eaac ;};_dged ,_bbdb :=_ga .GetArray (_bbcc .Params [0]);
|
||
if !_bbdb {_bb .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0054\u004a\u0020\u006f\u0070\u003d\u0025s\u0020G\u0065t\u0041r\u0072\u0061\u0079\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_bbcc );return _eaf ;};return _ceg .showTextAdjusted (_dged ,_cec );
|
||
case "\u0027":if _agcd ,_efa :=_ceg .checkOp (_bbcc ,1,true );!_agcd {_bb .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0027\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_efa );return _efa ;};_gcea :=_ga .TraceToDirectObject (_bbcc .Params [0]);_abe ,_aab :=_ga .GetStringBytes (_gcea );
|
||
if !_aab {_bb .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020'\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_bbcc );return _ga .ErrTypeError ;};_ceg .nextLine ();return _ceg .showText (_gcea ,_abe ,_cec );
|
||
case "\u0022":if _bef ,_ebe :=_ceg .checkOp (_bbcc ,3,true );!_bef {_bb .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0022\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ebe );return _ebe ;};_aae ,_egf ,_afe :=_eecbf (_bbcc .Params [:2]);if _afe !=nil {return _afe ;
|
||
};_cgecf :=_ga .TraceToDirectObject (_bbcc .Params [2]);_dgf ,_bcd :=_ga .GetStringBytes (_cgecf );if !_bcd {_bb .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020\"\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_bbcc );
|
||
return _ga .ErrTypeError ;};_ceg .setCharSpacing (_aae );_ceg .setWordSpacing (_egf );_ceg .nextLine ();return _ceg .showText (_cgecf ,_dgf ,_cec );case "\u0054\u004c":_gega ,_cdbfa :=_edc (_bbcc );if _cdbfa !=nil {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004c\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cdbfa );
|
||
return _cdbfa ;};_ceg .setTextLeading (_gega );case "\u0054\u0063":_fff ,_fbe :=_edc (_bbcc );if _fbe !=nil {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0063\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fbe );return _fbe ;};_ceg .setCharSpacing (_fff );
|
||
case "\u0054\u0066":if _bfb ,_eefc :=_ceg .checkOp (_bbcc ,2,true );!_bfb {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0066\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eefc );return _eefc ;};_edab ,_cff :=_ga .GetNameVal (_bbcc .Params [0]);
|
||
if !_cff {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u004ea\u006d\u0065\u0056\u0061\u006c\u0020\u0066a\u0069\u006c\u0065\u0064",_bbcc );return _ga .ErrTypeError ;};_bgeb ,_egbg :=_ga .GetNumberAsFloat (_bbcc .Params [1]);
|
||
if !_cff {_bb .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u0046\u006c\u006f\u0061\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065d\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bbcc ,_egbg );
|
||
return _egbg ;};_egbg =_ceg .setFont (_edab ,_bgeb );_ceg ._cgeb =_egd .Is (_egbg ,_ga .ErrNotSupported );if _egbg !=nil &&!_ceg ._cgeb {return _egbg ;};case "\u0054\u006d":if _fcdd ,_fef :=_ceg .checkOp (_bbcc ,6,true );!_fcdd {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u006d\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fef );
|
||
return _fef ;};_eafg ,_acf :=_ga .GetNumbersAsFloat (_bbcc .Params );if _acf !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_acf );return _acf ;};_ceg .setTextMatrix (_eafg );case "\u0054\u0072":if _gbb ,_bccc :=_ceg .checkOp (_bbcc ,1,true );
|
||
!_gbb {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0072\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bccc );return _bccc ;};_bdc ,_daa :=_ga .GetIntVal (_bbcc .Params [0]);if !_daa {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0072\u0020\u006f\u0070\u003d\u0025\u0073 \u0047e\u0074\u0049\u006e\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_bbcc );
|
||
return _ga .ErrTypeError ;};_ceg .setTextRenderMode (_bdc );case "\u0054\u0073":if _cfe ,_abc :=_ceg .checkOp (_bbcc ,1,true );!_cfe {_bb .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0073\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_abc );return _abc ;
|
||
};_fcb ,_bbda :=_ga .GetNumberAsFloat (_bbcc .Params [0]);if _bbda !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bbda );return _bbda ;};_ceg .setTextRise (_fcb );case "\u0054\u0077":if _cgg ,_gf :=_ceg .checkOp (_bbcc ,1,true );
|
||
!_cgg {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gf );return _gf ;};_ada ,_cdeg :=_ga .GetNumberAsFloat (_bbcc .Params [0]);if _cdeg !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cdeg );
|
||
return _cdeg ;};_ceg .setWordSpacing (_ada );case "\u0054\u007a":if _fge ,_ccb :=_ceg .checkOp (_bbcc ,1,true );!_fge {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ccb );return _ccb ;};_dfd ,_cab :=_ga .GetNumberAsFloat (_bbcc .Params [0]);
|
||
if _cab !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cab );return _cab ;};_ceg .setHorizScaling (_dfd );case "\u0063\u006d":_fag ._dea =_dedc .CTM ;if _fag ._dea .Singular (){_deg :=_cb .IdentityMatrix ().Translate (_fag ._dea .Translation ());
|
||
_bb .Log .Debug ("S\u0069n\u0067\u0075\u006c\u0061\u0072\u0020\u0063\u0074m\u003d\u0025\u0073\u2192%s",_fag ._dea ,_deg );_fag ._dea =_deg ;};if _ggbc {_bb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_fag ._dea );};case "\u006d":if len (_bbcc .Params )!=2{_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006d\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_eb );
|
||
return nil ;};_adaf ,_acb :=_ga .GetNumbersAsFloat (_bbcc .Params );if _acb !=nil {return _acb ;};_fag .moveTo (_adaf [0],_adaf [1]);case "\u006c":if len (_bbcc .Params )!=2{_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006c\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_eb );
|
||
return nil ;};_aafb ,_ggbe :=_ga .GetNumbersAsFloat (_bbcc .Params );if _ggbe !=nil {return _ggbe ;};_fag .lineTo (_aafb [0],_aafb [1]);case "\u0063":if len (_bbcc .Params )!=6{return _eb ;};_caeb ,_fde :=_ga .GetNumbersAsFloat (_bbcc .Params );if _fde !=nil {return _fde ;
|
||
};_bb .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_caeb );_fag .cubicTo (_caeb [0],_caeb [1],_caeb [2],_caeb [3],_caeb [4],_caeb [5]);case "\u0076","\u0079":if len (_bbcc .Params )!=4{return _eb ;
|
||
};_afd ,_dcg :=_ga .GetNumbersAsFloat (_bbcc .Params );if _dcg !=nil {return _dcg ;};_bb .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_afd );_fag .quadraticTo (_afd [0],_afd [1],_afd [2],_afd [3]);
|
||
case "\u0068":_fag .closePath ();case "\u0072\u0065":if len (_bbcc .Params )!=4{return _eb ;};_edag ,_deb :=_ga .GetNumbersAsFloat (_bbcc .Params );if _deb !=nil {return _deb ;};_fag .drawRectangle (_edag [0],_edag [1],_edag [2],_edag [3]);_fag .closePath ();
|
||
case "\u0053":_fag .stroke (&_gefd ._fcbc );_fag .clearPath ();case "\u0073":_fag .closePath ();_fag .stroke (&_gefd ._fcbc );_fag .clearPath ();case "\u0046":_fag .fill (&_gefd ._eeec );_fag .clearPath ();case "\u0066","\u0066\u002a":_fag .closePath ();
|
||
_fag .fill (&_gefd ._eeec );_fag .clearPath ();case "\u0042","\u0042\u002a":_fag .fill (&_gefd ._eeec );_fag .stroke (&_gefd ._fcbc );_fag .clearPath ();case "\u0062","\u0062\u002a":_fag .closePath ();_fag .fill (&_gefd ._eeec );_fag .stroke (&_gefd ._fcbc );
|
||
_fag .clearPath ();case "\u006e":_fag .clearPath ();case "\u0044\u006f":if len (_bbcc .Params )==0{_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0078\u0070\u0065\u0063\u0074\u0065\u0064\u0020\u0058\u004fbj\u0065c\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006f\u0070\u0065\u0072\u0061n\u0064\u0020\u0066\u006f\u0072\u0020\u0044\u006f\u0020\u006f\u0070\u0065\u0072\u0061\u0074\u006f\u0072.\u0020\u0047\u006f\u0074\u0020\u0025\u002b\u0076\u002e",_bbcc .Params );
|
||
return _ga .ErrRangeError ;};_bcec ,_gbcg :=_ga .GetName (_bbcc .Params [0]);if !_gbcg {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u0044\u006f\u0020\u006f\u0070e\u0072a\u0074\u006f\u0072\u0020\u0058\u004f\u0062\u006a\u0065\u0063\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006fp\u0065\u0072\u0061\u006e\u0064\u003a\u0020\u0025\u002b\u0076\u002e",_bbcc .Params [0]);
|
||
return _ga .ErrTypeError ;};_ ,_cfeb :=_eeb .GetXObjectByName (*_bcec );if _cfeb !=_bg .XObjectTypeForm {break ;};_cce ,_gbcg :=_aec ._ce [_bcec .String ()];if !_gbcg {_cedb ,_bbea :=_eeb .GetXObjectFormByName (*_bcec );if _bbea !=nil {_bb .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_bbea );
|
||
return _bbea ;};_gga ,_bbea :=_cedb .GetContentStream ();if _bbea !=nil {_bb .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_bbea );return _bbea ;};_fda :=_cedb .Resources ;if _fda ==nil {_fda =_eeb ;};_bfcc :=_dedc .CTM ;if _age ,_bdce :=_ga .GetArray (_cedb .Matrix );
|
||
_bdce {_bga ,_dbd :=_age .GetAsFloat64Slice ();if _dbd !=nil {return _dbd ;};if len (_bga )!=6{return _eb ;};_agbe :=_cb .NewMatrix (_bga [0],_bga [1],_bga [2],_bga [3],_bga [4],_bga [5]);_bfcc =_dedc .CTM .Mult (_agbe );};_fdg ,_bcg ,_aba ,_bbea :=_aec .extractPageText (string (_gga ),_fda ,_cde .Mult (_bfcc ),_dga +1);
|
||
if _bbea !=nil {_bb .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_bbea );return _bbea ;};_cce =textResult {*_fdg ,_bcg ,_aba };_aec ._ce [_bcec .String ()]=_cce ;};_fag ._dea =_dedc .CTM ;if _ggbc {_bb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_fag ._dea );
|
||
};_gefd ._bae =append (_gefd ._bae ,_cce ._bgd ._bae ...);_gefd ._fcbc =append (_gefd ._fcbc ,_cce ._bgd ._fcbc ...);_gefd ._eeec =append (_gefd ._eeec ,_cce ._bgd ._eeec ...);_agce ._ggaa +=_cce ._eec ;_agce ._gba +=_cce ._fbbd ;case "\u0072\u0067","\u0067","\u006b","\u0063\u0073","\u0073\u0063","\u0073\u0063\u006e":_ceg ._fgd .ColorspaceNonStroking =_dedc .ColorspaceNonStroking ;
|
||
_ceg ._fgd .ColorNonStroking =_dedc .ColorNonStroking ;case "\u0052\u0047","\u0047","\u004b","\u0043\u0053","\u0053\u0043","\u0053\u0043\u004e":_ceg ._fgd .ColorspaceStroking =_dedc .ColorspaceStroking ;_ceg ._fgd .ColorStroking =_dedc .ColorStroking ;
|
||
};return nil ;});_eaf =_eaa .Process (_fd );return _gefd ,_agce ._ggaa ,_agce ._gba ,_eaf ;};func (_dcbe *textPara )writeText (_cdfg _c .Writer ){if _dcbe ._gbgg ==nil {_dcbe .writeCellText (_cdfg );return ;};for _deca :=0;_deca < _dcbe ._gbgg ._gaaa ;
|
||
_deca ++{for _agad :=0;_agad < _dcbe ._gbgg ._afaff ;_agad ++{_ebeb :=_dcbe ._gbgg .get (_agad ,_deca );if _ebeb ==nil {_cdfg .Write ([]byte ("\u0009"));}else {_ebeb .writeCellText (_cdfg );};_cdfg .Write ([]byte ("\u0020"));};if _deca < _dcbe ._gbgg ._gaaa -1{_cdfg .Write ([]byte ("\u000a"));
|
||
};};};func (_cgad paraList )findGridTables (_dccfb []gridTiling )[]*textTable {if _aeff {_bb .Log .Info ("\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073",len (_cgad ));
|
||
for _defag ,_decdg :=range _cgad {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_defag ,_decdg );};};var _dfcfc []*textTable ;for _cfgfe ,_fffcb :=range _dccfb {_gffe ,_caggc :=_cgad .findTableGrid (_fffcb );if _gffe !=nil {_gffe .log (_eg .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_cfgfe ));
|
||
_dfcfc =append (_dfcfc ,_gffe );_gffe .markCells ();};for _acaba :=range _caggc {_acaba ._ggcbf =true ;};};if _aeff {_bb .Log .Info ("\u0066i\u006e\u0064\u0047\u0072i\u0064\u0054\u0061\u0062\u006ce\u0073:\u0020%\u0064\u0020\u0074\u0061\u0062\u006c\u0065s",len (_dfcfc ));
|
||
};return _dfcfc ;};func (_bcfb *textObject )setWordSpacing (_dba float64 ){if _bcfb ==nil {return ;};_bcfb ._aed ._dedcd =_dba ;};func _ebbc (_gdbd *wordBag ,_ddgcb *textWord ,_gfdee float64 )bool {return _gdbd .Urx <=_ddgcb .Llx &&_ddgcb .Llx < _gdbd .Urx +_gfdee ;
|
||
};func _bggg (_bffa ,_caec _cb .Point )rulingKind {_fdgca :=_ef .Abs (_bffa .X -_caec .X );_addb :=_ef .Abs (_bffa .Y -_caec .Y );return _abaa (_fdgca ,_addb ,_dfbd );};
|
||
|
||
// TableCell is a cell in a TextTable.
|
||
type TableCell struct{_bg .PdfRectangle ;
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Marks returns the TextMarks corresponding to the text in Text.
|
||
Marks TextMarkArray ;};
|
||
|
||
// TextMark represents extracted text on a page with information regarding both textual content,
|
||
// formatting (font and size) and positioning.
|
||
// It is the smallest unit of text on a PDF page, typically a single character.
|
||
//
|
||
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
||
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
||
// `bbox` of substring `term` in `text`.
|
||
//
|
||
// ex, _ := New(page)
|
||
// // handle errors
|
||
// pageText, _, _, err := ex.ExtractPageText()
|
||
// // handle errors
|
||
// text := pageText.Text()
|
||
// textMarks := pageText.Marks()
|
||
//
|
||
// start := strings.Index(text, term)
|
||
// end := start + len(term)
|
||
// spanMarks, err := textMarks.RangeOffset(start, end)
|
||
// // handle errors
|
||
// bbox, ok := spanMarks.BBox()
|
||
// // handle errors
|
||
type TextMark struct{
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Original is the text in the PDF. It has not been decoded like `Text`.
|
||
Original string ;
|
||
|
||
// BBox is the bounding box of the text.
|
||
BBox _bg .PdfRectangle ;
|
||
|
||
// Font is the font the text was drawn with.
|
||
Font *_bg .PdfFont ;
|
||
|
||
// FontSize is the font size the text was drawn with.
|
||
FontSize float64 ;
|
||
|
||
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
||
// text, textMarks := pageText.Text(), pageText.Marks()
|
||
// marks := textMarks.Elements()
|
||
// then marks[i].Offset is the offset of marks[i].Text in text.
|
||
Offset int ;
|
||
|
||
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
||
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
||
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
||
Meta bool ;
|
||
|
||
// FillColor is the fill color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
FillColor _g .Color ;
|
||
|
||
// StrokeColor is the stroke color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
StrokeColor _g .Color ;
|
||
|
||
// Orientation is the text orientation
|
||
Orientation int ;
|
||
|
||
// DirectObject is the underlying PdfObject (Text Object) that represents the visible texts. This is introduced to get
|
||
// a simple access to the TextObject in case editing or replacment of some text is needed. E.g during redaction.
|
||
DirectObject _ga .PdfObject ;
|
||
|
||
// ObjString is a decoded string operand of a text-showing operator. It has the same value as `Text` attribute except
|
||
// when many glyphs are represented with the same Text Object that contains multiple length string operand in which case
|
||
// ObjString spans more than one character string that falls in different TextMark objects.
|
||
ObjString []string ;Tw float64 ;Th float64 ;Tc float64 ;Index int ;_cda bool ;_gfc *TextTable ;};func (_cfbf *textLine )pullWord (_fcab *wordBag ,_fabe *textWord ,_abed int ){_cfbf .appendWord (_fabe );_fcab .removeWord (_fabe ,_abed );};func (_bcgb *structTreeRoot )buildList (_gafd map[int ][]*textLine ,_fcadd _ga .PdfObject )[]*list {if _bcgb ==nil {_bb .Log .Debug ("\u0062\u0075\u0069\u006c\u0064\u004c\u0069\u0073\u0074\u003a\u0020t\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0069\u0073 \u006e\u0069\u006c");
|
||
return nil ;};var _ebgg *structElement ;_cdfee :=[]structElement {};if len (_bcgb ._aaef )==1{_daff :=_bcgb ._aaef [0]._ddag ;if _daff =="\u0044\u006f\u0063\u0075\u006d\u0065\u006e\u0074"||_daff =="\u0053\u0065\u0063\u0074"||_daff =="\u0050\u0061\u0072\u0074"||_daff =="\u0044\u0069\u0076"||_daff =="\u0041\u0072\u0074"{_ebgg =&_bcgb ._aaef [0];
|
||
};}else {_ebgg =&structElement {_acdgb :_bcgb ._aaef ,_ddag :_bcgb ._cfbff };};if _ebgg ==nil {_bb .Log .Debug ("\u0062\u0075\u0069\u006cd\u004c\u0069\u0073\u0074\u003a\u0020\u0074\u006f\u0070\u0045l\u0065m\u0065\u006e\u0074\u0020\u0069\u0073\u0020n\u0069\u006c");
|
||
return nil ;};for _ ,_acce :=range _ebgg ._acdgb {if _acce ._ddag =="\u004c"{_cdfee =append (_cdfee ,_acce );}else if _acce ._ddag =="\u0054\u0061\u0062l\u0065"{_cdgaf :=_cgfc (_acce );_cdfee =append (_cdfee ,_cdgaf ...);};};_bdbg :=_gcba (_cdfee ,_gafd ,_fcadd );
|
||
var _edec []*list ;for _ ,_cfgcg :=range _bdbg {_gcdfc :=_abcdd (_cfgcg );_edec =append (_edec ,_gcdfc ...);};return _edec ;};func (_eebc *shapesState )lineTo (_dffg ,_eacc float64 ){if _ggbc {_bb .Log .Info ("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_dffg ,_eacc ,_eebc .devicePoint (_dffg ,_eacc ));
|
||
};_eebc .addPoint (_dffg ,_eacc );};func (_cbbg *textTable )isExportable ()bool {if _cbbg ._cfabb {return true ;};_acbff :=func (_dfegb int )bool {_ddff :=_cbbg .get (0,_dfegb );if _ddff ==nil {return false ;};_bfbcf :=_ddff .text ();_gebdb :=_ba .RuneCountInString (_bfbcf );
|
||
_dggg :=_ffbf .MatchString (_bfbcf );return _gebdb <=1||_dggg ;};for _bgbc :=0;_bgbc < _cbbg ._gaaa ;_bgbc ++{if !_acbff (_bgbc ){return true ;};};return false ;};func _gdcee (_cccdc ,_aagc _cb .Point )bool {_ffcaa :=_ef .Abs (_cccdc .X -_aagc .X );_aafg :=_ef .Abs (_cccdc .Y -_aagc .Y );
|
||
return _eacaf (_aafg ,_ffcaa );};type markKind int ;
|
||
|
||
// String returns a description of `b`.
|
||
func (_ccdda *wordBag )String ()string {var _fdgae []string ;for _ ,_dcge :=range _ccdda .depthIndexes (){_ace :=_ccdda ._dceb [_dcge ];for _ ,_dcdb :=range _ace {_fdgae =append (_fdgae ,_dcdb ._bdbgb );};};return _eg .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_ccdda .PdfRectangle ,_ccdda ._ddgcf ,len (_fdgae ),_fdgae );
|
||
};
|
||
|
||
// String returns a description of `tm`.
|
||
func (_befg *textMark )String ()string {return _eg .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_befg .PdfRectangle ,_befg ._cffb ,_befg ._gcgb );};func (_gddd *textTable )toTextTable ()TextTable {if _aeff {_bb .Log .Info ("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064",_gddd ._afaff ,_gddd ._gaaa );
|
||
};_aaeea :=make ([][]TableCell ,_gddd ._gaaa );for _fgabd :=0;_fgabd < _gddd ._gaaa ;_fgabd ++{_aaeea [_fgabd ]=make ([]TableCell ,_gddd ._afaff );for _gggg :=0;_gggg < _gddd ._afaff ;_gggg ++{_begd :=_gddd .get (_gggg ,_fgabd );if _begd ==nil {continue ;
|
||
};if _aeff {_eg .Printf ("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_gggg ,_fgabd ,_begd );};_aaeea [_fgabd ][_gggg ].Text =_begd .text ();_dbacc :=0;_aaeea [_fgabd ][_gggg ].Marks ._dafa =_begd .toTextMarks (&_dbacc );};};_abcae :=TextTable {W :_gddd ._afaff ,H :_gddd ._gaaa ,Cells :_aaeea };
|
||
_abcae .PdfRectangle =_gddd .bbox ();return _abcae ;};
|
||
|
||
// String returns a human readable description of `s`.
|
||
func (_edbea intSet )String ()string {var _gfbeg []int ;for _ggag :=range _edbea {if _edbea .has (_ggag ){_gfbeg =append (_gfbeg ,_ggag );};};_ff .Ints (_gfbeg );return _eg .Sprintf ("\u0025\u002b\u0076",_gfbeg );};func _bdad (_ccdga _ga .PdfObject ,_fgbdd _g .Color )(_be .Image ,error ){_ecbb ,_bdecfb :=_ga .GetStream (_ccdga );
|
||
if !_bdecfb {return nil ,nil ;};_egaeb ,_fbfgg :=_bg .NewXObjectImageFromStream (_ecbb );if _fbfgg !=nil {return nil ,_fbfgg ;};_bafge ,_fbfgg :=_egaeb .ToImage ();if _fbfgg !=nil {return nil ,_fbfgg ;};return _fbcdb (_bafge ,_fgbdd ),nil ;};func _eaga (_dfda []TextMark ,_ddaf *int )[]TextMark {_bdbgc :=_dfda [len (_dfda )-1];
|
||
_gedfg :=[]rune (_bdbgc .Text );if len (_gedfg )==1{_dfda =_dfda [:len (_dfda )-1];_bdbc :=_dfda [len (_dfda )-1];*_ddaf =_bdbc .Offset +len (_bdbc .Text );}else {_bbbb :=_aaaa (_bdbgc .Text );*_ddaf +=len (_bbbb )-len (_bdbgc .Text );_bdbgc .Text =_bbbb ;
|
||
};return _dfda ;};func (_gbbf *textObject )nextLine (){_gbbf .moveLP (0,-_gbbf ._aed ._ddba )};func _acea (_fccc *wordBag ,_cgfa *textWord ,_eaca float64 )bool {return _cgfa .Llx < _fccc .Urx +_eaca &&_fccc .Llx -_eaca < _cgfa .Urx ;};func (_gadc *ruling )encloses (_daea ,_adfbg float64 )bool {return _gadc ._bafb -_adca <=_daea &&_adfbg <=_gadc ._cbdc +_adca ;
|
||
};func (_baccg *ruling )intersects (_fbab *ruling )bool {_gdgd :=(_baccg ._ffdb ==_gfbd &&_fbab ._ffdb ==_fcec )||(_fbab ._ffdb ==_gfbd &&_baccg ._ffdb ==_fcec );_dcda :=func (_cfdd ,_dccb *ruling )bool {return _cfdd ._bafb -_adca <=_dccb ._ecdge &&_dccb ._ecdge <=_cfdd ._cbdc +_adca ;
|
||
};_gbdd :=_dcda (_baccg ,_fbab );_gafga :=_dcda (_fbab ,_baccg );if _bffg {_eg .Printf ("\u0020\u0020\u0020\u0020\u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006fr\u0074\u0068\u006f\u0067\u006f\u006e\u0061l\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u2192\u0020\u0025\u0074\u000a"+"\u0020\u0020\u0020 \u0020\u0020\u0020\u0076\u003d\u0025\u0073\u000a"+" \u0020\u0020\u0020\u0020\u0020\u0077\u003d\u0025\u0073\u000a",_gdgd ,_gbdd ,_gafga ,_gdgd &&_gbdd &&_gafga ,_baccg ,_fbab );
|
||
};return _gdgd &&_gbdd &&_gafga ;};func (_aaeeab *textWord )toTextMarks (_daba *int )[]TextMark {var _eecg []TextMark ;for _ ,_cfad :=range _aaeeab ._dbff {_eecg =_dffc (_eecg ,_daba ,_cfad .ToTextMark ());};return _eecg ;};func (_cacdg *textTable )log (_dadg string ){if !_aeff {return ;
|
||
};_bb .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",_dadg ,_cacdg ._afaff ,_cacdg ._gaaa ,_cacdg ._cfabb ,_cacdg .PdfRectangle );
|
||
for _dcac :=0;_dcac < _cacdg ._gaaa ;_dcac ++{for _dbaed :=0;_dbaed < _cacdg ._afaff ;_dbaed ++{_agecd :=_cacdg .get (_dbaed ,_dcac );if _agecd ==nil {continue ;};_eg .Printf ("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",_dbaed ,_dcac ,_agecd .PdfRectangle ,_egde (_agecd .text (),50),_ba .RuneCountInString (_agecd .text ()));
|
||
};};};func (_eafb *textObject )setTextRise (_ccbe float64 ){if _eafb ==nil {return ;};_eafb ._aed ._adf =_ccbe ;};func (_cged *textObject )getCurrentFont ()*_bg .PdfFont {_gdd :=_cged ._aed ._ecd ;if _gdd ==nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");
|
||
return _bg .DefaultFont ();};return _gdd ;};type event struct{_bbcgf float64 ;_cefa bool ;_bdcff int ;};
|
||
|
||
// Options holds the extractor options.
type Options struct {
	// DisableDocumentTags specifies whether to use the document tags during
	// list extraction.
	DisableDocumentTags bool

	// ApplyCropBox, when set to `true`, extracts page text based on the page
	// cropbox.
	ApplyCropBox bool

	// UseSimplerExtractionProcess skips topological text ordering and table
	// processing.
	//
	// NOTE: The extra processing is normally beneficial, but it can also cause
	// problems when it does not work, so this flag lets the user control it.
	//
	// Skipping these extraction steps also reduces processing time.
	UseSimplerExtractionProcess bool
}
|
||
|
||
// Append appends `mark` to the mark array.
|
||
func (_gafg *TextMarkArray )Append (mark TextMark ){_gafg ._dafa =append (_gafg ._dafa ,mark )};func (_eeece *subpath )clear (){*_eeece =subpath {}};func (_dacc *textTable )markCells (){for _defg :=0;_defg < _dacc ._gaaa ;_defg ++{for _cdfa :=0;_cdfa < _dacc ._afaff ;
|
||
_cdfa ++{_bbccg :=_dacc .get (_cdfa ,_defg );if _bbccg !=nil {_bbccg ._ggcbf =true ;};};};};func _abfbf (_gdfee []*textLine ,_cfgeb string )string {var _dgbf _f .Builder ;_ddee :=0.0;for _ccffe ,_bebc :=range _gdfee {_fabf :=_bebc .text ();_bafg :=_bebc ._ffbb ;
|
||
if _ccffe < len (_gdfee )-1{_ddee =_gdfee [_ccffe +1]._ffbb ;}else {_ddee =0.0;};_dgbf .WriteString (_cfgeb );_dgbf .WriteString (_fabf );if _ddee !=_bafg {_dgbf .WriteString ("\u000a");}else {_dgbf .WriteString ("\u0020");};};return _dgbf .String ();};
|
||
func _gcbab (_bcfgf ,_aabc _be .Image )_be .Image {_daaa ,_dgac :=_aabc .Bounds ().Size (),_bcfgf .Bounds ().Size ();_dcdfe ,_fggb :=_daaa .X ,_daaa .Y ;if _dgac .X > _dcdfe {_dcdfe =_dgac .X ;};if _dgac .Y > _fggb {_fggb =_dgac .Y ;};_aafaf :=_be .Rect (0,0,_dcdfe ,_fggb );
|
||
if _daaa .X !=_dcdfe ||_daaa .Y !=_fggb {_fagee :=_be .NewRGBA (_aafaf );_gd .BiLinear .Scale (_fagee ,_aafaf ,_bcfgf ,_aabc .Bounds (),_gd .Over ,nil );_aabc =_fagee ;};if _dgac .X !=_dcdfe ||_dgac .Y !=_fggb {_cgdd :=_be .NewRGBA (_aafaf );_gd .BiLinear .Scale (_cgdd ,_aafaf ,_bcfgf ,_bcfgf .Bounds (),_gd .Over ,nil );
|
||
_bcfgf =_cgdd ;};_dcdfg :=_be .NewRGBA (_aafaf );_gd .DrawMask (_dcdfg ,_aafaf ,_bcfgf ,_be .Point {},_aabc ,_be .Point {},_gd .Over );return _dcdfg ;};func _bacd (_edff ,_abcd bounded )float64 {_efda :=_gade (_edff ,_abcd );if !_cadfe (_efda ){return _efda ;
|
||
};return _gffg (_edff ,_abcd );};func (_bdagb *textTable )getComposite (_dbfa ,_eeeff int )(paraList ,_bg .PdfRectangle ){_bgfb ,_bcef :=_bdagb ._bgag [_gbbce (_dbfa ,_eeeff )];if _aeff {_eg .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0067\u0065\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u002d\u003e\u0025\u0073\u000a",_dbfa ,_eeeff ,_bgfb .String ());
|
||
};if !_bcef {return nil ,_bg .PdfRectangle {};};return _bgfb .parasBBox ();};func (_debdb gridTile )complete ()bool {return _debdb .numBorders ()==4};func (_egc *wordBag )applyRemovals (_ddd map[int ]map[*textWord ]struct{}){for _adafe ,_gbfg :=range _ddd {if len (_gbfg )==0{continue ;
|
||
};_accf :=_egc ._dceb [_adafe ];_bcbe :=len (_accf )-len (_gbfg );if _bcbe ==0{delete (_egc ._dceb ,_adafe );continue ;};_dbec :=make ([]*textWord ,_bcbe );_deaf :=0;for _ ,_bdb :=range _accf {if _ ,_aecf :=_gbfg [_bdb ];!_aecf {_dbec [_deaf ]=_bdb ;_deaf ++;
|
||
};};_egc ._dceb [_adafe ]=_dbec ;};};
|
||
|
||
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
||
// It takes into account character encodings in the PDF file, which are decoded by
|
||
// CharcodeBytesToUnicode.
|
||
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
||
func (_cag *Extractor )ExtractText ()(string ,error ){_cbg ,_ ,_ ,_bbcb :=_cag .ExtractTextWithStats ();return _cbg ,_bbcb ;};func (_bcfg *textTable )getDown ()paraList {_dbgg :=make (paraList ,_bcfg ._afaff );for _fcefc :=0;_fcefc < _bcfg ._afaff ;_fcefc ++{_dfeeb :=_bcfg .get (_fcefc ,_bcfg ._gaaa -1)._daag ;
|
||
if _dfeeb .taken (){return nil ;};_dbgg [_fcefc ]=_dfeeb ;};for _aada :=0;_aada < _bcfg ._afaff -1;_aada ++{if _dbgg [_aada ]._gdfgd !=_dbgg [_aada +1]{return nil ;};};return _dbgg ;};func _gcba (_gdac []structElement ,_fgcf map[int ][]*textLine ,_cegbc _ga .PdfObject )[]*list {_bfgbd :=[]*list {};
|
||
for _ ,_fcda :=range _gdac {_ccfg :=_fcda ._acdgb ;_agcag :=int (_fcda ._dfdgf );_fbff :=_fcda ._ddag ;_gdfaec :=[]*textLine {};_ecba :=[]*list {};_eggf :=_fcda ._gcbe ;_dcag ,_fdcgf :=(_eggf .(*_ga .PdfObjectReference ));if !_fdcgf {_bb .Log .Debug ("\u0066\u0061\u0069l\u0065\u0064\u0020\u006f\u0074\u0020\u0063\u0061\u0073\u0074\u0020\u0074\u006f\u0020\u002a\u0063\u006f\u0072\u0065\u002e\u0050\u0064\u0066\u004f\u0062\u006a\u0065\u0063\u0074R\u0065\u0066\u0065\u0072\u0065\u006e\u0063\u0065");
|
||
};if _agcag !=-1&&_dcag !=nil {if _acde ,_cdbba :=_fgcf [_agcag ];_cdbba {if _begf ,_dec :=_cegbc .(*_ga .PdfIndirectObject );_dec {_edbb :=_begf .PdfObjectReference ;if _dc .DeepEqual (*_dcag ,_edbb ){_gdfaec =_acde ;};};};};if _ccfg !=nil {_ecba =_gcba (_ccfg ,_fgcf ,_cegbc );
|
||
};_fcfag :=_gfea (_gdfaec ,_fbff ,_ecba );_bfgbd =append (_bfgbd ,_fcfag );};return _bfgbd ;};func (_dgc *wordBag )removeWord (_daac *textWord ,_ggca int ){_dgb :=_dgc ._dceb [_ggca ];_dgb =_cafca (_dgb ,_daac );if len (_dgb )==0{delete (_dgc ._dceb ,_ggca );
|
||
}else {_dgc ._dceb [_ggca ]=_dgb ;};};func (_ebcc gridTiling )complete ()bool {for _ ,_cgfba :=range _ebcc ._eagfa {for _ ,_ggfdf :=range _cgfba {if !_ggfdf .complete (){return false ;};};};return true ;};type fontEntry struct{_fefa *_bg .PdfFont ;_cacb int64 ;
|
||
};func _bda (_gebg _bg .PdfRectangle )textState {return textState {_dcc :100,_agbg :RenderModeFill ,_cfge :_gebg };};func (_bdd *imageExtractContext )extractXObjectImage (_eeeb *_ga .PdfObjectName ,_cfb _de .GraphicsState ,_ffa *_bg .PdfPageResources )error {_fgf ,_ :=_ffa .GetXObjectByName (*_eeeb );
|
||
if _fgf ==nil {return nil ;};_gaf ,_bbd :=_bdd ._dab [_fgf ];if !_bbd {_ced ,_fbb :=_ffa .GetXObjectImageByName (*_eeeb );if _fbb !=nil {return _fbb ;};if _ced ==nil {return nil ;};_bce ,_fbb :=_ced .ToImage ();if _fbb !=nil {return _fbb ;};var _cdb _be .Image ;
|
||
if _ced .SMask !=nil {_cdb ,_fbb =_bdad (_ced .SMask ,_g .Opaque );if _fbb !=nil {_bb .Log .Debug ("W\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0067\u0065\u0074\u0020\u0073\u006f\u0066\u0074\u0020\u0069\u006da\u0067e\u0020\u006d\u0061\u0073k\u002e\u0020O\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063\u0074\u002e");
|
||
};};if _cdb !=nil {_aga ,_egb :=_bce .ToGoImage ();if _egb !=nil {return _egb ;};_aga =_gcbab (_aga ,_cdb );switch _ced .ColorSpace .String (){case "\u0044\u0065\u0076\u0069\u0063\u0065\u0047\u0072\u0061\u0079","\u0049n\u0064\u0065\u0078\u0065\u0064":_bce ,_egb =_bg .ImageHandling .NewGrayImageFromGoImage (_aga );
|
||
if _egb !=nil {return _egb ;};default:_bce ,_egb =_bg .ImageHandling .NewImageFromGoImage (_aga );if _egb !=nil {return _egb ;};};};_gaf =&cachedImage {_gaeg :_bce ,_cdf :_ced .ColorSpace };_bdd ._dab [_fgf ]=_gaf ;};_ddb :=_gaf ._gaeg ;_dgd :=_gaf ._cdf ;
|
||
_cgce ,_dcbc :=_dgd .ImageToRGB (*_ddb );if _dcbc !=nil {return _dcbc ;};_bb .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_cfb .CTM .String ());_gaa :=ImageMark {Image :&_cgce ,Width :_cfb .CTM .ScalingFactorX (),Height :_cfb .CTM .ScalingFactorY (),Angle :_cfb .CTM .Angle ()};
|
||
_gaa .X ,_gaa .Y =_cfb .CTM .Translation ();_bdd ._cc =append (_bdd ._cc ,_gaa );_bdd ._bab ++;return nil ;};func (_eggfg intSet )del (_ddcgc int ){delete (_eggfg ,_ddcgc )};func (_feca rulingList )asTiling ()gridTiling {if _dcdd {_bb .Log .Info ("r\u0075\u006ci\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0063s\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u002b\u002b\u002b\u0020\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_feca ));
|
||
};for _effc ,_fbege :=range _feca [1:]{_cacd :=_feca [_effc ];if _cacd .alignsPrimary (_fbege )&&_cacd .alignsSec (_fbege ){_bb .Log .Error ("a\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0044\u0075\u0070\u006c\u0069\u0063\u0061\u0074\u0065 \u0072\u0075\u006c\u0069\u006e\u0067\u0073\u002e\u000a\u0009v=\u0025\u0073\u000a\t\u0077=\u0025\u0073",_fbege ,_cacd );
|
||
};};_feca .sortStrict ();_feca .log ("\u0073n\u0061\u0070\u0070\u0065\u0064");_ecdf ,_adeg :=_feca .vertsHorzs ();_edgg :=_ecdf .primaries ();_bdaeab :=_adeg .primaries ();_dagf :=len (_edgg )-1;_fcgc :=len (_bdaeab )-1;if _dagf ==0||_fcgc ==0{return gridTiling {};
|
||
};_dgae :=_bg .PdfRectangle {Llx :_edgg [0],Urx :_edgg [_dagf ],Lly :_bdaeab [0],Ury :_bdaeab [_fcgc ]};if _dcdd {_bb .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0072\u0074s=\u0025\u0064",len (_ecdf ));
|
||
for _cfcd ,_afebb :=range _ecdf {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cfcd ,_afebb );};_bb .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0068\u006f\u0072\u007as=\u0025\u0064",len (_adeg ));
|
||
for _ceeaa ,_cegee :=range _adeg {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ceeaa ,_cegee );};_bb .Log .Info ("\u0072\u0075\u006c\u0069\u006eg\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067:\u0020\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078\u0025\u0064\u000a\u0009\u006c\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a\u0009\u006c\u006c\u0079\u003d\u0025\u002e\u0032f",_dagf ,_fcgc ,_edgg ,_bdaeab );
|
||
};_gdbdb :=make ([]gridTile ,_dagf *_fcgc );for _decd :=_fcgc -1;_decd >=0;_decd --{_cfeaa :=_bdaeab [_decd ];_gfeac :=_bdaeab [_decd +1];for _edadc :=0;_edadc < _dagf ;_edadc ++{_gdee :=_edgg [_edadc ];_gccf :=_edgg [_edadc +1];_ddcf :=_ecdf .findPrimSec (_gdee ,_cfeaa );
|
||
_cgcecg :=_ecdf .findPrimSec (_gccf ,_cfeaa );_ffbc :=_adeg .findPrimSec (_cfeaa ,_gdee );_gdcc :=_adeg .findPrimSec (_gfeac ,_gdee );_cacba :=_bg .PdfRectangle {Llx :_gdee ,Urx :_gccf ,Lly :_cfeaa ,Ury :_gfeac };_acaa :=_aeabd (_cacba ,_ddcf ,_cgcecg ,_ffbc ,_gdcc );
|
||
_gdbdb [_decd *_dagf +_edadc ]=_acaa ;if _dcdd {_eg .Printf ("\u0020\u0020\u0078\u003d\u0025\u0032\u0064\u0020\u0079\u003d\u0025\u0032\u0064\u003a\u0020%\u0073 \u0025\u0036\u002e\u0032\u0066\u0020\u0078\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_edadc ,_decd ,_acaa .String (),_acaa .Width (),_acaa .Height ());
|
||
};};};if _dcdd {_bb .Log .Info ("r\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074.\u0061\u0073\u0054\u0069\u006c\u0069\u006eg:\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0020\u0068\u006f\u0072\u0069zo\u006e\u0074a\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_dgae );
|
||
};_beaed :=make ([]map[float64 ]gridTile ,_fcgc );for _fabda :=_fcgc -1;_fabda >=0;_fabda --{if _dcdd {_eg .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_fabda );};_beaed [_fabda ]=make (map[float64 ]gridTile ,_dagf );for _aeeg :=0;_aeeg < _dagf ;
|
||
_aeeg ++{_aabdg :=_gdbdb [_fabda *_dagf +_aeeg ];if _dcdd {_eg .Printf ("\u0020\u0020\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_aeeg ,_aabdg );};if !_aabdg ._gafdd {continue ;};_fdef :=_aeeg ;for _edaf :=_aeeg +1;!_aabdg ._deage &&_edaf < _dagf ;
|
||
_edaf ++{_addeg :=_gdbdb [_fabda *_dagf +_edaf ];_aabdg .Urx =_addeg .Urx ;_aabdg ._fed =_aabdg ._fed ||_addeg ._fed ;_aabdg ._faba =_aabdg ._faba ||_addeg ._faba ;_aabdg ._deage =_addeg ._deage ;if _dcdd {_eg .Printf ("\u0020 \u0020%\u0034\u0064\u003a\u0020\u0025s\u0020\u2192 \u0025\u0073\u000a",_edaf ,_addeg ,_aabdg );
|
||
};_fdef =_edaf ;};if _dcdd {_eg .Printf (" \u0020 \u0025\u0032\u0064\u0020\u002d\u0020\u0025\u0032d\u0020\u2192\u0020\u0025s\n",_aeeg ,_fdef ,_aabdg );};_aeeg =_fdef ;_beaed [_fabda ][_aabdg .Llx ]=_aabdg ;};};_feedf :=make (map[float64 ]map[float64 ]gridTile ,_fcgc );
|
||
_abfac :=make (map[float64 ]map[float64 ]struct{},_fcgc );for _cdcd :=_fcgc -1;_cdcd >=0;_cdcd --{_aafd :=_gdbdb [_cdcd *_dagf ].Lly ;_feedf [_aafd ]=make (map[float64 ]gridTile ,_dagf );_abfac [_aafd ]=make (map[float64 ]struct{},_dagf );};if _dcdd {_bb .Log .Info ("\u0072u\u006c\u0069n\u0067\u004c\u0069s\u0074\u002e\u0061\u0073\u0054\u0069\u006ci\u006e\u0067\u003a\u0020\u0063\u006fa\u006c\u0065\u0073\u0063\u0065\u0020\u0076\u0065\u0072\u0074\u0069c\u0061\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_dgae );
|
||
};for _gfdf :=_fcgc -1;_gfdf >=0;_gfdf --{_gadae :=_gdbdb [_gfdf *_dagf ].Lly ;_fddea :=_beaed [_gfdf ];if _dcdd {_eg .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_gfdf );};for _ ,_cbgc :=range _bdfa (_fddea ){if _ ,_ceff :=_abfac [_gadae ][_cbgc ];
|
||
_ceff {continue ;};_aggg :=_fddea [_cbgc ];if _dcdd {_eg .Printf (" \u0020\u0020\u0020\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_aggg .String ());};for _aabe :=_gfdf -1;_aabe >=0;_aabe --{if _aggg ._faba {break ;};_cbcbb :=_beaed [_aabe ];_egfc ,_bbebe :=_cbcbb [_cbgc ];
|
||
if !_bbebe {break ;};if _egfc .Urx !=_aggg .Urx {break ;};_aggg ._faba =_egfc ._faba ;_aggg .Lly =_egfc .Lly ;if _dcdd {_eg .Printf ("\u0020\u0020\u0020\u0020 \u0020\u0020\u0076\u003d\u0025\u0073\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_egfc .String (),_aggg .String ());
|
||
};_abfac [_egfc .Lly ][_egfc .Llx ]=struct{}{};};if _gfdf ==0{_aggg ._faba =true ;};if _aggg .complete (){_feedf [_gadae ][_cbgc ]=_aggg ;};};};_ccfgg :=gridTiling {PdfRectangle :_dgae ,_dbdc :_geab (_feedf ),_eaag :_ddfc (_feedf ),_eagfa :_feedf };_ccfgg .log ("\u0043r\u0065\u0061\u0074\u0065\u0064");
|
||
return _ccfgg ;};func (_bddda *textTable )subdivide ()*textTable {_bddda .logComposite ("\u0073u\u0062\u0064\u0069\u0076\u0069\u0064e");_aeaf :=_bddda .compositeRowCorridors ();_eadd :=_bddda .compositeColCorridors ();if _aeff {_bb .Log .Info ("\u0073u\u0062\u0064i\u0076\u0069\u0064\u0065:\u000a\u0009\u0072o\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s=\u0025\u0073\u000a\t\u0063\u006fl\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073=\u0025\u0073",_egccd (_aeaf ),_egccd (_eadd ));
|
||
};if len (_aeaf )==0||len (_eadd )==0{return _bddda ;};_fbbe (_aeaf );_fbbe (_eadd );if _aeff {_bb .Log .Info ("\u0073\u0075\u0062\u0064\u0069\u0076\u0069\u0064\u0065\u0020\u0066\u0069\u0078\u0065\u0064\u003a\u000a\u0009r\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073\u003d\u0025\u0073\u000a\u0009\u0063\u006f\u006cC\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d\u0025\u0073",_egccd (_aeaf ),_egccd (_eadd ));
|
||
};_cgca ,_gdcce :=_caccb (_bddda ._gaaa ,_aeaf );_dfae ,_cbafa :=_caccb (_bddda ._afaff ,_eadd );_dgdda :=make (map[uint64 ]*textPara ,_cbafa *_gdcce );_aefd :=&textTable {PdfRectangle :_bddda .PdfRectangle ,_cfabb :_bddda ._cfabb ,_gaaa :_gdcce ,_afaff :_cbafa ,_feeba :_dgdda };
|
||
if _aeff {_bb .Log .Info ("\u0073\u0075b\u0064\u0069\u0076\u0069\u0064\u0065\u003a\u0020\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0020\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0063\u0065\u006c\u006c\u0073\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0079\u004f\u0066\u0066\u0073\u0065\u0074\u0073=\u0025\u002b\u0076\u000a"+"\u0009\u0078\u004f\u0066\u0066\u0073\u0065\u0074\u0073\u003d\u0025\u002b\u0076",_bddda ._afaff ,_bddda ._gaaa ,_cbafa ,_gdcce ,_egccd (_aeaf ),_egccd (_eadd ),_cgca ,_dfae );
|
||
};for _bgdgg :=0;_bgdgg < _bddda ._gaaa ;_bgdgg ++{_ecgdd :=_cgca [_bgdgg ];for _feabe :=0;_feabe < _bddda ._afaff ;_feabe ++{_bddbg :=_dfae [_feabe ];if _aeff {_eg .Printf ("\u0025\u0036\u0064\u002c %\u0032\u0064\u003a\u0020\u0078\u0030\u003d\u0025\u0064\u0020\u0079\u0030\u003d\u0025d\u000a",_feabe ,_bgdgg ,_bddbg ,_ecgdd );
|
||
};_efcef ,_fagea :=_bddda ._bgag [_gbbce (_feabe ,_bgdgg )];if !_fagea {continue ;};_ccgca :=_efcef .split (_aeaf [_bgdgg ],_eadd [_feabe ]);for _cadbd :=0;_cadbd < _ccgca ._gaaa ;_cadbd ++{for _gdbg :=0;_gdbg < _ccgca ._afaff ;_gdbg ++{_ccfa :=_ccgca .get (_gdbg ,_cadbd );
|
||
_aefd .put (_bddbg +_gdbg ,_ecgdd +_cadbd ,_ccfa );if _aeff {_eg .Printf ("\u0025\u0038\u0064\u002c\u0020\u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_bddbg +_gdbg ,_ecgdd +_cadbd ,_ccfa );};};};};};return _aefd ;};
|
||
|
||
// String returns a human readable description of `ss`.
|
||
func (_acdg *shapesState )String ()string {return _eg .Sprintf ("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d",len (_acdg ._bfeb ),_acdg ._abac );};func _geab (_caad map[float64 ]map[float64 ]gridTile )[]float64 {_efff :=make ([]float64 ,0,len (_caad ));
|
||
_ebbgb :=make (map[float64 ]struct{},len (_caad ));for _ ,_beagbg :=range _caad {for _aded :=range _beagbg {if _ ,_dcbcgd :=_ebbgb [_aded ];_dcbcgd {continue ;};_efff =append (_efff ,_aded );_ebbgb [_aded ]=struct{}{};};};_ff .Float64s (_efff );return _efff ;
|
||
};
|
||
|
||
// Font represents the font properties on a PDF page.
|
||
type Font struct{PdfFont *_bg .PdfFont ;
|
||
|
||
// FontName represents Font Name from font properties.
|
||
FontName string ;
|
||
|
||
// FontType represents Font Subtype entry in the font dictionary inside page resources.
|
||
// Examples : type0, Type1, MMType1, Type3, TrueType, CIDFont.
|
||
FontType string ;
|
||
|
||
// ToUnicode is true if font provides a `ToUnicode` mapping.
|
||
ToUnicode bool ;
|
||
|
||
// IsCID is true if underlying font is a composite font.
|
||
// Composite font is represented by a font dictionary whose Subtype is `Type0`
|
||
IsCID bool ;
|
||
|
||
// IsSimple is true if font is simple font.
|
||
// A simple font is limited to only 8 bit (255) character codes.
|
||
IsSimple bool ;
|
||
|
||
// FontData represents the raw data of the embedded font file.
|
||
// It can have format TrueType (TTF), PostScript Font (PFB) or Compact Font Format (CCF).
|
||
// FontData value can be indicates from `FontFile`, `FontFile2` or `FontFile3` inside Font Descriptor.
|
||
// At most, only one of `FontFile`, `FontFile2` or `FontFile3` will be FontData value.
|
||
FontData []byte ;
|
||
|
||
// FontFileName is a name representing the font. it has format:
|
||
// (Font Name) + (Font Type Extension), example: helvetica.ttf.
|
||
FontFileName string ;
|
||
|
||
// FontDescriptor represents metrics and other attributes inside font properties from PDF Structure (Font Descriptor).
|
||
FontDescriptor *_bg .PdfFontDescriptor ;};func _aeece (_fgbd []float64 ,_gaag ,_dcbeg float64 )[]float64 {_fggeb ,_aaeed :=_gaag ,_dcbeg ;if _aaeed < _fggeb {_fggeb ,_aaeed =_aaeed ,_fggeb ;};_bbbd :=make ([]float64 ,0,len (_fgbd )+2);_bbbd =append (_bbbd ,_gaag );
|
||
for _ ,_edge :=range _fgbd {if _edge <=_fggeb {continue ;}else if _edge >=_aaeed {break ;};_bbbd =append (_bbbd ,_edge );};_bbbd =append (_bbbd ,_dcbeg );return _bbbd ;};func (_ddebb lineRuling )asRuling ()(*ruling ,bool ){_bcdb :=ruling {_ffdb :_ddebb ._fcfg ,Color :_ddebb .Color ,_cbbcc :_feeb };
|
||
switch _ddebb ._fcfg {case _gfbd :_bcdb ._ecdge =_ddebb .xMean ();_bcdb ._bafb =_ef .Min (_ddebb ._gabf .Y ,_ddebb ._fccf .Y );_bcdb ._cbdc =_ef .Max (_ddebb ._gabf .Y ,_ddebb ._fccf .Y );case _fcec :_bcdb ._ecdge =_ddebb .yMean ();_bcdb ._bafb =_ef .Min (_ddebb ._gabf .X ,_ddebb ._fccf .X );
|
||
_bcdb ._cbdc =_ef .Max (_ddebb ._gabf .X ,_ddebb ._fccf .X );default:_bb .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_ddebb ._fcfg );return nil ,false ;};return &_bcdb ,true ;};
|
||
func _ddbfd (_aadd bounded )float64 {return -_aadd .bbox ().Lly };func (_cegb *textObject )setFont (_agcf string ,_gec float64 )error {if _cegb ==nil {return nil ;};_cegb ._aed ._fbda =_gec ;_gdfa ,_dce :=_cegb .getFont (_agcf );if _dce !=nil {return _dce ;
|
||
};_cegb ._aed ._ecd =_gdfa ;return nil ;};
|
||
|
||
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
||
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
||
// `start` and `end` are offsets in the extracted text.
|
||
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
||
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
||
func (_gcb *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _gcb ==nil {return nil ,_b .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_eg .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );
|
||
};_gbaa :=len (_gcb ._dafa );if _gbaa ==0{return _gcb ,nil ;};if start < _gcb ._dafa [0].Offset {start =_gcb ._dafa [0].Offset ;};if end > _gcb ._dafa [_gbaa -1].Offset +1{end =_gcb ._dafa [_gbaa -1].Offset +1;};_eccde :=_ff .Search (_gbaa ,func (_ddea int )bool {return _gcb ._dafa [_ddea ].Offset +len (_gcb ._dafa [_ddea ].Text )-1>=start });
|
||
if !(0<=_eccde &&_eccde < _gbaa ){_dfcg :=_eg .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_eccde ,_gbaa ,_gcb ._dafa [0],_gcb ._dafa [_gbaa -1]);
|
||
return nil ,_dfcg ;};_bddb :=_ff .Search (_gbaa ,func (_gacd int )bool {return _gcb ._dafa [_gacd ].Offset > end -1});if !(0<=_bddb &&_bddb < _gbaa ){_aedg :=_eg .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_bddb ,_gbaa ,_gcb ._dafa [0],_gcb ._dafa [_gbaa -1]);
|
||
return nil ,_aedg ;};if _bddb <=_eccde {return nil ,_eg .Errorf ("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_eccde ,_bddb );
|
||
};return &TextMarkArray {_dafa :_gcb ._dafa [_eccde :_bddb ]},nil ;};func (_dfb *stateStack )pop ()*textState {if _dfb .empty (){return nil ;};_cgff :=*(*_dfb )[len (*_dfb )-1];*_dfb =(*_dfb )[:len (*_dfb )-1];return &_cgff ;};func _acefg (_ceadc _bg .PdfRectangle )*ruling {return &ruling {_ffdb :_gfbd ,_ecdge :_ceadc .Llx ,_bafb :_ceadc .Lly ,_cbdc :_ceadc .Ury };
|
||
};func (_baed *subpath )removeDuplicates (){if len (_baed ._egfb )==0{return ;};_bddd :=[]_cb .Point {_baed ._egfb [0]};for _ ,_bgg :=range _baed ._egfb [1:]{if !_babg (_bgg ,_bddd [len (_bddd )-1]){_bddd =append (_bddd ,_bgg );};};_baed ._egfb =_bddd ;
|
||
};func (_dbgd *compositeCell )updateBBox (){for _ ,_ccgeg :=range _dbgd .paraList {_dbgd .PdfRectangle =_dfba (_dbgd .PdfRectangle ,_ccgeg .PdfRectangle );};};func (_dgca paraList )topoOrder ()[]int {if _bdcc {_bb .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");
|
||
};_bcgf :=len (_dgca );_cffab :=make ([]bool ,_bcgf );_dacfe :=make ([]int ,0,_bcgf );_fbec :=_dgca .llyOrdering ();var _ebbg func (_fdeb int );_ebbg =func (_bcgfg int ){_cffab [_bcgfg ]=true ;for _fdafc :=0;_fdafc < _bcgf ;_fdafc ++{if !_cffab [_fdafc ]{if _dgca .readBefore (_fbec ,_bcgfg ,_fdafc ){_ebbg (_fdafc );
|
||
};};};_dacfe =append (_dacfe ,_bcgfg );};for _afgeb :=0;_afgeb < _bcgf ;_afgeb ++{if !_cffab [_afgeb ]{_ebbg (_afgeb );};};return _eafgb (_dacfe );};func (_cbeag paraList )eventNeighbours (_aagg []event )map[*textPara ][]int {_ff .Slice (_aagg ,func (_afaa ,_cbfa int )bool {_dbaab ,_fcefd :=_aagg [_afaa ],_aagg [_cbfa ];
|
||
_fdaef ,_fceef :=_dbaab ._bbcgf ,_fcefd ._bbcgf ;if _fdaef !=_fceef {return _fdaef < _fceef ;};if _dbaab ._cefa !=_fcefd ._cefa {return _dbaab ._cefa ;};return _afaa < _cbfa ;});_dbgef :=make (map[int ]intSet );_acaf :=make (intSet );for _ ,_bgab :=range _aagg {if _bgab ._cefa {_dbgef [_bgab ._bdcff ]=make (intSet );
|
||
for _fbgaf :=range _acaf {if _fbgaf !=_bgab ._bdcff {_dbgef [_bgab ._bdcff ].add (_fbgaf );_dbgef [_fbgaf ].add (_bgab ._bdcff );};};_acaf .add (_bgab ._bdcff );}else {_acaf .del (_bgab ._bdcff );};};_ggabb :=map[*textPara ][]int {};for _cfgfc ,_eacbf :=range _dbgef {_bdgf :=_cbeag [_cfgfc ];
|
||
if len (_eacbf )==0{_ggabb [_bdgf ]=nil ;continue ;};_cfeac :=make ([]int ,len (_eacbf ));_egfad :=0;for _gfdeg :=range _eacbf {_cfeac [_egfad ]=_gfdeg ;_egfad ++;};_ggabb [_bdgf ]=_cfeac ;};return _ggabb ;};var _egdbc =map[rulingKind ]string {_dgeec :"\u006e\u006f\u006e\u0065",_fcec :"\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_gfbd :"\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c"};
|
||
func _eacaf (_cgdaa ,_gcad float64 )bool {return _cgdaa /_ef .Max (_cga ,_gcad )< _dfbd };func _fece (_bbba ,_bgbgc *textPara )bool {return _gfdb (_bbba ._aaga ,_bgbgc ._aaga )};func (_bfgbc rulingList )primMinMax ()(float64 ,float64 ){_aegg ,_eebdf :=_bfgbc [0]._ecdge ,_bfgbc [0]._ecdge ;
|
||
for _ ,_ebgga :=range _bfgbc [1:]{if _ebgga ._ecdge < _aegg {_aegg =_ebgga ._ecdge ;}else if _ebgga ._ecdge > _eebdf {_eebdf =_ebgga ._ecdge ;};};return _aegg ,_eebdf ;};
|
||
|
||
// ToText returns the page text as a single string.
|
||
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
||
// Text() instead.
|
||
func (_dbbd PageText )ToText ()string {return _dbbd .Text ()};
|
||
|
||
// PageFonts represents extracted fonts on a PDF page.
|
||
type PageFonts struct{Fonts []Font ;};type cachedImage struct{_gaeg *_bg .Image ;_cdf _bg .PdfColorspace ;};func (_beb *textObject )setTextLeading (_afeb float64 ){if _beb ==nil {return ;};_beb ._aed ._ddba =_afeb ;};type pathSection struct{_aadg []*subpath ;
|
||
_g .Color ;};func (_fffg *wordBag )pullWord (_dcee *textWord ,_dbbea int ,_fae map[int ]map[*textWord ]struct{}){_fffg .PdfRectangle =_dfba (_fffg .PdfRectangle ,_dcee .PdfRectangle );if _dcee ._beaeg > _fffg ._ddgcf {_fffg ._ddgcf =_dcee ._beaeg ;};_fffg ._dceb [_dbbea ]=append (_fffg ._dceb [_dbbea ],_dcee );
|
||
_fae [_dbbea ][_dcee ]=struct{}{};};func (_gdbf *wordBag )getDepthIdx (_adaca float64 )int {_aecd :=_gdbf .depthIndexes ();_fecb :=_adbg (_adaca );if _fecb < _aecd [0]{return _aecd [0];};if _fecb > _aecd [len (_aecd )-1]{return _aecd [len (_aecd )-1];};
|
||
return _fecb ;};func (_fdeg paraList )llyRange (_fcffa []int ,_dfccc ,_efgbb float64 )[]int {_degb :=len (_fdeg );if _efgbb < _fdeg [_fcffa [0]].Lly ||_dfccc > _fdeg [_fcffa [_degb -1]].Lly {return nil ;};_cfceg :=_ff .Search (_degb ,func (_dcbg int )bool {return _fdeg [_fcffa [_dcbg ]].Lly >=_dfccc });
|
||
_bbfe :=_ff .Search (_degb ,func (_ddbbb int )bool {return _fdeg [_fcffa [_ddbbb ]].Lly > _efgbb });return _fcffa [_cfceg :_bbfe ];};func _dffc (_eadg []TextMark ,_dacf *int ,_acfbdc TextMark )[]TextMark {_acfbdc .Offset =*_dacf ;_eadg =append (_eadg ,_acfbdc );
|
||
*_dacf +=len (_acfbdc .Text );return _eadg ;};func _dgcff (_fbebg float64 ,_ddddb int )int {if _ddddb ==0{_ddddb =1;};_dfbg :=float64 (_ddddb );return int (_ef .Round (_fbebg /_dfbg )*_dfbg );};func _gbbc (_egga map[float64 ][]*textLine )[]float64 {_bfbg :=[]float64 {};
|
||
for _egcf :=range _egga {_bfbg =append (_bfbg ,_egcf );};_ff .Float64s (_bfbg );return _bfbg ;};func (_ggcfd paraList )applyTables (_aaecb []*textTable )paraList {var _eddfg paraList ;for _ ,_gdaga :=range _aaecb {_eddfg =append (_eddfg ,_gdaga .newTablePara ());
|
||
};for _ ,_ceggd :=range _ggcfd {if _ceggd ._ggcbf {continue ;};_eddfg =append (_eddfg ,_ceggd );};return _eddfg ;};
|
||
|
||
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
||
//
|
||
// Replace with a function like Extract() (*PageText, error)
|
||
func (_bfg *Extractor )ExtractPageText ()(*PageText ,int ,int ,error ){_dbg ,_bed ,_fcfd ,_ddg :=_bfg .extractPageText (_bfg ._cd ,_bfg ._ee ,_cb .IdentityMatrix (),0);if _ddg !=nil &&_ddg !=_bg .ErrColorOutOfRange {return nil ,0,0,_ddg ;};if _bfg ._ab !=nil {_dbg ._afc ._add =_bfg ._ab .UseSimplerExtractionProcess ;
|
||
};_dbg .computeViews ();_ddg =_afdbc (_dbg );if _ddg !=nil {return nil ,0,0,_ddg ;};if _bfg ._ab !=nil {if _bfg ._ab .ApplyCropBox &&_bfg ._gc !=nil {_dbg .ApplyArea (*_bfg ._gc );};_dbg ._afc ._ggc =_bfg ._ab .DisableDocumentTags ;};return _dbg ,_bed ,_fcfd ,nil ;
|
||
};func (_dbeb *wordBag )absorb (_bbfd *wordBag ){_ebbb :=_bbfd .makeRemovals ();for _dgg ,_deba :=range _bbfd ._dceb {for _ ,_fddb :=range _deba {_dbeb .pullWord (_fddb ,_dgg ,_ebbb );};};_bbfd .applyRemovals (_ebbb );};var _feg string ="\u005e\u005b\u0061\u002d\u007a\u0041\u002dZ\u005d\u0028\u005c)\u007c\u005c\u002e)\u007c\u005e[\u005c\u0064\u005d\u002b\u0028\u005c)\u007c\\.\u0029\u007c\u005e\u005c\u0028\u005b\u0061\u002d\u007a\u0041\u002d\u005a\u005d\u005c\u0029\u007c\u005e\u005c\u0028\u005b\u005c\u0064\u005d\u002b\u005c\u0029";
|
||
type gridTiling struct{_bg .PdfRectangle ;_dbdc []float64 ;_eaag []float64 ;_eagfa map[float64 ]map[float64 ]gridTile ;};func _abcdd (_fgca *list )[]*list {var _fbba []*list ;for _ ,_eag :=range _fgca ._bcdd {switch _eag ._cfcfa {case "\u004c\u0049":_ggddd :=_ccdb (_eag );
|
||
_aebd :=_abcdd (_eag );_gadef :=_gfea (_ggddd ,"\u0062\u0075\u006c\u006c\u0065\u0074",_aebd );_bdcf :=_abfbf (_ggddd ,"");_gadef ._feed =_bdcf ;_fbba =append (_fbba ,_gadef );case "\u004c\u0042\u006fd\u0079":return _abcdd (_eag );case "\u004c":_ffga :=_abcdd (_eag );
|
||
_fbba =append (_fbba ,_ffga ...);return _fbba ;};};return _fbba ;};func (_gadd rulingList )mergePrimary ()float64 {_aeae :=_gadd [0]._ecdge ;for _ ,_fefbfb :=range _gadd [1:]{_aeae +=_fefbfb ._ecdge ;};return _aeae /float64 (len (_gadd ));};func (_abfbb paraList )sortTopoOrder (){_dbac :=_abfbb .topoOrder ();
|
||
_abfbb .reorder (_dbac )};func _cgge (_aee _cb .Point )_cb .Matrix {return _cb .TranslationMatrix (_aee .X ,_aee .Y )};
|
||
|
||
// TextTable represents a table.
|
||
// Cells are ordered top-to-bottom, left-to-right.
|
||
// Cells[y] is the (0-offset) y'th row in the table.
|
||
// Cells[y][x] is the (0-offset) x'th column in the table.
|
||
type TextTable struct{_bg .PdfRectangle ;W ,H int ;Cells [][]TableCell ;};type list struct{_dgef []*textLine ;_cfcfa string ;_bcdd []*list ;_feed string ;};func (_gcaa *textTable )reduce ()*textTable {_cfggc :=make ([]int ,0,_gcaa ._gaaa );_eebf :=make ([]int ,0,_gcaa ._afaff );
|
||
for _ceaab :=0;_ceaab < _gcaa ._gaaa ;_ceaab ++{if !_gcaa .emptyCompositeRow (_ceaab ){_cfggc =append (_cfggc ,_ceaab );};};for _egbb :=0;_egbb < _gcaa ._afaff ;_egbb ++{if !_gcaa .emptyCompositeColumn (_egbb ){_eebf =append (_eebf ,_egbb );};};if len (_cfggc )==_gcaa ._gaaa &&len (_eebf )==_gcaa ._afaff {return _gcaa ;
|
||
};_bdfbc :=textTable {_cfabb :_gcaa ._cfabb ,_afaff :len (_eebf ),_gaaa :len (_cfggc ),_feeba :make (map[uint64 ]*textPara ,len (_eebf )*len (_cfggc ))};if _aeff {_bb .Log .Info ("\u0072\u0065\u0064\u0075ce\u003a\u0020\u0025\u0064\u0078\u0025\u0064\u0020\u002d\u003e\u0020\u0025\u0064\u0078%\u0064",_gcaa ._afaff ,_gcaa ._gaaa ,len (_eebf ),len (_cfggc ));
|
||
_bb .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_eebf );_bb .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_cfggc );};for _bgffg ,_caeec :=range _cfggc {for _dfbac ,_adgc :=range _eebf {_aeed ,_dcbbf :=_gcaa .getComposite (_adgc ,_caeec );
|
||
if _aeed ==nil {continue ;};if _aeff {_eg .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_dfbac ,_bgffg ,_adgc ,_caeec ,_egde (_aeed .merge ().text (),50));};_bdfbc .putComposite (_dfbac ,_bgffg ,_aeed ,_dcbbf );
|
||
};};return &_bdfbc ;};func (_gfd *shapesState )addPoint (_agd ,_cca float64 ){_dfeg :=_gfd .establishSubpath ();_fafe :=_gfd .devicePoint (_agd ,_cca );if _dfeg ==nil {_gfd ._abac =true ;_gfd ._bfgg =_fafe ;}else {_dfeg .add (_fafe );};};func (_deeg *shapesState )establishSubpath ()*subpath {_fgcd ,_faae :=_deeg .lastpointEstablished ();
|
||
if !_faae {_deeg ._bfeb =append (_deeg ._bfeb ,_fdga (_fgcd ));};if len (_deeg ._bfeb )==0{return nil ;};_deeg ._abac =false ;return _deeg ._bfeb [len (_deeg ._bfeb )-1];};type paraList []*textPara ;func (_egbcc paraList )lines ()[]*textLine {var _feaab []*textLine ;
|
||
for _ ,_fefbf :=range _egbcc {_feaab =append (_feaab ,_fefbf ._bbagc ...);};return _feaab ;};
|
||
|
||
// ImageExtractOptions contains options for controlling image extraction from
|
||
// PDF pages.
|
||
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};type rulingList []*ruling ;func (_accb *wordBag )sort (){for _ ,_cgcd :=range _accb ._dceb {_ff .Slice (_cgcd ,func (_efbb ,_daga int )bool {return _gffg (_cgcd [_efbb ],_cgcd [_daga ])< 0});
|
||
};};func (_bfce *stateStack )top ()*textState {if _bfce .empty (){return nil ;};return (*_bfce )[_bfce .size ()-1];};type textState struct{_edf float64 ;_dedcd float64 ;_dcc float64 ;_ddba float64 ;_fbda float64 ;_agbg RenderMode ;_adf float64 ;_ecd *_bg .PdfFont ;
|
||
_cfge _bg .PdfRectangle ;_ggaa int ;_gba int ;};func (_dbdd compositeCell )hasLines (_degf []*textLine )bool {for _fffd ,_caaeb :=range _degf {_ddgbd :=_gcfe (_dbdd .PdfRectangle ,_caaeb .PdfRectangle );if _aeff {_eg .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u005e\u005e\u005e\u0069\u006e\u0074\u0065\u0072\u0073e\u0063t\u0073\u003d\u0025\u0074\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u000a",_ddgbd ,_fffd ,len (_degf ));
|
||
_eg .Printf ("\u0020\u0020\u0020\u0020 \u005e\u005e\u005e\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025s\u000a",_dbdd );_eg .Printf ("\u0020 \u0020 \u0020\u0020\u0020\u006c\u0069\u006e\u0065\u003d\u0025\u0073\u000a",_caaeb );};if _ddgbd {return true ;
|
||
};};return false ;};const _dde =20;func _gcfe (_cdag ,_bfcce _bg .PdfRectangle )bool {return _gfdb (_cdag ,_bfcce )&&_cfa (_cdag ,_bfcce )};
|
||
|
||
// List returns all the list objects detected on the page.
|
||
// It detects all the bullet point Lists from a given pdf page and builds a slice of bullet list objects.
|
||
// A given bullet list object has a tree structure.
|
||
// Each bullet point list is extracted with the text content it contains and all the sub lists found under it as children in the tree.
|
||
// The rest content of the pdf is ignored and only text in the bullet point lists are extracted.
|
||
// The list extraction is done in two ways.
|
||
// 1. If the document is tagged then the lists are extracted using the tags provided in the document.
|
||
// 2. Otherwise the bullet lists are extracted from the raw text using regex matching.
|
||
// By default the document tag is used if available.
|
||
// However this can be disabled using `DisableDocumentTags` in the `Options` object.
|
||
// Sometimes disabling document tags option might give a better bullet list extraction if the document was tagged incorrectly.
|
||
// options := &Options{
|
||
// DisableDocumentTags: false, // this means use document tag if available
|
||
// }
|
||
// ex, err := NewWithOptions(page, options)
|
||
// // handle error
|
||
// pageText, _, _, err := ex.ExtractPageText()
|
||
// // handle error
|
||
// lists := pageText.List()
|
||
// txt := lists.Text()
|
||
func (_babd PageText )List ()lists {_bbbc :=!_babd ._afc ._ggc ;_fbge :=_babd .getParagraphs ();_cffa :=true ;if _babd ._abba ==nil ||*_babd ._abba ==nil {_cffa =false ;};_fdfa :=_fbge .list ();if _cffa &&_bbbc {_bgcb :=_aedac (&_fbge );_fdag :=&structTreeRoot {};
|
||
_fdag .parseStructTreeRoot (*_babd ._abba );if _fdag ._aaef ==nil {_bb .Log .Debug ("\u004c\u0069\u0073\u0074\u003a\u0020\u0073t\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e'\u0074\u0020\u0068\u0061\u0076e\u0020\u0061\u006e\u0079\u0020\u0063\u006f\u006e\u0074e\u006e\u0074\u002c\u0020\u0075\u0073\u0069\u006e\u0067\u0020\u0074\u0065\u0078\u0074\u0020\u006d\u0061\u0074\u0063\u0068\u0069\u006e\u0067\u0020\u006d\u0065\u0074\u0068\u006f\u0064\u0020\u0069\u006e\u0073\u0074\u0065\u0061\u0064\u002e");
|
||
return _fdfa ;};_fdfa =_fdag .buildList (_bgcb ,_babd ._ccff );};return _fdfa ;};func (_dbace *ruling )gridIntersecting (_dbefg *ruling )bool {return _bccb (_dbace ._bafb ,_dbefg ._bafb )&&_bccb (_dbace ._cbdc ,_dbefg ._cbdc );};func _cfbe (_dcfc map[int ][]float64 )[]int {_dfdf :=make ([]int ,len (_dcfc ));
|
||
_ccbb :=0;for _bgddc :=range _dcfc {_dfdf [_ccbb ]=_bgddc ;_ccbb ++;};_ff .Ints (_dfdf );return _dfdf ;};func (_eadgb rulingList )sortStrict (){_ff .Slice (_eadgb ,func (_eece ,_gddceg int )bool {_bbgcb ,_ecbdg :=_eadgb [_eece ],_eadgb [_gddceg ];_dege ,_ggcf :=_bbgcb ._ffdb ,_ecbdg ._ffdb ;
|
||
if _dege !=_ggcf {return _dege > _ggcf ;};_adfcbg ,_afacd :=_bbgcb ._ecdge ,_ecbdg ._ecdge ;if !_cadfe (_adfcbg -_afacd ){return _adfcbg < _afacd ;};_adfcbg ,_afacd =_bbgcb ._bafb ,_ecbdg ._bafb ;if _adfcbg !=_afacd {return _adfcbg < _afacd ;};return _bbgcb ._cbdc < _ecbdg ._cbdc ;
|
||
});};
|
||
|
||
// String returns a string describing `tm`.
|
||
func (_gbf TextMark )String ()string {_aaabe :=_gbf .BBox ;var _acd string ;if _gbf .Font !=nil {_acd =_gbf .Font .String ();if len (_acd )> 50{_acd =_acd [:50]+"\u002e\u002e\u002e";};};var _afbg string ;if _gbf .Meta {_afbg ="\u0020\u002a\u004d\u002a";
|
||
};return _eg .Sprintf ("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",_gbf .Offset ,_gbf .Text ,[]rune (_gbf .Text ),_aaabe .Llx ,_aaabe .Lly ,_aaabe .Urx ,_aaabe .Ury ,_acd ,_afbg );
|
||
};func _befb (_gegac []TextMark ,_bbbee *int ,_eadfg string )[]TextMark {_aabb :=_ecg ;_aabb .Text =_eadfg ;return _dffc (_gegac ,_bbbee ,_aabb );};func (_cfcc *textMark )inDiacriticArea (_cggee *textMark )bool {_eefa :=_cfcc .Llx -_cggee .Llx ;_egef :=_cfcc .Urx -_cggee .Urx ;
|
||
_bggdb :=_cfcc .Lly -_cggee .Lly ;return _ef .Abs (_eefa +_egef )< _cfcc .Width ()*_dgge &&_ef .Abs (_bggdb )< _cfcc .Height ()*_dgge ;};func (_gaab *textTable )computeBbox ()_bg .PdfRectangle {var _gggde _bg .PdfRectangle ;_ffgae :=false ;for _aaeg :=0;
|
||
_aaeg < _gaab ._gaaa ;_aaeg ++{for _ggbee :=0;_ggbee < _gaab ._afaff ;_ggbee ++{_ddbaca :=_gaab .get (_ggbee ,_aaeg );if _ddbaca ==nil {continue ;};if !_ffgae {_gggde =_ddbaca .PdfRectangle ;_ffgae =true ;}else {_gggde =_dfba (_gggde ,_ddbaca .PdfRectangle );
|
||
};};};return _gggde ;};type rulingKind int ;func (_gecb *textTable )reduceTiling (_gebde gridTiling ,_addbc float64 )*textTable {_cfegd :=make ([]int ,0,_gecb ._gaaa );_afdb :=make ([]int ,0,_gecb ._afaff );_ccag :=_gebde ._dbdc ;_dabcg :=_gebde ._eaag ;
|
||
for _gcbgd :=0;_gcbgd < _gecb ._gaaa ;_gcbgd ++{_fdge :=_gcbgd > 0&&_ef .Abs (_dabcg [_gcbgd -1]-_dabcg [_gcbgd ])< _addbc &&_gecb .emptyCompositeRow (_gcbgd );if !_fdge {_cfegd =append (_cfegd ,_gcbgd );};};for _cafg :=0;_cafg < _gecb ._afaff ;_cafg ++{_fcddf :=_cafg < _gecb ._afaff -1&&_ef .Abs (_ccag [_cafg +1]-_ccag [_cafg ])< _addbc &&_gecb .emptyCompositeColumn (_cafg );
|
||
if !_fcddf {_afdb =append (_afdb ,_cafg );};};if len (_cfegd )==_gecb ._gaaa &&len (_afdb )==_gecb ._afaff {return _gecb ;};_adee :=textTable {_cfabb :_gecb ._cfabb ,_afaff :len (_afdb ),_gaaa :len (_cfegd ),_bgag :make (map[uint64 ]compositeCell ,len (_afdb )*len (_cfegd ))};
|
||
if _aeff {_bb .Log .Info ("\u0072\u0065\u0064\u0075c\u0065\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0025d\u0078%\u0064\u0020\u002d\u003e\u0020\u0025\u0064x\u0025\u0064",_gecb ._afaff ,_gecb ._gaaa ,len (_afdb ),len (_cfegd ));_bb .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_afdb );
|
||
_bb .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_cfegd );};for _aefdb ,_bafa :=range _cfegd {for _fcaeb ,_gcce :=range _afdb {_gbff ,_bfcba :=_gecb .getComposite (_gcce ,_bafa );if len (_gbff )==0{continue ;
|
||
};if _aeff {_eg .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_fcaeb ,_aefdb ,_gcce ,_bafa ,_egde (_gbff .merge ().text (),50));};_adee .putComposite (_fcaeb ,_aefdb ,_gbff ,_bfcba );
|
||
};};return &_adee ;};func _abaa (_ecag ,_dgdc ,_dcgc float64 )rulingKind {if _ecag >=_dcgc &&_eacaf (_dgdc ,_ecag ){return _fcec ;};if _dgdc >=_dcgc &&_eacaf (_ecag ,_dgdc ){return _gfbd ;};return _dgeec ;};func (_agcgg compositeCell )String ()string {_cegg :="";
|
||
if len (_agcgg .paraList )> 0{_cegg =_egde (_agcgg .paraList .merge ().text (),50);};return _eg .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0070\u0061\u0072a\u0073\u0020\u0025\u0071",_agcgg .PdfRectangle ,len (_agcgg .paraList ),_cegg );
|
||
};func (_degd *textPara )bbox ()_bg .PdfRectangle {return _degd .PdfRectangle };func (_gfce rulingList )aligned ()bool {if len (_gfce )< 2{return false ;};_cgcec :=make (map[*ruling ]int );_cgcec [_gfce [0]]=0;for _ ,_dcged :=range _gfce [1:]{_egag :=false ;
|
||
for _bbgd :=range _cgcec {if _dcged .gridIntersecting (_bbgd ){_cgcec [_bbgd ]++;_egag =true ;break ;};};if !_egag {_cgcec [_dcged ]=0;};};_cfaf :=0;for _ ,_dgbc :=range _cgcec {if _dgbc ==0{_cfaf ++;};};_gafcg :=float64 (_cfaf )/float64 (len (_gfce ));
|
||
_acg :=_gafcg <=1.0-_cgac ;if _bffg {_bb .Log .Info ("\u0061\u006c\u0069\u0067\u006e\u0065\u0064\u003d\u0025\u0074\u0020\u0075\u006em\u0061\u0074\u0063\u0068\u0065\u0064=\u0025\u002e\u0032\u0066\u003d\u0025\u0064\u002f\u0025\u0064\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_acg ,_gafcg ,_cfaf ,len (_gfce ),_gfce .String ());
|
||
};return _acg ;};func (_gfbe rulingList )intersections ()map[int ]intSet {var _aagf ,_dbgdd []int ;for _gdfga ,_cgbfb :=range _gfbe {switch _cgbfb ._ffdb {case _gfbd :_aagf =append (_aagf ,_gdfga );case _fcec :_dbgdd =append (_dbgdd ,_gdfga );};};if len (_aagf )< _geda +1||len (_dbgdd )< _efdde +1{return nil ;
|
||
};if len (_aagf )+len (_dbgdd )> _gadb {_bb .Log .Debug ("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064",len (_gfbe ),len (_aagf ),len (_dbgdd ));
|
||
return nil ;};_gfge :=make (map[int ]intSet ,len (_aagf )+len (_dbgdd ));for _ ,_dfdbf :=range _aagf {for _ ,_ffgba :=range _dbgdd {if _gfbe [_dfdbf ].intersects (_gfbe [_ffgba ]){if _ ,_deac :=_gfge [_dfdbf ];!_deac {_gfge [_dfdbf ]=make (intSet );};if _ ,_eeed :=_gfge [_ffgba ];
|
||
!_eeed {_gfge [_ffgba ]=make (intSet );};_gfge [_dfdbf ].add (_ffgba );_gfge [_ffgba ].add (_dfdbf );};};};return _gfge ;};func (_cabf *textPara )toTextMarks (_gbeg *int )[]TextMark {if _cabf ._gbgg ==nil {return _cabf .toCellTextMarks (_gbeg );};var _fdfg []TextMark ;
|
||
for _aede :=0;_aede < _cabf ._gbgg ._gaaa ;_aede ++{for _feee :=0;_feee < _cabf ._gbgg ._afaff ;_feee ++{_acab :=_cabf ._gbgg .get (_feee ,_aede );if _acab ==nil {_fdfg =_befb (_fdfg ,_gbeg ,"\u0009");}else {_fgcg :=_acab .toCellTextMarks (_gbeg );_fdfg =append (_fdfg ,_fgcg ...);
|
||
};_fdfg =_befb (_fdfg ,_gbeg ,"\u0020");};if _aede < _cabf ._gbgg ._gaaa -1{_fdfg =_befb (_fdfg ,_gbeg ,"\u000a");};};_abbag :=_cabf ._gbgg ;if _abbag .isExportable (){_decg :=_abbag .toTextTable ();_fdfg =_agfb (_fdfg ,&_decg );};return _fdfg ;};func _dgde (_fdfec ,_abd _bg .PdfRectangle )(_bg .PdfRectangle ,bool ){if !_gcfe (_fdfec ,_abd ){return _bg .PdfRectangle {},false ;
|
||
};return _bg .PdfRectangle {Llx :_ef .Max (_fdfec .Llx ,_abd .Llx ),Urx :_ef .Min (_fdfec .Urx ,_abd .Urx ),Lly :_ef .Max (_fdfec .Lly ,_abd .Lly ),Ury :_ef .Min (_fdfec .Ury ,_abd .Ury )},true ;};func _bagcde (_eecgc _bg .PdfColorspace ,_baca _bg .PdfColor )_g .Color {if _eecgc ==nil ||_baca ==nil {return _g .Black ;
|
||
};_acebb ,_febgd :=_eecgc .ColorToRGB (_baca );if _febgd !=nil {_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_baca ,_eecgc ,_febgd );
|
||
return _g .Black ;};_bafd ,_ffcbc :=_acebb .(*_bg .PdfColorDeviceRGB );if !_ffcbc {_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_acebb );
|
||
return _g .Black ;};return _g .NRGBA {R :uint8 (_bafd .R ()*255),G :uint8 (_bafd .G ()*255),B :uint8 (_bafd .B ()*255),A :uint8 (255)};};func (_ccgc rulingList )primaries ()[]float64 {_efebf :=make (map[float64 ]struct{},len (_ccgc ));for _ ,_ecfce :=range _ccgc {_efebf [_ecfce ._ecdge ]=struct{}{};
|
||
};_eagf :=make ([]float64 ,len (_efebf ));_aadge :=0;for _gfgeg :=range _efebf {_eagf [_aadge ]=_gfgeg ;_aadge ++;};_ff .Float64s (_eagf );return _eagf ;};func _eecbf (_cgcb []_ga .PdfObject )(_cgcg ,_cbca float64 ,_aedbf error ){if len (_cgcb )!=2{return 0,0,_eg .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_cgcb ));
|
||
};_accbgg ,_aedbf :=_ga .GetNumbersAsFloat (_cgcb );if _aedbf !=nil {return 0,0,_aedbf ;};return _accbgg [0],_accbgg [1],nil ;};func _dgcf (_cdcb byte )bool {for _ ,_geef :=range _fgea {if []byte (_geef )[0]==_cdcb {return true ;};};return false ;};
|
||
|
||
// Tables returns the tables extracted from the page.
|
||
func (_adb PageText )Tables ()[]TextTable {if _aeff {_bb .Log .Info ("\u0054\u0061\u0062\u006c\u0065\u0073\u003a\u0020\u0025\u0064",len (_adb ._ceaa ));};return _adb ._ceaa ;};func (_cdagf *textTable )getRight ()paraList {_effa :=make (paraList ,_cdagf ._gaaa );
|
||
for _cgged :=0;_cgged < _cdagf ._gaaa ;_cgged ++{_dffa :=_cdagf .get (_cdagf ._afaff -1,_cgged )._gdfgd ;if _dffa .taken (){return nil ;};_effa [_cgged ]=_dffa ;};for _efebg :=0;_efebg < _cdagf ._gaaa -1;_efebg ++{if _effa [_efebg ]._daag !=_effa [_efebg +1]{return nil ;
|
||
};};return _effa ;};func _aaaa (_cggca string )string {_ddbdd :=[]rune (_cggca );return string (_ddbdd [:len (_ddbdd )-1])};func (_aagaf *textPara )toCellTextMarks (_adec *int )[]TextMark {var _bgea []TextMark ;for _beae ,_efbf :=range _aagaf ._bbagc {_aefe :=_efbf .toTextMarks (_adec );
|
||
_bedbb :=_aefg &&_efbf .endsInHyphen ()&&_beae !=len (_aagaf ._bbagc )-1;if _bedbb {_aefe =_eaga (_aefe ,_adec );};_bgea =append (_bgea ,_aefe ...);if !(_bedbb ||_beae ==len (_aagaf ._bbagc )-1){_bgea =_befb (_bgea ,_adec ,_cegd (_efbf ._ffbb ,_aagaf ._bbagc [_beae +1]._ffbb ));
|
||
};};return _bgea ;};func _feae (_cggf *list ,_gfdg *_f .Builder ,_cagab *string ){_fgec :=_cggec (_cggf ,_cagab );_gfdg .WriteString (_fgec );for _ ,_egdd :=range _cggf ._bcdd {_gcbf :=*_cagab +"\u0020\u0020\u0020";_feae (_egdd ,_gfdg ,&_gcbf );};};
|
||
|
||
// String returns a string descibing `i`.
|
||
func (_fbccg gridTile )String ()string {_fgda :=func (_fcfdd bool ,_cfag string )string {if _fcfdd {return _cfag ;};return "\u005f";};return _eg .Sprintf ("\u00256\u002e2\u0066\u0020\u0025\u0031\u0073%\u0031\u0073%\u0031\u0073\u0025\u0031\u0073",_fbccg .PdfRectangle ,_fgda (_fbccg ._gafdd ,"\u004c"),_fgda (_fbccg ._deage ,"\u0052"),_fgda (_fbccg ._faba ,"\u0042"),_fgda (_fbccg ._fed ,"\u0054"));
|
||
};func (_bgdg *shapesState )newSubPath (){_bgdg .clearPath ();if _ggbc {_bb .Log .Info ("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073",_bgdg );};};func _fbag (_ecbf []*textMark ,_ebbad _bg .PdfRectangle )[]*textWord {var _cgdba []*textWord ;
|
||
var _acaaa *textWord ;if _abfa {_bb .Log .Info ("\u006d\u0061\u006beT\u0065\u0078\u0074\u0057\u006f\u0072\u0064\u0073\u003a\u0020\u0025\u0064\u0020\u006d\u0061\u0072\u006b\u0073",len (_ecbf ));};_cafc :=func (){if _acaaa !=nil {_debe :=_acaaa .computeText ();
|
||
if !_effeg (_debe ){_acaaa ._bdbgb =_debe ;_cgdba =append (_cgdba ,_acaaa );if _abfa {_bb .Log .Info ("\u0061\u0064\u0064Ne\u0077\u0057\u006f\u0072\u0064\u003a\u0020\u0025\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",len (_cgdba )-1,_acaaa .String ());
|
||
for _bfbage ,_agage :=range _acaaa ._dbff {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bfbage ,_agage .String ());};};};_acaaa =nil ;};};for _ ,_bcaf :=range _ecbf {if _egffa &&_acaaa !=nil &&len (_acaaa ._dbff )> 0{_bcacg :=_acaaa ._dbff [len (_acaaa ._dbff )-1];
|
||
_fffgb ,_bcfbb :=_eaefd (_bcaf ._gcgb );_ggbeee ,_ddaeg :=_eaefd (_bcacg ._gcgb );if _bcfbb &&!_ddaeg &&_bcacg .inDiacriticArea (_bcaf ){_acaaa .addDiacritic (_fffgb );continue ;};if _ddaeg &&!_bcfbb &&_bcaf .inDiacriticArea (_bcacg ){_acaaa ._dbff =_acaaa ._dbff [:len (_acaaa ._dbff )-1];
|
||
_acaaa .appendMark (_bcaf ,_ebbad );_acaaa .addDiacritic (_ggbeee );continue ;};};_aebdd :=_effeg (_bcaf ._gcgb );if _aebdd {_cafc ();continue ;};if _acaaa ==nil &&!_aebdd {_acaaa =_bfcdg ([]*textMark {_bcaf },_ebbad );continue ;};_febbd :=_acaaa ._beaeg ;
|
||
_ffab :=_ef .Abs (_dddd (_ebbad ,_bcaf )-_acaaa ._gdfbg )/_febbd ;_bbbda :=_dfea (_bcaf ,_acaaa )/_febbd ;if _bbbda >=_cfgg ||!(-_eced <=_bbbda &&_ffab <=_ebdf ){_cafc ();_acaaa =_bfcdg ([]*textMark {_bcaf },_ebbad );continue ;};_acaaa .appendMark (_bcaf ,_ebbad );
|
||
};_cafc ();return _cgdba ;};func _cfa (_befd ,_aea _bg .PdfRectangle )bool {return _befd .Lly <=_aea .Ury &&_aea .Lly <=_befd .Ury };func _ddfc (_dbda map[float64 ]map[float64 ]gridTile )[]float64 {_eabb :=make ([]float64 ,0,len (_dbda ));for _acead :=range _dbda {_eabb =append (_eabb ,_acead );
|
||
};_ff .Float64s (_eabb );_aecc :=len (_eabb );for _cdab :=0;_cdab < _aecc /2;_cdab ++{_eabb [_cdab ],_eabb [_aecc -1-_cdab ]=_eabb [_aecc -1-_cdab ],_eabb [_cdab ];};return _eabb ;};func (_befa *textTable )get (_defda ,_faad int )*textPara {return _befa ._feeba [_gbbce (_defda ,_faad )]};
|
||
func (_ecefd rulingList )toGrids ()[]rulingList {if _bffg {_bb .Log .Info ("t\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0073",_ecefd );};_dbee :=_ecefd .intersections ();if _bffg {_bb .Log .Info ("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020",len (_ecefd ),len (_dbee ));
|
||
for _ ,_abce :=range _dbcgg (_dbee ){_eg .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_abce ,_dbee [_abce ]);};};_faga :=make (map[int ]intSet ,len (_ecefd ));for _ccba :=range _ecefd {_aegc :=_ecefd .connections (_dbee ,_ccba );if len (_aegc )> 0{_faga [_ccba ]=_aegc ;
|
||
};};if _bffg {_bb .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064",len (_faga ));for _ ,_beece :=range _dbcgg (_faga ){_eg .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_beece ,_faga [_beece ]);
|
||
};};_agde :=_eaeab (len (_ecefd ),func (_cbge ,_ggab int )bool {_dgeff ,_gada :=len (_faga [_cbge ]),len (_faga [_ggab ]);if _dgeff !=_gada {return _dgeff > _gada ;};return _ecefd .comp (_cbge ,_ggab );});if _bffg {_bb .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076",_agde );
|
||
};_dcfg :=[][]int {{_agde [0]}};_edbgg :for _ ,_dcca :=range _agde [1:]{for _eacdb ,_beeaa :=range _dcfg {for _ ,_fddd :=range _beeaa {if _faga [_fddd ].has (_dcca ){_dcfg [_eacdb ]=append (_beeaa ,_dcca );continue _edbgg ;};};};_dcfg =append (_dcfg ,[]int {_dcca });
|
||
};if _bffg {_bb .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076",_dcfg );};_ff .SliceStable (_dcfg ,func (_fccb ,_gfaa int )bool {return len (_dcfg [_fccb ])> len (_dcfg [_gfaa ])});for _ ,_fcgg :=range _dcfg {_ff .Slice (_fcgg ,func (_ebddg ,_eacf int )bool {return _ecefd .comp (_fcgg [_ebddg ],_fcgg [_eacf ])});
|
||
};_adfgc :=make ([]rulingList ,len (_dcfg ));for _fbcc ,_ddde :=range _dcfg {_fgcc :=make (rulingList ,len (_ddde ));for _bcaac ,_ddcg :=range _ddde {_fgcc [_bcaac ]=_ecefd [_ddcg ];};_adfgc [_fbcc ]=_fgcc ;};if _bffg {_bb .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020g\u0072\u0069d\u0073\u003d\u0025\u002b\u0076",_adfgc );
|
||
};var _efcd []rulingList ;for _ ,_afdee :=range _adfgc {if _fdff ,_fbaa :=_afdee .isActualGrid ();_fbaa {_afdee =_fdff ;_afdee =_afdee .snapToGroups ();_efcd =append (_efcd ,_afdee );};};if _bffg {_ddagf ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0061\u0063t\u0075\u0061\u006c\u0047ri\u0064\u0073",_efcd );
|
||
_bb .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064",len (_adfgc ),len (_efcd ));};return _efcd ;};func (_bdagg *structTreeRoot )parseStructTreeRoot (_gfed _ga .PdfObject ){if _gfed !=nil {_bdbe ,_bddcd :=_ga .GetDict (_gfed );
|
||
if !_bddcd {_bb .Log .Debug ("\u0070\u0061\u0072s\u0065\u0053\u0074\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u003a\u0020\u0064\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006eo\u0074\u0020\u0066\u006f\u0075\u006e\u0064\u002e");
|
||
};K :=_bdbe .Get ("\u004b");_fgdf :=_bdbe .Get ("\u0054\u0079\u0070\u0065").String ();var _adce *_ga .PdfObjectArray ;switch _bffd :=K .(type ){case *_ga .PdfObjectArray :_adce =_bffd ;case *_ga .PdfObjectReference :_adce =_ga .MakeArray (K );};_ebfb :=[]structElement {};
|
||
for _ ,_egdbf :=range _adce .Elements (){_gedd :=&structElement {};_gedd .parseStructElement (_egdbf );_ebfb =append (_ebfb ,*_gedd );};_bdagg ._aaef =_ebfb ;_bdagg ._cfbff =_fgdf ;};};type lineRuling struct{_fcfg rulingKind ;_bagea markKind ;_g .Color ;
|
||
_gabf ,_fccf _cb .Point ;};
|
||
|
||
// NewWithOptions returns an Extractor instance for extracting content from the input PDF page with options.
|
||
// NewWithOptions builds an Extractor for `page` configured by `options`.
// An error from page.GetAllContentStreams is returned unchanged; an error from
// page.GetMediaBox is wrapped as "extractor requires mediaBox. %v". A missing struct
// tree root is only logged. A MediaBox with Llx > Urx or Lly > Ury is logged and has the
// offending pair swapped before the Extractor is returned.
func NewWithOptions (page *_bg .PdfPage ,options *Options )(*Extractor ,error ){const _ac ="\u0065x\u0074\u0072\u0061\u0063\u0074\u006f\u0072\u002e\u004e\u0065\u0077W\u0069\u0074\u0068\u004f\u0070\u0074\u0069\u006f\u006e\u0073";_gae ,_df :=page .GetAllContentStreams ();
|
||
if _df !=nil {return nil ,_df ;};_bbc ,_gdg :=page .GetStructTreeRoot ();if !_gdg {_bb .Log .Info ("T\u0068\u0065\u0020\u0070\u0064\u0066\u0020\u0064\u006f\u0063\u0075\u006d\u0065\u006e\u0074\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020\u0074\u0061\u0067g\u0065d\u002e\u0020\u0053\u0074r\u0075\u0063t\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e\u0027\u0074\u0020\u0065\u0078\u0069\u0073\u0074\u002e");
|
||
};_cdg :=page .GetContainingPdfObject ();_fab ,_df :=page .GetMediaBox ();if _df !=nil {return nil ,_eg .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_df );
|
||
// The Extractor starts with empty `_cf` and `_ce` maps; the X-reversal check follows.
};_bc :=&Extractor {_cd :_gae ,_ee :page .Resources ,_ea :*_fab ,_gc :page .CropBox ,_cf :map[string ]fontEntry {},_ce :map[string ]textResult {},_ab :options ,_ed :_bbc ,_eee :_cdg };if _bc ._ea .Llx > _bc ._ea .Urx {_bb .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_bc ._ea );
|
||
_bc ._ea .Llx ,_bc ._ea .Urx =_bc ._ea .Urx ,_bc ._ea .Llx ;};if _bc ._ea .Lly > _bc ._ea .Ury {_bb .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_bc ._ea );
|
||
// The next line finishes NewWithOptions (Y swap, _ge.TrackUse, return) and then defines
// textLine.toTextMarks: for each element of `_ggdd`, a " " mark is added through _befb when
// the element's `_bbdga` flag is set, followed by the element's own marks. `_fcbd` is
// forwarded unchanged to _befb and to each element's toTextMarks.
_bc ._ea .Lly ,_bc ._ea .Ury =_bc ._ea .Ury ,_bc ._ea .Lly ;};_ge .TrackUse (_ac );return _bc ,nil ;};func (_acba *textLine )toTextMarks (_fcbd *int )[]TextMark {var _gdgf []TextMark ;for _ ,_ddead :=range _acba ._ggdd {if _ddead ._bbdga {_gdgf =_befb (_gdgf ,_fcbd ,"\u0020");
|
||
// The next line ends toTextMarks, declares textTable (an embedded PdfRectangle, two ints, a
// bool, and two uint64-keyed maps: `_feeba` of *textPara and `_bgag` of compositeCell), and
// begins shapesState.fill, which appends to `*_ffg` a pathSection built from the current
// subpaths `_bfeb` and the colour returned by `_fdc.getFillColor()`.
};_ccgf :=_ddead .toTextMarks (_fcbd );_gdgf =append (_gdgf ,_ccgf ...);};return _gdgf ;};type textTable struct{_bg .PdfRectangle ;_afaff ,_gaaa int ;_cfabb bool ;_feeba map[uint64 ]*textPara ;_bgag map[uint64 ]compositeCell ;};func (_eecd *shapesState )fill (_ffg *[]pathSection ){_dccf :=pathSection {_aadg :_eecd ._bfeb ,Color :_eecd ._fdc .getFillColor ()};
|
||
// Debug output is printed only when `_bffg` is set; the per-subpath dump additionally
// requires `_dcgf` and stops after index 10.
*_ffg =append (*_ffg ,_dccf );if _bffg {_ceb :=_dccf .bbox ();_eg .Printf ("\u0020 \u0020\u0020\u0046\u0049\u004c\u004c\u003a %\u0032\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006ee\u0077\u0029 \u0073\u0073\u003d%\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d\u0025\u0033\u0076\u0020\u0025\u0036\u002e\u0032f\u003d\u00256.\u0032\u0066\u0078%\u0036\u002e\u0032\u0066\u000a",len (*_ffg ),len (_dccf ._aadg ),_eecd ,_dccf .Color ,_ceb ,_ceb .Width (),_ceb .Height ());
|
||
// `_dcgd` (starts on the next line after fill closes) concatenates every rulingList in
// `_fccbf` and returns vertsHorzs() of the combined list.
if _dcgf {for _fdb ,_efad :=range _dccf ._aadg {_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_fdb ,_efad );if _fdb ==10{break ;};};};};};func _dcgd (_fccbf []rulingList )(rulingList ,rulingList ){var _dbcd rulingList ;for _ ,_bfaa :=range _fccbf {_dbcd =append (_dbcd ,_bfaa ...);
|
||
// paraList.inTile returns the paras whose PdfRectangle satisfies `_gdfba.contains`; the
// Printf/Println calls run only when `_aeff` is set.
};return _dbcd .vertsHorzs ();};func (_bfbdb paraList )inTile (_gdfba gridTile )paraList {var _gfggg paraList ;for _ ,_abaceb :=range _bfbdb {if _gdfba .contains (_abaceb .PdfRectangle ){_gfggg =append (_gfggg ,_abaceb );};};if _aeff {_eg .Printf ("\u0020 \u0020\u0069\u006e\u0054i\u006c\u0065\u003a\u0020\u0020%\u0073 \u0069n\u0073\u0069\u0064\u0065\u003d\u0025\u0064\n",_gdfba ,len (_gfggg ));
|
||
// textTable.putComposite (starts on the next line) logs an error and returns without
// storing anything when `_bfbadg` is empty.
for _bdgef ,_effeb :=range _gfggg {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bdgef ,_effeb );};_eg .Println ("");};return _gfggg ;};func (_dgda *textTable )putComposite (_dgeea ,_efdae int ,_bfbadg paraList ,_ecdc _bg .PdfRectangle ){if len (_bfbadg )==0{_bb .Log .Error ("\u0074\u0065xt\u0054\u0061\u0062l\u0065\u0029\u0020\u0070utC\u006fmp\u006f\u0073\u0069\u0074\u0065\u003a\u0020em\u0070\u0074\u0079\u0020\u0070\u0061\u0072a\u0073");
|
||
// Otherwise the paras and `_ecdc` are wrapped in a compositeCell.
return ;};_cebg :=compositeCell {PdfRectangle :_ecdc ,paraList :_bfbadg };if _aeff {_eg .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0070\u0075\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u003c\u002d\u0025\u0073\u000a",_dgeea ,_efdae ,_cebg .String ());
|
||
// The cell's updateBBox is called and the cell is stored in `_bgag` under `_gbbce(_dgeea, _efdae)`.
// NOTE(review): a write to a nil `_bgag` would panic — confirm every textTable constructor allocates it.
// `_edcb` maps a rune to a single-rune string; every value lies in U+0300..U+0359.
};_cebg .updateBBox ();_dgda ._bgag [_gbbce (_dgeea ,_efdae )]=_cebg ;};var (_edcb =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};
|
||
// rulingList.secMinMax returns the smallest `_bafb` and the largest `_cbdc` over the list.
// Element 0 is read unconditionally, so an empty list panics.
);func (_edea rulingList )secMinMax ()(float64 ,float64 ){_ffcg ,_bgdd :=_edea [0]._bafb ,_edea [0]._cbdc ;for _ ,_egda :=range _edea [1:]{if _egda ._bafb < _ffcg {_ffcg =_egda ._bafb ;};if _egda ._cbdc > _bgdd {_bgdd =_egda ._cbdc ;};};return _ffcg ,_bgdd ;
|
||
// textPara.getListLines collects the lines of `_bbagc` whose `_ggdd[0]._bdbgb[0]` satisfies
// `_dgcf`, then appends the result of `_dfcgg(_bbagc)` (computed before the loop).
// Both `[0]` index expressions panic if the indexed value is empty.
};func (_ceec *textPara )getListLines ()[]*textLine {var _feaa []*textLine ;_bcee :=_dfcgg (_ceec ._bbagc );for _ ,_ffgfb :=range _ceec ._bbagc {_cafe :=_ffgfb ._ggdd [0]._bdbgb [0];if _dgcf (_cafe ){_feaa =append (_feaa ,_ffgfb );};};_feaa =append (_feaa ,_bcee ...);
|
||
return _feaa ;};
|
||
|
||
// Elements returns the TextMarks in `ma`.
|
||
func (_afcb *TextMarkArray )Elements ()[]TextMark {return _afcb ._dafa };
|
||
|
||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
||
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
||
func (_gac *Extractor )ExtractTextWithStats ()(_edb string ,_aaag int ,_eacb int ,_cedg error ){_cbga ,_aaag ,_eacb ,_cedg :=_gac .ExtractPageText ();if _cedg !=nil {return "",_aaag ,_eacb ,_cedg ;};return _cbga .Text (),_aaag ,_eacb ,nil ;};func (_dfee *textObject )showText (_bgef _ga .PdfObject ,_abad []byte ,_bfa int )error {return _dfee .renderText (_bgef ,_abad ,_bfa );
|
||
};type wordBag struct{_bg .PdfRectangle ;_ddgcf float64 ;_ecgb ,_bfadd rulingList ;_gcdf float64 ;_dceb map[int ][]*textWord ;};type rectRuling struct{_fbgd rulingKind ;_edad markKind ;_g .Color ;_bg .PdfRectangle ;};func (_aaaf *TextMarkArray )getTextMarkAtOffset (_eegb int )*TextMark {for _ ,_fgac :=range _aaaf ._dafa {if _fgac .Offset ==_eegb {return &_fgac ;
|
||
};};return nil ;};func (_ddgc *shapesState )lastpointEstablished ()(_cb .Point ,bool ){if _ddgc ._abac {return _ddgc ._bfgg ,false ;};_ggg :=len (_ddgc ._bfeb );if _ggg > 0&&_ddgc ._bfeb [_ggg -1]._ffff {return _ddgc ._bfeb [_ggg -1].last (),false ;};return _cb .Point {},true ;
|
||
};func (_gaeb *textPara )depth ()float64 {if _gaeb ._fcag {return -1.0;};if len (_gaeb ._bbagc )> 0{return _gaeb ._bbagc [0]._ffbb ;};return _gaeb ._gbgg .depth ();};type gridTile struct{_bg .PdfRectangle ;_fed ,_gafdd ,_faba ,_deage bool ;};func (_bfae *stateStack )size ()int {return len (*_bfae )};
|
||
func (_bdcg *textTable )compositeRowCorridors ()map[int ][]float64 {_fcgd :=make (map[int ][]float64 ,_bdcg ._gaaa );if _aeff {_bb .Log .Info ("c\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0052\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073:\u0020h\u003d\u0025\u0064",_bdcg ._gaaa );
|
||
};for _aaafa :=1;_aaafa < _bdcg ._gaaa ;_aaafa ++{var _acfe []compositeCell ;for _cegda :=0;_cegda < _bdcg ._afaff ;_cegda ++{if _cbcg ,_gbcb :=_bdcg ._bgag [_gbbce (_cegda ,_aaafa )];_gbcb {_acfe =append (_acfe ,_cbcg );};};if len (_acfe )==0{continue ;
|
||
};_fgbf :=_eeffbf (_acfe );_fcgd [_aaafa ]=_fgbf ;if _aeff {_eg .Printf ("\u0020\u0020\u0020\u0025\u0032\u0064\u003a\u0020\u00256\u002e\u0032\u0066\u000a",_aaafa ,_fgbf );};};return _fcgd ;};func (_dcgba gridTile )numBorders ()int {_gbbgb :=0;if _dcgba ._gafdd {_gbbgb ++;
|
||
};if _dcgba ._deage {_gbbgb ++;};if _dcgba ._faba {_gbbgb ++;};if _dcgba ._fed {_gbbgb ++;};return _gbbgb ;};func (_bba *wordBag )depthIndexes ()[]int {if len (_bba ._dceb )==0{return nil ;};_ggcg :=make ([]int ,len (_bba ._dceb ));_dfcf :=0;for _fbg :=range _bba ._dceb {_ggcg [_dfcf ]=_fbg ;
|
||
_dfcf ++;};_ff .Ints (_ggcg );return _ggcg ;};func _fbgg (_becc _bg .PdfRectangle )*ruling {return &ruling {_ffdb :_fcec ,_ecdge :_becc .Ury ,_bafb :_becc .Llx ,_cbdc :_becc .Urx };};func (_bfca paraList )toTextMarks ()[]TextMark {_ddbb :=0;var _ddga []TextMark ;
|
||
for _cegbf ,_ggba :=range _bfca {if _ggba ._fcag {continue ;};_agafc :=_ggba .toTextMarks (&_ddbb );_ddga =append (_ddga ,_agafc ...);if _cegbf !=len (_bfca )-1{if _dgee (_ggba ,_bfca [_cegbf +1]){_ddga =_befb (_ddga ,&_ddbb ,"\u0020");}else {_ddga =_befb (_ddga ,&_ddbb ,"\u000a");
|
||
_ddga =_befb (_ddga ,&_ddbb ,"\u000a");};};};_ddga =_befb (_ddga ,&_ddbb ,"\u000a");_ddga =_befb (_ddga ,&_ddbb ,"\u000a");return _ddga ;};func (_dfcc *textObject )getFont (_cee string )(*_bg .PdfFont ,error ){if _dfcc ._afb ._cf !=nil {_edg ,_abfg :=_dfcc .getFontDict (_cee );
|
||
if _abfg !=nil {_bb .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0067\u0065\u0074\u0046\u006f\u006e\u0074:\u0020n\u0061m\u0065=\u0025\u0073\u002c\u0020\u0065\u0072\u0072\u006f\u0072\u003a\u0020\u0025\u0073",_cee ,_abfg .Error ());return nil ,_abfg ;
|
||
};_dfcc ._afb ._ec ++;_egfa ,_gda :=_dfcc ._afb ._cf [_edg .String ()];if _gda {_egfa ._cacb =_dfcc ._afb ._ec ;return _egfa ._fefa ,nil ;};};_ecgf ,_bfaec :=_dfcc .getFontDict (_cee );if _bfaec !=nil {return nil ,_bfaec ;};_ddgb ,_bfaec :=_dfcc .getFontDirect (_cee );
|
||
if _bfaec !=nil {return nil ,_bfaec ;};if _dfcc ._afb ._cf !=nil {_ccec :=fontEntry {_ddgb ,_dfcc ._afb ._ec };if len (_dfcc ._afb ._cf )>=_dfcgb {var _deff []string ;for _abgd :=range _dfcc ._afb ._cf {_deff =append (_deff ,_abgd );};_ff .Slice (_deff ,func (_gadg ,_bfeag int )bool {return _dfcc ._afb ._cf [_deff [_gadg ]]._cacb < _dfcc ._afb ._cf [_deff [_bfeag ]]._cacb ;
|
||
});delete (_dfcc ._afb ._cf ,_deff [0]);};_dfcc ._afb ._cf [_ecgf .String ()]=_ccec ;};return _ddgb ,nil ;};func (_fgbga *wordBag )firstWord (_fdaf int )*textWord {return _fgbga ._dceb [_fdaf ][0]};type structElement struct{_ddag string ;_acdgb []structElement ;
|
||
_dfdgf int64 ;_gcbe _ga .PdfObject ;};func (_ccfdc *wordBag )removeDuplicates (){if _bcce {_bb .Log .Info ("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071",_ccfdc .text ());};for _ ,_eabe :=range _ccfdc .depthIndexes (){if len (_ccfdc ._dceb [_eabe ])==0{continue ;
|
||
};_gdce :=_ccfdc ._dceb [_eabe ][0];_eggc :=_bbff *_gdce ._beaeg ;_cbef :=_gdce ._gdfbg ;for _ ,_ceeg :=range _ccfdc .depthBand (_cbef ,_cbef +_eggc ){_beff :=map[*textWord ]struct{}{};_cfea :=_ccfdc ._dceb [_ceeg ];for _ ,_eefb :=range _cfea {if _ ,_cbed :=_beff [_eefb ];
|
||
_cbed {continue ;};for _ ,_fbdd :=range _cfea {if _ ,_gcfb :=_beff [_fbdd ];_gcfb {continue ;};if _fbdd !=_eefb &&_fbdd ._bdbgb ==_eefb ._bdbgb &&_ef .Abs (_fbdd .Llx -_eefb .Llx )< _eggc &&_ef .Abs (_fbdd .Urx -_eefb .Urx )< _eggc &&_ef .Abs (_fbdd .Lly -_eefb .Lly )< _eggc &&_ef .Abs (_fbdd .Ury -_eefb .Ury )< _eggc {_beff [_fbdd ]=struct{}{};
|
||
};};};if len (_beff )> 0{_ebfc :=0;for _ ,_aegb :=range _cfea {if _ ,_ebdfa :=_beff [_aegb ];!_ebdfa {_cfea [_ebfc ]=_aegb ;_ebfc ++;};};_ccfdc ._dceb [_ceeg ]=_cfea [:len (_cfea )-len (_beff )];if len (_ccfdc ._dceb [_ceeg ])==0{delete (_ccfdc ._dceb ,_ceeg );
|
||
};};};};};func (_dgcc *textLine )bbox ()_bg .PdfRectangle {return _dgcc .PdfRectangle };
|
||
|
||
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
||
// Marks wraps the receiver's `_baeg` slice in a new TextMarkArray (the slice is shared, not copied).
// wordBag.arrangeText (starts on the same line) sorts the bag, removes duplicates when `_aceb`
// is set, and then, for every depth index, builds lines until that index is empty; each line
// is seeded from the bin returned by firstReadingIndex.
func (_egff PageText )Marks ()*TextMarkArray {return &TextMarkArray {_dafa :_egff ._baeg }};func (_dddda *wordBag )arrangeText ()*textPara {_dddda .sort ();if _aceb {_dddda .removeDuplicates ();};var _gcda []*textLine ;for _ ,_fage :=range _dddda .depthIndexes (){for !_dddda .empty (_fage ){_ceda :=_dddda .firstReadingIndex (_fage );
|
||
// `_bfbd` creates the line for the seed bin. The depth window is the seed word's
// `_gdfbg` +/- `_abea`*`_beaeg`; the gap limits are `_ffcdf`*`_beaeg` and `_gfbc`*`_beaeg`.
_adgb :=_dddda .firstWord (_ceda );_ebgc :=_bfbd (_dddda ,_ceda );_becb :=_adgb ._beaeg ;_defa :=_adgb ._gdfbg -_abea *_becb ;_gfbcg :=_adgb ._gdfbg +_abea *_becb ;_bdddb :=_ffcdf *_becb ;_adafc :=_gfbc *_becb ;_gdgfb :for {var _cgab *textWord ;_cdbcg :=0;
|
||
// For each bin in the window take highestWord and measure `_dfea` against the line's last
// word: below -`_adafc` ends the whole line (break _gdgfb); above `_bdddb` skips the candidate.
for _ ,_cbefd :=range _dddda .depthBand (_defa ,_gfbcg ){_bega :=_dddda .highestWord (_cbefd ,_defa ,_gfbcg );if _bega ==nil {continue ;};_fdgc :=_dfea (_bega ,_ebgc ._ggdd [len (_ebgc ._ggdd )-1]);if _fdgc < -_adafc {break _gdgfb ;};if _fdgc > _bdddb {continue ;
|
||
// A candidate replaces the current pick only when `_gffg(candidate, pick)` is negative.
// The pick is moved into the line with pullWord; the inner loop ends when nothing is picked.
};if _cgab !=nil &&_gffg (_bega ,_cgab )>=0{continue ;};_cgab =_bega ;_cdbcg =_cbefd ;};if _cgab ==nil {break ;};_ebgc .pullWord (_dddda ,_cgab ,_cdbcg );};_ebgc .markWordBoundaries ();_gcda =append (_gcda ,_ebgc );};};if len (_gcda )==0{return nil ;};
|
||
// Lines are ordered with `_bacd` and wrapped, together with the bag's rectangle, by `_efce`.
// Everything up to `return _cgef` is debug output gated by `_cdgd`, `_aabd` and `_gdfe`.
_ff .Slice (_gcda ,func (_egcc ,_ggcbe int )bool {return _bacd (_gcda [_egcc ],_gcda [_ggcbe ])< 0});_cgef :=_efce (_dddda .PdfRectangle ,_gcda );if _cdgd {_bb .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_cgef .String ());
|
||
if _aabd {for _geac ,_bacc :=range _cgef ._bbagc {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_geac ,_bacc .String ());if _gdfe {for _abbf ,_agbf :=range _bacc ._ggdd {_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_abbf ,_agbf .String ());
|
||
// subpath.last returns the final point of `_egfb` (panics when it is empty).
// textObject.getFillColor passes the non-stroking colorspace and colour of `_fgd` to `_bagcde`.
for _degc ,_badc :=range _agbf ._dbff {_eg .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_degc ,_badc .String ());};};};};};};return _cgef ;};func (_aeda *subpath )last ()_cb .Point {return _aeda ._egfb [len (_aeda ._egfb )-1]};func (_baf *textObject )getFillColor ()_g .Color {return _bagcde (_baf ._fgd .ColorspaceNonStroking ,_baf ._fgd .ColorNonStroking );
|
||
// `_eaaad` reports whether `_cdcc` has at least `_adea` runes, its last rune is in
// unicode.Hyphen, and the rune before that decodes successfully and is not a space.
};func _eaaad (_cdcc string )bool {if _ba .RuneCountInString (_cdcc )< _adea {return false ;};_gab ,_dace :=_ba .DecodeLastRuneInString (_cdcc );if _dace <=0||!_e .Is (_e .Hyphen ,_gab ){return false ;};_gab ,_dace =_ba .DecodeLastRuneInString (_cdcc [:len (_cdcc )-_dace ]);
|
||
// `_fbcdb` (starts on the next line) allocates an RGBA image of the source image's
// Width x Height and visits every pixel row by row.
return _dace > 0&&!_e .IsSpace (_gab );};func _fbcdb (_caacb *_bg .Image ,_aegf _g .Color )_be .Image {_dbcda ,_ddbbbd :=int (_caacb .Width ),int (_caacb .Height );_afeda :=_be .NewRGBA (_be .Rect (0,0,_dbcda ,_ddbbbd ));for _fecfgf :=0;_fecfgf < _ddbbbd ;
|
||
_fecfgf ++{for _gfgf :=0;_gfgf < _dbcda ;_gfgf ++{_dgbca ,_befcd :=_caacb .ColorAt (_gfgf ,_fecfgf );if _befcd !=nil {_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e",_gfgf ,_fecfgf );
|
||
// Pixels whose ColorAt failed are logged and not written. Pixels with R+G+B == 0 are set to
// color.Transparent, all others to `_aegf`.
// textObject.setTextRenderMode is a no-op on a nil receiver.
continue ;};_cfgfa ,_febad ,_ceegc ,_ :=_dgbca .RGBA ();var _dfeb _g .Color ;if _cfgfa +_febad +_ceegc ==0{_dfeb =_g .Transparent ;}else {_dfeb =_aegf ;};_afeda .Set (_gfgf ,_fecfgf ,_dfeb );};};return _afeda ;};func (_fea *textObject )setTextRenderMode (_adac int ){if _fea ==nil {return ;
|
||
// Otherwise it stores RenderMode(`_adac`) in `_aed._agbg`.
// imageExtractContext.extractInlineImage converts the inline image with ToImage and returns
// any conversion error unchanged.
};_fea ._aed ._agbg =RenderMode (_adac );};func (_agb *imageExtractContext )extractInlineImage (_eda *_de .ContentStreamInlineImage ,_fcg _de .GraphicsState ,_fgb *_bg .PdfPageResources )error {_gb ,_ece :=_eda .ToImage (_fgb );if _ece !=nil {return _ece ;
|
||
// A nil colorspace falls back to DeviceGray. The RGB image is recorded as an ImageMark whose
// Width/Height/Angle come from the CTM scaling factors and angle.
};_ecce ,_ece :=_eda .GetColorSpace (_fgb );if _ece !=nil {return _ece ;};if _ecce ==nil {_ecce =_bg .NewPdfColorspaceDeviceGray ();};_faff ,_ece :=_ecce .ImageToRGB (*_gb );if _ece !=nil {return _ece ;};_fb :=ImageMark {Image :&_faff ,Width :_fcg .CTM .ScalingFactorX (),Height :_fcg .CTM .ScalingFactorY (),Angle :_fcg .CTM .Angle ()};
|
||
// X and Y come from the CTM translation; the mark is appended to `_cc` and `_fafc` is incremented.
// `_ccdb` walks the children `_bcdd` and returns from the first child whose `_cfcfa` is
// "LBody", "Span" or "InlineShape"; other children are skipped.
_fb .X ,_fb .Y =_fcg .CTM .Translation ();_agb ._cc =append (_agb ._cc ,_fb );_agb ._fafc ++;return nil ;};func _ccdb (_dbcca *list )[]*textLine {for _ ,_accbg :=range _dbcca ._bcdd {switch _accbg ._cfcfa {case "\u004c\u0042\u006fd\u0079":if len (_accbg ._dgef )!=0{return _accbg ._dgef ;
|
||
// An "LBody" child with no lines of its own is searched recursively; nil is returned when no
// child matches.
// PageText.computeViews starts on the next line by fetching the paragraphs.
};return _ccdb (_accbg );case "\u0053\u0070\u0061\u006e":return _accbg ._dgef ;case "I\u006e\u006c\u0069\u006e\u0065\u0053\u0068\u0061\u0070\u0065":return _accbg ._dgef ;};};return nil ;};func (_adaa *PageText )computeViews (){_eega :=_adaa .getParagraphs ();
|
||
// It stores the written text in `_bffc`, the marks in `_baeg` and the tables in `_ceaa`.
_dcce :=new (_db .Buffer );_eega .writeText (_dcce );_adaa ._bffc =_dcce .String ();_adaa ._baeg =_eega .toTextMarks ();_adaa ._ceaa =_eega .tables ();if _aeff {_bb .Log .Info ("\u0063\u006f\u006dpu\u0074\u0065\u0056\u0069\u0065\u0077\u0073\u003a\u0020\u0074\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064",len (_adaa ._ceaa ));
|
||
// `_efce` builds a textPara from a rectangle and lines. textTable.put stores a para in
// `_feeba` under `_gbbce(x, y)`. textTable.compositeColCorridors returns a map that holds a
// nil slice for every index in [0, `_afaff`).
};};func _efce (_afab _bg .PdfRectangle ,_ggef []*textLine )*textPara {return &textPara {PdfRectangle :_afab ,_bbagc :_ggef };};func (_dccfa *textTable )put (_fbfb ,_eccc int ,_deaa *textPara ){_dccfa ._feeba [_gbbce (_fbfb ,_eccc )]=_deaa ;};func (_adgg *textTable )compositeColCorridors ()map[int ][]float64 {_dbbcf :=make (map[int ][]float64 ,_adgg ._afaff );
|
||
if _aeff {_bb .Log .Info ("\u0063\u006f\u006d\u0070o\u0073\u0069\u0074\u0065\u0043\u006f\u006c\u0043\u006f\u0072r\u0069d\u006f\u0072\u0073\u003a\u0020\u0077\u003d%\u0064\u0020",_adgg ._afaff );};for _edbe :=0;_edbe < _adgg ._afaff ;_edbe ++{_dbbcf [_edbe ]=nil ;
|
||
// textObject.setCharSpacing is a no-op on a nil receiver; otherwise it stores `_cdd` in
// `_aed._edf` and logs when `_gbec` is set.
};return _dbbcf ;};func (_cbff *textObject )setCharSpacing (_cdd float64 ){if _cbff ==nil {return ;};_cbff ._aed ._edf =_cdd ;if _gbec {_bb .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_cdd ,_cbff ._aed .String ());
|
||
// rulingList.vertsHorzs splits the list by `_ffdb`: kind `_gfbd` goes to the first result,
// kind `_fcec` to the second; rulings of any other kind are dropped.
};};func (_fcee rulingList )vertsHorzs ()(rulingList ,rulingList ){var _cdgac ,_gbce rulingList ;for _ ,_fdaggf :=range _fcee {switch _fdaggf ._ffdb {case _gfbd :_cdgac =append (_cdgac ,_fdaggf );case _fcec :_gbce =append (_gbce ,_fdaggf );};};return _cdgac ,_gbce ;
|
||
};
|
||
|
||
// String returns a string describing the current state of the textState stack.
|
||
// String renders the stack as a header line containing the stack length followed by one
// tab-indented "index: entry" line per entry ("<nil>" for nil entries), joined with "\n".
func (_ddbf *stateStack )String ()string {_bfcb :=[]string {_eg .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_ddbf ))};for _dfc ,_agac :=range *_ddbf {_cgcc :="\u003c\u006e\u0069l\u003e";
|
||
// rulingList.isActualGrid (starts on the next line) first obtains two ruling lists from
// augmentGrid.
if _agac !=nil {_cgcc =_agac .String ();};_bfcb =append (_bfcb ,_eg .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_dfc ,_cgcc ));};return _f .Join (_bfcb ,"\u000a");};func (_gbef rulingList )isActualGrid ()(rulingList ,bool ){_fefcd ,_ffcgf :=_gbef .augmentGrid ();
|
||
// It fails with (nil, false) unless the first list has at least `_geda`+1 rulings and the
// second at least `_efdde`+1.
if !(len (_fefcd )>=_geda +1&&len (_ffcgf )>=_efdde +1){if _bffg {_bb .Log .Info ("\u0069s\u0041\u0063t\u0075\u0061\u006c\u0047r\u0069\u0064\u003a \u004e\u006f\u0074\u0020\u0061\u006c\u0069\u0067\u006eed\u002e\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u003c\u0020\u0025d\u0020\u0078 \u0025\u0064",len (_fefcd ),len (_ffcgf ),_geda +1,_efdde +1);
|
||
// NOTE(review): the debug message below prints comparisons against a literal 2 rather than
// `_geda`+1 / `_efdde`+1; it only affects logging.
};return nil ,false ;};if _bffg {_bb .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075a\u006c\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0073\u0020\u003a\u0020\u0025t\u0020\u0026\u0020\u0025\u0074\u0020\u2192 \u0025\u0074",_gbef ,len (_fefcd )>=2,len (_ffcgf )>=2,len (_fefcd )>=2&&len (_ffcgf )>=2);
|
||
// When `_bdec` is set, four `_cebc` checks compare the `_ecdge` of the first/last ruling of
// each list with the `_bafb`/`_cbdc` of the first ruling of the other list; any failure
// returns (nil, false).
for _cccfd ,_bdge :=range _gbef {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a",_cccfd ,_bdge );};};if _bdec {_fbgc ,_dbcgc :=_fefcd [0],_fefcd [len (_fefcd )-1];_bdab ,_abcf :=_ffcgf [0],_ffcgf [len (_ffcgf )-1];if !(_cebc (_fbgc ._ecdge -_bdab ._bafb )&&_cebc (_dbcgc ._ecdge -_bdab ._cbdc )&&_cebc (_bdab ._ecdge -_fbgc ._cbdc )&&_cebc (_abcf ._ecdge -_fbgc ._bafb )){if _bffg {_bb .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069d\u003a\u0020\u0020N\u006f\u0074 \u0061\u006c\u0069\u0067\u006e\u0065d\u002e\n\t\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0076\u0031\u003d\u0025\u0073\u000a\u0009\u0068\u0030\u003d\u0025\u0073\u000a\u0009\u0068\u0031\u003d\u0025\u0073",_fbgc ,_dbcgc ,_bdab ,_abcf );
|
||
// Otherwise each list must report aligned(). Note the first log below is gated by `_cfec`
// while the other logs in this function use `_bffg`.
};return nil ,false ;};}else {if !_fefcd .aligned (){if _cfec {_bb .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0076\u0065\u0072\u0074\u0073\u002e\u0020%\u0064",len (_fefcd ));
|
||
};return nil ,false ;};if !_ffcgf .aligned (){if _bffg {_bb .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0068\u006f\u0072\u007a\u0073\u002e\u0020%\u0064",len (_ffcgf ));
|
||
// On success the first list followed by the second is returned with true.
// `_agfb` returns a copy of the marks with `_cda` set to true and `_gfc` set to `_fgfe`;
// the input slice's elements are not modified.
};return nil ,false ;};};_eaea :=append (_fefcd ,_ffcgf ...);return _eaea ,true ;};func _agfb (_fbbdd []TextMark ,_fgfe *TextTable )[]TextMark {var _eccdb []TextMark ;for _ ,_ecdg :=range _fbbdd {_ecdg ._cda =true ;_ecdg ._gfc =_fgfe ;_eccdb =append (_eccdb ,_ecdg );
|
||
// intSet.has reports whether `_edfdb` is a key of the set.
// PageText.getParagraphs gathers rulings from `_fcbc` via `_cbcb` when `_bcbc` is set and
// from `_eeec` via `_fecd` when `_bdag` is set.
};return _eccdb ;};func (_beed intSet )has (_edfdb int )bool {_ ,_gadab :=_beed [_edfdb ];return _gadab };func (_afbad *PageText )getParagraphs ()paraList {var _abg rulingList ;if _bcbc {_acfb :=_cbcb (_afbad ._fcbc );_abg =append (_abg ,_acfb ...);};if _bdag {_dfa :=_fecd (_afbad ._eeec );
|
||
// After toTilings, marks are grouped by `_eabg` value 0, 90, 180 and 270; the loop stops
// early once every mark has been consumed (`_bcgg` reaches 0).
_abg =append (_abg ,_dfa ...);};_abg ,_bcabf :=_abg .toTilings ();var _ebgf paraList ;_bcgg :=len (_afbad ._bae );for _egdb :=0;_egdb < 360&&_bcgg > 0;_egdb +=90{_eed :=make ([]*textMark ,0,len (_afbad ._bae )-_bcgg );for _ ,_ffe :=range _afbad ._bae {if _ffe ._eabg ==_egdb {_eed =append (_eed ,_ffe );
|
||
// Each non-empty group is turned into paras by `_efde` and appended to the result.
// The textObject struct declaration starts on the next line.
};};if len (_eed )> 0{_bfea :=_efde (_eed ,_afbad ._abfb ,_abg ,_bcabf ,_afbad ._afc ._add );_ebgf =append (_ebgf ,_bfea ...);_bcgg -=len (_eed );};};return _ebgf ;};type textObject struct{_afb *Extractor ;_adc *_bg .PdfPageResources ;_fgd _de .GraphicsState ;
|
||
// textObject.getFontDict looks `_eea` up in the resources `_adc`.
// NOTE(review): with nil resources it returns (nil, nil), so callers that invoke methods on
// the returned object must check it for nil — confirm at the call sites.
_aed *textState ;_cea *stateStack ;_eaaf _cb .Matrix ;_bbde _cb .Matrix ;_dfge []*textMark ;_cgeb bool ;};func (_fcea *textObject )getFontDict (_eea string )(_ccdd _ga .PdfObject ,_ecea error ){_bbg :=_fcea ._adc ;if _bbg ==nil {_bb .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_eea );
|
||
return nil ,nil ;};_ccdd ,_gfae :=_bbg .GetFontByName (_ga .PdfObjectName (_eea ));if !_gfae {_bb .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_eea );
|
||
// A name that is not in the resources yields the error "font not in resources".
// `_adbg` divides by `_fbedc`, truncating toward zero, and subtracts one for negative input.
// NOTE(review): for negative exact multiples of `_fbedc` this is one less than the
// mathematical floor — confirm that is intended.
return nil ,_b .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _ccdd ,nil ;};func _adbg (_ggdg float64 )int {var _dbce int ;if _ggdg >=0{_dbce =int (_ggdg /_fbedc );}else {_dbce =int (_ggdg /_fbedc )-1;
|
||
};return _dbce ;};
|
||
|
||
// PageText represents the layout of text on a device page.
|
||
type PageText struct{_bae []*textMark ;_bffc string ;_baeg []TextMark ;_ceaa []TextTable ;_abfb _bg .PdfRectangle ;_fcbc []pathSection ;_eeec []pathSection ;_abba *_ga .PdfObject ;_ccff _ga .PdfObject ;_fad *_de .ContentStreamOperations ;_afc PageTextOptions ;
|
||
};func (_ebdg *textMark )bbox ()_bg .PdfRectangle {return _ebdg .PdfRectangle };func (_ead *textObject )moveText (_edde ,_dbbe float64 ){_ead .moveLP (_edde ,_dbbe )};func (_fbf *wordBag )firstReadingIndex (_dfag int )int {_agcfc :=_fbf .firstWord (_dfag )._beaeg ;
|
||
_cbaf :=float64 (_dfag +1)*_fbedc ;_cbdb :=_cbaf +_bafc *_agcfc ;_beef :=_dfag ;for _ ,_bfgb :=range _fbf .depthBand (_cbaf ,_cbdb ){if _gffg (_fbf .firstWord (_bfgb ),_fbf .firstWord (_beef ))< 0{_beef =_bfgb ;};};return _beef ;};func (_cgd *shapesState )stroke (_agcb *[]pathSection ){_feaf :=pathSection {_aadg :_cgd ._bfeb ,Color :_cgd ._fdc .getStrokeColor ()};
|
||
*_agcb =append (*_agcb ,_feaf );if _bffg {_eg .Printf ("\u0020 \u0020\u0020S\u0054\u0052\u004fK\u0045\u003a\u0020\u0025\u0064\u0020\u0073t\u0072\u006f\u006b\u0065\u0073\u0020s\u0073\u003d\u0025\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d%\u002b\u0076\u0020\u0025\u0036\u002e\u0032\u0066\u000a",len (*_agcb ),_cgd ,_cgd ._fdc .getStrokeColor (),_feaf .bbox ());
|
||
if _dcgf {for _aca ,_fafd :=range _cgd ._bfeb {_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_aca ,_fafd );if _aca ==10{break ;};};};};};func (_dccg *shapesState )drawRectangle (_egeb ,_daae ,_abec ,_bdfd float64 ){if _ggbc {_cgebf :=_dccg .devicePoint (_egeb ,_daae );
|
||
_aad :=_dccg .devicePoint (_egeb +_abec ,_daae +_bdfd );_fcfde :=_bg .PdfRectangle {Llx :_cgebf .X ,Lly :_cgebf .Y ,Urx :_aad .X ,Ury :_aad .Y };_bb .Log .Info ("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066",_fcfde );
|
||
};_dccg .newSubPath ();_dccg .moveTo (_egeb ,_daae );_dccg .lineTo (_egeb +_abec ,_daae );_dccg .lineTo (_egeb +_abec ,_daae +_bdfd );_dccg .lineTo (_egeb ,_daae +_bdfd );_dccg .closePath ();};
|
||
|
||
// ExtractPageImages returns the image contents of the page extractor, including data
|
||
// and position, size information for each image.
|
||
// A set of options to control page image extraction can be passed in. The options
|
||
// parameter can be nil for the default options. By default, inline stencil masks
|
||
// are not extracted.
|
||
func (_aef *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_dbc :=&imageExtractContext {_caa :options };_dad :=_dbc .extractContentStreamImages (_aef ._cd ,_aef ._ee );if _dad !=nil {return nil ,_dad ;};return &PageImages {Images :_dbc ._cc },nil ;
|
||
};
|
||
|
||
// String returns a description of `k`.
|
||
func (_eeff rulingKind )String ()string {_bbcd ,_deagd :=_egdbc [_eeff ];if !_deagd {return _eg .Sprintf ("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064",_eeff );};return _bbcd ;};func (_egfd *textPara )taken ()bool {return _egfd ==nil ||_egfd ._ggcbf };
|
||
// `_gagaf` returns `_gffg(a, b)` unless `_cadfe` reports true for that value, in which case
// it returns `_gade(a, b)`.
// `_egde` returns `_ccad` unchanged when it is shorter than `_abegf` bytes, otherwise its
// first `_abegf` bytes (byte slicing, not rune-aware).
func _gagaf (_fgggb ,_agcad bounded )float64 {_gfe :=_gffg (_fgggb ,_agcad );if !_cadfe (_gfe ){return _gfe ;};return _gade (_fgggb ,_agcad );};func _egde (_ccad string ,_abegf int )string {if len (_ccad )< _abegf {return _ccad ;};return _ccad [:_abegf ];
|
||
// Two constant groups follow: boolean/numeric switches, then numeric thresholds. Their
// individual meanings are not visible from this block.
};const (_aefg =true ;_aceb =true ;_egffa =true ;_cagc =false ;_ffffd =false ;_cccg =6;_bcea =3.0;_baga =200;_gegg =true ;_eab =true ;_bcbc =true ;_bdag =true ;_bdec =false ;);const (_adcc =1.0e-6;_bagd =1.0e-4;_cedd =10;_fbedc =6;_abea =0.5;_cfgg =0.12;
|
||
_eced =0.19;_ebdf =0.04;_fcbe =0.04;_abcc =1.0;_dgbg =0.04;_agbd =0.4;_dacd =0.7;_cbc =1.0;_edbd =0.1;_ffcdf =1.4;_gfbc =0.46;_gfeb =0.02;_bbff =0.2;_dgge =0.5;_adea =4;_bafc =4.0;_bbgee =6;_abfgg =0.3;_afdc =0.01;_fbc =0.02;_geda =2;_efdde =2;_gadb =500;
|
||
// `_gfea` builds a list from lines, a tag string and child lists.
// `_afbeg` builds a 2x2 textTable; only the `_feeba` map is allocated here.
_afdf =4.0;_egcbe =4.0;_dfbd =0.05;_cga =0.1;_adca =2.0;_ceaf =2.0;_dcdg =1.5;_ddbac =3.0;_cgac =0.25;);func _gfea (_fdbe []*textLine ,_cfab string ,_fead []*list )*list {return &list {_dgef :_fdbe ,_cfcfa :_cfab ,_bcdd :_fead };};func _afbeg (_bcfe ,_degcd ,_ffffa ,_afca *textPara )*textTable {_gbag :=&textTable {_afaff :2,_gaaa :2,_feeba :make (map[uint64 ]*textPara ,4)};
|
||
// The four paras are placed at (0,0), (1,0), (0,1) and (1,1) in argument order.
// `_fabd` is compiled at package initialisation from `_aaefc` + "|" + `_feg`.
// wordBag.depthRange returns the keys of `_dceb` within [`_adfg`, `_debc`] in increasing
// order, or nil when there are none.
_gbag .put (0,0,_bcfe );_gbag .put (1,0,_degcd );_gbag .put (0,1,_ffffa );_gbag .put (1,1,_afca );return _gbag ;};var _fabd *_bf .Regexp =_bf .MustCompile (_aaefc +"\u007c"+_feg );func (_bagb *wordBag )depthRange (_adfg ,_debc int )[]int {var _aaec []int ;
|
||
// textWord.appendMark (starts on the next line) appends the mark to `_dbff`.
for _fdfe :=range _bagb ._dceb {if _adfg <=_fdfe &&_fdfe <=_debc {_aaec =append (_aaec ,_fdfe );};};if len (_aaec )==0{return nil ;};_ff .Ints (_aaec );return _aaec ;};func (_bfde *textWord )appendMark (_ddgf *textMark ,_bbac _bg .PdfRectangle ){_bfde ._dbff =append (_bfde ._dbff ,_ddgf );
|
||
// The word's rectangle becomes `_dfba` of itself and the mark's rectangle, `_beaeg` is raised
// to the mark's `_cffb` when larger, and `_gdfbg` is set to `_bbac.Ury` minus the word's Lly.
// `_egccd` formats a map[int][]float64 using the key order returned by `_cfbe`.
_bfde .PdfRectangle =_dfba (_bfde .PdfRectangle ,_ddgf .PdfRectangle );if _ddgf ._cffb > _bfde ._beaeg {_bfde ._beaeg =_ddgf ._cffb ;};_bfde ._gdfbg =_bbac .Ury -_bfde .PdfRectangle .Lly ;};func _egccd (_dgagf map[int ][]float64 )string {_cagd :=_cfbe (_dgagf );
|
||
// Each entry is rendered as "key: values" and the entries are joined with ", " inside braces.
_efdf :=make ([]string ,len (_dgagf ));for _dagfg ,_dccbb :=range _cagd {_efdf [_dagfg ]=_eg .Sprintf ("\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066",_dccbb ,_dgagf [_dccbb ]);};return _eg .Sprintf ("\u007b\u0025\u0073\u007d",_f .Join (_efdf ,"\u002c\u0020"));
|
||
// `_fgea` is a list of ten one-character strings.
// textTable.growTable defines `_fabg`, which adds a row: it increments `_gaaa` and puts
// `_dfeec[x]` at (x, `_gaaa`-1) for every x in [0, `_afaff`).
};var _fgea =[]string {"\u2756","\u27a2","\u2713","\u2022","\uf0a7","\u25a1","\u2212","\u25a0","\u25aa","\u006f"};func (_gebc *textTable )growTable (){_fabg :=func (_dfeec paraList ){_gebc ._gaaa ++;for _bbec :=0;_bbec < _gebc ._afaff ;_bbec ++{_fcbb :=_dfeec [_bbec ];
|
||
// `_fbaf` adds a column: it increments `_afaff` and puts `_fdgd[y]` at (`_afaff`-1, y) for
// every y in [0, `_gaaa`).
_gebc .put (_bbec ,_gebc ._gaaa -1,_fcbb );};};_fbaf :=func (_fdgd paraList ){_gebc ._afaff ++;for _afed :=0;_afed < _gebc ._gaaa ;_afed ++{_ccbd :=_fdgd [_afed ];_gebc .put (_gebc ._afaff -1,_afed ,_ccbd );};};if _cgcee {_gebc .log ("\u0067r\u006f\u0077\u0054\u0061\u0062\u006ce");
|
||
// Each iteration fetches the candidate row (getDown) and column (getRight).
};for _aebfd :=0;;_aebfd ++{_fccgc :=false ;_fdega :=_gebc .getDown ();_gegf :=_gebc .getRight ();if _cgcee {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_aebfd ,_gebc );_eg .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0020\u0064\u006f\u0077\u006e\u003d\u0025\u0073\u000a",_fdega );
|
||
// When both exist and share an untaken last element, the row is added first and the column
// is then re-fetched, because adding the row changed the table.
_eg .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0072\u0069\u0067\u0068\u0074\u003d\u0025\u0073\u000a",_gegf );};if _fdega !=nil &&_gegf !=nil {_gbgaa :=_fdega [len (_fdega )-1];if !_gbgaa .taken ()&&_gbgaa ==_gegf [len (_gegf )-1]{_fabg (_fdega );if _gegf =_gebc .getRight ();
|
||
// If the re-fetched column exists it is added and the shared element is put at the new
// bottom-right cell. Otherwise a row alone, then a column alone, is tried; the loop ends
// when neither could be added.
_gegf !=nil {_fbaf (_gegf );_gebc .put (_gebc ._afaff -1,_gebc ._gaaa -1,_gbgaa );};_fccgc =true ;};};if !_fccgc &&_fdega !=nil {_fabg (_fdega );_fccgc =true ;};if !_fccgc &&_gegf !=nil {_fbaf (_gegf );_fccgc =true ;};if !_fccgc {break ;};};};
|
||
|
||
// PageTextOptions holds various options available in the extraction process.
|
||
type PageTextOptions struct{_ggc bool ;_add bool ;};func _aged (_gecc *wordBag ,_afbf float64 ,_bceae ,_ddfe rulingList )[]*wordBag {var _adg []*wordBag ;for _ ,_cbda :=range _gecc .depthIndexes (){_ebag :=false ;for !_gecc .empty (_cbda ){_aedacc :=_gecc .firstReadingIndex (_cbda );
|
||
_aecdb :=_gecc .firstWord (_aedacc );_gccc :=_efdd (_aecdb ,_afbf ,_bceae ,_ddfe );_gecc .removeWord (_aecdb ,_aedacc );if _egcbf {_bb .Log .Info ("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073",_aecdb .String ());
|
||
};for _cfce :=true ;_cfce ;_cfce =_ebag {_ebag =false ;_dcbb :=_cbc *_gccc ._ddgcf ;_adfc :=_agbd *_gccc ._ddgcf ;_bdda :=_abcc *_gccc ._ddgcf ;if _egcbf {_bb .Log .Info ("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066",_gccc .minDepth (),_gccc .maxDepth (),_bdda ,_adfc );
|
||
};if _gecc .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_gccc ,_deda (_acea ,0),_gccc .minDepth ()-_bdda ,_gccc .maxDepth ()+_bdda ,_dgbg ,false ,false )> 0{_ebag =true ;};if _gecc .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_gccc ,_deda (_acea ,_adfc ),_gccc .minDepth (),_gccc .maxDepth (),_dacd ,false ,false )> 0{_ebag =true ;
|
||
};if _ebag {continue ;};_abcg :=_gecc .scanBand ("",_gccc ,_deda (_ebbc ,_dcbb ),_gccc .minDepth (),_gccc .maxDepth (),_edbd ,true ,false );if _abcg > 0{_fbcd :=(_gccc .maxDepth ()-_gccc .minDepth ())/_gccc ._ddgcf ;if (_abcg > 1&&float64 (_abcg )> 0.3*_fbcd )||_abcg <=10{if _gecc .scanBand ("\u006f\u0074\u0068e\u0072",_gccc ,_deda (_ebbc ,_dcbb ),_gccc .minDepth (),_gccc .maxDepth (),_edbd ,false ,true )> 0{_ebag =true ;
|
||
};};};};_adg =append (_adg ,_gccc );};};return _adg ;};func (_cbac *wordBag )blocked (_gddc *textWord )bool {if _gddc .Urx < _cbac .Llx {_bccab :=_caggg (_gddc .PdfRectangle );_geeb :=_acefg (_cbac .PdfRectangle );if _cbac ._ecgb .blocks (_bccab ,_geeb ){if _ebdca {_bb .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0078\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_gddc ,_cbac );
|
||
};return true ;};}else if _cbac .Urx < _gddc .Llx {_ceea :=_caggg (_cbac .PdfRectangle );_fba :=_acefg (_gddc .PdfRectangle );if _cbac ._ecgb .blocks (_ceea ,_fba ){if _ebdca {_bb .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0078\u2192\u0020:\u0020\u0025\u0073\u0020%s",_gddc ,_cbac );
|
||
};return true ;};};if _gddc .Ury < _cbac .Lly {_fbeg :=_fbgg (_gddc .PdfRectangle );_ageg :=_aecaf (_cbac .PdfRectangle );if _cbac ._bfadd .blocks (_fbeg ,_ageg ){if _ebdca {_bb .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0079\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_gddc ,_cbac );
|
||
};return true ;};}else if _cbac .Ury < _gddc .Lly {_ecb :=_fbgg (_cbac .PdfRectangle );_bdca :=_aecaf (_gddc .PdfRectangle );if _cbac ._bfadd .blocks (_ecb ,_bdca ){if _ebdca {_bb .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0079\u2192\u0020:\u0020\u0025\u0073\u0020%s",_gddc ,_cbac );
|
||
};return true ;};};return false ;};func (_fedf *ruling )alignsPrimary (_agdc *ruling )bool {return _fedf ._ffdb ==_agdc ._ffdb &&_ef .Abs (_fedf ._ecdge -_agdc ._ecdge )< _ceaf *0.5;};var _ffbf =_bf .MustCompile ("\u005e\u005c\u0073\u002a\u0028\u005c\u0064\u002b\u005c\u002e\u003f|\u005b\u0049\u0069\u0076\u005d\u002b\u0029\u005c\u0073\u002a\\\u0029\u003f\u0024");
|
||
func _babg (_fafb ,_cfbg _cb .Point )bool {return _fafb .X ==_cfbg .X &&_fafb .Y ==_cfbg .Y };func (_defd *textPara )isAtom ()*textTable {_beagc :=_defd ;_bgged :=_defd ._gdfgd ;_aaddb :=_defd ._daag ;if _bgged .taken ()||_aaddb .taken (){return nil ;};
|
||
_aefeg :=_bgged ._daag ;if _aefeg .taken ()||_aefeg !=_aaddb ._gdfgd {return nil ;};return _afbeg (_beagc ,_bgged ,_aaddb ,_aefeg );};func (_aaagc paraList )readBefore (_bfga []int ,_fada ,_ddfga int )bool {_gddce ,_edgfd :=_aaagc [_fada ],_aaagc [_ddfga ];
|
||
if _fece (_gddce ,_edgfd )&&_gddce .Lly > _edgfd .Lly {return true ;};if !(_gddce ._aaga .Urx < _edgfd ._aaga .Llx ){return false ;};_adfcb ,_bcde :=_gddce .Lly ,_edgfd .Lly ;if _adfcb > _bcde {_bcde ,_adfcb =_adfcb ,_bcde ;};_dbaf :=_ef .Max (_gddce ._aaga .Llx ,_edgfd ._aaga .Llx );
|
||
_cgccc :=_ef .Min (_gddce ._aaga .Urx ,_edgfd ._aaga .Urx );_bbbg :=_aaagc .llyRange (_bfga ,_adfcb ,_bcde );for _ ,_gfcc :=range _bbbg {if _gfcc ==_fada ||_gfcc ==_ddfga {continue ;};_dffb :=_aaagc [_gfcc ];if _dffb ._aaga .Llx <=_cgccc &&_dbaf <=_dffb ._aaga .Urx {return false ;
|
||
};};return true ;};func (_eddf rulingList )bbox ()_bg .PdfRectangle {var _fcfac _bg .PdfRectangle ;if len (_eddf )==0{_bb .Log .Error ("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0062\u0062\u006f\u0078\u003a\u0020n\u006f\u0020\u0072u\u006ci\u006e\u0067\u0073");
|
||
return _bg .PdfRectangle {};};if _eddf [0]._ffdb ==_fcec {_fcfac .Llx ,_fcfac .Urx =_eddf .secMinMax ();_fcfac .Lly ,_fcfac .Ury =_eddf .primMinMax ();}else {_fcfac .Llx ,_fcfac .Urx =_eddf .primMinMax ();_fcfac .Lly ,_fcfac .Ury =_eddf .secMinMax ();};
|
||
return _fcfac ;};func (_aeeb *wordBag )scanBand (_cfcfd string ,_baa *wordBag ,_dbge func (_beag *wordBag ,_cebb *textWord )bool ,_gbd ,_eeeg ,_ccdgc float64 ,_bbge ,_afef bool )int {_egae :=_baa ._ddgcf ;var _edaa map[int ]map[*textWord ]struct{};if !_bbge {_edaa =_aeeb .makeRemovals ();
|
||
};_gdfae :=_abea *_egae ;_dbdf :=0;for _ ,_dbcc :=range _aeeb .depthBand (_gbd -_gdfae ,_eeeg +_gdfae ){if len (_aeeb ._dceb [_dbcc ])==0{continue ;};for _ ,_cedgf :=range _aeeb ._dceb [_dbcc ]{if !(_gbd -_gdfae <=_cedgf ._gdfbg &&_cedgf ._gdfbg <=_eeeg +_gdfae ){continue ;
|
||
};if !_dbge (_baa ,_cedgf ){continue ;};_edba :=2.0*_ef .Abs (_cedgf ._beaeg -_baa ._ddgcf )/(_cedgf ._beaeg +_baa ._ddgcf );_cbgd :=_ef .Max (_cedgf ._beaeg /_baa ._ddgcf ,_baa ._ddgcf /_cedgf ._beaeg );_bcaa :=_ef .Min (_edba ,_cbgd );if _ccdgc > 0&&_bcaa > _ccdgc {continue ;
|
||
};if _baa .blocked (_cedgf ){continue ;};if !_bbge {_baa .pullWord (_cedgf ,_dbcc ,_edaa );};_dbdf ++;if !_afef {if _cedgf ._gdfbg < _gbd {_gbd =_cedgf ._gdfbg ;};if _cedgf ._gdfbg > _eeeg {_eeeg =_cedgf ._gdfbg ;};};if _bbge {break ;};};};if !_bbge {_aeeb .applyRemovals (_edaa );
|
||
};return _dbdf ;};
|
||
|
||
// Text gets the extracted text contained in `l`.
|
||
func (_cege *list )Text ()string {_gebd :=&_f .Builder {};_aagd :="";_feae (_cege ,_gebd ,&_aagd );return _gebd .String ();};func _debb (_bebf *textLine ,_dfcd []*textLine ,_gfcb []float64 )float64 {var _abaca float64 =-1;for _ ,_bcecb :=range _dfcd {if _bcecb ._ffbb > _bebf ._ffbb {if _ef .Round (_bcecb .Llx )>=_ef .Round (_bebf .Llx ){_abaca =_bcecb ._ffbb ;
|
||
}else {break ;};};};return _abaca ;};func _gade (_dgga ,_gagg bounded )float64 {return _ddbfd (_dgga )-_ddbfd (_gagg )};func (_ccfe rulingList )removeDuplicates ()rulingList {if len (_ccfe )==0{return nil ;};_ccfe .sort ();_fecg :=rulingList {_ccfe [0]};
|
||
for _ ,_dbaa :=range _ccfe [1:]{if _dbaa .equals (_fecg [len (_fecg )-1]){continue ;};_fecg =append (_fecg ,_dbaa );};return _fecg ;};func (_cccgb paraList )findTables (_cdabc []gridTiling )[]*textTable {_cccgb .addNeighbours ();_ff .Slice (_cccgb ,func (_egdfa ,_baag int )bool {return _gagaf (_cccgb [_egdfa ],_cccgb [_baag ])< 0});
|
||
var _efgff []*textTable ;if _gegg {_acfbg :=_cccgb .findGridTables (_cdabc );_efgff =append (_efgff ,_acfbg ...);};if _eab {_gabcd :=_cccgb .findTextTables ();_efgff =append (_efgff ,_gabcd ...);};return _efgff ;};
|
||
|
||
// String returns a human readable description of `vecs`.
|
||
func (_agcab rulingList )String ()string {if len (_agcab )==0{return "\u007b \u0045\u004d\u0050\u0054\u0059\u0020}";};_cbbf ,_ffgd :=_agcab .vertsHorzs ();_dgdea :=len (_cbbf );_abage :=len (_ffgd );if _dgdea ==0||_abage ==0{return _eg .Sprintf ("\u007b%\u0064\u0020\u0078\u0020\u0025\u0064}",_dgdea ,_abage );
|
||
};_bcac :=_bg .PdfRectangle {Llx :_cbbf [0]._ecdge ,Urx :_cbbf [_dgdea -1]._ecdge ,Lly :_ffgd [_abage -1]._ecdge ,Ury :_ffgd [0]._ecdge };return _eg .Sprintf ("\u007b\u0025d\u0020\u0078\u0020%\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u007d",_dgdea ,_abage ,_bcac );
|
||
};func _aac (_bccf *Extractor ,_fecf *_bg .PdfPageResources ,_faaf _de .GraphicsState ,_ggf *textState ,_cbbd *stateStack )*textObject {return &textObject {_afb :_bccf ,_adc :_fecf ,_fgd :_faaf ,_cea :_cbbd ,_aed :_ggf ,_eaaf :_cb .IdentityMatrix (),_bbde :_cb .IdentityMatrix ()};
|
||
};type structTreeRoot struct{_aaef []structElement ;_cfbff string ;};func (_gea *shapesState )moveTo (_cffe ,_gff float64 ){_gea ._abac =true ;_gea ._bfgg =_gea .devicePoint (_cffe ,_gff );if _ggbc {_bb .Log .Info ("\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066",_cffe ,_gff ,_gea ._bfgg );
|
||
};};func (_dae *TextMarkArray )exists (_caae TextMark )bool {for _ ,_cbaa :=range _dae .Elements (){if _dc .DeepEqual (_caae .DirectObject ,_cbaa .DirectObject )&&_dc .DeepEqual (_caae .BBox ,_cbaa .BBox )&&_cbaa .Text ==_caae .Text {return true ;};};return false ;
|
||
};type textResult struct{_bgd PageText ;_eec int ;_fbbd int ;};func (_fabb *wordBag )makeRemovals ()map[int ]map[*textWord ]struct{}{_edgb :=make (map[int ]map[*textWord ]struct{},len (_fabb ._dceb ));for _fdgab :=range _fabb ._dceb {_edgb [_fdgab ]=make (map[*textWord ]struct{});
|
||
};return _edgb ;};func _bfbd (_defb *wordBag ,_fagf int )*textLine {_bad :=_defb .firstWord (_fagf );_caf :=textLine {PdfRectangle :_bad .PdfRectangle ,_bbeb :_bad ._beaeg ,_ffbb :_bad ._gdfbg };_caf .pullWord (_defb ,_bad ,_fagf );return &_caf ;};
|
||
|
||
// ExtractFonts returns all font information from the page extractor, including
|
||
// font name, font type, the raw data of the embedded font file (if embedded), font descriptor and more.
|
||
//
|
||
// The argument `previousPageFonts` is used when trying to build a complete font catalog for multiple pages or the entire document.
|
||
// The entries from `previousPageFonts` are added to the returned result unless already included in the page, i.e. no duplicate entries.
|
||
//
|
||
// NOTE: If previousPageFonts is nil, all fonts from the page will be returned. Use it when building up a full list of fonts for a document or page range.
|
||
func (_ffcd *Extractor )ExtractFonts (previousPageFonts *PageFonts )(*PageFonts ,error ){_bfd :=PageFonts {};_ae :=_bfd .extractPageResourcesToFont (_ffcd ._ee );if _ae !=nil {return nil ,_ae ;};if previousPageFonts !=nil {for _ ,_bca :=range previousPageFonts .Fonts {if !_fcd (_bfd .Fonts ,_bca .FontName ){_bfd .Fonts =append (_bfd .Fonts ,_bca );
|
||
};};};return &PageFonts {Fonts :_bfd .Fonts },nil ;};func _fcd (_def []Font ,_aaf string )bool {for _ ,_agc :=range _def {if _agc .FontName ==_aaf {return true ;};};return false ;};func (_cagg *textObject )renderText (_bbb _ga .PdfObject ,_ecf []byte ,_ebbd int )error {if _cagg ._cgeb {_bb .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");
|
||
return nil ;};_bbdg :=_cagg .getCurrentFont ();_gbbg :=_bbdg .BytesToCharcodes (_ecf );_cac ,_gaegg ,_ccef :=_bbdg .CharcodesToStrings (_gbbg );if _ccef > 0{_bb .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_gaegg ,_ccef );
|
||
};_cagg ._aed ._ggaa +=_gaegg ;_cagg ._aed ._gba +=_ccef ;_bff :=_cagg ._aed ;_gace :=_bff ._fbda ;_cfebc :=_bff ._dcc /100.0;_efb :=_egdf ;if _bbdg .Subtype ()=="\u0054\u0079\u0070e\u0033"{_efb =1;};_eccd ,_fffe :=_bbdg .GetRuneMetrics (' ');if !_fffe {_eccd ,_fffe =_bbdg .GetCharMetrics (32);
|
||
};if !_fffe {_eccd ,_ =_bg .DefaultFont ().GetRuneMetrics (' ');};_ccga :=_eccd .Wx *_efb ;_bb .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_ccga ,_cac ,_bbdg ,_gace );
|
||
_daf :=_cb .NewMatrix (_gace *_cfebc ,0,0,_gace ,0,_bff ._adf );if _gbec {_bb .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_gbbg ),_gbbg ,_cac );
|
||
};_bb .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_gbbg ),_gbbg ,len (_cac ));_edbf :=_cagg .getFillColor ();
|
||
_fefe :=_cagg .getStrokeColor ();for _fcfa ,_dac :=range _cac {_gefg :=[]rune (_dac );if len (_gefg )==1&&_gefg [0]=='\x00'{continue ;};_dgeb :=_gbbg [_fcfa ];_bcb :=_cagg ._fgd .CTM .Mult (_cagg ._eaaf ).Mult (_daf );_befc :=0.0;if len (_gefg )==1&&_gefg [0]==32{_befc =_bff ._dedcd ;
|
||
};_afge ,_gbcc :=_bbdg .GetCharMetrics (_dgeb );if !_gbcc {_bb .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_dgeb ,_gefg ,_gefg ,_bbdg );
|
||
return _eg .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_bbdg .String (),_dgeb );};_afba :=_cb .Point {X :_afge .Wx *_efb ,Y :_afge .Wy *_efb };
|
||
_aag :=_cb .Point {X :(_afba .X *_gace +_befc )*_cfebc };_fffec :=_cb .Point {X :(_afba .X *_gace +_bff ._edf +_befc )*_cfebc };if _gbec {_bb .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_gace ,_bff ._edf ,_bff ._dedcd ,_cfebc );
|
||
_bb .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f",_afba ,_aag ,_fffec );};_cead :=_cgge (_aag );_faca :=_cgge (_fffec );_ggfg :=_cagg ._fgd .CTM .Mult (_cagg ._eaaf ).Mult (_cead );
|
||
if _fcba {_bb .Log .Info ("e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a"+"\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a"+"\u0009t\u0064\u0030\u003d\u0025s\u000a\u0009\u0020\u0020\u2192 \u0025s\u0020x\u006c\u0061\u0074\u003d\u0025\u0073",_cagg ._fgd .CTM ,_cagg ._eaaf ,_faca ,_acbe (_cagg ._fgd .CTM .Mult (_cagg ._eaaf ).Mult (_faca )),_cead ,_ggfg ,_acbe (_ggfg ));
|
||
};_fgg ,_ede :=_cagg .newTextMark (_fa .ExpandLigatures (_gefg ),_bcb ,_acbe (_ggfg ),_ef .Abs (_ccga *_bcb .ScalingFactorX ()),_bbdg ,_cagg ._aed ._edf ,_edbf ,_fefe ,_bbb ,_cac ,_fcfa ,_ebbd );if !_ede {_bb .Log .Debug ("\u0054\u0065\u0078\u0074\u0020\u006d\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069d\u0065 \u0070\u0061\u0067\u0065\u002e\u0020\u0053\u006b\u0069\u0070\u0070\u0069\u006e\u0067");
|
||
continue ;};if _bbdg ==nil {_bb .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u002e");}else if _bbdg .Encoder ()==nil {_bb .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020N\u006f\u0020\u0065\u006e\u0063\u006f\u0064\u0069\u006eg\u002e\u0020\u0066o\u006et\u003d\u0025\u0073",_bbdg );
|
||
}else {if _edce ,_aaab :=_bbdg .Encoder ().CharcodeToRune (_dgeb );_aaab {_fgg ._ebaa =string (_edce );};};_bb .Log .Trace ("i\u003d\u0025\u0064\u0020\u0063\u006fd\u0065\u003d\u0025\u0064\u0020\u006d\u0061\u0072\u006b=\u0025\u0073\u0020t\u0072m\u003d\u0025\u0073",_fcfa ,_dgeb ,_fgg ,_bcb );
|
||
_cagg ._dfge =append (_cagg ._dfge ,&_fgg );_cagg ._eaaf .Concat (_faca );};return nil ;};func (_ccddb rulingList )toTilings ()(rulingList ,[]gridTiling ){_ccddb .log ("\u0074o\u0054\u0069\u006c\u0069\u006e\u0067s");if len (_ccddb )==0{return nil ,nil ;
|
||
};_ccddb =_ccddb .tidied ("\u0061\u006c\u006c");_ccddb .log ("\u0074\u0069\u0064\u0069\u0065\u0064");_faafc :=_ccddb .toGrids ();_gfeg :=make ([]gridTiling ,len (_faafc ));for _gbgeb ,_afdfa :=range _faafc {_gfeg [_gbgeb ]=_afdfa .asTiling ();};return _ccddb ,_gfeg ;
|
||
};const (_ffb =false ;_abfa =false ;_gfda =false ;_fcba =false ;_ggbc =false ;_gbec =false ;_egcbf =false ;_bdcc =false ;_cdgd =false ;_aabd =_cdgd &&true ;_gdfe =_aabd &&false ;_bcce =_cdgd &&true ;_aeff =false ;_cgcee =_aeff &&false ;_bebb =_aeff &&true ;
|
||
_bffg =false ;_dcgf =_bffg &&false ;_cfec =_bffg &&false ;_dcdd =_bffg &&true ;_dfdg =_bffg &&false ;_ebdca =_bffg &&false ;);type imageExtractContext struct{_cc []ImageMark ;_fafc int ;_bab int ;_bac int ;_dab map[*_ga .PdfObjectStream ]*cachedImage ;
|
||
_caa *ImageExtractOptions ;_fca bool ;};func _dfba (_gdfg ,_faeg _bg .PdfRectangle )_bg .PdfRectangle {return _bg .PdfRectangle {Llx :_ef .Min (_gdfg .Llx ,_faeg .Llx ),Lly :_ef .Min (_gdfg .Lly ,_faeg .Lly ),Urx :_ef .Max (_gdfg .Urx ,_faeg .Urx ),Ury :_ef .Max (_gdfg .Ury ,_faeg .Ury )};
|
||
};func _gfdb (_accd ,_bedg _bg .PdfRectangle )bool {return _bedg .Llx <=_accd .Urx &&_accd .Llx <=_bedg .Urx ;};type stateStack []*textState ;func _ebbbg (_ecedc ,_ffcae int )int {if _ecedc > _ffcae {return _ecedc ;};return _ffcae ;};
|
||
|
||
// Len returns the number of TextMarks in `ma`.
|
||
func (_acca *TextMarkArray )Len ()int {if _acca ==nil {return 0;};return len (_acca ._dafa );};func (_gfcbe compositeCell )split (_acbf ,_ebggb []float64 )*textTable {_affc :=len (_acbf )+1;_gdffe :=len (_ebggb )+1;if _aeff {_bb .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0043\u0065l\u006c\u002e\u0073\u0070l\u0069\u0074\u003a\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0009\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025\u0073\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073=\u0025\u0036\u002e\u0032\u0066\u000a\t\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d%\u0036\u002e\u0032\u0066",_gdffe ,_affc ,_gfcbe ,_acbf ,_ebggb );
|
||
_eg .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073\u000a",len (_gfcbe .paraList ));for _gbgee ,_ccdc :=range _gfcbe .paraList {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gbgee ,_ccdc .String ());
|
||
};_eg .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",len (_gfcbe .lines ()));for _ddabg ,_ebcbg :=range _gfcbe .lines (){_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ddabg ,_ebcbg );};};_acbf =_aeece (_acbf ,_gfcbe .Ury ,_gfcbe .Lly );
|
||
_ebggb =_aeece (_ebggb ,_gfcbe .Llx ,_gfcbe .Urx );_cgebfd :=make (map[uint64 ]*textPara ,_gdffe *_affc );_cfgec :=textTable {_afaff :_gdffe ,_gaaa :_affc ,_feeba :_cgebfd };_cgbfe :=_gfcbe .paraList ;_ff .Slice (_cgbfe ,func (_daabeb ,_feac int )bool {_caaa ,_afggc :=_cgbfe [_daabeb ],_cgbfe [_feac ];
|
||
_cbea ,_abeg :=_caaa .Lly ,_afggc .Lly ;if _cbea !=_abeg {return _cbea < _abeg ;};return _caaa .Llx < _afggc .Llx ;});_ccgac :=make (map[uint64 ]_bg .PdfRectangle ,_gdffe *_affc );for _fegf ,_aegd :=range _acbf [1:]{_ddae :=_acbf [_fegf ];for _acebc ,_fcbgc :=range _ebggb [1:]{_ecgd :=_ebggb [_acebc ];
|
||
_ccgac [_gbbce (_acebc ,_fegf )]=_bg .PdfRectangle {Llx :_ecgd ,Urx :_fcbgc ,Lly :_aegd ,Ury :_ddae };};};if _aeff {_bb .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0043\u0065l\u006c\u002e\u0073\u0070\u006c\u0069\u0074\u003a\u0020\u0072e\u0063\u0074\u0073");
|
||
_eg .Printf ("\u0020\u0020\u0020\u0020");for _gbaff :=0;_gbaff < _gdffe ;_gbaff ++{_eg .Printf ("\u0025\u0033\u0030\u0064\u002c\u0020",_gbaff );};_eg .Println ();for _cgdf :=0;_cgdf < _affc ;_cgdf ++{_eg .Printf ("\u0020\u0020\u0025\u0032\u0064\u003a",_cgdf );
|
||
for _dccd :=0;_dccd < _gdffe ;_dccd ++{_eg .Printf ("\u00256\u002e\u0032\u0066\u002c\u0020",_ccgac [_gbbce (_dccd ,_cgdf )]);};_eg .Println ();};};_ffgg :=func (_eacce *textLine )(int ,int ){for _cedf :=0;_cedf < _affc ;_cedf ++{for _dabc :=0;_dabc < _gdffe ;
|
||
_dabc ++{if _adfd (_ccgac [_gbbce (_dabc ,_cedf )],_eacce .PdfRectangle ){return _dabc ,_cedf ;};};};return -1,-1;};_dgdd :=make (map[uint64 ][]*textLine ,_gdffe *_affc );for _ ,_beec :=range _cgbfe .lines (){_eebb ,_ecabe :=_ffgg (_beec );if _eebb < 0{continue ;
|
||
};_dgdd [_gbbce (_eebb ,_ecabe )]=append (_dgdd [_gbbce (_eebb ,_ecabe )],_beec );};for _efddc :=0;_efddc < len (_acbf )-1;_efddc ++{_affd :=_acbf [_efddc ];_bedgg :=_acbf [_efddc +1];for _egebf :=0;_egebf < len (_ebggb )-1;_egebf ++{_fbegd :=_ebggb [_egebf ];
|
||
_dfab :=_ebggb [_egebf +1];_ebgd :=_bg .PdfRectangle {Llx :_fbegd ,Urx :_dfab ,Lly :_bedgg ,Ury :_affd };_cccd :=_dgdd [_gbbce (_egebf ,_efddc )];if len (_cccd )==0{continue ;};_fcef :=_efce (_ebgd ,_cccd );_cfgec .put (_egebf ,_efddc ,_fcef );};};return &_cfgec ;
|
||
};func (_gacg *subpath )add (_gfde ..._cb .Point ){_gacg ._egfb =append (_gacg ._egfb ,_gfde ...)};func (_agefd *textWord )addDiacritic (_gfdd string ){_ggfb :=_agefd ._dbff [len (_agefd ._dbff )-1];_ggfb ._gcgb +=_gfdd ;_ggfb ._gcgb =_a .NFKC .String (_ggfb ._gcgb );
|
||
};func _ccdg (_daab []*textWord ,_edcc float64 ,_accae ,_efe rulingList )*wordBag {_bgad :=_efdd (_daab [0],_edcc ,_accae ,_efe );for _ ,_bcbf :=range _daab [1:]{_bgaa :=_adbg (_bcbf ._gdfbg );_bgad ._dceb [_bgaa ]=append (_bgad ._dceb [_bgaa ],_bcbf );
|
||
_bgad .PdfRectangle =_dfba (_bgad .PdfRectangle ,_bcbf .PdfRectangle );};_bgad .sort ();return _bgad ;};func _afdbc (_fgbac *PageText )error {_bced :=_ge .GetLicenseKey ();if _bced !=nil &&_bced .IsLicensed ()||_cg {return nil ;};_eg .Printf ("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a");
|
||
_eg .Println ("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f");
|
||
return _b .New ("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064");};func _egba (_gfg *textLine )bool {_bfeba :=true ;_bbce :=-1;for _ ,_abbc :=range _gfg ._ggdd {for _ ,_afae :=range _abbc ._dbff {_adfbc :=_afae ._egaa ;
|
||
if _bbce ==-1{_bbce =_adfbc ;}else {if _bbce !=_adfbc {_bfeba =false ;break ;};};};};return _bfeba ;};func _eafgb (_fabbc []int )[]int {_cggc :=make ([]int ,len (_fabbc ));for _gded ,_deag :=range _fabbc {_cggc [len (_fabbc )-1-_gded ]=_deag ;};return _cggc ;
|
||
};func _bfcdg (_gdgg []*textMark ,_ggec _bg .PdfRectangle )*textWord {_gcbd :=_gdgg [0].PdfRectangle ;_ddceg :=_gdgg [0]._cffb ;for _ ,_caag :=range _gdgg [1:]{_gcbd =_dfba (_gcbd ,_caag .PdfRectangle );if _caag ._cffb > _ddceg {_ddceg =_caag ._cffb ;};
|
||
};return &textWord {PdfRectangle :_gcbd ,_dbff :_gdgg ,_gdfbg :_ggec .Ury -_gcbd .Lly ,_beaeg :_ddceg };};const (RenderModeStroke RenderMode =1<<iota ;RenderModeFill ;RenderModeClip ;);func (_fcged paraList )merge ()*textPara {_bb .Log .Trace ("\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_fcged ));
|
||
if len (_fcged )==0{return nil ;};_fcged .sortReadingOrder ();_fged :=_fcged [0].PdfRectangle ;_gacgg :=_fcged [0]._bbagc ;for _ ,_baec :=range _fcged [1:]{_fged =_dfba (_fged ,_baec .PdfRectangle );_gacgg =append (_gacgg ,_baec ._bbagc ...);};return _efce (_fged ,_gacgg );
|
||
};
|
||
|
||
// TableInfo gets table information of the textmark `tm`.
|
||
func (_afga *TextMark )TableInfo ()(*TextTable ,[][]int ){if !_afga ._cda {return nil ,nil ;};_ade :=_afga ._gfc ;_fggg :=_ade .getCellInfo (*_afga );return _ade ,_fggg ;};func (_fgce *wordBag )empty (_bdbd int )bool {_ ,_ggfdd :=_fgce ._dceb [_bdbd ];
|
||
return !_ggfdd };type textWord struct{_bg .PdfRectangle ;_gdfbg float64 ;_bdbgb string ;_dbff []*textMark ;_beaeg float64 ;_bbdga bool ;};var _cg =false ;func (_ccdfe *textWord )computeText ()string {_fcbfd :=make ([]string ,len (_ccdfe ._dbff ));for _febg ,_fdeba :=range _ccdfe ._dbff {_fcbfd [_febg ]=_fdeba ._gcgb ;
|
||
};return _f .Join (_fcbfd ,"");};func (_caaae paraList )findTextTables ()[]*textTable {var _cbee []*textTable ;for _ ,_dgeee :=range _caaae {if _dgeee .taken ()||_dgeee .Width ()==0{continue ;};_dgefa :=_dgeee .isAtom ();if _dgefa ==nil {continue ;};_dgefa .growTable ();
|
||
if _dgefa ._afaff *_dgefa ._gaaa < _bbgee {continue ;};_dgefa .markCells ();_dgefa .log ("\u0067\u0072\u006fw\u006e");_cbee =append (_cbee ,_dgefa );};return _cbee ;};func (_bcggg paraList )extractTables (_bdebb []gridTiling )paraList {if _aeff {_bb .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_bcggg ));
|
||
};if len (_bcggg )< _bbgee {return _bcggg ;};_abbg :=_bcggg .findTables (_bdebb );if _aeff {_bb .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_abbg ));
|
||
for _gddg ,_eeac :=range _abbg {_eeac .log (_eg .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_gddg ));};};return _bcggg .applyTables (_abbg );};func _ccfde (_egbfe []*textLine )map[float64 ][]*textLine {_ff .Slice (_egbfe ,func (_egbc ,_bgcd int )bool {return _egbfe [_egbc ]._ffbb < _egbfe [_bgcd ]._ffbb });
|
||
_cfeg :=map[float64 ][]*textLine {};for _ ,_baegb :=range _egbfe {_eedg :=_eded (_baegb );_eedg =_ef .Round (_eedg );_cfeg [_eedg ]=append (_cfeg [_eedg ],_baegb );};return _cfeg ;};func (_cdeae *textWord )bbox ()_bg .PdfRectangle {return _cdeae .PdfRectangle };
|
||
func (_bde *textObject )checkOp (_ccf *_de .ContentStreamOperation ,_fcfb int ,_gbe bool )(_ffae bool ,_fbd error ){if _bde ==nil {var _fdf []_ga .PdfObject ;if _fcfb > 0{_fdf =_ccf .Params ;if len (_fdf )> _fcfb {_fdf =_fdf [:_fcfb ];};};_bb .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_ccf .Operand ,_fdf );
|
||
};if _fcfb >=0{if len (_ccf .Params )!=_fcfb {if _gbe {_fbd =_b .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_ccf .Operand ,_fcfb ,len (_ccf .Params ),_ccf .Params );
|
||
return false ,_fbd ;};};return true ,nil ;};func (_aeea *wordBag )maxDepth ()float64 {return _aeea ._gcdf -_aeea .Lly };func _gffg (_edga ,_cbeg bounded )float64 {return _edga .bbox ().Llx -_cbeg .bbox ().Llx };
|
||
|
||
// PageImages represents extracted images on a PDF page with spatial information:
|
||
// display position and size.
|
||
type PageImages struct{Images []ImageMark ;};func (_dcec *ruling )alignsSec (_egcfb *ruling )bool {const _fegb =_ceaf +1.0;return _dcec ._bafb -_fegb <=_egcfb ._cbdc &&_egcfb ._bafb -_fegb <=_dcec ._cbdc ;};func (_ecff *wordBag )text ()string {_adaad :=_ecff .allWords ();
|
||
_fdba :=make ([]string ,len (_adaad ));for _agdg ,_ddge :=range _adaad {_fdba [_agdg ]=_ddge ._bdbgb ;};return _f .Join (_fdba ,"\u0020");};func (_cecg gridTiling )log (_accab string ){if !_dcdd {return ;};_bb .Log .Info ("\u0074i\u006ci\u006e\u0067\u003a\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0071",len (_cecg ._dbdc ),len (_cecg ._eaag ),_accab );
|
||
_eg .Printf ("\u0020\u0020\u0020l\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a",_cecg ._dbdc );_eg .Printf ("\u0020\u0020\u0020l\u006c\u0079\u003d\u0025\u002e\u0032\u0066\u000a",_cecg ._eaag );for _ddbfa ,_ggeb :=range _cecg ._eaag {_cefg ,_fgcaa :=_cecg ._eagfa [_ggeb ];
|
||
if !_fgcaa {continue ;};_eg .Printf ("%\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_ddbfa ,_ggeb );for _bdcd ,_abaac :=range _cecg ._dbdc {_gcgbc ,_abab :=_cefg [_abaac ];if !_abab {continue ;};_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_bdcd ,_gcgbc .String ());
|
||
};};};func (_ccgff rulingList )splitSec ()[]rulingList {_ff .Slice (_ccgff ,func (_bacf ,_afgcd int )bool {_afcff ,_gdbaf :=_ccgff [_bacf ],_ccgff [_afgcd ];if _afcff ._bafb !=_gdbaf ._bafb {return _afcff ._bafb < _gdbaf ._bafb ;};return _afcff ._cbdc < _gdbaf ._cbdc ;
|
||
});_acbfc :=make (map[*ruling ]struct{},len (_ccgff ));_gfgg :=func (_gfbb *ruling )rulingList {_bbga :=rulingList {_gfbb };_acbfc [_gfbb ]=struct{}{};for _ ,_bdfb :=range _ccgff {if _ ,_gcdeg :=_acbfc [_bdfb ];_gcdeg {continue ;};for _ ,_gdfac :=range _bbga {if _bdfb .alignsSec (_gdfac ){_bbga =append (_bbga ,_bdfb );
|
||
_acbfc [_bdfb ]=struct{}{};break ;};};};return _bbga ;};_cfbc :=[]rulingList {_gfgg (_ccgff [0])};for _ ,_ecga :=range _ccgff [1:]{if _ ,_ffdeb :=_acbfc [_ecga ];_ffdeb {continue ;};_cfbc =append (_cfbc ,_gfgg (_ecga ));};return _cfbc ;};func (_dbbc rulingList )augmentGrid ()(rulingList ,rulingList ){_abfe ,_cbeca :=_dbbc .vertsHorzs ();
|
||
if len (_abfe )==0||len (_cbeca )==0{return _abfe ,_cbeca ;};_bbdag ,_dbef :=_abfe ,_cbeca ;_baab :=_abfe .bbox ();_eeef :=_cbeca .bbox ();if _bffg {_bb .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0056\u003d\u0025\u0036\u002e\u0032\u0066",_baab );
|
||
_bb .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0048\u003d\u0025\u0036\u002e\u0032\u0066",_eeef );};var _dgdcd ,_ddec ,_efgf ,_facd *ruling ;if _eeef .Llx < _baab .Llx -_adca {_dgdcd =&ruling {_cbbcc :_ebgfa ,_ffdb :_gfbd ,_ecdge :_eeef .Llx ,_bafb :_baab .Lly ,_cbdc :_baab .Ury };
|
||
_abfe =append (rulingList {_dgdcd },_abfe ...);};if _eeef .Urx > _baab .Urx +_adca {_ddec =&ruling {_cbbcc :_ebgfa ,_ffdb :_gfbd ,_ecdge :_eeef .Urx ,_bafb :_baab .Lly ,_cbdc :_baab .Ury };_abfe =append (_abfe ,_ddec );};if _baab .Lly < _eeef .Lly -_adca {_efgf =&ruling {_cbbcc :_ebgfa ,_ffdb :_fcec ,_ecdge :_baab .Lly ,_bafb :_eeef .Llx ,_cbdc :_eeef .Urx };
|
||
_cbeca =append (rulingList {_efgf },_cbeca ...);};if _baab .Ury > _eeef .Ury +_adca {_facd =&ruling {_cbbcc :_ebgfa ,_ffdb :_fcec ,_ecdge :_baab .Ury ,_bafb :_eeef .Llx ,_cbdc :_eeef .Urx };_cbeca =append (_cbeca ,_facd );};if len (_abfe )+len (_cbeca )==len (_dbbc ){return _bbdag ,_dbef ;
|
||
};_ccebd :=append (_abfe ,_cbeca ...);_dbbc .log ("u\u006e\u0061\u0075\u0067\u006d\u0065\u006e\u0074\u0065\u0064");_ccebd .log ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0065d");return _abfe ,_cbeca ;};func (_fdbb gridTile )contains (_aagec _bg .PdfRectangle )bool {if _fdbb .numBorders ()< 3{return false ;
|
||
};if _fdbb ._gafdd &&_aagec .Llx < _fdbb .Llx -_dcdg {return false ;};if _fdbb ._deage &&_aagec .Urx > _fdbb .Urx +_dcdg {return false ;};if _fdbb ._faba &&_aagec .Lly < _fdbb .Lly -_dcdg {return false ;};if _fdbb ._fed &&_aagec .Ury > _fdbb .Ury +_dcdg {return false ;
|
||
};return true ;};func _edc (_eaaa *_de .ContentStreamOperation )(float64 ,error ){if len (_eaaa .Params )!=1{_bbf :=_b .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_eaaa .Operand ,1,len (_eaaa .Params ),_eaaa .Params );
|
||
return 0.0,_bbf ;};return _ga .GetNumberAsFloat (_eaaa .Params [0]);};func (_bgffa rulingList )findPrimSec (_fbgcf ,_gcbec float64 )*ruling {for _ ,_dfec :=range _bgffa {if _cadfe (_dfec ._ecdge -_fbgcf )&&_dfec ._bafb -_adca <=_gcbec &&_gcbec <=_dfec ._cbdc +_adca {return _dfec ;
|
||
};};return nil ;};func _fdga (_gacf _cb .Point )*subpath {return &subpath {_egfb :[]_cb .Point {_gacf }}};func _eeffbf (_bcccb []compositeCell )[]float64 {var _cdgc []*textLine ;_effbb :=0;for _ ,_gbbcb :=range _bcccb {_effbb +=len (_gbbcb .paraList );
|
||
_cdgc =append (_cdgc ,_gbbcb .lines ()...);};_ff .Slice (_cdgc ,func (_aegdg ,_cgga int )bool {_cebd ,_aedb :=_cdgc [_aegdg ],_cdgc [_cgga ];_gagdf ,_bbgdg :=_cebd ._ffbb ,_aedb ._ffbb ;if !_cadfe (_gagdf -_bbgdg ){return _gagdf < _bbgdg ;};return _cebd .Llx < _aedb .Llx ;
|
||
});if _aeff {_eg .Printf ("\u0020\u0020\u0020 r\u006f\u0077\u0042\u006f\u0072\u0064\u0065\u0072\u0073:\u0020%\u0064 \u0070a\u0072\u0061\u0073\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",_effbb ,len (_cdgc ));for _bagc ,_fbaab :=range _cdgc {_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_bagc ,_fbaab );
|
||
};};var _gfceb []float64 ;_cdac :=_cdgc [0];var _caaf [][]*textLine ;_dfad :=[]*textLine {_cdac };for _aecda ,_ebfbg :=range _cdgc [1:]{if _ebfbg .Ury < _cdac .Lly {_gbbdg :=0.5*(_ebfbg .Ury +_cdac .Lly );if _aeff {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u003c\u0020\u0025\u0036.\u0032f\u0020\u0062\u006f\u0072\u0064\u0065\u0072\u003d\u0025\u0036\u002e\u0032\u0066\u000a"+"\u0009\u0020\u0071\u003d\u0025\u0073\u000a\u0009\u0020p\u003d\u0025\u0073\u000a",_aecda ,_ebfbg .Ury ,_cdac .Lly ,_gbbdg ,_cdac ,_ebfbg );
|
||
};_gfceb =append (_gfceb ,_gbbdg );_caaf =append (_caaf ,_dfad );_dfad =nil ;};_dfad =append (_dfad ,_ebfbg );if _ebfbg .Lly < _cdac .Lly {_cdac =_ebfbg ;};};if len (_dfad )> 0{_caaf =append (_caaf ,_dfad );};if _aeff {_eg .Printf (" \u0020\u0020\u0020\u0020\u0020\u0020 \u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073\u003d%\u0036.\u0032\u0066\u000a",_gfceb );
|
||
};if _aeff {_bb .Log .Info ("\u0072\u006f\u0077\u003d\u0025\u0064",len (_bcccb ));for _fcfc ,_ebbfg :=range _bcccb {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fcfc ,_ebbfg );};_bb .Log .Info ("\u0067r\u006f\u0075\u0070\u0073\u003d\u0025d",len (_caaf ));
|
||
for _fgece ,_eegc :=range _caaf {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0064\u000a",_fgece ,len (_eegc ));for _ceaed ,_fcgedb :=range _eegc {_eg .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_ceaed ,_fcgedb );};};};_eafa :=true ;
|
||
for _dbaccc ,_bfff :=range _caaf {_bgbcg :=true ;for _ccgg ,_cgcaf :=range _bcccb {if _aeff {_eg .Printf ("\u0020\u0020\u0020\u007e\u007e\u007e\u0067\u0072\u006f\u0075\u0070\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u0020\u0063\u0065\u006cl\u0020\u0025\u0064\u0020\u006ff\u0020\u0025d\u0020\u0025\u0073\u000a",_dbaccc ,len (_caaf ),_ccgg ,len (_bcccb ),_cgcaf );
|
||
};if !_cgcaf .hasLines (_bfff ){if _aeff {_eg .Printf ("\u0020\u0020\u0020\u0021\u0021\u0021\u0067\u0072\u006f\u0075\u0070\u0020\u0025d\u0020\u006f\u0066\u0020\u0025\u0064 \u0063\u0065\u006c\u006c\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064 \u004f\u0055\u0054\u000a",_dbaccc ,len (_caaf ),_ccgg ,len (_bcccb ));
|
||
};_bgbcg =false ;break ;};};if !_bgbcg {_eafa =false ;break ;};};if !_eafa {if _aeff {_bb .Log .Info ("\u0072\u006f\u0077\u0020\u0063o\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u0020\u0064\u006f\u006e\u0027\u0074 \u0073\u0070\u0061\u006e\u0020\u0061\u006c\u006c\u0020\u0063\u0065\u006c\u006c\u0073\u0020\u0069\u006e\u0020\u0072\u006f\u0077\u002e\u0020\u0069\u0067\u006e\u006f\u0072\u0069\u006eg");
|
||
};_gfceb =nil ;};if _aeff &&_gfceb !=nil {_eg .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u002a\u002a*\u0072\u006f\u0077\u0043\u006f\u0072\u0072i\u0064\u006f\u0072\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u000a",_gfceb );};return _gfceb ;
|
||
};func (_bcdfg *textTable )emptyCompositeColumn (_bdabc int )bool {for _dbaca :=0;_dbaca < _bcdfg ._gaaa ;_dbaca ++{if _befec ,_bded :=_bcdfg ._bgag [_gbbce (_bdabc ,_dbaca )];_bded {if len (_befec .paraList )> 0{return false ;};};};return true ;};func (_bfcd *textLine )text ()string {var _acfbd []string ;
|
||
for _ ,_ffca :=range _bfcd ._ggdd {if _ffca ._bbdga {_acfbd =append (_acfbd ,"\u0020");};_acfbd =append (_acfbd ,_ffca ._bdbgb );};return _f .Join (_acfbd ,"");};func _ddbc (_bbcg _bg .PdfRectangle )rulingKind {_cddcf :=_bbcg .Width ();_eagc :=_bbcg .Height ();
|
||
if _cddcf > _eagc {if _cddcf >=_afdf {return _fcec ;};}else {if _eagc >=_afdf {return _gfbd ;};};return _dgeec ;};func (_ecdd compositeCell )parasBBox ()(paraList ,_bg .PdfRectangle ){return _ecdd .paraList ,_ecdd .PdfRectangle ;};func _eded (_gcdd *textLine )float64 {return _gcdd ._ggdd [0].Llx };
|
||
|
||
|
||
// ApplyArea processes the page text only within the specified area `bbox`.
|
||
// Each time ApplyArea is called, it updates the result set in `pt`.
|
||
// Can be called multiple times in a row with different bounding boxes.
|
||
func (_cedbf *PageText )ApplyArea (bbox _bg .PdfRectangle ){_dgfb :=make ([]*textMark ,0,len (_cedbf ._bae ));for _ ,_agbed :=range _cedbf ._bae {if _gcfe (_agbed .bbox (),bbox ){_dgfb =append (_dgfb ,_agbed );};};var _ggfa paraList ;_edbg :=len (_dgfb );
|
||
for _acff :=0;_acff < 360&&_edbg > 0;_acff +=90{_gcg :=make ([]*textMark ,0,len (_dgfb )-_edbg );for _ ,_gag :=range _dgfb {if _gag ._eabg ==_acff {_gcg =append (_gcg ,_gag );};};if len (_gcg )> 0{_dff :=_efde (_gcg ,_cedbf ._abfb ,nil ,nil ,_cedbf ._afc ._add );
|
||
_ggfa =append (_ggfa ,_dff ...);_edbg -=len (_gcg );};};_bfbc :=new (_db .Buffer );_ggfa .writeText (_bfbc );_cedbf ._bffc =_bfbc .String ();_cedbf ._baeg =_ggfa .toTextMarks ();_cedbf ._ceaa =_ggfa .tables ();};
|
||
|
||
// String returns a description of `w`.
|
||
func (_gafe *textWord )String ()string {return _eg .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_gafe ._gdfbg ,_gafe .PdfRectangle ,_gafe ._beaeg ,_gafe ._bdbgb );
|
||
};func (_afgge *ruling )equals (_abbec *ruling )bool {return _afgge ._ffdb ==_abbec ._ffdb &&_bccb (_afgge ._ecdge ,_abbec ._ecdge )&&_bccb (_afgge ._bafb ,_abbec ._bafb )&&_bccb (_afgge ._cbdc ,_abbec ._cbdc );};type textLine struct{_bg .PdfRectangle ;
|
||
_ffbb float64 ;_ggdd []*textWord ;_bbeb float64 ;};var _ecg =TextMark {Text :"\u005b\u0058\u005d",Original :"\u0020",Meta :true ,FillColor :_g .White ,StrokeColor :_g .White };func _ddagf (_cfeda string ,_ebcg []rulingList ){_bb .Log .Info ("\u0024\u0024 \u0025\u0064\u0020g\u0072\u0069\u0064\u0073\u0020\u002d\u0020\u0025\u0073",len (_ebcg ),_cfeda );
|
||
for _ddgg ,_gead :=range _ebcg {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ddgg ,_gead .String ());};};
|
||
|
||
// Extractor stores and offers functionality for extracting content from PDF pages.
|
||
type Extractor struct{_cd string ;_ee *_bg .PdfPageResources ;_ea _bg .PdfRectangle ;_gc *_bg .PdfRectangle ;_cf map[string ]fontEntry ;_ce map[string ]textResult ;_ec int64 ;_ffc int ;_ab *Options ;_ed *_ga .PdfObject ;_eee _ga .PdfObject ;};func (_begg paraList )xNeighbours (_gfcda float64 )map[*textPara ][]int {_cdef :=make ([]event ,2*len (_begg ));
|
||
if _gfcda ==0{for _fegfa ,_eebaa :=range _begg {_cdef [2*_fegfa ]=event {_eebaa .Llx ,true ,_fegfa };_cdef [2*_fegfa +1]=event {_eebaa .Urx ,false ,_fegfa };};}else {for _ddbcf ,_ggddg :=range _begg {_cdef [2*_ddbcf ]=event {_ggddg .Llx -_gfcda *_ggddg .fontsize (),true ,_ddbcf };
|
||
_cdef [2*_ddbcf +1]=event {_ggddg .Urx +_gfcda *_ggddg .fontsize (),false ,_ddbcf };};};return _begg .eventNeighbours (_cdef );};func (_cfgf *textLine )markWordBoundaries (){_gdcb :=_gfeb *_cfgf ._bbeb ;for _fcae ,_daabe :=range _cfgf ._ggdd [1:]{if _dfea (_daabe ,_cfgf ._ggdd [_fcae ])>=_gdcb {_daabe ._bbdga =true ;
|
||
};};};func (_ggfd *shapesState )devicePoint (_bcgc ,_afbd float64 )_cb .Point {_cfede :=_ggfd ._bfac .Mult (_ggfd ._dea );_bcgc ,_afbd =_cfede .Transform (_bcgc ,_afbd );return _cb .NewPoint (_bcgc ,_afbd );};func (_eede rulingList )comp (_eacfd ,_cdad int )bool {_befe ,_gbac :=_eede [_eacfd ],_eede [_cdad ];
|
||
_ddeg ,_cbgeb :=_befe ._ffdb ,_gbac ._ffdb ;if _ddeg !=_cbgeb {return _ddeg > _cbgeb ;};if _ddeg ==_dgeec {return false ;};_bbagcg :=func (_agbgb bool )bool {if _ddeg ==_fcec {return _agbgb ;};return !_agbgb ;};_ggdf ,_cgfb :=_befe ._ecdge ,_gbac ._ecdge ;
|
||
if _ggdf !=_cgfb {return _bbagcg (_ggdf > _cgfb );};_ggdf ,_cgfb =_befe ._bafb ,_gbac ._bafb ;if _ggdf !=_cgfb {return _bbagcg (_ggdf < _cgfb );};return _bbagcg (_befe ._cbdc < _gbac ._cbdc );};func (_cccf pathSection )bbox ()_bg .PdfRectangle {_edbfa :=_cccf ._aadg [0]._egfb [0];
|
||
_ceca :=_bg .PdfRectangle {Llx :_edbfa .X ,Urx :_edbfa .X ,Lly :_edbfa .Y ,Ury :_edbfa .Y };_bcca :=func (_dfca _cb .Point ){if _dfca .X < _ceca .Llx {_ceca .Llx =_dfca .X ;}else if _dfca .X > _ceca .Urx {_ceca .Urx =_dfca .X ;};if _dfca .Y < _ceca .Lly {_ceca .Lly =_dfca .Y ;
|
||
}else if _dfca .Y > _ceca .Ury {_ceca .Ury =_dfca .Y ;};};for _ ,_fcff :=range _cccf ._aadg [0]._egfb [1:]{_bcca (_fcff );};for _ ,_dfdb :=range _cccf ._aadg [1:]{for _ ,_efg :=range _dfdb ._egfb {_bcca (_efg );};};return _ceca ;};func (_bbeg paraList )tables ()[]TextTable {var _ddaba []TextTable ;
|
||
if _aeff {_bb .Log .Info ("\u0070\u0061\u0072\u0061\u0073\u002e\u0074\u0061\u0062\u006c\u0065\u0073\u003a");};for _ ,_cbdg :=range _bbeg {_ccfc :=_cbdg ._gbgg ;if _ccfc !=nil &&_ccfc .isExportable (){_ddaba =append (_ddaba ,_ccfc .toTextTable ());};};return _ddaba ;
|
||
};
|
||
|
||
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
||
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
||
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
||
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
||
type RenderMode int ;func (_fcdca rulingList )connections (_bgff map[int ]intSet ,_cfac int )intSet {_cagcb :=make (intSet );_bfbed :=make (intSet );var _bcdda func (int );_bcdda =func (_edaga int ){if !_bfbed .has (_edaga ){_bfbed .add (_edaga );for _bdfcc :=range _fcdca {if _bgff [_bdfcc ].has (_edaga ){_cagcb .add (_bdfcc );
|
||
};};for _bddg :=range _fcdca {if _cagcb .has (_bddg ){_bcdda (_bddg );};};};};_bcdda (_cfac );return _cagcb ;};func (_bgdeb *structElement )parseStructElement (_bgac _ga .PdfObject ){_ddf ,_fbad :=_ga .GetDict (_bgac );if !_fbad {_bb .Log .Debug ("\u0070\u0061\u0072\u0073\u0065\u0053\u0074\u0072u\u0063\u0074\u0045le\u006d\u0065\u006e\u0074\u003a\u0020d\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006f\u0062\u006a\u0065\u0063t\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075n\u0064\u002e");
|
||
return ;};_abbad :=_ddf .Get ("\u0053");_bdfc :=_ddf .Get ("\u0050\u0067");_abbb :="";if _abbad !=nil {_abbb =_abbad .String ();};_gabc :=_ddf .Get ("\u004b");_bgdeb ._ddag =_abbb ;_bgdeb ._gcbe =_bdfc ;switch _ffgb :=_gabc .(type ){case *_ga .PdfObjectInteger :_bgdeb ._ddag =_abbb ;
|
||
_bgdeb ._dfdgf =int64 (*_ffgb );_bgdeb ._gcbe =_bdfc ;case *_ga .PdfObjectReference :_fdcb :=*_ga .MakeArray (_ffgb );var _fgaf int64 =-1;_bgdeb ._dfdgf =_fgaf ;if _fdcb .Len ()==1{_aeca :=_fdcb .Elements ()[0];_fcad ,_fcge :=_aeca .(*_ga .PdfObjectInteger );
|
||
if _fcge {_fgaf =int64 (*_fcad );_bgdeb ._dfdgf =_fgaf ;_bgdeb ._ddag =_abbb ;_bgdeb ._gcbe =_bdfc ;return ;};};_caaed :=[]structElement {};for _ ,_fcbda :=range _fdcb .Elements (){_bdae ,_agee :=_fcbda .(*_ga .PdfObjectInteger );if _agee {_fgaf =int64 (*_bdae );
|
||
_bgdeb ._dfdgf =_fgaf ;_bgdeb ._ddag =_abbb ;}else {_bege :=&structElement {};_bege .parseStructElement (_fcbda );_caaed =append (_caaed ,*_bege );};_fgaf =-1;};_bgdeb ._acdgb =_caaed ;case *_ga .PdfObjectArray :_eadf :=_gabc .(*_ga .PdfObjectArray );var _ggcb int64 =-1;
|
||
_bgdeb ._dfdgf =_ggcb ;if _eadf .Len ()==1{_cddc :=_eadf .Elements ()[0];_fceg ,_acbg :=_cddc .(*_ga .PdfObjectInteger );if _acbg {_ggcb =int64 (*_fceg );_bgdeb ._dfdgf =_ggcb ;_bgdeb ._ddag =_abbb ;_bgdeb ._gcbe =_bdfc ;return ;};};_fdcg :=[]structElement {};
|
||
for _ ,_feab :=range _eadf .Elements (){_cedgb ,_begc :=_feab .(*_ga .PdfObjectInteger );if _begc {_ggcb =int64 (*_cedgb );_bgdeb ._dfdgf =_ggcb ;_bgdeb ._ddag =_abbb ;_bgdeb ._gcbe =_bdfc ;}else {_afda :=&structElement {};_afda .parseStructElement (_feab );
|
||
_fdcg =append (_fdcg ,*_afda );};_ggcb =-1;};_bgdeb ._acdgb =_fdcg ;};};func (_bee *shapesState )closePath (){if _bee ._abac {_bee ._bfeb =append (_bee ._bfeb ,_fdga (_bee ._bfgg ));_bee ._abac =false ;}else if len (_bee ._bfeb )==0{if _ggbc {_bb .Log .Debug ("\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068");
|
||
};_bee ._abac =false ;return ;};_bee ._bfeb [len (_bee ._bfeb )-1].close ();if _ggbc {_bb .Log .Info ("\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073",_bee );};};
|
||
|
||
// String returns a description of `t`.
|
||
func (_eagde *textTable )String ()string {return _eg .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_eagde ._afaff ,_eagde ._gaaa ,_eagde ._cfabb );};func (_abeae *textPara )text ()string {_gdbe :=new (_db .Buffer );_abeae .writeText (_gdbe );
|
||
return _gdbe .String ();};func (_cdaf rulingList )tidied (_decgc string )rulingList {_bede :=_cdaf .removeDuplicates ();_bede .log ("\u0075n\u0069\u0071\u0075\u0065\u0073");_dcfd :=_bede .snapToGroups ();if _dcfd ==nil {return nil ;};_dcfd .sort ();if _bffg {_bb .Log .Info ("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064",_decgc ,len (_cdaf ),len (_bede ),len (_dcfd ));
|
||
};_dcfd .log ("\u0063o\u0061\u006c\u0065\u0073\u0063\u0065d");return _dcfd ;};
|
||
|
||
// String returns a description of `k`.
|
||
func (_eeffb markKind )String ()string {_aegbg ,_aefc :=_cgbb [_eeffb ];if !_aefc {return _eg .Sprintf ("\u004e\u006f\u0074\u0020\u0061\u0020\u006d\u0061\u0072k\u003a\u0020\u0025\u0064",_eeffb );};return _aegbg ;};const _dfcgb =10;func (_gbae intSet )add (_ffea int ){_gbae [_ffea ]=struct{}{}};
|
||
func _cbcb (_adde []pathSection )rulingList {_dgebc (_adde );if _bffg {_bb .Log .Info ("\u006d\u0061k\u0065\u0053\u0074\u0072\u006f\u006b\u0065\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0064\u0020\u0073\u0074\u0072ok\u0065\u0073",len (_adde ));
|
||
};var _eabf rulingList ;for _ ,_fbbc :=range _adde {for _ ,_bdecf :=range _fbbc ._aadg {if len (_bdecf ._egfb )< 2{continue ;};_gbad :=_bdecf ._egfb [0];for _ ,_ecbd :=range _bdecf ._egfb [1:]{if _fgafg ,_ccffc :=_gdcbd (_gbad ,_ecbd ,_fbbc .Color );_ccffc {_eabf =append (_eabf ,_fgafg );
|
||
};_gbad =_ecbd ;};};};if _bffg {_bb .Log .Info ("m\u0061\u006b\u0065\u0053tr\u006fk\u0065\u0052\u0075\u006c\u0069n\u0067\u0073\u003a\u0020\u0025\u0073",_eabf );};return _eabf ;};func _cafca (_abcgg []*textWord ,_bfdd *textWord )[]*textWord {for _ffgbac ,_afad :=range _abcgg {if _afad ==_bfdd {return _ebecb (_abcgg ,_ffgbac );
|
||
};};_bb .Log .Error ("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_bfdd );
|
||
return nil ;};func (_bcdf *textObject )reset (){_bcdf ._eaaf =_cb .IdentityMatrix ();_bcdf ._bbde =_cb .IdentityMatrix ();_bcdf ._dfge =nil ;};type compositeCell struct{_bg .PdfRectangle ;paraList ;};
|
||
|
||
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
||
func (_ecfg *TextMarkArray )BBox ()(_bg .PdfRectangle ,bool ){var _dbba _bg .PdfRectangle ;_ebdd :=false ;for _ ,_gbbb :=range _ecfg ._dafa {if _gbbb .Meta ||_effeg (_gbbb .Text ){continue ;};if _ebdd {_dbba =_dfba (_dbba ,_gbbb .BBox );}else {_dbba =_gbbb .BBox ;
|
||
_ebdd =true ;};};return _dbba ,_ebdd ;};func (_gbca lineRuling )xMean ()float64 {return 0.5*(_gbca ._gabf .X +_gbca ._fccf .X )};var _cgbb =map[markKind ]string {_feeb :"\u0073\u0074\u0072\u006f\u006b\u0065",_cede :"\u0066\u0069\u006c\u006c",_ebgfa :"\u0061u\u0067\u006d\u0065\u006e\u0074"};
|
||
type shapesState struct{_dea _cb .Matrix ;_bfac _cb .Matrix ;_bfeb []*subpath ;_abac bool ;_bfgg _cb .Point ;_fdc *textObject ;};func _eaeab (_eeae int ,_cbced func (int ,int )bool )[]int {_bbdc :=make ([]int ,_eeae );for _acdc :=range _bbdc {_bbdc [_acdc ]=_acdc ;
|
||
};_ff .Slice (_bbdc ,func (_fcbdd ,_dfccf int )bool {return _cbced (_bbdc [_fcbdd ],_bbdc [_dfccf ])});return _bbdc ;};func (_dcff paraList )reorder (_gcdb []int ){_feadg :=make (paraList ,len (_dcff ));for _edbff ,_dcbf :=range _gcdb {_feadg [_edbff ]=_dcff [_dcbf ];
|
||
};copy (_dcff ,_feadg );};const (_gceg markKind =iota ;_feeb ;_cede ;_ebgfa ;);func (_deab rulingList )log (_fcdc string ){if !_bffg {return ;};_bb .Log .Info ("\u0023\u0023\u0023\u0020\u0025\u0031\u0030\u0073\u003a\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_fcdc ,_deab .String ());
|
||
for _aeagg ,_aegdc :=range _deab {_eg .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_aeagg ,_aegdc .String ());};};func _debfd (_ggfdcb ,_ggfgc _cb .Point )bool {_fgcfe :=_ef .Abs (_ggfdcb .X -_ggfgc .X );_ceafd :=_ef .Abs (_ggfdcb .Y -_ggfgc .Y );
|
||
return _eacaf (_fgcfe ,_ceafd );};func (_fgge *wordBag )highestWord (_acda int ,_gdgb ,_dedb float64 )*textWord {for _ ,_gaff :=range _fgge ._dceb [_acda ]{if _gdgb <=_gaff ._gdfbg &&_gaff ._gdfbg <=_dedb {return _gaff ;};};return nil ;};func (_bcccg *textObject )newTextMark (_beea string ,_abaf _cb .Matrix ,_badg _cb .Point ,_gcdc float64 ,_aeg *_bg .PdfFont ,_abccg float64 ,_dggb ,_ddebd _g .Color ,_egbe _ga .PdfObject ,_afgg []string ,_cbgad int ,_fdcc int )(textMark ,bool ){_dafac :=_abaf .Angle ();
|
||
_aebf :=_dgcff (_dafac ,_cedd );var _fbfg float64 ;if _aebf %180!=90{_fbfg =_abaf .ScalingFactorY ();}else {_fbfg =_abaf .ScalingFactorX ();};_baee :=_acbe (_abaf );_fecfg :=_bg .PdfRectangle {Llx :_baee .X ,Lly :_baee .Y ,Urx :_badg .X ,Ury :_badg .Y };
|
||
switch _aebf %360{case 90:_fecfg .Urx -=_fbfg ;case 180:_fecfg .Ury -=_fbfg ;case 270:_fecfg .Urx +=_fbfg ;case 0:_fecfg .Ury +=_fbfg ;default:_aebf =0;_fecfg .Ury +=_fbfg ;};if _fecfg .Llx > _fecfg .Urx {_fecfg .Llx ,_fecfg .Urx =_fecfg .Urx ,_fecfg .Llx ;
|
||
};if _fecfg .Lly > _fecfg .Ury {_fecfg .Lly ,_fecfg .Ury =_fecfg .Ury ,_fecfg .Lly ;};_cegf :=true ;if _bcccg ._afb ._ea .Width ()> 0{_eecda ,_debf :=_dgde (_fecfg ,_bcccg ._afb ._ea );if !_debf {_cegf =false ;_bb .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_fecfg ,_bcccg ._afb ._ea ,_beea );
|
||
};_fecfg =_eecda ;};_ceae :=_fecfg ;_eebd :=_bcccg ._afb ._ea ;switch _aebf %360{case 90:_eebd .Urx ,_eebd .Ury =_eebd .Ury ,_eebd .Urx ;_ceae =_bg .PdfRectangle {Llx :_eebd .Urx -_fecfg .Ury ,Urx :_eebd .Urx -_fecfg .Lly ,Lly :_fecfg .Llx ,Ury :_fecfg .Urx };
|
||
case 180:_ceae =_bg .PdfRectangle {Llx :_eebd .Urx -_fecfg .Llx ,Urx :_eebd .Urx -_fecfg .Urx ,Lly :_eebd .Ury -_fecfg .Lly ,Ury :_eebd .Ury -_fecfg .Ury };case 270:_eebd .Urx ,_eebd .Ury =_eebd .Ury ,_eebd .Urx ;_ceae =_bg .PdfRectangle {Llx :_fecfg .Ury ,Urx :_fecfg .Lly ,Lly :_eebd .Ury -_fecfg .Llx ,Ury :_eebd .Ury -_fecfg .Urx };
|
||
};if _ceae .Llx > _ceae .Urx {_ceae .Llx ,_ceae .Urx =_ceae .Urx ,_ceae .Llx ;};if _ceae .Lly > _ceae .Ury {_ceae .Lly ,_ceae .Ury =_ceae .Ury ,_ceae .Lly ;};_edfb :=textMark {_gcgb :_beea ,PdfRectangle :_ceae ,_cedgd :_fecfg ,_gcded :_aeg ,_cffb :_fbfg ,_gaac :_abccg ,_ddfg :_abaf ,_ebcb :_badg ,_eabg :_aebf ,_dfbf :_dggb ,_ffdf :_ddebd ,_eggb :_egbe ,_fcbgg :_afgg ,Th :_bcccg ._aed ._dcc ,Tw :_bcccg ._aed ._dedcd ,_egaa :_fdcc ,_fcadf :_cbgad };
|
||
if _abfa {_bb .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_baee ,_badg ,_edfb .String ());};return _edfb ,_cegf ;
|
||
};
|
||
|
||
// String returns a description of `v`.
|
||
func (_cdgb *ruling )String ()string {if _cdgb ._ffdb ==_dgeec {return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047";};_fbdae ,_edbge :="\u0078","\u0079";if _cdgb ._ffdb ==_fcec {_fbdae ,_edbge ="\u0079","\u0078";};_fbbg :="";if _cdgb ._gafcd !=0.0{_fbbg =_eg .Sprintf (" \u0077\u0069\u0064\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_cdgb ._gafcd );
|
||
};return _eg .Sprintf ("\u0025\u00310\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0020\u0025\u0076\u0025\u0073",_cdgb ._ffdb ,_fbdae ,_cdgb ._ecdge ,_edbge ,_cdgb ._bafb ,_cdgb ._cbdc ,_cdgb ._cbdc -_cdgb ._bafb ,_cdgb ._cbbcc ,_cdgb .Color ,_fbbg );
|
||
};var (_da =_b .New ("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072");_eb =_b .New ("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072"););func _aeabd (_ecfec _bg .PdfRectangle ,_bbgdb ,_abfcc ,_caef ,_effbc *ruling )gridTile {_ebcd :=_ecfec .Llx ;
|
||
_aecag :=_ecfec .Urx ;_baabd :=_ecfec .Lly ;_gccca :=_ecfec .Ury ;return gridTile {PdfRectangle :_ecfec ,_gafdd :_bbgdb !=nil &&_bbgdb .encloses (_baabd ,_gccca ),_deage :_abfcc !=nil &&_abfcc .encloses (_baabd ,_gccca ),_faba :_caef !=nil &&_caef .encloses (_ebcd ,_aecag ),_fed :_effbc !=nil &&_effbc .encloses (_ebcd ,_aecag )};
|
||
};func (_caee paraList )sortReadingOrder (){_bb .Log .Trace ("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_caee ));
|
||
if len (_caee )<=1{return ;};_caee .computeEBBoxes ();_ff .Slice (_caee ,func (_accef ,_fgaba int )bool {return _bacd (_caee [_accef ],_caee [_fgaba ])<=0});};
|
||
|
||
// String returns a string describing `ma`.
|
||
func (_dca TextMarkArray )String ()string {_bfad :=len (_dca ._dafa );if _bfad ==0{return "\u0045\u004d\u0050T\u0059";};_fbed :=_dca ._dafa [0];_efd :=_dca ._dafa [_bfad -1];return _eg .Sprintf ("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d",_bfad ,_fbed ,_efd );
|
||
};func _acbe (_ecec _cb .Matrix )_cb .Point {_fgab ,_bgbde :=_ecec .Translation ();return _cb .Point {X :_fgab ,Y :_bgbde };};func _fdae (_bcbeg float64 )float64 {return _bagd *_ef .Round (_bcbeg /_bagd )};func (_deeb *textTable )emptyCompositeRow (_dgafc int )bool {for _cdbfg :=0;
|
||
_cdbfg < _deeb ._afaff ;_cdbfg ++{if _gfgb ,_eggfa :=_deeb ._bgag [_gbbce (_cdbfg ,_dgafc )];_eggfa {if len (_gfgb .paraList )> 0{return false ;};};};return true ;};type subpath struct{_egfb []_cb .Point ;_ffff bool ;};func (_cgbe rulingList )snapToGroupsDirection ()rulingList {_cgbe .sortStrict ();
|
||
_bcdfd :=make (map[*ruling ]rulingList ,len (_cgbe ));_bdgb :=_cgbe [0];_cfbag :=func (_aegce *ruling ){_bdgb =_aegce ;_bcdfd [_bdgb ]=rulingList {_aegce }};_cfbag (_cgbe [0]);for _ ,_dabd :=range _cgbe [1:]{if _dabd ._ecdge < _bdgb ._ecdge -_adcc {_bb .Log .Error ("\u0073\u006e\u0061\u0070T\u006f\u0047\u0072\u006f\u0075\u0070\u0073\u0044\u0069r\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0057\u0072\u006f\u006e\u0067\u0020\u0070\u0072\u0069\u006da\u0072\u0079\u0020\u006f\u0072d\u0065\u0072\u002e\u000a\u0009\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0020\u0076\u003d\u0025\u0073",_bdgb ,_dabd );
|
||
};if _dabd ._ecdge > _bdgb ._ecdge +_ceaf {_cfbag (_dabd );}else {_bcdfd [_bdgb ]=append (_bcdfd [_bdgb ],_dabd );};};_bbegb :=make (map[*ruling ]float64 ,len (_bcdfd ));_gefa :=make (map[*ruling ]*ruling ,len (_cgbe ));for _dcfe ,_dcgbe :=range _bcdfd {_bbegb [_dcfe ]=_dcgbe .mergePrimary ();
|
||
for _ ,_gebda :=range _dcgbe {_gefa [_gebda ]=_dcfe ;};};for _ ,_eccdbe :=range _cgbe {_eccdbe ._ecdge =_bbegb [_gefa [_eccdbe ]];};_abde :=make (rulingList ,0,len (_cgbe ));for _ ,_afgc :=range _bcdfd {_gcfed :=_afgc .splitSec ();for _cggcg ,_babdbe :=range _gcfed {_ebddb :=_babdbe .merge ();
|
||
if len (_abde )> 0{_abaec :=_abde [len (_abde )-1];if _abaec .alignsPrimary (_ebddb )&&_abaec .alignsSec (_ebddb ){_bb .Log .Error ("\u0073\u006e\u0061\u0070\u0054\u006fG\u0072\u006f\u0075\u0070\u0073\u0044\u0069\u0072\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0044\u0075\u0070\u006ci\u0063\u0061\u0074\u0065\u0020\u0069\u003d\u0025\u0064\u000a\u0009\u0077\u003d\u0025s\u000a\t\u0076\u003d\u0025\u0073",_cggcg ,_abaec ,_ebddb );
|
||
continue ;};};_abde =append (_abde ,_ebddb );};};_abde .sortStrict ();return _abde ;};func _gcef (_ceef ,_eaec int )int {if _ceef < _eaec {return _ceef ;};return _eaec ;};func (_cba *imageExtractContext )extractContentStreamImages (_gcd string ,_af *_bg .PdfPageResources )error {_bdg :=_de .NewContentStreamParser (_gcd );
|
||
_gcf ,_ebb :=_bdg .Parse ();if _ebb !=nil {return _ebb ;};if _cba ._dab ==nil {_cba ._dab =map[*_ga .PdfObjectStream ]*cachedImage {};};if _cba ._caa ==nil {_cba ._caa =&ImageExtractOptions {};};_bgba :=_de .NewContentStreamProcessor (*_gcf );_bgba .AddHandler (_de .HandlerConditionEnumAllOperands ,"",_cba .processOperand );
|
||
return _bgba .Process (_af );};func (_dfdc *textPara )writeCellText (_gadee _c .Writer ){for _ecab ,_ccdf :=range _dfdc ._bbagc {_bcfa :=_ccdf .text ();_caca :=_aefg &&_ccdf .endsInHyphen ()&&_ecab !=len (_dfdc ._bbagc )-1;if _caca {_bcfa =_aaaa (_bcfa );
|
||
};_gadee .Write ([]byte (_bcfa ));if !(_caca ||_ecab ==len (_dfdc ._bbagc )-1){_gadee .Write ([]byte (_cegd (_ccdf ._ffbb ,_dfdc ._bbagc [_ecab +1]._ffbb )));};};};func (_bacdc paraList )computeEBBoxes (){if _ffb {_bb .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");
|
||
};for _ ,_aage :=range _bacdc {_aage ._aaga =_aage .PdfRectangle ;};_ccbff :=_bacdc .yNeighbours (0);for _cacc ,_aaca :=range _bacdc {_cgbf :=_aaca ._aaga ;_aacd ,_edbdc :=-1.0e9,+1.0e9;for _ ,_cbfb :=range _ccbff [_aaca ]{_eegd :=_bacdc [_cbfb ]._aaga ;
|
||
if _eegd .Urx < _cgbf .Llx {_aacd =_ef .Max (_aacd ,_eegd .Urx );}else if _cgbf .Urx < _eegd .Llx {_edbdc =_ef .Min (_edbdc ,_eegd .Llx );};};for _adba ,_faea :=range _bacdc {_aaea :=_faea ._aaga ;if _cacc ==_adba ||_aaea .Ury > _cgbf .Lly {continue ;};
|
||
if _aacd <=_aaea .Llx &&_aaea .Llx < _cgbf .Llx {_cgbf .Llx =_aaea .Llx ;}else if _aaea .Urx <=_edbdc &&_cgbf .Urx < _aaea .Urx {_cgbf .Urx =_aaea .Urx ;};};if _ffb {_eg .Printf ("\u0025\u0034\u0064\u003a %\u0036\u002e\u0032\u0066\u2192\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0071\u000a",_cacc ,_aaca ._aaga ,_cgbf ,_egde (_aaca .text (),50));
|
||
};_aaca ._aaga =_cgbf ;};if _cagc {for _ ,_ddfgf :=range _bacdc {_ddfgf .PdfRectangle =_ddfgf ._aaga ;};};};func (_cfcf *textObject )getStrokeColor ()_g .Color {return _bagcde (_cfcf ._fgd .ColorspaceStroking ,_cfcf ._fgd .ColorStroking );};func (_afgf *textTable )depth ()float64 {_ebef :=1e10;
|
||
for _dgcd :=0;_dgcd < _afgf ._afaff ;_dgcd ++{_cbae :=_afgf .get (_dgcd ,0);if _cbae ==nil ||_cbae ._fcag {continue ;};_ebef =_ef .Min (_ebef ,_cbae .depth ());};return _ebef ;};func _effeg (_fdac string )bool {for _ ,_fgfa :=range _fdac {if !_e .IsSpace (_fgfa ){return false ;
|
||
};};return true ;};
|
||
|
||
// GetContentStreamOps returns the contentStreamOps field of `pt`.
|
||
func (_gbg *PageText )GetContentStreamOps ()*_de .ContentStreamOperations {return _gbg ._fad };func (_cbfac *textWord )absorb (_fbee *textWord ){_cbfac .PdfRectangle =_dfba (_cbfac .PdfRectangle ,_fbee .PdfRectangle );_cbfac ._dbff =append (_cbfac ._dbff ,_fbee ._dbff ...);
|
||
};
|
||
|
||
// String returns a string describing `pt`.
|
||
func (_cfgc PageText )String ()string {_agef :=_eg .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_cfgc ._bae ));_aggd :=[]string {"\u002d"+_agef };for _ ,_bgdf :=range _cfgc ._bae {_aggd =append (_aggd ,_bgdf .String ());
|
||
};_aggd =append (_aggd ,"\u002b"+_agef );return _f .Join (_aggd ,"\u000a");};func _ebecb (_acgb []*textWord ,_gfca int )[]*textWord {_ddcge :=len (_acgb );copy (_acgb [_gfca :],_acgb [_gfca +1:]);return _acgb [:_ddcge -1];};type textPara struct{_bg .PdfRectangle ;
|
||
_aaga _bg .PdfRectangle ;_bbagc []*textLine ;_gbgg *textTable ;_ggcbf bool ;_fcag bool ;_dgaf *textPara ;_gdfgd *textPara ;_eeba *textPara ;_daag *textPara ;_gdge []list ;};
|
||
|
||
// String returns a description of `l`.
|
||
func (_bbdea *textLine )String ()string {return _eg .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_bbdea ._ffbb ,_bbdea .PdfRectangle ,_bbdea ._bbeb ,_bbdea .text ());
|
||
};func (_bag *PageFonts )extractPageResourcesToFont (_bfc *_bg .PdfPageResources )error {_bgbg ,_ffcb :=_ga .GetDict (_bfc .Font );if !_ffcb {return _b .New (_bcc );};for _ ,_cfd :=range _bgbg .Keys (){var (_fc =true ;_ca []byte ;_bge string ;);_eac ,_faf :=_bfc .GetFontByName (_cfd );
|
||
if !_faf {return _b .New (_ag );};_cge ,_gdga :=_bg .NewPdfFontFromPdfObject (_eac );if _gdga !=nil {return _gdga ;};_ebd :=_cge .FontDescriptor ();_dd :=_cge .FontDescriptor ().FontName .String ();_cgf :=_cge .Subtype ();if _fcd (_bag .Fonts ,_dd ){continue ;
|
||
};if len (_cge .ToUnicode ())==0{_fc =false ;};if _ebd .FontFile !=nil {if _cgc ,_aa :=_ga .GetStream (_ebd .FontFile );_aa {_ca ,_gdga =_ga .DecodeStream (_cgc );if _gdga !=nil {return _gdga ;};_bge =_dd +"\u002e\u0070\u0066\u0062";};}else if _ebd .FontFile2 !=nil {if _gg ,_cgec :=_ga .GetStream (_ebd .FontFile2 );
|
||
_cgec {_ca ,_gdga =_ga .DecodeStream (_gg );if _gdga !=nil {return _gdga ;};_bge =_dd +"\u002e\u0074\u0074\u0066";};}else if _ebd .FontFile3 !=nil {if _gef ,_abb :=_ga .GetStream (_ebd .FontFile3 );_abb {_ca ,_gdga =_ga .DecodeStream (_gef );if _gdga !=nil {return _gdga ;
|
||
};_bge =_dd +"\u002e\u0063\u0066\u0066";};};if len (_bge )< 1{_bb .Log .Debug (_gad );};_eef :=Font {FontName :_dd ,PdfFont :_cge ,IsCID :_cge .IsCID (),IsSimple :_cge .IsSimple (),ToUnicode :_fc ,FontType :_cgf ,FontData :_ca ,FontFileName :_bge ,FontDescriptor :_ebd };
|
||
_bag .Fonts =append (_bag .Fonts ,_eef );};return nil ;};func _bbbe (_eacd []*textLine ,_abee ,_feag float64 )[]*textLine {var _eaaff []*textLine ;for _ ,_fgbc :=range _eacd {if _abee ==-1{if _fgbc ._ffbb > _feag {_eaaff =append (_eaaff ,_fgbc );};}else {if _fgbc ._ffbb > _feag &&_fgbc ._ffbb < _abee {_eaaff =append (_eaaff ,_fgbc );
|
||
};};};return _eaaff ;};func (_bfbag rectRuling )checkWidth (_dddde ,_bgga float64 )(float64 ,bool ){_aaee :=_bgga -_dddde ;_affcc :=_aaee <=_ceaf ;return _aaee ,_affcc ;};func (_aeab lineRuling )yMean ()float64 {return 0.5*(_aeab ._gabf .Y +_aeab ._fccf .Y )};
|