2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
// This is a commercial product and requires a license to operate.
|
|
|
|
|
// A trial license can be obtained at https://unidoc.io
|
|
|
|
|
//
|
|
|
|
|
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
|
|
|
|
//
|
|
|
|
|
// Use of this source code is governed by the UniDoc End User License Agreement
|
|
|
|
|
// terms that can be accessed at https://unidoc.io/eula/
|
2018-03-22 14:03:47 +00:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
|
|
|
|
// Currently offers functionality for extracting textual content.
|
|
|
|
|
//
|
2020-09-21 01:20:10 +00:00
|
|
|
|
package extractor ;import (_fc "bytes";_c "errors";_db "fmt";_df "github.com/unidoc/unipdf/v3/common";_dbd "github.com/unidoc/unipdf/v3/common/license";_gb "github.com/unidoc/unipdf/v3/contentstream";_ef "github.com/unidoc/unipdf/v3/core";_e "github.com/unidoc/unipdf/v3/internal/textencoding";_b "github.com/unidoc/unipdf/v3/internal/transform";_ab "github.com/unidoc/unipdf/v3/model";_gc "golang.org/x/text/unicode/norm";_ca "golang.org/x/xerrors";_cf "image/color";_g "io";_ge "math";_ac "regexp";_f "sort";_d "strings";_ad "unicode";_cc "unicode/utf8";);func _efff (_dcbd ,_ffeab _ab .PdfRectangle )(_ab .PdfRectangle ,bool ){if !_gab (_dcbd ,_ffeab ){return _ab .PdfRectangle {},false ;};return _ab .PdfRectangle {Llx :_ge .Max (_dcbd .Llx ,_ffeab .Llx ),Urx :_ge .Min (_dcbd .Urx ,_ffeab .Urx ),Lly :_ge .Max (_dcbd .Lly ,_ffeab .Lly ),Ury :_ge .Min (_dcbd .Ury ,_ffeab .Ury )},true ;};
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a description of `b`.
|
|
|
|
|
// String formats the bag's rectangle, its `_ffed` value, the number of words
// and the quoted word texts, visiting words in the order given by
// depthIndexes().
func (_eefe *wordBag) String() string {
	var texts []string
	for _, depthIdx := range _eefe.depthIndexes() {
		// A missing map entry yields a nil slice, which ranges zero times.
		for _, word := range _eefe._cgba[depthIdx] {
			texts = append(texts, word._bbdbfb)
		}
	}
	return _db.Sprintf("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071", _eefe.PdfRectangle, _eefe._ffed, len(texts), texts)
}

// _bccaa reports whether the left edge of `_bbad` lies in the half-open
// interval [Urx, Urx+_fafd) measured from the right edge of `_ggdc`.
func _bccaa(_ggdc *wordBag, _bbad *textWord, _fafd float64) bool {
	left := _bbad.Llx
	if !(_ggdc.Urx <= left) {
		return false
	}
	return left < _ggdc.Urx+_fafd
}

// _fgfa returns the translation component of `_ebd` as a point.
func _fgfa(_ebd _b.Matrix) _b.Point {
	var pt _b.Point
	pt.X, pt.Y = _ebd.Translation()
	return pt
}

// reorder rearranges the list in place so that position i holds the element
// that was at position `_dbec[i]`.
func (_eggfa paraList) reorder(_dbec []int) {
	// Build the permuted list in scratch space first so sources are not overwritten.
	scratch := make(paraList, len(_eggfa))
	for dst := 0; dst < len(_dbec); dst++ {
		scratch[dst] = _eggfa[_dbec[dst]]
	}
	copy(_eggfa, scratch)
}

// _bbdgg classifies the segment between the two points by passing its absolute
// x and y extents to _fbdgg.
func _bbdgg(_aggc, _aafg _b.Point) rulingKind {
	return _fbdgg(_ge.Abs(_aggc.X-_aafg.X), _ge.Abs(_aggc.Y-_aafg.Y))
}

// _edfgf reports whether the x-ranges of `_ggcg` and `_gccg` overlap once the
// bag's range is widened by `_ebde` on both sides.
func _edfgf(_ggcg *wordBag, _gccg *textWord, _ebde float64) bool {
	if !(_gccg.Llx < _ggcg.Urx+_ebde) {
		return false
	}
	return _ggcg.Llx-_ebde < _gccg.Urx
}

// fontsize returns the `_bafec` value of the first line of the paragraph.
// It panics if the paragraph has no lines.
func (_fdae *textPara) fontsize() float64 {
	first := _fdae._efad[0]
	return first._bafec
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a human readable description of `ss`.
|
|
|
|
|
func (_ceab *shapesState )String ()string {return _db .Sprintf ("%\u0064 \u0073\u0075\u0062\u0070\u0061\u0074\u0068\u0073 \u0066\u0072\u0065\u0073h=\u0025\u0074",len (_ceab ._gggc ),_ceab ._adc );};func _bbdbc (_deac string )string {_aefa :=[]rune (_deac );return string (_aefa [:len (_aefa )-1])};func _bdae (_adag []rune )bool {return len (_adag )>=_bgbb &&_ad .Is (_ad .Hyphen ,_adag [len (_adag )-1])&&!_ad .IsSpace (_adag [len (_adag )-2]);};func (_eff *stateStack )pop ()*textState {if _eff .empty (){return nil ;};_bee :=*(*_eff )[len (*_eff )-1];*_eff =(*_eff )[:len (*_eff )-1];return &_bee ;};func _edef (_fbgc ,_egacb *textPara )bool {return _decfa (_fbgc .depth ()-_egacb .depth ())};const (_dce =true ;_facce =true ;_dgeg =true ;_gfae =false ;_cfe =false ;_bagc =6;_ddgg =3.0;_bgeg =200;_gfag =true ;_bgcd =true ;_afba =true ;_eeea =true ;);const (RenderModeStroke RenderMode =1<<iota ;RenderModeFill ;RenderModeClip ;);func _fbdgg (_bead ,_fgbca float64 )rulingKind {if _bead >=_dgde &&_fbc (_fgbca ,_bead ){return _cbdac ;};if _fgbca >=_dgde &&_fbc (_bead ,_fgbca ){return _acab ;};return _agga ;};func (_aebd *subpath )makeRectRuling ()(*ruling ,bool ){if _geac {_df .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_aebd );};_gfcd :=_aebd ._acddg [:4];_fead :=make (map[int ]rulingKind ,len (_gfcd ));for _gdba ,_fbede :=range _gfcd {_ccce :=_aebd ._acddg [(_gdba +1)%4];_fead [_gdba ]=_bbdgg (_fbede ,_ccce );};if _geac {_db .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_fead );};var _facb ,_bgca []int ;for _ddbe ,_fbdga :=range _fead {switch _fbdga {case _cbdac :_bgca =append (_bgca ,_ddbe );case _acab :_facb =append (_facb ,_ddbe );};};if _geac {_db .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_bgca ),_bgca );_db .Printf ("\u0020\u0020 
\u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_facb ),_facb );};_cgaaa :=(len (_bgca )==2&&len (_facb )==2)||(len (_bgca )==2&&len (_facb )==0&&_eeab (_gfcd [_bgca [0]],_gfcd [_bgca [1]]))||(len (_facb )==2&&len (_bgca )==0&&_feca (_gfcd [_facb [0]],_gfcd [_facb [1]]));if _geac {_db .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_bgca ),len (_facb ),_cgaaa );};if !_cgaaa {return &ruling {},false ;};if len (_facb )==0{for _dggga ,_gabe :=range _fead {if _gabe !=_cbdac {_facb =append (_facb ,_dggga );};};};if len (_bgca )==0{for _cbcgc ,_gdfga :=range _fead {if _gdfga !=_acab {_bgca =append (_bgca ,_cbcgc );};};};if _geac {_df .Log .Info ("\u0020\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025\u0064 \u0076\u0065\u0072\u0074\u0073\u003d\u0025d\u0020\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076",len (_bgca ),len (_facb ),len (_gfcd ),_bgca ,_facb ,_gfcd );};var _ecfg ,_gafb ,_fcfg ,_gbbfb _b .Point ;if _gfcd [_bgca [0]].Y > _gfcd [_bgca [1]].Y {_fcfg ,_gbbfb =_gfcd [_bgca [0]],_gfcd [_bgca [1]];}else {_fcfg ,_gbbfb =_gfcd [_bgca [1]],_gfcd [_bgca [0]];};if _gfcd [_facb [0]].X > _gfcd [_facb [1]].X {_ecfg ,_gafb =_gfcd [_facb [0]],_gfcd [_facb [1]];}else {_ecfg ,_gafb =_gfcd [_facb [1]],_gfcd [_facb [0]];};_gcff :=_ab .PdfRectangle {Llx :_ecfg .X ,Urx :_gafb .X ,Lly :_gbbfb .Y ,Ury :_fcfg .Y };if _gcff .Llx > _gcff .Urx {_gcff .Llx ,_gcff .Urx =_gcff .Urx ,_gcff .Llx ;};if _gcff .Lly > _gcff .Ury {_gcff .Lly ,_gcff .Ury =_gcff .Ury ,_gcff .Lly ;};_bfgg :=rectRuling {PdfRectangle :_gcff ,_eagg :_dbfa (_gcff )};if _bfgg ._eagg ==_agga {return nil ,false ;};_fcge ,_fgfaa :=_bfgg .asRuling ();if !_fgfaa {return nil 
,false ;};if _geac {_db .Printf ("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a",_fcge .String ());};return _fcge ,true ;};func _baaaf (_gc
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
|
|
|
|
func (_fceg *Extractor )ExtractTextWithStats ()(_bed string ,_dgef int ,_eaa int ,_ggf error ){_ccf ,_dgef ,_eaa ,_ggf :=_fceg .ExtractPageText ();if _ggf !=nil {return "",_dgef ,_eaa ,_ggf ;};return _ccf .Text (),_dgef ,_eaa ,nil ;};func (_bbag *shapesState )clearPath (){_bbag ._gggc =nil ;_bbag ._adc =false ;if _fcfd {_df .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_bbag );};};func (_fecfd paraList )tables ()[]TextTable {var _cfgbe []TextTable ;for _ ,_aedc :=range _fecfd {_cegd :=_aedc ._accb ;if _cegd !=nil &&_cegd .isExportable (){_cfgbe =append (_cfgbe ,_cegd .toTextTable ());};};return _cfgbe ;};func _cbcd (_agdc ,_adba _b .Point )(*ruling ,bool ){_bceb :=lineRuling {_afad :_agdc ,_baacb :_adba ,_cdcb :_bbdgg (_agdc ,_adba )};if _bceb ._cdcb ==_agga {return nil ,false ;};return _bceb .asRuling ();};func (_aaf *textObject )setTextLeading (_afa float64 ){if _aaf ==nil {return ;};_aaf ._fdag ._aede =_afa ;};const (_dcbg =1.0e-6;_gcg =1.0e-4;_fcfdg =10;_gdaf =6;_fegec =0.5;_defbd =0.11;_aaac =0.19;_gfdg =0.04;_adce =0.04;_gbeg =1.0;_affb =0.04;_agcf =0.4;_dfca =0.7;_dea =1.0;_dacg =0.1;_defc =1.4;_dcc =0.46;_cacfd =0.02;_ecdb =0.2;_fbeb =0.5;_bgbb =4;_degd =4.0;_bae =6;_gcea =0.01;_bbdb =0.02;_gbcg =2;_bagcg =2;_dgde =10.0;_feab =0.05;_bfb =0.3;_cded =1.0;_bdeb =1.0;);func (_gbgf *ruling )intersects (_fcca *ruling )bool {_faef :=(_gbgf ._aead ==_acab &&_fcca ._aead ==_cbdac )||(_fcca ._aead ==_acab &&_gbgf ._aead ==_cbdac );_cede :=func (_bdcg ,_abadc *ruling )bool {return _bdcg ._egaa <=_abadc ._ffd +_bfb &&_abadc ._ffd -_bfb <=_bdcg ._dcfe ;};_bdbb :=_cede (_gbgf ,_fcca );_dedbd :=_cede (_fcca ,_gbgf );if _geac {_df .Log .Info 
("\u0069\u006e\u0074\u0065r\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006ft\u0068\u006f\u0067\u006f\u006e\u0061\u006c\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u002d>\u0020\u0025\u0074\u000a\u0009\u0020\u0076\u003d\u0025\u0073\u000a\u0009\u0076\u0032\u003d\u0025\u0073",_faef ,_bdbb ,_dedbd ,_faef &&_bdbb &&_dedbd ,_gbgf ,_fcca );};return _faef &&_bdbb &&_dedbd ;};func (_gagaed *textWord )appendMark (_dfcd *textMark ,_fgdd _ab .PdfRectangle ){_gagaed ._gbgg =append (_gagaed ._gbgg ,_dfcd );_gagaed .PdfRectangle =_ddac (_gagaed .PdfRectangle ,_dfcd .PdfRectangle );if _dfcd ._ebbg > _gagaed ._bbcaa {_gagaed ._bbcaa =_dfcd ._ebbg ;};_gagaed ._aggfa =_fgdd .Ury -_gagaed .PdfRectangle .Lly ;};func (_ccag *shapesState )establishSubpath ()*subpath {_cegfe ,_dgcb :=_ccag .lastpointEstablished ();if !_dgcb {_ccag ._gggc =append (_ccag ._gggc ,_dgdc (_cegfe ));};if len (_ccag ._gggc )==0{return nil ;};_ccag ._adc =false ;return _ccag ._gggc [len (_ccag ._gggc )-1];};func (_bdcf rulingList )vertsHorzs ()(rulingList ,rulingList ){var _dfbbe ,_ddge rulingList ;for _ ,_ebff :=range _bdcf {switch _ebff ._aead {case _acab :_dfbbe =append (_dfbbe ,_ebff );case _cbdac :_ddge =append (_ddge ,_ebff );};};return _dfbbe ,_ddge ;};func (_gfeb *stateStack )size ()int {return len (*_gfeb )};func _fagd (_beedg string ,_bgbgc int )string {if len (_beedg )< _bgbgc {return _beedg ;};return _beedg [:_bgbgc ];};func (_gffc *stateStack )push (_bab *textState ){_cdb :=*_bab ;*_gffc =append (*_gffc ,&_cdb )};type imageExtractContext struct{_bec []ImageMark ;_gd int ;_bb int ;_ff int ;_da map[*_ef .PdfObjectStream ]*cachedImage ;_bg *ImageExtractOptions ;};func (_cbc *textObject )reset (){_cbc ._aff =_b .IdentityMatrix ();_cbc ._dgg =_b .IdentityMatrix ();_cbc ._dgce =nil ;};func (_edfb *textPara )toTextMarks (_fgeg *int )[]TextMark {if _edfb ._accb ==nil {return _edfb .toCellTextMarks (_fgeg );};var _dcdf []TextMark ;for _fgfde :=0;_fgfde < 
_edfb ._accb ._bged ;_fgfde ++{for _dcggc :=0;_dcggc < _edfb ._accb ._cfgec ;_dcggc ++{_acbd :=_edfb ._accb .get (_dcggc ,_fgfde );if _acbd ==nil {_dcdf =_gegg (_dcdf ,_fgeg ,"\u0009");}else {_ggdce :=_acbd .toCellTextMarks (_fgeg );_dcdf =append (_dcdf ,_ggdce ...);};_dcdf =_gegg (_dcdf ,_fgeg ,"\u0020");};if _fgfde < _edfb ._accb ._bged -1{_dcdf =_gegg (_dcdf ,_fgeg ,"\u000a");};};return _dcdf ;}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
|
|
|
func NewFromContents (contents string ,resources *_ab .PdfPageResources )(*Extractor ,error ){_eg :=&Extractor {_ade :contents ,_dd :resources ,_dde :map[string ]fontEntry {},_cg :map[string ]textResult {}};return _eg ,nil ;};func (_edgg *shapesState )fill (_gace *[]*subpath ){*_gace =append (*_gace ,_edgg ._gggc ...);if _geac {_df .Log .Info ("\u0046\u0049L\u004c\u003a\u0020\u0025\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006e\u0065\u0077\u0029\u0020\u0073s=\u0025\u0073",len (*_gace ),len (_edgg ._gggc ),_edgg );for _egb ,_dbe :=range _edgg ._gggc {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_egb ,_dbe );if _egb ==10{break ;};};};};func (_fdeg *textPara )toCellTextMarks (_gfbg *int )[]TextMark {var _fdff []TextMark ;for _cdedc ,_dcbe :=range _fdeg ._efad {_gcefc :=_dcbe .toTextMarks (_gfbg );_cafg :=_dce &&_dcbe .endsInHyphen ()&&_cdedc !=len (_fdeg ._efad )-1;if _cafg {_gcefc =_gcagdd (_gcefc ,_gfbg );};_fdff =append (_fdff ,_gcefc ...);if !(_cafg ||_cdedc ==len (_fdeg ._efad )-1){_fdff =_gegg (_fdff ,_gfbg ,_cbe (_dcbe ._adcf ,_fdeg ._efad [_cdedc +1]._adcf ));};};return _fdff ;};func (_faa *imageExtractContext )processOperand (_ee *_gb .ContentStreamOperation ,_adb _gb .GraphicsState ,_gg *_ab .PdfPageResources )error {if _ee .Operand =="\u0042\u0049"&&len (_ee .Params )==1{_dcd ,_dg :=_ee .Params [0].(*_gb .ContentStreamInlineImage );if !_dg {return nil ;};if _dgc ,_acdc :=_ef .GetBoolVal (_dcd .ImageMask );_acdc {if _dgc &&!_faa ._bg .IncludeInlineStencilMasks {return nil ;};};return _faa .extractInlineImage (_dcd ,_adb ,_gg );}else if _ee .Operand =="\u0044\u006f"&&len (_ee .Params )==1{_gaa ,_cb :=_ef .GetName (_ee .Params [0]);if !_cb {_df .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _cac ;};_ ,_becg :=_gg .GetXObjectByName (*_gaa );switch _becg {case _ab .XObjectTypeImage :return _faa .extractXObjectImage (_gaa ,_adb ,_gg );case _ab .XObjectTypeForm :return _faa 
.extractFormImages (_gaa ,_adb ,_gg );};};return nil ;};func (_adcce lineRuling )yDelta ()float64 {return _ge .Abs (_adcce ._baacb .Y -_adcce ._baacb .Y )};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
// String returns a string describing `ma`: the element count plus the first
// and last elements, or a fixed marker when the array is empty.
func (_fgg TextMarkArray) String() string {
	count := len(_fgg._ggfd)
	if count == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	first := _fgg._ggfd[0]
	last := _fgg._ggfd[count-1]
	return _db.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", count, first, last)
}

// equals reports whether both rulings have the same kind and the same `_ffd`,
// `_egaa` and `_dcfe` values.
func (_aabb *ruling) equals(_gfgfd *ruling) bool {
	return _aabb._aead == _gfgfd._aead &&
		_aabb._ffd == _gfgfd._ffd &&
		_aabb._egaa == _gfgfd._egaa &&
		_aabb._dcfe == _gfgfd._dcfe
}

// toTextMarks returns the text marks of every word on the line, inserting a
// space mark before each word whose `_egce` flag is set.
func (_fcce *textLine) toTextMarks(_fabc *int) []TextMark {
	var marks []TextMark
	for _, word := range _fcce._fbae {
		if word._egce {
			marks = _gegg(marks, _fabc, "\u0020")
		}
		marks = append(marks, word.toTextMarks(_fabc)...)
	}
	return marks
}

// _ce is the recursion level above which extractPageText gives up with an error.
const _ce = 20

// wordBag is a rectangle together with the words it holds, grouped in a map
// keyed by an integer index.
type wordBag struct {
	_ab.PdfRectangle
	_ffed float64
	_accg float64
	_cgba map[int][]*textWord
}

// _bccc returns a new slice holding the elements of `_eedgc` in reverse order.
func _bccc(_eedgc []int) []int {
	reversed := make([]int, len(_eedgc))
	for i, v := range _eedgc {
		reversed[len(_eedgc)-1-i] = v
	}
	return reversed
}

// computeBbox returns the union of the rectangles of all non-nil cells of the
// table, visiting cells row by row. A table without any non-nil cell yields
// the zero rectangle.
//
// The previous implementation dereferenced every cell of row 0 without a nil
// check, although it nil-checked all later rows; a nil cell in the first row
// (or an empty table) caused a nil pointer panic.
func (_cddg *textTable) computeBbox() _ab.PdfRectangle {
	var bbox _ab.PdfRectangle
	seeded := false
	for row := 0; row < _cddg._bged; row++ {
		for col := 0; col < _cddg._cfgec; col++ {
			cell := _cddg.get(col, row)
			if cell == nil {
				continue
			}
			if !seeded {
				// The first non-nil cell seeds the box so the zero rectangle
				// does not leak into the union.
				bbox = cell.PdfRectangle
				seeded = true
				continue
			}
			bbox = _ddac(bbox, cell.PdfRectangle)
		}
	}
	return bbox
}
|
|
|
|
|
|
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
|
|
|
|
// Elements returns the TextMarks in `ma`. The returned slice is the array's
// own backing slice, not a copy.
func (_fgfd *TextMarkArray) Elements() []TextMark {
	return _fgfd._ggfd
}

// _aecd starts a new line from the first word of `_deae` at index `_fccg`: the
// line takes that word's rectangle, `_bbcaa` and `_aggfa` values, and the word
// is then moved into the line with pullWord.
func _aecd(_deae *wordBag, _fccg int) *textLine {
	seed := _deae.firstWord(_fccg)
	line := &textLine{
		PdfRectangle: seed.PdfRectangle,
		_bafec:       seed._bbcaa,
		_adcf:        seed._aggfa,
	}
	line.pullWord(_deae, seed, _fccg)
	return line
}
|
|
|
|
|
|
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
|
|
|
|
// Replace with a function like Extract() (*PageText, error)
|
|
|
|
|
func (_eba *Extractor )ExtractPageText ()(*PageText ,int ,int ,error ){_cbg ,_agf ,_acb ,_fad :=_eba .extractPageText (_eba ._ade ,_eba ._dd ,_b .IdentityMatrix (),0);if _fad !=nil {return nil ,0,0,_fad ;};_cbg .computeViews ();_fad =_gcfd (_cbg );if _fad !=nil {return nil ,0,0,_fad ;};return _cbg ,_agf ,_acb ,nil ;};func (_decbad *textWord )bbox ()_ab .PdfRectangle {return _decbad .PdfRectangle };const _eeg =1.0/1000.0;func _dbeaf (_fcac *wordBag ,_fbdd float64 )[]*wordBag {var _caeg []*wordBag ;for _ ,_acfd :=range _fcac .depthIndexes (){_baag :=false ;for !_fcac .empty (_acfd ){_cdbf :=_fcac .firstReadingIndex (_acfd );_gfefe :=_fcac .firstWord (_cdbf );_gaad :=_eefc (_gfefe ,_fbdd );_fcac .removeWord (_gfefe ,_cdbf );if _bbcd {_df .Log .Info ("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073",_gfefe .String ());};for _eefae :=true ;_eefae ;_eefae =_baag {_baag =false ;_bfda :=_dea *_gaad ._ffed ;_bdaa :=_agcf *_gaad ._ffed ;_fgdf :=_gbeg *_gaad ._ffed ;if _bbcd {_df .Log .Info ("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066",_gaad .minDepth (),_gaad .maxDepth (),_fgdf ,_bdaa );};if _fcac .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_gaad ,_acee (_edfgf ,0),_gaad .minDepth ()-_fgdf ,_gaad .maxDepth ()+_fgdf ,_affb ,false ,false )> 0{_baag =true ;};if _fcac .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_gaad ,_acee (_edfgf ,_bdaa ),_gaad .minDepth (),_gaad .maxDepth (),_dfca ,false ,false )> 0{_baag =true ;};if _baag {continue ;};_febgb :=_fcac .scanBand ("",_gaad ,_acee (_bccaa ,_bfda ),_gaad .minDepth (),_gaad 
.maxDepth (),_dacg ,true ,false );if _febgb > 0{_agba :=(_gaad .maxDepth ()-_gaad .minDepth ())/_gaad ._ffed ;if (_febgb > 1&&float64 (_febgb )> 0.3*_agba )||_febgb <=10{if _fcac .scanBand ("\u006f\u0074\u0068e\u0072",_gaad ,_acee (_bccaa ,_bfda ),_gaad .minDepth (),_gaad .maxDepth (),_dacg ,false ,true )> 0{_baag =true ;};};};};_caeg =append (_caeg ,_gaad );};};return _caeg ;};func (_ccge *textTable )put (_dcdb ,_bcabe int ,_ccba *textPara ){_ccge ._acaf [_ggfc (_dcdb ,_bcabe )]=_ccba ;};
|
|
|
|
|
|
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
// PageText represents the layout of text on a device page.
type PageText struct {
	_ebfe []*textMark       // raw marks collected by extractPageText
	_bdc  string            // text written by computeViews
	_dedb []TextMark        // text marks produced by computeViews
	_acdd []TextTable       // tables produced by computeViews
	_bgb  _ab.PdfRectangle  // rectangle copied from the Extractor
	_bbbg []*subpath        // subpaths fed to `_gagc` in computeViews
	_edf  []*subpath        // subpaths fed to `_cfac` in computeViews
}

// text returns the concatenated text of the words on the line, with a space in
// front of every word whose `_egce` flag is set.
func (_dccd *textLine) text() string {
	var sb _d.Builder
	for _, word := range _dccd._fbae {
		if word._egce {
			sb.WriteString("\u0020")
		}
		sb.WriteString(word._bbdbfb)
	}
	return sb.String()
}

// _dbcf builds a word from `_baff`: its rectangle is the union of the mark
// rectangles, `_bbcaa` is the largest `_ebbg` among the marks, and `_aggfa` is
// the distance from the top of `_becb` to the bottom of the word. `_baff`
// must not be empty.
func _dbcf(_baff []*textMark, _becb _ab.PdfRectangle) *textWord {
	head, rest := _baff[0], _baff[1:]
	bounds, largest := head.PdfRectangle, head._ebbg
	for _, mark := range rest {
		bounds = _ddac(bounds, mark.PdfRectangle)
		if mark._ebbg > largest {
			largest = mark._ebbg
		}
	}
	word := &textWord{
		PdfRectangle: bounds,
		_gbgg:        _baff,
		_aggfa:       _becb.Ury - bounds.Lly,
		_bbcaa:       largest,
	}
	return word
}

// addDiacritic appends `_gccd` to the text of the word's last mark and
// normalizes the result to NFKC. The word must have at least one mark.
func (_bacd *textWord) addDiacritic(_gccd string) {
	last := _bacd._gbgg[len(_bacd._gbgg)-1]
	last._afca = _gc.NFKC.String(last._afca + _gccd)
}

// showText renders `_ggcb` through renderText.
func (_def *textObject) showText(_ggcb []byte) error {
	return _def.renderText(_ggcb)
}

var _eb = false
|
|
|
|
|
|
|
|
|
|
// Tables returns the tables extracted from the page.
|
|
|
|
|
// Tables returns the tables extracted from the page.
func (_aea PageText) Tables() []TextTable {
	return _aea._acdd
}

// _efab returns the single parameter of `_dff` as a float64. Any other
// parameter count is logged and reported as an error.
func _efab(_dff *_gb.ContentStreamOperation) (float64, error) {
	if len(_dff.Params) == 1 {
		return _ef.GetNumberAsFloat(_dff.Params[0])
	}
	countErr := _c.New("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et")
	_df.Log.Debug("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076", _dff.Operand, 1, len(_dff.Params), _dff.Params)
	return 0.0, countErr
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `state`.
|
|
|
|
|
func (_dbda *textState )String ()string {_deb :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _dbda ._feb !=nil {_deb =_dbda ._feb .BaseFont ();};return _db .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_dbda ._bfef ,_dbda ._ebc ,_dbda ._gga ,_deb );};func (_ddca rulingList )sort (){_f .Slice (_ddca ,func (_fefdd ,_ccdd int )bool {return _ddca .comp (_fefdd ,_ccdd )});};func (_bcfg paraList )computeEBBoxes (){if _gbbf {_df .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_gbbe :=range _bcfg {_gbbe ._agbd =_gbbe .PdfRectangle ;};_acaee :=_bcfg .yNeighbours (0);for _gcde ,_cfaf :=range _bcfg {_gdbb :=_cfaf ._agbd ;_cdfg ,_efge :=-1.0e9,+1.0e9;for _ ,_deda :=range _acaee [_cfaf ]{_baace :=_bcfg [_deda ]._agbd ;if _baace .Urx < _gdbb .Llx {_cdfg =_ge .Max (_cdfg ,_baace .Urx );}else if _gdbb .Urx < _baace .Llx {_efge =_ge .Min (_efge ,_baace .Llx );};};for _fefd ,_ggdb :=range _bcfg {_dgdg :=_ggdb ._agbd ;if _gcde ==_fefd ||_dgdg .Ury > _gdbb .Lly {continue ;};if _cdfg <=_dgdg .Llx &&_dgdg .Llx < _gdbb .Llx {_gdbb .Llx =_dgdg .Llx ;}else if _dgdg .Urx <=_efge &&_gdbb .Urx < _dgdg .Urx {_gdbb .Urx =_dgdg .Urx ;};};if _gbbf {_db .Printf ("\u0025\u0034\u0064\u003a %\u0036\u002e\u0032\u0066\u2192\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0071\u000a",_gcde ,_cfaf ._agbd ,_gdbb ,_fagd (_cfaf .text (),50));};_cfaf ._agbd =_gdbb ;};if _gfae {for _ ,_effc :=range _bcfg {_effc .PdfRectangle =_effc ._agbd ;};};};func (_baae *PageText )computeViews (){_cce :=_gagc (_baae ._bbbg );_fga :=_cfac (_baae ._edf );var _ddg []rulingList ;if _afba {_ddg =append (_ddg ,_cce ...);};if _eeea {_ddg =append (_ddg ,_fga ...);};if _geac {if len (_cce )> 0{_df .Log .Info ("S\u0074\u0072\u006f\u006b\u0065\u0073\u003a\u0020\u0025\u0064",len (_baae ._bbbg ));_df .Log .Info 
("\u0053\u0074r\u006f\u006b\u0065 \u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0064",len (_cce ));for _eecf ,_cbde :=range _cce {_db .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_eecf ,len (_cbde ));for _fdc ,_becf :=range _cbde {_db .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_fdc ,_becf );};};};if len (_fga )> 0{_df .Log .Info ("\u0046i\u006c\u006c\u0073\u003a\u0020\u0025d",len (_baae ._edf ));_df .Log .Info ("\u0046\u0069\u006c\u006c\u0020\u0047\u0072\u0069\u0064s\u003a\u0020\u0025\u0064",len (_fga ));for _bfg ,_geg :=range _fga {_db .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_bfg ,len (_geg ));for _affe ,_cfd :=range _geg {_db .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_affe ,_cfd );};};};};var _deeg paraList ;_dbb :=len (_baae ._ebfe );for _gebf :=0;_gebf < 360&&_dbb > 0;_gebf +=90{_daed :=make ([]*textMark ,0,len (_baae ._ebfe )-_dbb );for _ ,_gac :=range _baae ._ebfe {if _gac ._cega ==_gebf {_daed =append (_daed ,_gac );};};if len (_daed )> 0{_fgbc :=_cead (_daed ,_baae ._bgb ,_ddg );_deeg =append (_deeg ,_fgbc ...);_dbb -=len (_daed );};};_bfab :=new (_fc .Buffer );_deeg .writeText (_bfab );_baae ._bdc =_bfab .String ();_baae ._dedb =_deeg .toTextMarks ();_baae ._acdd =_deeg .tables ();};func (_ebcb *wordBag )text ()string {_eefd :=_ebcb .allWords ();_dcdg :=make ([]string ,len (_eefd ));for _ccec ,_adbc :=range _eefd {_dcdg [_ccec ]=_adbc ._bbdbfb ;};return _d .Join (_dcdg ,"\u0020");};
|
|
|
|
|
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
type TextMarkArray struct {
	_ggfd []TextMark
}

// _feca passes the absolute x and y distances between the two points to _fbc
// and returns its verdict.
func _feca(_ffbd, _ebffg _b.Point) bool {
	return _fbc(_ge.Abs(_ffbd.X-_ebffg.X), _ge.Abs(_ffbd.Y-_ebffg.Y))
}

// extractXObjectImage appends an ImageMark for the XObject image `_gae`.
// Converted images are cached per stream in `_da`. The image is converted to
// RGB with its color space, and the mark's size, angle and position are taken
// from the CTM of `_fag`. A missing XObject or image is skipped without error.
func (_de *imageExtractContext) extractXObjectImage(_gae *_ef.PdfObjectName, _fag _gb.GraphicsState, _fgc *_ab.PdfPageResources) error {
	stream, _ := _fgc.GetXObjectByName(*_gae)
	if stream == nil {
		return nil
	}
	entry, cached := _de._da[stream]
	if !cached {
		ximg, err := _fgc.GetXObjectImageByName(*_gae)
		if err != nil {
			return err
		}
		if ximg == nil {
			return nil
		}
		decoded, err := ximg.ToImage()
		if err != nil {
			return err
		}
		entry = &cachedImage{_dc: decoded, _eae: ximg.ColorSpace}
		_de._da[stream] = entry
	}
	source, colorSpace := entry._dc, entry._eae
	rgb, err := colorSpace.ImageToRGB(*source)
	if err != nil {
		return err
	}
	_df.Log.Debug("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073", _fag.CTM.String())
	mark := ImageMark{
		Image:  &rgb,
		Width:  _fag.CTM.ScalingFactorX(),
		Height: _fag.CTM.ScalingFactorY(),
		Angle:  _fag.CTM.Angle(),
	}
	mark.X, mark.Y = _fag.CTM.Translation()
	_de._bec = append(_de._bec, mark)
	_de._bb++
	return nil
}

// writeText writes every paragraph to `_cbcf`. Adjacent paragraphs are
// separated by a single space when `_edef` returns true for the pair, and by
// two newlines otherwise; two newlines follow the last paragraph. Write errors
// are not reported.
func (_gggg paraList) writeText(_cbcf _g.Writer) {
	space := []byte("\u0020")
	final := len(_gggg) - 1
	for i, para := range _gggg {
		para.writeText(_cbcf)
		if i == final {
			continue
		}
		if _edef(para, _gggg[i+1]) {
			_cbcf.Write(space)
			continue
		}
		_cbcf.Write([]byte("\u000a"))
		_cbcf.Write([]byte("\u000a"))
	}
	_cbcf.Write([]byte("\u000a"))
	_cbcf.Write([]byte("\u000a"))
}
|
|
|
|
|
|
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial information:
|
|
|
|
|
// display position and size.
|
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial
// information: display position and size.
type PageImages struct {
	Images []ImageMark
}

// _dbba builds a word bag from `_fecd`. The bag is created by _eefc from the
// first word; every further word is filed under the index `_cgce` derives from
// its `_aggfa` value, and the bag is sorted at the end. `_fecd` must not be
// empty.
func _dbba(_fecd []*textWord, _cgbac float64) *wordBag {
	bag := _eefc(_fecd[0], _cgbac)
	for i := 1; i < len(_fecd); i++ {
		word := _fecd[i]
		idx := _cgce(word._aggfa)
		bag._cgba[idx] = append(bag._cgba[idx], word)
	}
	bag.sort()
	return bag
}
|
|
|
|
|
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
type Extractor struct {
	_ade string                // content stream text, as passed to NewFromContents
	_dd  *_ab.PdfPageResources // page resources, as passed to NewFromContents
	_af  _ab.PdfRectangle      // rectangle copied into each PageText by extractPageText
	_dde map[string]fontEntry  // initialised empty by NewFromContents
	_cg  map[string]textResult // initialised empty by NewFromContents
	_ga  int64
	_ea  int
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// String returns a string describing the current state of the textState stack.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
func (_ddb *stateStack )String ()string {_eaad :=[]string {_db .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_ddb ))};for _ffef ,_ace :=range *_ddb {_aaba :="\u003c\u006e\u0069l\u003e";if _ace !=nil {_aaba =_ace .String ();};_eaad =append (_eaad ,_db .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_ffef ,_aaba ));};return _d .Join (_eaad ,"\u000a");};func (_eada *textPara )taken ()bool {return _eada ==nil ||_eada ._geda };const (_gbbf =false ;_baab =false ;_fada =false ;_fcfd =false ;_fdec =false ;_bbcd =false ;_bbeaa =false ;_fdfg =false ;_gafc =_fdfg &&true ;_ggcc =_gafc &&false ;_aedd =_fdfg &&true ;_fbgd =false ;_efcc =_fbgd ||false ;_geac =false ;);func _dae (_ffec _ab .PdfRectangle )textState {return textState {_adac :100,_adf :RenderModeFill ,_bdg :_ffec };};func (_cdbd *textTable )growTable (){_gggd :=func (_afg paraList ){_cdbd ._bged ++;for _edcf :=0;_edcf < _cdbd ._cfgec ;_edcf ++{_dggc :=_afg [_edcf ];_cdbd .put (_edcf ,_cdbd ._bged -1,_dggc );};};_cfbb :=func (_acegc paraList ){_cdbd ._cfgec ++;for _eface :=0;_eface < _cdbd ._bged ;_eface ++{_dcaa :=_acegc [_eface ];_cdbd .put (_cdbd ._cfgec -1,_eface ,_dcaa );};};for {_acfcg :=false ;_ddfd :=_cdbd .getDown ();_ecad :=_cdbd .getRight ();if _ddfd !=nil &&_ecad !=nil {_dfba :=_ddfd [len (_ddfd )-1];if _dfba !=nil &&!_dfba ._geda &&_dfba ==_ecad [len (_ecad )-1]{_gggd (_ddfd );if _ecad =_cdbd .getRight ();_ecad !=nil {_cfbb (_ecad );_cdbd .put (_cdbd ._cfgec -1,_cdbd ._bged -1,_dfba );};_acfcg =true ;};};if !_acfcg &&_ddfd !=nil {_gggd (_ddfd );_acfcg =true ;};if !_acfcg &&_ecad !=nil {_cfbb (_ecad );_acfcg =true ;};if !_acfcg {break ;};};};func (_bfea paraList )topoOrder ()[]int {if _bbeaa {_df .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_agg :=len (_bfea );_cgcf :=make ([]bool ,_agg );_bage :=make ([]int ,0,_agg );_cbfd :=_bfea .llyOrdering ();var _geeb func (_cfcd int );_geeb 
=func (_gdd int ){_cgcf [_gdd ]=true ;for _egdc :=0;_egdc < _agg ;_egdc ++{if !_cgcf [_egdc ]{if _bfea .readBefore (_cbfd ,_gdd ,_egdc ){_geeb (_egdc );};};};_bage =append (_bage ,_gdd );};for _fcebe :=0;_fcebe < _agg ;_fcebe ++{if !_cgcf [_fcebe ]{_geeb (_fcebe );};};return _bccc (_bage );};func (_ecae *textTable )bbox ()_ab .PdfRectangle {return _ecae .PdfRectangle };type textLine struct{_ab .PdfRectangle ;_adcf float64 ;_fbae []*textWord ;_bafec float64 ;};
|
|
|
|
|
|
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
func (_efec *TextMarkArray )BBox ()(_ab .PdfRectangle ,bool ){var _fca _ab .PdfRectangle ;_gege :=false ;for _ ,_gega :=range _efec ._ggfd {if _gega .Meta ||_fecc (_gega .Text ){continue ;};if _gege {_fca =_ddac (_fca ,_gega .BBox );}else {_fca =_gega .BBox ;_gege =true ;};};return _fca ,_gege ;};func (_efc *textObject )setHorizScaling (_ede float64 ){if _efc ==nil {return ;};_efc ._fdag ._adac =_ede ;};func (_baac *textObject )setTextRise (_eag float64 ){if _baac ==nil {return ;};_baac ._fdag ._daa =_eag ;};type rectRuling struct{_eagg rulingKind ;_ab .PdfRectangle ;};func (_cbdg *Extractor )extractPageText (_ada string ,_dfe *_ab .PdfPageResources ,_cbf _b .Matrix ,_fbb int )(*PageText ,int ,int ,error ){_df .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_fbb );_beb :=&PageText {_bgb :_cbdg ._af };_cae :=_dae (_cbdg ._af );var _dbdf stateStack ;_fcg :=_ebf (_cbdg ,_dfe ,_gb .GraphicsState {},&_cae ,&_dbdf );_bbe :=shapesState {_eecff :_cbf };var _caed bool ;if _fbb > _ce {_ffe :=_c .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_fbb ,_ffe );return _beb ,_cae ._agfd ,_cae ._fbea ,_ffe ;};_fee :=_gb .NewContentStreamParser (_ada );_cdf ,_eeb :=_fee .Parse ();if _eeb !=nil {_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eeb );return _beb ,_cae ._agfd ,_cae ._fbea ,_eeb ;};_ddf :=_gb .NewContentStreamProcessor 
(*_cdf );_ddf .AddHandler (_gb .HandlerConditionEnumAllOperands ,"",func (_dcg *_gb .ContentStreamOperation ,_dfc _gb .GraphicsState ,_gcb *_ab .PdfPageResources )error {_gea :=_dcg .Operand ;if _baab {_df .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_dcg );};switch _gea {case "\u0071":_dbdf .push (&_cae );case "\u0051":if !_dbdf .empty (){_cae =*_dbdf .top ();if len (_dbdf )>=2{_dbdf .pop ();};};_bbe ._dfeb =_dfc .CTM ;case "\u0042\u0054":if _caed {_df .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");_beb ._ebfe =append (_beb ._ebfe ,_fcg ._dgce ...);};_caed =true ;_ecb :=_dfc ;_ecb .CTM =_cbf .Mult (_ecb .CTM );_fcg =_ebf (_cbdg ,_gcb ,_ecb ,&_cae ,&_dbdf );case "\u0045\u0054":if !_caed {_df .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");};_caed =false ;_beb ._ebfe =append (_beb ._ebfe ,_fcg ._dgce ...);_fcg .reset ();case "\u0054\u002a":_fcg .nextLine ();case "\u0054\u0064":if _eec ,_afb :=_fcg .checkOp (_dcg ,2,true );!_eec {_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_afb );return _afb ;};_efa ,_gfd ,_ccfg :=_eecfa (_dcg .Params );if _ccfg !=nil {return _ccfg ;};_fcg .moveText (_efa ,_gfd );case "\u0054\u0044":if _gbg ,_egd :=_fcg .checkOp (_dcg ,2,true );!_gbg {_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_egd );return _egd ;};_ddd ,_fcd ,_ae :=_eecfa (_dcg .Params );if _ae !=nil {_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ae );return _ae ;};_fcg .moveTextSetLeading (_ddd ,_fcd );case "\u0054\u006a":if _bbd ,_geaa :=_fcg .checkOp (_dcg ,1,true );!_bbd {_df .Log .Debug 
("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076",_dcg ,_
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// String returns a description of `tm`.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a description of `tm`: its bounding rectangle, the value labelled
// "fontsize" and its text in double quotes.
func (_dgge *textMark) String() string {
	const layout = "\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022"
	return _db.Sprintf(layout, _dgge.PdfRectangle, _dgge._ebbg, _dgge._afca)
}

// _faec returns the signed horizontal distance from the right edge (Urx) of `_bda` to the
// left edge (Llx) of `_bfgab`. The result is negative when `_bfgab` starts left of where
// `_bda` ends.
func _faec(_bfgab, _bda bounded) float64 {
	leftEdge := _bfgab.bbox().Llx
	rightEdge := _bda.bbox().Urx
	return leftEdge - rightEdge
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
|
|
|
|
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
|
|
|
|
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
|
|
|
|
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text
// shall cause glyph outlines to be stroked, filled, used as a clipping boundary, or some
// combination of the three.
type RenderMode int ;
// del removes `_gcgb` from the set. Removing a value that is not present is a no-op.
func (_edbd intSet )del (_gcgb int ){delete (_edbd ,_gcgb )};
// arrangeText consumes the words in the bag and arranges them into the lines of a textPara.
// It returns nil if no line was built.
//
// Steps:
//  1. The bag is sorted and, when `_facce` is set, duplicates are removed.
//  2. For every depth index, while the bag still has words at that index, a new line is started
//     from the first word in reading order at that index. A depth window of the word's `_aggfa`
//     plus/minus `_fegec` times its `_bbcaa` is computed, together with a maximum forward gap
//     (`_defc` times `_bbcaa`) and a maximum backward overlap (`_dcc` times `_bbcaa`).
//  3. The line is grown one word at a time. For each depth index in the window the highest word
//     is taken and its horizontal gap (`_faec`) to the last word of the line is measured. A gap
//     below minus the backward limit ends the growth of this line (labelled break), a gap above
//     the forward limit skips the candidate, and among the remaining candidates the one that
//     `_efdg` orders first is kept. The chosen word is pulled out of the bag into the line;
//     growth stops when no candidate is left.
//  4. Word boundaries are marked on the finished line and the line is collected.
//  5. The collected lines are sorted with `_dcff` and wrapped into a textPara that carries the
//     bag's rectangle.
//
// The trailing `_fdfg`/`_gafc`/`_ggcc` branches only print diagnostics.
func (_ggbd *wordBag )arrangeText ()*textPara {_ggbd .sort ();if _facce {_ggbd .removeDuplicates ();};var _feggc []*textLine ;for _ ,_dbgc :=range _ggbd .depthIndexes (){for !_ggbd .empty (_dbgc ){_dbdag :=_ggbd .firstReadingIndex (_dbgc );_cbbbf :=_ggbd .firstWord (_dbdag );_bagd :=_aecd (_ggbd ,_dbdag );_bgcb :=_cbbbf ._bbcaa ;_gcgg :=_cbbbf ._aggfa -_fegec *_bgcb ;_gfdc :=_cbbbf ._aggfa +_fegec *_bgcb ;_dcce :=_defc *_bgcb ;_agfdg :=_dcc *_bgcb ;_fcdg :for {var _adegd *textWord ;_abgd :=0;for _ ,_bbeb :=range _ggbd .depthBand (_gcgg ,_gfdc ){_age :=_ggbd .highestWord (_bbeb ,_gcgg ,_gfdc );if _age ==nil {continue ;};_abee :=_faec (_age ,_bagd ._fbae [len (_bagd ._fbae )-1]);if _abee < -_agfdg {break _fcdg ;};if _abee > _dcce {continue ;};if _adegd !=nil &&_efdg (_age ,_adegd )>=0{continue ;};_adegd =_age ;_abgd =_bbeb ;};if _adegd ==nil {break ;};_bagd .pullWord (_ggbd ,_adegd ,_abgd );};_bagd .markWordBoundaries ();_feggc =append (_feggc ,_bagd );};};if len (_feggc )==0{return nil ;};_f .Slice (_feggc ,func (_bfccg ,_agd int )bool {return _dcff (_feggc [_bfccg ],_feggc [_agd ])< 0});_ddaa :=_fgfe (_ggbd .PdfRectangle ,_feggc );if _fdfg {_df .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_ddaa .String ());if _gafc {for _bafee ,_cdca :=range _ddaa ._efad {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bafee ,_cdca .String ());if _ggcc {for _ffedd ,_cfdf :=range _cdca ._fbae {_db .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_ffedd ,_cfdf .String ());for _edff ,_fdedf :=range _cfdf ._gbgg {_db .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_edff ,_fdedf .String ());};};};};};};return _ddaa ;};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a description of `v`.
|
|
|
|
|
func (_dbfc *ruling )String ()string {if _dbfc ._aead ==_agga {return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047";};_dfdb ,_afae :="\u0078","\u0079";if _dbfc ._aead ==_cbdac {_dfdb ,_afae ="\u0079","\u0078";};return _db .Sprintf ("\u0025\u0031\u0030\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d \u0025\u0036\u002e\u0032\u0066 \u0028\u00256\u002e\u0032\u0066\u0029",_dbfc ._aead ,_dfdb ,_dbfc ._ffd ,_afae ,_dbfc ._egaa ,_dbfc ._dcfe ,_dbfc ._dcfe -_dbfc ._egaa );};func (_cba *subpath )clear (){*_cba =subpath {}};func (_eafb paraList )addNeighbours (){_dafc :=func (_geace []int ,_bacgf *textPara )([]*textPara ,[]*textPara ){_ffba :=make ([]*textPara ,0,len (_geace )-1);_ggcff :=make ([]*textPara ,0,len (_geace )-1);for _ ,_cadf :=range _geace {_adde :=_eafb [_cadf ];if _adde .Urx <=_bacgf .Llx {_ffba =append (_ffba ,_adde );}else if _adde .Llx >=_bacgf .Urx {_ggcff =append (_ggcff ,_adde );};};return _ffba ,_ggcff ;};_eadd :=func (_fcebg []int ,_bgcab *textPara )([]*textPara ,[]*textPara ){_cbdd :=make ([]*textPara ,0,len (_fcebg )-1);_gcad :=make ([]*textPara ,0,len (_fcebg )-1);for _ ,_dffc :=range _fcebg {_dbded :=_eafb [_dffc ];if _dbded .Ury <=_bgcab .Lly {_gcad =append (_gcad ,_dbded );}else if _dbded .Lly >=_bgcab .Ury {_cbdd =append (_cbdd ,_dbded );};};return _cbdd ,_gcad ;};_ceebd :=_eafb .yNeighbours (_bbdb );for _ ,_gcca :=range _eafb {_eaada :=_ceebd [_gcca ];if len (_eaada )==0{continue ;};_bfdd ,_fadb :=_dafc (_eaada ,_gcca );if len (_bfdd )==0&&len (_fadb )==0{continue ;};if len (_bfdd )> 0{_fegb :=_bfdd [0];for _ ,_ddfc :=range _bfdd [1:]{if _ddfc .Urx >=_fegb .Urx {_fegb =_ddfc ;};};for _ ,_aggb :=range _bfdd {if _aggb !=_fegb &&_aggb .Urx > _fegb .Llx {_fegb =nil ;break ;};};if _fegb !=nil &&_dfad (_gcca .PdfRectangle ,_fegb .PdfRectangle ){_gcca ._aced =_fegb ;};};if len (_fadb )> 0{_gcffb :=_fadb [0];for _ ,_cedb :=range _fadb [1:]{if _cedb .Llx 
<=_gcffb .Llx {_gcffb =_cedb ;};};for _ ,_bgba :=range _fadb {if _bgba !=_gcffb &&_bgba .Llx < _gcffb .Urx {_gcffb =nil ;break ;};};if _gcffb !=nil &&_dfad (_gcca .PdfRectangle ,_gcffb .PdfRectangle ){_gcca ._gcced =_gcffb ;};};};_ceebd =_eafb .xNeighbours (_gcea );for _ ,_bbaf :=range _eafb {_efdda :=_ceebd [_bbaf ];if len (_efdda )==0{continue ;};_dcfcg ,_eecdg :=_eadd (_efdda ,_bbaf );if len (_dcfcg )==0&&len (_eecdg )==0{continue ;};if len (_eecdg )> 0{_gdca :=_eecdg [0];for _ ,_eebcg :=range _eecdg [1:]{if _eebcg .Ury >=_gdca .Ury {_gdca =_eebcg ;};};for _ ,_cfff :=range _eecdg {if _cfff !=_gdca &&_cfff .Ury > _gdca .Lly {_gdca =nil ;break ;};};if _gdca !=nil &&_efcb (_bbaf .PdfRectangle ,_gdca .PdfRectangle ){_bbaf ._gagf =_gdca ;};};if len (_dcfcg )> 0{_agcg :=_dcfcg [0];for _ ,_bcdd :=range _dcfcg [1:]{if _bcdd .Lly <=_agcg .Lly {_agcg =_bcdd ;};};for _ ,_gecd :=range _dcfcg {if _gecd !=_agcg &&_gecd .Lly < _agcg .Ury {_agcg =nil ;break ;};};if _agcg !=nil &&_efcb (_bbaf .PdfRectangle ,_agcg .PdfRectangle ){_bbaf ._aaegc =_agcg ;};};};for _ ,_fagef :=range _eafb {if _fagef ._aced !=nil &&_fagef ._aced ._gcced !=_fagef {_fagef ._aced =nil ;};if _fagef ._aaegc !=nil &&_fagef ._aaegc ._gagf !=_fagef {_fagef ._aaegc =nil ;};if _fagef ._gcced !=nil &&_fagef ._gcced ._aced !=_fagef {_fagef ._gcced =nil ;};if _fagef ._gagf !=nil &&_fagef ._gagf ._aaegc !=_fagef {_fagef ._gagf =nil ;};};};func (_cafc *textPara )writeCellText (_gbef _g .Writer ){for _cced ,_ebad :=range _cafc ._efad {_fgaee :=_ebad .text ();_agce :=_dce &&_ebad .endsInHyphen ()&&_cced !=len (_cafc ._efad )-1;if _agce {_fgaee =_bbdbc (_fgaee );};_gbef .Write ([]byte (_fgaee ));if !(_agce ||_cced ==len (_cafc ._efad )-1){_gbef .Write ([]byte (_cbe (_ebad ._adcf ,_cafc ._efad [_cced +1]._adcf )));};};};func (_gcd *wordBag )absorb (_bbea *wordBag ){for _bce ,_fae :=range _bbea ._cgba {for _ ,_edee :=range _fae {_gcd .pullWord (_bbea ,_edee ,_bce );};};};func _bffc (_aag ,_dgbg bounded )float64 {return 
_eggb (_aag )-_eggb (_dgbg )};func (_aaeg *wordBag )removeWord (_fafg *textWord ,_cbfc int ){_bafe :=_d
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
// String returns a string describing `tm`: its offset, its text (quoted and as hex rune
// values), the lower-left and upper-right corners of its bounding box, a description of its
// font cut to at most 50 bytes plus "...", and a " *M*" marker when Meta is set.
func (_gfb TextMark) String() string {
	fontDesc := ""
	if _gfb.Font != nil {
		if fontDesc = _gfb.Font.String(); len(fontDesc) > 50 {
			fontDesc = fontDesc[:50] + "\u002e\u002e\u002e"
		}
	}

	metaTag := ""
	if _gfb.Meta {
		metaTag = "\u0020\u002a\u004d\u002a"
	}

	box := _gfb.BBox
	return _db.Sprintf("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",
		_gfb.Offset, _gfb.Text, []rune(_gfb.Text),
		box.Llx, box.Lly, box.Urx, box.Ury,
		fontDesc, metaTag)
}

// intersections returns, for every ruling in the list that crosses a ruling of the other
// orientation, the set of indexes of the rulings it crosses. Only rulings of kind `_acab`
// (vertical) are tested against rulings of kind `_cbdac` (horizontal); rulings of any other
// kind are ignored, and rulings without any intersection have no entry in the map.
func (_dgcd rulingList) intersections() map[int]intSet {
	var verticals, horizontals []int
	for idx, r := range _dgcd {
		if r._aead == _acab {
			verticals = append(verticals, idx)
		} else if r._aead == _cbdac {
			horizontals = append(horizontals, idx)
		}
	}

	crossings := make(map[int]intSet, len(verticals)+len(horizontals))
	// setFor returns the set stored for `idx`, creating it on first use.
	setFor := func(idx int) intSet {
		set, present := crossings[idx]
		if !present {
			set = make(intSet)
			crossings[idx] = set
		}
		return set
	}

	for _, v := range verticals {
		for _, h := range horizontals {
			if !_dgcd[v].intersects(_dgcd[h]) {
				continue
			}
			setFor(v).add(h)
			setFor(h).add(v)
		}
	}
	return crossings
}

// _fecc reports whether `_eaebg` consists only of Unicode white space. The empty string
// counts as all white space.
func _fecc(_eaebg string) bool {
	notSpace := func(r rune) bool { return !_ad.IsSpace(r) }
	return _d.IndexFunc(_eaebg, notSpace) < 0
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `k`.
|
|
|
|
|
func (_bbdc rulingKind )String ()string {_bgbbf ,_gbbbe :=_fbdc [_bbdc ];if !_gbbbe {return _db .Sprintf ("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064",_bbdc );};return _bgbbf ;};func (_cecd *wordBag )getDepthIdx (_efbfg float64 )int {_aee :=_cecd .depthIndexes ();_gcag :=_cgce (_efbfg );if _gcag < _aee [0]{return _aee [0];};if _gcag > _aee [len (_aee )-1]{return _aee [len (_aee )-1];};return _gcag ;};func (_beed *shapesState )devicePoint (_ccb ,_edfg float64 )_b .Point {_eaaa :=_beed ._eecff .Mult (_beed ._dfeb );_ccb ,_edfg =_eaaa .Transform (_ccb ,_edfg );return _b .NewPoint (_ccb ,_edfg );};func _fgfe (_daga _ab .PdfRectangle ,_gcef []*textLine )*textPara {return &textPara {PdfRectangle :_daga ,_efad :_gcef };};func (_gdac paraList )findTables (_ggbg []rulingList )[]*textTable {_gdac .addNeighbours ();_f .Slice (_gdac ,func (_efbcf ,_ddcbb int )bool {return _cfdcb (_gdac [_efbcf ],_gdac [_ddcbb ])< 0});var _fgdff []*textTable ;if _gfag {_eaedb :=_gdac .findGridTables (_ggbg );_fgdff =append (_fgdff ,_eaedb ...);};if _bgcd {_ffae :=_gdac .findTextTables ();_fgdff =append (_fgdff ,_ffae ...);};return _fgdff ;};func (_egcd *textObject )setTextMatrix (_beaa []float64 ){if len (_beaa )!=6{_df .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_beaa ));return ;};_agfe ,_ced ,_eed ,_faba ,_baa ,_aed :=_beaa [0],_beaa [1],_beaa [2],_beaa [3],_beaa [4],_beaa [5];_egcd ._aff =_b .NewMatrix (_agfe ,_ced ,_eed ,_faba ,_baa ,_aed );_egcd ._dgg =_egcd ._aff ;};func (_gecg *shapesState )lineTo (_gdfg ,_cegf float64 ){_gecg .addPoint (_gdfg ,_cegf );if _fcfd {_df .Log .Info ("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_gdfg ,_cegf ,_gecg .devicePoint (_gdfg ,_cegf ));};};var (_cac =_c .New 
("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072");_cff =_c .New ("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072"););
|
|
|
|
|
|
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
//
// An error is returned when `page` is nil, when the page's content streams cannot be read, or
// when the page has no usable MediaBox. A MediaBox whose X or Y coordinates are reversed is
// normalized (and the fact logged) so that Llx <= Urx and Lly <= Ury in the extractor.
func New(page *_ab.PdfPage) (*Extractor, error) {
	// Guard against a nil page: the field access page.Resources below would otherwise panic.
	if page == nil {
		return nil, _c.New("extractor requires a non-nil page")
	}

	_fd, _acd := page.GetAllContentStreams()
	if _acd != nil {
		return nil, _acd
	}

	_ec, _acd := page.GetMediaBox()
	if _acd != nil {
		return nil, _db.Errorf("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076", _acd)
	}
	// Defensive: never dereference a nil rectangle even if no error was reported.
	if _ec == nil {
		return nil, _c.New("extractor requires mediaBox. got nil")
	}

	_fg := &Extractor{
		_ade: _fd,
		_dd:  page.Resources,
		_af:  *_ec,
		_dde: map[string]fontEntry{},
		_cg:  map[string]textResult{},
	}

	// Normalize a MediaBox whose corners were given in reversed order.
	if _fg._af.Llx > _fg._af.Urx {
		_df.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", _fg._af)
		_fg._af.Llx, _fg._af.Urx = _fg._af.Urx, _fg._af.Llx
	}
	if _fg._af.Lly > _fg._af.Ury {
		_df.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", _fg._af)
		_fg._af.Lly, _fg._af.Ury = _fg._af.Ury, _fg._af.Lly
	}
	return _fg, nil
}
|
|
|
|
|
|
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
|
|
|
|
func (_gcf PageText )ToText ()string {return _gcf .Text ()};type cachedImage struct{_dc *_ab .Image ;_eae _ab .PdfColorspace ;};func (_ccgf lineRuling )xDelta ()float64 {return _ge .Abs (_ccgf ._baacb .X -_ccgf ._baacb .X )};func _cfbdc (_ebgg _ab .PdfRectangle ,_bcca bounded )float64 {return _ebgg .Ury -_bcca .bbox ().Lly };func (_agaf *textTable )log (_beegf string ){if !_fbgd {return ;};_df .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",_beegf ,_agaf ._cfgec ,_agaf ._bged ,_agaf ._dbccf ,_agaf .PdfRectangle );for _cacaf :=0;_cacaf < _agaf ._bged ;_cacaf ++{for _aedfd :=0;_aedfd < _agaf ._cfgec ;_aedfd ++{_aecad :=_agaf .get (_aedfd ,_cacaf );_db .Printf ("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",_aedfd ,_cacaf ,_aecad .PdfRectangle ,_fagd (_aecad .text (),50),_cc .RuneCountInString (_aecad .text ()));};};};func _ece (_egbd float64 ,_efdd int )int {if _efdd ==0{_efdd =1;};_dfbb :=float64 (_efdd );return int (_ge .Round (_egbd /_dfbb )*_dfbb );};func _dgdc (_gbcc _b .Point )*subpath {return &subpath {_acddg :[]_b .Point {_gbcc }}};func _dadb (_fbgbe ,_ddaaa _b .Point )bool {return _fbgbe .X ==_ddaaa .X &&_fbgbe .Y ==_ddaaa .Y };func (_ddfa *textObject )newTextMark (_ggcf string ,_gabb _b .Matrix ,_ccg _b .Point ,_gbddb float64 ,_ebe *_ab .PdfFont ,_cfec float64 ,_dbea ,_gdb _cf .Color )(textMark ,bool ){_cag :=_gabb .Angle ();_bbbc :=_ece (_cag ,_fcfdg );var _gcce float64 ;if _bbbc %180!=90{_gcce =_gabb .ScalingFactorY ();}else {_gcce =_gabb .ScalingFactorX ();};_aeee :=_fgfa (_gabb );_bgbc :=_ab .PdfRectangle {Llx :_aeee .X ,Lly :_aeee .Y ,Urx :_ccg .X ,Ury :_ccg .Y };switch _bbbc %360{case 90:_bgbc .Urx -=_gcce ;case 180:_bgbc .Ury -=_gcce ;case 270:_bgbc .Urx +=_gcce ;case 0:_bgbc .Ury +=_gcce ;default:_bbbc =0;_bgbc .Ury +=_gcce ;};if _bgbc .Llx > 
_bgbc .Urx {_bgbc .Llx ,_bgbc .Urx =_bgbc .Urx ,_bgbc .Llx ;};if _bgbc .Lly > _bgbc .Ury {_bgbc .Lly ,_bgbc .Ury =_bgbc .Ury ,_bgbc .Lly ;};_daf ,_fdgfa :=_efff (_bgbc ,_ddfa ._gfg ._af );if !_fdgfa {_df .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_bgbc ,_ddfa ._gfg ._af ,_ggcf );};_bgbc =_daf ;_ccbg :=_bgbc ;_dfg :=_ddfa ._gfg ._af ;switch _bbbc %360{case 90:_dfg .Urx ,_dfg .Ury =_dfg .Ury ,_dfg .Urx ;_ccbg =_ab .PdfRectangle {Llx :_dfg .Urx -_bgbc .Ury ,Urx :_dfg .Urx -_bgbc .Lly ,Lly :_bgbc .Llx ,Ury :_bgbc .Urx };case 180:_ccbg =_ab .PdfRectangle {Llx :_dfg .Urx -_bgbc .Llx ,Urx :_dfg .Urx -_bgbc .Urx ,Lly :_dfg .Ury -_bgbc .Lly ,Ury :_dfg .Ury -_bgbc .Ury };case 270:_dfg .Urx ,_dfg .Ury =_dfg .Ury ,_dfg .Urx ;_ccbg =_ab .PdfRectangle {Llx :_bgbc .Ury ,Urx :_bgbc .Lly ,Lly :_dfg .Ury -_bgbc .Llx ,Ury :_dfg .Ury -_bgbc .Urx };};if _ccbg .Llx > _ccbg .Urx {_ccbg .Llx ,_ccbg .Urx =_ccbg .Urx ,_ccbg .Llx ;};if _ccbg .Lly > _ccbg .Ury {_ccbg .Lly ,_ccbg .Ury =_ccbg .Ury ,_ccbg .Lly ;};_agfg :=textMark {_afca :_ggcf ,PdfRectangle :_ccbg ,_bbae :_bgbc ,_ebdd :_ebe ,_ebbg :_gcce ,_bdaec :_cfec ,_bdcb :_gabb ,_fffc :_ccg ,_cega :_bbbc ,_gdab :_dbea ,_agbc :_gdb };if _fada {_df .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_aeee ,_ccg ,_agfg .String ());};return _agfg ,_fdgfa ;};func _ddac (_degbg ,_abad _ab .PdfRectangle )_ab .PdfRectangle {return _ab .PdfRectangle {Llx :_ge .Min (_degbg .Llx ,_abad .Llx ),Lly :_ge .Min (_degbg .Lly ,_abad .Lly ),Urx :_ge .Max (_degbg .Urx ,_abad .Urx ),Ury :_ge .Max (_degbg .Ury ,_abad .Ury )};};func _eecfa (_gcgf []_ef 
.PdfObject )(_bbafa ,_aeac float64 ,_adeb error ){if len (_gcgf )!=2{return 0,0,_db .Errorf ("\u0069\u00
|
|
|
|
|
|
|
|
|
|
// String returns a description of `l`.
|
|
|
|
|
// String returns a description of `l`: its `_adcf` value, its bounding rectangle, its font
// size and its text in double quotes.
func (_bbagc *textLine) String() string {
	const layout = "\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022"
	return _db.Sprintf(layout, _bbagc._adcf, _bbagc.PdfRectangle, _bbagc._bafec, _bbagc.text())
}

// getDown returns the `_gagf` neighbours of the cells in the table's last row (y = `_bged`-1),
// one per column, or nil if they cannot form a new row. That is the case when any of these
// neighbours is missing or already has its `_geda` flag set, or when the neighbours are not
// chained to each other through their `_gcced` links.
func (_ccaf *textTable) getDown() paraList {
	width := _ccaf._cfgec
	row := make(paraList, width)
	for x := range row {
		below := _ccaf.get(x, _ccaf._bged-1)._gagf
		if below == nil || below._geda {
			return nil
		}
		row[x] = below
	}
	for x := 1; x < width; x++ {
		if row[x-1]._gcced != row[x] {
			return nil
		}
	}
	return row
}

// _cbe returns the separator to write between two consecutive lines whose `_adcf` values are
// `_cge` and `_bfee`: a space when `_decfa` reports true for their difference, a newline
// otherwise.
func _cbe(_cge, _bfee float64) string {
	if _decfa(_cge - _bfee) {
		return "\u0020"
	}
	return "\u000a"
}

// minDepth returns `_accg` minus the distance between the bag's upper edge (Ury) and `_ffed`.
func (_gcecc *wordBag) minDepth() float64 {
	offset := _gcecc.Ury - _gcecc._ffed
	return _gcecc._accg - offset
}

// drawRectangle appends a closed rectangular subpath with corner (`_dab`, `_ceaa`), width
// `_befb` and height `_ddbf` to the state. When `_fcfd` is set the device-space rectangle is
// logged first.
func (_bafa *shapesState) drawRectangle(_dab, _ceaa, _befb, _ddbf float64) {
	farX, farY := _dab+_befb, _ceaa+_ddbf
	if _fcfd {
		ll := _bafa.devicePoint(_dab, _ceaa)
		ur := _bafa.devicePoint(farX, farY)
		_df.Log.Info("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066",
			_ab.PdfRectangle{Llx: ll.X, Lly: ll.Y, Urx: ur.X, Ury: ur.Y})
	}
	_bafa.newSubPath()
	_bafa.moveTo(_dab, _ceaa)
	_bafa.lineTo(farX, _ceaa)
	_bafa.lineTo(farX, farY)
	_bafa.lineTo(_dab, farY)
	_bafa.closePath()
}

// shapesState holds the path construction state used while processing a content stream.
type shapesState struct {
	// _dfeb and _eecff are the two matrices that devicePoint combines as _eecff.Mult(_dfeb).
	_dfeb  _b.Matrix
	_eecff _b.Matrix
	// _gggc holds the subpaths collected so far.
	_gggc []*subpath
	_adc  bool
	_dgd  _b.Point
}
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
|
|
|
|
// All coordinates are in device coordinates.
|
2020-09-14 09:32:45 +00:00
|
|
|
|
type ImageMark struct{Image *_ab .Image ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Dimensions of the image as displayed in the PDF.
|
|
|
|
|
Width float64 ;Height float64 ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Position of the image in PDF coordinates (lower left corner).
|
|
|
|
|
X float64 ;Y float64 ;
|
|
|
|
|
|
|
|
|
|
// Angle in degrees, if rotated.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
Angle float64 ;};func (_afcf *textObject )getFontDict (_bad string )(_bag _ef .PdfObject ,_eeca error ){_ddc :=_afcf ._adfd ;if _ddc ==nil {_df .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_bad );return nil ,nil ;};_bag ,_bcaa :=_ddc .GetFontByName (_ef .PdfObjectName (_bad ));if !_bcaa {_df .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_bad );return nil ,_c .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _bag ,nil ;};func (_gca *textObject )getFontDirect (_cfa string )(*_ab .PdfFont ,error ){_cfdc ,_ecd :=_gca .getFontDict (_cfa );if _ecd !=nil {return nil ,_ecd ;};_gagd ,_ecd :=_ab .NewPdfFontFromPdfObject (_cfdc );if _ecd !=nil {_df .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cfa ,_ecd );};return _gagd ,_ecd ;};func (_fgag *textWord )absorb (_ccbc *textWord ){_fgag .PdfRectangle =_ddac (_fgag .PdfRectangle ,_ccbc .PdfRectangle );_fgag ._gbgg =append (_fgag ._gbgg ,_ccbc ._gbgg ...);};func (_bacg *textPara )bbox ()_ab .PdfRectangle {return _bacg .PdfRectangle };func (_fdfe paraList )extractTables (_dcee []rulingList )paraList {if _fbgd {_df .Log .Debug 
("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_fdfe ));};if len (_fdfe )< _bae {return _fdfe ;};_ffaa :=_fdfe .findTables (_dcee );if _fbgd {_df .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_ffaa ));for _dgcf ,_decfg :=range _ffaa {_decfg .log (_db .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_dgcf ));};};return _fdfe .applyTables (_ffaa );};func (_egdg *textTable )toTextTable ()TextTable {if _fbgd {_df .Log .Info ("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064",_egdg ._cfgec ,_egdg ._bged );};_fdfeg :=make ([][]TableCell ,_egdg ._bged );for _caea :=0;_caea < _egdg ._bged ;_caea ++{_fdfeg [_caea ]=make ([]TableCell ,_egdg ._cfgec );for _bcb :=0;_bcb < _egdg ._cfgec ;_bcb ++{_eced :=_egdg .get (_bcb ,_caea );if _fbgd {_db .Printf ("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_bcb ,_caea ,_eced );};if _eced ==nil {continue ;};_fdfeg [_caea ][_bcb ].Text =_eced .text ();_cgceg :=0;_fdfeg [_caea ][_bcb ].Marks ._ggfd =_eced .toTextMarks (&_cgceg );};};return TextTable {W :_egdg ._cfgec ,H :_egdg ._bged ,Cells :_fdfeg };};func (_bac *textObject )showTextAdjusted (_efbc *_ef .PdfObjectArray )error {_eecd :=false ;for _ ,_fege :=range _efbc .Elements (){switch _fege .(type ){case *_ef .PdfObjectFloat ,*_ef .PdfObjectInteger :_fdee ,_ccdg :=_ef .GetNumberAsFloat (_fege );if _ccdg !=nil {_df .Log .Debug 
("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_fege ,_efbc );return _ccdg ;};_gaeg ,_cde :=-_fdee *0.001*_bac ._fdag ._gga ,0.0;if _eecd {_cde ,_gaeg =_gaeg ,_cde ;};_eebg :=_egeb (_b .Point {X :_gaeg ,Y :_cde });_bac ._aff .Co
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a string describing `pt`.
|
|
|
|
|
func (_eega PageText )String ()string {_gefc :=_db .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_eega ._ebfe ));_eage :=[]string {"\u002d"+_gefc };for _ ,_fff :=range _eega ._ebfe {_eage =append (_eage ,_fff .String ());};_eage =append (_eage ,"\u002b"+_gefc );return _d .Join (_eage ,"\u000a");};func (_efgg paraList )findGridTables (_feadf []rulingList )[]*textTable {if _fbgd {_df .Log .Info ("\u0066\u0069\u006e\u0064T\u0061\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072i\u0064s\u003a\u0020\u0025\u0064\u0020\u0070\u0061r\u0061\u0073",len (_efgg ));for _ecfc ,_adda :=range _efgg {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ecfc ,_adda );};};var _fdcc []*textTable ;for _facd ,_faeg :=range _feadf {_gacf :=_efgg .findTableGrid (_faeg );if _gacf !=nil {_gacf .log (_db .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_facd ));_fdcc =append (_fdcc ,_gacf );_gacf .markCells ();};};return _fdcc ;};func (_agbe *textPara )text ()string {_bgeb :=new (_fc .Buffer );_agbe .writeText (_bgeb );return _bgeb .String ();};func (_cbfb *subpath )isQuadrilateral ()bool {if len (_cbfb ._acddg )< 4||len (_cbfb ._acddg )> 5{return false ;};if len (_cbfb ._acddg )==5{_cfbf :=_cbfb ._acddg [0];_eggd :=_cbfb ._acddg [4];if _cfbf .X !=_eggd .X ||_cfbf .Y !=_eggd .Y {return false ;};};return true ;};func (_cggab *textMark )inDiacriticArea (_decf *textMark )bool {_eaed :=_cggab .Llx -_decf .Llx ;_abaf :=_cggab .Urx -_decf .Urx ;_aefc :=_cggab .Lly -_decf .Lly ;return _ge .Abs (_eaed +_abaf )< _cggab .Width ()*_fbeb &&_ge .Abs (_aefc )< _cggab .Height ()*_fbeb ;};func _degeb (_aeddd ,_dfec *textPara )bool {return _efcb (_aeddd ._agbd ,_dfec ._agbd )};func (_gde *imageExtractContext )extractContentStreamImages (_bd string ,_egc *_ab .PdfPageResources )error {_bdd :=_gb .NewContentStreamParser (_bd );_fde ,_geb :=_bdd 
.Parse ();if _geb !=nil {return _geb ;};if _gde ._da ==nil {_gde ._da =map[*_ef .PdfObjectStream ]*cachedImage {};};if _gde ._bg ==nil {_gde ._bg =&ImageExtractOptions {};};_gf :=_gb .NewContentStreamProcessor (*_fde );_gf .AddHandler (_gb .HandlerConditionEnumAllOperands ,"",func (_fdg *_gb .ContentStreamOperation ,_bc _gb .GraphicsState ,_fa *_ab .PdfPageResources )error {return _gde .processOperand (_fdg ,_bc ,_fa );});return _gf .Process (_egc );};var _fbdc =map[rulingKind ]string {_agga :"\u006e\u006f\u006e\u0065",_cbdac :"\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_acab :"\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c"};type textTable struct{_ab .PdfRectangle ;_cfgec ,_bged int ;_dbccf bool ;_acaf map[uint64 ]*textPara ;};type textWord struct{_ab .PdfRectangle ;_aggfa float64 ;_bbdbfb string ;_gbgg []*textMark ;_bbcaa float64 ;_egce bool ;};func (_ddae rectRuling )asRuling ()(*ruling ,bool ){_gdfe :=ruling {_aead :_ddae ._eagg };switch _ddae ._eagg {case _acab :_gdfe ._ffd =0.5*(_ddae .Llx +_ddae .Urx );_gdfe ._egaa =_ddae .Lly ;_gdfe ._dcfe =_ddae .Ury ;case _cbdac :_gdfe ._ffd =0.5*(_ddae .Lly +_ddae .Ury );_gdfe ._egaa =_ddae .Llx ;_gdfe ._dcfe =_ddae .Urx ;default:_df .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_ddae ._eagg );return nil ,false ;};return &_gdfe ,true ;};func _cfdcb (_gfda ,_gfc bounded )float64 {_acae :=_efdg (_gfda ,_gfc );if !_decfa (_acae ){return _acae ;};return _bffc (_gfda ,_gfc );};func (_fb *imageExtractContext )extractFormImages (_cd *_ef .PdfObjectName ,_cbd _gb .GraphicsState ,_bbb *_ab .PdfPageResources )error {_dgec ,_bcfd :=_bbb .GetXObjectFormByName (*_cd );if _bcfd !=nil {return _bcfd ;};if _dgec ==nil {return nil ;};_bgc ,_bcfd :=_dgec .GetContentStream ();if _bcfd !=nil {return _bcfd ;};_aba :=_dgec .Resources ;if _aba ==nil {_aba =_bbb ;};_bcfd =_fb .extractContentStreamImages (string (_bgc ),_aba );if _bcfd !=nil {return 
_bcfd ;};_fb ._ff ++;return nil ;};func (_aca *imageExtractContext )extractInlineImage (_cga *_gb .
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
// Len returns the number of TextMarks in `ma`. A nil array has length 0.
func (_adea *TextMarkArray) Len() int {
	if _adea != nil {
		return len(_adea._ggfd)
	}
	return 0
}

// _dcff is a two-level comparison of `_cgcc` and `_edfe`: it returns the result of `_bffc`
// unless `_decfa` reports true for that result, in which case the result of `_efdg` is
// returned instead.
func _dcff(_cgcc, _edfe bounded) float64 {
	if primary := _bffc(_cgcc, _edfe); !_decfa(primary) {
		return primary
	}
	return _efdg(_cgcc, _edfe)
}
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// TableCell is a cell in a TextTable.
|
|
|
|
|
type TableCell struct{
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Marks returns the TextMarks corresponding to the text in Text.
|
|
|
|
|
Marks TextMarkArray ;};func (_aec *textObject )setCharSpacing (_egg float64 ){if _aec ==nil {return ;};_aec ._fdag ._bfef =_egg ;if _fdec {_df .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_egg ,_aec ._fdag .String ());};};var _gbab =_ac .MustCompile ("\u005c\u0064\u002b\u005c\u002e\u003f");
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a human readable description of `s`.
|
|
|
|
|
func (_fagf intSet )String ()string {var _geagc []int ;for _efdbd :=range _fagf {if _fagf .has (_efdbd ){_geagc =append (_geagc ,_efdbd );};};_f .Ints (_geagc );return _db .Sprintf ("\u0025\u002b\u0076",_geagc );};func _dfad (_gbea ,_gfgf _ab .PdfRectangle )bool {return _gbea .Lly <=_gfgf .Ury &&_gfgf .Lly <=_gbea .Ury ;};func (_gce *textObject )getFont (_bbff string )(*_ab .PdfFont ,error ){if _gce ._gfg ._dde !=nil {_gce ._gfg ._ga ++;_fcfe ,_dfegd :=_gce ._gfg ._dde [_bbff ];if _dfegd {_fcfe ._bfd =_gce ._gfg ._ga ;return _fcfe ._cgbe ,nil ;};};_efg ,_gfef :=_gce .getFontDirect (_bbff );if _gfef !=nil {return nil ,_gfef ;};if _gce ._gfg ._dde !=nil {_degaa :=fontEntry {_efg ,_gce ._gfg ._ga };if len (_gce ._gfg ._dde )>=_edge {var _eda []string ;for _efabb :=range _gce ._gfg ._dde {_eda =append (_eda ,_efabb );};_f .Slice (_eda ,func (_dcf ,_ega int )bool {return _gce ._gfg ._dde [_eda [_dcf ]]._bfd < _gce ._gfg ._dde [_eda [_ega ]]._bfd });delete (_gce ._gfg ._dde ,_eda [0]);};_gce ._gfg ._dde [_bbff ]=_degaa ;};return _efg ,nil ;};type textPara struct{_ab .PdfRectangle ;_agbd _ab .PdfRectangle ;_efad []*textLine ;_accb *textTable ;_geda bool ;_aced *textPara ;_gcced *textPara ;_aaegc *textPara ;_gagf *textPara ;};type stateStack []*textState ;func (_efef intSet )has (_edbe int )bool {_ ,_fgcd :=_efef [_edbe ];return _fgcd };func (_fda *textObject )moveTextSetLeading (_ecf ,_gcbb float64 ){_fda ._fdag ._aede =-_gcbb ;_fda .moveLP (_ecf ,_gcbb );};func (_ccca *wordBag )empty (_bgea int )bool {_ ,_fgdb :=_ccca ._cgba [_bgea ];return !_fgdb };func _edc (_egac []TextMark ,_bgbd *int ,_fecf TextMark )[]TextMark {_fecf .Offset =*_bgbd ;_egac =append (_egac ,_fecf );*_bgbd +=len (_fecf .Text );return _egac ;};func _eefc (_adfde *textWord ,_dbgb float64 )*wordBag {_gee :=_cgce (_adfde ._aggfa );_dbcc :=[]*textWord {_adfde };_daag :=wordBag {_cgba :map[int ][]*textWord {_gee :_dbcc },PdfRectangle :_adfde .PdfRectangle ,_ffed :_adfde ._bbcaa ,_accg :_dbgb };return &_daag 
;};func _eeab (_gbec ,_cged _b .Point )bool {_fdgga :=_ge .Abs (_gbec .X -_cged .X );_bcae :=_ge .Abs (_gbec .Y -_cged .Y );return _fbc (_bcae ,_fdgga );};type intSet map[int ]struct{};func (_ddgf paraList )inRect (_acbbc _ab .PdfRectangle )*textPara {var _dbed paraList ;for _ ,_dbcg :=range _ddgf {_cbgcb :=_dbcg .PdfRectangle ;_cbgcb .Lly +=_bdeb ;_cbgcb .Ury -=_bdeb ;if _accd (_acbbc ,_cbgcb ){_dbed =append (_dbed ,_dbcg );};};if len (_dbed )!=1{return nil ;};return _dbed [0];};type textResult struct{_feg PageText ;_fage int ;_fded int ;};func (_cbgcbe *textWord )computeText ()string {_ebdf :=make ([]string ,len (_cbgcbe ._gbgg ));for _effd ,_face :=range _cbgcbe ._gbgg {_ebdf [_effd ]=_face ._afca ;};return _d .Join (_ebdf ,"");};func (_fabd *textTable )markCells (){for _abaa :=0;_abaa < _fabd ._bged ;_abaa ++{for _aac :=0;_aac < _fabd ._cfgec ;_aac ++{_deba :=_fabd .get (_aac ,_abaa );_deba ._geda =true ;};};};type lineRuling struct{_cdcb rulingKind ;_afad ,_baacb _b .Point ;};func (_gaae *textTable )getRight ()paraList {_fccf :=make (paraList ,_gaae ._bged );for _gagb :=0;_gagb < _gaae ._bged ;_gagb ++{_bbdbf :=_gaae .get (_gaae ._cfgec -1,_gagb )._gcced ;if _bbdbf ==nil ||_bbdbf ._geda {return nil ;};_fccf [_gagb ]=_bbdbf ;};for _bbcda :=0;_bbcda < _gaae ._bged -1;_bbcda ++{if _fccf [_bbcda ]._gagf !=_fccf [_bbcda +1]{return nil ;};};return _fccf ;};func (_fgd *subpath )removeDuplicates (){if len (_fgd ._acddg )==0{return ;};_daaf :=[]_b .Point {_fgd ._acddg [0]};for _ ,_gaab :=range _fgd ._acddg [1:]{if !_dadb (_gaab ,_daaf [len (_daaf )-1]){_daaf =append (_daaf ,_gaab );};};_fgd ._acddg =_daaf ;};func (_feag *textLine )markWordBoundaries (){_fdcf :=_cacfd *_feag ._bafec ;for _dbbda ,_fcfc :=range _feag ._fbae [1:]{if _faec (_fcfc ,_feag ._fbae [_dbbda ])>=_fdcf {_fcfc ._egce =true ;};};};func _gcfd (_bbfe *PageText )error {_fdebb :=_dbd .GetLicenseKey ();if _fdebb !=nil &&_fdebb .IsLicensed ()||_eb {return nil ;};_db .Printf 
("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u006
|
2020-08-31 21:12:07 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ImageExtractOptions contains options for controlling image extraction from
|
|
|
|
|
// PDF pages.
|
|
|
|
|
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func (_dcbb *textTable )get (_agdb ,_acdgc int )*textPara {return _dcbb ._acaf [_ggfc (_agdb ,_acdgc )]};func (_efcg *textWord )toTextMarks (_fgad *int )[]TextMark {var _fbce []TextMark ;for _ ,_dggce :=range _efcg ._gbgg {_fbce =_edc (_fbce ,_fgad ,_dggce .ToTextMark ());};return _fbce ;};func (_gdfef rulingList )toGrids ()[]rulingList {if len (_gdfef )==0{return nil ;};if len (_gdfef )> 200{_df .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0054\u004f\u0020\u004d\u0041\u004eY\u0020r\u0075\u006c\u0069\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064",len (_gdfef ));return nil ;};_gbdc :=_gdfef .intersections ();if _geac {_df .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064s\u003a\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0069\u006e\u0074e\u0072\u0073\u0065\u0063\u0074\u0073\u003d%\u0064\u0020\u0025\u0076",len (_gdfef ),len (_gbdc ),_gbdc );};_cfae :=make (map[int ]intSet ,len (_gdfef ));for _fbdf :=range _gdfef {_febd :=_gdfef .connections (_gbdc ,_fbdf );if len (_febd )> 0{_cfae [_fbdf ]=_febd ;};};if _geac {_df .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0076",_cfae );};_caec :=_ddfdf (len (_gdfef ),func (_ffdd ,_cagd int )bool {_beeg ,_gdbc :=len (_cfae [_ffdd ]),len (_cfae [_cagd ]);if _beeg !=_gdbc {return _beeg > _gdbc ;};return _gdfef .comp (_ffdd ,_cagd );});if _geac {_df .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076",_caec );};_fadf :=[][]int {{_caec [0]}};_efdb :for _ ,_cgde :=range _caec [1:]{for _bffe ,_gad :=range _fadf {for _ ,_bbeg :=range _gad {if _cfae [_bbeg ].has (_cgde ){_fadf [_bffe ]=append (_gad ,_cgde );continue _efdb ;};};};_fadf =append (_fadf ,[]int {_cgde });};if _geac {_df .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076",_fadf );};_f 
.SliceStable (_fadf ,func (_debg ,_cafd int )bool {return len (_fadf [_debg ])> len (_fadf [_cafd ])});for _ ,_gbfgc :=range _fadf {_f .Slice (_gbfgc ,func (_dfebd ,_fcef int )bool {return _gdfef .comp (_gbfgc [_dfebd ],_gbfgc [_fcef ])});};_feac :=make ([]rulingList ,len (_fadf ));for _ceabf ,_caaf :=range _fadf {_fbdb :=make (rulingList ,len (_caaf ));for _geef ,_ceb :=range _caaf {_fbdb [_geef ]=_gdfef [_ceb ];};_feac [_ceabf ]=_fbdb ;};if _geac {_df .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069d\u0073\u003d\u0025\u0076",_feac );};var _bfgc []rulingList ;for _ ,_dfdee :=range _feac {if _dfdee .isActualGrid (){_bfgc =append (_bfgc ,_dfdee );};};if _geac {_df .Log .Info ("\u0074\u006f\u0047ri\u0064\u0073\u003a\u0020\u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0076",_bfgc );_df .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064",len (_feac ),len (_bfgc ));};return _bfgc ;};func (_eegc rulingList )cells ()(int ,int ,[]_ab .PdfRectangle ){_eegc .sortStrict ();_cdfd ,_agge :=_eegc .vertsHorzs ();_ddaea :=len (_cdfd )-1;_ccbb :=len (_agge )-1;if _geac {_df .Log .Info ("\u0072\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002ec\u0065\u006c\u006c\u0073\u003a\u0020\u0076\u0065\u0072\u0074s\u003d\u0025\u0064",len (_cdfd ));for _ecef ,_feee :=range _cdfd {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ecef ,_feee );};_df .Log .Info ("\u0072\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002ec\u0065\u006c\u006c\u0073\u003a\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064",len (_agge ));for _gdae ,_ebgef :=range _agge {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gdae ,_ebgef );};_df .Log .Info 
("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0063\u0065\u006cl\u0073\u003a\u0020\u0076\u0065\u0063\u0073=\u0025\u0064\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078%\u0064",len (_eegc ),_ddaea ,_ccbb );};_
|
2020-08-31 21:12:07 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
|
|
|
|
|
// Each time ApplyArea is called, it updates the result set in `pt`.
|
|
|
|
|
// Can be called multiple times in a row with different bounding boxes.
|
|
|
|
|
// ApplyArea recomputes the page text using only the marks whose bounding box
// passes the _gab test against `bbox`, and stores the resulting text, text
// marks and tables on the receiver. It may be called repeatedly with different
// boxes; every call overwrites the previous result.
func (_eggf *PageText) ApplyArea(bbox _ab.PdfRectangle) {
	// Select the marks that fall in the requested area.
	inArea := make([]*textMark, 0, len(_eggf._ebfe))
	for _, mark := range _eggf._ebfe {
		if _gab(mark.bbox(), bbox) {
			inArea = append(inArea, mark)
		}
	}

	// Build paragraphs separately for each of the four orientations
	// (0, 90, 180, 270), stopping early once every mark has been consumed.
	var paras paraList
	remaining := len(inArea)
	for rot := 0; rot < 360 && remaining > 0; rot += 90 {
		// At most `remaining` marks can still match this orientation, so that
		// is the right capacity hint. (It used to be len(inArea)-remaining,
		// i.e. the number of marks already consumed, which is 0 on the first
		// pass.)
		sameRot := make([]*textMark, 0, remaining)
		for _, mark := range inArea {
			if mark._cega == rot {
				sameRot = append(sameRot, mark)
			}
		}
		if len(sameRot) == 0 {
			continue
		}
		paras = append(paras, _cead(sameRot, _eggf._bgb, nil)...)
		remaining -= len(sameRot)
	}

	// Publish the results.
	buf := new(_fc.Buffer)
	paras.writeText(buf)
	_eggf._bdc = buf.String()
	_eggf._dedb = paras.toTextMarks()
	_eggf._acdd = paras.tables()
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a description of `t`.
|
|
|
|
|
func (_gbfc *textTable )String ()string {return _db .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_gbfc ._cfgec ,_gbfc ._bged ,_gbfc ._dbccf );};func (_ebbc *textMark )bbox ()_ab .PdfRectangle {return _ebbc .PdfRectangle };func (_bfdg paraList )xNeighbours (_fddg float64 )map[*textPara ][]int {_bagf :=make ([]event ,2*len (_bfdg ));if _fddg ==0{for _bege ,_fcfef :=range _bfdg {_bagf [2*_bege ]=event {_fcfef .Llx ,true ,_bege };_bagf [2*_bege +1]=event {_fcfef .Urx ,false ,_bege };};}else {for _geeag ,_bbbcg :=range _bfdg {_bagf [2*_geeag ]=event {_bbbcg .Llx -_fddg *_bbbcg .fontsize (),true ,_geeag };_bagf [2*_geeag +1]=event {_bbbcg .Urx +_fddg *_bbbcg .fontsize (),false ,_geeag };};};return _bfdg .eventNeighbours (_bagf );};func (_gecb *wordBag )depthIndexes ()[]int {if len (_gecb ._cgba )==0{return nil ;};_cbbb :=make ([]int ,len (_gecb ._cgba ));_eefa :=0;for _ggfbf :=range _gecb ._cgba {_cbbb [_eefa ]=_ggfbf ;_eefa ++;};_f .Ints (_cbbb );return _cbbb ;};type event struct{_eedf float64 ;_bcgd bool ;_efggb int ;};func (_cceb *wordBag )depthRange (_fabf ,_adcg int )[]int {_ebaf :=_cceb .depthIndexes ();var _bga []int ;for _ ,_aef :=range _ebaf {if _fabf <=_aef &&_aef <=_adcg {_bga =append (_bga ,_aef );};};return _bga ;};func (_dca *stateStack )top ()*textState {if _dca .empty (){return nil ;};return (*_dca )[_dca .size ()-1];};func (_fece rulingList )comp (_ceac ,_gagdd int )bool {_edggc ,_fddb :=_fece [_ceac ],_fece [_gagdd ];_ggde ,_aebc :=_edggc ._aead ,_fddb ._aead ;if _ggde !=_aebc {return _ggde > _aebc ;};if _ggde ==_agga {return false ;};_dggab :=func (_cgfb bool )bool {if _ggde ==_cbdac {return _cgfb ;};return !_cgfb ;};_gfggc ,_adgd :=_edggc ._ffd ,_fddb ._ffd ;if _gfggc !=_adgd {return _dggab (_gfggc > _adgd );};_gfggc ,_adgd =_edggc ._egaa ,_fddb ._egaa ;if _gfggc !=_adgd {return _dggab (_gfggc < _adgd );};return _dggab (_edggc ._dcfe < _fddb ._dcfe );};func (_bgfg *subpath )close (){if !_dadb (_bgfg ._acddg [0],_bgfg .last 
()){_bgfg .add (_bgfg ._acddg [0]);};_bgfg ._bbc =true ;_bgfg .removeDuplicates ();};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
func (_gcbbb *textMark )ToTextMark ()TextMark {return TextMark {Text :_gcbbb ._afca ,Original :_gcbbb ._geea ,BBox :_gcbbb ._bbae ,Font :_gcbbb ._ebdd ,FontSize :_gcbbb ._ebbg ,FillColor :_gcbbb ._gdab ,StrokeColor :_gcbbb ._agbc };};func (_gbgb *textObject )getStrokeColor ()_cf .Color {return _ccfb (_gbgb ._ebbb .ColorspaceStroking ,_gbgb ._ebbb .ColorStroking );};func (_adagb *textPara )writeText (_faea _g .Writer ){if _adagb ._accb ==nil {_adagb .writeCellText (_faea );return ;};for _bdb :=0;_bdb < _adagb ._accb ._bged ;_bdb ++{for _agadg :=0;_agadg < _adagb ._accb ._cfgec ;_agadg ++{_afe :=_adagb ._accb .get (_agadg ,_bdb );if _afe ==nil {_faea .Write ([]byte ("\u0009"));}else {_afe .writeCellText (_faea );};_faea .Write ([]byte ("\u0020"));};if _bdb < _adagb ._accb ._bged -1{_faea .Write ([]byte ("\u000a"));};};};func (_ceabac *textTable )isExportable ()bool {_fdfed :=func (_ffaf int )bool {_ffdc :=_ceabac .get (0,_ffaf );_afbfb :=_ffdc .text ();_bgaa :=_cc .RuneCountInString (_afbfb );_ccaba :=_gbab .MatchString (_afbfb );return _bgaa <=1||_ccaba ;};for _cdedcc :=0;_cdedcc < _ceabac ._bged ;_cdedcc ++{if !_fdfed (_cdedcc ){return true ;};};return false ;};func (_acbb rulingList )coalesce ()rulingList {if len (_acbb )==0{return nil ;};_acbb .sortStrict ();_ceef :=_acbb [0];var _bfbg rulingList ;for _ ,_dbde :=range _acbb [1:]{_befcb :=_ceef ._aead ==_dbde ._aead &&_ceef ._ffd ==_dbde ._ffd &&_dbde ._egaa <=_ceef ._dcfe +1.0;if _befcb {_baga :=*_ceef ;_ceef ._dcfe =_dbde ._dcfe ;if _ceef ._dcfe < _ceef ._egaa {_df .Log .Error ("\u0076\u0030\u002ehi\u0020\u003c\u0020\u0076\u0030\u002e\u006c\u006f\u000a\t\u00760\u003d%\u0073\n\u0009\u0020\u0076\u003d\u0025\u0073\u000a\u0009\u0020\u002d\u003e\u0025\u0073",_baga .String (),_dbde .String (),_ceef .String ());return nil ;};}else {_bfbg =append (_bfbg ,_ceef );_ceef =_dbde ;};};_bfbg =append (_bfbg ,_ceef );return _bfbg ;};func (_dfbd paraList )toTextMarks ()[]TextMark {_cfgb :=0;var _ebaa []TextMark ;for _bfbe ,_bedf 
:=range _dfbd {_cgfa :=_bedf .toTextMarks (&_cfgb );_ebaa =append (_ebaa ,_cgfa ...);if _bfbe !=len (_dfbd )-1{if _edef (_bedf ,_dfbd [_bfbe +1]){_ebaa =_gegg (_ebaa ,&_cfgb ,"\u0020");}else {_ebaa =_gegg (_ebaa ,&_cfgb ,"\u000a");_ebaa =_gegg (_ebaa ,&_cfgb ,"\u000a");};};};_ebaa =_gegg (_ebaa ,&_cfgb ,"\u000a");_ebaa =_gegg (_ebaa ,&_cfgb ,"\u000a");return _ebaa ;};func (_ggg *textObject )getFillColor ()_cf .Color {return _ccfb (_ggg ._ebbb .ColorspaceNonStroking ,_ggg ._ebbb .ColorNonStroking );};func _ccfb (_dbeb _ab .PdfColorspace ,_acdgf _ab .PdfColor )_cf .Color {if _dbeb ==nil ||_acdgf ==nil {return _cf .Black ;};_bebcc ,_fafag :=_dbeb .ColorToRGB (_acdgf );if _fafag !=nil {_df .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_acdgf ,_dbeb ,_fafag );return _cf .Black ;};_feacea ,_ceeac :=_bebcc .(*_ab .PdfColorDeviceRGB );if !_ceeac {_df .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_bebcc );return _cf .Black ;};return _cf .NRGBA {R :uint8 (_feacea .R ()*255),G :uint8 (_feacea .G ()*255),B :uint8 (_feacea .B ()*255),A :uint8 (255)};};func (_bbg *textObject )checkOp (_befc *_gb .ContentStreamOperation ,_gag int ,_eacf bool )(_fgcb bool ,_gdgc error ){if _bbg ==nil {var _fdedg []_ef .PdfObject ;if _gag > 0{_fdedg =_befc .Params ;if len (_fdedg )> _gag {_fdedg =_fdedg [:_gag ];};};_df .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_befc 
.Operand ,_fdedg );};if _gag >=0{if len (_befc .Params )!=_gag {if _eacf {_gdgc =_c .New ("\u0069n\u00
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
|
|
|
|
// Marks wraps the page's text marks in a newly allocated TextMarkArray.
func (_aga PageText) Marks() *TextMarkArray {
	marks := TextMarkArray{_ggfd: _aga._dedb}
	return &marks
}

// _eggb returns the negated lower y coordinate of `_dga`'s bounding box.
func _eggb(_dga bounded) float64 {
	box := _dga.bbox()
	return -box.Lly
}

// _decfa reports whether the magnitude of `_bagaf` is below the _dcbg
// threshold.
func _decfa(_bagaf float64) bool {
	magnitude := _ge.Abs(_bagaf)
	return magnitude < _dcbg
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// TextTable represents a table.
|
|
|
|
|
// Cells are ordered top-to-bottom, left-to-right.
|
|
|
|
|
// Cells[y] is the (0-offset) y'th row in the table.
|
|
|
|
|
// Cells[y][x] is the (0-offset) x'th column in the table.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
type TextTable struct {
	// W and H are the table dimensions (presumably columns and rows).
	W int
	H int
	// Cells holds the table cells; Cells[y][x] is column x of row y.
	Cells [][]TableCell
}

// _acee binds `_fddc` as the third argument of `_eaecc`, returning a
// two-argument predicate.
func _acee(_eaecc func(*wordBag, *textWord, float64) bool, _fddc float64) func(*wordBag, *textWord) bool {
	bound := func(bag *wordBag, word *textWord) bool {
		return _eaecc(bag, word, _fddc)
	}
	return bound
}

// firstWord returns the first word stored under depth index `_ceaba`.
// The index must be occupied.
func (_gede *wordBag) firstWord(_ceaba int) *textWord {
	words := _gede._cgba[_ceaba]
	return words[0]
}

// _dfeeg returns the larger of its two arguments.
func _dfeeg(_aebf, _agbac int) int {
	if _agbac >= _aebf {
		return _agbac
	}
	return _aebf
}

// _ggfc packs two ints into one uint64 key: the first argument is shifted
// left by 24 bits and the second is added to it.
func _ggfc(_dcgdb, _gece int) uint64 {
	return uint64(_dcgdb)<<24 + uint64(_gece)
}

// _gdgdg returns the smaller of its two arguments.
func _gdgdg(_gbed, _bbbb int) int {
	if _bbbb <= _gbed {
		return _bbbb
	}
	return _gbed
}

// tidied returns the rulings with duplicates removed, then coalesced, then
// sorted. It returns nil when coalescing yields nil. `_ccede` is only used as
// a label in the debug output emitted when _geac is set.
func (_bdebc rulingList) tidied(_ccede string) rulingList {
	uniques := _bdebc.removeDuplicates()
	coalesced := uniques.coalesce()
	if coalesced == nil {
		return nil
	}
	coalesced.sort()
	if !_geac {
		return coalesced
	}
	_df.Log.Info("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064", _ccede, len(_bdebc), len(uniques), len(coalesced))
	for i, r := range coalesced {
		_db.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, r)
	}
	return coalesced
}

// cubicTo records only the final pair of coordinates (`_aae`, `_gbdg`) as a
// point; the first four arguments are ignored.
func (_dbbd *shapesState) cubicTo(_agb, _dcgf, _feaf, _baca, _aae, _gbdg float64) {
	_dbbd.addPoint(_aae, _gbdg)
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a human readable description of `path`.
|
|
|
|
|
func (_defb *subpath )String ()string {_bcab :=_defb ._acddg ;_cec :=len (_bcab );if _cec <=5{return _db .Sprintf ("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f",_cec ,_bcab );};return _db .Sprintf ("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f",_cec ,_bcab [0],_bcab [1],_bcab [_cec -1]);};func (_deegg paraList )yNeighbours (_dfadb float64 )map[*textPara ][]int {_dgeaa :=make ([]event ,2*len (_deegg ));if _dfadb ==0{for _eaca ,_cacgc :=range _deegg {_dgeaa [2*_eaca ]=event {_cacgc .Lly ,true ,_eaca };_dgeaa [2*_eaca +1]=event {_cacgc .Ury ,false ,_eaca };};}else {for _agda ,_bagae :=range _deegg {_dgeaa [2*_agda ]=event {_bagae .Lly -_dfadb *_bagae .fontsize (),true ,_agda };_dgeaa [2*_agda +1]=event {_bagae .Ury +_dfadb *_bagae .fontsize (),false ,_agda };};};return _deegg .eventNeighbours (_dgeaa );};type bounded interface{bbox ()_ab .PdfRectangle };func _gcagdd (_cbda []TextMark ,_geec *int )[]TextMark {_aedf :=_cbda [len (_cbda )-1];_bcee :=[]rune (_aedf .Text );if len (_bcee )==1{_cbda =_cbda [:len (_cbda )-1];_bdcc :=_cbda [len (_cbda )-1];*_geec =_bdcc .Offset +len (_bdcc .Text );}else {_fcfce :=_bbdbc (_aedf .Text );*_geec +=len (_fcfce )-len (_aedf .Text );_aedf .Text =_fcfce ;};return _cbda ;};func (_aadc rulingList )sortStrict (){_f .Slice (_aadc ,func (_edgb ,_dfee int )bool {_ggfbb ,_gdcc :=_aadc [_edgb ],_aadc [_dfee ];_bbba ,_agbfg :=_ggfbb ._aead ,_gdcc ._aead ;if _bbba !=_agbfg {return _bbba > _agbfg ;};_gcffg ,_bfbc :=_ggfbb ._ffd ,_gdcc ._ffd ;if _gcffg !=_bfbc {return _gcffg < _bfbc ;};_gcffg ,_bfbc =_ggfbb ._egaa ,_gdcc ._egaa ;if _gcffg !=_bfbc {return _gcffg < _bfbc ;};return _ggfbb ._dcfe < _gdcc ._dcfe ;});};func (_geag *wordBag )removeDuplicates (){if _aedd {_df .Log .Info ("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071",_geag .text ());};for _ ,_gaba :=range _geag .depthIndexes (){if len (_geag ._cgba [_gaba 
])==0{continue ;};_degab :=_geag ._cgba [_gaba ][0];_adbf :=_ecdb *_degab ._bbcaa ;_badd :=_degab ._aggfa ;for _ ,_adeg :=range _geag .depthBand (_badd ,_badd +_adbf ){_cafb :=map[*textWord ]struct{}{};_aceg :=_geag ._cgba [_adeg ];for _ ,_fbec :=range _aceg {if _ ,_ggab :=_cafb [_fbec ];_ggab {continue ;};for _ ,_afdb :=range _aceg {if _ ,_ggff :=_cafb [_afdb ];_ggff {continue ;};if _afdb !=_fbec &&_afdb ._bbdbfb ==_fbec ._bbdbfb &&_ge .Abs (_afdb .Llx -_fbec .Llx )< _adbf &&_ge .Abs (_afdb .Urx -_fbec .Urx )< _adbf &&_ge .Abs (_afdb .Lly -_fbec .Lly )< _adbf &&_ge .Abs (_afdb .Ury -_fbec .Ury )< _adbf {_cafb [_afdb ]=struct{}{};};};};if len (_cafb )> 0{_eebc :=0;for _ ,_dccg :=range _aceg {if _ ,_dceb :=_cafb [_dccg ];!_dceb {_aceg [_eebc ]=_dccg ;_eebc ++;};};_geag ._cgba [_adeg ]=_aceg [:len (_aceg )-len (_cafb )];if len (_geag ._cgba [_adeg ])==0{delete (_geag ._cgba ,_adeg );};};};};};func (_cceg paraList )applyTables (_gdbd []*textTable )paraList {_fbca :=make (map[*textPara ]struct{});var _aaag paraList ;for _ ,_acgc :=range _gdbd {for _ ,_ffbba :=range _acgc ._acaf {_fbca [_ffbba ]=struct{}{};};_aaag =append (_aaag ,_acgc .newTablePara ());};for _ ,_gcdg :=range _cceg {if _ ,_eabba :=_fbca [_gcdg ];!_eabba {_aaag =append (_aaag ,_gcdg );};};return _aaag ;};func _gab (_dad ,_abce _ab .PdfRectangle )bool {return _efcb (_dad ,_abce )&&_dfad (_dad ,_abce )};
|
|
|
|
|
|
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
func (_afd *TextMarkArray )Append (mark TextMark ){_afd ._ggfd =append (_afd ._ggfd ,mark )};type paraList []*textPara ;func (_dgdf rulingList )connections (_gefb map[int ]intSet ,_gcbe int )intSet {_ffdg :=make (intSet );_aaga :=make (intSet );var _bbee func (int );_bbee =func (_bded int ){if !_aaga .has (_bded ){_aaga .add (_bded );for _gdgb :=range _dgdf {if _gefb [_gdgb ].has (_bded ){_ffdg .add (_gdgb );};};for _fggg :=range _dgdf {if _ffdg .has (_fggg ){_bbee (_fggg );};};};};_bbee (_gcbe );return _ffdg ;};func (_bca *textObject )renderText (_eeba []byte )error {if _bca ._fcgd {_df .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");return nil ;};_ege :=_bca .getCurrentFont ();_ggfb :=_ege .BytesToCharcodes (_eeba );_daaa ,_efba ,_dfd :=_ege .CharcodesToStrings (_ggfb );if _dfd > 0{_df .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_efba ,_dfd );};_bca ._fdag ._agfd +=_efba ;_bca ._fdag ._fbea +=_dfd ;_eee :=_bca ._fdag ;_efbe :=_eee ._gga ;_gda :=_eee ._adac /100.0;_ddff ,_eab :=_ege .GetRuneMetrics (' ');if !_eab {_ddff ,_eab =_ege .GetCharMetrics (32);};if !_eab {_ddff ,_ =_ab .DefaultFont ().GetRuneMetrics (' ');};_bfa :=_ddff .Wx *_eeg ;_df .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_bfa ,_daaa ,_ege ,_efbe );_cdfc :=_b .NewMatrix (_efbe *_gda ,0,0,_efbe ,0,_eee ._daa );if _fdec {_df .Log .Info 
("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_ggfb ),_ggfb ,_daaa );};_df .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_ggfb ),_ggfb ,len (_daaa ));_dac :=_bca .getFillColor ();_fcbg :=_bca .getStrokeColor ();for _gdec ,_dega :=range _daaa {_abf :=[]rune (_dega );if len (_abf )==1&&_abf [0]=='\x00'{continue ;};_eeaf :=_ggfb [_gdec ];_cffa :=_bca ._ebbb .CTM .Mult (_bca ._aff ).Mult (_cdfc );_ffa :=0.0;if len (_abf )==1&&_abf [0]==32{_ffa =_eee ._ebc ;};_fec ,_fbgb :=_ege .GetCharMetrics (_eeaf );if !_fbgb {_df .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_eeaf ,_abf ,_abf ,_ege );return _db .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_ege .String (),_eeaf );};_fdeb :=_b .Point {X :_fec .Wx *_eeg ,Y :_fec .Wy *_eeg };_dfeg :=_b .Point {X :(_fdeb .X *_efbe +_ffa )*_gda };_caca :=_b .Point {X :(_fdeb .X *_efbe +_eee ._bfef +_ffa )*_gda };if _fdec {_df .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_efbe ,_eee ._bfef ,_eee ._ebc ,_gda );_df .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f",_fdeb ,_dfeg ,_caca );};_bbda :=_egeb (_dfeg );_dba :=_egeb (_caca );_dbf :=_bca ._ebbb .CTM .Mult (_bca ._aff ).Mult 
(_bbda );if _fada {_df .Log .Info ("e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a"+"\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a"+
|
|
|
|
|
|
|
|
|
|
// String returns a description of `w`.
|
|
|
|
|
func (_gbgce *textWord )String ()string {return _db .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_gbgce ._aggfa ,_gbgce .PdfRectangle ,_gbgce ._bbcaa ,_gbgce ._bbdbfb );};func (_aad *wordBag )stratum (_eecdd int )[]*textWord {_affg :=_aad ._cgba [_eecdd ];_dagg :=make ([]*textWord ,len (_affg ));copy (_dagg ,_affg );return _dagg ;};func (_egcf *shapesState )quadraticTo (_fceb ,_fcab ,_febg ,_dbg float64 ){_egcf .addPoint (_febg ,_dbg )};func _fbgcf (_bdf []*textWord ,_ecdf int )[]*textWord {_fgdfb :=len (_bdf );copy (_bdf [_ecdf :],_bdf [_ecdf +1:]);return _bdf [:_fgdfb -1];};type rulingKind int ;func (_cgaa *textLine )appendWord (_ggdcc *textWord ){_cgaa ._fbae =append (_cgaa ._fbae ,_ggdcc );_cgaa .PdfRectangle =_ddac (_cgaa .PdfRectangle ,_ggdcc .PdfRectangle );if _ggdcc ._bbcaa > _cgaa ._bafec {_cgaa ._bafec =_ggdcc ._bbcaa ;};if _ggdcc ._aggfa > _cgaa ._adcf {_cgaa ._adcf =_ggdcc ._aggfa ;};};type ruling struct{_aead rulingKind ;_ffd float64 ;_egaa float64 ;_dcfe float64 ;};func (_dddb *textObject )getCurrentFont ()*_ab .PdfFont {var _cfde *_ab .PdfFont ;if !_dddb ._fgb .empty (){_cfde =_dddb ._fgb .top ()._feb ;};if _cfde ==nil {_df .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _ab .DefaultFont ();};return _cfde ;};func (_abeg *textObject )setFont (_cbdc string ,_dag float64 )error {if _abeg ==nil {return nil ;};_abeg ._fdag ._gga =_dag ;_ecg ,_cgdb :=_abeg .getFont (_cbdc );if _cgdb !=nil {return _cgdb ;};_abeg ._fdag ._feb =_ecg ;if _abeg ._fgb .empty (){_abeg ._fgb .push (_abeg ._fdag );}else {_abeg ._fgb .top ()._feb =_abeg ._fdag ._feb ;};return nil ;};func (_gedfd lineRuling )asRuling ()(*ruling ,bool ){_gbac :=ruling {_aead :_gedfd ._cdcb 
};switch _gedfd ._cdcb {case _acab :_gbac ._ffd =_gedfd .xMean ();_gbac ._egaa =_ge .Min (_gedfd ._afad .Y ,_gedfd ._baacb .Y );_gbac ._dcfe =_ge .Max (_gedfd ._afad .Y ,_gedfd ._baacb .Y );case _cbdac :_gbac ._ffd =_gedfd .yMean ();_gbac ._egaa =_ge .Min (_gedfd ._afad .X ,_gedfd ._baacb .X );_gbac ._dcfe =_ge .Max (_gedfd ._afad .X ,_gedfd ._baacb .X );default:_df .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_gedfd ._cdcb );return nil ,false ;};return &_gbac ,true ;};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `p`.
|
|
|
|
|
// String formats the paragraph as its rectangle, an optional "[WxH] " table
// prefix, its line count and its text shortened by _fagd(…, 50).
func (_acdbg *textPara) String() string {
	var tablePrefix string
	if table := _acdbg._accb; table != nil {
		tablePrefix = _db.Sprintf("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020", table._cfgec, table._bged)
	}
	return _db.Sprintf("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071", _acdbg.PdfRectangle, tablePrefix, len(_acdbg._efad), _fagd(_acdbg.text(), 50))
}

// fontEntry pairs a font with an int64 stamp.
type fontEntry struct {
	_cgbe *_ab.PdfFont
	_bfd  int64
}

// _bgd rounds `_aecg` to the nearest multiple of _gcg.
func _bgd(_aecg float64) float64 {
	steps := _ge.Round(_aecg / _gcg)
	return _gcg * steps
}

const _edge = 10

// maxDepth returns the bag's _accg value minus the lower y of its rectangle.
func (_bdde *wordBag) maxDepth() float64 {
	return _bdde._accg - _bdde.Lly
}

// firstReadingIndex starts from depth index `_aafd` and scans the depth band
// that begins at (_aafd+1)*_gdaf and extends by _degd times the font size of
// that index's first word. It returns the depth index whose first word
// compares lowest under _efdg, keeping `_aafd` unless a strictly lower one is
// found.
func (_geff *wordBag) firstReadingIndex(_aafd int) int {
	fontsize := _geff.firstWord(_aafd)._bbcaa
	minDepth := float64(_aafd+1) * _gdaf
	maxDepth := minDepth + _degd*fontsize
	best := _aafd
	for _, depthIdx := range _geff.depthBand(minDepth, maxDepth) {
		if _efdg(_geff.firstWord(depthIdx), _geff.firstWord(best)) < 0 {
			best = depthIdx
		}
	}
	return best
}

// _bbbcf groups the marks in `_dbbb`, in order, into words.
//
//   - A mark whose text satisfies _fecc ends the current word and is itself
//     discarded.
//   - A mark starts a new word when there is no current word, or when its gap
//     to the current word (_faec) or its depth difference, both relative to
//     the word's font size, fall outside the _defbd / _aaac / _gfdg limits.
//   - When _dgeg is set, a diacritic (per _decd) lying in the diacritic area
//     of its neighbouring mark is merged into the current word rather than
//     treated as a separate mark.
//
// A finished word is kept only if its computed text does not satisfy _fecc.
func _bbbcf(_dbbb []*textMark, _fbf _ab.PdfRectangle) []*textWord {
	var words []*textWord
	var current *textWord

	// flush finalises the word being built, if any, and resets `current`.
	flush := func() {
		if current == nil {
			return
		}
		text := current.computeText()
		if !_fecc(text) {
			current._bbdbfb = text
			words = append(words, current)
			if _gbbf {
				_df.Log.Info("\u0077o\u0072\u0064\u003d\u0025\u0073", current.String())
				for i, mark := range current._gbgg {
					_db.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, mark.String())
				}
			}
		}
		current = nil
	}

	for _, mark := range _dbbb {
		if _dgeg && current != nil && len(current._gbgg) > 0 {
			prev := current._gbgg[len(current._gbgg)-1]
			markDia, markIsDia := _decd(mark._afca)
			prevDia, prevIsDia := _decd(prev._afca)
			if markIsDia && !prevIsDia && prev.inDiacriticArea(mark) {
				// Diacritic follows its base mark.
				current.addDiacritic(markDia)
				continue
			}
			if prevIsDia && !markIsDia && mark.inDiacriticArea(prev) {
				// Diacritic came first: swap it out for the base mark, then
				// re-apply it.
				current._gbgg = current._gbgg[:len(current._gbgg)-1]
				current.appendMark(mark, _fbf)
				current.addDiacritic(prevDia)
				continue
			}
		}

		if _fecc(mark._afca) {
			flush()
			continue
		}
		if current == nil {
			current = _dbcf([]*textMark{mark}, _fbf)
			continue
		}

		fontsize := current._bbcaa
		depthGap := _ge.Abs(_cfbdc(_fbf, mark)-current._aggfa) / fontsize
		gap := _faec(mark, current) / fontsize
		if gap >= _defbd || !(-_aaac <= gap) || !(depthGap <= _gfdg) {
			flush()
			current = _dbcf([]*textMark{mark}, _fbf)
			continue
		}
		current.appendMark(mark, _fbf)
	}
	flush()
	return words
}

// moveText is a thin wrapper around moveLP.
func (_adece *textObject) moveText(_fegd, _cgb float64) {
	_adece.moveLP(_fegd, _cgb)
}

// moveLP concatenates a translation by (`_dacc`, `_debd`) onto the _dgg matrix
// and then copies the result into _aff.
func (_feeab *textObject) moveLP(_dacc, _debd float64) {
	shift := _b.NewMatrix(1, 0, 0, 1, _dacc, _debd)
	_feeab._dgg.Concat(shift)
	_feeab._aff = _feeab._dgg
}

// setWordSpacing stores `_bff` as the word spacing of the text state. It is a
// no-op on a nil receiver.
func (_bba *textObject) setWordSpacing(_bff float64) {
	if _bba != nil {
		_bba._fdag._ebc = _bff
	}
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
|
|
|
|
// and position, size information for each image.
|
|
|
|
|
// A set of options to control page image extraction can be passed in. The options
|
|
|
|
|
// parameter can be nil for the default options. By default, inline stencil masks
|
|
|
|
|
// are not extracted.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ExtractPageImages returns the images found in the extractor's content stream.
// `options` controls the extraction and is passed through to the extraction
// context as given. On failure the error from the content stream pass is
// returned and the result is nil.
func (e *Extractor) ExtractPageImages(options *ImageExtractOptions) (*PageImages, error) {
	ctx := &imageExtractContext{_bg: options}
	if err := ctx.extractContentStreamImages(e._ade, e._dd); err != nil {
		return nil, err
	}
	result := &PageImages{Images: ctx._bec}
	return result, nil
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct{
|
|
|
|
|
|
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
|
|
|
|
|
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// BBox is the bounding box of the text.
|
2020-09-14 09:32:45 +00:00
|
|
|
|
BBox _ab .PdfRectangle ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Font is the font the text was drawn with.
|
2020-09-14 09:32:45 +00:00
|
|
|
|
Font *_ab .PdfFont ;
|
2018-12-27 20:51:34 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64 ;
|
2018-11-28 18:06:03 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int ;
|
|
|
|
|
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool ;
|
|
|
|
|
|
|
|
|
|
// FillColor is the fill color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-09-21 01:20:10 +00:00
|
|
|
|
FillColor _cf .Color ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// StrokeColor is the stroke color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-09-21 01:20:10 +00:00
|
|
|
|
StrokeColor _cf .Color ;};func _ccac (_eccdb ,_cfef ,_bfca ,_babee *textPara )*textTable {_ffg :=&textTable {_cfgec :2,_bged :2,_acaf :make (map[uint64 ]*textPara ,4)};_ffg .put (0,0,_eccdb );_ffg .put (1,0,_cfef );_ffg .put (0,1,_bfca );_ffg .put (1,1,_babee );return _ffg ;};func (_egfc *shapesState )newSubPath (){_egfc .clearPath ();if _fcfd {_df .Log .Info ("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073",_egfc );};};func (_cadb *textLine )bbox ()_ab .PdfRectangle {return _cadb .PdfRectangle };func (_fgdc paraList )llyRange (_gaaa []int ,_efbg ,_dged float64 )[]int {_feaa :=len (_fgdc );if _dged < _fgdc [_gaaa [0]].Lly ||_efbg > _fgdc [_gaaa [_feaa -1]].Lly {return nil ;};_fdfa :=_f .Search (_feaa ,func (_gaed int )bool {return _fgdc [_gaaa [_gaed ]].Lly >=_efbg });_fdgfd :=_f .Search (_feaa ,func (_dbeae int )bool {return _fgdc [_gaaa [_dbeae ]].Lly > _dged });return _gaaa [_fdfa :_fdgfd ];};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// Text returns the extracted page text.
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Text returns the extracted page text.
func (pt PageText) Text() string {
	return pt._bdc
}

// lastpointEstablished returns a point and a flag. When the _adc flag is set it
// returns the stored point _dgd; otherwise, when the most recent subpath has
// its _bbc flag set, it returns that subpath's last point. In both of those
// cases the flag is false. When neither applies it returns the zero point and true.
// NOTE(review): the flag is false when a point IS available, which reads as
// inverted relative to the method name — confirm against callers before relying on it.
func (ss *shapesState) lastpointEstablished() (_b.Point, bool) {
	if ss._adc {
		return ss._dgd, false
	}
	if n := len(ss._gggc); n > 0 {
		if tail := ss._gggc[n-1]; tail._bbc {
			return tail.last(), false
		}
	}
	return _b.Point{}, true
}
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
|
|
|
|
// CharcodeBytesToUnicode.
|
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
|
|
|
|
// ExtractText extracts the text from the content streams and returns it as a
// string. It is ExtractTextWithStats with the two statistics results discarded.
func (e *Extractor) ExtractText() (string, error) {
	text, _, _, err := e.ExtractTextWithStats()
	return text, err
}

// The rulingKind values, numbered consecutively from zero.
const (
	_agga rulingKind = iota
	_cbdac
	_acab
)

// pullWord appends `word` to the line and removes it from `bag` at depth index `depthIdx`.
func (line *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
	line.appendWord(word)
	bag.removeWord(word, depthIdx)
}

// isAtom returns a 2x2 table made of the paragraph, its _gcced and _gagf
// neighbours and the paragraph reached through both of them, or nil when those
// four do not close up. Each of the three other paragraphs must be non-nil with
// _geda unset, and the _gagf of the _gcced neighbour must be the very same
// paragraph as the _gcced of the _gagf neighbour.
func (p *textPara) isAtom() *textTable {
	a, b := p._gcced, p._gagf
	if a == nil || a._geda || b == nil || b._geda {
		return nil
	}
	corner := a._gagf
	if corner == nil || corner._geda || corner != b._gcced {
		return nil
	}
	return _ccac(p, a, b, corner)
}

// sort orders the words of every depth bin in place, using _efdg as the comparison.
func (bag *wordBag) sort() {
	for _, bin := range bag._cgba {
		words := bin
		_f.Slice(words, func(i, j int) bool {
			return _efdg(words[i], words[j]) < 0
		})
	}
}

// _dbfa classifies the rectangle `r` by passing its width and height to _fbdgg.
func _dbfa(r _ab.PdfRectangle) rulingKind {
	return _fbdgg(r.Width(), r.Height())
}

// allWords returns the words of every depth bin in one slice. The bins are
// visited in map iteration order, so the result order is unspecified.
func (bag *wordBag) allWords() []*textWord {
	var all []*textWord
	for _, bin := range bag._cgba {
		all = append(all, bin...)
	}
	return all
}

// _fbc reports whether `a` divided by max(1, b) is below the _feab threshold.
func _fbc(a, b float64) bool {
	denom := _ge.Max(1.0, b)
	return a/denom < _feab
}

// pullWord moves `word` from `src` into the bag at depth index `depthIdx`. The
// bag's rectangle is combined with the word's through _ddac, and the bag's
// _ffed value is raised to the word's _bbcaa when that is larger.
func (bag *wordBag) pullWord(src *wordBag, word *textWord, depthIdx int) {
	bag.PdfRectangle = _ddac(bag.PdfRectangle, word.PdfRectangle)
	if size := word._bbcaa; size > bag._ffed {
		bag._ffed = size
	}
	bin := bag._cgba[depthIdx]
	bag._cgba[depthIdx] = append(bin, word)
	src.removeWord(word, depthIdx)
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
|
|
|
|
// These are tm: `start` < tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
|
|
|
|
// `start` and `end` are offsets in the extracted text.
|
|
|
|
|
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
|
|
|
|
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
2020-09-21 01:20:10 +00:00
|
|
|
|
func (_fede *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _fede ==nil {return nil ,_c .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_db .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );};_baf :=len (_fede ._ggfd );if _baf ==0{return _fede ,nil ;};if start < _fede ._ggfd [0].Offset {start =_fede ._ggfd [0].Offset ;};if end > _fede ._ggfd [_baf -1].Offset +1{end =_fede ._ggfd [_baf -1].Offset +1;};_gaga :=_f .Search (_baf ,func (_dgb int )bool {return _fede ._ggfd [_dgb ].Offset +len (_fede ._ggfd [_dgb ].Text )-1>=start });if !(0<=_gaga &&_gaga < _baf ){_gdef :=_db .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_gaga ,_baf ,_fede ._ggfd [0],_fede ._ggfd [_baf -1]);return nil ,_gdef ;};_gbc :=_f .Search (_baf ,func (_afcc int )bool {return _fede ._ggfd [_afcc ].Offset > end -1});if !(0<=_gbc &&_gbc < _baf ){_aebb :=_db .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_gbc ,_baf ,_fede ._ggfd [0],_fede ._ggfd [_baf -1]);return nil ,_aebb ;};if _gbc <=_gaga {return nil ,_db .Errorf 
("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_gaga ,_gbc );};return &TextMarkArray {_ggfd :_fede ._ggfd [_gaga :_gbc ]},nil ;};func _gagc (_dagf []*subpath )[]rulingList {_aeg (_dagf );var _eefb rulingList ;for _ ,_gcaa :=range _dagf {if len (_gcaa ._acddg )< 2{continue ;};_afcfg :=_gcaa ._acddg [0];for _ ,_cfgf :=range _gcaa ._acddg [1:]{if _ccga ,_addg :=_cbcd (_afcfg ,_cfgf );_addg {_eefb =append (_eefb ,_ccga );};_afcfg =_cfgf ;};};_eefb =_eefb .tidied ("\u0073t\u0072\u006f\u006b\u0065\u0073");return _eefb .toGrids ();};func (_gebfg *subpath )add (_bbgb ..._b .Point ){_gebfg ._acddg =append (_gebfg ._acddg ,_bbgb ...)};func (_eagd *shapesState )stroke (_ccdc *[]*subpath ){*_ccdc =append (*_ccdc ,_eagd ._gggc ...);if _geac {_df .Log .Info ("\u0053T\u0052\u004f\u004b\u0045\u003a\u0020\u0025\u0064\u0020\u0073\u0074r\u006f\u006b\u0065\u0073\u0020\u0073\u0073\u003d\u0025\u0073",len (*_ccdc ),_eagd );for _cfbc ,_edd :=range _eagd ._gggc {_db .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cfbc ,_edd );if _cfbc ==10{break ;};};};};func (_cfc *shapesState )moveTo (_degf ,_feda float64 ){_cfc ._adc =true ;_cfc ._dgd =_cfc .devicePoint (_degf ,_feda );if _fcfd {_df .Log .Info ("\u006d\u006fv\u0065\u0054\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0063\u0075\u0072\u0072\u0065\u006e\u0074\u003d%.\u0032\u0066",_degf ,_feda ,_cfc ._dgd );};};var (_beaf =map[rune ]string 
{0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:
|