mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
226 lines
93 KiB
Go
226 lines
93 KiB
Go
//
|
||
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
||
//
|
||
// This is a commercial product and requires a license to operate.
|
||
// A trial license can be obtained at https://unidoc.io
|
||
//
|
||
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
||
//
|
||
// Use of this source code is governed by the UniDoc End User License Agreement
|
||
// terms that can be accessed at https://unidoc.io/eula/
|
||
|
||
//
|
||
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
||
// Currently offers functionality for extracting textual content.
|
||
//
|
||
package extractor ;import (_b "bytes";_f "errors";_dbd "fmt";_fbc "github.com/unidoc/unipdf/v3/common";_ce "github.com/unidoc/unipdf/v3/common/license";_bg "github.com/unidoc/unipdf/v3/contentstream";_bb "github.com/unidoc/unipdf/v3/core";_eg "github.com/unidoc/unipdf/v3/internal/textencoding";_dbf "github.com/unidoc/unipdf/v3/internal/transform";_fef "github.com/unidoc/unipdf/v3/model";_ff "golang.org/x/text/unicode/norm";_e "golang.org/x/xerrors";_fe "image/color";_c "io";_fb "math";_db "sort";_da "strings";_cf "unicode";);func (_ccba *textObject )moveText (_dca ,_beed float64 ){_ccba .moveTo (_dca ,_beed )};func (_dge *wordBag )maxDepth ()float64 {return _dge ._bab -_dge .Lly };
|
||
|
||
// TextTable represents a table.
|
||
// Cells are ordered top-to-bottom, left-to-right.
|
||
// Cells[y] is the (0-offset) y'th row in the table.
|
||
// Cells[y][x] is the (0-offset) x'th column in the table.
|
||
type TextTable struct{W ,H int ;Cells [][]TableCell ;};func _ccbc (_fgac *wordBag ,_dbdg *textWord ,_ebbb float64 )bool {return _dbdg .Llx < _fgac .Urx +_ebbb &&_fgac .Llx -_ebbb < _dbdg .Urx ;};func (_fccc *stateStack )top ()*textState {if _fccc .empty (){return nil ;};return (*_fccc )[_fccc .size ()-1];};type textObject struct{_aed *Extractor ;_afac *_fef .PdfPageResources ;_cfge _bg .GraphicsState ;_ffe *textState ;_becb *stateStack ;_acg _dbf .Matrix ;_bgf _dbf .Matrix ;_aga []*textMark ;_bdb bool ;};
|
||
|
||
// TextMarkArray is a collection of TextMarks.
|
||
type TextMarkArray struct{_deffd []TextMark };type textResult struct{_bgga PageText ;_eef int ;_bagg int ;};func (_gfg *stateStack )empty ()bool {return len (*_gfg )==0};func (_dadf *wordBag )removeDuplicates (){for _ ,_fgde :=range _dadf .depthIndexes (){if len (_dadf ._edcf [_fgde ])==0{continue ;};_fagb :=_dadf ._edcf [_fgde ][0];_bcge :=_daa *_fagb ._afcg ;_gbeec :=_fagb ._fedfa ;for _ ,_bbgb :=range _dadf .depthBand (_gbeec ,_gbeec +_bcge ){_bdfg :=map[*textWord ]struct{}{};_aead :=_dadf ._edcf [_bbgb ];for _ ,_gfeg :=range _aead {if _gfeg !=_fagb &&_gfeg ._caag ==_fagb ._caag &&_fb .Abs (_gfeg .Llx -_fagb .Llx )< _bcge &&_fb .Abs (_gfeg .Urx -_fagb .Urx )< _bcge &&_fb .Abs (_gfeg .Lly -_fagb .Lly )< _bcge &&_fb .Abs (_gfeg .Ury -_fagb .Ury )< _bcge {_bdfg [_gfeg ]=struct{}{};};};if len (_bdfg )> 0{_gbb :=0;for _ ,_bbbge :=range _aead {if _ ,_ddc :=_bdfg [_bbbge ];!_ddc {_aead [_gbb ]=_bbbge ;_gbb ++;};};_dadf ._edcf [_bbgb ]=_aead [:len (_aead )-len (_bdfg )];if len (_dadf ._edcf [_bbgb ])==0{delete (_dadf ._edcf ,_bbgb );};};};};};func (_dafg *textObject )setTextRise (_baac float64 ){if _dafg ==nil {return ;};_dafg ._ffe ._afa =_baac ;};
|
||
|
||
// Elements returns the TextMarks in `ma`.
|
||
func (_ecfc *TextMarkArray )Elements ()[]TextMark {return _ecfc ._deffd };func _dabg (_edeb []*textWord ,_gagca int )[]*textWord {_baea :=len (_edeb );copy (_edeb [_gagca :],_edeb [_gagca +1:]);return _edeb [:_baea -1];};
|
||
|
||
// NewFromContents creates a new extractor from contents and page resources.
|
||
func NewFromContents (contents string ,resources *_fef .PdfPageResources )(*Extractor ,error ){_cd :=&Extractor {_g :contents ,_ffd :resources ,_a :map[string ]fontEntry {},_bbb :map[string ]textResult {}};return _cd ,nil ;};const _dfabf =10;var _ega =false ;
|
||
|
||
// String returns a description of `p`.
|
||
func (_fdfgb *textPara )String ()string {_adfe :="";if _fdfgb ._fcge !=nil {_adfe =_dbd .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_fdfgb ._fcge ._fbdef ,_fdfgb ._fcge ._ccebee );};return _dbd .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_fdfgb .PdfRectangle ,_adfe ,len (_fdfgb ._gfgb ),_degd (_fdfgb .text (),50));};func (_daeea paraList )log (_cead string ){if !_gbfb {return ;};_fbc .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_cead ,len (_daeea ));for _abdd ,_cacf :=range _daeea {if _cacf ==nil {continue ;};_ffdf :=_cacf .text ();_aagd :="\u0020\u0020";if _cacf ._fcge !=nil {_aagd =_dbd .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_cacf ._fcge ._fbdef ,_cacf ._fcge ._ccebee );};_dbd .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_abdd ,_cacf .PdfRectangle ,_aagd ,_degd (_ffdf ,50));};};
|
||
|
||
// Tables returns the tables extracted from the page.
|
||
func (_bdc PageText )Tables ()[]TextTable {return _bdc ._bggd };
|
||
|
||
// Text returns the extracted page text.
|
||
func (_cbc PageText )Text ()string {return _cbc ._ecea };func (_abgcg *textObject )getCurrentFont ()*_fef .PdfFont {var _cdea *_fef .PdfFont ;if !_abgcg ._becb .empty (){_cdea =_abgcg ._becb .top ()._ggb ;};if _cdea ==nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _fef .DefaultFont ();};return _cdea ;};type imageExtractContext struct{_ae []ImageMark ;_ea int ;_ec int ;_ffc int ;_bga map[*_bb .PdfObjectStream ]*cachedImage ;_ab *ImageExtractOptions ;};
|
||
|
||
// ImageExtractOptions contains options for controlling image extraction from
|
||
// PDF pages.
|
||
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func (_gbdd paraList )eventNeighbours (_bbcf []event )map[*textPara ][]int {_db .Slice (_bbcf ,func (_gfab ,_affe int )bool {_aede ,_fade :=_bbcf [_gfab ],_bbcf [_affe ];_cgbd ,_gfbf :=_aede ._cace ,_fade ._cace ;if _cgbd !=_gfbf {return _cgbd < _gfbf ;};if _aede ._faea !=_fade ._faea {return _aede ._faea ;};return _gfab < _affe ;});_cbbag :=map[int ]map[int ]struct{}{};_cfedd :=map[int ]struct{}{};for _ ,_fcfc :=range _bbcf {if _fcfc ._faea {_cbbag [_fcfc ._gggb ]=map[int ]struct{}{};for _beea :=range _cfedd {if _beea !=_fcfc ._gggb {_cbbag [_fcfc ._gggb ][_beea ]=struct{}{};_cbbag [_beea ][_fcfc ._gggb ]=struct{}{};};};_cfedd [_fcfc ._gggb ]=struct{}{};}else {delete (_cfedd ,_fcfc ._gggb );};};_dcdd :=map[*textPara ][]int {};for _geac ,_dafc :=range _cbbag {_gaf :=_gbdd [_geac ];_bebg :=make ([]int ,len (_dafc ));_dfcgg :=0;for _ffgd :=range _dafc {_bebg [_dfcgg ]=_ffgd ;_dfcgg ++;};_dcdd [_gaf ]=_bebg ;};return _dcdd ;};func (_faed *textTable )get (_bcgdf ,_adgag int )*textPara {return _faed ._cgca [_ceeg (_bcgdf ,_adgag )]};func (_ffg *textObject )setWordSpacing (_cgea float64 ){if _ffg ==nil {return ;};_ffg ._ffe ._bfb =_cgea ;};func (_fgcd *textLine )endsInHyphen ()bool {_fae :=_fgcd ._aecg [len (_fgcd ._aecg )-1];_abfd :=[]rune (_fae ._caag );if !_cf .Is (_cf .Hyphen ,_abfd [len (_abfd )-1]){return false ;};if _fae ._acac &&_gdce (_abfd ){return true ;};return _gdce ([]rune (_fgcd .text ()));};func (_gfda *textMark )inDiacriticArea (_cdeb *textMark )bool {_gaba :=_gfda .Llx -_cdeb .Llx ;_eafbc :=_gfda .Urx -_cdeb .Urx ;_eedb :=_gfda .Lly -_cdeb .Lly ;return _fb .Abs (_gaba +_eafbc )< _gfda .Width ()*_ddb &&_fb .Abs (_eedb )< _gfda .Height ()*_ddb ;};func (_caef paraList )llyOrdering ()[]int {_bbbf :=make ([]int ,len (_caef ));for _efcd :=range _caef {_bbbf [_efcd ]=_efcd ;};_db .SliceStable (_bbbf ,func (_bgbb ,_ceb int )bool {_bfg ,_cged :=_bbbf [_bgbb ],_bbbf [_ceb ];return _caef [_bfg ].Lly < _caef [_cged ].Lly ;});return _bbbf ;};func (_ecb *textObject )moveTo (_aee ,_gec float64 ){_ecb ._bgf .Concat (_dbf .NewMatrix (1,0,0,1,_aee ,_gec ));_ecb ._acg =_ecb ._bgf ;};const _ddg =20;
|
||
|
||
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
||
func (_cea *TextMarkArray )BBox ()(_fef .PdfRectangle ,bool ){var _gecd _fef .PdfRectangle ;_dgag :=false ;for _ ,_bbf :=range _cea ._deffd {if _bbf .Meta ||_bedf (_bbf .Text ){continue ;};if _dgag {_gecd =_eefdc (_gecd ,_bbf .BBox );}else {_gecd =_bbf .BBox ;_dgag =true ;};};return _gecd ,_dgag ;};func _bcgcf (_gefa _fef .PdfRectangle ,_bfee bounded )float64 {return _gefa .Ury -_bfee .bbox ().Lly };func (_gdg *wordBag )firstWord (_fbe int )*textWord {return _gdg ._edcf [_fbe ][0]};func _bgeba (_edee func (*wordBag ,*textWord ,float64 )bool ,_cfaec float64 )func (*wordBag ,*textWord )bool {return func (_ddga *wordBag ,_acdfb *textWord )bool {return _edee (_ddga ,_acdfb ,_cfaec )};};func (_afg *PageText )computeViews (){var _gefb paraList ;_aeeg :=len (_afg ._fdb );for _acd :=0;_acd < 360&&_aeeg > 0;_acd +=90{_gdfg :=make ([]*textMark ,0,len (_afg ._fdb )-_aeeg );for _ ,_beg :=range _afg ._fdb {if _beg ._bbbg ==_acd {_gdfg =append (_gdfg ,_beg );};};if len (_gdfg )> 0{_gdbc :=_bdga (_gdfg ,_afg ._cceb );_gefb =append (_gefb ,_gdbc ...);_aeeg -=len (_gdfg );};};_afe :=new (_b .Buffer );_gefb .writeText (_afe );_afg ._ecea =_afe .String ();_afg ._fdbc =_gefb .toTextMarks ();_afg ._bggd =_gefb .tables ();};func (_bddg *wordBag )stratum (_acdf int )[]*textWord {_cadee :=_bddg ._edcf [_acdf ];_cfae :=make ([]*textWord ,len (_cadee ));copy (_cfae ,_cadee );return _cfae ;};const (_ebfb =10;_fddeg =6;_aabf =0.5;_cede =0.11;_bda =0.19;_agea =0.04;_dabe =0.04;_dfc =1.0;_ebbf =0.04;_gffd =0.4;_fbde =0.7;_cbfb =1.0;_aeea =0.1;_cbba =1.4;_dbee =0.46;_daee =0.02;_daa =0.2;_ddb =0.5;_gfc =4;_dgge =4.0;_gagg =6;);func _bdgb (_dgd _dbf .Matrix )_dbf .Point {_aeb ,_egge :=_dgd .Translation ();return _dbf .Point {X :_aeb ,Y :_egge };};const TOL =1.0e-6;func (_ecfef *textTable )getDown ()paraList {_bcdc :=make (paraList ,_ecfef ._fbdef );for _bbef :=0;_bbef < _ecfef ._fbdef ;_bbef ++{_eaed :=_ecfef .get (_bbef ,_ecfef ._ccebee -1)._bfc ;if _eaed ==nil ||_eaed ._gcbc {return nil ;};_bcdc [_bbef ]=_eaed ;};for _adgg :=0;_adgg < _ecfef ._fbdef -1;_adgg ++{if _bcdc [_adgg ]._egcca !=_bcdc [_adgg +1]{return nil ;};};return _bcdc ;};func _fcda (_eabg []*textMark ,_cfbcb _fef .PdfRectangle )[]*textWord {var _bcgeg []*textWord ;var _cddg *textWord ;_cadb :=func (){if _cddg !=nil {_bcdd :=_cddg .computeText ();if !_bedf (_bcdd ){_cddg ._caag =_bcdd ;_bcgeg =append (_bcgeg ,_cddg );};_cddg =nil ;};};for _ ,_ecde :=range _eabg {if _gacf &&_cddg !=nil &&len (_cddg ._gbbd )> 0{_bacg :=_cddg ._gbbd [len (_cddg ._gbbd )-1];_ddee ,_caga :=_facg (_ecde ._gfeba );_bgfec ,_aefa :=_facg (_bacg ._gfeba );if _caga &&!_aefa &&_bacg .inDiacriticArea (_ecde ){_cddg .addDiacritic (_ddee );continue ;};if _aefa &&!_caga &&_ecde .inDiacriticArea (_bacg ){_cddg ._gbbd =_cddg ._gbbd [:len (_cddg ._gbbd )-1];_cddg .appendMark (_ecde ,_cfbcb );_cddg .addDiacritic (_bgfec );continue ;};};_egae :=_bedf (_ecde ._gfeba );if _egae {_cadb ();continue ;};if _cddg ==nil &&!_egae {_cddg =_geba ([]*textMark {_ecde },_cfbcb );continue ;};_fgcg :=_cddg ._afcg ;_bfaf :=_fb .Abs (_bcgcf (_cfbcb ,_ecde )-_cddg ._fedfa )/_fgcg ;_afba :=_fggd (_ecde ,_cddg )/_fgcg ;if _afba >=_cede ||!(-_bda <=_afba &&_bfaf <=_agea ){_cadb ();_cddg =_geba ([]*textMark {_ecde },_cfbcb );continue ;};_cddg .appendMark (_ecde ,_cfbcb );};_cadb ();return _bcgeg ;};func (_cdg *imageExtractContext )processOperand (_cc *_bg .ContentStreamOperation ,_bc _bg .GraphicsState ,_df *_fef .PdfPageResources )error {if _cc .Operand =="\u0042\u0049"&&len (_cc .Params )==1{_bba ,_ee :=_cc .Params [0].(*_bg .ContentStreamInlineImage );if !_ee {return nil ;};if _ac ,_bag :=_bb .GetBoolVal (_bba .ImageMask );_bag {if _ac &&!_cdg ._ab .IncludeInlineStencilMasks {return nil ;};};return _cdg .extractInlineImage (_bba ,_bc ,_df );}else if _cc .Operand =="\u0044\u006f"&&len (_cc .Params )==1{_dc ,_ed :=_bb .GetName (_cc .Params [0]);if !_ed {_fbc .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _be ;};_ ,_afd :=_df .GetXObjectByName (*_dc );switch _afd {case _fef .XObjectTypeImage :return _cdg .extractXObjectImage (_dc ,_bc ,_df );case _fef .XObjectTypeForm :return _cdg .extractFormImages (_dc ,_bc ,_df );};};return nil ;};
|
||
|
||
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
||
// Replace with a function like Extract() (*PageText, error)
|
||
func (_fdfe *Extractor )ExtractPageText ()(*PageText ,int ,int ,error ){_edcg ,_abe ,_gbe ,_gbea :=_fdfe .extractPageText (_fdfe ._g ,_fdfe ._ffd ,_dbf .IdentityMatrix (),0);if _gbea !=nil {return nil ,0,0,_gbea ;};_edcg .computeViews ();_gbea =_ccef (_edcg );if _gbea !=nil {return nil ,0,0,_gbea ;};return _edcg ,_abe ,_gbe ,nil ;};func (_gcad *textLine )bbox ()_fef .PdfRectangle {return _gcad .PdfRectangle };func _abc (_cff *_bg .ContentStreamOperation )(float64 ,error ){if len (_cff .Params )!=1{_aadg :=_f .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_cff .Operand ,1,len (_cff .Params ),_cff .Params );return 0.0,_aadg ;};return _bb .GetNumberAsFloat (_cff .Params [0]);};func (_ecgd paraList )applyTables (_adeab []*textTable )paraList {_cfec :=map[*textPara ]struct{}{};var _ecfa paraList ;for _ ,_ecd :=range _adeab {for _ ,_fdbe :=range _ecd ._cgca {_cfec [_fdbe ]=struct{}{};};_ecfa =append (_ecfa ,_ecd .newTablePara ());};for _ ,_aadc :=range _ecgd {if _ ,_ddbgc :=_cfec [_aadc ];!_ddbgc {_ecfa =append (_ecfa ,_aadc );};};return _ecfa ;};func (_aeag *textObject )newTextMark (_aebd string ,_bbag _dbf .Matrix ,_fgacc _dbf .Point ,_cgae float64 ,_bddb *_fef .PdfFont ,_dagf float64 ,_dfgc ,_edbf _fe .Color )(textMark ,bool ){_ddf :=_bbag .Angle ();_gdae :=_fcde (_ddf ,_ebfb );var _aabe float64 ;if _gdae %180!=90{_aabe =_bbag .ScalingFactorY ();}else {_aabe =_bbag .ScalingFactorX ();};_acee :=_bdgb (_bbag );_dceb :=_fef .PdfRectangle {Llx :_acee .X ,Lly :_acee .Y ,Urx :_fgacc .X ,Ury :_fgacc .Y };switch _gdae %360{case 90:_dceb .Urx -=_aabe ;case 180:_dceb .Ury -=_aabe ;case 270:_dceb .Urx +=_aabe ;case 0:_dceb .Ury +=_aabe ;default:_gdae =0;_dceb .Ury +=_aabe ;};if _dceb .Llx > _dceb .Urx {_dceb .Llx ,_dceb .Urx =_dceb .Urx ,_dceb .Llx ;};if _dceb .Lly > _dceb .Ury {_dceb .Lly ,_dceb .Ury =_dceb .Ury ,_dceb .Lly ;};_cffa ,_agga :=_egff (_dceb ,_aeag ._aed ._dd );if !_agga {_fbc .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_dceb ,_aeag ._aed ._dd ,_aebd );};_dceb =_cffa ;_ggf :=_dceb ;_bdfb :=_aeag ._aed ._dd ;switch _gdae %360{case 90:_bdfb .Urx ,_bdfb .Ury =_bdfb .Ury ,_bdfb .Urx ;_ggf =_fef .PdfRectangle {Llx :_bdfb .Urx -_dceb .Ury ,Urx :_bdfb .Urx -_dceb .Lly ,Lly :_dceb .Llx ,Ury :_dceb .Urx };case 180:_ggf =_fef .PdfRectangle {Llx :_bdfb .Urx -_dceb .Llx ,Urx :_bdfb .Urx -_dceb .Urx ,Lly :_bdfb .Ury -_dceb .Lly ,Ury :_bdfb .Ury -_dceb .Ury };case 270:_bdfb .Urx ,_bdfb .Ury =_bdfb .Ury ,_bdfb .Urx ;_ggf =_fef .PdfRectangle {Llx :_dceb .Ury ,Urx :_dceb .Lly ,Lly :_bdfb .Ury -_dceb .Llx ,Ury :_bdfb .Ury -_dceb .Urx };};if _ggf .Llx > _ggf .Urx {_ggf .Llx ,_ggf .Urx =_ggf .Urx ,_ggf .Llx ;};if _ggf .Lly > _ggf .Ury {_ggf .Lly ,_ggf .Ury =_ggf .Ury ,_ggf .Lly ;};_gdbb :=textMark {_gfeba :_aebd ,PdfRectangle :_ggf ,_gead :_dceb ,_eceg :_bddb ,_fff :_aabe ,_dcec :_dagf ,_dgdc :_bbag ,_aaab :_fgacc ,_bbbg :_gdae ,_ffeeg :_dfgc ,_dgabg :_edbf };if _dea {_fbc .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_acee ,_fgacc ,_gdbb .String ());};return _gdbb ,_agga ;};func (_gb *imageExtractContext )extractInlineImage (_ge *_bg .ContentStreamInlineImage ,_egab _bg .GraphicsState ,_bbaf *_fef .PdfPageResources )error {_ag ,_fg :=_ge .ToImage (_bbaf );if _fg !=nil {return _fg ;};_dfb ,_fg :=_ge .GetColorSpace (_bbaf );if _fg !=nil {return _fg ;};if _dfb ==nil {_dfb =_fef .NewPdfColorspaceDeviceGray ();};_fdd ,_fg :=_dfb .ImageToRGB (*_ag );if _fg !=nil {return _fg ;};_abg :=ImageMark {Image :&_fdd ,Width :_egab .CTM .ScalingFactorX (),Height :_egab .CTM .ScalingFactorY (),Angle :_egab .CTM .Angle ()};_abg .X ,_abg .Y =_egab .CTM .Translation ();_gb ._ae =append (_gb ._ae ,_abg );_gb ._ea ++;return nil ;};func (_adfg *textObject )setTextMatrix (_bfe []float64 ){if len (_bfe )!=6{_fbc .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_bfe ));return ;};_fbgg ,_dgb ,_cbeg ,_gcf ,_dde ,_eafe :=_bfe [0],_bfe [1],_bfe [2],_bfe [3],_bfe [4],_bfe [5];_adfg ._acg =_dbf .NewMatrix (_fbgg ,_dgb ,_cbeg ,_gcf ,_dde ,_eafe );_adfg ._bgf =_adfg ._acg ;};func (_aacc *wordBag )minDepth ()float64 {return _aacc ._bab -(_aacc .Ury -_aacc ._ceaf )};func _abbd (_efaf []TextMark ,_ecfe *int )[]TextMark {_bagdg :=_efaf [len (_efaf )-1];_dfag :=[]rune (_bagdg .Text );if len (_dfag )==1{_efaf =_efaf [:len (_efaf )-1];_cgba :=_efaf [len (_efaf )-1];*_ecfe =_cgba .Offset +len (_cgba .Text );}else {_baab :=_dcecb (_bagdg .Text );*_ecfe +=len (_baab )-len (_bagdg .Text );_bagdg .Text =_baab ;};return _efaf ;};func (_gfed *textObject )getFillColor ()_fe .Color {return _gdgc (_gfed ._cfge .ColorspaceNonStroking ,_gfed ._cfge .ColorNonStroking );};func _degd (_aefag string ,_aggac int )string {if len (_aefag )< _aggac {return _aefag ;};return _aefag [:_aggac ];};func (_fefb paraList )findTables ()[]*textTable {_fefb .addNeighbours ();_db .Slice (_fefb ,func (_gcfc ,_fedf int )bool {return _gdcb (_fefb [_gcfc ],_fefb [_fedf ])< 0});var _ccbec []*textTable ;for _ ,_gcfda :=range _fefb {if _gcfda ._gcbc {continue ;};_cfcb :=_gcfda .isAtom ();if _cfcb ==nil {continue ;};_cfcb .growTable ();if _cfcb ._fbdef *_cfcb ._ccebee < _gagg {continue ;};_cfcb .markCells ();_cfcb .log ("\u0067\u0072\u006fw\u006e");_ccbec =append (_ccbec ,_cfcb );};return _ccbec ;};func (_afgg *textTable )growTable (){_efaa :=func (_gcfa paraList ){_afgg ._ccebee ++;for _dgbe :=0;_dgbe < _afgg ._fbdef ;_dgbe ++{_affa :=_gcfa [_dgbe ];_afgg .put (_dgbe ,_afgg ._ccebee -1,_affa );};};_eegd :=func (_dcbe paraList ){_afgg ._fbdef ++;for _ecedb :=0;_ecedb < _afgg ._ccebee ;_ecedb ++{_cafc :=_dcbe [_ecedb ];_afgg .put (_afgg ._fbdef -1,_ecedb ,_cafc );};};for {_dgee :=false ;_cege :=_afgg .getDown ();_cbbc :=_afgg .getRight ();if _cege !=nil &&_cbbc !=nil {_bbfbc :=_cege [len (_cege )-1];if _bbfbc !=nil &&!_bbfbc ._gcbc &&_bbfbc ==_cbbc [len (_cbbc )-1]{_efaa (_cege );if _cbbc =_afgg .getRight ();_cbbc !=nil {_eegd (_cbbc );_afgg .put (_afgg ._fbdef -1,_afgg ._ccebee -1,_bbfbc );};_dgee =true ;};};if !_dgee &&_cege !=nil {_efaa (_cege );_dgee =true ;};if !_dgee &&_cbbc !=nil {_eegd (_cbbc );_dgee =true ;};if !_dgee {break ;};};};func (_dffecb *textPara )isAtom ()*textTable {_defe :=_dffecb ;_gfcd :=_dffecb ._egcca ;_fafee :=_dffecb ._bfc ;if !(_gfcd !=nil &&!_gfcd ._gcbc &&_fafee !=nil &&!_fafee ._gcbc ){return nil ;};_fcdee :=_gfcd ._bfc ;if !(_fcdee !=nil &&!_fcdee ._gcbc &&_fcdee ==_fafee ._egcca ){return nil ;};if _gfcd ._cfc !=_defe ||_fafee ._aafb !=_defe ||_fcdee ._cfc !=_fafee ||_fcdee ._aafb !=_gfcd {return nil ;};return _aadec (_defe ,_gfcd ,_fafee ,_fcdee );};func _aag (_ebcg _fef .PdfRectangle ,_adea []*textLine )*textPara {return &textPara {PdfRectangle :_ebcg ,_gfgb :_adea };};
|
||
|
||
// TableCell is a cell in a TextTable.
|
||
type TableCell struct{
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Marks returns the TextMarks corresponding to the text in Text.
|
||
Marks TextMarkArray ;};func _bbff (_egce ,_gagc bounded )float64 {return _aeeb (_egce )-_aeeb (_gagc )};func _ggd (_abge _dbf .Point )_dbf .Matrix {return _dbf .TranslationMatrix (_abge .X ,_abge .Y )};func (_bfcg *textTable )log (_bbeec string ){if !_efcb {return ;};_fbc .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0020 \u0020\u0020\u0020 \u00256\u002e\u0032\u0066",_bbeec ,_bfcg ._fbdef ,_bfcg ._ccebee ,_bfcg .PdfRectangle );for _bdgbf :=0;_bdgbf < _bfcg ._ccebee ;_bdgbf ++{for _edbg :=0;_edbg < _bfcg ._fbdef ;_edbg ++{_ddfa :=_bfcg .get (_edbg ,_bdgbf );_dbd .Printf ("\u00254\u0064 \u0025\u0032\u0064\u003a\u0020%\u0036\u002e2\u0066\u0020\u0025\u0071\u000a",_edbg ,_bdgbf ,_ddfa .PdfRectangle ,_degd (_ddfa .text (),50));};};};func (_gacgg *textWord )appendMark (_ebfg *textMark ,_dabec _fef .PdfRectangle ){_gacgg ._gbbd =append (_gacgg ._gbbd ,_ebfg );_gacgg .PdfRectangle =_eefdc (_gacgg .PdfRectangle ,_ebfg .PdfRectangle );if _ebfg ._fff > _gacgg ._afcg {_gacgg ._afcg =_ebfg ._fff ;};_gacgg ._fedfa =_dabec .Ury -_gacgg .PdfRectangle .Lly ;};
|
||
|
||
// String returns a string describing `ma`.
|
||
func (_aab TextMarkArray )String ()string {_afga :=len (_aab ._deffd );if _afga ==0{return "\u0045\u004d\u0050T\u0059";};_fec :=_aab ._deffd [0];_fgee :=_aab ._deffd [_afga -1];return _dbd .Sprintf ("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d",_afga ,_fec ,_fgee );};func _fcde (_cgfb float64 ,_cdge int )int {if _cdge ==0{_cdge =1;};_bbee :=float64 (_cdge );return int (_fb .Round (_cgfb /_bbee )*_bbee );};const (_fbcd =true ;_fbcc =true ;_gacf =true ;_fgbe =false ;);func (_beedf paraList )sortReadingOrder (){_fbc .Log .Trace ("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_beedf ));if len (_beedf )<=1{return ;};_beedf .computeEBBoxes ();_db .Slice (_beedf ,func (_cdc ,_dbdf int )bool {return _cgg (_beedf [_cdc ],_beedf [_dbdf ])<=0});_cgaf :=_beedf .topoOrder ();_beedf .reorder (_cgaf );};func (_afc paraList )topoOrder ()[]int {if _gbfb {_fbc .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_bggda :=len (_afc );_bed :=make ([]bool ,_bggda );_dcebd :=make ([]int ,0,_bggda );_cgdf :=_afc .llyOrdering ();var _afgd func (_fee int );_afgd =func (_aadd int ){_bed [_aadd ]=true ;for _fgec :=0;_fgec < _bggda ;_fgec ++{if !_bed [_fgec ]{if _afc .readBefore (_cgdf ,_aadd ,_fgec ){_afgd (_fgec );};};};_dcebd =append (_dcebd ,_aadd );};for _eace :=0;_eace < _bggda ;_eace ++{if !_bed [_eace ]{_afgd (_eace );};};return _gabd (_dcebd );};func _cgg (_gagcb ,_fgfd bounded )float64 {_cbbb :=_bbff (_gagcb ,_fgfd );if !_bcgf (_cbbb ){return _cbbb ;};return _cegg (_gagcb ,_fgfd );};func (_cbgg *textObject )setFont (_gef string ,_acb float64 )error {if _cbgg ==nil {return nil ;};_cbgg ._ffe ._fca =_acb ;_dbbb ,_ddgf :=_cbgg .getFont (_gef );if _ddgf !=nil {return _ddgf ;};_cbgg ._ffe ._ggb =_dbbb ;if _cbgg ._becb .empty (){_cbgg ._becb .push (_cbgg ._ffe );}else {_cbgg ._becb .top ()._ggb =_cbgg ._ffe ._ggb ;};return nil ;};
|
||
|
||
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
||
func (_ecg PageText )Marks ()*TextMarkArray {return &TextMarkArray {_deffd :_ecg ._fdbc }};func (_gbee *wordBag )text ()string {_beb :=_gbee .allWords ();_caeb :=make ([]string ,len (_beb ));for _facc ,_afbc :=range _beb {_caeb [_facc ]=_afbc ._caag ;};return _da .Join (_caeb ,"\u0020");};func (_acdfg *textTable )toTextTable ()TextTable {_ebfc :=make ([][]TableCell ,_acdfg ._ccebee );for _dedg :=0;_dedg < _acdfg ._ccebee ;_dedg ++{_ebfc [_dedg ]=make ([]TableCell ,_acdfg ._fbdef );for _befc :=0;_befc < _acdfg ._fbdef ;_befc ++{_ggee :=_acdfg .get (_befc ,_dedg );_ebfc [_dedg ][_befc ].Text =_ggee .text ();_aaff :=0;_ebfc [_dedg ][_befc ].Marks ._deffd =_ggee .toTextMarks (&_aaff );};};return TextTable {W :_acdfg ._fbdef ,H :_acdfg ._ccebee ,Cells :_ebfc };};type textState struct{_ccf float64 ;_bfb float64 ;_fggg float64 ;_decc float64 ;_fca float64 ;_dbe RenderMode ;_afa float64 ;_ggb *_fef .PdfFont ;_ada _fef .PdfRectangle ;_gba int ;_dga int ;};func (_eced *textObject )showTextAdjusted (_egb *_bb .PdfObjectArray )error {_gfa :=false ;for _ ,_gfeb :=range _egb .Elements (){switch _gfeb .(type ){case *_bb .PdfObjectFloat ,*_bb .PdfObjectInteger :_fgf ,_fbcb :=_bb .GetNumberAsFloat (_gfeb );if _fbcb !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfeb ,_egb );return _fbcb ;};_gac ,_aef :=-_fgf *0.001*_eced ._ffe ._fca ,0.0;if _gfa {_aef ,_gac =_gac ,_aef ;};_bac :=_ggd (_dbf .Point {X :_gac ,Y :_aef });_eced ._acg .Concat (_bac );case *_bb .PdfObjectString :_egbg ,_abgc :=_bb .GetStringBytes (_gfeb );if !_abgc {_fbc .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfeb ,_egb );return _bb .ErrTypeError ;};_eced .renderText (_egbg );default:_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfeb ,_egb );return _bb .ErrTypeError ;};};return nil ;};func _aadec (_gcbd ,_dbda ,_agbaa ,_dbefc *textPara )*textTable {_bcggg :=&textTable {_fbdef :2,_ccebee :2,_cgca :map[uint64 ]*textPara {}};_bcggg .put (0,0,_gcbd );_bcggg .put (1,0,_dbda );_bcggg .put (0,1,_agbaa );_bcggg .put (1,1,_dbefc );return _bcggg ;};func (_dfebb *textPara )depth ()float64 {if len (_dfebb ._gfgb )> 0{return _dfebb ._gfgb [0]._bcf ;};return _dfebb ._fcge .get (0,0).depth ();};func _gabd (_faga []int )[]int {_daab :=make ([]int ,len (_faga ));for _ade ,_ddgfe :=range _faga {_daab [len (_faga )-1-_ade ]=_ddgfe ;};return _daab ;};func (_aad *textObject )nextLine (){_aad .moveTo (0,-_aad ._ffe ._decc )};func _egff (_cgde ,_cadeea _fef .PdfRectangle )(_fef .PdfRectangle ,bool ){if !_aeee (_cgde ,_cadeea ){return _fef .PdfRectangle {},false ;};return _fef .PdfRectangle {Llx :_fb .Max (_cgde .Llx ,_cadeea .Llx ),Urx :_fb .Min (_cgde .Urx ,_cadeea .Urx ),Lly :_fb .Max (_cgde .Lly ,_cadeea .Lly ),Ury :_fb .Min (_cgde .Ury ,_cadeea .Ury )},true ;};
|
||
|
||
// ExtractPageImages returns the image contents of the page extractor, including data
|
||
// and position, size information for each image.
|
||
// A set of options to control page image extraction can be passed in. The options
|
||
// parameter can be nil for the default options. By default, inline stencil masks
|
||
// are not extracted.
|
||
func (_fd *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_dda :=&imageExtractContext {_ab :options };_ba :=_dda .extractContentStreamImages (_fd ._g ,_fd ._ffd );if _ba !=nil {return nil ,_ba ;};return &PageImages {Images :_dda ._ae },nil ;};const (_dfge =false ;_dea =false ;_gbfb =false ;_cgege =false ;_ecbb =_cgege &&false ;_fefcg =_ecbb &&false ;_efcb =false ;);func (_egceb *textMark )bbox ()_fef .PdfRectangle {return _egceb .PdfRectangle };func (_ccbb *textPara )toCellTextMarks (_aade *int )[]TextMark {var _cdedc []TextMark ;for _bea ,_facd :=range _ccbb ._gfgb {_agdg :=_facd .toTextMarks (_aade );_ccd :=_fbcd &&_facd .endsInHyphen ()&&_bea !=len (_ccbb ._gfgb )-1;if _ccd {_agdg =_abbd (_agdg ,_aade );};_cdedc =append (_cdedc ,_agdg ...);if !(_ccd ||_bea ==len (_ccbb ._gfgb )-1){_cdedc =_gccb (_cdedc ,_aade ,_dcag (_facd ._bcf ,_ccbb ._gfgb [_bea +1]._bcf ));};};return _cdedc ;};func _eafa (_dfeg []*textWord ,_ffa *textWord )[]*textWord {for _dfba ,_ceege :=range _dfeg {if _ceege ==_ffa {return _dabg (_dfeg ,_dfba );};};_fbc .Log .Error ("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_ffa );return nil ;};func _gaac (_bgbg ,_ecgf int )int {if _bgbg > _ecgf {return _bgbg ;};return _ecgf ;};func _bgca (_fdfg ,_bfeee _fef .PdfRectangle )bool {return _fdfg .Llx <=_bfeee .Llx &&_bfeee .Urx <=_fdfg .Urx &&_fdfg .Lly <=_bfeee .Lly &&_bfeee .Ury <=_fdfg .Ury ;};
|
||
|
||
// String returns a description of `t`.
|
||
func (_bege *textTable )String ()string {return _dbd .Sprintf ("\u0025d\u0020\u0078\u0020\u0025\u0064",_bege ._fbdef ,_bege ._ccebee );};func (_afef paraList )xNeighbours ()map[*textPara ][]int {_fafec :=make ([]event ,2*len (_afef ));for _eggd ,_fdca :=range _afef {_fafec [2*_eggd ]=event {_fdca .Llx ,true ,_eggd };_fafec [2*_eggd +1]=event {_fdca .Urx ,false ,_eggd };};return _afef .eventNeighbours (_fafec );};type bounded interface{bbox ()_fef .PdfRectangle };func _caae (_bdba []*wordBag )[]*wordBag {if len (_bdba )<=1{return _bdba ;};if _dfge {_fbc .Log .Info ("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a");};_db .Slice (_bdba ,func (_adgb ,_fbad int )bool {_aaeb ,_ccebe :=_bdba [_adgb ],_bdba [_fbad ];_egcd :=_aaeb .Width ()*_aaeb .Height ();_egbgg :=_ccebe .Width ()*_ccebe .Height ();if _egcd !=_egbgg {return _egcd > _egbgg ;};if _aaeb .Height ()!=_ccebe .Height (){return _aaeb .Height ()> _ccebe .Height ();};return _adgb < _fbad ;});var _eaad []*wordBag ;_cbcb :=map[int ]struct{}{};for _ccc :=0;_ccc < len (_bdba );_ccc ++{if _ ,_cabf :=_cbcb [_ccc ];_cabf {continue ;};_defd :=_bdba [_ccc ];for _fgeb :=_ccc +1;_fgeb < len (_bdba );_fgeb ++{if _ ,_adgbd :=_cbcb [_ccc ];_adgbd {continue ;};_fbee :=_bdba [_fgeb ];_eea :=_defd .PdfRectangle ;_eea .Llx -=_defd ._ceaf ;if _bgca (_eea ,_fbee .PdfRectangle ){_defd .absorb (_fbee );_cbcb [_fgeb ]=struct{}{};};};_eaad =append (_eaad ,_defd );};if len (_bdba )!=len (_eaad )+len (_cbcb ){_fbc .Log .Error ("\u006d\u0065\u0072\u0067\u0065\u0057o\u0072\u0064\u0042\u0061\u0067\u0073\u003a\u0020\u0025\u0064\u002d\u003e\u0025d\u0020\u0061\u0062\u0073\u006f\u0072\u0062e\u0064\u003d\u0025\u0064",len (_bdba ),len (_eaad ),len (_cbcb ));};return _eaad ;};func _fggd (_ceee ,_gdgg bounded )float64 {return _ceee .bbox ().Llx -_gdgg .bbox ().Urx };func (_cgda *stateStack )size ()int {return len (*_cgda )};func (_gfbe *textTable )newTablePara ()*textPara {_gagce :=_gfbe .computeBbox ();return &textPara {PdfRectangle :_gagce ,_bdgeb :_gagce ,_fcge :_gfbe };};func (_fgefc *wordBag )depthRange (_ggcf ,_aeced int )[]int {_aadf :=_fgefc .depthIndexes ();var _fcdd []int ;for _ ,_gecg :=range _aadf {if _ggcf <=_gecg &&_gecg <=_aeced {_fcdd =append (_fcdd ,_gecg );};};return _fcdd ;};func (_cbfg *textPara )bbox ()_fef .PdfRectangle {return _cbfg .PdfRectangle };
|
||
|
||
// String returns a description of `b`.
|
||
func (_cga *wordBag )String ()string {var _dafd []string ;for _ ,_aba :=range _cga .depthIndexes (){_cded ,_ :=_cga ._edcf [_aba ];for _ ,_dffe :=range _cded {_dafd =append (_dafd ,_dffe ._caag );};};return _dbd .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_cga .PdfRectangle ,_cga ._ceaf ,len (_dafd ),_dafd );};func (_ddea *wordBag )removeWord (_fcdb *textWord ,_fdba int ){_gag :=_eafa (_ddea .stratum (_fdba ),_fcdb );if len (_gag )==0{delete (_ddea ._edcf ,_fdba );}else {_ddea ._edcf [_fdba ]=_gag ;};};func _aeee (_fbgb ,_bgge _fef .PdfRectangle )bool {return _aedf (_fbgb ,_bgge )&&_edcfd (_fbgb ,_bgge )};func _ccef (_deaed *PageText )error {_cadd :=_ce .GetLicenseKey ();if _cadd !=nil &&_cadd .IsLicensed ()||_ega {return nil ;};_dbd .Printf ("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a");_dbd .Println ("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f");return _f .New ("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064");};func _aeeb (_fdbaf bounded )float64 {return -_fdbaf .bbox ().Lly };type wordBag struct{_fef .PdfRectangle ;_ceaf float64 ;_bab float64 ;_edcf map[int ][]*textWord ;};
|
||
|
||
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
||
// All coordinates are in device coordinates.
|
||
type ImageMark struct{Image *_fef .Image ;
|
||
|
||
// Dimensions of the image as displayed in the PDF.
|
||
Width float64 ;Height float64 ;
|
||
|
||
// Position of the image in PDF coordinates (lower left corner).
|
||
X float64 ;Y float64 ;
|
||
|
||
// Angle in degrees, if rotated.
|
||
Angle float64 ;};
|
||
|
||
// PageImages represents extracted images on a PDF page with spatial information:
|
||
// display position and size.
|
||
type PageImages struct{Images []ImageMark ;};func (_afdd *textPara )toTextMarks (_becae *int )[]TextMark {if _afdd ._fcge ==nil {return _afdd .toCellTextMarks (_becae );};var _bdgc []TextMark ;for _efbg :=0;_efbg < _afdd ._fcge ._ccebee ;_efbg ++{for _dfgf :=0;_dfgf < _afdd ._fcge ._fbdef ;_dfgf ++{_agbg :=_afdd ._fcge .get (_dfgf ,_efbg );if _agbg ==nil {_bdgc =_gccb (_bdgc ,_becae ,"\u0009");}else {_abfa :=_agbg .toCellTextMarks (_becae );_bdgc =append (_bdgc ,_abfa ...);};_bdgc =_gccb (_bdgc ,_becae ,"\u0020");};if _efbg < _afdd ._fcge ._ccebee -1{_bdgc =_gccb (_bdgc ,_becae ,"\u000a");};};return _bdgc ;};func (_cbec *textTable )bbox ()_fef .PdfRectangle {return _cbec .PdfRectangle };func (_dafgc *textLine )appendWord (_gdcd *textWord ){_dafgc ._aecg =append (_dafgc ._aecg ,_gdcd );_dafgc .PdfRectangle =_eefdc (_dafgc .PdfRectangle ,_gdcd .PdfRectangle );if _gdcd ._afcg > _dafgc ._aaa {_dafgc ._aaa =_gdcd ._afcg ;};if _gdcd ._fedfa > _dafgc ._bcf {_dafgc ._bcf =_gdcd ._fedfa ;};};
|
||
|
||
// String returns a string describing `tm`.
|
||
func (_bcd TextMark )String ()string {_aggf :=_bcd .BBox ;var _ggc string ;if _bcd .Font !=nil {_ggc =_bcd .Font .String ();if len (_ggc )> 50{_ggc =_ggc [:50]+"\u002e\u002e\u002e";};};var _bbbe string ;if _bcd .Meta {_bbbe ="\u0020\u002a\u004d\u002a";};return _dbd .Sprintf ("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",_bcd .Offset ,_bcd .Text ,[]rune (_bcd .Text ),_aggf .Llx ,_aggf .Lly ,_aggf .Urx ,_aggf .Ury ,_ggc ,_bbbe );};
|
||
|
||
// String returns a description of `l`.
|
||
func (_dce *textLine )String ()string {return _dbd .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_dce ._bcf ,_dce .PdfRectangle ,_dce ._aaa ,_dce .text ());};func (_eefb paraList )writeText (_becc _c .Writer ){for _dbbed ,_cdga :=range _eefb {_cdga .writeText (_becc );if _dbbed !=len (_eefb )-1{if _dddb (_cdga ,_eefb [_dbbed +1]){_becc .Write ([]byte ("\u0020"));}else {_becc .Write ([]byte ("\u000a"));_becc .Write ([]byte ("\u000a"));};};};_becc .Write ([]byte ("\u000a"));_becc .Write ([]byte ("\u000a"));};
|
||
|
||
// String returns a description of `state`.
|
||
func (_gbag *textState )String ()string {_aac :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _gbag ._ggb !=nil {_aac =_gbag ._ggb .BaseFont ();};return _dbd .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_gbag ._ccf ,_gbag ._bfb ,_gbag ._fca ,_aac );};var (_adeb =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};);func _ffcb (_eaea *textWord ,_cfa float64 )*wordBag {_dege :=_bbaa (_eaea ._fedfa );_dbbe :=[]*textWord {_eaea };_egaa :=wordBag {_edcf :map[int ][]*textWord {_dege :_dbbe },PdfRectangle :_eaea .PdfRectangle ,_ceaf :_eaea ._afcg ,_bab :_cfa };return &_egaa ;};func _edcb (_gecf []TextMark ,_gaec *int ,_cac TextMark )[]TextMark {_cac .Offset =*_gaec ;_gecf =append (_gecf ,_cac );*_gaec +=len (_cac .Text );return _gecf ;};func _ebdg (_ccaa ,_aaef *textPara )bool {return _aedf (_ccaa ._bdgeb ,_aaef ._bdgeb )};func (_bbc *textObject )getStrokeColor ()_fe .Color {return _gdgc (_bbc ._cfge .ColorspaceStroking ,_bbc ._cfge .ColorStroking );};func _bedf (_deae string )bool {for _ ,_ged :=range _deae {if !_cf .IsSpace (_ged ){return false ;};};return true ;};func _bdga (_fbdc []*textMark ,_aeaef _fef .PdfRectangle )paraList {_fbc .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_fbdc ),_aeaef );if len (_fbdc )==0{return nil ;};_aedg :=_fcda (_fbdc ,_aeaef );if len (_aedg )==0{return nil ;};_acea :=_ebb (_aedg ,_aeaef .Ury );_eaee :=_eecd (_acea ,_aeaef .Ury );_eaee =_caae (_eaee );_fbb :=make (paraList ,0,len (_eaee ));for _ ,_cfbf :=range _eaee {_egee :=_cfbf .arrangeText ();if _egee !=nil {_fbb =append (_fbb ,_egee );};};if len (_fbb )>=_gagg {_fbb =_fbb .extractTables ();};_fbb .sortReadingOrder ();_fbb .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _fbb ;};func (_fafe *stateStack )pop ()*textState {if _fafe .empty (){return nil ;};_gdb :=*(*_fafe )[len (*_fafe )-1];*_fafe =(*_fafe )[:len (*_fafe )-1];return &_gdb ;};func (_eagc *stateStack )push (_bagc *textState ){_aaf :=*_bagc ;*_eagc =append (*_eagc ,&_aaf )};func (_badb *textObject )getFont (_bbfb string )(*_fef .PdfFont ,error ){if _badb ._aed ._a !=nil {_badb ._aed ._fc ++;_gge ,_dag :=_badb ._aed ._a [_bbfb ];if _dag {_gge ._dbfe =_badb ._aed ._fc ;return _gge ._efeg ,nil ;};};_ggdb ,_aedb :=_badb .getFontDirect (_bbfb );if _aedb !=nil {return nil ,_aedb ;};if _badb ._aed ._a !=nil {_gcg :=fontEntry {_ggdb ,_badb ._aed ._fc };if len (_badb ._aed ._a )>=_dfabf {var _bgeg []string ;for _gegc :=range _badb ._aed ._a {_bgeg =append (_bgeg ,_gegc );};_db .Slice (_bgeg ,func (_ffee ,_ede int )bool {return _badb ._aed ._a [_bgeg [_ffee ]]._dbfe < _badb ._aed ._a [_bgeg [_ede ]]._dbfe ;});delete (_badb ._aed ._a ,_bgeg [0]);};_badb ._aed ._a [_bbfb ]=_gcg ;};return _ggdb ,nil ;};func (_deb *textObject )setTextRenderMode (_aae int ){if _deb ==nil {return ;};_deb ._ffe ._dbe =RenderMode (_aae );};func (_dedb *textWord )absorb (_bbcfg *textWord ){_dedb .PdfRectangle =_eefdc (_dedb .PdfRectangle ,_bbcfg .PdfRectangle );_dedb ._gbbd =append (_dedb ._gbbd ,_bbcfg ._gbbd ...);};func _gdcb (_cbcg ,_eabd bounded )float64 {_agaa :=_cegg (_cbcg ,_eabd );if !_bcgf (_agaa ){return _agaa ;};return _bbff (_cbcg ,_eabd );};func (_fbbg paraList )computeEBBoxes (){if _dfge {_fbc .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_afbd :=range _fbbg {_afbd ._bdgeb =_afbd .PdfRectangle ;};_fedc :=_fbbg .yNeighbours ();for _febb ,_efefc :=range _fbbg {_afbb :=_efefc ._bdgeb ;_ddfc ,_baeb :=-1.0e9,+1.0e9;for _ ,_dfbe :=range _fedc [_efefc ]{_cadef :=_fbbg [_dfbe ]._bdgeb ;if _cadef .Urx < _afbb .Llx {_ddfc =_fb .Max (_ddfc ,_cadef .Urx );}else if _afbb .Urx < _cadef .Llx {_baeb =_fb .Min (_baeb ,_cadef .Llx );};};for _ebbg ,_gdceg :=range _fbbg {_feca :=_gdceg ._bdgeb ;if _febb ==_ebbg ||_feca .Ury > _afbb .Lly {continue ;};if _ddfc <=_feca .Llx &&_feca .Llx < _afbb .Llx {_afbb .Llx =_feca .Llx ;}else if _feca .Urx <=_baeb &&_afbb .Urx < _feca .Urx {_afbb .Urx =_feca .Urx ;};};if _dfge {_dbd .Printf ("%\u0034\u0064\u003a\u0020%6\u002e2\u0066\u002d\u003e\u0025\u0036.\u0032\u0066\u0020\u0025\u0071\u000a",_febb ,_efefc ._bdgeb ,_afbb ,_degd (_efefc .text (),50));};_efefc ._bdgeb =_afbb ;};if _fgbe {for _ ,_ebaf :=range _fbbg {_ebaf .PdfRectangle =_ebaf ._bdgeb ;};};};func (_egcc *wordBag )highestWord (_edf int ,_bagd ,_caeg float64 )*textWord {for _ ,_cgf :=range _egcc ._edcf [_edf ]{if _bagd <=_cgf ._fedfa &&_cgf ._fedfa <=_caeg {return _cgf ;};};return nil ;};func (_dgcg paraList )yNeighbours ()map[*textPara ][]int {_dbdab :=make ([]event ,2*len (_dgcg ));for _gcgf ,_febc :=range _dgcg {_dbdab [2*_gcgf ]=event {_febc .Lly ,true ,_gcgf };_dbdab [2*_gcgf +1]=event {_febc .Ury ,false ,_gcgf };};return _dgcg .eventNeighbours (_dbdab );};func (_eeae *textPara )writeCellText (_ddeg _c .Writer ){for _abdb ,_faee :=range _eeae ._gfgb {_gfff :=_faee .text ();_edcfdb :=_fbcd &&_faee .endsInHyphen ()&&_abdb !=len (_eeae ._gfgb )-1;if _edcfdb {_gfff =_dcecb (_gfff );};_ddeg .Write ([]byte (_gfff ));if !(_edcfdb ||_abdb ==len (_eeae ._gfgb )-1){_ddeg .Write ([]byte (_dcag (_faee ._bcf ,_eeae ._gfgb [_abdb +1]._bcf )));};};};func (_gg *imageExtractContext )extractFormImages (_cdfa *_bb .PdfObjectName ,_cgc _bg .GraphicsState ,_baa *_fef .PdfPageResources )error {_bdg ,_bcg :=_baa .GetXObjectFormByName (*_cdfa );if _bcg !=nil {return _bcg ;};if _bdg ==nil {return nil ;};_fcg ,_bcg :=_bdg .GetContentStream ();if _bcg !=nil {return _bcg ;};_cfd :=_bdg .Resources ;if _cfd ==nil {_cfd =_baa ;};_bcg =_gg .extractContentStreamImages (string (_fcg ),_cfd );if _bcg !=nil {return _bcg ;};_gg ._ffc ++;return nil ;};
|
||
|
||
// ApplyArea processes the page text only within the specified area `bbox`.
|
||
// Each time ApplyArea is called, it updates the result set in `pt`.
|
||
// Can be called multiple times in a row with different bounding boxes.
|
||
func (_cdd *PageText )ApplyArea (bbox _fef .PdfRectangle ){_gcfd :=make ([]*textMark ,0,len (_cdd ._fdb ));for _ ,_eafb :=range _cdd ._fdb {if _aeee (_eafb .bbox (),bbox ){_gcfd =append (_gcfd ,_eafb );};};var _fdfb paraList ;_dfab :=len (_gcfd );for _fddf :=0;_fddf < 360&&_dfab > 0;_fddf +=90{_cade :=make ([]*textMark ,0,len (_gcfd )-_dfab );for _ ,_gdfe :=range _gcfd {if _gdfe ._bbbg ==_fddf {_cade =append (_cade ,_gdfe );};};if len (_cade )> 0{_efde :=_bdga (_cade ,_cdd ._cceb );_fdfb =append (_fdfb ,_efde ...);_dfab -=len (_cade );};};_fcgc :=new (_b .Buffer );_fdfb .writeText (_fcgc );_cdd ._ecea =_fcgc .String ();_cdd ._fdbc =_fdfb .toTextMarks ();_cdd ._bggd =_fdfb .tables ();};func _gdce (_dcc []rune )bool {return len (_dcc )>=_gfc &&_cf .Is (_cf .Hyphen ,_dcc [len (_dcc )-1])&&!_cf .IsSpace (_dcc [len (_dcc )-2]);};func (_aaee paraList )addNeighbours (){_bdcf :=_aaee .yNeighbours ();for _ ,_eacc :=range _aaee {var _gegd *textPara ;_adbd :=false ;for _ ,_eabf :=range _bdcf [_eacc ]{_deab :=_aaee [_eabf ];if _deab .Urx <=_eacc .Llx {if _gegd ==nil {_gegd =_deab ;}else {if _deab .Llx > _gegd .Llx {_gegd =_deab ;_adbd =false ;}else if _deab .Llx ==_gegd .Llx {_adbd =true ;};};};};if !_adbd {_eacc ._cfc =_gegd ;};};for _ ,_efbgc :=range _aaee {var _gcfbe *textPara ;_efacc :=false ;for _ ,_bcgggg :=range _bdcf [_efbgc ]{_ddaa :=_aaee [_bcgggg ];if _ddaa .Llx >=_efbgc .Urx {if _gcfbe ==nil {_gcfbe =_ddaa ;}else {if _ddaa .Llx < _gcfbe .Llx {_gcfbe =_ddaa ;_efacc =false ;}else if _ddaa .Llx ==_gcfbe .Llx {_efacc =true ;};};};};if !_efacc {_efbgc ._egcca =_gcfbe ;};};_bdcf =_aaee .xNeighbours ();for _ ,_afed :=range _aaee {var _dccc *textPara ;_begb :=false ;for _ ,_aadgg :=range _bdcf [_afed ]{_dgfb :=_aaee [_aadgg ];if _dgfb .Lly >=_afed .Ury {if _dccc ==nil {_dccc =_dgfb ;}else {if _dgfb .Ury < _dccc .Ury {_dccc =_dgfb ;_begb =false ;}else if _dgfb .Ury ==_dccc .Ury {_begb =true ;};};};};if !_begb {_afed ._aafb =_dccc ;};};for _ ,_cfbc :=range _aaee {var _cfaef *textPara ;_bggee :=false ;for _ ,_afge :=range _bdcf [_cfbc ]{_dfbc :=_aaee [_afge ];if _dfbc .Ury <=_cfbc .Lly {if _cfaef ==nil {_cfaef =_dfbc ;}else {if _dfbc .Ury > _cfaef .Ury {_cfaef =_dfbc ;_bggee =false ;}else if _dfbc .Ury ==_cfaef .Ury {_bggee =true ;};};};};if !_bggee {_cfbc ._bfc =_cfaef ;};};};func (_fgba *textTable )computeBbox ()_fef .PdfRectangle {_bbdf :=_fgba .get (0,0).PdfRectangle ;for _gbedd :=1;_gbedd < _fgba ._fbdef ;_gbedd ++{_bbdf =_eefdc (_bbdf ,_fgba .get (_gbedd ,0).PdfRectangle );};for _facce :=1;_facce < _fgba ._ccebee ;_facce ++{for _cdbb :=0;_cdbb < _fgba ._fbdef ;_cdbb ++{_bbdf =_eefdc (_bbdf ,_fgba .get (_cdbb ,_facce ).PdfRectangle );};};return _bbdf ;};
|
||
|
||
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
||
// It takes into account character encodings in the PDF file, which are decoded by
|
||
// CharcodeBytesToUnicode.
|
||
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
||
func (_aece *Extractor )ExtractText ()(string ,error ){_de ,_ ,_ ,_bbge :=_aece .ExtractTextWithStats ();return _de ,_bbge ;};
|
||
|
||
// String returns a description of `tm`.
|
||
func (_bgae *textMark )String ()string {return _dbd .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_bgae .PdfRectangle ,_bgae ._fff ,_bgae ._gfeba );};func (_egdd *textWord )computeText ()string {_fddg :=make ([]string ,len (_egdd ._gbbd ));for _fdcb ,_cccg :=range _egdd ._gbbd {_fddg [_fdcb ]=_cccg ._gfeba ;};return _da .Join (_fddg ,"");};func (_becd *wordBag )sort (){for _ ,_gbc :=range _becd ._edcf {_db .Slice (_gbc ,func (_edba ,_acef int )bool {return _cegg (_gbc [_edba ],_gbc [_acef ])< 0});};};func _gfgd (_efef _fef .PdfRectangle )textState {return textState {_fggg :100,_dbe :RenderModeFill ,_ada :_efef };};func (_egabf *wordBag )arrangeText ()*textPara {_egabf .sort ();if _fbcc {_egabf .removeDuplicates ();};var _ccdg []*textLine ;for _ ,_bef :=range _egabf .depthIndexes (){for !_egabf .empty (_bef ){_gfbc :=_egabf .firstReadingIndex (_bef );_dgbbe :=_egabf .firstWord (_gfbc );_eff :=_daefe (_egabf ,_gfbc );_ddca :=_dgbbe ._afcg ;_gagcf :=_dgbbe ._fedfa -_aabf *_ddca ;_aecd :=_dgbbe ._fedfa +_aabf *_ddca ;_dba :=_cbba *_ddca ;_aaga :=_dbee *_ddca ;_adgd :for {var _fcfba *textWord ;_efgf :=0;for _ ,_dbed :=range _egabf .depthBand (_gagcf ,_aecd ){_dffec :=_egabf .highestWord (_dbed ,_gagcf ,_aecd );if _dffec ==nil {continue ;};_bgda :=_fggd (_dffec ,_eff ._aecg [len (_eff ._aecg )-1]);if _bgda < -_aaga {break _adgd ;};if _bgda > _dba {continue ;};if _fcfba !=nil &&_cegg (_dffec ,_fcfba )>=0{continue ;};_fcfba =_dffec ;_efgf =_dbed ;};if _fcfba ==nil {break ;};_eff .pullWord (_egabf ,_fcfba ,_efgf );};_eff .markWordBoundaries ();_ccdg =append (_ccdg ,_eff );};};if len (_ccdg )==0{return nil ;};_db .Slice (_ccdg ,func (_cadgb ,_faa int )bool {return _cgg (_ccdg [_cadgb ],_ccdg [_faa ])< 0});_fbcg :=_aag (_egabf .PdfRectangle ,_ccdg );if _cgege {_fbc .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_fbcg .String ());if _ecbb {for _bcbdc ,_beac :=range _fbcg ._gfgb {_dbd .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bcbdc ,_beac .String ());if _fefcg {for _faaf ,_bdbg :=range _beac ._aecg {_dbd .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_faaf ,_bdbg .String ());for _ddbb ,_ccdb :=range _bdbg ._gbbd {_dbd .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_ddbb ,_ccdb .String ());};};};};};};return _fbcg ;};var (_be =_f .New ("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072"););func (_fed *imageExtractContext )extractXObjectImage (_fgg *_bb .PdfObjectName ,_adf _bg .GraphicsState ,_cad *_fef .PdfPageResources )error {_gc ,_ :=_cad .GetXObjectByName (*_fgg );if _gc ==nil {return nil ;};_dfbb ,_dfg :=_fed ._bga [_gc ];if !_dfg {_fga ,_fcd :=_cad .GetXObjectImageByName (*_fgg );if _fcd !=nil {return _fcd ;};if _fga ==nil {return nil ;};_cdf ,_fcd :=_fga .ToImage ();if _fcd !=nil {return _fcd ;};_dfbb =&cachedImage {_af :_cdf ,_fdf :_fga .ColorSpace };_fed ._bga [_gc ]=_dfbb ;};_fde :=_dfbb ._af ;_aec :=_dfbb ._fdf ;_dg ,_cadf :=_aec .ImageToRGB (*_fde );if _cadf !=nil {return _cadf ;};_fbc .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_adf .CTM .String ());_aea :=ImageMark {Image :&_dg ,Width :_adf .CTM .ScalingFactorX (),Height :_adf .CTM .ScalingFactorY (),Angle :_adf .CTM .Angle ()};_aea .X ,_aea .Y =_adf .CTM .Translation ();_fed ._ae =append (_fed ._ae ,_aea );_fed ._ec ++;return nil ;};func (_badg *textTable )getRight ()paraList {_bgffc :=make (paraList ,_badg ._ccebee );for _cdgag :=0;_cdgag < _badg ._ccebee ;_cdgag ++{_egeeg :=_badg .get (_badg ._fbdef -1,_cdgag )._egcca ;if _egeeg ==nil ||_egeeg ._gcbc {return nil ;};_bgffc [_cdgag ]=_egeeg ;};for _edd :=0;_edd < _badg ._ccebee -1;_edd ++{if _bgffc [_edd ]._bfc !=_bgffc [_edd +1]{return nil ;};};return _bgffc ;};func (_ebc paraList )llyRange (_eca []int ,_fdfcg ,_bcba float64 )[]int {_beec :=len (_ebc );if _bcba < _ebc [_eca [0]].Lly ||_fdfcg > _ebc [_eca [_beec -1]].Lly {return nil ;};_bfgg :=_db .Search (_beec ,func (_aeceg int )bool {return _ebc [_eca [_aeceg ]].Lly >=_fdfcg });_aeegf :=_db .Search (_beec ,func (_accd int )bool {return _ebc [_eca [_accd ]].Lly > _bcba });return _eca [_bfgg :_aeegf ];};func (_fgbb *textObject )checkOp (_fbd *_bg .ContentStreamOperation ,_bdf int ,_ebd bool )(_ffce bool ,_gcb error ){if _fgbb ==nil {var _gea []_bb .PdfObject ;if _bdf > 0{_gea =_fbd .Params ;if len (_gea )> _bdf {_gea =_gea [:_bdf ];};};_fbc .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_fbd .Operand ,_gea );};if _bdf >=0{if len (_fbd .Params )!=_bdf {if _ebd {_gcb =_f .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_fbd .Operand ,_bdf ,len (_fbd .Params ),_fbd .Params );return false ,_gcb ;};};return true ,nil ;};type textLine struct{_fef .PdfRectangle ;_bcf float64 ;_aecg []*textWord ;_aaa float64 ;};func (_dcdf *wordBag )scanBand (_bca string ,_eedc *wordBag ,_egc func (_efg *wordBag ,_aca *textWord )bool ,_dcab ,_gffad ,_bcb float64 ,_cgac ,_bce bool )int {_fdac :=_eedc ._ceaf ;_gaa :=_aabf *_fdac ;_fgef :=0;_gga ,_cabg :=_dcab ,_gffad ;var _gefbg []*textWord ;for _ ,_bdcc :=range _dcdf .depthBand (_dcab -_gaa ,_gffad +_gaa ){for _ ,_debc :=range _dcdf ._edcf [_bdcc ]{if !(_dcab -_gaa <=_debc ._fedfa &&_debc ._fedfa <=_gffad +_gaa ){continue ;};if !_egc (_eedc ,_debc ){continue ;};_acbd :=_fb .Abs (_debc ._afcg -_fdac )/_fdac ;_cdb :=_debc ._afcg /_fdac ;_abdg :=_fb .Min (_acbd ,_cdb );if _bcb > 0{if _abdg > _bcb {continue ;};};if !_cgac {_eedc .pullWord (_dcdf ,_debc ,_bdcc );};_gefbg =append (_gefbg ,_debc );_fgef ++;if !_bce {if _debc ._fedfa < _dcab {_dcab =_debc ._fedfa ;};if _debc ._fedfa > _gffad {_gffad =_debc ._fedfa ;};};if _cgac {break ;};};};if _dfge {if len (_bca )> 0{_fbc .Log .Info ("\u0073\u0063\u0061\u006e\u0042\u0061\u006e\u0064\u003a\u0020\u0025\u0073\u0020\u005b\u0025\u002e\u0032f\u0020\u0025\u002e\u0032\u0066\u005d\u002d\u003e\u005b\u0025.\u0032\u0066\u0020\u0025\u002e\u0032\u0066\u005d\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u002e\u0032\u0066 \u0066\u006f\u006e\u0074\u0073\u0069z\u0065\u003d%\u002e\u0032f\u0020%\u0071",_bca ,_gga ,_cabg ,_dcab ,_gffad ,_eedc .PdfRectangle ,_eedc ._ceaf ,_degd (_eedc .text (),20));for _fdab ,_egbd :=range _gefbg {_dbd .Printf ("\u0020\u0020\u0025\u0071",_egbd ._caag );if _fdab >=5{break ;};};if len (_gefbg )> 0{_dbd .Println ();};};};return _fgef ;};func (_aaed paraList )reorder (_dfeef []int ){_decb :=make (paraList ,len (_aaed ));for _fgbbb ,_cgeag :=range _dfeef {_decb [_fgbbb ]=_aaed [_cgeag ];};copy (_aaed ,_decb );};func (_fdde *textObject )setTextLeading (_dec float64 ){if _fdde ==nil ||_fdde ._ffe ==nil {return ;};_fdde ._ffe ._decc =_dec ;};func _aage (_ccea []_bb .PdfObject )(_abcf ,_daegf float64 ,_eeag error ){if len (_ccea )!=2{return 0,0,_dbd .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_ccea ));};_fdaa ,_eeag :=_bb .GetNumbersAsFloat (_ccea );if _eeag !=nil {return 0,0,_eeag ;};return _fdaa [0],_fdaa [1],nil ;};func (_bd *imageExtractContext )extractContentStreamImages (_cb string ,_ga *_fef .PdfPageResources )error {_cg :=_bg .NewContentStreamParser (_cb );_bbgf ,_caa :=_cg .Parse ();if _caa !=nil {return _caa ;};if _bd ._bga ==nil {_bd ._bga =map[*_bb .PdfObjectStream ]*cachedImage {};};if _bd ._ab ==nil {_bd ._ab =&ImageExtractOptions {};};_gd :=_bg .NewContentStreamProcessor (*_bbgf );_gd .AddHandler (_bg .HandlerConditionEnumAllOperands ,"",func (_dab *_bg .ContentStreamOperation ,_gdf _bg .GraphicsState ,_cae *_fef .PdfPageResources )error {return _bd .processOperand (_dab ,_gdf ,_cae );});return _gd .Process (_ga );};func (_edbfc *textPara )text ()string {_eadd :=new (_b .Buffer );_edbfc .writeText (_eadd );return _eadd .String ();};func (_cfed *textTable )markCells (){for _ccda :=0;_ccda < _cfed ._ccebee ;_ccda ++{for _dgf :=0;_dgf < _cfed ._fbdef ;_dgf ++{_acgc :=_cfed .get (_dgf ,_ccda );_acgc ._gcbc =true ;};};};func (_cge *Extractor )extractPageText (_dbdb string ,_deff *_fef .PdfPageResources ,_bge _dbf .Matrix ,_ece int )(*PageText ,int ,int ,error ){_fbc .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_ece );_bee :=&PageText {_cceb :_cge ._dd };_acc :=_gfgd (_cge ._dd );var _gae stateStack ;_fcb :=_efb (_cge ,_deff ,_bg .GraphicsState {},&_acc ,&_gae );var _eae bool ;if _ece > _ddg {_gda :=_f .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_ece ,_gda );return _bee ,_acc ._gba ,_acc ._dga ,_gda ;};_gcc :=_bg .NewContentStreamParser (_dbdb );_gf ,_bdd :=_gcc .Parse ();if _bdd !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bdd );return _bee ,_acc ._gba ,_acc ._dga ,_bdd ;};_bdda :=_bg .NewContentStreamProcessor (*_gf );_bdda .AddHandler (_bg .HandlerConditionEnumAllOperands ,"",func (_ef *_bg .ContentStreamOperation ,_cfg _bg .GraphicsState ,_cgd *_fef .PdfPageResources )error {_dac :=_ef .Operand ;if _dea {_fbc .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_ef );};switch _dac {case "\u0071":_gae .push (&_acc );case "\u0051":if !_gae .empty (){_acc =*_gae .top ();if len (_gae )>=2{_gae .pop ();};};case "\u0042\u0054":if _eae {_fbc .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");_bee ._fdb =append (_bee ._fdb ,_fcb ._aga ...);};_eae =true ;_bgb :=_cfg ;_bgb .CTM =_bge .Mult (_bgb .CTM );_fcb =_efb (_cge ,_cgd ,_bgb ,&_acc ,&_gae );case "\u0045\u0054":if !_eae {_fbc .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");};_eae =false ;_bee ._fdb =append (_bee ._fdb ,_fcb ._aga ...);_fcb .reset ();case "\u0054\u002a":_fcb .nextLine ();case "\u0054\u0064":if _dfe ,_daf :=_fcb .checkOp (_ef ,2,true );!_dfe {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_daf );return _daf ;};_eaf ,_bgg ,_afb :=_aage (_ef .Params );if _afb !=nil {return _afb ;};_fcb .moveText (_eaf ,_bgg );case "\u0054\u0044":if _cbb ,_eed :=_fcb .checkOp (_ef ,2,true );!_cbb {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eed );return _eed ;};_adg ,_adga ,_eaa :=_aage (_ef .Params );if _eaa !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eaa );return _eaa ;};_fcb .moveTextSetLeading (_adg ,_adga );case "\u0054\u006a":if _ccb ,_gde :=_fcb .checkOp (_ef ,1,true );!_ccb {_fbc .Log .Debug ("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076",_ef ,_gde );return _gde ;};_agd ,_cfb :=_bb .GetStringBytes (_ef .Params [0]);if !_cfb {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a\u0020T\u006a\u0020o\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074S\u0074\u0072\u0069\u006e\u0067\u0042\u0079\u0074\u0065\u0073\u0020\u0066a\u0069\u006c\u0065\u0064",_ef );return _bb .ErrTypeError ;};return _fcb .showText (_agd );case "\u0054\u004a":if _bcgd ,_dff :=_fcb .checkOp (_ef ,1,true );!_bcgd {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dff );return _dff ;};_cabe ,_aa :=_bb .GetArray (_ef .Params [0]);if !_aa {_fbc .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0054\u004a\u0020\u006f\u0070\u003d\u0025s\u0020G\u0065t\u0041r\u0072\u0061\u0079\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ef );return _bdd ;};return _fcb .showTextAdjusted (_cabe );case "\u0027":if _dgc ,_bf :=_fcb .checkOp (_ef ,1,true );!_dgc {_fbc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0027\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bf );return _bf ;};_abgf ,_fba :=_bb .GetStringBytes (_ef .Params [0]);if !_fba {_fbc .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020'\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ef );return _bb .ErrTypeError ;};_fcb .nextLine ();return _fcb .showText (_abgf );case "\u0022":if _cfe ,_fa :=_fcb .checkOp (_ef ,3,true );!_cfe {_fbc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0022\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fa );return _fa ;};_efa ,_bgc ,_dfa :=_aage (_ef .Params [:2]);if _dfa !=nil {return _dfa ;};_eb ,_ege :=_bb .GetStringBytes (_ef .Params [2]);if !_ege {_fbc .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020\"\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ef );return _bb .ErrTypeError ;};_fcb .setCharSpacing (_efa );_fcb .setWordSpacing (_bgc );_fcb .nextLine ();return _fcb .showText (_eb );case "\u0054\u004c":_abde ,_aeg :=_abc (_ef );if _aeg !=nil {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004c\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_aeg );return _aeg ;};_fcb .setTextLeading (_abde );case "\u0054\u0063":_ded ,_fgb :=_abc (_ef );if _fgb !=nil {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0063\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fgb );return _fgb ;};_fcb .setCharSpacing (_ded );case "\u0054\u0066":if _gdab ,_dcg :=_fcb .checkOp (_ef ,2,true );!_gdab {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0066\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dcg );return _dcg ;};_fgc ,_gdc :=_bb .GetNameVal (_ef .Params [0]);if !_gdc {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u004ea\u006d\u0065\u0056\u0061\u006c\u0020\u0066a\u0069\u006c\u0065\u0064",_ef );return _bb .ErrTypeError ;};_eab ,_fbg :=_bb .GetNumberAsFloat (_ef .Params [1]);if !_gdc {_fbc .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u0046\u006c\u006f\u0061\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065d\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ef ,_fbg );return _fbg ;};_fbg =_fcb .setFont (_fgc ,_eab );_fcb ._bdb =_e .Is (_fbg ,_bb .ErrNotSupported );if _fbg !=nil &&!_fcb ._bdb {return _fbg ;};case "\u0054\u006d":if _gfd ,_deg :=_fcb .checkOp (_ef ,6,true );!_gfd {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u006d\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_deg );return _deg ;};_aff ,_agg :=_bb .GetNumbersAsFloat (_ef .Params );if _agg !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_agg );return _agg ;};_fcb .setTextMatrix (_aff );case "\u0054\u0072":if _ddd ,_efc :=_fcb .checkOp (_ef ,1,true );!_ddd {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0072\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_efc );return _efc ;};_feb ,_efd :=_bb .GetIntVal (_ef .Params [0]);if !_efd {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0072\u0020\u006f\u0070\u003d\u0025\u0073 \u0047e\u0074\u0049\u006e\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ef );return _bb .ErrTypeError ;};_fcb .setTextRenderMode (_feb );case "\u0054\u0073":if _fcc ,_eac :=_fcb .checkOp (_ef ,1,true );!_fcc {_fbc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0073\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eac );return _eac ;};_fda ,_dgg :=_bb .GetNumberAsFloat (_ef .Params [0]);if _dgg !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dgg );return _dgg ;};_fcb .setTextRise (_fda );case "\u0054\u0077":if _dfd ,_dad :=_fcb .checkOp (_ef ,1,true );!_dfd {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dad );return _dad ;};_age ,_accg :=_bb .GetNumberAsFloat (_ef .Params [0]);if _accg !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_accg );return _accg ;};_fcb .setWordSpacing (_age );case "\u0054\u007a":if _acf ,_acfg :=_fcb .checkOp (_ef ,1,true );!_acf {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_acfg );return _acfg ;};_ffb ,_bgad :=_bb .GetNumberAsFloat (_ef .Params [0]);if _bgad !=nil {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bgad );return _bgad ;};_fcb .setHorizScaling (_ffb );case "\u0044\u006f":if len (_ef .Params )==0{_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0078\u0070\u0065\u0063\u0074\u0065\u0064\u0020\u0058\u004fbj\u0065c\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006f\u0070\u0065\u0072\u0061n\u0064\u0020\u0066\u006f\u0072\u0020\u0044\u006f\u0020\u006f\u0070\u0065\u0072\u0061\u0074\u006f\u0072.\u0020\u0047\u006f\u0074\u0020\u0025\u002b\u0076\u002e",_ef .Params );return _bb .ErrRangeError ;};_cag ,_bbd :=_bb .GetName (_ef .Params [0]);if !_bbd {_fbc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u0044\u006f\u0020\u006f\u0070e\u0072a\u0074\u006f\u0072\u0020\u0058\u004f\u0062\u006a\u0065\u0063\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006fp\u0065\u0072\u0061\u006e\u0064\u003a\u0020\u0025\u002b\u0076\u002e",_ef .Params [0]);return _bb .ErrTypeError ;};_ ,_cce :=_cgd .GetXObjectByName (*_cag );if _cce !=_fef .XObjectTypeForm {break ;};_agf ,_bbd :=_cge ._bbb [_cag .String ()];if !_bbd {_fgbg ,_faf :=_cgd .GetXObjectFormByName (*_cag );if _faf !=nil {_fbc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_faf );return _faf ;};_eag ,_faf :=_fgbg .GetContentStream ();if _faf !=nil {_fbc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_faf );return _faf ;};_dbb :=_fgbg .Resources ;if _dbb ==nil {_dbb =_cgd ;};_fge ,_ggg ,_edb ,_faf :=_cge .extractPageText (string (_eag ),_dbb ,_bge .Mult (_cfg .CTM ),_ece +1);if _faf !=nil {_fbc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_faf );return _faf ;};_agf =textResult {*_fge ,_ggg ,_edb };_cge ._bbb [_cag .String ()]=_agf ;};_bee ._fdb =append (_bee ._fdb ,_agf ._bgga ._fdb ...);_acc ._gba +=_agf ._eef ;_acc ._dga +=_agf ._bagg ;case "\u0072\u0067","\u0067","\u006b","\u0063\u0073","\u0073\u0063","\u0073\u0063\u006e":_fcb ._cfge .ColorspaceNonStroking =_cfg .ColorspaceNonStroking ;_fcb ._cfge .ColorNonStroking =_cfg .ColorNonStroking ;case "\u0052\u0047","\u0047","\u004b","\u0043\u0053","\u0053\u0043","\u0053\u0043\u004e":_fcb ._cfge .ColorspaceStroking =_cfg .ColorspaceStroking ;_fcb ._cfge .ColorStroking =_cfg .ColorStroking ;};return nil ;});_bdd =_bdda .Process (_deff );if _bdd !=nil {_fbc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a \u0050\u0072\u006f\u0063\u0065s\u0073i\u006eg\u003a\u0020\u0065\u0072\u0072\u003d\u0025v",_bdd );};return _bee ,_acc ._gba ,_acc ._dga ,_bdd ;};type textMark struct{_fef .PdfRectangle ;_bbbg int ;_gfeba string ;_fad string ;_eceg *_fef .PdfFont ;_fff float64 ;_dcec float64 ;_dgdc _dbf .Matrix ;_aaab _dbf .Point ;_gead _fef .PdfRectangle ;_ffeeg _fe .Color ;_dgabg _fe .Color ;};func _efb (_bgaf *Extractor ,_eceb *_fef .PdfPageResources ,_agac _bg .GraphicsState ,_gbd *textState ,_cee *stateStack )*textObject {return &textObject {_aed :_bgaf ,_afac :_eceb ,_cfge :_agac ,_becb :_cee ,_ffe :_gbd ,_acg :_dbf .IdentityMatrix (),_bgf :_dbf .IdentityMatrix ()};};const (RenderModeStroke RenderMode =1<<iota ;RenderModeFill ;RenderModeClip ;);func _gccb (_ceecb []TextMark ,_abcc *int ,_edbe string )[]TextMark {_adge :=_ceac ;_adge .Text =_edbe ;return _edcb (_ceecb ,_abcc ,_adge );};func _facg (_efda string )(string ,bool ){_dddc :=[]rune (_efda );if len (_dddc )!=1{return "",false ;};_cfcbd ,_fegc :=_adeb [_dddc [0]];return _cfcbd ,_fegc ;};func (_cdde *textTable )put (_baef ,_efgb int ,_ebeb *textPara ){_cdde ._cgca [_ceeg (_baef ,_efgb )]=_ebeb ;};func _ebb (_eefd []*textWord ,_bcgc float64 )*wordBag {_dcb :=_ffcb (_eefd [0],_bcgc );for _ ,_cefc :=range _eefd [1:]{_aeegd :=_bbaa (_cefc ._fedfa );_dcb ._edcf [_aeegd ]=append (_dcb ._edcf [_aeegd ],_cefc );};_dcb .sort ();return _dcb ;};func (_edg *wordBag )getDepthIdx (_daff float64 )int {_dfae :=_edg .depthIndexes ();_abdgc :=_bbaa (_daff );if _abdgc < _dfae [0]{return _dfae [0];};if _abdgc > _dfae [len (_dfae )-1]{return _dfae [len (_dfae )-1];};return _abdgc ;};func _cegg (_fcfb ,_fgdb bounded )float64 {return _fcfb .bbox ().Llx -_fgdb .bbox ().Llx };func _eecd (_gffc *wordBag ,_gage float64 )[]*wordBag {var _dgcb []*wordBag ;for _ ,_aafa :=range _gffc .depthIndexes (){_fbca :=false ;for !_gffc .empty (_aafa ){_eeeb :=_gffc .firstReadingIndex (_aafa );_dbg :=_gffc .firstWord (_eeeb );_bbfc :=_ffcb (_dbg ,_gage );_gffc .removeWord (_dbg ,_eeeb );if _gbfb {_fbc .Log .Info ("w\u006f\u0072\u0064\u0073\u005b\u0030\u005d\u003d\u0025\u0073",_dbg .String ());};_dfed :=_cbfb *_bbfc ._ceaf ;_facca :=_gffd *_bbfc ._ceaf ;_fbf :=_dfc *_bbfc ._ceaf ;for _cgb :=true ;_cgb ;_cgb =_fbca {_fbca =false ;if _gbfb {_fbc .Log .Info ("\u0070\u0061\u0072\u0061\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068\u0020\u0025\u002e2\u0066\u0020\u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065p\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020",_bbfc .minDepth (),_bbfc .maxDepth (),_fbf );};if _gffc .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_bbfc ,_bgeba (_ccbc ,0),_bbfc .minDepth ()-_fbf ,_bbfc .maxDepth ()+_fbf ,_ebbf ,false ,false )> 0{_fbca =true ;};if _gffc .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_bbfc ,_bgeba (_ccbc ,_facca ),_bbfc .minDepth (),_bbfc .maxDepth (),_fbde ,false ,false )> 0{_fbca =true ;};if _fbca {continue ;};_cdebe :=_gffc .scanBand ("",_bbfc ,_bgeba (_cggd ,_dfed ),_bbfc .minDepth (),_bbfc .maxDepth (),_aeea ,true ,false );if _cdebe > 0{_gcgb :=(_bbfc .maxDepth ()-_bbfc .minDepth ())/_bbfc ._ceaf ;if (_cdebe > 1&&float64 (_cdebe )> 0.3*_gcgb )||_cdebe <=10{if _gffc .scanBand ("\u006f\u0074\u0068e\u0072",_bbfc ,_bgeba (_cggd ,_dfed ),_bbfc .minDepth (),_bbfc .maxDepth (),_aeea ,false ,true )> 0{_fbca =true ;};};};};_dgcb =append (_dgcb ,_bbfc );};};return _dgcb ;};
|
||
|
||
// PageText represents the layout of text on a device page.
|
||
type PageText struct{_fdb []*textMark ;_ecea string ;_fdbc []TextMark ;_bggd []TextTable ;_cceb _fef .PdfRectangle ;};var _ceac =TextMark {Text :"\u005b\u0058\u005d",Original :"\u0020",Meta :true ,FillColor :_fe .White ,StrokeColor :_fe .White };func (_eegb paraList )extractTables ()paraList {if _efcb {_fbc .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_eegb ));};if len (_eegb )< _gagg {return _eegb ;};_dfbf :=_eegb .findTables ();if _efcb {_fbc .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_dfbf ));for _gfga ,_bced :=range _dfbf {_bced .log (_dbd .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_gfga ));};};return _eegb .applyTables (_dfbf );};const _gcdb =1.0/1000.0;
|
||
|
||
// New returns an Extractor instance for extracting content from the input PDF page.
|
||
func New (page *_fef .PdfPage )(*Extractor ,error ){_ca ,_bbe :=page .GetAllContentStreams ();if _bbe !=nil {return nil ,_bbe ;};_bbg ,_bbe :=page .GetMediaBox ();if _bbe !=nil {return nil ,_dbd .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_bbe );};_adc :=&Extractor {_g :_ca ,_ffd :page .Resources ,_dd :*_bbg ,_a :map[string ]fontEntry {},_bbb :map[string ]textResult {}};return _adc ,nil ;};func (_bgcf paraList )readBefore (_bcbd []int ,_ead ,_gecb int )bool {_bcc ,_fccb :=_bgcf [_ead ],_bgcf [_gecb ];if _ebdg (_bcc ,_fccb )&&_bcc .Lly > _fccb .Lly {return true ;};if !(_bcc ._bdgeb .Urx < _fccb ._bdgeb .Llx ){return false ;};_gdd ,_abff :=_bcc .Lly ,_fccb .Lly ;if _gdd > _abff {_abff ,_gdd =_gdd ,_abff ;};_dgdf :=_fb .Max (_bcc ._bdgeb .Llx ,_fccb ._bdgeb .Llx );_gad :=_fb .Min (_bcc ._bdgeb .Urx ,_fccb ._bdgeb .Urx );_cgee :=_bgcf .llyRange (_bcbd ,_gdd ,_abff );for _ ,_dedc :=range _cgee {if _dedc ==_ead ||_dedc ==_gecb {continue ;};_bccc :=_bgcf [_dedc ];if _bccc ._bdgeb .Llx <=_gad &&_dgdf <=_bccc ._bdgeb .Urx {return false ;};};return true ;};func _eefdc (_efbd ,_geb _fef .PdfRectangle )_fef .PdfRectangle {return _fef .PdfRectangle {Llx :_fb .Min (_efbd .Llx ,_geb .Llx ),Lly :_fb .Min (_efbd .Lly ,_geb .Lly ),Urx :_fb .Max (_efbd .Urx ,_geb .Urx ),Ury :_fb .Max (_efbd .Ury ,_geb .Ury )};};
|
||
|
||
// Extractor stores and offers functionality for extracting content from PDF pages.
|
||
type Extractor struct{_g string ;_ffd *_fef .PdfPageResources ;_dd _fef .PdfRectangle ;_a map[string ]fontEntry ;_bbb map[string ]textResult ;_fc int64 ;_ad int ;};func (_fafa *wordBag )depthBand (_eeg ,_gacg float64 )[]int {if len (_fafa ._edcf )==0{return nil ;};return _fafa .depthRange (_fafa .getDepthIdx (_eeg ),_fafa .getDepthIdx (_gacg ));};
|
||
|
||
// ToText returns the page text as a single string.
|
||
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
||
// Text() instead.
|
||
func (_gffa PageText )ToText ()string {return _gffa .Text ()};func _dcecb (_eaeea string )string {_fab :=[]rune (_eaeea );return string (_fab [:len (_fab )-1])};func (_ebea *textPara )fontsize ()float64 {return _ebea ._gfgb [0]._aaa };func (_dfcc *textPara )writeText (_fce _c .Writer ){if _dfcc ._fcge ==nil {_dfcc .writeCellText (_fce );return ;};for _eacb :=0;_eacb < _dfcc ._fcge ._ccebee ;_eacb ++{for _gbaa :=0;_gbaa < _dfcc ._fcge ._fbdef ;_gbaa ++{_egag :=_dfcc ._fcge .get (_gbaa ,_eacb );if _egag ==nil {_fce .Write ([]byte ("\u0009"));}else {_egag .writeCellText (_fce );};_fce .Write ([]byte ("\u0020"));};if _eacb < _dfcc ._fcge ._ccebee -1{_fce .Write ([]byte ("\u000a"));};};};func (_eeeg *textWord )toTextMarks (_adcb *int )[]TextMark {var _gggd []TextMark ;for _ ,_fgfc :=range _eeeg ._gbbd {_gggd =_edcb (_gggd ,_adcb ,_fgfc .ToTextMark ());};return _gggd ;};type textTable struct{_fef .PdfRectangle ;_fbdef ,_ccebee int ;_cgca map[uint64 ]*textPara ;};func (_cgbe *textWord )addDiacritic (_bcea string ){_ffcf :=_cgbe ._gbbd [len (_cgbe ._gbbd )-1];_ffcf ._gfeba =_ffcf ._gfeba +_bcea ;_ffcf ._gfeba =_ff .NFKC .String (_ffcf ._gfeba );};type textPara struct{_fef .PdfRectangle ;_bdgeb _fef .PdfRectangle ;_gfgb []*textLine ;_fcge *textTable ;_gcbc bool ;_cfc *textPara ;_egcca *textPara ;_aafb *textPara ;_bfc *textPara ;};func (_eec *textObject )setCharSpacing (_cbf float64 ){if _eec ==nil {return ;};_eec ._ffe ._ccf =_cbf ;if _dea {_fbc .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_cbf ,_eec ._ffe .String ());};};
|
||
|
||
// String returns a string describing `pt`.
|
||
func (_egbe PageText )String ()string {_beca :=_dbd .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_egbe ._fdb ));_fac :=[]string {"\u002d"+_beca };for _ ,_dabc :=range _egbe ._fdb {_fac =append (_fac ,_dabc .String ());};_fac =append (_fac ,"\u002b"+_beca );return _da .Join (_fac ,"\u000a");};func _bbaa (_cgdc float64 )int {var _cabef int ;if _cgdc >=0{_cabef =int (_cgdc /_fddeg );}else {_cabef =int (_cgdc /_fddeg )-1;};return _cabef ;};
|
||
|
||
// String returns a description of `w`.
|
||
func (_dacfg *textWord )String ()string {return _dbd .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_dacfg ._fedfa ,_dacfg .PdfRectangle ,_dacfg ._afcg ,_dacfg ._caag );};func _daefe (_acbf *wordBag ,_abea int )*textLine {_abf :=_acbf .firstWord (_abea );_egcdb :=textLine {PdfRectangle :_abf .PdfRectangle ,_aaa :_abf ._afcg ,_bcf :_abf ._fedfa };_egcdb .pullWord (_acbf ,_abf ,_abea );return &_egcdb ;};
|
||
|
||
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
||
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
||
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
||
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
||
type RenderMode int ;type textWord struct{_fef .PdfRectangle ;_fedfa float64 ;_caag string ;_gbbd []*textMark ;_afcg float64 ;_acac bool ;};func (_afbe *textObject )showText (_efe []byte )error {return _afbe .renderText (_efe )};func (_deffa *textLine )text ()string {var _efae []string ;for _ ,_egbda :=range _deffa ._aecg {if _egbda ._acac {_efae =append (_efae ,"\u0020");};_efae =append (_efae ,_egbda ._caag );};return _da .Join (_efae ,"");};type cachedImage struct{_af *_fef .Image ;_fdf _fef .PdfColorspace ;};type event struct{_cace float64 ;_faea bool ;_gggb int ;};func _edeg (_bdbd ,_daaf int )int {if _bdbd < _daaf {return _bdbd ;};return _daaf ;};
|
||
|
||
// Len returns the number of TextMarks in `ma`.
|
||
func (_ceec *TextMarkArray )Len ()int {if _ceec ==nil {return 0;};return len (_ceec ._deffd );};func (_gab *wordBag )allWords ()[]*textWord {var _faccc []*textWord ;for _ ,_bgeb :=range _gab ._edcf {_faccc =append (_faccc ,_bgeb ...);};return _faccc ;};func (_gfb *textObject )getFontDirect (_efac string )(*_fef .PdfFont ,error ){_bggb ,_dgbb :=_gfb .getFontDict (_efac );if _dgbb !=nil {return nil ,_dgbb ;};_dffa ,_dgbb :=_fef .NewPdfFontFromPdfObject (_bggb );if _dgbb !=nil {_fbc .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_efac ,_dgbb );};return _dffa ,_dgbb ;};func _bcgf (_dee float64 )bool {return _fb .Abs (_dee )< TOL };func (_dbef *wordBag )pullWord (_aadb *wordBag ,_bgef *textWord ,_agc int ){_dbef .PdfRectangle =_eefdc (_dbef .PdfRectangle ,_bgef .PdfRectangle );if _bgef ._afcg > _dbef ._ceaf {_dbef ._ceaf =_bgef ._afcg ;};_dbef ._edcf [_agc ]=append (_dbef ._edcf [_agc ],_bgef );_aadb .removeWord (_bgef ,_agc );};func (_aeae *textObject )renderText (_aggb []byte )error {if _aeae ._bdb {_fbc .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");return nil ;};_egg :=_aeae .getCurrentFont ();_bbcg :=_egg .BytesToCharcodes (_aggb );_fcf ,_bad ,_dcac :=_egg .CharcodesToStrings (_bbcg );if _dcac > 0{_fbc .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_bad ,_dcac );};_aeae ._ffe ._gba +=_bad ;_aeae ._ffe ._dga +=_dcac ;_bde :=_aeae ._ffe ;_ceg :=_bde ._fca ;_dae :=_bde ._fggg /100.0;_cde ,_gdfd :=_egg .GetRuneMetrics (' ');if !_gdfd {_cde ,_gdfd =_egg .GetCharMetrics (32);};if !_gdfd {_cde ,_ =_fef .DefaultFont ().GetRuneMetrics (' ');};_gbf :=_cde .Wx *_gcdb ;_fbc .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_gbf ,_fcf ,_egg ,_ceg );_gca :=_dbf .NewMatrix (_ceg *_dae ,0,0,_ceg ,0,_bde ._afa );if _dea {_fbc .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_bbcg ),_bbcg ,_fcf );};_fbc .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_bbcg ),_bbcg ,len (_fcf ));_dcd :=_aeae .getFillColor ();_eagb :=_aeae .getStrokeColor ();for _bgce ,_geg :=range _fcf {_fdfc :=[]rune (_geg );if len (_fdfc )==1&&_fdfc [0]=='\x00'{continue ;};_egd :=_bbcg [_bgce ];_efdf :=_aeae ._cfge .CTM .Mult (_aeae ._acg ).Mult (_gca );_gff :=0.0;if len (_fdfc )==1&&_fdfc [0]==32{_gff =_bde ._bfb ;};_gfag ,_cbff :=_egg .GetCharMetrics (_egd );if !_cbff {_fbc .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_egd ,_fdfc ,_fdfc ,_egg );return _dbd .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_egg .String (),_egd );};_caba :=_dbf .Point {X :_gfag .Wx *_gcdb ,Y :_gfag .Wy *_gcdb };_dcgc :=_dbf .Point {X :(_caba .X *_ceg +_gff )*_dae };_bddab :=_dbf .Point {X :(_caba .X *_ceg +_bde ._ccf +_gff )*_dae };if _dea {_fbc .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_ceg ,_bde ._ccf ,_bde ._bfb ,_dae );_fbc .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e2\u0066\u0020\u0074\u003d\u0025\u002e\u0032f",_caba ,_dcgc ,_bddab );};_gbed :=_ggd (_dcgc );_dacf :=_ggd (_bddab );_cbd :=_aeae ._cfge .CTM .Mult (_aeae ._acg ).Mult (_gbed );if _dea {_fbc .Log .Info ("e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a"+"\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a"+"\u0009t\u0064\u0030\u003d\u0025\u0073\u000a\u0009\u0020\u2192\u0020\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073",_aeae ._cfge .CTM ,_aeae ._acg ,_dacf ,_bdgb (_aeae ._cfge .CTM .Mult (_aeae ._acg ).Mult (_dacf )),_gbed ,_cbd ,_bdgb (_cbd ));};_dfaf ,_eee :=_aeae .newTextMark (_eg .ExpandLigatures (_fdfc ),_efdf ,_bdgb (_cbd ),_fb .Abs (_gbf *_efdf .ScalingFactorX ()),_egg ,_aeae ._ffe ._ccf ,_dcd ,_eagb );if !_eee {_fbc .Log .Debug ("\u0054\u0065\u0078\u0074\u0020\u006d\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069d\u0065 \u0070\u0061\u0067\u0065\u002e\u0020\u0053\u006b\u0069\u0070\u0070\u0069\u006e\u0067");continue ;};if _egg ==nil {_fbc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u002e");}else if _egg .Encoder ()==nil {_fbc .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020N\u006f\u0020\u0065\u006e\u0063\u006f\u0064\u0069\u006eg\u002e\u0020\u0066o\u006et\u003d\u0025\u0073",_egg );}else {if _daef ,_eecc :=_egg .Encoder ().CharcodeToRune (_egd );_eecc {_dfaf ._fad =string (_daef );};};_fbc .Log .Trace ("i\u003d\u0025\u0064\u0020\u0063\u006fd\u0065\u003d\u0025\u0064\u0020\u006d\u0061\u0072\u006b=\u0025\u0073\u0020t\u0072m\u003d\u0025\u0073",_bgce ,_egd ,_dfaf ,_efdf );_aeae ._aga =append (_aeae ._aga ,&_dfaf );_aeae ._acg .Concat (_dacf );};return nil ;};
|
||
|
||
// TextMark represents extracted text on a page with information regarding both textual content,
|
||
// formatting (font and size) and positioning.
|
||
// It is the smallest unit of text on a PDF page, typically a single character.
|
||
//
|
||
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
||
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
||
// `bbox` of substring `term` in `text`.
|
||
//
|
||
// ex, _ := New(page)
|
||
// // handle errors
|
||
// pageText, _, _, err := ex.ExtractPageText()
|
||
// // handle errors
|
||
// text := pageText.Text()
|
||
// textMarks := pageText.Marks()
|
||
//
|
||
// start := strings.Index(text, term)
|
||
// end := start + len(term)
|
||
// spanMarks, err := textMarks.RangeOffset(start, end)
|
||
// // handle errors
|
||
// bbox, ok := spanMarks.BBox()
|
||
// // handle errors
|
||
type TextMark struct{
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Original is the text in the PDF. It has not been decoded like `Text`.
|
||
Original string ;
|
||
|
||
// BBox is the bounding box of the text.
|
||
BBox _fef .PdfRectangle ;
|
||
|
||
// Font is the font the text was drawn with.
|
||
Font *_fef .PdfFont ;
|
||
|
||
// FontSize is the font size the text was drawn with.
|
||
FontSize float64 ;
|
||
|
||
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
||
// text, textMarks := pageText.Text(), pageText.Marks()
|
||
// marks := textMarks.Elements()
|
||
// then marks[i].Offset is the offset of marks[i].Text in text.
|
||
Offset int ;
|
||
|
||
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
||
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
||
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
||
Meta bool ;
|
||
|
||
// FillColor is the fill color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
FillColor _fe .Color ;
|
||
|
||
// StrokeColor is the stroke color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
StrokeColor _fe .Color ;};func (_dfff *textObject )getFontDict (_addg string )(_bcgdg _bb .PdfObject ,_aacd error ){_debe :=_dfff ._afac ;if _debe ==nil {_fbc .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_addg );return nil ,nil ;};_bcgdg ,_caf :=_debe .GetFontByName (_bb .PdfObjectName (_addg ));if !_caf {_fbc .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_addg );return nil ,_f .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _bcgdg ,nil ;};func (_add *textObject )setHorizScaling (_ffbb float64 ){if _add ==nil {return ;};_add ._ffe ._fggg =_ffbb ;};func (_gcd *textObject )reset (){_gcd ._acg =_dbf .IdentityMatrix ();_gcd ._bgf =_dbf .IdentityMatrix ();_gcd ._aga =nil ;};func (_gce *textWord )bbox ()_fef .PdfRectangle {return _gce .PdfRectangle };func _cggd (_gdeb *wordBag ,_bgbc *textWord ,_eegc float64 )bool {return _gdeb .Urx <=_bgbc .Llx &&_bgbc .Llx < _gdeb .Urx +_eegc ;};type stateStack []*textState ;func (_dbfag *textLine )markWordBoundaries (){_ccff :=_daee *_dbfag ._aaa ;for _cca ,_acdfba :=range _dbfag ._aecg [1:]{if _fggd (_acdfba ,_dbfag ._aecg [_cca ])>=_ccff {_acdfba ._acac =true ;};};};
|
||
|
||
// Append appends `mark` to the mark array.
|
||
func (_cddc *TextMarkArray )Append (mark TextMark ){_cddc ._deffd =append (_cddc ._deffd ,mark )};
|
||
|
||
// ToTextMark returns the public view of `tm`.
|
||
func (_begf *textMark )ToTextMark ()TextMark {return TextMark {Text :_begf ._gfeba ,Original :_begf ._fad ,BBox :_begf ._gead ,Font :_begf ._eceg ,FontSize :_begf ._fff ,FillColor :_begf ._ffeeg ,StrokeColor :_begf ._dgabg };};func _gdgc (_cedd _fef .PdfColorspace ,_adcg _fef .PdfColor )_fe .Color {if _cedd ==nil ||_adcg ==nil {return _fe .Black ;};_cdff ,_bfae :=_cedd .ColorToRGB (_adcg );if _bfae !=nil {_fbc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_adcg ,_cedd ,_bfae );return _fe .Black ;};_dded ,_bbbd :=_cdff .(*_fef .PdfColorDeviceRGB );if !_bbbd {_fbc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_cdff );return _fe .Black ;};return _fe .NRGBA {R :uint8 (_dded .R ()*255),G :uint8 (_dded .G ()*255),B :uint8 (_dded .B ()*255),A :uint8 (255)};};func _aedf (_dfeb ,_agfe _fef .PdfRectangle )bool {return _agfe .Llx <=_dfeb .Urx &&_dfeb .Llx <=_agfe .Urx ;};func (_gfebg *textLine )pullWord (_cfbg *wordBag ,_dfcg *textWord ,_ffge int ){_gfebg .appendWord (_dfcg );_cfbg .removeWord (_dfcg ,_ffge );};func (_agbd paraList )tables ()[]TextTable {var _fgcf []TextTable ;for _ ,_ffea :=range _agbd {if _ffea ._fcge !=nil {_fgcf =append (_fgcf ,_ffea ._fcge .toTextTable ());};};return _fgcf ;};type fontEntry struct{_efeg *_fef .PdfFont ;_dbfe int64 ;};func _ceeg (_ccfc ,_efbc int )uint64 {return uint64 (_ccfc )*0x1000000+uint64 (_efbc )};type paraList []*textPara ;func (_ggfd paraList )toTextMarks ()[]TextMark {_becaa :=0;var _bdge []TextMark ;for _cgag ,_gdced :=range _ggfd {_adbe :=_gdced .toTextMarks (&_becaa );_bdge =append (_bdge ,_adbe ...);if _cgag !=len (_ggfd )-1{if _dddb (_gdced ,_ggfd [_cgag +1]){_bdge =_gccb (_bdge ,&_becaa ,"\u0020");}else {_bdge =_gccb (_bdge ,&_becaa ,"\u000a");_bdge =_gccb (_bdge ,&_becaa ,"\u000a");};};};_bdge =_gccb (_bdge ,&_becaa ,"\u000a");_bdge =_gccb (_bdge ,&_becaa ,"\u000a");return _bdge ;};func _geba (_ecdeb []*textMark ,_eaadg _fef .PdfRectangle )*textWord {_bgadb :=_ecdeb [0].PdfRectangle ;_eceaf :=_ecdeb [0]._fff ;for _ ,_cdgd :=range _ecdeb [1:]{_bgadb =_eefdc (_bgadb ,_cdgd .PdfRectangle );if _cdgd ._fff > _eceaf {_eceaf =_cdgd ._fff ;};};return &textWord {PdfRectangle :_bgadb ,_gbbd :_ecdeb ,_fedfa :_eaadg .Ury -_bgadb .Lly ,_afcg :_eceaf };};func _dddb (_gggc ,_ebe *textPara )bool {return _bcgf (_gggc .depth ()-_ebe .depth ())};func (_gece *wordBag )absorb (_cdfg *wordBag ){for _fefc ,_gefbe :=range _cdfg ._edcf {for _ ,_egccg :=range _gefbe {_gece .pullWord (_cdfg ,_egccg ,_fefc );};};};func (_daeg *wordBag )depthIndexes ()[]int {if len (_daeg ._edcf )==0{return nil ;};_eaag :=make ([]int ,len (_daeg ._edcf ));_gccc :=0;for _dfee :=range _daeg ._edcf {_eaag [_gccc ]=_dfee ;_gccc ++;};_db .Ints (_eaag );return _eaag ;};
|
||
|
||
// String returns a string describing the current state of the textState stack.
|
||
func (_cbbe *stateStack )String ()string {_cgeg :=[]string {_dbd .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_cbbe ))};for _bec ,_ecc :=range *_cbbe {_ccbe :="\u003c\u006e\u0069l\u003e";if _ecc !=nil {_ccbe =_ecc .String ();};_cgeg =append (_cgeg ,_dbd .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_bec ,_ccbe ));};return _da .Join (_cgeg ,"\u000a");};func (_bae *textObject )moveTextSetLeading (_ecf ,_fdc float64 ){_bae ._ffe ._decc =-_fdc ;_bae .moveTo (_ecf ,_fdc );};func (_adgc *wordBag )empty (_gbdg int )bool {_ ,_aebf :=_adgc ._edcf [_gbdg ];return !_aebf };
|
||
|
||
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
||
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
||
// `start` and `end` are offsets in the extracted text.
|
||
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
||
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
||
func (_ddge *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _ddge ==nil {return nil ,_f .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_dbd .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );};_accb :=len (_ddge ._deffd );if _accb ==0{return _ddge ,nil ;};if start < _ddge ._deffd [0].Offset {start =_ddge ._deffd [0].Offset ;};if end > _ddge ._deffd [_accb -1].Offset +1{end =_ddge ._deffd [_accb -1].Offset +1;};_beee :=_db .Search (_accb ,func (_fgbd int )bool {return _ddge ._deffd [_fgbd ].Offset +len (_ddge ._deffd [_fgbd ].Text )-1>=start });if !(0<=_beee &&_beee < _accb ){_fag :=_dbd .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_beee ,_accb ,_ddge ._deffd [0],_ddge ._deffd [_accb -1]);return nil ,_fag ;};_gdca :=_db .Search (_accb ,func (_agb int )bool {return _ddge ._deffd [_agb ].Offset > end -1});if !(0<=_gdca &&_gdca < _accb ){_cgcg :=_dbd .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_gdca ,_accb ,_ddge ._deffd [0],_ddge ._deffd [_accb -1]);return nil ,_cgcg ;};if _gdca <=_beee {return nil ,_dbd .Errorf ("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_beee ,_gdca );};return &TextMarkArray {_deffd :_ddge ._deffd [_beee :_gdca ]},nil ;};func (_bade *wordBag )firstReadingIndex (_bgac int )int {_cafe :=_bade .firstWord (_bgac )._afcg ;_dbfa :=float64 (_bgac +1)*_fddeg ;_gee :=_dbfa +_dgge *_cafe ;_bdccb :=_bgac ;for _ ,_ced :=range _bade .depthBand (_dbfa ,_gee ){if _cegg (_bade .firstWord (_ced ),_bade .firstWord (_bdccb ))< 0{_bdccb =_ced ;};};return _bdccb ;};func _edcfd (_gegg ,_abb _fef .PdfRectangle )bool {return _gegg .Lly <=_abb .Ury &&_abb .Lly <=_gegg .Ury ;};func _dcag (_aeaed ,_cadg float64 )string {_ddbg :=!_bcgf (_aeaed -_cadg );if _ddbg {return "\u000a";};return "\u0020";};func (_ebff *textLine )toTextMarks (_fafd *int )[]TextMark {var _bcgg []TextMark ;for _ ,_efcf :=range _ebff ._aecg {if _efcf ._acac {_bcgg =_gccb (_bcgg ,_fafd ,"\u0020");};_bcff :=_efcf .toTextMarks (_fafd );_bcgg =append (_bcgg ,_bcff ...);};return _bcgg ;};
|
||
|
||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
||
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
||
func (_gdfc *Extractor )ExtractTextWithStats ()(_feg string ,_cef int ,_cbg int ,_def error ){_abd ,_cef ,_cbg ,_def :=_gdfc .ExtractPageText ();if _def !=nil {return "",_cef ,_cbg ,_def ;};return _abd .Text (),_cef ,_cbg ,nil ;}; |