mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
995 lines
216 KiB
Go
995 lines
216 KiB
Go
//
|
||
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
||
//
|
||
// This is a commercial product and requires a license to operate.
|
||
// A trial license can be obtained at https://unidoc.io
|
||
//
|
||
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
||
//
|
||
// Use of this source code is governed by the UniDoc End User License Agreement
|
||
// terms that can be accessed at https://unidoc.io/eula/
|
||
|
||
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
||
// Currently offers functionality for extracting textual content.
|
||
package extractor ;import (_fe "bytes";_d "errors";_ae "fmt";_fc "github.com/unidoc/unipdf/v3/common";_ba "github.com/unidoc/unipdf/v3/contentstream";_bad "github.com/unidoc/unipdf/v3/core";_ff "github.com/unidoc/unipdf/v3/internal/license";_bbg "github.com/unidoc/unipdf/v3/internal/textencoding";
|
||
_g "github.com/unidoc/unipdf/v3/internal/transform";_aec "github.com/unidoc/unipdf/v3/model";_ag "golang.org/x/image/draw";_da "golang.org/x/text/unicode/norm";_ec "image";_eg "image/color";_bc "io";_ea "math";_fb "reflect";_f "regexp";_a "sort";_bb "strings";
|
||
_be "unicode";_e "unicode/utf8";);
|
||
|
||
// TextTable represents a table.
|
||
// Cells are ordered top-to-bottom, left-to-right.
|
||
// Cells[y] is the (0-offset) y'th row in the table.
|
||
// Cells[y][x] is the (0-offset) x'th column in the table.
|
||
type TextTable struct{_aec .PdfRectangle ;W ,H int ;Cells [][]TableCell ;};func (_adde paraList )reorder (_dedb []int ){_cacg :=make (paraList ,len (_adde ));for _gffg ,_feag :=range _dedb {_cacg [_gffg ]=_adde [_feag ];};copy (_adde ,_cacg );};func (_dce *stateStack )push (_cfdc *textState ){_bbc :=*_cfdc ;
|
||
*_dce =append (*_dce ,&_bbc )};const _gbgd =10;func (_fgac rulingList )isActualGrid ()(rulingList ,bool ){_ceae ,_fgcbf :=_fgac .augmentGrid ();if !(len (_ceae )>=_aaaa +1&&len (_fgcbf )>=_acebd +1){if _eceg {_fc .Log .Info ("\u0069s\u0041\u0063t\u0075\u0061\u006c\u0047r\u0069\u0064\u003a \u004e\u006f\u0074\u0020\u0061\u006c\u0069\u0067\u006eed\u002e\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u003c\u0020\u0025d\u0020\u0078 \u0025\u0064",len (_ceae ),len (_fgcbf ),_aaaa +1,_acebd +1);
|
||
};return nil ,false ;};if _eceg {_fc .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075a\u006c\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0073\u0020\u003a\u0020\u0025t\u0020\u0026\u0020\u0025\u0074\u0020\u2192 \u0025\u0074",_fgac ,len (_ceae )>=2,len (_fgcbf )>=2,len (_ceae )>=2&&len (_fgcbf )>=2);
|
||
for _ddgb ,_cbgf :=range _fgac {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a",_ddgb ,_cbgf );};};if _aaeb {_geeg ,_bdfc :=_ceae [0],_ceae [len (_ceae )-1];_gdaab ,_bgdac :=_fgcbf [0],_fgcbf [len (_fgcbf )-1];if !(_ecag (_geeg ._gbgc -_gdaab ._fgad )&&_ecag (_bdfc ._gbgc -_gdaab ._ababc )&&_ecag (_gdaab ._gbgc -_geeg ._ababc )&&_ecag (_bgdac ._gbgc -_geeg ._fgad )){if _eceg {_fc .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069d\u003a\u0020\u0020N\u006f\u0074 \u0061\u006c\u0069\u0067\u006e\u0065d\u002e\n\t\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0076\u0031\u003d\u0025\u0073\u000a\u0009\u0068\u0030\u003d\u0025\u0073\u000a\u0009\u0068\u0031\u003d\u0025\u0073",_geeg ,_bdfc ,_gdaab ,_bgdac );
|
||
};return nil ,false ;};}else {if !_ceae .aligned (){if _geda {_fc .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0076\u0065\u0072\u0074\u0073\u002e\u0020%\u0064",len (_ceae ));
|
||
};return nil ,false ;};if !_fgcbf .aligned (){if _eceg {_fc .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0068\u006f\u0072\u007a\u0073\u002e\u0020%\u0064",len (_fgcbf ));
|
||
};return nil ,false ;};};_edcd :=append (_ceae ,_fgcbf ...);return _edcd ,true ;};func (_ddbg paraList )computeEBBoxes (){if _acgbb {_fc .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_ddebb :=range _ddbg {_ddebb ._bbbc =_ddebb .PdfRectangle ;
|
||
};_cbab :=_ddbg .yNeighbours (0);for _fcagd ,_ecga :=range _ddbg {_fgeea :=_ecga ._bbbc ;_dbfdb ,_ggbg :=-1.0e9,+1.0e9;for _ ,_aga :=range _cbab [_ecga ]{_gbdc :=_ddbg [_aga ]._bbbc ;if _gbdc .Urx < _fgeea .Llx {_dbfdb =_ea .Max (_dbfdb ,_gbdc .Urx );}else if _fgeea .Urx < _gbdc .Llx {_ggbg =_ea .Min (_ggbg ,_gbdc .Llx );
|
||
};};for _aace ,_dgceg :=range _ddbg {_bagag :=_dgceg ._bbbc ;if _fcagd ==_aace ||_bagag .Ury > _fgeea .Lly {continue ;};if _dbfdb <=_bagag .Llx &&_bagag .Llx < _fgeea .Llx {_fgeea .Llx =_bagag .Llx ;}else if _bagag .Urx <=_ggbg &&_fgeea .Urx < _bagag .Urx {_fgeea .Urx =_bagag .Urx ;
|
||
};};if _acgbb {_ae .Printf ("\u0025\u0034\u0064\u003a %\u0036\u002e\u0032\u0066\u2192\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0071\u000a",_fcagd ,_ecga ._bbbc ,_fgeea ,_bgfd (_ecga .text (),50));};_ecga ._bbbc =_fgeea ;};if _acge {for _ ,_ccaga :=range _ddbg {_ccaga .PdfRectangle =_ccaga ._bbbc ;
|
||
};};};func (_gagac *shapesState )newSubPath (){_gagac .clearPath ();if _bcge {_fc .Log .Info ("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073",_gagac );};};func (_aded *wordBag )applyRemovals (_bcag map[int ]map[*textWord ]struct{}){for _cea ,_afac :=range _bcag {if len (_afac )==0{continue ;
|
||
};_dfga :=_aded ._gbbd [_cea ];_cegf :=len (_dfga )-len (_afac );if _cegf ==0{delete (_aded ._gbbd ,_cea );continue ;};_fgag :=make ([]*textWord ,_cegf );_deg :=0;for _ ,_bef :=range _dfga {if _ ,_gcc :=_afac [_bef ];!_gcc {_fgag [_deg ]=_bef ;_deg ++;
|
||
};};_aded ._gbbd [_cea ]=_fgag ;};};func _dddc (_cgda ,_fded bounded )float64 {_geaf :=_dgc (_cgda ,_fded );if !_ecfbd (_geaf ){return _geaf ;};return _bcea (_cgda ,_fded );};type lists []*list ;func (_gabg *textObject )showText (_gag _bad .PdfObject ,_dcc []byte ,_gcae int )error {return _gabg .renderText (_gag ,_dcc ,_gcae );
|
||
};func _ecea (_cdgf *textWord ,_gdag float64 ,_gfeg ,_afabd rulingList )*wordBag {_acffg :=_dafa (_cdgf ._aecg );_gfg :=[]*textWord {_cdgf };_addbg :=wordBag {_gbbd :map[int ][]*textWord {_acffg :_gfg },PdfRectangle :_cdgf .PdfRectangle ,_aad :_cdgf ._aeegf ,_aeceg :_gdag ,_eddc :_gfeg ,_gbfd :_afabd };
|
||
return &_addbg ;};
|
||
|
||
// ExtractFonts returns all font information from the page extractor, including
|
||
// font name, font type, the raw data of the embedded font file (if embedded), font descriptor and more.
|
||
//
|
||
// The argument `previousPageFonts` is used when trying to build a complete font catalog for multiple pages or the entire document.
|
||
// The entries from `previousPageFonts` are added to the returned result unless already included in the page, i.e. no duplicate entries.
|
||
//
|
||
// NOTE: If previousPageFonts is nil, all fonts from the page will be returned. Use it when building up a full list of fonts for a document or page range.
|
||
func (_ee *Extractor )ExtractFonts (previousPageFonts *PageFonts )(*PageFonts ,error ){_gc :=PageFonts {};_fcc :=_gc .extractPageResourcesToFont (_ee ._gf );if _fcc !=nil {return nil ,_fcc ;};if previousPageFonts !=nil {for _ ,_eed :=range previousPageFonts .Fonts {if !_ggb (_gc .Fonts ,_eed .FontName ){_gc .Fonts =append (_gc .Fonts ,_eed );
|
||
};};};return &PageFonts {Fonts :_gc .Fonts },nil ;};
|
||
|
||
// PageFonts represents extracted fonts on a PDF page.
|
||
type PageFonts struct{Fonts []Font ;};func _aggg (_cebg _aec .PdfRectangle ,_ecfeg ,_dbeb ,_dbacf ,_fegb *ruling )gridTile {_fbcc :=_cebg .Llx ;_ebgbc :=_cebg .Urx ;_bec :=_cebg .Lly ;_acda :=_cebg .Ury ;return gridTile {PdfRectangle :_cebg ,_cbfd :_ecfeg !=nil &&_ecfeg .encloses (_bec ,_acda ),_cbdbf :_dbeb !=nil &&_dbeb .encloses (_bec ,_acda ),_fcgc :_dbacf !=nil &&_dbacf .encloses (_fbcc ,_ebgbc ),_fbbf :_fegb !=nil &&_fegb .encloses (_fbcc ,_ebgbc )};
|
||
};func (_aaddg rulingList )aligned ()bool {if len (_aaddg )< 2{return false ;};_ceddc :=make (map[*ruling ]int );_ceddc [_aaddg [0]]=0;for _ ,_geaaf :=range _aaddg [1:]{_eceee :=false ;for _daec :=range _ceddc {if _geaaf .gridIntersecting (_daec ){_ceddc [_daec ]++;
|
||
_eceee =true ;break ;};};if !_eceee {_ceddc [_geaaf ]=0;};};_daefe :=0;for _ ,_eeded :=range _ceddc {if _eeded ==0{_daefe ++;};};_dcfg :=float64 (_daefe )/float64 (len (_aaddg ));_cdbg :=_dcfg <=1.0-_efea ;if _eceg {_fc .Log .Info ("\u0061\u006c\u0069\u0067\u006e\u0065\u0064\u003d\u0025\u0074\u0020\u0075\u006em\u0061\u0074\u0063\u0068\u0065\u0064=\u0025\u002e\u0032\u0066\u003d\u0025\u0064\u002f\u0025\u0064\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_cdbg ,_dcfg ,_daefe ,len (_aaddg ),_aaddg .String ());
|
||
};return _cdbg ;};type textWord struct{_aec .PdfRectangle ;_aecg float64 ;_eedc string ;_ebfa []*textMark ;_aeegf float64 ;_ceff bool ;};func _aagaf (_gagec float64 )float64 {return _edge *_ea .Round (_gagec /_edge )};
|
||
|
||
// String returns a description of `state`.
|
||
func (_bdge *textState )String ()string {_gce :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _bdge ._cbad !=nil {_gce =_bdge ._cbad .BaseFont ();};return _ae .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_bdge ._cga ,_bdge ._ecd ,_bdge ._dgad ,_gce );
|
||
};
|
||
|
||
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
||
func (_cbcg PageText )Marks ()*TextMarkArray {return &TextMarkArray {_bade :_cbcg ._fgb }};func _eefcd (_bdbgb []*textMark ,_cdda _aec .PdfRectangle )[]*textWord {var _ecgca []*textWord ;var _gdgaa *textWord ;if _baf {_fc .Log .Info ("\u006d\u0061\u006beT\u0065\u0078\u0074\u0057\u006f\u0072\u0064\u0073\u003a\u0020\u0025\u0064\u0020\u006d\u0061\u0072\u006b\u0073",len (_bdbgb ));
|
||
};_aefaa :=func (){if _gdgaa !=nil {_ccbc :=_gdgaa .computeText ();if !_dfca (_ccbc ){_gdgaa ._eedc =_ccbc ;_ecgca =append (_ecgca ,_gdgaa );if _baf {_fc .Log .Info ("\u0061\u0064\u0064Ne\u0077\u0057\u006f\u0072\u0064\u003a\u0020\u0025\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",len (_ecgca )-1,_gdgaa .String ());
|
||
for _efee ,_abc :=range _gdgaa ._ebfa {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_efee ,_abc .String ());};};};_gdgaa =nil ;};};for _ ,_dabfd :=range _bdbgb {if _ccgb &&_gdgaa !=nil &&len (_gdgaa ._ebfa )> 0{_ebbb :=_gdgaa ._ebfa [len (_gdgaa ._ebfa )-1];
|
||
_gcbe ,_effb :=_eabc (_dabfd ._gded );_fggc ,_baaf :=_eabc (_ebbb ._gded );if _effb &&!_baaf &&_ebbb .inDiacriticArea (_dabfd ){_gdgaa .addDiacritic (_gcbe );continue ;};if _baaf &&!_effb &&_dabfd .inDiacriticArea (_ebbb ){_gdgaa ._ebfa =_gdgaa ._ebfa [:len (_gdgaa ._ebfa )-1];
|
||
_gdgaa .appendMark (_dabfd ,_cdda );_gdgaa .addDiacritic (_fggc );continue ;};};_fdeaa :=_dfca (_dabfd ._gded );if _fdeaa {_aefaa ();continue ;};if _gdgaa ==nil &&!_fdeaa {_gdgaa =_fbccc ([]*textMark {_dabfd },_cdda );continue ;};_gfdac :=_gdgaa ._aeegf ;
|
||
_aefde :=_ea .Abs (_gadb (_cdda ,_dabfd )-_gdgaa ._aecg )/_gfdac ;_cfcfa :=_cdfb (_dabfd ,_gdgaa )/_gfdac ;if _cfcfa >=_fbdbc ||!(-_babe <=_cfcfa &&_aefde <=_accc ){_aefaa ();_gdgaa =_fbccc ([]*textMark {_dabfd },_cdda );continue ;};_gdgaa .appendMark (_dabfd ,_cdda );
|
||
};_aefaa ();return _ecgca ;};func _cgfcc (_bgdee *PageText )error {_cggc :=_ff .GetLicenseKey ();if _cggc !=nil &&_cggc .IsLicensed ()||_gg {return nil ;};_ae .Printf ("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a");
|
||
_ae .Println ("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f");
|
||
return _d .New ("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064");};func _ggdd (_cfb *textLine )bool {_fecd :=true ;_bgbc :=-1;for _ ,_ffcd :=range _cfb ._bfag {for _ ,_daef :=range _ffcd ._ebfa {_afbf :=_daef ._ebgag ;
|
||
if _bgbc ==-1{_bgbc =_afbf ;}else {if _bgbc !=_afbf {_fecd =false ;break ;};};};};return _fecd ;};func (_adccc intSet )add (_ddebde int ){_adccc [_ddebde ]=struct{}{}};func (_aeef *textTable )getRight ()paraList {_agfc :=make (paraList ,_aeef ._agdc );
|
||
for _fcbb :=0;_fcbb < _aeef ._agdc ;_fcbb ++{_dbdbe :=_aeef .get (_aeef ._afcga -1,_fcbb )._aabe ;if _dbdbe .taken (){return nil ;};_agfc [_fcbb ]=_dbdbe ;};for _febc :=0;_febc < _aeef ._agdc -1;_febc ++{if _agfc [_febc ]._ccee !=_agfc [_febc +1]{return nil ;
|
||
};};return _agfc ;};func (_eaggc rulingList )bbox ()_aec .PdfRectangle {var _bdfg _aec .PdfRectangle ;if len (_eaggc )==0{_fc .Log .Error ("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0062\u0062\u006f\u0078\u003a\u0020n\u006f\u0020\u0072u\u006ci\u006e\u0067\u0073");
|
||
return _aec .PdfRectangle {};};if _eaggc [0]._egdf ==_bfgb {_bdfg .Llx ,_bdfg .Urx =_eaggc .secMinMax ();_bdfg .Lly ,_bdfg .Ury =_eaggc .primMinMax ();}else {_bdfg .Llx ,_bdfg .Urx =_eaggc .primMinMax ();_bdfg .Lly ,_bdfg .Ury =_eaggc .secMinMax ();};return _bdfg ;
|
||
};func _gbca (_gbddb ,_bfcfa int )int {if _gbddb < _bfcfa {return _gbddb ;};return _bfcfa ;};func _age (_dad func (*wordBag ,*textWord ,float64 )bool ,_fecag float64 )func (*wordBag ,*textWord )bool {return func (_fafb *wordBag ,_ggee *textWord )bool {return _dad (_fafb ,_ggee ,_fecag )};
|
||
};func _cbace (_cbfbd *_aec .Image ,_cecf _eg .Color )_ec .Image {_fdgab ,_bgdda :=int (_cbfbd .Width ),int (_cbfbd .Height );_ccfda :=_ec .NewRGBA (_ec .Rect (0,0,_fdgab ,_bgdda ));for _gcebe :=0;_gcebe < _bgdda ;_gcebe ++{for _eadae :=0;_eadae < _fdgab ;
|
||
_eadae ++{_daea ,_cfef :=_cbfbd .ColorAt (_eadae ,_gcebe );if _cfef !=nil {_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e",_eadae ,_gcebe );
|
||
continue ;};_fecdb ,_afaf ,_efbd ,_ :=_daea .RGBA ();var _bacee _eg .Color ;if _fecdb +_afaf +_efbd ==0{_bacee =_cecf ;}else {_bacee =_eg .Transparent ;};_ccfda .Set (_eadae ,_gcebe ,_bacee );};};return _ccfda ;};func (_fbbdf rulingList )primMinMax ()(float64 ,float64 ){_ffgc ,_bgge :=_fbbdf [0]._gbgc ,_fbbdf [0]._gbgc ;
|
||
for _ ,_gggff :=range _fbbdf [1:]{if _gggff ._gbgc < _ffgc {_ffgc =_gggff ._gbgc ;}else if _gggff ._gbgc > _bgge {_bgge =_gggff ._gbgc ;};};return _ffgc ,_bgge ;};func _ggcce (_gfff []TextMark ,_fgee *int ,_bfgg string )[]TextMark {_ecdb :=_dfba ;_ecdb .Text =_bfgg ;
|
||
return _fccge (_gfff ,_fgee ,_ecdb );};func (_abfdf *textTable )log (_aaaf string ){if !_gbead {return ;};_fc .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",_aaaf ,_abfdf ._afcga ,_abfdf ._agdc ,_abfdf ._fbccb ,_abfdf .PdfRectangle );
|
||
for _gdcd :=0;_gdcd < _abfdf ._agdc ;_gdcd ++{for _fbbb :=0;_fbbb < _abfdf ._afcga ;_fbbb ++{_gfcb :=_abfdf .get (_fbbb ,_gdcd );if _gfcb ==nil {continue ;};_ae .Printf ("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",_fbbb ,_gdcd ,_gfcb .PdfRectangle ,_bgfd (_gfcb .text (),50),_e .RuneCountInString (_gfcb .text ()));
|
||
};};};type shapesState struct{_affc _g .Matrix ;_gdfbg _g .Matrix ;_edc []*subpath ;_effc bool ;_cgbcf _g .Point ;_bbdg *textObject ;};func (_eac *wordBag )sort (){for _ ,_dbdb :=range _eac ._gbbd {_a .Slice (_dbdb ,func (_efg ,_gbbg int )bool {return _bcea (_dbdb [_efg ],_dbdb [_gbbg ])< 0});
|
||
};};func _gbgg (_bccaa map[float64 ][]*textLine )[]float64 {_gfddd :=[]float64 {};for _aeag :=range _bccaa {_gfddd =append (_gfddd ,_aeag );};_a .Float64s (_gfddd );return _gfddd ;};
|
||
|
||
// PageImages represents extracted images on a PDF page with spatial information:
|
||
// display position and size.
|
||
type PageImages struct{Images []ImageMark ;};func _bbfff (_edfec ,_ccaag _g .Point )bool {_gdfbc :=_ea .Abs (_edfec .X -_ccaag .X );_gbcba :=_ea .Abs (_edfec .Y -_ccaag .Y );return _gdba (_gdfbc ,_gbcba );};func (_acgef paraList )readBefore (_gaea []int ,_eefg ,_bdcd int )bool {_cdbbe ,_egbed :=_acgef [_eefg ],_acgef [_bdcd ];
|
||
if _dcaa (_cdbbe ,_egbed )&&_cdbbe .Lly > _egbed .Lly {return true ;};if !(_cdbbe ._bbbc .Urx < _egbed ._bbbc .Llx ){return false ;};_eege ,_cgdc :=_cdbbe .Lly ,_egbed .Lly ;if _eege > _cgdc {_cgdc ,_eege =_eege ,_cgdc ;};_fbbe :=_ea .Max (_cdbbe ._bbbc .Llx ,_egbed ._bbbc .Llx );
|
||
_fdgg :=_ea .Min (_cdbbe ._bbbc .Urx ,_egbed ._bbbc .Urx );_bafc :=_acgef .llyRange (_gaea ,_eege ,_cgdc );for _ ,_gdgd :=range _bafc {if _gdgd ==_eefg ||_gdgd ==_bdcd {continue ;};_cdfg :=_acgef [_gdgd ];if _cdfg ._bbbc .Llx <=_fdgg &&_fbbe <=_cdfg ._bbbc .Urx {return false ;
|
||
};};return true ;};
|
||
|
||
// TableInfo gets table information of the textmark `tm`.
|
||
func (_fga *TextMark )TableInfo ()(*TextTable ,[][]int ){if !_fga ._adgb {return nil ,nil ;};_bcad :=_fga ._dfeb ;_fdd :=_bcad .getCellInfo (*_fga );return _bcad ,_fdd ;};func (_gbbgf rulingList )sort (){_a .Slice (_gbbgf ,_gbbgf .comp )};func (_dbcc paraList )llyOrdering ()[]int {_eadead :=make ([]int ,len (_dbcc ));
|
||
for _fcccb :=range _dbcc {_eadead [_fcccb ]=_fcccb ;};_a .SliceStable (_eadead ,func (_cgg ,_adbc int )bool {_cdea ,_egeg :=_eadead [_cgg ],_eadead [_adbc ];return _dbcc [_cdea ].Lly < _dbcc [_egeg ].Lly ;});return _eadead ;};func (_edde paraList )eventNeighbours (_accfd []event )map[*textPara ][]int {_a .Slice (_accfd ,func (_ebgbf ,_ffgec int )bool {_fade ,_gccb :=_accfd [_ebgbf ],_accfd [_ffgec ];
|
||
_gacfe ,_gegce :=_fade ._dfada ,_gccb ._dfada ;if _gacfe !=_gegce {return _gacfe < _gegce ;};if _fade ._aedb !=_gccb ._aedb {return _fade ._aedb ;};return _ebgbf < _ffgec ;});_cagdc :=make (map[int ]intSet );_cdecb :=make (intSet );for _ ,_bbgbc :=range _accfd {if _bbgbc ._aedb {_cagdc [_bbgbc ._dacaa ]=make (intSet );
|
||
for _cbcfa :=range _cdecb {if _cbcfa !=_bbgbc ._dacaa {_cagdc [_bbgbc ._dacaa ].add (_cbcfa );_cagdc [_cbcfa ].add (_bbgbc ._dacaa );};};_cdecb .add (_bbgbc ._dacaa );}else {_cdecb .del (_bbgbc ._dacaa );};};_ccac :=map[*textPara ][]int {};for _dgggb ,_befcc :=range _cagdc {_dcfge :=_edde [_dgggb ];
|
||
if len (_befcc )==0{_ccac [_dcfge ]=nil ;continue ;};_feade :=make ([]int ,len (_befcc ));_bbgf :=0;for _feef :=range _befcc {_feade [_bbgf ]=_feef ;_bbgf ++;};_ccac [_dcfge ]=_feade ;};return _ccac ;};func (_fdde rulingList )secMinMax ()(float64 ,float64 ){_bdgb ,_cfbaa :=_fdde [0]._fgad ,_fdde [0]._ababc ;
|
||
for _ ,_bcgf :=range _fdde [1:]{if _bcgf ._fgad < _bdgb {_bdgb =_bcgf ._fgad ;};if _bcgf ._ababc > _cfbaa {_cfbaa =_bcgf ._ababc ;};};return _bdgb ,_cfbaa ;};func _fgacb (_gbdga []*textWord ,_dadf *textWord )[]*textWord {for _gebg ,_gcgcd :=range _gbdga {if _gcgcd ==_dadf {return _eedce (_gbdga ,_gebg );
|
||
};};_fc .Log .Error ("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_dadf );
|
||
return nil ;};func (_dca *shapesState )closePath (){if _dca ._effc {_dca ._edc =append (_dca ._edc ,_bbcb (_dca ._cgbcf ));_dca ._effc =false ;}else if len (_dca ._edc )==0{if _bcge {_fc .Log .Debug ("\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068");
|
||
};_dca ._effc =false ;return ;};_dca ._edc [len (_dca ._edc )-1].close ();if _bcge {_fc .Log .Info ("\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073",_dca );};};func (_cfdcb *textTable )emptyCompositeColumn (_fbbge int )bool {for _eadaa :=0;
|
||
_eadaa < _cfdcb ._agdc ;_eadaa ++{if _cgfe ,_ffdaa :=_cfdcb ._gaeb [_fgged (_fbbge ,_eadaa )];_ffdaa {if len (_cgfe .paraList )> 0{return false ;};};};return true ;};func (_bdddg *shapesState )drawRectangle (_dabf ,_fgc ,_daed ,_aedf float64 ){if _bcge {_fffc :=_bdddg .devicePoint (_dabf ,_fgc );
|
||
_bdc :=_bdddg .devicePoint (_dabf +_daed ,_fgc +_aedf );_bbaf :=_aec .PdfRectangle {Llx :_fffc .X ,Lly :_fffc .Y ,Urx :_bdc .X ,Ury :_bdc .Y };_fc .Log .Info ("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066",_bbaf );
|
||
};_bdddg .newSubPath ();_bdddg .moveTo (_dabf ,_fgc );_bdddg .lineTo (_dabf +_daed ,_fgc );_bdddg .lineTo (_dabf +_daed ,_fgc +_aedf );_bdddg .lineTo (_dabf ,_fgc +_aedf );_bdddg .closePath ();};func _bgbf (_bgdea string ,_efgcf []rulingList ){_fc .Log .Info ("\u0024\u0024 \u0025\u0064\u0020g\u0072\u0069\u0064\u0073\u0020\u002d\u0020\u0025\u0073",len (_efgcf ),_bgdea );
|
||
for _cccb ,_cdccb :=range _efgcf {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cccb ,_cdccb .String ());};};func _geaa (_fecf _aec .PdfRectangle ,_caga []*textLine )*textPara {return &textPara {PdfRectangle :_fecf ,_bfagf :_caga };};
|
||
func (_dafdc paraList )writeText (_cbcgc _bc .Writer ){for _bgad ,_fbgb :=range _dafdc {if _fbgb ._egbea {continue ;};_fbgb .writeText (_cbcgc );if _bgad !=len (_dafdc )-1{if _gceb (_fbgb ,_dafdc [_bgad +1]){_cbcgc .Write ([]byte ("\u0020"));}else {_cbcgc .Write ([]byte ("\u000a"));
|
||
_cbcgc .Write ([]byte ("\u000a"));};};};_cbcgc .Write ([]byte ("\u000a"));_cbcgc .Write ([]byte ("\u000a"));};func (_cebc *textLine )bbox ()_aec .PdfRectangle {return _cebc .PdfRectangle };func (_abfg rulingList )snapToGroupsDirection ()rulingList {_abfg .sortStrict ();
|
||
_addg :=make (map[*ruling ]rulingList ,len (_abfg ));_aabcd :=_abfg [0];_abbb :=func (_baef *ruling ){_aabcd =_baef ;_addg [_aabcd ]=rulingList {_baef }};_abbb (_abfg [0]);for _ ,_dgcedb :=range _abfg [1:]{if _dgcedb ._gbgc < _aabcd ._gbgc -_bedg {_fc .Log .Error ("\u0073\u006e\u0061\u0070T\u006f\u0047\u0072\u006f\u0075\u0070\u0073\u0044\u0069r\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0057\u0072\u006f\u006e\u0067\u0020\u0070\u0072\u0069\u006da\u0072\u0079\u0020\u006f\u0072d\u0065\u0072\u002e\u000a\u0009\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0020\u0076\u003d\u0025\u0073",_aabcd ,_dgcedb );
|
||
};if _dgcedb ._gbgc > _aabcd ._gbgc +_cabc {_abbb (_dgcedb );}else {_addg [_aabcd ]=append (_addg [_aabcd ],_dgcedb );};};_bcee :=make (map[*ruling ]float64 ,len (_addg ));_ffgeb :=make (map[*ruling ]*ruling ,len (_abfg ));for _fbab ,_ffbf :=range _addg {_bcee [_fbab ]=_ffbf .mergePrimary ();
|
||
for _ ,_gcffa :=range _ffbf {_ffgeb [_gcffa ]=_fbab ;};};for _ ,_aeca :=range _abfg {_aeca ._gbgc =_bcee [_ffgeb [_aeca ]];};_fbedf :=make (rulingList ,0,len (_abfg ));for _ ,_ggefd :=range _addg {_beag :=_ggefd .splitSec ();for _dddfg ,_feaa :=range _beag {_geebc :=_feaa .merge ();
|
||
if len (_fbedf )> 0{_cafcg :=_fbedf [len (_fbedf )-1];if _cafcg .alignsPrimary (_geebc )&&_cafcg .alignsSec (_geebc ){_fc .Log .Error ("\u0073\u006e\u0061\u0070\u0054\u006fG\u0072\u006f\u0075\u0070\u0073\u0044\u0069\u0072\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0044\u0075\u0070\u006ci\u0063\u0061\u0074\u0065\u0020\u0069\u003d\u0025\u0064\u000a\u0009\u0077\u003d\u0025s\u000a\t\u0076\u003d\u0025\u0073",_dddfg ,_cafcg ,_geebc );
|
||
continue ;};};_fbedf =append (_fbedf ,_geebc );};};_fbedf .sortStrict ();return _fbedf ;};
|
||
|
||
// String returns a human readable description of `vecs`.
|
||
func (_bfgcb rulingList )String ()string {if len (_bfgcb )==0{return "\u007b \u0045\u004d\u0050\u0054\u0059\u0020}";};_eacgc ,_bgbda :=_bfgcb .vertsHorzs ();_dafg :=len (_eacgc );_efdga :=len (_bgbda );if _dafg ==0||_efdga ==0{return _ae .Sprintf ("\u007b%\u0064\u0020\u0078\u0020\u0025\u0064}",_dafg ,_efdga );
|
||
};_gfdg :=_aec .PdfRectangle {Llx :_eacgc [0]._gbgc ,Urx :_eacgc [_dafg -1]._gbgc ,Lly :_bgbda [_efdga -1]._gbgc ,Ury :_bgbda [0]._gbgc };return _ae .Sprintf ("\u007b\u0025d\u0020\u0078\u0020%\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u007d",_dafg ,_efdga ,_gfdg );
|
||
};func (_ddd *wordBag )maxDepth ()float64 {return _ddd ._aeceg -_ddd .Lly };func (_egdd *shapesState )clearPath (){_egdd ._edc =nil ;_egdd ._effc =false ;if _bcge {_fc .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_egdd );
|
||
};};func (_bacc rulingList )intersections ()map[int ]intSet {var _gbdag ,_fdeg []int ;for _fcacc ,_bebg :=range _bacc {switch _bebg ._egdf {case _eebe :_gbdag =append (_gbdag ,_fcacc );case _bfgb :_fdeg =append (_fdeg ,_fcacc );};};if len (_gbdag )< _aaaa +1||len (_fdeg )< _acebd +1{return nil ;
|
||
};if len (_gbdag )+len (_fdeg )> _edfde {_fc .Log .Debug ("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064",len (_bacc ),len (_gbdag ),len (_fdeg ));
|
||
return nil ;};_baae :=make (map[int ]intSet ,len (_gbdag )+len (_fdeg ));for _ ,_dace :=range _gbdag {for _ ,_ddacf :=range _fdeg {if _bacc [_dace ].intersects (_bacc [_ddacf ]){if _ ,_defg :=_baae [_dace ];!_defg {_baae [_dace ]=make (intSet );};if _ ,_cbfb :=_baae [_ddacf ];
|
||
!_cbfb {_baae [_ddacf ]=make (intSet );};_baae [_dace ].add (_ddacf );_baae [_ddacf ].add (_dace );};};};return _baae ;};func _bfc (_gebe ,_ccgg _aec .PdfRectangle )bool {return _gebe .Lly <=_ccgg .Ury &&_ccgg .Lly <=_gebe .Ury ;};func (_acdec *textTable )put (_gffd ,_abed int ,_caed *textPara ){_acdec ._bfdff [_fgged (_gffd ,_abed )]=_caed ;
|
||
};func (_eec *textObject )getFontDirect (_faf string )(*_aec .PdfFont ,error ){_cegbd ,_dcce :=_eec .getFontDict (_faf );if _dcce !=nil {return nil ,_dcce ;};_bbd ,_dcce :=_aec .NewPdfFontFromPdfObject (_cegbd );if _dcce !=nil {_fc .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_faf ,_dcce );
|
||
};return _bbd ,_dcce ;};func _bada (_efcce _aec .PdfRectangle )*ruling {return &ruling {_egdf :_bfgb ,_gbgc :_efcce .Lly ,_fgad :_efcce .Llx ,_ababc :_efcce .Urx };};func (_dgfcd *ruling )alignsPrimary (_aegeb *ruling )bool {return _dgfcd ._egdf ==_aegeb ._egdf &&_ea .Abs (_dgfcd ._gbgc -_aegeb ._gbgc )< _cabc *0.5;
|
||
};func (_adag *textLine )endsInHyphen ()bool {_bdef :=_adag ._bfag [len (_adag ._bfag )-1];_edeee :=_bdef ._eedc ;_eeed ,_aaac :=_e .DecodeLastRuneInString (_edeee );if _aaac <=0||!_be .Is (_be .Hyphen ,_eeed ){return false ;};if _bdef ._ceff &&_ecfgf (_edeee ){return true ;
|
||
};return _ecfgf (_adag .text ());};func (_egba compositeCell )split (_bebdg ,_gbeab []float64 )*textTable {_gbgfe :=len (_bebdg )+1;_bgff :=len (_gbeab )+1;if _gbead {_fc .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0043\u0065l\u006c\u002e\u0073\u0070l\u0069\u0074\u003a\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0009\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025\u0073\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073=\u0025\u0036\u002e\u0032\u0066\u000a\t\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d%\u0036\u002e\u0032\u0066",_bgff ,_gbgfe ,_egba ,_bebdg ,_gbeab );
|
||
_ae .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073\u000a",len (_egba .paraList ));for _ecfdgc ,_efbea :=range _egba .paraList {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ecfdgc ,_efbea .String ());
|
||
};_ae .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",len (_egba .lines ()));for _ccfe ,_edce :=range _egba .lines (){_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_ccfe ,_edce );};};_bebdg =_afgc (_bebdg ,_egba .Ury ,_egba .Lly );
|
||
_gbeab =_afgc (_gbeab ,_egba .Llx ,_egba .Urx );_dcda :=make (map[uint64 ]*textPara ,_bgff *_gbgfe );_efgc :=textTable {_afcga :_bgff ,_agdc :_gbgfe ,_bfdff :_dcda };_gfaea :=_egba .paraList ;_a .Slice (_gfaea ,func (_bedb ,_edbc int )bool {_ggff ,_dcddb :=_gfaea [_bedb ],_gfaea [_edbc ];
|
||
_edcea ,_gcbc :=_ggff .Lly ,_dcddb .Lly ;if _edcea !=_gcbc {return _edcea < _gcbc ;};return _ggff .Llx < _dcddb .Llx ;});_bdbf :=make (map[uint64 ]_aec .PdfRectangle ,_bgff *_gbgfe );for _bfad ,_eacg :=range _bebdg [1:]{_feab :=_bebdg [_bfad ];for _gedc ,_aefa :=range _gbeab [1:]{_eafgb :=_gbeab [_gedc ];
|
||
_bdbf [_fgged (_gedc ,_bfad )]=_aec .PdfRectangle {Llx :_eafgb ,Urx :_aefa ,Lly :_eacg ,Ury :_feab };};};if _gbead {_fc .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0043\u0065l\u006c\u002e\u0073\u0070\u006c\u0069\u0074\u003a\u0020\u0072e\u0063\u0074\u0073");
|
||
_ae .Printf ("\u0020\u0020\u0020\u0020");for _dcegd :=0;_dcegd < _bgff ;_dcegd ++{_ae .Printf ("\u0025\u0033\u0030\u0064\u002c\u0020",_dcegd );};_ae .Println ();for _daadc :=0;_daadc < _gbgfe ;_daadc ++{_ae .Printf ("\u0020\u0020\u0025\u0032\u0064\u003a",_daadc );
|
||
for _decca :=0;_decca < _bgff ;_decca ++{_ae .Printf ("\u00256\u002e\u0032\u0066\u002c\u0020",_bdbf [_fgged (_decca ,_daadc )]);};_ae .Println ();};};_afca :=func (_cgcga *textLine )(int ,int ){for _ggbgc :=0;_ggbgc < _gbgfe ;_ggbgc ++{for _cedda :=0;_cedda < _bgff ;
|
||
_cedda ++{if _dfdf (_bdbf [_fgged (_cedda ,_ggbgc )],_cgcga .PdfRectangle ){return _cedda ,_ggbgc ;};};};return -1,-1;};_gdaba :=make (map[uint64 ][]*textLine ,_bgff *_gbgfe );for _ ,_bfce :=range _gfaea .lines (){_ddacc ,_geed :=_afca (_bfce );if _ddacc < 0{continue ;
|
||
};_gdaba [_fgged (_ddacc ,_geed )]=append (_gdaba [_fgged (_ddacc ,_geed )],_bfce );};for _gbfdd :=0;_gbfdd < len (_bebdg )-1;_gbfdd ++{_ggdge :=_bebdg [_gbfdd ];_deea :=_bebdg [_gbfdd +1];for _deee :=0;_deee < len (_gbeab )-1;_deee ++{_eeaa :=_gbeab [_deee ];
|
||
_cebf :=_gbeab [_deee +1];_gbadd :=_aec .PdfRectangle {Llx :_eeaa ,Urx :_cebf ,Lly :_deea ,Ury :_ggdge };_cgedg :=_gdaba [_fgged (_deee ,_gbfdd )];if len (_cgedg )==0{continue ;};_ceeda :=_geaa (_gbadd ,_cgedg );_efgc .put (_deee ,_gbfdd ,_ceeda );};};
|
||
return &_efgc ;};func _gfbf (_fedfg []structElement ,_edgb map[int ][]*textLine ,_cfbg _bad .PdfObject )[]*list {_cabbc :=[]*list {};for _ ,_cedag :=range _fedfg {_dgdbb :=_cedag ._ccaac ;_ebgb :=int (_cedag ._fab );_cbfg :=_cedag ._aeff ;_bcgc :=[]*textLine {};
|
||
_gagd :=[]*list {};_aabc :=_cedag ._ecdd ;_gdgb ,_fcda :=(_aabc .(*_bad .PdfObjectReference ));if !_fcda {_fc .Log .Debug ("\u0066\u0061\u0069l\u0065\u0064\u0020\u006f\u0074\u0020\u0063\u0061\u0073\u0074\u0020\u0074\u006f\u0020\u002a\u0063\u006f\u0072\u0065\u002e\u0050\u0064\u0066\u004f\u0062\u006a\u0065\u0063\u0074R\u0065\u0066\u0065\u0072\u0065\u006e\u0063\u0065");
|
||
};if _ebgb !=-1&&_gdgb !=nil {if _dgcc ,_bbea :=_edgb [_ebgb ];_bbea {if _bdeb ,_dccea :=_cfbg .(*_bad .PdfIndirectObject );_dccea {_aaec :=_bdeb .PdfObjectReference ;if _fb .DeepEqual (*_gdgb ,_aaec ){_bcgc =_dgcc ;};};};};if _dgdbb !=nil {_gagd =_gfbf (_dgdbb ,_edgb ,_cfbg );
|
||
};_ddab :=_deec (_bcgc ,_cbfg ,_gagd );_cabbc =append (_cabbc ,_ddab );};return _cabbc ;};func (_afdd *textTable )subdivide ()*textTable {_afdd .logComposite ("\u0073u\u0062\u0064\u0069\u0076\u0069\u0064e");_edgcd :=_afdd .compositeRowCorridors ();_gaee :=_afdd .compositeColCorridors ();
|
||
if _gbead {_fc .Log .Info ("\u0073u\u0062\u0064i\u0076\u0069\u0064\u0065:\u000a\u0009\u0072o\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s=\u0025\u0073\u000a\t\u0063\u006fl\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073=\u0025\u0073",_dbeg (_edgcd ),_dbeg (_gaee ));
|
||
};if len (_edgcd )==0||len (_gaee )==0{return _afdd ;};_bedae (_edgcd );_bedae (_gaee );if _gbead {_fc .Log .Info ("\u0073\u0075\u0062\u0064\u0069\u0076\u0069\u0064\u0065\u0020\u0066\u0069\u0078\u0065\u0064\u003a\u000a\u0009r\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073\u003d\u0025\u0073\u000a\u0009\u0063\u006f\u006cC\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d\u0025\u0073",_dbeg (_edgcd ),_dbeg (_gaee ));
|
||
};_baff ,_fgfbb :=_fbacb (_afdd ._agdc ,_edgcd );_ceedc ,_bbcac :=_fbacb (_afdd ._afcga ,_gaee );_egbd :=make (map[uint64 ]*textPara ,_bbcac *_fgfbb );_ffed :=&textTable {PdfRectangle :_afdd .PdfRectangle ,_fbccb :_afdd ._fbccb ,_agdc :_fgfbb ,_afcga :_bbcac ,_bfdff :_egbd };
|
||
if _gbead {_fc .Log .Info ("\u0073\u0075b\u0064\u0069\u0076\u0069\u0064\u0065\u003a\u0020\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0020\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0063\u0065\u006c\u006c\u0073\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0079\u004f\u0066\u0066\u0073\u0065\u0074\u0073=\u0025\u002b\u0076\u000a"+"\u0009\u0078\u004f\u0066\u0066\u0073\u0065\u0074\u0073\u003d\u0025\u002b\u0076",_afdd ._afcga ,_afdd ._agdc ,_bbcac ,_fgfbb ,_dbeg (_edgcd ),_dbeg (_gaee ),_baff ,_ceedc );
|
||
};for _gabfd :=0;_gabfd < _afdd ._agdc ;_gabfd ++{_ccda :=_baff [_gabfd ];for _acab :=0;_acab < _afdd ._afcga ;_acab ++{_fcgfd :=_ceedc [_acab ];if _gbead {_ae .Printf ("\u0025\u0036\u0064\u002c %\u0032\u0064\u003a\u0020\u0078\u0030\u003d\u0025\u0064\u0020\u0079\u0030\u003d\u0025d\u000a",_acab ,_gabfd ,_fcgfd ,_ccda );
|
||
};_gcdaa ,_fdcf :=_afdd ._gaeb [_fgged (_acab ,_gabfd )];if !_fdcf {continue ;};_deac :=_gcdaa .split (_edgcd [_gabfd ],_gaee [_acab ]);for _bdacf :=0;_bdacf < _deac ._agdc ;_bdacf ++{for _aecea :=0;_aecea < _deac ._afcga ;_aecea ++{_dbgf :=_deac .get (_aecea ,_bdacf );
|
||
_ffed .put (_fcgfd +_aecea ,_ccda +_bdacf ,_dbgf );if _gbead {_ae .Printf ("\u0025\u0038\u0064\u002c\u0020\u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_fcgfd +_aecea ,_ccda +_bdacf ,_dbgf );};};};};};return _ffed ;};func (_ebeg *wordBag )makeRemovals ()map[int ]map[*textWord ]struct{}{_ceda :=make (map[int ]map[*textWord ]struct{},len (_ebeg ._gbbd ));
|
||
for _bdbc :=range _ebeg ._gbbd {_ceda [_bdbc ]=make (map[*textWord ]struct{});};return _ceda ;};
|
||
|
||
// ApplyArea processes the page text only within the specified area `bbox`.
|
||
// Each time ApplyArea is called, it updates the result set in `pt`.
|
||
// Can be called multiple times in a row with different bounding boxes.
|
||
func (_eggd *PageText )ApplyArea (bbox _aec .PdfRectangle ){_faad :=make ([]*textMark ,0,len (_eggd ._dbfe ));for _ ,_aac :=range _eggd ._dbfe {if _fbdc (_aac .bbox (),bbox ){_faad =append (_faad ,_aac );};};var _ebedg paraList ;_cadf :=len (_faad );for _gdad :=0;
|
||
_gdad < 360&&_cadf > 0;_gdad +=90{_ccb :=make ([]*textMark ,0,len (_faad )-_cadf );for _ ,_bbbg :=range _faad {if _bbbg ._eeacf ==_gdad {_ccb =append (_ccb ,_bbbg );};};if len (_ccb )> 0{_aedd :=_fdba (_ccb ,_eggd ._babf ,nil ,nil ,_eggd ._cdgg ._cgff );
|
||
_ebedg =append (_ebedg ,_aedd ...);_cadf -=len (_ccb );};};_bedd :=new (_fe .Buffer );_ebedg .writeText (_bedd );_eggd ._eede =_bedd .String ();_eggd ._fgb =_ebedg .toTextMarks ();_eggd ._gba =_ebedg .tables ();};func _eccfd (_ebgd ,_cced int )int {if _ebgd > _cced {return _ebgd ;
|
||
};return _cced ;};func _dbdad (_ddga []*textLine ,_fbdbb map[float64 ][]*textLine )[]*list {_gaae :=_gbgg (_fbdbb );_gcff :=[]*list {};if len (_gaae )==0{return _gcff ;};_faag :=_gaae [0];_afdc :=1;_fegf :=_fbdbb [_faag ];for _dccbd ,_eaa :=range _fegf {var _bgabg float64 ;
|
||
_efced :=[]*list {};_debg :=_eaa ._gaca ;_gcdc :=-1.0;if _dccbd < len (_fegf )-1{_gcdc =_fegf [_dccbd +1]._gaca ;};if _afdc < len (_gaae ){_efced =_fgeda (_ddga ,_fbdbb ,_gaae ,_afdc ,_debg ,_gcdc );};_bgabg =_gcdc ;if len (_efced )> 0{_eacd :=_efced [0];
|
||
if len (_eacd ._gagag )> 0{_bgabg =_eacd ._gagag [0]._gaca ;};};_bbac :=[]*textLine {_eaa };_fcfd :=_eagc (_eaa ,_ddga ,_gaae ,_debg ,_bgabg );_bbac =append (_bbac ,_fcfd ...);_cedd :=_deec (_bbac ,"\u0062\u0075\u006c\u006c\u0065\u0074",_efced );_cedd ._begg =_aadg (_bbac ,"");
|
||
_gcff =append (_gcff ,_cedd );};return _gcff ;};type list struct{_gagag []*textLine ;_efac string ;_bffd []*list ;_begg string ;};func _fbbg (_ccad *paraList )map[int ][]*textLine {_cabb :=map[int ][]*textLine {};for _ ,_baec :=range *_ccad {for _ ,_cce :=range _baec ._bfagf {if !_ggdd (_cce ){_fc .Log .Debug ("g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e");
|
||
continue ;};_ddfb :=_cce ._bfag [0]._ebfa [0]._ebgag ;_cabb [_ddfb ]=append (_cabb [_ddfb ],_cce );};if _baec ._caaa !=nil {_bdgd :=_baec ._caaa ._bfdff ;for _ ,_gcgf :=range _bdgd {for _ ,_gddg :=range _gcgf ._bfagf {if !_ggdd (_gddg ){_fc .Log .Debug ("g\u0072\u006f\u0075p\u004c\u0069\u006e\u0065\u0073\u003a\u0020\u0054\u0068\u0065\u0020\u0074\u0065\u0078\u0074\u0020\u006c\u0069\u006e\u0065\u0020\u0063\u006f\u006e\u0074a\u0069\u006e\u0073 \u006d\u006f\u0072\u0065\u0020\u0074\u0068\u0061\u006e\u0020\u006f\u006e\u0065 \u006d\u0063\u0069\u0064 \u006e\u0075\u006d\u0062e\u0072\u002e\u0020\u0049\u0074\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020\u0062\u0065\u0020\u0073p\u006c\u0069\u0074\u002e");
|
||
continue ;};_ggdc :=_gddg ._bfag [0]._ebfa [0]._ebgag ;_cabb [_ggdc ]=append (_cabb [_ggdc ],_gddg );};};};};return _cabb ;};func _cffd (_feca *_ba .ContentStreamOperation )(float64 ,error ){if len (_feca .Params )!=1{_dgff :=_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");
|
||
_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_feca .Operand ,1,len (_feca .Params ),_feca .Params );
|
||
return 0.0,_dgff ;};return _bad .GetNumberAsFloat (_feca .Params [0]);};func (_cca *shapesState )devicePoint (_fdfc ,_bgg float64 )_g .Point {_bcfa :=_cca ._gdfbg .Mult (_cca ._affc );_fdfc ,_bgg =_bcfa .Transform (_fdfc ,_bgg );return _g .NewPoint (_fdfc ,_bgg );
|
||
};func _cffa (_abgcc *wordBag ,_egce int )*textLine {_abddg :=_abgcc .firstWord (_egce );_fdca :=textLine {PdfRectangle :_abddg .PdfRectangle ,_fgcb :_abddg ._aeegf ,_gaca :_abddg ._aecg };_fdca .pullWord (_abgcc ,_abddg ,_egce );return &_fdca ;};func (_eddf *textTable )getComposite (_bffg ,_ffef int )(paraList ,_aec .PdfRectangle ){_eeced ,_cdeef :=_eddf ._gaeb [_fgged (_bffg ,_ffef )];
|
||
if _gbead {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0067\u0065\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u002d\u003e\u0025\u0073\u000a",_bffg ,_ffef ,_eeced .String ());};if !_cdeef {return nil ,_aec .PdfRectangle {};
|
||
};return _eeced .parasBBox ();};type compositeCell struct{_aec .PdfRectangle ;paraList ;};func _deec (_fgce []*textLine ,_gcge string ,_baagd []*list )*list {return &list {_gagag :_fgce ,_efac :_gcge ,_bffd :_baagd };};func (_cbed rulingList )splitSec ()[]rulingList {_a .Slice (_cbed ,func (_eebeg ,_gbfdg int )bool {_gfeda ,_gddf :=_cbed [_eebeg ],_cbed [_gbfdg ];
|
||
if _gfeda ._fgad !=_gddf ._fgad {return _gfeda ._fgad < _gddf ._fgad ;};return _gfeda ._ababc < _gddf ._ababc ;});_cgaca :=make (map[*ruling ]struct{},len (_cbed ));_ecaed :=func (_dfgde *ruling )rulingList {_feafc :=rulingList {_dfgde };_cgaca [_dfgde ]=struct{}{};
|
||
for _ ,_bdaaf :=range _cbed {if _ ,_edfce :=_cgaca [_bdaaf ];_edfce {continue ;};for _ ,_gdef :=range _feafc {if _bdaaf .alignsSec (_gdef ){_feafc =append (_feafc ,_bdaaf );_cgaca [_bdaaf ]=struct{}{};break ;};};};return _feafc ;};_ggga :=[]rulingList {_ecaed (_cbed [0])};
|
||
for _ ,_fbad :=range _cbed [1:]{if _ ,_eefce :=_cgaca [_fbad ];_eefce {continue ;};_ggga =append (_ggga ,_ecaed (_fbad ));};return _ggga ;};func (_bbff *textObject )reset (){_bbff ._acc =_g .IdentityMatrix ();_bbff ._dde =_g .IdentityMatrix ();_bbff ._fcee =nil ;
|
||
};func (_gfee *ruling )equals (_bfgd *ruling )bool {return _gfee ._egdf ==_bfgd ._egdf &&_cdffd (_gfee ._gbgc ,_bfgd ._gbgc )&&_cdffd (_gfee ._fgad ,_bfgd ._fgad )&&_cdffd (_gfee ._ababc ,_bfgd ._ababc );};func (_abb *wordBag )blocked (_aca *textWord )bool {if _aca .Urx < _abb .Llx {_bgdc :=_dgea (_aca .PdfRectangle );
|
||
_dfed :=_fdfb (_abb .PdfRectangle );if _abb ._eddc .blocks (_bgdc ,_dfed ){if _dfbb {_fc .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0078\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_aca ,_abb );};return true ;};}else if _abb .Urx < _aca .Llx {_gaadd :=_dgea (_abb .PdfRectangle );
|
||
_cecb :=_fdfb (_aca .PdfRectangle );if _abb ._eddc .blocks (_gaadd ,_cecb ){if _dfbb {_fc .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0078\u2192\u0020:\u0020\u0025\u0073\u0020%s",_aca ,_abb );};return true ;};};if _aca .Ury < _abb .Lly {_bcae :=_ggabg (_aca .PdfRectangle );
|
||
_cfdd :=_bada (_abb .PdfRectangle );if _abb ._gbfd .blocks (_bcae ,_cfdd ){if _dfbb {_fc .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0079\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_aca ,_abb );};return true ;};}else if _abb .Ury < _aca .Lly {_dbe :=_ggabg (_abb .PdfRectangle );
|
||
_fdbg :=_bada (_aca .PdfRectangle );if _abb ._gbfd .blocks (_dbe ,_fdbg ){if _dfbb {_fc .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0079\u2192\u0020:\u0020\u0025\u0073\u0020%s",_aca ,_abb );};return true ;};};return false ;};func _bbbba (_abeeg []compositeCell )[]float64 {var _bdgee []*textLine ;
|
||
_bfdea :=0;for _ ,_aacdc :=range _abeeg {_bfdea +=len (_aacdc .paraList );_bdgee =append (_bdgee ,_aacdc .lines ()...);};_a .Slice (_bdgee ,func (_dgead ,_eace int )bool {_fgcea ,_abdbe :=_bdgee [_dgead ],_bdgee [_eace ];_cedaeg ,_egddd :=_fgcea ._gaca ,_abdbe ._gaca ;
|
||
if !_ecfbd (_cedaeg -_egddd ){return _cedaeg < _egddd ;};return _fgcea .Llx < _abdbe .Llx ;});if _gbead {_ae .Printf ("\u0020\u0020\u0020 r\u006f\u0077\u0042\u006f\u0072\u0064\u0065\u0072\u0073:\u0020%\u0064 \u0070a\u0072\u0061\u0073\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",_bfdea ,len (_bdgee ));
|
||
for _ecebb ,_cafdc :=range _bdgee {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_ecebb ,_cafdc );};};var _egff []float64 ;_fbdab :=_bdgee [0];var _ddba [][]*textLine ;_facb :=[]*textLine {_fbdab };for _facd ,_bfbed :=range _bdgee [1:]{if _bfbed .Ury < _fbdab .Lly {_abaac :=0.5*(_bfbed .Ury +_fbdab .Lly );
|
||
if _gbead {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u003c\u0020\u0025\u0036.\u0032f\u0020\u0062\u006f\u0072\u0064\u0065\u0072\u003d\u0025\u0036\u002e\u0032\u0066\u000a"+"\u0009\u0020\u0071\u003d\u0025\u0073\u000a\u0009\u0020p\u003d\u0025\u0073\u000a",_facd ,_bfbed .Ury ,_fbdab .Lly ,_abaac ,_fbdab ,_bfbed );
|
||
};_egff =append (_egff ,_abaac );_ddba =append (_ddba ,_facb );_facb =nil ;};_facb =append (_facb ,_bfbed );if _bfbed .Lly < _fbdab .Lly {_fbdab =_bfbed ;};};if len (_facb )> 0{_ddba =append (_ddba ,_facb );};if _gbead {_ae .Printf (" \u0020\u0020\u0020\u0020\u0020\u0020 \u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073\u003d%\u0036.\u0032\u0066\u000a",_egff );
|
||
};if _gbead {_fc .Log .Info ("\u0072\u006f\u0077\u003d\u0025\u0064",len (_abeeg ));for _bdgcf ,_cegd :=range _abeeg {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bdgcf ,_cegd );};_fc .Log .Info ("\u0067r\u006f\u0075\u0070\u0073\u003d\u0025d",len (_ddba ));
|
||
for _ccfg ,_adcae :=range _ddba {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0064\u000a",_ccfg ,len (_adcae ));for _cdfdf ,_ccdgd :=range _adcae {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_cdfdf ,_ccdgd );};};};_gdcee :=true ;
|
||
for _agabc ,_aedg :=range _ddba {_dfgc :=true ;for _ffffb ,_efbg :=range _abeeg {if _gbead {_ae .Printf ("\u0020\u0020\u0020\u007e\u007e\u007e\u0067\u0072\u006f\u0075\u0070\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u0020\u0063\u0065\u006cl\u0020\u0025\u0064\u0020\u006ff\u0020\u0025d\u0020\u0025\u0073\u000a",_agabc ,len (_ddba ),_ffffb ,len (_abeeg ),_efbg );
|
||
};if !_efbg .hasLines (_aedg ){if _gbead {_ae .Printf ("\u0020\u0020\u0020\u0021\u0021\u0021\u0067\u0072\u006f\u0075\u0070\u0020\u0025d\u0020\u006f\u0066\u0020\u0025\u0064 \u0063\u0065\u006c\u006c\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064 \u004f\u0055\u0054\u000a",_agabc ,len (_ddba ),_ffffb ,len (_abeeg ));
|
||
};_dfgc =false ;break ;};};if !_dfgc {_gdcee =false ;break ;};};if !_gdcee {if _gbead {_fc .Log .Info ("\u0072\u006f\u0077\u0020\u0063o\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u0020\u0064\u006f\u006e\u0027\u0074 \u0073\u0070\u0061\u006e\u0020\u0061\u006c\u006c\u0020\u0063\u0065\u006c\u006c\u0073\u0020\u0069\u006e\u0020\u0072\u006f\u0077\u002e\u0020\u0069\u0067\u006e\u006f\u0072\u0069\u006eg");
|
||
};_egff =nil ;};if _gbead &&_egff !=nil {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u002a\u002a*\u0072\u006f\u0077\u0043\u006f\u0072\u0072i\u0064\u006f\u0072\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u000a",_egff );};return _egff ;};
|
||
|
||
|
||
// New returns an Extractor instance for extracting content from the input PDF page.
|
||
func New(page *_aec.PdfPage) (*Extractor, error) {
	// A nil options pointer is handed on to NewWithOptions unchanged.
	var defaults *Options
	return NewWithOptions(page, defaults)
}
|
||
|
||
// ImageExtractOptions contains options for controlling image extraction from
|
||
// PDF pages.
|
||
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};var _gg =false ;func _eagbe (_degdd map[float64 ]map[float64 ]gridTile )[]float64 {_fdbcb :=make ([]float64 ,0,len (_degdd ));_fdfcc :=make (map[float64 ]struct{},len (_degdd ));for _ ,_aaba :=range _degdd {for _fcgf :=range _aaba {if _ ,_ccfdb :=_fdfcc [_fcgf ];
|
||
_ccfdb {continue ;};_fdbcb =append (_fdbcb ,_fcgf );_fdfcc [_fcgf ]=struct{}{};};};_a .Float64s (_fdbcb );return _fdbcb ;};
|
||
|
||
// String returns a description of `b`.
|
||
func (_bgbb *wordBag )String ()string {var _gagf []string ;for _ ,_aecf :=range _bgbb .depthIndexes (){_efbe :=_bgbb ._gbbd [_aecf ];for _ ,_egdc :=range _efbe {_gagf =append (_gagf ,_egdc ._eedc );};};return _ae .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_bgbb .PdfRectangle ,_bgbb ._aad ,len (_gagf ),_gagf );
|
||
};func (_dfe *textObject )checkOp (_edfd *_ba .ContentStreamOperation ,_agbe int ,_ecb bool )(_gbdf bool ,_dbad error ){if _dfe ==nil {var _fedf []_bad .PdfObject ;if _agbe > 0{_fedf =_edfd .Params ;if len (_fedf )> _agbe {_fedf =_fedf [:_agbe ];};};_fc .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_edfd .Operand ,_fedf );
|
||
};if _agbe >=0{if len (_edfd .Params )!=_agbe {if _ecb {_dbad =_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_edfd .Operand ,_agbe ,len (_edfd .Params ),_edfd .Params );
|
||
return false ,_dbad ;};};return true ,nil ;};type rectRuling struct{_cbae rulingKind ;_bgc markKind ;_eg .Color ;_aec .PdfRectangle ;};func _adgf (_bgadg []rulingList )(rulingList ,rulingList ){var _ffgca rulingList ;for _ ,_defa :=range _bgadg {_ffgca =append (_ffgca ,_defa ...);
|
||
};return _ffgca .vertsHorzs ();};func (_ebfdd *textTable )computeBbox ()_aec .PdfRectangle {var _fbff _aec .PdfRectangle ;_cggf :=false ;for _ccdf :=0;_ccdf < _ebfdd ._agdc ;_ccdf ++{for _ccba :=0;_ccba < _ebfdd ._afcga ;_ccba ++{_fbef :=_ebfdd .get (_ccba ,_ccdf );
|
||
if _fbef ==nil {continue ;};if !_cggf {_fbff =_fbef .PdfRectangle ;_cggf =true ;}else {_fbff =_agfb (_fbff ,_fbef .PdfRectangle );};};};return _fbff ;};func (_geagf *shapesState )stroke (_gfdf *[]pathSection ){_dgd :=pathSection {_gbag :_geagf ._edc ,Color :_geagf ._bbdg .getStrokeColor ()};
|
||
*_gfdf =append (*_gfdf ,_dgd );if _eceg {_ae .Printf ("\u0020 \u0020\u0020S\u0054\u0052\u004fK\u0045\u003a\u0020\u0025\u0064\u0020\u0073t\u0072\u006f\u006b\u0065\u0073\u0020s\u0073\u003d\u0025\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d%\u002b\u0076\u0020\u0025\u0036\u002e\u0032\u0066\u000a",len (*_gfdf ),_geagf ,_geagf ._bbdg .getStrokeColor (),_dgd .bbox ());
|
||
if _ggbbe {for _egdgg ,_gbfe :=range _geagf ._edc {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_egdgg ,_gbfe );if _egdgg ==10{break ;};};};};};
|
||
|
||
// Text gets the extracted text contained in `l`.
|
||
func (_ffffa *list )Text ()string {_dcag :=&_bb .Builder {};_gafc :="";_ggge (_ffffa ,_dcag ,&_gafc );return _dcag .String ();};func (_fbg *imageExtractContext )processOperand (_cbg *_ba .ContentStreamOperation ,_afd _ba .GraphicsState ,_egg *_aec .PdfPageResources )error {if _cbg .Operand =="\u0042\u0049"&&len (_cbg .Params )==1{_dbc ,_dgg :=_cbg .Params [0].(*_ba .ContentStreamInlineImage );
|
||
if !_dgg {return nil ;};if _cdg ,_cg :=_bad .GetBoolVal (_dbc .ImageMask );_cg {if _cdg &&!_fbg ._gfc .IncludeInlineStencilMasks {return nil ;};};return _fbg .extractInlineImage (_dbc ,_afd ,_egg );}else if _cbg .Operand =="\u0044\u006f"&&len (_cbg .Params )==1{_gd ,_eeb :=_bad .GetName (_cbg .Params [0]);
|
||
if !_eeb {_fc .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _ad ;};_ ,_cgd :=_egg .GetXObjectByName (*_gd );switch _cgd {case _aec .XObjectTypeImage :return _fbg .extractXObjectImage (_gd ,_afd ,_egg );case _aec .XObjectTypeForm :return _fbg .extractFormImages (_gd ,_afd ,_egg );
|
||
};}else if _fbg ._ece &&(_cbg .Operand =="\u0073\u0063\u006e"||_cbg .Operand =="\u0053\u0043\u004e")&&len (_cbg .Params )==1{_ebe ,_gcde :=_bad .GetName (_cbg .Params [0]);if !_gcde {_fc .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");
|
||
return _ad ;};_cab ,_gcde :=_egg .GetPatternByName (*_ebe );if !_gcde {_fc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0050\u0061\u0074\u0074\u0065\u0072n\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075\u006e\u0064");return nil ;};if _cab .IsTiling (){_caf :=_cab .GetAsTilingPattern ();
|
||
_fed ,_afb :=_caf .GetContentStream ();if _afb !=nil {return _afb ;};_afb =_fbg .extractContentStreamImages (string (_fed ),_caf .Resources );if _afb !=nil {return _afb ;};};}else if (_cbg .Operand =="\u0063\u0073"||_cbg .Operand =="\u0043\u0053")&&len (_cbg .Params )>=1{_fbg ._ece =_cbg .Params [0].String ()=="\u0050a\u0074\u0074\u0065\u0072\u006e";
|
||
};return nil ;};func _degd (_agfbf ,_fcfe _aec .PdfRectangle )(_aec .PdfRectangle ,bool ){if !_fbdc (_agfbf ,_fcfe ){return _aec .PdfRectangle {},false ;};return _aec .PdfRectangle {Llx :_ea .Max (_agfbf .Llx ,_fcfe .Llx ),Urx :_ea .Min (_agfbf .Urx ,_fcfe .Urx ),Lly :_ea .Max (_agfbf .Lly ,_fcfe .Lly ),Ury :_ea .Min (_agfbf .Ury ,_fcfe .Ury )},true ;
|
||
};type rulingKind int ;func (_ecddc *structTreeRoot )parseStructTreeRoot (_gbgf _bad .PdfObject ){if _gbgf !=nil {_baga ,_cef :=_bad .GetDict (_gbgf );if !_cef {_fc .Log .Debug ("\u0070\u0061\u0072s\u0065\u0053\u0074\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u003a\u0020\u0064\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006eo\u0074\u0020\u0066\u006f\u0075\u006e\u0064\u002e");
|
||
};K :=_baga .Get ("\u004b");_gced :=_baga .Get ("\u0054\u0079\u0070\u0065").String ();var _abge *_bad .PdfObjectArray ;switch _egdggd :=K .(type ){case *_bad .PdfObjectArray :_abge =_egdggd ;case *_bad .PdfObjectReference :_abge =_bad .MakeArray (K );};
|
||
_ddgd :=[]structElement {};for _ ,_aeddg :=range _abge .Elements (){_daedf :=&structElement {};_daedf .parseStructElement (_aeddg );_ddgd =append (_ddgd ,*_daedf );};_ecddc ._bagdg =_ddgd ;_ecddc ._gage =_gced ;};};func _dbef (_bfac ,_efggf _g .Point )bool {_gfge :=_ea .Abs (_bfac .X -_efggf .X );
|
||
_gadf :=_ea .Abs (_bfac .Y -_efggf .Y );return _gdba (_gadf ,_gfge );};func (_gabgd *wordBag )absorb (_cdba *wordBag ){_gbea :=_cdba .makeRemovals ();for _fcag ,_agd :=range _cdba ._gbbd {for _ ,_egca :=range _agd {_gabgd .pullWord (_egca ,_fcag ,_gbea );
|
||
};};_cdba .applyRemovals (_gbea );};var _gdfd *_f .Regexp =_f .MustCompile (_cgce +"\u007c"+_ddac );type ruling struct{_egdf rulingKind ;_dcebd markKind ;_eg .Color ;_gbgc float64 ;_fgad float64 ;_ababc float64 ;_aecega float64 ;};
|
||
|
||
// Extractor stores and offers functionality for extracting content from PDF pages.
|
||
type Extractor struct{_ffd string ;_gf *_aec .PdfPageResources ;_c _aec .PdfRectangle ;_af *_aec .PdfRectangle ;_fba map[string ]fontEntry ;_gfe map[string ]textResult ;_fec int64 ;_fff int ;_fg *Options ;_ga *_bad .PdfObject ;_bd _bad .PdfObject ;};func (_feed *textTable )isExportable ()bool {if _feed ._fbccb {return true ;
|
||
};_badc :=func (_fbge int )bool {_acbdb :=_feed .get (0,_fbge );if _acbdb ==nil {return false ;};_fdcd :=_acbdb .text ();_cdffe :=_e .RuneCountInString (_fdcd );_dddce :=_aecab .MatchString (_fdcd );return _cdffe <=1||_dddce ;};for _bdebc :=0;_bdebc < _feed ._agdc ;
|
||
_bdebc ++{if !_badc (_bdebc ){return true ;};};return false ;};func (_fgbc *wordBag )getDepthIdx (_fag float64 )int {_geea :=_fgbc .depthIndexes ();_gbbb :=_dafa (_fag );if _gbbb < _geea [0]{return _geea [0];};if _gbbb > _geea [len (_geea )-1]{return _geea [len (_geea )-1];
|
||
};return _gbbb ;};func (_bbdc *textPara )writeText (_fage _bc .Writer ){if _bbdc ._caaa ==nil {_bbdc .writeCellText (_fage );return ;};for _faffg :=0;_faffg < _bbdc ._caaa ._agdc ;_faffg ++{for _egbeg :=0;_egbeg < _bbdc ._caaa ._afcga ;_egbeg ++{_dgfb :=_bbdc ._caaa .get (_egbeg ,_faffg );
|
||
if _dgfb ==nil {_fage .Write ([]byte ("\u0009"));}else {_dgfb .writeCellText (_fage );};_fage .Write ([]byte ("\u0020"));};if _faffg < _bbdc ._caaa ._agdc -1{_fage .Write ([]byte ("\u000a"));};};};func _bedae (_agfe map[int ][]float64 ){if len (_agfe )<=1{return ;
|
||
};_dggb :=_gged (_agfe );if _gbead {_fc .Log .Info ("\u0066i\u0078C\u0065\u006c\u006c\u0073\u003a \u006b\u0065y\u0073\u003d\u0025\u002b\u0076",_dggb );};var _edec ,_agaba int ;for _edec ,_agaba =range _dggb {if _agfe [_agaba ]!=nil {break ;};};for _dgbce ,_daga :=range _dggb [_edec :]{_cfdfb :=_agfe [_daga ];
|
||
if _cfdfb ==nil {continue ;};if _gbead {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u006b\u0030\u003d\u0025\u0064\u0020\u006b1\u003d\u0025\u0064\u000a",_edec +_dgbce ,_agaba ,_daga );};_cadffc :=_agfe [_daga ];if _cadffc [len (_cadffc )-1]> _cfdfb [0]{_cadffc [len (_cadffc )-1]=_cfdfb [0];
|
||
_agfe [_agaba ]=_cadffc ;};_agaba =_daga ;};};type cachedImage struct{_cb *_aec .Image ;_ecf _aec .PdfColorspace ;};func _acfc (_cbgda []*textLine )map[float64 ][]*textLine {_a .Slice (_cbgda ,func (_aebff ,_dgdf int )bool {return _cbgda [_aebff ]._gaca < _cbgda [_dgdf ]._gaca });
|
||
_abdc :=map[float64 ][]*textLine {};for _ ,_cdfd :=range _cbgda {_addd :=_egfdf (_cdfd );_addd =_ea .Round (_addd );_abdc [_addd ]=append (_abdc [_addd ],_cdfd );};return _abdc ;};func (_bbda *wordBag )removeDuplicates (){if _gabga {_fc .Log .Info ("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071",_bbda .text ());
|
||
};for _ ,_ggbad :=range _bbda .depthIndexes (){if len (_bbda ._gbbd [_ggbad ])==0{continue ;};_abgf :=_bbda ._gbbd [_ggbad ][0];_ddgab :=_dfdcc *_abgf ._aeegf ;_ddad :=_abgf ._aecg ;for _ ,_gfce :=range _bbda .depthBand (_ddad ,_ddad +_ddgab ){_fdec :=map[*textWord ]struct{}{};
|
||
_ecfe :=_bbda ._gbbd [_gfce ];for _ ,_aagg :=range _ecfe {if _ ,_abgec :=_fdec [_aagg ];_abgec {continue ;};for _ ,_afcg :=range _ecfe {if _ ,_cefc :=_fdec [_afcg ];_cefc {continue ;};if _afcg !=_aagg &&_afcg ._eedc ==_aagg ._eedc &&_ea .Abs (_afcg .Llx -_aagg .Llx )< _ddgab &&_ea .Abs (_afcg .Urx -_aagg .Urx )< _ddgab &&_ea .Abs (_afcg .Lly -_aagg .Lly )< _ddgab &&_ea .Abs (_afcg .Ury -_aagg .Ury )< _ddgab {_fdec [_afcg ]=struct{}{};
|
||
};};};if len (_fdec )> 0{_cfgdb :=0;for _ ,_gecb :=range _ecfe {if _ ,_addec :=_fdec [_gecb ];!_addec {_ecfe [_cfgdb ]=_gecb ;_cfgdb ++;};};_bbda ._gbbd [_gfce ]=_ecfe [:len (_ecfe )-len (_fdec )];if len (_bbda ._gbbd [_gfce ])==0{delete (_bbda ._gbbd ,_gfce );
|
||
};};};};};func (_feeg *subpath )last ()_g .Point {return _feeg ._acfg [len (_feeg ._acfg )-1]};type lineRuling struct{_cdbfg rulingKind ;_cdac markKind ;_eg .Color ;_bcfgb ,_befe _g .Point ;};func _cgdbc (_fcff []int )[]int {_babfd :=make ([]int ,len (_fcff ));
|
||
for _cafc ,_gcee :=range _fcff {_babfd [len (_fcff )-1-_cafc ]=_gcee ;};return _babfd ;};func (_bffce paraList )toTextMarks ()[]TextMark {_gbdgg :=0;var _eadea []TextMark ;for _egde ,_egbe :=range _bffce {if _egbe ._egbea {continue ;};_gegc :=_egbe .toTextMarks (&_gbdgg );
|
||
_eadea =append (_eadea ,_gegc ...);if _egde !=len (_bffce )-1{if _gceb (_egbe ,_bffce [_egde +1]){_eadea =_ggcce (_eadea ,&_gbdgg ,"\u0020");}else {_eadea =_ggcce (_eadea ,&_gbdgg ,"\u000a");_eadea =_ggcce (_eadea ,&_gbdgg ,"\u000a");};};};_eadea =_ggcce (_eadea ,&_gbdgg ,"\u000a");
|
||
_eadea =_ggcce (_eadea ,&_gbdgg ,"\u000a");return _eadea ;};func (_aegf *textTable )logComposite (_cgfef string ){if !_gbead {return ;};_fc .Log .Info ("\u007e~\u007eP\u0061\u0072\u0061\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_aegf ._afcga ,_aegf ._agdc ,_cgfef );
|
||
_ae .Printf ("\u0025\u0035\u0073 \u007c","");for _eedee :=0;_eedee < _aegf ._afcga ;_eedee ++{_ae .Printf ("\u0025\u0033\u0064 \u007c",_eedee );};_ae .Println ("");_ae .Printf ("\u0025\u0035\u0073 \u002b","");for _cdgab :=0;_cdgab < _aegf ._afcga ;_cdgab ++{_ae .Printf ("\u0025\u0033\u0073 \u002b","\u002d\u002d\u002d");
|
||
};_ae .Println ("");for _cbbad :=0;_cbbad < _aegf ._agdc ;_cbbad ++{_ae .Printf ("\u0025\u0035\u0064 \u007c",_cbbad );for _agdcc :=0;_agdcc < _aegf ._afcga ;_agdcc ++{_fdce ,_ :=_aegf ._gaeb [_fgged (_agdcc ,_cbbad )].parasBBox ();_ae .Printf ("\u0025\u0033\u0064 \u007c",len (_fdce ));
|
||
};_ae .Println ("");};_fc .Log .Info ("\u007e~\u007eT\u0065\u0078\u0074\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_aegf ._afcga ,_aegf ._agdc ,_cgfef );_ae .Printf ("\u0025\u0035\u0073 \u007c","");for _ecaf :=0;_ecaf < _aegf ._afcga ;_ecaf ++{_ae .Printf ("\u0025\u0031\u0032\u0064\u0020\u007c",_ecaf );
|
||
};_ae .Println ("");_ae .Printf ("\u0025\u0035\u0073 \u002b","");for _ggcd :=0;_ggcd < _aegf ._afcga ;_ggcd ++{_ae .Print ("\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d-\u002d\u002d\u002d\u002b");};_ae .Println ("");for _bfgdb :=0;_bfgdb < _aegf ._agdc ;
|
||
_bfgdb ++{_ae .Printf ("\u0025\u0035\u0064 \u007c",_bfgdb );for _ddebd :=0;_ddebd < _aegf ._afcga ;_ddebd ++{_gabdg ,_ :=_aegf ._gaeb [_fgged (_ddebd ,_bfgdb )].parasBBox ();_fcbe :="";_addbd :=_gabdg .merge ();if _addbd !=nil {_fcbe =_addbd .text ();};
|
||
_fcbe =_ae .Sprintf ("\u0025\u0071",_bgfd (_fcbe ,12));_fcbe =_fcbe [1:len (_fcbe )-1];_ae .Printf ("\u0025\u0031\u0032\u0073\u0020\u007c",_fcbe );};_ae .Println ("");};};func (_gdbc *textPara )bbox ()_aec .PdfRectangle {return _gdbc .PdfRectangle };func (_cadb *structElement )parseStructElement (_aaag _bad .PdfObject ){_ggcc ,_dbga :=_bad .GetDict (_aaag );
|
||
if !_dbga {_fc .Log .Debug ("\u0070\u0061\u0072\u0073\u0065\u0053\u0074\u0072u\u0063\u0074\u0045le\u006d\u0065\u006e\u0074\u003a\u0020d\u0069\u0063\u0074\u0069\u006f\u006e\u0061\u0072\u0079\u0020\u006f\u0062\u006a\u0065\u0063t\u0020\u006e\u006f\u0074\u0020\u0066\u006f\u0075n\u0064\u002e");
|
||
return ;};_bcafg :=_ggcc .Get ("\u0053");_cdgd :=_ggcc .Get ("\u0050\u0067");_gaadf :="";if _bcafg !=nil {_gaadf =_bcafg .String ();};_badb :=_ggcc .Get ("\u004b");_cadb ._aeff =_gaadf ;_cadb ._ecdd =_cdgd ;switch _feg :=_badb .(type ){case *_bad .PdfObjectInteger :_cadb ._aeff =_gaadf ;
|
||
_cadb ._fab =int64 (*_feg );_cadb ._ecdd =_cdgd ;case *_bad .PdfObjectReference :_efeaa :=*_bad .MakeArray (_feg );var _ddce int64 =-1;_cadb ._fab =_ddce ;if _efeaa .Len ()==1{_cadc :=_efeaa .Elements ()[0];_cbgb ,_ddfag :=_cadc .(*_bad .PdfObjectInteger );
|
||
if _ddfag {_ddce =int64 (*_cbgb );_cadb ._fab =_ddce ;_cadb ._aeff =_gaadf ;_cadb ._ecdd =_cdgd ;return ;};};_eedg :=[]structElement {};for _ ,_agcce :=range _efeaa .Elements (){_geeb ,_dge :=_agcce .(*_bad .PdfObjectInteger );if _dge {_ddce =int64 (*_geeb );
|
||
_cadb ._fab =_ddce ;_cadb ._aeff =_gaadf ;}else {_gbfa :=&structElement {};_gbfa .parseStructElement (_agcce );_eedg =append (_eedg ,*_gbfa );};_ddce =-1;};_cadb ._ccaac =_eedg ;case *_bad .PdfObjectArray :_efaa :=_badb .(*_bad .PdfObjectArray );var _ecgb int64 =-1;
|
||
_cadb ._fab =_ecgb ;if _efaa .Len ()==1{_gdea :=_efaa .Elements ()[0];_eece ,_gdee :=_gdea .(*_bad .PdfObjectInteger );if _gdee {_ecgb =int64 (*_eece );_cadb ._fab =_ecgb ;_cadb ._aeff =_gaadf ;_cadb ._ecdd =_cdgd ;return ;};};_cdbb :=[]structElement {};
|
||
for _ ,_egge :=range _efaa .Elements (){_cadfb ,_cfeeg :=_egge .(*_bad .PdfObjectInteger );if _cfeeg {_ecgb =int64 (*_cadfb );_cadb ._fab =_ecgb ;_cadb ._aeff =_gaadf ;_cadb ._ecdd =_cdgd ;}else {_fedfc :=&structElement {};_fedfc .parseStructElement (_egge );
|
||
_cdbb =append (_cdbb ,*_fedfc );};_ecgb =-1;};_cadb ._ccaac =_cdbb ;};};func (_ffgb *imageExtractContext )extractContentStreamImages (_ggc string ,_ef *_aec .PdfPageResources )error {_dbf :=_ba .NewContentStreamParser (_ggc );_eff ,_bga :=_dbf .Parse ();
|
||
if _bga !=nil {return _bga ;};if _ffgb ._cf ==nil {_ffgb ._cf =map[*_bad .PdfObjectStream ]*cachedImage {};};if _ffgb ._gfc ==nil {_ffgb ._gfc =&ImageExtractOptions {};};_fdc :=_ba .NewContentStreamProcessor (*_eff );_fdc .AddHandler (_ba .HandlerConditionEnumAllOperands ,"",_ffgb .processOperand );
|
||
return _fdc .Process (_ef );};func (_badea *textTable )emptyCompositeRow (_ebde int )bool {for _fcdg :=0;_fcdg < _badea ._afcga ;_fcdg ++{if _bdba ,_ggcba :=_badea ._gaeb [_fgged (_fcdg ,_ebde )];_ggcba {if len (_bdba .paraList )> 0{return false ;};};};
|
||
return true ;};type pathSection struct{_gbag []*subpath ;_eg .Color ;};
|
||
|
||
// NewWithOptions returns an Extractor instance for extracting content from the input PDF page with options.
|
||
func NewWithOptions (page *_aec .PdfPage ,options *Options )(*Extractor ,error ){const _ffc ="\u0065x\u0074\u0072\u0061\u0063\u0074\u006f\u0072\u002e\u004e\u0065\u0077W\u0069\u0074\u0068\u004f\u0070\u0074\u0069\u006f\u006e\u0073";_fa ,_df :=page .GetAllContentStreams ();
|
||
if _df !=nil {return nil ,_df ;};_dac ,_afa :=page .GetStructTreeRoot ();if !_afa {_fc .Log .Info ("T\u0068\u0065\u0020\u0070\u0064\u0066\u0020\u0064\u006f\u0063\u0075\u006d\u0065\u006e\u0074\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020\u0074\u0061\u0067g\u0065d\u002e\u0020\u0053\u0074r\u0075\u0063t\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e\u0027\u0074\u0020\u0065\u0078\u0069\u0073\u0074\u002e");
|
||
};_cd :=page .GetContainingPdfObject ();_agg ,_df :=page .GetMediaBox ();if _df !=nil {return nil ,_ae .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_df );
|
||
};_gga :=&Extractor {_ffd :_fa ,_gf :page .Resources ,_c :*_agg ,_af :page .CropBox ,_fba :map[string ]fontEntry {},_gfe :map[string ]textResult {},_fg :options ,_ga :_dac ,_bd :_cd };if _gga ._c .Llx > _gga ._c .Urx {_fc .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_gga ._c );
|
||
_gga ._c .Llx ,_gga ._c .Urx =_gga ._c .Urx ,_gga ._c .Llx ;};if _gga ._c .Lly > _gga ._c .Ury {_fc .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_gga ._c );
|
||
_gga ._c .Lly ,_gga ._c .Ury =_gga ._c .Ury ,_gga ._c .Lly ;};_ff .TrackUse (_ffc );return _gga ,nil ;};func (_efe *shapesState )lastpointEstablished ()(_g .Point ,bool ){if _efe ._effc {return _efe ._cgbcf ,false ;};_agbef :=len (_efe ._edc );if _agbef > 0&&_efe ._edc [_agbef -1]._bbdf {return _efe ._edc [_agbef -1].last (),false ;
|
||
};return _g .Point {},true ;};func (_efddf *ruling )encloses (_dcae ,_edgdc float64 )bool {return _efddf ._fgad -_ecce <=_dcae &&_edgdc <=_efddf ._ababc +_ecce ;};func _fbdc (_dfa ,_gfaf _aec .PdfRectangle )bool {return _bae (_dfa ,_gfaf )&&_bfc (_dfa ,_gfaf )};
|
||
|
||
|
||
// ExtractPageImages returns the image contents of the page extractor, including data
|
||
// and position, size information for each image.
|
||
// A set of options to control page image extraction can be passed in. The options
|
||
// parameter can be nil for the default options. By default, inline stencil masks
|
||
// are not extracted.
|
||
func (_ge *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_ade :=&imageExtractContext {_gfc :options };_gcd :=_ade .extractContentStreamImages (_ge ._ffd ,_ge ._gf );if _gcd !=nil {return nil ,_gcd ;};return &PageImages {Images :_ade ._bff },nil ;
|
||
};func (_dfedb *wordBag )depthBand (_aefee ,_cgffb float64 )[]int {if len (_dfedb ._gbbd )==0{return nil ;};return _dfedb .depthRange (_dfedb .getDepthIdx (_aefee ),_dfedb .getDepthIdx (_cgffb ));};func (_geee *shapesState )lineTo (_adgdc ,_cagf float64 ){if _bcge {_fc .Log .Info ("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_adgdc ,_cagf ,_geee .devicePoint (_adgdc ,_cagf ));
|
||
};_geee .addPoint (_adgdc ,_cagf );};func _affac (_ebga *list )[]*textLine {for _ ,_dgced :=range _ebga ._bffd {switch _dgced ._efac {case "\u004c\u0042\u006fd\u0079":if len (_dgced ._gagag )!=0{return _dgced ._gagag ;};return _affac (_dgced );case "\u0053\u0070\u0061\u006e":return _dgced ._gagag ;
|
||
case "I\u006e\u006c\u0069\u006e\u0065\u0053\u0068\u0061\u0070\u0065":return _dgced ._gagag ;};};return nil ;};func (_bfaag rulingList )comp (_gcffb ,_dadbc int )bool {_adaef ,_aeagc :=_bfaag [_gcffb ],_bfaag [_dadbc ];_ceef ,_ebf :=_adaef ._egdf ,_aeagc ._egdf ;
|
||
if _ceef !=_ebf {return _ceef > _ebf ;};if _ceef ==_cbfe {return false ;};_fdac :=func (_cbac bool )bool {if _ceef ==_bfgb {return _cbac ;};return !_cbac ;};_dbae ,_cggd :=_adaef ._gbgc ,_aeagc ._gbgc ;if _dbae !=_cggd {return _fdac (_dbae > _cggd );};
|
||
_dbae ,_cggd =_adaef ._fgad ,_aeagc ._fgad ;if _dbae !=_cggd {return _fdac (_dbae < _cggd );};return _fdac (_adaef ._ababc < _aeagc ._ababc );};var _debga =map[markKind ]string {_bggb :"\u0073\u0074\u0072\u006f\u006b\u0065",_gagfa :"\u0066\u0069\u006c\u006c",_dbed :"\u0061u\u0067\u006d\u0065\u006e\u0074"};
|
||
|
||
|
||
// String returns a description of `k`.
|
||
func (_dfff rulingKind )String ()string {_bfcef ,_gacf :=_bcccf [_dfff ];if !_gacf {return _ae .Sprintf ("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064",_dfff );};return _bfcef ;};func (_fgfd *textLine )markWordBoundaries (){_fcae :=_gdda *_fgfd ._fgcb ;
|
||
for _edef ,_gdab :=range _fgfd ._bfag [1:]{if _cdfb (_gdab ,_fgfd ._bfag [_edef ])>=_fcae {_gdab ._ceff =true ;};};};
|
||
|
||
// String returns a description of `tm`.
|
||
func (_gfag *textMark )String ()string {return _ae .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_gfag .PdfRectangle ,_gfag ._cbbae ,_gfag ._gded );};func _faff (_cgdg structElement )[]structElement {_fdcbc :=[]structElement {};
|
||
for _ ,_dedg :=range _cgdg ._ccaac {for _ ,_def :=range _dedg ._ccaac {for _ ,_efce :=range _def ._ccaac {if _efce ._aeff =="\u004c"{_fdcbc =append (_fdcbc ,_efce );};};};};return _fdcbc ;};func (_abaf *TextMarkArray )exists (_fcacd TextMark )bool {for _ ,_bdb :=range _abaf .Elements (){if _fb .DeepEqual (_fcacd .DirectObject ,_bdb .DirectObject )&&_fb .DeepEqual (_fcacd .BBox ,_bdb .BBox )&&_bdb .Text ==_fcacd .Text {return true ;
|
||
};};return false ;};var _ddac string ="\u005e\u005b\u0061\u002d\u007a\u0041\u002dZ\u005d\u0028\u005c)\u007c\u005c\u002e)\u007c\u005e[\u005c\u0064\u005d\u002b\u0028\u005c)\u007c\\.\u0029\u007c\u005e\u005c\u0028\u005b\u0061\u002d\u007a\u0041\u002d\u005a\u005d\u005c\u0029\u007c\u005e\u005c\u0028\u005b\u005c\u0064\u005d\u002b\u005c\u0029";
|
||
func _edcb (_edgfg *list )[]*list {var _bbccd []*list ;for _ ,_cgeg :=range _edgfg ._bffd {switch _cgeg ._efac {case "\u004c\u0049":_dbdcb :=_affac (_cgeg );_eceb :=_edcb (_cgeg );_ebef :=_deec (_dbdcb ,"\u0062\u0075\u006c\u006c\u0065\u0074",_eceb );_cagfe :=_aadg (_dbdcb ,"");
|
||
_ebef ._begg =_cagfe ;_bbccd =append (_bbccd ,_ebef );case "\u004c\u0042\u006fd\u0079":return _edcb (_cgeg );case "\u004c":_dgcf :=_edcb (_cgeg );_bbccd =append (_bbccd ,_dgcf ...);return _bbccd ;};};return _bbccd ;};func (_fbeg *wordBag )allWords ()[]*textWord {var _ddf []*textWord ;
|
||
for _ ,_dbdc :=range _fbeg ._gbbd {_ddf =append (_ddf ,_dbdc ...);};return _ddf ;};func _dgc (_bbbe ,_eade bounded )float64 {return _aadc (_bbbe )-_aadc (_eade )};func (_acgb *stateStack )pop ()*textState {if _acgb .empty (){return nil ;};_cdcc :=*(*_acgb )[len (*_acgb )-1];
|
||
*_acgb =(*_acgb )[:len (*_acgb )-1];return &_cdcc ;};func _eedce (_edcf []*textWord ,_cbbbd int )[]*textWord {_bcbdb :=len (_edcf );copy (_edcf [_cbbbd :],_edcf [_cbbbd +1:]);return _edcf [:_bcbdb -1];};func (_aaaca gridTile )contains (_eafb _aec .PdfRectangle )bool {if _aaaca .numBorders ()< 3{return false ;
|
||
};if _aaaca ._cbfd &&_eafb .Llx < _aaaca .Llx -_ebcd {return false ;};if _aaaca ._cbdbf &&_eafb .Urx > _aaaca .Urx +_ebcd {return false ;};if _aaaca ._fcgc &&_eafb .Lly < _aaaca .Lly -_ebcd {return false ;};if _aaaca ._fbbf &&_eafb .Ury > _aaaca .Ury +_ebcd {return false ;
|
||
};return true ;};
|
||
|
||
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
||
func (_ecae *TextMarkArray )BBox ()(_aec .PdfRectangle ,bool ){var _cfe _aec .PdfRectangle ;_aeea :=false ;for _ ,_cfea :=range _ecae ._bade {if _cfea .Meta ||_dfca (_cfea .Text ){continue ;};if _aeea {_cfe =_agfb (_cfe ,_cfea .BBox );}else {_cfe =_cfea .BBox ;
|
||
_aeea =true ;};};return _cfe ,_aeea ;};func (_bcbgf rulingList )removeDuplicates ()rulingList {if len (_bcbgf )==0{return nil ;};_bcbgf .sort ();_bcgcf :=rulingList {_bcbgf [0]};for _ ,_daebe :=range _bcbgf [1:]{if _daebe .equals (_bcgcf [len (_bcgcf )-1]){continue ;
|
||
};_bcgcf =append (_bcgcf ,_daebe );};return _bcgcf ;};var (_abec =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};
|
||
);func (_agbbf rulingList )toGrids ()[]rulingList {if _eceg {_fc .Log .Info ("t\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0073",_agbbf );};_bdfb :=_agbbf .intersections ();if _eceg {_fc .Log .Info ("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020",len (_agbbf ),len (_bdfb ));
|
||
for _ ,_dffa :=range _fcea (_bdfb ){_ae .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_dffa ,_bdfb [_dffa ]);};};_bdgg :=make (map[int ]intSet ,len (_agbbf ));for _gecc :=range _agbbf {_bccg :=_agbbf .connections (_bdfb ,_gecc );if len (_bccg )> 0{_bdgg [_gecc ]=_bccg ;
|
||
};};if _eceg {_fc .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064",len (_bdgg ));for _ ,_gcgac :=range _fcea (_bdgg ){_ae .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_gcgac ,_bdgg [_gcgac ]);
|
||
};};_bcace :=_dbega (len (_agbbf ),func (_fcfeg ,_cdde int )bool {_fdbbb ,_eggeg :=len (_bdgg [_fcfeg ]),len (_bdgg [_cdde ]);if _fdbbb !=_eggeg {return _fdbbb > _eggeg ;};return _agbbf .comp (_fcfeg ,_cdde );});if _eceg {_fc .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076",_bcace );
|
||
};_gede :=[][]int {{_bcace [0]}};_fdaa :for _ ,_gebc :=range _bcace [1:]{for _acef ,_adcc :=range _gede {for _ ,_beff :=range _adcc {if _bdgg [_beff ].has (_gebc ){_gede [_acef ]=append (_adcc ,_gebc );continue _fdaa ;};};};_gede =append (_gede ,[]int {_gebc });
|
||
};if _eceg {_fc .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076",_gede );};_a .SliceStable (_gede ,func (_dgbad ,_bebdb int )bool {return len (_gede [_dgbad ])> len (_gede [_bebdb ])});for _ ,_geaaa :=range _gede {_a .Slice (_geaaa ,func (_cagd ,_degef int )bool {return _agbbf .comp (_geaaa [_cagd ],_geaaa [_degef ])});
|
||
};_cacd :=make ([]rulingList ,len (_gede ));for _fgbb ,_afbb :=range _gede {_gcacb :=make (rulingList ,len (_afbb ));for _efgcg ,_dggc :=range _afbb {_gcacb [_efgcg ]=_agbbf [_dggc ];};_cacd [_fgbb ]=_gcacb ;};if _eceg {_fc .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020g\u0072\u0069d\u0073\u003d\u0025\u002b\u0076",_cacd );
|
||
};var _aabb []rulingList ;for _ ,_cffg :=range _cacd {if _eefc ,_ffab :=_cffg .isActualGrid ();_ffab {_cffg =_eefc ;_cffg =_cffg .snapToGroups ();_aabb =append (_aabb ,_cffg );};};if _eceg {_bgbf ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0061\u0063t\u0075\u0061\u006c\u0047ri\u0064\u0073",_aabb );
|
||
_fc .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064",len (_cacd ),len (_aabb ));};return _aabb ;};
|
||
|
||
// Len returns the number of TextMarks in `ma`.
|
||
func (_fef *TextMarkArray )Len ()int {if _fef ==nil {return 0;};return len (_fef ._bade );};
|
||
|
||
// Font represents the font properties on a PDF page.
|
||
type Font struct{PdfFont *_aec .PdfFont ;
|
||
|
||
// FontName represents Font Name from font properties.
|
||
FontName string ;
|
||
|
||
// FontType represents Font Subtype entry in the font dictionary inside page resources.
|
||
// Examples : type0, Type1, MMType1, Type3, TrueType, CIDFont.
|
||
FontType string ;
|
||
|
||
// ToUnicode is true if font provides a `ToUnicode` mapping.
|
||
ToUnicode bool ;
|
||
|
||
// IsCID is true if underlying font is a composite font.
|
||
// Composite font is represented by a font dictionary whose Subtype is `Type0`
|
||
IsCID bool ;
|
||
|
||
// IsSimple is true if font is simple font.
|
||
// A simple font is limited to only 8 bit (255) character codes.
|
||
IsSimple bool ;
|
||
|
||
// FontData represents the raw data of the embedded font file.
|
||
// It can have format TrueType (TTF), PostScript Font (PFB) or Compact Font Format (CCF).
|
||
// FontData value can be indicates from `FontFile`, `FontFile2` or `FontFile3` inside Font Descriptor.
|
||
// At most, only one of `FontFile`, `FontFile2` or `FontFile3` will be FontData value.
|
||
FontData []byte ;
|
||
|
||
// FontFileName is a name representing the font. it has format:
|
||
// (Font Name) + (Font Type Extension), example: helvetica.ttf.
|
||
FontFileName string ;
|
||
|
||
// FontDescriptor represents metrics and other attributes inside font properties from PDF Structure (Font Descriptor).
|
||
FontDescriptor *_aec .PdfFontDescriptor ;};func (_dfag *textMark )inDiacriticArea (_cdcf *textMark )bool {_dcea :=_dfag .Llx -_cdcf .Llx ;_dgfd :=_dfag .Urx -_cdcf .Urx ;_eabf :=_dfag .Lly -_cdcf .Lly ;return _ea .Abs (_dcea +_dgfd )< _dfag .Width ()*_gccc &&_ea .Abs (_eabf )< _dfag .Height ()*_gccc ;
|
||
};func (_gecac *textWord )addDiacritic (_fadd string ){_afgf :=_gecac ._ebfa [len (_gecac ._ebfa )-1];_afgf ._gded +=_fadd ;_afgf ._gded =_da .NFKC .String (_afgf ._gded );};func (_bafe rectRuling )asRuling ()(*ruling ,bool ){_decb :=ruling {_egdf :_bafe ._cbae ,Color :_bafe .Color ,_dcebd :_gagfa };
|
||
switch _bafe ._cbae {case _eebe :_decb ._gbgc =0.5*(_bafe .Llx +_bafe .Urx );_decb ._fgad =_bafe .Lly ;_decb ._ababc =_bafe .Ury ;_agbfa ,_bbag :=_bafe .checkWidth (_bafe .Llx ,_bafe .Urx );if !_bbag {if _bdf {_fc .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067V\u0065\u0072\u0074\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_bafe );
|
||
};return nil ,false ;};_decb ._aecega =_agbfa ;case _bfgb :_decb ._gbgc =0.5*(_bafe .Lly +_bafe .Ury );_decb ._fgad =_bafe .Llx ;_decb ._ababc =_bafe .Urx ;_eagcf ,_gbcb :=_bafe .checkWidth (_bafe .Lly ,_bafe .Ury );if !_gbcb {if _bdf {_fc .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067H\u006f\u0072\u007a\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_bafe );
|
||
};return nil ,false ;};_decb ._aecega =_eagcf ;default:_fc .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_bafe ._cbae );return nil ,false ;};return &_decb ,true ;};func (_eaac *textTable )bbox ()_aec .PdfRectangle {return _eaac .PdfRectangle };
|
||
|
||
|
||
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
||
// It takes into account character encodings in the PDF file, which are decoded by
|
||
// CharcodeBytesToUnicode.
|
||
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
||
func (_aaf *Extractor )ExtractText ()(string ,error ){_cfc ,_ ,_ ,_edf :=_aaf .ExtractTextWithStats ();return _cfc ,_edf ;};func _egf (_dfgf _g .Point )_g .Matrix {return _g .TranslationMatrix (_dfgf .X ,_dfgf .Y )};func (_eccgb *textTable )reduceTiling (_eecc gridTiling ,_gfafd float64 )*textTable {_efcg :=make ([]int ,0,_eccgb ._agdc );
|
||
_cgaa :=make ([]int ,0,_eccgb ._afcga );_dagf :=_eecc ._fecg ;_ebfg :=_eecc ._eceeed ;for _dcdcc :=0;_dcdcc < _eccgb ._agdc ;_dcdcc ++{_cafga :=_dcdcc > 0&&_ea .Abs (_ebfg [_dcdcc -1]-_ebfg [_dcdcc ])< _gfafd &&_eccgb .emptyCompositeRow (_dcdcc );if !_cafga {_efcg =append (_efcg ,_dcdcc );
|
||
};};for _ecba :=0;_ecba < _eccgb ._afcga ;_ecba ++{_dgcfd :=_ecba < _eccgb ._afcga -1&&_ea .Abs (_dagf [_ecba +1]-_dagf [_ecba ])< _gfafd &&_eccgb .emptyCompositeColumn (_ecba );if !_dgcfd {_cgaa =append (_cgaa ,_ecba );};};if len (_efcg )==_eccgb ._agdc &&len (_cgaa )==_eccgb ._afcga {return _eccgb ;
|
||
};_ffga :=textTable {_fbccb :_eccgb ._fbccb ,_afcga :len (_cgaa ),_agdc :len (_efcg ),_gaeb :make (map[uint64 ]compositeCell ,len (_cgaa )*len (_efcg ))};if _gbead {_fc .Log .Info ("\u0072\u0065\u0064\u0075c\u0065\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0025d\u0078%\u0064\u0020\u002d\u003e\u0020\u0025\u0064x\u0025\u0064",_eccgb ._afcga ,_eccgb ._agdc ,len (_cgaa ),len (_efcg ));
|
||
_fc .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_cgaa );_fc .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_efcg );};for _ffee ,_bbfdb :=range _efcg {for _acecd ,_agec :=range _cgaa {_gaba ,_bege :=_eccgb .getComposite (_agec ,_bbfdb );
|
||
if len (_gaba )==0{continue ;};if _gbead {_ae .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_acecd ,_ffee ,_agec ,_bbfdb ,_bgfd (_gaba .merge ().text (),50));};_ffga .putComposite (_acecd ,_ffee ,_gaba ,_bege );
|
||
};};return &_ffga ;};func (_dffg rulingList )toTilings ()(rulingList ,[]gridTiling ){_dffg .log ("\u0074o\u0054\u0069\u006c\u0069\u006e\u0067s");if len (_dffg )==0{return nil ,nil ;};_dffg =_dffg .tidied ("\u0061\u006c\u006c");_dffg .log ("\u0074\u0069\u0064\u0069\u0065\u0064");
|
||
_cgdba :=_dffg .toGrids ();_gbce :=make ([]gridTiling ,len (_cgdba ));for _agab ,_bgde :=range _cgdba {_gbce [_agab ]=_bgde .asTiling ();};return _dffg ,_gbce ;};func (_cdbab paraList )findTables (_cgcge []gridTiling )[]*textTable {_cdbab .addNeighbours ();
|
||
_a .Slice (_cdbab ,func (_gfedd ,_acbdf int )bool {return _abaa (_cdbab [_gfedd ],_cdbab [_acbdf ])< 0});var _fbebg []*textTable ;if _eecg {_effcf :=_cdbab .findGridTables (_cgcge );_fbebg =append (_fbebg ,_effcf ...);};if _ebada {_affe :=_cdbab .findTextTables ();
|
||
_fbebg =append (_fbebg ,_affe ...);};return _fbebg ;};const (_aea ="\u0045\u0052R\u004f\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074\u002c\u0020\u0069\u006e\u0076\u0061\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065";
|
||
_bf ="\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0043a\u006e\u0027\u0074 g\u0065\u0074\u0020\u0066\u006f\u006et\u0020\u0070\u0072\u006f\u0070\u0065\u0072\u0074\u0069\u0065\u0073\u002c\u0020\u0066\u006fn\u0074\u0020\u006e\u006f\u0074\u0020\u0066\u006fu\u006e\u0064";
|
||
_de ="\u0045\u0052\u0052O\u0052\u003a\u0020\u0043\u0061\u006e\u0027\u0074\u0020\u0067\u0065\u0074\u0020\u0066\u006f\u006e\u0074\u0020\u0073\u0074\u0072\u0065\u0061\u006d\u002c\u0020\u0069\u006e\u0076a\u006c\u0069\u0064\u0020\u0074\u0079\u0070\u0065";);
|
||
func (_fedb *textObject )getFillColor ()_eg .Color {return _ffdg (_fedb ._ggd .ColorspaceNonStroking ,_fedb ._ggd .ColorNonStroking );};type event struct{_dfada float64 ;_aedb bool ;_dacaa int ;};func _faadf (_bbgd *textLine ,_aead []*textLine ,_fddf []float64 )float64 {var _dfab float64 =-1;
|
||
for _ ,_bdcf :=range _aead {if _bdcf ._gaca > _bbgd ._gaca {if _ea .Round (_bdcf .Llx )>=_ea .Round (_bbgd .Llx ){_dfab =_bdcf ._gaca ;}else {break ;};};};return _dfab ;};func _aced (_ccec map[float64 ]map[float64 ]gridTile )[]float64 {_ggcg :=make ([]float64 ,0,len (_ccec ));
|
||
for _dgdbe :=range _ccec {_ggcg =append (_ggcg ,_dgdbe );};_a .Float64s (_ggcg );_gbdfb :=len (_ggcg );for _efaab :=0;_efaab < _gbdfb /2;_efaab ++{_ggcg [_efaab ],_ggcg [_gbdfb -1-_efaab ]=_ggcg [_gbdfb -1-_efaab ],_ggcg [_efaab ];};return _ggcg ;};func (_efdgb intSet )del (_agfae int ){delete (_efdgb ,_agfae )};
|
||
func _fcfa (_cadff []*wordBag )[]*wordBag {if len (_cadff )<=1{return _cadff ;};if _fbbd {_fc .Log .Info ("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a");};_a .Slice (_cadff ,func (_gfed ,_bcdb int )bool {_acffgb ,_bce :=_cadff [_gfed ],_cadff [_bcdb ];
|
||
_gbebf :=_acffgb .Width ()*_acffgb .Height ();_fdbb :=_bce .Width ()*_bce .Height ();if _gbebf !=_fdbb {return _gbebf > _fdbb ;};if _acffgb .Height ()!=_bce .Height (){return _acffgb .Height ()> _bce .Height ();};return _gfed < _bcdb ;});var _aagc []*wordBag ;
|
||
_gefba :=make (intSet );for _dbcfb :=0;_dbcfb < len (_cadff );_dbcfb ++{if _gefba .has (_dbcfb ){continue ;};_aaga :=_cadff [_dbcfb ];for _geac :=_dbcfb +1;_geac < len (_cadff );_geac ++{if _gefba .has (_dbcfb ){continue ;};_abfe :=_cadff [_geac ];_ffbd :=_aaga .PdfRectangle ;
|
||
_ffbd .Llx -=_aaga ._aad ;if _dfdf (_ffbd ,_abfe .PdfRectangle ){_aaga .absorb (_abfe );_gefba .add (_geac );};};_aagc =append (_aagc ,_aaga );};if len (_cadff )!=len (_aagc )+len (_gefba ){_fc .Log .Error ("\u006d\u0065\u0072ge\u0057\u006f\u0072\u0064\u0042\u0061\u0067\u0073\u003a \u0025d\u2192%\u0064 \u0061\u0062\u0073\u006f\u0072\u0062\u0065\u0064\u003d\u0025\u0064",len (_cadff ),len (_aagc ),len (_gefba ));
|
||
};return _aagc ;};func _dgde (_dadac ,_ebeff _g .Point ,_faeb _eg .Color )(*ruling ,bool ){_dcgg :=lineRuling {_bcfgb :_dadac ,_befe :_ebeff ,_cdbfg :_cgbfe (_dadac ,_ebeff ),Color :_faeb };if _dcgg ._cdbfg ==_cbfe {return nil ,false ;};return _dcgg .asRuling ();
|
||
};func (_gfbc paraList )llyRange (_cgbcc []int ,_aecc ,_cfgd float64 )[]int {_dfdd :=len (_gfbc );if _cfgd < _gfbc [_cgbcc [0]].Lly ||_aecc > _gfbc [_cgbcc [_dfdd -1]].Lly {return nil ;};_cfbb :=_a .Search (_dfdd ,func (_dagd int )bool {return _gfbc [_cgbcc [_dagd ]].Lly >=_aecc });
|
||
_daad :=_a .Search (_dfdd ,func (_ffaae int )bool {return _gfbc [_cgbcc [_ffaae ]].Lly > _cfgd });return _cgbcc [_cfbb :_daad ];};func (_aageb rectRuling )checkWidth (_fagdc ,_fdga float64 )(float64 ,bool ){_acfcd :=_fdga -_fagdc ;_adbb :=_acfcd <=_cabc ;
|
||
return _acfcd ,_adbb ;};func (_gad *imageExtractContext )extractFormImages (_ed *_bad .PdfObjectName ,_cgb _ba .GraphicsState ,_cdag *_aec .PdfPageResources )error {_dga ,_edg :=_cdag .GetXObjectFormByName (*_ed );if _edg !=nil {return _edg ;};if _dga ==nil {return nil ;
|
||
};_cbf ,_edg :=_dga .GetContentStream ();if _edg !=nil {return _edg ;};_geb :=_dga .Resources ;if _geb ==nil {_geb =_cdag ;};_edg =_gad .extractContentStreamImages (string (_cbf ),_geb );if _edg !=nil {return _edg ;};_gad ._bgb ++;return nil ;};func _afgg (_gabd *Extractor ,_fcdc *_aec .PdfPageResources ,_gdfb _ba .GraphicsState ,_aag *textState ,_bdde *stateStack )*textObject {return &textObject {_eegd :_gabd ,_bcg :_fcdc ,_ggd :_gdfb ,_aed :_bdde ,_cbgd :_aag ,_acc :_g .IdentityMatrix (),_dde :_g .IdentityMatrix ()};
|
||
};const (_cbfe rulingKind =iota ;_bfgb ;_eebe ;);func (_cbdb *textMark )bbox ()_aec .PdfRectangle {return _cbdb .PdfRectangle };
|
||
|
||
// NewFromContents creates a new extractor from contents and page resources.
|
||
func NewFromContents (contents string ,resources *_aec .PdfPageResources )(*Extractor ,error ){const _ffa ="\u0065x\u0074\u0072\u0061\u0063t\u006f\u0072\u002e\u004e\u0065w\u0046r\u006fm\u0043\u006f\u006e\u0074\u0065\u006e\u0074s";_eb :=&Extractor {_ffd :contents ,_gf :resources ,_fba :map[string ]fontEntry {},_gfe :map[string ]textResult {}};
|
||
_ff .TrackUse (_ffa );return _eb ,nil ;};func (_bgdg *textObject )getStrokeColor ()_eg .Color {return _ffdg (_bgdg ._ggd .ColorspaceStroking ,_bgdg ._ggd .ColorStroking );};func (_bbbgc *ruling )gridIntersecting (_cdee *ruling )bool {return _cdffd (_bbbgc ._fgad ,_cdee ._fgad )&&_cdffd (_bbbgc ._ababc ,_cdee ._ababc );
|
||
};func _fdfb (_caeba _aec .PdfRectangle )*ruling {return &ruling {_egdf :_eebe ,_gbgc :_caeba .Llx ,_fgad :_caeba .Lly ,_ababc :_caeba .Ury };};func _fcbd (_aaff []*textWord ,_cgbd float64 ,_acbc ,_gebb rulingList )*wordBag {_gacg :=_ecea (_aaff [0],_cgbd ,_acbc ,_gebb );
|
||
for _ ,_aaa :=range _aaff [1:]{_acd :=_dafa (_aaa ._aecg );_gacg ._gbbd [_acd ]=append (_gacg ._gbbd [_acd ],_aaa );_gacg .PdfRectangle =_agfb (_gacg .PdfRectangle ,_aaa .PdfRectangle );};_gacg .sort ();return _gacg ;};func _cdfb (_bde ,_eeac bounded )float64 {return _bde .bbox ().Llx -_eeac .bbox ().Urx };
|
||
func _ffdg (_acged _aec .PdfColorspace ,_bbfffb _aec .PdfColor )_eg .Color {if _acged ==nil ||_bbfffb ==nil {return _eg .Black ;};_agfg ,_adeef :=_acged .ColorToRGB (_bbfffb );if _adeef !=nil {_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_bbfffb ,_acged ,_adeef );
|
||
return _eg .Black ;};_aedc ,_ccdfg :=_agfg .(*_aec .PdfColorDeviceRGB );if !_ccdfg {_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_agfg );
|
||
return _eg .Black ;};return _eg .NRGBA {R :uint8 (_aedc .R ()*255),G :uint8 (_aedc .G ()*255),B :uint8 (_aedc .B ()*255),A :uint8 (255)};};func (_bfdfa rulingList )merge ()*ruling {_ecbd :=_bfdfa [0]._gbgc ;_bcbe :=_bfdfa [0]._fgad ;_ddaa :=_bfdfa [0]._ababc ;
|
||
for _ ,_cgcc :=range _bfdfa [1:]{_ecbd +=_cgcc ._gbgc ;if _cgcc ._fgad < _bcbe {_bcbe =_cgcc ._fgad ;};if _cgcc ._ababc > _ddaa {_ddaa =_cgcc ._ababc ;};};_febb :=&ruling {_egdf :_bfdfa [0]._egdf ,_dcebd :_bfdfa [0]._dcebd ,Color :_bfdfa [0].Color ,_gbgc :_ecbd /float64 (len (_bfdfa )),_fgad :_bcbe ,_ababc :_ddaa };
|
||
if _geda {_fc .Log .Info ("\u006de\u0072g\u0065\u003a\u0020\u0025\u0032d\u0020\u0076e\u0063\u0073\u0020\u0025\u0073",len (_bfdfa ),_febb );for _aeeg ,_fefb :=range _bfdfa {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_aeeg ,_fefb );};
|
||
};return _febb ;};
|
||
|
||
// String returns a string describing `ma`.
|
||
func (_adab TextMarkArray )String ()string {_egb :=len (_adab ._bade );if _egb ==0{return "\u0045\u004d\u0050T\u0059";};_abdb :=_adab ._bade [0];_bbe :=_adab ._bade [_egb -1];return _ae .Sprintf ("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d",_egb ,_abdb ,_bbe );
|
||
};func (_deeb *stateStack )top ()*textState {if _deeb .empty (){return nil ;};return (*_deeb )[_deeb .size ()-1];};func (_bbcd *stateStack )size ()int {return len (*_bbcd )};func _ggeeg (_cbefb []pathSection )rulingList {_aeefg (_cbefb );if _eceg {_fc .Log .Info ("\u006da\u006b\u0065\u0046\u0069l\u006c\u0052\u0075\u006c\u0069n\u0067s\u003a \u0025\u0064\u0020\u0066\u0069\u006c\u006cs",len (_cbefb ));
|
||
};var _daba rulingList ;for _ ,_fbac :=range _cbefb {for _ ,_dgbb :=range _fbac ._gbag {if !_dgbb .isQuadrilateral (){if _eceg {_fc .Log .Error ("!\u0069s\u0051\u0075\u0061\u0064\u0072\u0069\u006c\u0061t\u0065\u0072\u0061\u006c: \u0025\u0073",_dgbb );};
|
||
continue ;};if _befg ,_caebb :=_dgbb .makeRectRuling (_fbac .Color );_caebb {_daba =append (_daba ,_befg );}else {if _bdf {_fc .Log .Error ("\u0021\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0073",_dgbb );
|
||
};};};};if _eceg {_fc .Log .Info ("\u006d\u0061\u006b\u0065Fi\u006c\u006c\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0073",_daba .String ());};return _daba ;};func _bgfd (_bdgea string ,_ggae int )string {if len (_bdgea )< _ggae {return _bdgea ;
|
||
};return _bdgea [:_ggae ];};func (_ffca *textLine )text ()string {var _cbba []string ;for _ ,_ecabe :=range _ffca ._bfag {if _ecabe ._ceff {_cbba =append (_cbba ,"\u0020");};_cbba =append (_cbba ,_ecabe ._eedc );};return _bb .Join (_cbba ,"");};type markKind int ;
|
||
// _gbeb returns a text state with `_aafe` set to 100, `_ffgba` set to
// RenderModeFill and `_edfa` set to the given rectangle.
func _gbeb(_ccfa _aec.PdfRectangle) textState {
	var state textState
	state._aafe = 100
	state._ffgba = RenderModeFill
	state._edfa = _ccfa
	return state
}
|
||
|
||
// String returns a description of `p`.
|
||
func (_beda *textPara )String ()string {if _beda ._egbea {return _ae .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u005b\u0045\u004d\u0050\u0054\u0059\u005d",_beda .PdfRectangle );};_fecda :="";if _beda ._caaa !=nil {_fecda =_ae .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_beda ._caaa ._afcga ,_beda ._caaa ._agdc );
|
||
};return _ae .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_beda .PdfRectangle ,_fecda ,len (_beda ._bfagf ),_bgfd (_beda .text (),50));};func (_bdgec *wordBag )highestWord (_ddeb int ,_adbg ,_adeb float64 )*textWord {for _ ,_dffb :=range _bdgec ._gbbd [_ddeb ]{if _adbg <=_dffb ._aecg &&_dffb ._aecg <=_adeb {return _dffb ;
|
||
};};return nil ;};func (_ecdaf *textTable )getDown ()paraList {_fdbf :=make (paraList ,_ecdaf ._afcga );for _cabg :=0;_cabg < _ecdaf ._afcga ;_cabg ++{_gfbg :=_ecdaf .get (_cabg ,_ecdaf ._agdc -1)._ccee ;if _gfbg .taken (){return nil ;};_fdbf [_cabg ]=_gfbg ;
|
||
};for _dcga :=0;_dcga < _ecdaf ._afcga -1;_dcga ++{if _fdbf [_dcga ]._aabe !=_fdbf [_dcga +1]{return nil ;};};return _fdbf ;};func (_gdg *shapesState )moveTo (_eedeg ,_aceb float64 ){_gdg ._effc =true ;_gdg ._cgbcf =_gdg .devicePoint (_eedeg ,_aceb );if _bcge {_fc .Log .Info ("\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066",_eedeg ,_aceb ,_gdg ._cgbcf );
|
||
};};func (_bfef *PageText )computeViews (){_ebd :=_bfef .getParagraphs ();_ead :=new (_fe .Buffer );_ebd .writeText (_ead );_bfef ._eede =_ead .String ();_bfef ._fgb =_ebd .toTextMarks ();_bfef ._gba =_ebd .tables ();if _gbead {_fc .Log .Info ("\u0063\u006f\u006dpu\u0074\u0065\u0056\u0069\u0065\u0077\u0073\u003a\u0020\u0074\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064",len (_bfef ._gba ));
|
||
};};
|
||
|
||
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
||
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
||
// `start` and `end` are offsets in the extracted text.
|
||
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
||
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
||
func (_egec *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _egec ==nil {return nil ,_d .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_ae .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );
|
||
};_gagg :=len (_egec ._bade );if _gagg ==0{return _egec ,nil ;};if start < _egec ._bade [0].Offset {start =_egec ._bade [0].Offset ;};if end > _egec ._bade [_gagg -1].Offset +1{end =_egec ._bade [_gagg -1].Offset +1;};_ggdg :=_a .Search (_gagg ,func (_aagf int )bool {return _egec ._bade [_aagf ].Offset +len (_egec ._bade [_aagf ].Text )-1>=start });
|
||
if !(0<=_ggdg &&_ggdg < _gagg ){_ecdf :=_ae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_ggdg ,_gagg ,_egec ._bade [0],_egec ._bade [_gagg -1]);
|
||
return nil ,_ecdf ;};_dgba :=_a .Search (_gagg ,func (_ggf int )bool {return _egec ._bade [_ggf ].Offset > end -1});if !(0<=_dgba &&_dgba < _gagg ){_deaa :=_ae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_dgba ,_gagg ,_egec ._bade [0],_egec ._bade [_gagg -1]);
|
||
return nil ,_deaa ;};if _dgba <=_ggdg {return nil ,_ae .Errorf ("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_ggdg ,_dgba );
|
||
};return &TextMarkArray {_bade :_egec ._bade [_ggdg :_dgba ]},nil ;};func (_aefe *shapesState )quadraticTo (_ggbf ,_gefe ,_ggab ,_cade float64 ){if _bcge {_fc .Log .Info ("\u0071\u0075\u0061d\u0072\u0061\u0074\u0069\u0063\u0054\u006f\u003a");};_aefe .addPoint (_ggab ,_cade );
|
||
};func (_abbgd lineRuling )yMean ()float64 {return 0.5*(_abbgd ._bcfgb .Y +_abbgd ._befe .Y )};
|
||
|
||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
||
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
||
func (_eae *Extractor )ExtractTextWithStats ()(_bge string ,_dgb int ,_fbe int ,_gfb error ){_eef ,_dgb ,_fbe ,_gfb :=_eae .ExtractPageText ();if _gfb !=nil {return "",_dgb ,_fbe ,_gfb ;};return _eef .Text (),_dgb ,_fbe ,nil ;};func (_bfbd *shapesState )establishSubpath ()*subpath {_gacd ,_bgab :=_bfbd .lastpointEstablished ();
|
||
if !_bgab {_bfbd ._edc =append (_bfbd ._edc ,_bbcb (_gacd ));};if len (_bfbd ._edc )==0{return nil ;};_bfbd ._effc =false ;return _bfbd ._edc [len (_bfbd ._edc )-1];};func _gdba (_cgdcd ,_dbac float64 )bool {return _cgdcd /_ea .Max (_bedf ,_dbac )< _dccc };
|
||
func (_gadbe paraList )findGridTables (_eccf []gridTiling )[]*textTable {if _gbead {_fc .Log .Info ("\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073",len (_gadbe ));
|
||
for _bdgf ,_ecddce :=range _gadbe {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bdgf ,_ecddce );};};var _cgfa []*textTable ;for _cdbbd ,_fgdg :=range _eccf {_fagcd ,_bdfe :=_gadbe .findTableGrid (_fgdg );if _fagcd !=nil {_fagcd .log (_ae .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_cdbbd ));
|
||
_cgfa =append (_cgfa ,_fagcd );_fagcd .markCells ();};for _ddfce :=range _bdfe {_ddfce ._dcddf =true ;};};if _gbead {_fc .Log .Info ("\u0066i\u006e\u0064\u0047\u0072i\u0064\u0054\u0061\u0062\u006ce\u0073:\u0020%\u0064\u0020\u0074\u0061\u0062\u006c\u0065s",len (_cgfa ));
|
||
};return _cgfa ;};var _cgce string ="\u0028\u003f\u0069\u0029\u005e\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028?\u003a\u0044\u007cM\u0029\u007c\u0044\u003f\u0043{\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028\u003f\u003a\u004c\u007c\u0043\u0029\u007cL\u003f\u0058\u007b\u0030\u002c\u0033}\u0029\u0028\u0049\u0028\u003f\u003a\u0056\u007c\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u005c\u0029\u007c\u005c\u002e\u0029\u007c\u005e\u005c\u0028\u0028\u004d\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0043\u0028\u003f\u003aD\u007cM\u0029\u007c\u0044\u003f\u0043\u007b\u0030\u002c\u0033\u007d\u0029\u0028\u0058\u0028?\u003a\u004c\u007c\u0043\u0029\u007c\u004c?\u0058\u007b0\u002c\u0033\u007d\u0029(\u0049\u0028\u003f\u003a\u0056|\u0058\u0029\u007c\u0056\u003f\u0049\u007b\u0030\u002c\u0033\u007d\u0029\u005c\u0029";
|
||
func _egfdf (_acbb *textLine )float64 {return _acbb ._bfag [0].Llx };type textLine struct{_aec .PdfRectangle ;_gaca float64 ;_bfag []*textWord ;_fgcb float64 ;};func (_bfd *textObject )newTextMark (_egcg string ,_ecdc _g .Matrix ,_bcged _g .Point ,_dgabb float64 ,_abe *_aec .PdfFont ,_bbcf float64 ,_dfbd ,_fgbg _eg .Color ,_gada _bad .PdfObject ,_ggaa []string ,_gcac int ,_cgdb int )(textMark ,bool ){_cdga :=_ecdc .Angle ();
|
||
_bggg :=_begdb (_cdga ,_ceba );var _geec float64 ;if _bggg %180!=90{_geec =_ecdc .ScalingFactorY ();}else {_geec =_ecdc .ScalingFactorX ();};_ggfe :=_eee (_ecdc );_fdcc :=_aec .PdfRectangle {Llx :_ggfe .X ,Lly :_ggfe .Y ,Urx :_bcged .X ,Ury :_bcged .Y };
|
||
switch _bggg %360{case 90:_fdcc .Urx -=_geec ;case 180:_fdcc .Ury -=_geec ;case 270:_fdcc .Urx +=_geec ;case 0:_fdcc .Ury +=_geec ;default:_bggg =0;_fdcc .Ury +=_geec ;};if _fdcc .Llx > _fdcc .Urx {_fdcc .Llx ,_fdcc .Urx =_fdcc .Urx ,_fdcc .Llx ;};if _fdcc .Lly > _fdcc .Ury {_fdcc .Lly ,_fdcc .Ury =_fdcc .Ury ,_fdcc .Lly ;
|
||
};_dgef :=true ;if _bfd ._eegd ._c .Width ()> 0{_dgaa ,_dged :=_degd (_fdcc ,_bfd ._eegd ._c );if !_dged {_dgef =false ;_fc .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_fdcc ,_bfd ._eegd ._c ,_egcg );
|
||
};_fdcc =_dgaa ;};_gagc :=_fdcc ;_gbba :=_bfd ._eegd ._c ;switch _bggg %360{case 90:_gbba .Urx ,_gbba .Ury =_gbba .Ury ,_gbba .Urx ;_gagc =_aec .PdfRectangle {Llx :_gbba .Urx -_fdcc .Ury ,Urx :_gbba .Urx -_fdcc .Lly ,Lly :_fdcc .Llx ,Ury :_fdcc .Urx };
|
||
case 180:_gagc =_aec .PdfRectangle {Llx :_gbba .Urx -_fdcc .Llx ,Urx :_gbba .Urx -_fdcc .Urx ,Lly :_gbba .Ury -_fdcc .Lly ,Ury :_gbba .Ury -_fdcc .Ury };case 270:_gbba .Urx ,_gbba .Ury =_gbba .Ury ,_gbba .Urx ;_gagc =_aec .PdfRectangle {Llx :_fdcc .Ury ,Urx :_fdcc .Lly ,Lly :_gbba .Ury -_fdcc .Llx ,Ury :_gbba .Ury -_fdcc .Urx };
|
||
};if _gagc .Llx > _gagc .Urx {_gagc .Llx ,_gagc .Urx =_gagc .Urx ,_gagc .Llx ;};if _gagc .Lly > _gagc .Ury {_gagc .Lly ,_gagc .Ury =_gagc .Ury ,_gagc .Lly ;};_cefe :=textMark {_gded :_egcg ,PdfRectangle :_gagc ,_gega :_fdcc ,_bccc :_abe ,_cbbae :_geec ,_dcfd :_bbcf ,_ddebg :_ecdc ,_bdbd :_bcged ,_eeacf :_bggg ,_fdag :_dfbd ,_aabcb :_fgbg ,_gafcd :_gada ,_agff :_ggaa ,Th :_bfd ._cbgd ._aafe ,Tw :_bfd ._cbgd ._ecd ,_ebgag :_cgdb ,_daeff :_gcac };
|
||
if _baf {_fc .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_ggfe ,_bcged ,_cefe .String ());};return _cefe ,_dgef ;
|
||
};func (_gdbcc *wordBag )arrangeText ()*textPara {_gdbcc .sort ();if _cfee {_gdbcc .removeDuplicates ();};var _fcaeb []*textLine ;for _ ,_gdbd :=range _gdbcc .depthIndexes (){for !_gdbcc .empty (_gdbd ){_daedb :=_gdbcc .firstReadingIndex (_gdbd );_eggb :=_gdbcc .firstWord (_daedb );
|
||
_efgg :=_cffa (_gdbcc ,_daedb );_fgfee :=_eggb ._aeegf ;_fcfff :=_eggb ._aecg -_ddfa *_fgfee ;_cffbd :=_eggb ._aecg +_ddfa *_fgfee ;_cfeaf :=_ggad *_fgfee ;_caeeb :=_decc *_fgfee ;_dceg :for {var _bdbdg *textWord ;_dcgb :=0;for _ ,_gbbaa :=range _gdbcc .depthBand (_fcfff ,_cffbd ){_faaa :=_gdbcc .highestWord (_gbbaa ,_fcfff ,_cffbd );
|
||
if _faaa ==nil {continue ;};_accf :=_cdfb (_faaa ,_efgg ._bfag [len (_efgg ._bfag )-1]);if _accf < -_caeeb {break _dceg ;};if _accf > _cfeaf {continue ;};if _bdbdg !=nil &&_bcea (_faaa ,_bdbdg )>=0{continue ;};_bdbdg =_faaa ;_dcgb =_gbbaa ;};if _bdbdg ==nil {break ;
|
||
};_efgg .pullWord (_gdbcc ,_bdbdg ,_dcgb );};_efgg .markWordBoundaries ();_fcaeb =append (_fcaeb ,_efgg );};};if len (_fcaeb )==0{return nil ;};_a .Slice (_fcaeb ,func (_efdgd ,_eaad int )bool {return _dddc (_fcaeb [_efdgd ],_fcaeb [_eaad ])< 0});_ebab :=_geaa (_gdbcc .PdfRectangle ,_fcaeb );
|
||
if _fbbd {_fc .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_ebab .String ());if _ecda {for _dfbag ,_gegg :=range _ebab ._bfagf {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_dfbag ,_gegg .String ());
|
||
if _baag {for _cbbab ,_fccfg :=range _gegg ._bfag {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_cbbab ,_fccfg .String ());for _bbfd ,_aeab :=range _fccfg ._ebfa {_ae .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_bbfd ,_aeab .String ());
|
||
};};};};};};return _ebab ;};const (_bfab =true ;_cfee =true ;_ccgb =true ;_acge =false ;_cbaf =false ;_fgeg =6;_bbbgd =3.0;_bgdcg =200;_eecg =true ;_ebada =true ;_bffc =true ;_gbdg =true ;_aaeb =false ;);func _fgdfed (_gcga ,_ddfg _g .Point )rulingKind {_bfea :=_ea .Abs (_gcga .X -_ddfg .X );
|
||
_ggbgca :=_ea .Abs (_gcga .Y -_ddfg .Y );return _gdgbf (_bfea ,_ggbgca ,_dccc );};func (_fdbc *wordBag )removeWord (_accg *textWord ,_bcbg int ){_adgg :=_fdbc ._gbbd [_bcbg ];_adgg =_fgacb (_adgg ,_accg );if len (_adgg )==0{delete (_fdbc ._gbbd ,_bcbg );
|
||
}else {_fdbc ._gbbd [_bcbg ]=_adgg ;};};func (_eebg *textWord )absorb (_bdcfa *textWord ){_eebg .PdfRectangle =_agfb (_eebg .PdfRectangle ,_bdcfa .PdfRectangle );_eebg ._ebfa =append (_eebg ._ebfa ,_bdcfa ._ebfa ...);};func _fccge (_abee []TextMark ,_gfde *int ,_gagga TextMark )[]TextMark {_gagga .Offset =*_gfde ;
|
||
_abee =append (_abee ,_gagga );*_gfde +=len (_gagga .Text );return _abee ;};
|
||
|
||
// PageText represents the layout of text on a device page.
|
||
type PageText struct{_dbfe []*textMark ;_eede string ;_fgb []TextMark ;_gba []TextTable ;_babf _aec .PdfRectangle ;_afgd []pathSection ;_ddae []pathSection ;_cegb *_bad .PdfObject ;_dfgd _bad .PdfObject ;_cafg *_ba .ContentStreamOperations ;_cdgg PageTextOptions ;
|
||
};func (_dae *textObject )setFont (_cfcd string ,_ged float64 )error {if _dae ==nil {return nil ;};_dae ._cbgd ._dgad =_ged ;_edaf ,_fcga :=_dae .getFont (_cfcd );if _fcga !=nil {return _fcga ;};_dae ._cbgd ._cbad =_edaf ;return nil ;};func (_gcdg rulingList )mergePrimary ()float64 {_befd :=_gcdg [0]._gbgc ;
|
||
for _ ,_bfca :=range _gcdg [1:]{_befd +=_bfca ._gbgc ;};return _befd /float64 (len (_gcdg ));};const (_acbdc markKind =iota ;_bggb ;_gagfa ;_dbed ;);func _aeefg (_acdef []pathSection ){if _edge < 0.0{return ;};if _eceg {_fc .Log .Info ("\u0067\u0072\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020\u0025\u0064 \u0073u\u0062\u0070\u0061\u0074\u0068\u0020\u0073\u0065\u0063\u0074\u0069\u006f\u006e\u0073",len (_acdef ));
|
||
};for _fagdcf ,_cgfec :=range _acdef {for _geege ,_fcca :=range _cgfec ._gbag {for _ecgc ,_cfcc :=range _fcca ._acfg {_fcca ._acfg [_ecgc ]=_g .Point {X :_aagaf (_cfcc .X ),Y :_aagaf (_cfcc .Y )};if _eceg {_bfcc :=_fcca ._acfg [_ecgc ];if !_ceab (_cfcc ,_bfcc ){_cdce :=_g .Point {X :_bfcc .X -_cfcc .X ,Y :_bfcc .Y -_cfcc .Y };
|
||
_ae .Printf ("\u0025\u0034d \u002d\u0020\u00254\u0064\u0020\u002d\u0020%4d\u003a %\u002e\u0032\u0066\u0020\u2192\u0020\u0025.2\u0066\u0020\u0028\u0025\u0067\u0029\u000a",_fagdcf ,_geege ,_ecgc ,_cfcc ,_bfcc ,_cdce );};};};};};};func (_cfade *textPara )depth ()float64 {if _cfade ._egbea {return -1.0;
|
||
};if len (_cfade ._bfagf )> 0{return _cfade ._bfagf [0]._gaca ;};return _cfade ._caaa .depth ();};
|
||
|
||
// String returns a description of `w`.
|
||
func (_aeacd *textWord )String ()string {return _ae .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_aeacd ._aecg ,_aeacd .PdfRectangle ,_aeacd ._aeegf ,_aeacd ._eedc );
|
||
};type textPara struct{_aec .PdfRectangle ;_bbbc _aec .PdfRectangle ;_bfagf []*textLine ;_caaa *textTable ;_dcddf bool ;_egbea bool ;_bceaa *textPara ;_aabe *textPara ;_ggfbb *textPara ;_ccee *textPara ;_ddef []list ;};func _begdb (_eefe float64 ,_cgbb int )int {if _cgbb ==0{_cgbb =1;
|
||
};_eded :=float64 (_cgbb );return int (_ea .Round (_eefe /_eded )*_eded );};func (_bbcce paraList )sortReadingOrder (){_fc .Log .Trace ("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_bbcce ));
|
||
if len (_bbcce )<=1{return ;};_bbcce .computeEBBoxes ();_a .Slice (_bbcce ,func (_gdaeg ,_cfae int )bool {return _dddc (_bbcce [_gdaeg ],_bbcce [_cfae ])<=0});};func (_cadfbb *textTable )growTable (){_aggd :=func (_eafd paraList ){_cadfbb ._agdc ++;for _dcab :=0;
|
||
_dcab < _cadfbb ._afcga ;_dcab ++{_geafb :=_eafd [_dcab ];_cadfbb .put (_dcab ,_cadfbb ._agdc -1,_geafb );};};_cdaa :=func (_bddeb paraList ){_cadfbb ._afcga ++;for _fefba :=0;_fefba < _cadfbb ._agdc ;_fefba ++{_dcebb :=_bddeb [_fefba ];_cadfbb .put (_cadfbb ._afcga -1,_fefba ,_dcebb );
|
||
};};if _efcc {_cadfbb .log ("\u0067r\u006f\u0077\u0054\u0061\u0062\u006ce");};for _gfec :=0;;_gfec ++{_baabg :=false ;_beaa :=_cadfbb .getDown ();_dcdcd :=_cadfbb .getRight ();if _efcc {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gfec ,_cadfbb );
|
||
_ae .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0020\u0064\u006f\u0077\u006e\u003d\u0025\u0073\u000a",_beaa );_ae .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0072\u0069\u0067\u0068\u0074\u003d\u0025\u0073\u000a",_dcdcd );};if _beaa !=nil &&_dcdcd !=nil {_edgcg :=_beaa [len (_beaa )-1];
|
||
if !_edgcg .taken ()&&_edgcg ==_dcdcd [len (_dcdcd )-1]{_aggd (_beaa );if _dcdcd =_cadfbb .getRight ();_dcdcd !=nil {_cdaa (_dcdcd );_cadfbb .put (_cadfbb ._afcga -1,_cadfbb ._agdc -1,_edgcg );};_baabg =true ;};};if !_baabg &&_beaa !=nil {_aggd (_beaa );
|
||
_baabg =true ;};if !_baabg &&_dcdcd !=nil {_cdaa (_dcdcd );_baabg =true ;};if !_baabg {break ;};};};func (_cfcf *textPara )isAtom ()*textTable {_adaa :=_cfcf ;_cbgcg :=_cfcf ._aabe ;_dcbf :=_cfcf ._ccee ;if _cbgcg .taken ()||_dcbf .taken (){return nil ;
|
||
};_ebbae :=_cbgcg ._ccee ;if _ebbae .taken ()||_ebbae !=_dcbf ._aabe {return nil ;};return _eadd (_adaa ,_cbgcg ,_dcbf ,_ebbae );};func (_cfba rulingList )connections (_fgcd map[int ]intSet ,_gege int )intSet {_bgdgf :=make (intSet );_gfgd :=make (intSet );
|
||
var _aeceb func (int );_aeceb =func (_bgedf int ){if !_gfgd .has (_bgedf ){_gfgd .add (_bgedf );for _acefc :=range _cfba {if _fgcd [_acefc ].has (_bgedf ){_bgdgf .add (_acefc );};};for _cadffb :=range _cfba {if _bgdgf .has (_cadffb ){_aeceb (_cadffb );
|
||
};};};};_aeceb (_gege );return _bgdgf ;};
|
||
|
||
// String returns a description of `t`.
|
||
func (_adgge *textTable )String ()string {return _ae .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_adgge ._afcga ,_adgge ._agdc ,_adgge ._fbccb );};
|
||
|
||
// ToTextMark returns the public view of `tm`.
|
||
func (_fcgg *textMark )ToTextMark ()TextMark {return TextMark {Text :_fcgg ._gded ,Original :_fcgg ._dcff ,BBox :_fcgg ._gega ,Font :_fcgg ._bccc ,FontSize :_fcgg ._cbbae ,FillColor :_fcgg ._fdag ,StrokeColor :_fcgg ._aabcb ,Orientation :_fcgg ._eeacf ,DirectObject :_fcgg ._gafcd ,ObjString :_fcgg ._agff ,Tw :_fcgg .Tw ,Th :_fcgg .Th ,Tc :_fcgg ._dcfd ,Index :_fcgg ._daeff };
|
||
};func (_afed *ruling )alignsSec (_adbcg *ruling )bool {const _eggdg =_cabc +1.0;return _afed ._fgad -_eggdg <=_adbcg ._ababc &&_adbcg ._fgad -_eggdg <=_afed ._ababc ;};
|
||
|
||
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
||
// All coordinates are in device coordinates.
|
||
type ImageMark struct{Image *_aec .Image ;
|
||
|
||
// Dimensions of the image as displayed in the PDF.
|
||
Width float64 ;Height float64 ;
|
||
|
||
// Position of the image in PDF coordinates (lower left corner).
|
||
X float64 ;Y float64 ;
|
||
|
||
// Angle in degrees, if rotated.
|
||
Angle float64 ;};func (_acdeb *textWord )appendMark (_cddge *textMark ,_adcaec _aec .PdfRectangle ){_acdeb ._ebfa =append (_acdeb ._ebfa ,_cddge );_acdeb .PdfRectangle =_agfb (_acdeb .PdfRectangle ,_cddge .PdfRectangle );if _cddge ._cbbae > _acdeb ._aeegf {_acdeb ._aeegf =_cddge ._cbbae ;
|
||
};_acdeb ._aecg =_adcaec .Ury -_acdeb .PdfRectangle .Lly ;};func (_bgfe rulingList )sortStrict (){_a .Slice (_bgfe ,func (_dgdg ,_edbd int )bool {_egfa ,_daadcc :=_bgfe [_dgdg ],_bgfe [_edbd ];_fccd ,_bdea :=_egfa ._egdf ,_daadcc ._egdf ;if _fccd !=_bdea {return _fccd > _bdea ;
|
||
};_bggbg ,_baca :=_egfa ._gbgc ,_daadcc ._gbgc ;if !_ecfbd (_bggbg -_baca ){return _bggbg < _baca ;};_bggbg ,_baca =_egfa ._fgad ,_daadcc ._fgad ;if _bggbg !=_baca {return _bggbg < _baca ;};return _egfa ._ababc < _daadcc ._ababc ;});};func (_gbdgb *textPara )taken ()bool {return _gbdgb ==nil ||_gbdgb ._dcddf };
|
||
// _bcccf maps each rulingKind to its name.
var _bcccf = map[rulingKind]string{
	_cbfe: "\u006e\u006f\u006e\u0065",
	_bfgb: "\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",
	_eebe: "\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",
}

// complete reports whether the tile has all four borders.
func (tile gridTile) complete() bool {
	borders := tile.numBorders()
	return borders == 4
}
|
||
func (_gefd *textPara )toTextMarks (_bbbgg *int )[]TextMark {if _gefd ._caaa ==nil {return _gefd .toCellTextMarks (_bbbgg );};var _cbbf []TextMark ;for _eegeb :=0;_eegeb < _gefd ._caaa ._agdc ;_eegeb ++{for _ddge :=0;_ddge < _gefd ._caaa ._afcga ;_ddge ++{_gfgc :=_gefd ._caaa .get (_ddge ,_eegeb );
|
||
if _gfgc ==nil {_cbbf =_ggcce (_cbbf ,_bbbgg ,"\u0009");}else {_bedac :=_gfgc .toCellTextMarks (_bbbgg );_cbbf =append (_cbbf ,_bedac ...);};_cbbf =_ggcce (_cbbf ,_bbbgg ,"\u0020");};if _eegeb < _gefd ._caaa ._agdc -1{_cbbf =_ggcce (_cbbf ,_bbbgg ,"\u000a");
|
||
};};_eaff :=_gefd ._caaa ;if _eaff .isExportable (){_deab :=_eaff .toTextTable ();_cbbf =_cbabg (_cbbf ,&_deab );};return _cbbf ;};func (_gebad rulingList )blocks (_bafa ,_bfbb *ruling )bool {if _bafa ._fgad > _bfbb ._ababc ||_bfbb ._fgad > _bafa ._ababc {return false ;
|
||
};_bceaf :=_ea .Max (_bafa ._fgad ,_bfbb ._fgad );_ebdf :=_ea .Min (_bafa ._ababc ,_bfbb ._ababc );if _bafa ._gbgc > _bfbb ._gbgc {_bafa ,_bfbb =_bfbb ,_bafa ;};for _ ,_bgdd :=range _gebad {if _bafa ._gbgc <=_bgdd ._gbgc +_cabc &&_bgdd ._gbgc <=_bfbb ._gbgc +_cabc &&_bgdd ._fgad <=_ebdf &&_bceaf <=_bgdd ._ababc {return true ;
|
||
};};return false ;};func _abaa (_bcaf ,_bcff bounded )float64 {_fbaea :=_bcea (_bcaf ,_bcff );if !_ecfbd (_fbaea ){return _fbaea ;};return _dgc (_bcaf ,_bcff );};func _gged (_gefc map[int ][]float64 )[]int {_ebbe :=make ([]int ,len (_gefc ));_gbbgd :=0;
|
||
for _feaac :=range _gefc {_ebbe [_gbbgd ]=_feaac ;_gbbgd ++;};_a .Ints (_ebbe );return _ebbe ;};func (_adae paraList )topoOrder ()[]int {if _beee {_fc .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_bgec :=len (_adae );_gfea :=make ([]bool ,_bgec );
|
||
_dbec :=make ([]int ,0,_bgec );_cgbbc :=_adae .llyOrdering ();var _daabe func (_gbbag int );_daabe =func (_cbdf int ){_gfea [_cbdf ]=true ;for _feac :=0;_feac < _bgec ;_feac ++{if !_gfea [_feac ]{if _adae .readBefore (_cgbbc ,_cbdf ,_feac ){_daabe (_feac );
|
||
};};};_dbec =append (_dbec ,_cbdf );};for _aacg :=0;_aacg < _bgec ;_aacg ++{if !_gfea [_aacg ]{_daabe (_aacg );};};return _cgdbc (_dbec );};func _eeef (_fggb []*textLine ,_fege ,_fda float64 )[]*textLine {var _ddceb []*textLine ;for _ ,_cfg :=range _fggb {if _fege ==-1{if _cfg ._gaca > _fda {_ddceb =append (_ddceb ,_cfg );
|
||
};}else {if _cfg ._gaca > _fda &&_cfg ._gaca < _fege {_ddceb =append (_ddceb ,_cfg );};};};return _ddceb ;};
|
||
|
||
// String returns a human readable description of `s`.
|
||
func (_fecc intSet )String ()string {var _bfed []int ;for _eegde :=range _fecc {if _fecc .has (_eegde ){_bfed =append (_bfed ,_eegde );};};_a .Ints (_bfed );return _ae .Sprintf ("\u0025\u002b\u0076",_bfed );};func _eadd (_ceac ,_egga ,_feaad ,_eaba *textPara )*textTable {_ccdg :=&textTable {_afcga :2,_agdc :2,_bfdff :make (map[uint64 ]*textPara ,4)};
|
||
_ccdg .put (0,0,_ceac );_ccdg .put (1,0,_egga );_ccdg .put (0,1,_feaad );_ccdg .put (1,1,_eaba );return _ccdg ;};func _agfb (_fcacdd ,_ffge _aec .PdfRectangle )_aec .PdfRectangle {return _aec .PdfRectangle {Llx :_ea .Min (_fcacdd .Llx ,_ffge .Llx ),Lly :_ea .Min (_fcacdd .Lly ,_ffge .Lly ),Urx :_ea .Max (_fcacdd .Urx ,_ffge .Urx ),Ury :_ea .Max (_fcacdd .Ury ,_ffge .Ury )};
|
||
};func _fdba (_fbage []*textMark ,_cddb _aec .PdfRectangle ,_aeffa rulingList ,_cfdga []gridTiling ,_adad bool )paraList {_fc .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_fbage ),_cddb );
|
||
if len (_fbage )==0{return nil ;};_cabbe :=_eefcd (_fbage ,_cddb );if len (_cabbe )==0{return nil ;};_aeffa .log ("\u006d\u0061\u006be\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065");_ceddb ,_dedf :=_aeffa .vertsHorzs ();_dafac :=_fcbd (_cabbe ,_cddb .Ury ,_ceddb ,_dedf );
|
||
_cgac :=_fcaa (_dafac ,_cddb .Ury ,_ceddb ,_dedf );_cgac =_fcfa (_cgac );_gegaf :=make (paraList ,0,len (_cgac ));for _ ,_bccb :=range _cgac {_gbcc :=_bccb .arrangeText ();if _gbcc !=nil {_gegaf =append (_gegaf ,_gbcc );};};if !_adad &&len (_gegaf )>=_dbdg {_gegaf =_gegaf .extractTables (_cfdga );
|
||
};_gegaf .sortReadingOrder ();if !_adad {_gegaf .sortTopoOrder ();};_gegaf .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _gegaf ;};type structElement struct{_aeff string ;
|
||
_ccaac []structElement ;_fab int64 ;_ecdd _bad .PdfObject ;};func (_dgcd *subpath )makeRectRuling (_gfabf _eg .Color )(*ruling ,bool ){if _bdf {_fc .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_dgcd );
|
||
};_acgac :=_dgcd ._acfg [:4];_ababe :=make (map[int ]rulingKind ,len (_acgac ));for _dcffc ,_aaacf :=range _acgac {_bfde :=_dgcd ._acfg [(_dcffc +1)%4];_ababe [_dcffc ]=_fgdfed (_aaacf ,_bfde );if _bdf {_ae .Printf ("\u0025\u0034\u0064: \u0025\u0073\u0020\u003d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066",_dcffc ,_ababe [_dcffc ],_aaacf ,_bfde );
|
||
};};if _bdf {_ae .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_ababe );};var _adebe ,_acbf []int ;for _ecca ,_cegc :=range _ababe {switch _cegc {case _bfgb :_acbf =append (_acbf ,_ecca );case _eebe :_adebe =append (_adebe ,_ecca );
|
||
};};if _bdf {_ae .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_acbf ),_acbf );_ae .Printf ("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_adebe ),_adebe );
|
||
};_geba :=(len (_acbf )==2&&len (_adebe )==2)||(len (_acbf )==2&&len (_adebe )==0&&_dbef (_acgac [_acbf [0]],_acgac [_acbf [1]]))||(len (_adebe )==2&&len (_acbf )==0&&_bbfff (_acgac [_adebe [0]],_acgac [_adebe [1]]));if _bdf {_ae .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_acbf ),len (_adebe ),_geba );
|
||
};if !_geba {if _bdf {_fc .Log .Error ("\u0021!\u006d\u0061\u006b\u0065R\u0065\u0063\u0074\u0052\u0075l\u0069n\u0067:\u0020\u0070\u0061\u0074\u0068\u003d\u0025v",_dgcd );_ae .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_acbf ),len (_adebe ),_geba );
|
||
};return &ruling {},false ;};if len (_adebe )==0{for _aaea ,_aadd :=range _ababe {if _aadd !=_bfgb {_adebe =append (_adebe ,_aaea );};};};if len (_acbf )==0{for _bdfd ,_fccb :=range _ababe {if _fccb !=_eebe {_acbf =append (_acbf ,_bdfd );};};};if _bdf {_fc .Log .Info ("\u006da\u006b\u0065R\u0065\u0063\u0074\u0052u\u006c\u0069\u006eg\u003a\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025d \u0076\u0065\u0072t\u0073\u003d%\u0064\u0020\u0070\u006f\u0069\u006et\u0073\u003d%\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076",len (_acbf ),len (_adebe ),len (_acgac ),_acbf ,_adebe ,_acgac );
|
||
};var _bgca ,_gceff ,_cdfge ,_gfbcd _g .Point ;if _acgac [_acbf [0]].Y > _acgac [_acbf [1]].Y {_cdfge ,_gfbcd =_acgac [_acbf [0]],_acgac [_acbf [1]];}else {_cdfge ,_gfbcd =_acgac [_acbf [1]],_acgac [_acbf [0]];};if _acgac [_adebe [0]].X > _acgac [_adebe [1]].X {_bgca ,_gceff =_acgac [_adebe [0]],_acgac [_adebe [1]];
|
||
}else {_bgca ,_gceff =_acgac [_adebe [1]],_acgac [_adebe [0]];};_fffe :=_aec .PdfRectangle {Llx :_bgca .X ,Urx :_gceff .X ,Lly :_gfbcd .Y ,Ury :_cdfge .Y };if _fffe .Llx > _fffe .Urx {_fffe .Llx ,_fffe .Urx =_fffe .Urx ,_fffe .Llx ;};if _fffe .Lly > _fffe .Ury {_fffe .Lly ,_fffe .Ury =_fffe .Ury ,_fffe .Lly ;
|
||
};_gaac :=rectRuling {PdfRectangle :_fffe ,_cbae :_faab (_fffe ),Color :_gfabf };if _gaac ._cbae ==_cbfe {if _bdf {_fc .Log .Error ("\u006da\u006b\u0065\u0052\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006eg\u003a\u0020\u006b\u0069\u006e\u0064\u003d\u006e\u0069\u006c");
|
||
};return nil ,false ;};_egafg ,_dbfec :=_gaac .asRuling ();if !_dbfec {if _bdf {_fc .Log .Error ("\u006da\u006b\u0065\u0052\u0065c\u0074\u0052\u0075\u006c\u0069n\u0067:\u0020!\u0069\u0073\u0052\u0075\u006c\u0069\u006eg");};return nil ,false ;};if _eceg {_ae .Printf ("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a",_egafg .String ());
|
||
};return _egafg ,true ;};func _fcaa (_bdaa *wordBag ,_gbaf float64 ,_caag ,_edb rulingList )[]*wordBag {var _beec []*wordBag ;for _ ,_dada :=range _bdaa .depthIndexes (){_gaadff :=false ;for !_bdaa .empty (_dada ){_fcfcb :=_bdaa .firstReadingIndex (_dada );
|
||
_cfca :=_bdaa .firstWord (_fcfcb );_gecf :=_ecea (_cfca ,_gbaf ,_caag ,_edb );_bdaa .removeWord (_cfca ,_fcfcb );if _fedc {_fc .Log .Info ("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073",_cfca .String ());
|
||
};for _cfadb :=true ;_cfadb ;_cfadb =_gaadff {_gaadff =false ;_bfbc :=_gefa *_gecf ._aad ;_dfabe :=_debe *_gecf ._aad ;_gfda :=_fefg *_gecf ._aad ;if _fedc {_fc .Log .Info ("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066",_gecf .minDepth (),_gecf .maxDepth (),_gfda ,_dfabe );
|
||
};if _bdaa .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_gecf ,_age (_gcda ,0),_gecf .minDepth ()-_gfda ,_gecf .maxDepth ()+_gfda ,_fdgc ,false ,false )> 0{_gaadff =true ;};if _bdaa .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_gecf ,_age (_gcda ,_dfabe ),_gecf .minDepth (),_gecf .maxDepth (),_facc ,false ,false )> 0{_gaadff =true ;
|
||
};if _gaadff {continue ;};_bdded :=_bdaa .scanBand ("",_gecf ,_age (_cecg ,_bfbc ),_gecf .minDepth (),_gecf .maxDepth (),_fbc ,true ,false );if _bdded > 0{_eadeg :=(_gecf .maxDepth ()-_gecf .minDepth ())/_gecf ._aad ;if (_bdded > 1&&float64 (_bdded )> 0.3*_eadeg )||_bdded <=10{if _bdaa .scanBand ("\u006f\u0074\u0068e\u0072",_gecf ,_age (_cecg ,_bfbc ),_gecf .minDepth (),_gecf .maxDepth (),_fbc ,false ,true )> 0{_gaadff =true ;
|
||
};};};};_beec =append (_beec ,_gecf );};};return _beec ;};func (_cbe *PageText )getParagraphs ()paraList {var _eeda rulingList ;if _bffc {_aba :=_efda (_cbe ._afgd );_eeda =append (_eeda ,_aba ...);};if _gbdg {_cdd :=_ggeeg (_cbe ._ddae );_eeda =append (_eeda ,_cdd ...);
|
||
};_eeda ,_geca :=_eeda .toTilings ();var _affa paraList ;_gaad :=len (_cbe ._dbfe );for _addb :=0;_addb < 360&&_gaad > 0;_addb +=90{_ebed :=make ([]*textMark ,0,len (_cbe ._dbfe )-_gaad );for _ ,_bddd :=range _cbe ._dbfe {if _bddd ._eeacf ==_addb {_ebed =append (_ebed ,_bddd );
|
||
};};if len (_ebed )> 0{_cae :=_fdba (_ebed ,_cbe ._babf ,_eeda ,_geca ,_cbe ._cdgg ._cgff );_affa =append (_affa ,_cae ...);_gaad -=len (_ebed );};};return _affa ;};
|
||
|
||
// PageTextOptions holds various options available in extraction process.
|
||
type PageTextOptions struct{_cdec bool ;_cgff bool ;};func _dafa (_dabfc float64 )int {var _dgbc int ;if _dabfc >=0{_dgbc =int (_dabfc /_fdedc );}else {_dgbc =int (_dabfc /_fdedc )-1;};return _dgbc ;};func (_dcadb *textLine )appendWord (_bcaga *textWord ){_dcadb ._bfag =append (_dcadb ._bfag ,_bcaga );
|
||
_dcadb .PdfRectangle =_agfb (_dcadb .PdfRectangle ,_bcaga .PdfRectangle );if _bcaga ._aeegf > _dcadb ._fgcb {_dcadb ._fgcb =_bcaga ._aeegf ;};if _bcaga ._aecg > _dcadb ._gaca {_dcadb ._gaca =_bcaga ._aecg ;};};func (_gbgdg lineRuling )xMean ()float64 {return 0.5*(_gbgdg ._bcfgb .X +_gbgdg ._befe .X )};
|
||
|
||
|
||
// String returns a description of `v`.
|
||
func (_addde *ruling )String ()string {if _addde ._egdf ==_cbfe {return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047";};_afaa ,_ecacc :="\u0078","\u0079";if _addde ._egdf ==_bfgb {_afaa ,_ecacc ="\u0079","\u0078";};_ebdg :="";if _addde ._aecega !=0.0{_ebdg =_ae .Sprintf (" \u0077\u0069\u0064\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_addde ._aecega );
|
||
};return _ae .Sprintf ("\u0025\u00310\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0020\u0025\u0076\u0025\u0073",_addde ._egdf ,_afaa ,_addde ._gbgc ,_ecacc ,_addde ._fgad ,_addde ._ababc ,_addde ._ababc -_addde ._fgad ,_addde ._dcebd ,_addde .Color ,_ebdg );
|
||
};type stateStack []*textState ;type gridTile struct{_aec .PdfRectangle ;_fbbf ,_cbfd ,_fcgc ,_cbdbf bool ;};func _gegca (_fafgc string )string {_caeb :=[]rune (_fafgc );return string (_caeb [:len (_caeb )-1])};
|
||
|
||
// String returns a human readable description of `path`.
|
||
func (_gfae *subpath )String ()string {_fbea :=_gfae ._acfg ;_ccag :=len (_fbea );if _ccag <=5{return _ae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f",_ccag ,_fbea );};return _ae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f",_ccag ,_fbea [0],_fbea [1],_fbea [_ccag -1]);
|
||
};const (RenderModeStroke RenderMode =1<<iota ;RenderModeFill ;RenderModeClip ;);type textMark struct{_aec .PdfRectangle ;_eeacf int ;_gded string ;_dcff string ;_bccc *_aec .PdfFont ;_cbbae float64 ;_dcfd float64 ;_ddebg _g .Matrix ;_bdbd _g .Point ;_gega _aec .PdfRectangle ;
|
||
_fdag _eg .Color ;_aabcb _eg .Color ;_gafcd _bad .PdfObject ;_agff []string ;Tw float64 ;Th float64 ;_ebgag int ;_daeff int ;};
|
||
|
||
// String returns a human readable description of `ss`.
|
||
func (_beb *shapesState )String ()string {return _ae .Sprintf ("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d",len (_beb ._edc ),_beb ._effc );};func _ecaa (_gfbfg ,_bedab float64 )string {_egbb :=!_ecfbd (_gfbfg -_bedab );
|
||
if _egbb {return "\u000a";};return "\u0020";};func (_ccbbc paraList )extractTables (_acgdf []gridTiling )paraList {if _gbead {_fc .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_ccbbc ));
|
||
};if len (_ccbbc )< _dbdg {return _ccbbc ;};_eedea :=_ccbbc .findTables (_acgdf );if _gbead {_fc .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_eedea ));
|
||
for _bddee ,_fggaa :=range _eedea {_fggaa .log (_ae .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_bddee ));};};return _ccbbc .applyTables (_eedea );};
|
||
|
||
// TableCell is a cell in a TextTable.
|
||
type TableCell struct{_aec .PdfRectangle ;
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Marks holds the TextMarks corresponding to the text in Text.
|
||
Marks TextMarkArray ;};

// updateBBox grows the cell's bounding box to include the bounding box of every paragraph in
// its paraList. The existing box is only ever enlarged.
func (_efgf *compositeCell )updateBBox (){for _ ,_cgge :=range _efgf .paraList {_efgf .PdfRectangle =_agfb (_efgf .PdfRectangle ,_cgge .PdfRectangle );};};

// augmentGrid splits the rulings into verticals and horizontals (vertsHorzs) and adds border
// rulings of mark kind `_dbed` where one set extends beyond the other by more than `_ecce`:
//   - a vertical ruling at the left/right edge of the horizontals' bounding box, spanning the
//     height of the verticals' bounding box, when the horizontals reach further left/right;
//   - a horizontal ruling at the bottom/top edge of the verticals' bounding box, spanning the
//     width of the horizontals' bounding box, when the verticals reach further down/up.
// It returns the verticals and horizontals. If either set is empty, or if the combined length
// of the two sets still equals the length of the receiver (nothing was added), the lists from
// vertsHorzs are returned as they were; otherwise both lists are logged and the augmented lists
// are returned.
func (_agcg rulingList )augmentGrid ()(rulingList ,rulingList ){_bcec ,_aegb :=_agcg .vertsHorzs ();
|
||
if len (_bcec )==0||len (_aegb )==0{return _bcec ,_aegb ;};_ddagg ,_debc :=_bcec ,_aegb ;_eedac :=_bcec .bbox ();_egfg :=_aegb .bbox ();if _eceg {_fc .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0056\u003d\u0025\u0036\u002e\u0032\u0066",_eedac );
|
||
_fc .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0048\u003d\u0025\u0036\u002e\u0032\u0066",_egfg );};var _gdbdc ,_faea ,_dfabf ,_eaeff *ruling ;if _egfg .Llx < _eedac .Llx -_ecce {_gdbdc =&ruling {_dcebd :_dbed ,_egdf :_eebe ,_gbgc :_egfg .Llx ,_fgad :_eedac .Lly ,_ababc :_eedac .Ury };
|
||
_bcec =append (rulingList {_gdbdc },_bcec ...);};if _egfg .Urx > _eedac .Urx +_ecce {_faea =&ruling {_dcebd :_dbed ,_egdf :_eebe ,_gbgc :_egfg .Urx ,_fgad :_eedac .Lly ,_ababc :_eedac .Ury };_bcec =append (_bcec ,_faea );};if _eedac .Lly < _egfg .Lly -_ecce {_dfabf =&ruling {_dcebd :_dbed ,_egdf :_bfgb ,_gbgc :_eedac .Lly ,_fgad :_egfg .Llx ,_ababc :_egfg .Urx };
|
||
_aegb =append (rulingList {_dfabf },_aegb ...);};if _eedac .Ury > _egfg .Ury +_ecce {_eaeff =&ruling {_dcebd :_dbed ,_egdf :_bfgb ,_gbgc :_eedac .Ury ,_fgad :_egfg .Llx ,_ababc :_egfg .Urx };_aegb =append (_aegb ,_eaeff );};if len (_bcec )+len (_aegb )==len (_agcg ){return _ddagg ,_debc ;
|
||
};_eefcf :=append (_bcec ,_aegb ...);_agcg .log ("u\u006e\u0061\u0075\u0067\u006d\u0065\u006e\u0074\u0065\u0064");_eefcf .log ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0065d");return _bcec ,_aegb ;};func (_ecad paraList )xNeighbours (_fbabf float64 )map[*textPara ][]int {_bdddf :=make ([]event ,2*len (_ecad ));
|
||
if _fbabf ==0{for _fddc ,_ebadc :=range _ecad {_bdddf [2*_fddc ]=event {_ebadc .Llx ,true ,_fddc };_bdddf [2*_fddc +1]=event {_ebadc .Urx ,false ,_fddc };};}else {for _dccg ,_aacdf :=range _ecad {_bdddf [2*_dccg ]=event {_aacdf .Llx -_fbabf *_aacdf .fontsize (),true ,_dccg };
|
||
_bdddf [2*_dccg +1]=event {_aacdf .Urx +_fbabf *_aacdf .fontsize (),false ,_dccg };};};return _ecad .eventNeighbours (_bdddf );};func (_cbd *textObject )moveTextSetLeading (_gdcc ,_edgf float64 ){_cbd ._cbgd ._gbff =-_edgf ;_cbd .moveLP (_gdcc ,_edgf );
|
||
};
|
||
|
||
// Text returns the text content of the `bulletLists`.
|
||
func (_cbag *lists )Text ()string {_efaf :=&_bb .Builder {};for _ ,_bdec :=range *_cbag {_dafd :=_bdec .Text ();_efaf .WriteString (_dafd );};return _efaf .String ();};func (_ecef *wordBag )scanBand (_beeb string ,_eada *wordBag ,_bbefg func (_gbbe *wordBag ,_cac *textWord )bool ,_aeda ,_deb ,_agbg float64 ,_ddcc ,_eagf bool )int {_dfbf :=_eada ._aad ;
|
||
var _aeg map[int ]map[*textWord ]struct{};if !_ddcc {_aeg =_ecef .makeRemovals ();};_aafd :=_ddfa *_dfbf ;_fbda :=0;for _ ,_addf :=range _ecef .depthBand (_aeda -_aafd ,_deb +_aafd ){if len (_ecef ._gbbd [_addf ])==0{continue ;};for _ ,_bcagb :=range _ecef ._gbbd [_addf ]{if !(_aeda -_aafd <=_bcagb ._aecg &&_bcagb ._aecg <=_deb +_aafd ){continue ;
|
||
};if !_bbefg (_eada ,_bcagb ){continue ;};_afdf :=2.0*_ea .Abs (_bcagb ._aeegf -_eada ._aad )/(_bcagb ._aeegf +_eada ._aad );_gdaa :=_ea .Max (_bcagb ._aeegf /_eada ._aad ,_eada ._aad /_bcagb ._aeegf );_adea :=_ea .Min (_afdf ,_gdaa );if _agbg > 0&&_adea > _agbg {continue ;
|
||
};if _eada .blocked (_bcagb ){continue ;};if !_ddcc {_eada .pullWord (_bcagb ,_addf ,_aeg );};_fbda ++;if !_eagf {if _bcagb ._aecg < _aeda {_aeda =_bcagb ._aecg ;};if _bcagb ._aecg > _deb {_deb =_bcagb ._aecg ;};};if _ddcc {break ;};};};if !_ddcc {_ecef .applyRemovals (_aeg );
|
||
};return _fbda ;};func (_fcdcc rulingList )findPrimSec (_cafcf ,_ecfee float64 )*ruling {for _ ,_cfdag :=range _fcdcc {if _ecfbd (_cfdag ._gbgc -_cafcf )&&_cfdag ._fgad -_ecce <=_ecfee &&_ecfee <=_cfdag ._ababc +_ecce {return _cfdag ;};};return nil ;};
|
||
func (_bcgag paraList )addNeighbours (){_badbd :=func (_adgcca []int ,_agbd *textPara )([]*textPara ,[]*textPara ){_dbbgf :=make ([]*textPara ,0,len (_adgcca )-1);_gccf :=make ([]*textPara ,0,len (_adgcca )-1);for _ ,_geeba :=range _adgcca {_faaec :=_bcgag [_geeba ];
|
||
if _faaec .Urx <=_agbd .Llx {_dbbgf =append (_dbbgf ,_faaec );}else if _faaec .Llx >=_agbd .Urx {_gccf =append (_gccf ,_faaec );};};return _dbbgf ,_gccf ;};_cgfc :=func (_bgbbf []int ,_ggdbc *textPara )([]*textPara ,[]*textPara ){_cdae :=make ([]*textPara ,0,len (_bgbbf )-1);
|
||
_eecf :=make ([]*textPara ,0,len (_bgbbf )-1);for _ ,_gfafa :=range _bgbbf {_bafg :=_bcgag [_gfafa ];if _bafg .Ury <=_ggdbc .Lly {_eecf =append (_eecf ,_bafg );}else if _bafg .Lly >=_ggdbc .Ury {_cdae =append (_cdae ,_bafg );};};return _cdae ,_eecf ;};
|
||
_dfcf :=_bcgag .yNeighbours (_ffff );for _ ,_edbda :=range _bcgag {_cafgag :=_dfcf [_edbda ];if len (_cafgag )==0{continue ;};_edgfe ,_dcabb :=_badbd (_cafgag ,_edbda );if len (_edgfe )==0&&len (_dcabb )==0{continue ;};if len (_edgfe )> 0{_cbggg :=_edgfe [0];
|
||
for _ ,_accd :=range _edgfe [1:]{if _accd .Urx >=_cbggg .Urx {_cbggg =_accd ;};};for _ ,_fffga :=range _edgfe {if _fffga !=_cbggg &&_fffga .Urx > _cbggg .Llx {_cbggg =nil ;break ;};};if _cbggg !=nil &&_bfc (_edbda .PdfRectangle ,_cbggg .PdfRectangle ){_edbda ._bceaa =_cbggg ;
|
||
};};if len (_dcabb )> 0{_faagd :=_dcabb [0];for _ ,_egafd :=range _dcabb [1:]{if _egafd .Llx <=_faagd .Llx {_faagd =_egafd ;};};for _ ,_ggafg :=range _dcabb {if _ggafg !=_faagd &&_ggafg .Llx < _faagd .Urx {_faagd =nil ;break ;};};if _faagd !=nil &&_bfc (_edbda .PdfRectangle ,_faagd .PdfRectangle ){_edbda ._aabe =_faagd ;
|
||
};};};_dfcf =_bcgag .xNeighbours (_fgde );for _ ,_cebb :=range _bcgag {_cbgcb :=_dfcf [_cebb ];if len (_cbgcb )==0{continue ;};_cdcbd ,_aeec :=_cgfc (_cbgcb ,_cebb );if len (_cdcbd )==0&&len (_aeec )==0{continue ;};if len (_aeec )> 0{_gegba :=_aeec [0];
|
||
for _ ,_bggac :=range _aeec [1:]{if _bggac .Ury >=_gegba .Ury {_gegba =_bggac ;};};for _ ,_deeda :=range _aeec {if _deeda !=_gegba &&_deeda .Ury > _gegba .Lly {_gegba =nil ;break ;};};if _gegba !=nil &&_bae (_cebb .PdfRectangle ,_gegba .PdfRectangle ){_cebb ._ccee =_gegba ;
|
||
};};if len (_cdcbd )> 0{_eefec :=_cdcbd [0];for _ ,_fgba :=range _cdcbd [1:]{if _fgba .Lly <=_eefec .Lly {_eefec =_fgba ;};};for _ ,_cafb :=range _cdcbd {if _cafb !=_eefec &&_cafb .Lly < _eefec .Ury {_eefec =nil ;break ;};};if _eefec !=nil &&_bae (_cebb .PdfRectangle ,_eefec .PdfRectangle ){_cebb ._ggfbb =_eefec ;
|
||
};};};for _ ,_dffac :=range _bcgag {if _dffac ._bceaa !=nil &&_dffac ._bceaa ._aabe !=_dffac {_dffac ._bceaa =nil ;};if _dffac ._ggfbb !=nil &&_dffac ._ggfbb ._ccee !=_dffac {_dffac ._ggfbb =nil ;};if _dffac ._aabe !=nil &&_dffac ._aabe ._bceaa !=_dffac {_dffac ._aabe =nil ;
|
||
};if _dffac ._ccee !=nil &&_dffac ._ccee ._ggfbb !=_dffac {_dffac ._ccee =nil ;};};};func (_bdbe *subpath )removeDuplicates (){if len (_bdbe ._acfg )==0{return ;};_fgga :=[]_g .Point {_bdbe ._acfg [0]};for _ ,_fged :=range _bdbe ._acfg [1:]{if !_ceab (_fged ,_fgga [len (_fgga )-1]){_fgga =append (_fgga ,_fged );
|
||
};};_bdbe ._acfg =_fgga ;};func (_cfda lineRuling )asRuling ()(*ruling ,bool ){_ffbdd :=ruling {_egdf :_cfda ._cdbfg ,Color :_cfda .Color ,_dcebd :_bggb };switch _cfda ._cdbfg {case _eebe :_ffbdd ._gbgc =_cfda .xMean ();_ffbdd ._fgad =_ea .Min (_cfda ._bcfgb .Y ,_cfda ._befe .Y );
|
||
_ffbdd ._ababc =_ea .Max (_cfda ._bcfgb .Y ,_cfda ._befe .Y );case _bfgb :_ffbdd ._gbgc =_cfda .yMean ();_ffbdd ._fgad =_ea .Min (_cfda ._bcfgb .X ,_cfda ._befe .X );_ffbdd ._ababc =_ea .Max (_cfda ._bcfgb .X ,_cfda ._befe .X );default:_fc .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_cfda ._cdbfg );
|
||
return nil ,false ;};return &_ffbdd ,true ;};
|
||
|
||
// Elements returns the TextMarks in `ma`.
|
||
func (_ddc *TextMarkArray )Elements ()[]TextMark {return _ddc ._bade };const (_bedg =1.0e-6;_edge =1.0e-4;_ceba =10;_fdedc =6;_ddfa =0.5;_fbdbc =0.12;_babe =0.19;_accc =0.04;_agbf =0.04;_fefg =1.0;_fdgc =0.04;_debe =0.4;_facc =0.7;_gefa =1.0;_fbc =0.1;
|
||
_ggad =1.4;_decc =0.46;_gdda =0.02;_dfdcc =0.2;_gccc =0.5;_abdda =4;_dgce =4.0;_dbdg =6;_gcfa =0.3;_fgde =0.01;_ffff =0.02;_aaaa =2;_acebd =2;_edfde =500;_ebede =4.0;_gcb =4.0;_dccc =0.05;_bedf =0.1;_ecce =2.0;_cabc =2.0;_ebcd =1.5;_acggf =3.0;_efea =0.25;
|
||
);func _dgea (_ebadb _aec .PdfRectangle )*ruling {return &ruling {_egdf :_eebe ,_gbgc :_ebadb .Urx ,_fgad :_ebadb .Lly ,_ababc :_ebadb .Ury };};func (_bdgfd *textTable )get (_gaaf ,_fabb int )*textPara {return _bdgfd ._bfdff [_fgged (_gaaf ,_fabb )]};func _ecfgf (_bcfc string )bool {if _e .RuneCountInString (_bcfc )< _abdda {return false ;
|
||
};_adgcc ,_gbad :=_e .DecodeLastRuneInString (_bcfc );if _gbad <=0||!_be .Is (_be .Hyphen ,_adgcc ){return false ;};_adgcc ,_gbad =_e .DecodeLastRuneInString (_bcfc [:len (_bcfc )-_gbad ]);return _gbad > 0&&!_be .IsSpace (_adgcc );};func _ecfbd (_gddab float64 )bool {return _ea .Abs (_gddab )< _bedg };
|
||
func (_fbgg *textLine )toTextMarks (_gcfb *int )[]TextMark {var _bfgc []TextMark ;for _ ,_ccae :=range _fbgg ._bfag {if _ccae ._ceff {_bfgc =_ggcce (_bfgc ,_gcfb ,"\u0020");};_gbfb :=_ccae .toTextMarks (_gcfb );_bfgc =append (_bfgc ,_gbfb ...);};return _bfgc ;
|
||
};func _gdgbf (_ecbb ,_fbbee ,_geedg float64 )rulingKind {if _ecbb >=_geedg &&_gdba (_fbbee ,_ecbb ){return _bfgb ;};if _fbbee >=_geedg &&_gdba (_ecbb ,_fbbee ){return _eebe ;};return _cbfe ;};func _cgbfe (_daac ,_geab _g .Point )rulingKind {_bbce :=_ea .Abs (_daac .X -_geab .X );
|
||
_dege :=_ea .Abs (_daac .Y -_geab .Y );return _gdgbf (_bbce ,_dege ,_ebede );};func _bcea (_beefb ,_baaba bounded )float64 {return _beefb .bbox ().Llx -_baaba .bbox ().Llx };func (_daf *textObject )setTextMatrix (_eabe []float64 ){if len (_eabe )!=6{_fc .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_eabe ));
|
||
return ;};_geag ,_adecg ,_daff ,_ecfg ,_ada ,_cec :=_eabe [0],_eabe [1],_eabe [2],_eabe [3],_eabe [4],_eabe [5];_daf ._acc =_g .NewMatrix (_geag ,_adecg ,_daff ,_ecfg ,_ada ,_cec );_daf ._dde =_daf ._acc ;};func (_eba *textObject )setTextRenderMode (_cde int ){if _eba ==nil {return ;
|
||
};_eba ._cbgd ._ffgba =RenderMode (_cde );};func (_abfef *wordBag )depthIndexes ()[]int {if len (_abfef ._gbbd )==0{return nil ;};_beef :=make ([]int ,len (_abfef ._gbbd ));_fdbd :=0;for _cdcab :=range _abfef ._gbbd {_beef [_fdbd ]=_cdcab ;_fdbd ++;};_a .Ints (_beef );
|
||
return _beef ;};func (_agdcg intSet )has (_abfgb int )bool {_ ,_eaea :=_agdcg [_abfgb ];return _eaea };func (_ecabf paraList )sortTopoOrder (){_bccd :=_ecabf .topoOrder ();_ecabf .reorder (_bccd )};func (_agad paraList )inTile (_ccea gridTile )paraList {var _beab paraList ;
|
||
for _ ,_afef :=range _agad {if _ccea .contains (_afef .PdfRectangle ){_beab =append (_beab ,_afef );};};if _gbead {_ae .Printf ("\u0020 \u0020\u0069\u006e\u0054i\u006c\u0065\u003a\u0020\u0020%\u0073 \u0069n\u0073\u0069\u0064\u0065\u003d\u0025\u0064\n",_ccea ,len (_beab ));
|
||
for _bgagf ,_geae :=range _beab {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bgagf ,_geae );};_ae .Println ("");};return _beab ;};var _gdec =[]string {"\u2756","\u27a2","\u2713","\u2022","\uf0a7","\u25a1","\u2212","\u25a0","\u25aa","\u006f"};
|
||
func _gadb (_edfc _aec .PdfRectangle ,_abga bounded )float64 {return _edfc .Ury -_abga .bbox ().Lly };func _gceb (_ecaeg ,_cddg *textPara )bool {if _ecaeg ._egbea ||_cddg ._egbea {return true ;};return _ecfbd (_ecaeg .depth ()-_cddg .depth ());};type fontEntry struct{_gfab *_aec .PdfFont ;
|
||
_egfcg int64 ;};func (_cgfg *ruling )intersects (_ddedb *ruling )bool {_fbdaa :=(_cgfg ._egdf ==_eebe &&_ddedb ._egdf ==_bfgb )||(_ddedb ._egdf ==_eebe &&_cgfg ._egdf ==_bfgb );_bfgf :=func (_dfgdg ,_ebabb *ruling )bool {return _dfgdg ._fgad -_ecce <=_ebabb ._gbgc &&_ebabb ._gbgc <=_dfgdg ._ababc +_ecce ;
|
||
};_fdbdf :=_bfgf (_cgfg ,_ddedb );_ffbg :=_bfgf (_ddedb ,_cgfg );if _eceg {_ae .Printf ("\u0020\u0020\u0020\u0020\u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006fr\u0074\u0068\u006f\u0067\u006f\u006e\u0061l\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u2192\u0020\u0025\u0074\u000a"+"\u0020\u0020\u0020 \u0020\u0020\u0020\u0076\u003d\u0025\u0073\u000a"+" \u0020\u0020\u0020\u0020\u0020\u0077\u003d\u0025\u0073\u000a",_fbdaa ,_fdbdf ,_ffbg ,_fbdaa &&_fdbdf &&_ffbg ,_cgfg ,_ddedb );
|
||
};return _fbdaa &&_fdbdf &&_ffbg ;};func _fbccc (_afec []*textMark ,_agcgg _aec .PdfRectangle )*textWord {_cacc :=_afec [0].PdfRectangle ;_gdeg :=_afec [0]._cbbae ;for _ ,_gaaeg :=range _afec [1:]{_cacc =_agfb (_cacc ,_gaaeg .PdfRectangle );if _gaaeg ._cbbae > _gdeg {_gdeg =_gaaeg ._cbbae ;
|
||
};};return &textWord {PdfRectangle :_cacc ,_ebfa :_afec ,_aecg :_agcgg .Ury -_cacc .Lly ,_aeegf :_gdeg };};func (_ecfef rulingList )asTiling ()gridTiling {if _ddb {_fc .Log .Info ("r\u0075\u006ci\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0063s\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u002b\u002b\u002b\u0020\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_ecfef ));
|
||
};for _faaf ,_egfcb :=range _ecfef [1:]{_cedae :=_ecfef [_faaf ];if _cedae .alignsPrimary (_egfcb )&&_cedae .alignsSec (_egfcb ){_fc .Log .Error ("a\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0044\u0075\u0070\u006c\u0069\u0063\u0061\u0074\u0065 \u0072\u0075\u006c\u0069\u006e\u0067\u0073\u002e\u000a\u0009v=\u0025\u0073\u000a\t\u0077=\u0025\u0073",_egfcb ,_cedae );
|
||
};};_ecfef .sortStrict ();_ecfef .log ("\u0073n\u0061\u0070\u0070\u0065\u0064");_fccce ,_faadfc :=_ecfef .vertsHorzs ();_bedbc :=_fccce .primaries ();_egdb :=_faadfc .primaries ();_daedd :=len (_bedbc )-1;_efbeag :=len (_egdb )-1;if _daedd ==0||_efbeag ==0{return gridTiling {};
|
||
};_agccf :=_aec .PdfRectangle {Llx :_bedbc [0],Urx :_bedbc [_daedd ],Lly :_egdb [0],Ury :_egdb [_efbeag ]};if _ddb {_fc .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0072\u0074s=\u0025\u0064",len (_fccce ));
|
||
for _bfcf ,_ffbb :=range _fccce {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bfcf ,_ffbb );};_fc .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0068\u006f\u0072\u007as=\u0025\u0064",len (_faadfc ));
|
||
for _egece ,_bgagd :=range _faadfc {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_egece ,_bgagd );};_fc .Log .Info ("\u0072\u0075\u006c\u0069\u006eg\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067:\u0020\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078\u0025\u0064\u000a\u0009\u006c\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a\u0009\u006c\u006c\u0079\u003d\u0025\u002e\u0032f",_daedd ,_efbeag ,_bedbc ,_egdb );
|
||
};_ffbc :=make ([]gridTile ,_daedd *_efbeag );for _adabd :=_efbeag -1;_adabd >=0;_adabd --{_acec :=_egdb [_adabd ];_dafbg :=_egdb [_adabd +1];for _edda :=0;_edda < _daedd ;_edda ++{_gdbg :=_bedbc [_edda ];_deccf :=_bedbc [_edda +1];_bebb :=_fccce .findPrimSec (_gdbg ,_acec );
|
||
_bbgdgc :=_fccce .findPrimSec (_deccf ,_acec );_gaeg :=_faadfc .findPrimSec (_acec ,_gdbg );_acgefd :=_faadfc .findPrimSec (_dafbg ,_gdbg );_babd :=_aec .PdfRectangle {Llx :_gdbg ,Urx :_deccf ,Lly :_acec ,Ury :_dafbg };_cbgg :=_aggg (_babd ,_bebb ,_bbgdgc ,_gaeg ,_acgefd );
|
||
_ffbc [_adabd *_daedd +_edda ]=_cbgg ;if _ddb {_ae .Printf ("\u0020\u0020\u0078\u003d\u0025\u0032\u0064\u0020\u0079\u003d\u0025\u0032\u0064\u003a\u0020%\u0073 \u0025\u0036\u002e\u0032\u0066\u0020\u0078\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_edda ,_adabd ,_cbgg .String (),_cbgg .Width (),_cbgg .Height ());
|
||
};};};if _ddb {_fc .Log .Info ("r\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074.\u0061\u0073\u0054\u0069\u006c\u0069\u006eg:\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0020\u0068\u006f\u0072\u0069zo\u006e\u0074a\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_agccf );
|
||
};_gfcfc :=make ([]map[float64 ]gridTile ,_efbeag );for _febag :=_efbeag -1;_febag >=0;_febag --{if _ddb {_ae .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_febag );};_gfcfc [_febag ]=make (map[float64 ]gridTile ,_daedd );for _dcgd :=0;_dcgd < _daedd ;
|
||
_dcgd ++{_efdf :=_ffbc [_febag *_daedd +_dcgd ];if _ddb {_ae .Printf ("\u0020\u0020\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_dcgd ,_efdf );};if !_efdf ._cbfd {continue ;};_gcgd :=_dcgd ;for _bgbg :=_dcgd +1;!_efdf ._cbdbf &&_bgbg < _daedd ;_bgbg ++{_adca :=_ffbc [_febag *_daedd +_bgbg ];
|
||
_efdf .Urx =_adca .Urx ;_efdf ._fbbf =_efdf ._fbbf ||_adca ._fbbf ;_efdf ._fcgc =_efdf ._fcgc ||_adca ._fcgc ;_efdf ._cbdbf =_adca ._cbdbf ;if _ddb {_ae .Printf ("\u0020 \u0020%\u0034\u0064\u003a\u0020\u0025s\u0020\u2192 \u0025\u0073\u000a",_bgbg ,_adca ,_efdf );
|
||
};_gcgd =_bgbg ;};if _ddb {_ae .Printf (" \u0020 \u0025\u0032\u0064\u0020\u002d\u0020\u0025\u0032d\u0020\u2192\u0020\u0025s\n",_dcgd ,_gcgd ,_efdf );};_dcgd =_gcgd ;_gfcfc [_febag ][_efdf .Llx ]=_efdf ;};};_geccf :=make (map[float64 ]map[float64 ]gridTile ,_efbeag );
|
||
_ecde :=make (map[float64 ]map[float64 ]struct{},_efbeag );for _dcceg :=_efbeag -1;_dcceg >=0;_dcceg --{_ebdb :=_ffbc [_dcceg *_daedd ].Lly ;_geccf [_ebdb ]=make (map[float64 ]gridTile ,_daedd );_ecde [_ebdb ]=make (map[float64 ]struct{},_daedd );};if _ddb {_fc .Log .Info ("\u0072u\u006c\u0069n\u0067\u004c\u0069s\u0074\u002e\u0061\u0073\u0054\u0069\u006ci\u006e\u0067\u003a\u0020\u0063\u006fa\u006c\u0065\u0073\u0063\u0065\u0020\u0076\u0065\u0072\u0074\u0069c\u0061\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_agccf );
|
||
};for _bgfeb :=_efbeag -1;_bgfeb >=0;_bgfeb --{_gbagg :=_ffbc [_bgfeb *_daedd ].Lly ;_cgdgb :=_gfcfc [_bgfeb ];if _ddb {_ae .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_bgfeb );};for _ ,_bbbaa :=range _affg (_cgdgb ){if _ ,_gafgf :=_ecde [_gbagg ][_bbbaa ];
|
||
_gafgf {continue ;};_dbff :=_cgdgb [_bbbaa ];if _ddb {_ae .Printf (" \u0020\u0020\u0020\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_dbff .String ());};for _baaa :=_bgfeb -1;_baaa >=0;_baaa --{if _dbff ._fcgc {break ;};_ebfd :=_gfcfc [_baaa ];_fafad ,_bdda :=_ebfd [_bbbaa ];
|
||
if !_bdda {break ;};if _fafad .Urx !=_dbff .Urx {break ;};_dbff ._fcgc =_fafad ._fcgc ;_dbff .Lly =_fafad .Lly ;if _ddb {_ae .Printf ("\u0020\u0020\u0020\u0020 \u0020\u0020\u0076\u003d\u0025\u0073\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_fafad .String (),_dbff .String ());
|
||
};_ecde [_fafad .Lly ][_fafad .Llx ]=struct{}{};};if _bgfeb ==0{_dbff ._fcgc =true ;};if _dbff .complete (){_geccf [_gbagg ][_bbbaa ]=_dbff ;};};};_gdccc :=gridTiling {PdfRectangle :_agccf ,_fecg :_eagbe (_geccf ),_eceeed :_aced (_geccf ),_bbfc :_geccf };
|
||
_gdccc .log ("\u0043r\u0065\u0061\u0074\u0065\u0064");return _gdccc ;};
|
||
|
||
// Options extractor options.
|
||
type Options struct{
|
||
|
||
// DisableDocumentTags specifies whether to use the document tags during list extraction.
|
||
DisableDocumentTags bool ;
|
||
|
||
// ApplyCropBox will extract page text based on page cropbox if set to `true`.
|
||
ApplyCropBox bool ;
|
||
|
||
// UseSimplerExtractionProcess will skip topological text ordering and table processing.
|
||
//
|
||
// NOTE: While normally the extra processing is beneficial, it can also lead to problems when it does not work.
|
||
// Thus it is a flag to allow the user to control this process.
|
||
//
|
||
// Skipping some extraction processes would also lead to the reduced processing time.
|
||
UseSimplerExtractionProcess bool ;};func _edacb (_cafgf ,_aecaa _ec .Image )_ec .Image {_eagcd ,_deda :=_aecaa .Bounds ().Size (),_cafgf .Bounds ().Size ();_ecege ,_cedf :=_eagcd .X ,_eagcd .Y ;if _deda .X > _ecege {_ecege =_deda .X ;};if _deda .Y > _cedf {_cedf =_deda .Y ;
|
||
};_ggfeg :=_ec .Rect (0,0,_ecege ,_cedf );if _eagcd .X !=_ecege ||_eagcd .Y !=_cedf {_ddaba :=_ec .NewRGBA (_ggfeg );_ag .BiLinear .Scale (_ddaba ,_ggfeg ,_cafgf ,_aecaa .Bounds (),_ag .Over ,nil );_aecaa =_ddaba ;};if _deda .X !=_ecege ||_deda .Y !=_cedf {_cfeeb :=_ec .NewRGBA (_ggfeg );
|
||
_ag .BiLinear .Scale (_cfeeb ,_ggfeg ,_cafgf ,_cafgf .Bounds (),_ag .Over ,nil );_cafgf =_cfeeb ;};_cggef :=_ec .NewRGBA (_ggfeg );_ag .DrawMask (_cggef ,_ggfeg ,_cafgf ,_ec .Point {},_aecaa ,_ec .Point {},_ag .Over );return _cggef ;};func (_ccaa *subpath )add (_ddeg ..._g .Point ){_ccaa ._acfg =append (_ccaa ._acfg ,_ddeg ...)};
|
||
func (_dfeg paraList )findTableGrid (_cggfb gridTiling )(*textTable ,map[*textPara ]struct{}){_affec :=len (_cggfb ._fecg );_eaefe :=len (_cggfb ._eceeed );_cacga :=textTable {_fbccb :true ,_afcga :_affec ,_agdc :_eaefe ,_bfdff :make (map[uint64 ]*textPara ,_affec *_eaefe ),_gaeb :make (map[uint64 ]compositeCell ,_affec *_eaefe )};
|
||
_cacga .PdfRectangle =_cggfb .PdfRectangle ;_gcdbf :=make (map[*textPara ]struct{});_adbaf :=int ((1.0-_gcfa )*float64 (_affec *_eaefe ));_addgd :=0;if _ddb {_fc .Log .Info ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0064\u0020\u0078\u0020%\u0064",_affec ,_eaefe );
|
||
};for _cedde ,_ddfbf :=range _cggfb ._eceeed {_gegb ,_gfeee :=_cggfb ._bbfc [_ddfbf ];if !_gfeee {continue ;};for _efad ,_ecec :=range _cggfb ._fecg {_dbadb ,_geebcf :=_gegb [_ecec ];if !_geebcf {continue ;};_feff :=_dfeg .inTile (_dbadb );if len (_feff )==0{_addgd ++;
|
||
if _addgd > _adbaf {if _ddb {_fc .Log .Info ("\u0021\u006e\u0075m\u0045\u006d\u0070\u0074\u0079\u003d\u0025\u0064",_addgd );};return nil ,nil ;};}else {_cacga .putComposite (_efad ,_cedde ,_feff ,_dbadb .PdfRectangle );for _ ,_bdca :=range _feff {_gcdbf [_bdca ]=struct{}{};
|
||
};};};};_fffce :=0;for _cbee :=0;_cbee < _affec ;_cbee ++{_bbced :=_cacga .get (_cbee ,0);if _bbced ==nil ||!_bbced ._egbea {_fffce ++;};};if _fffce ==0{if _ddb {_fc .Log .Info ("\u0021\u006e\u0075m\u0048\u0065\u0061\u0064\u0065\u0072\u003d\u0030");};return nil ,nil ;
|
||
};_adebf :=_cacga .reduceTiling (_cggfb ,_acggf );_adebf =_adebf .subdivide ();return _adebf ,_gcdbf ;};func _aadc (_afdg bounded )float64 {return -_afdg .bbox ().Lly };func (_agcd *wordBag )minDepth ()float64 {return _agcd ._aeceg -(_agcd .Ury -_agcd ._aad )};
|
||
func _cdffd (_bdfbb ,_beed float64 )bool {return _ea .Abs (_bdfbb -_beed )<=_ecce };func (_afcc *textPara )writeCellText (_gagfc _bc .Writer ){for _eadee ,_ecdce :=range _afcc ._bfagf {_cdff :=_ecdce .text ();_acggc :=_bfab &&_ecdce .endsInHyphen ()&&_eadee !=len (_afcc ._bfagf )-1;
|
||
if _acggc {_cdff =_gegca (_cdff );};_gagfc .Write ([]byte (_cdff ));if !(_acggc ||_eadee ==len (_afcc ._bfagf )-1){_gagfc .Write ([]byte (_ecaa (_ecdce ._gaca ,_afcc ._bfagf [_eadee +1]._gaca )));};};};func (_cdgc paraList )findTextTables ()[]*textTable {var _cdgfg []*textTable ;
|
||
for _ ,_bggd :=range _cdgc {if _bggd .taken ()||_bggd .Width ()==0{continue ;};_faabc :=_bggd .isAtom ();if _faabc ==nil {continue ;};_faabc .growTable ();if _faabc ._afcga *_faabc ._agdc < _dbdg {continue ;};_faabc .markCells ();_faabc .log ("\u0067\u0072\u006fw\u006e");
|
||
_cdgfg =append (_cdgfg ,_faabc );};return _cdgfg ;};func (_agb *imageExtractContext )extractInlineImage (_bcc *_ba .ContentStreamInlineImage ,_gabf _ba .GraphicsState ,_fee *_aec .PdfPageResources )error {_dfd ,_acf :=_bcc .ToImage (_fee );if _acf !=nil {return _acf ;
|
||
};_ced ,_acf :=_bcc .GetColorSpace (_fee );if _acf !=nil {return _acf ;};if _ced ==nil {_ced =_aec .NewPdfColorspaceDeviceGray ();};_dfdg ,_acf :=_ced .ImageToRGB (*_dfd );if _acf !=nil {return _acf ;};_bgbd :=ImageMark {Image :&_dfdg ,Width :_gabf .CTM .ScalingFactorX (),Height :_gabf .CTM .ScalingFactorY (),Angle :_gabf .CTM .Angle ()};
|
||
_bgbd .X ,_bgbd .Y =_gabf .CTM .Translation ();_agb ._bff =append (_agb ._bff ,_bgbd );_agb ._aa ++;return nil ;};func (_agegf rulingList )vertsHorzs ()(rulingList ,rulingList ){var _fgge ,_dgda rulingList ;for _ ,_geff :=range _agegf {switch _geff ._egdf {case _eebe :_fgge =append (_fgge ,_geff );
|
||
case _bfgb :_dgda =append (_dgda ,_geff );};};return _fgge ,_dgda ;};func (_cceb gridTiling )complete ()bool {for _ ,_bcbd :=range _cceb ._bbfc {for _ ,_agfa :=range _bcbd {if !_agfa .complete (){return false ;};};};return true ;};type gridTiling struct{_aec .PdfRectangle ;
|
||
_fecg []float64 ;_eceeed []float64 ;_bbfc map[float64 ]map[float64 ]gridTile ;};func _bdce (_ffce byte )bool {for _ ,_agcc :=range _gdec {if []byte (_agcc )[0]==_ffce {return true ;};};return false ;};func (_befc *textTable )reduce ()*textTable {_ddabe :=make ([]int ,0,_befc ._agdc );
|
||
_cedc :=make ([]int ,0,_befc ._afcga );for _bcdf :=0;_bcdf < _befc ._agdc ;_bcdf ++{if !_befc .emptyCompositeRow (_bcdf ){_ddabe =append (_ddabe ,_bcdf );};};for _ccfbf :=0;_ccfbf < _befc ._afcga ;_ccfbf ++{if !_befc .emptyCompositeColumn (_ccfbf ){_cedc =append (_cedc ,_ccfbf );
|
||
};};if len (_ddabe )==_befc ._agdc &&len (_cedc )==_befc ._afcga {return _befc ;};_cbbb :=textTable {_fbccb :_befc ._fbccb ,_afcga :len (_cedc ),_agdc :len (_ddabe ),_bfdff :make (map[uint64 ]*textPara ,len (_cedc )*len (_ddabe ))};if _gbead {_fc .Log .Info ("\u0072\u0065\u0064\u0075ce\u003a\u0020\u0025\u0064\u0078\u0025\u0064\u0020\u002d\u003e\u0020\u0025\u0064\u0078%\u0064",_befc ._afcga ,_befc ._agdc ,len (_cedc ),len (_ddabe ));
|
||
_fc .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_cedc );_fc .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_ddabe );};for _adcf ,_eeea :=range _ddabe {for _bdfdb ,_dfabg :=range _cedc {_bgbgg ,_fdfd :=_befc .getComposite (_dfabg ,_eeea );
|
||
if _bgbgg ==nil {continue ;};if _gbead {_ae .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_bdfdb ,_adcf ,_dfabg ,_eeea ,_bgfd (_bgbgg .merge ().text (),50));};_cbbb .putComposite (_bdfdb ,_adcf ,_bgbgg ,_fdfd );
|
||
};};return &_cbbb ;};func (_fdccb *textTable )depth ()float64 {_faga :=1e10;for _addge :=0;_addge < _fdccb ._afcga ;_addge ++{_cgbg :=_fdccb .get (_addge ,0);if _cgbg ==nil ||_cgbg ._egbea {continue ;};_faga =_ea .Min (_faga ,_cgbg .depth ());};return _faga ;
|
||
};func _eee (_dgfag _g .Matrix )_g .Point {_abf ,_gaa :=_dgfag .Translation ();return _g .Point {X :_abf ,Y :_gaa };};func (_gaffb *textTable )compositeColCorridors ()map[int ][]float64 {_eaeg :=make (map[int ][]float64 ,_gaffb ._afcga );if _gbead {_fc .Log .Info ("\u0063\u006f\u006d\u0070o\u0073\u0069\u0074\u0065\u0043\u006f\u006c\u0043\u006f\u0072r\u0069d\u006f\u0072\u0073\u003a\u0020\u0077\u003d%\u0064\u0020",_gaffb ._afcga );
|
||
};for _deed :=0;_deed < _gaffb ._afcga ;_deed ++{_eaeg [_deed ]=nil ;};return _eaeg ;};func _faab (_bagf _aec .PdfRectangle )rulingKind {_aefd :=_bagf .Width ();_acgec :=_bagf .Height ();if _aefd > _acgec {if _aefd >=_ebede {return _bfgb ;};}else {if _acgec >=_ebede {return _eebe ;
|
||
};};return _cbfe ;};func _ggge (_aebg *list ,_gcgc *_bb .Builder ,_edfe *string ){_cfdg :=_aeced (_aebg ,_edfe );_gcgc .WriteString (_cfdg );for _ ,_eacf :=range _aebg ._bffd {_abfc :=*_edfe +"\u0020\u0020\u0020";_ggge (_eacf ,_gcgc ,&_abfc );};};func _ecag (_dggg float64 )bool {return _ea .Abs (_dggg )< _cabc };
|
||
func _fegba (_beac []_bad .PdfObject )(_ceade ,_ffaba float64 ,_ggdgg error ){if len (_beac )!=2{return 0,0,_ae .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_beac ));
|
||
};_faaab ,_ggdgg :=_bad .GetNumbersAsFloat (_beac );if _ggdgg !=nil {return 0,0,_ggdgg ;};return _faaab [0],_faaab [1],nil ;};func (_fccg *PageFonts )extractPageResourcesToFont (_ffaa *_aec .PdfPageResources )error {_bg ,_ecc :=_bad .GetDict (_ffaa .Font );
|
||
if !_ecc {return _d .New (_aea );};for _ ,_adg :=range _bg .Keys (){var (_bac =true ;_eeg []byte ;_dg string ;);_ce ,_aff :=_ffaa .GetFontByName (_adg );if !_aff {return _d .New (_bf );};_db ,_cee :=_aec .NewPdfFontFromPdfObject (_ce );if _cee !=nil {return _cee ;
|
||
};_afe :=_db .FontDescriptor ();_bba :=_db .FontDescriptor ().FontName .String ();_gab :=_db .Subtype ();if _ggb (_fccg .Fonts ,_bba ){continue ;};if len (_db .ToUnicode ())==0{_bac =false ;};if _afe .FontFile !=nil {if _ega ,_gb :=_bad .GetStream (_afe .FontFile );
|
||
_gb {_eeg ,_cee =_bad .DecodeStream (_ega );if _cee !=nil {return _cee ;};_dg =_bba +"\u002e\u0070\u0066\u0062";};}else if _afe .FontFile2 !=nil {if _fd ,_add :=_bad .GetStream (_afe .FontFile2 );_add {_eeg ,_cee =_bad .DecodeStream (_fd );if _cee !=nil {return _cee ;
|
||
};_dg =_bba +"\u002e\u0074\u0074\u0066";};}else if _afe .FontFile3 !=nil {if _beg ,_ffg :=_bad .GetStream (_afe .FontFile3 );_ffg {_eeg ,_cee =_bad .DecodeStream (_beg );if _cee !=nil {return _cee ;};_dg =_bba +"\u002e\u0063\u0066\u0066";};};if len (_dg )< 1{_fc .Log .Debug (_de );
|
||
};_ebb :=Font {FontName :_bba ,PdfFont :_db ,IsCID :_db .IsCID (),IsSimple :_db .IsSimple (),ToUnicode :_bac ,FontType :_gab ,FontData :_eeg ,FontFileName :_dg ,FontDescriptor :_afe };_fccg .Fonts =append (_fccg .Fonts ,_ebb );};return nil ;};func _gabfb (_fgbf _bad .PdfObject ,_ebfe _eg .Color )(_ec .Image ,error ){_ddccc ,_bbadd :=_bad .GetStream (_fgbf );
|
||
if !_bbadd {return nil ,nil ;};_fcgd ,_eccd :=_aec .NewXObjectImageFromStream (_ddccc );if _eccd !=nil {return nil ,_eccd ;};_gcgeb ,_eccd :=_fcgd .ToImage ();if _eccd !=nil {return nil ,_eccd ;};return _cbace (_gcgeb ,_ebfe ),nil ;};func (_feba pathSection )bbox ()_aec .PdfRectangle {_aacd :=_feba ._gbag [0]._acfg [0];
|
||
_cbfa :=_aec .PdfRectangle {Llx :_aacd .X ,Urx :_aacd .X ,Lly :_aacd .Y ,Ury :_aacd .Y };_fgdf :=func (_gefg _g .Point ){if _gefg .X < _cbfa .Llx {_cbfa .Llx =_gefg .X ;}else if _gefg .X > _cbfa .Urx {_cbfa .Urx =_gefg .X ;};if _gefg .Y < _cbfa .Lly {_cbfa .Lly =_gefg .Y ;
|
||
}else if _gefg .Y > _cbfa .Ury {_cbfa .Ury =_gefg .Y ;};};for _ ,_cgcg :=range _feba ._gbag [0]._acfg [1:]{_fgdf (_cgcg );};for _ ,_fgfe :=range _feba ._gbag [1:]{for _ ,_ffda :=range _fgfe ._acfg {_fgdf (_ffda );};};return _cbfa ;};func (_afba *textObject )setCharSpacing (_gbf float64 ){if _afba ==nil {return ;
|
||
};_afba ._cbgd ._cga =_gbf ;if _eged {_fc .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_gbf ,_afba ._cbgd .String ());};};func (_ecbf TextTable )getCellInfo (_gcg TextMark )[][]int {for _daca ,_cgebc :=range _ecbf .Cells {for _egdg :=range _cgebc {_gafe :=&_cgebc [_egdg ].Marks ;
|
||
if _gafe .exists (_gcg ){return [][]int {{_daca },{_egdg }};};};};return nil ;};var _dfba =TextMark {Text :"\u005b\u0058\u005d",Original :"\u0020",Meta :true ,FillColor :_eg .White ,StrokeColor :_eg .White };type wordBag struct{_aec .PdfRectangle ;_aad float64 ;
|
||
_eddc ,_gbfd rulingList ;_aeceg float64 ;_gbbd map[int ][]*textWord ;};
|
||
|
||
// String returns a string describing `tm`.
|
||
func (_eag TextMark )String ()string {_bfff :=_eag .BBox ;var _cag string ;if _eag .Font !=nil {_cag =_eag .Font .String ();if len (_cag )> 50{_cag =_cag [:50]+"\u002e\u002e\u002e";};};var _fbdf string ;if _eag .Meta {_fbdf ="\u0020\u002a\u004d\u002a";
|
||
};return _ae .Sprintf ("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",_eag .Offset ,_eag .Text ,[]rune (_eag .Text ),_bfff .Llx ,_bfff .Lly ,_bfff .Urx ,_bfff .Ury ,_cag ,_fbdf );
|
||
};func (_abag paraList )lines ()[]*textLine {var _ecfdg []*textLine ;for _ ,_dgfc :=range _abag {_ecfdg =append (_ecfdg ,_dgfc ._bfagf ...);};return _ecfdg ;};type subpath struct{_acfg []_g .Point ;_bbdf bool ;};
|
||
|
||
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
||
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
||
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
||
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
||
type RenderMode int ;func (_baeb rulingList )snapToGroups ()rulingList {_dfffg ,_edba :=_baeb .vertsHorzs ();if len (_dfffg )> 0{_dfffg =_dfffg .snapToGroupsDirection ();};if len (_edba )> 0{_edba =_edba .snapToGroupsDirection ();};_ffceb :=append (_dfffg ,_edba ...);
|
||
_ffceb .log ("\u0073\u006e\u0061p\u0054\u006f\u0047\u0072\u006f\u0075\u0070\u0073");return _ffceb ;};func _fgeda (_eagb []*textLine ,_eadg map[float64 ][]*textLine ,_bdaf []float64 ,_abfdb int ,_fegd ,_cffb float64 )[]*list {_ddbf :=[]*list {};_ffac :=_abfdb ;
|
||
_abfdb =_abfdb +1;_aade :=_bdaf [_ffac ];_fbag :=_eadg [_aade ];_bdag :=_eeef (_fbag ,_cffb ,_fegd );for _ageg ,_fgedaf :=range _bdag {var _eagg float64 ;_fegg :=[]*list {};_cgfd :=_fgedaf ._gaca ;_fagd :=_cffb ;if _ageg < len (_bdag )-1{_fagd =_bdag [_ageg +1]._gaca ;
|
||
};if _abfdb < len (_bdaf ){_fegg =_fgeda (_eagb ,_eadg ,_bdaf ,_abfdb ,_cgfd ,_fagd );};_eagg =_fagd ;if len (_fegg )> 0{_eegdb :=_fegg [0];if len (_eegdb ._gagag )> 0{_eagg =_eegdb ._gagag [0]._gaca ;};};_gbda :=[]*textLine {_fgedaf };_daab :=_eagc (_fgedaf ,_eagb ,_bdaf ,_cgfd ,_eagg );
|
||
_gbda =append (_gbda ,_daab ...);_gfca :=_deec (_gbda ,"\u0062\u0075\u006c\u006c\u0065\u0074",_fegg );_gfca ._begg =_aadg (_gbda ,"");_ddbf =append (_ddbf ,_gfca );};return _ddbf ;};const _cdab =1.0/1000.0;type bounded interface{bbox ()_aec .PdfRectangle };
|
||
func (_eddg *textObject )setWordSpacing (_aebf float64 ){if _eddg ==nil {return ;};_eddg ._cbgd ._ecd =_aebf ;};func (_adba paraList )applyTables (_bebdbf []*textTable )paraList {var _fgef paraList ;for _ ,_gbbccd :=range _bebdbf {_fgef =append (_fgef ,_gbbccd .newTablePara ());
|
||
};for _ ,_fffa :=range _adba {if _fffa ._dcddf {continue ;};_fgef =append (_fgef ,_fffa );};return _fgef ;};func (_eaef *textObject )moveText (_aef ,_gbg float64 ){_eaef .moveLP (_aef ,_gbg )};
|
||
|
||
// String returns a string describing the current state of the textState stack.
|
||
func (_ab *stateStack )String ()string {_ceb :=[]string {_ae .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_ab ))};for _cgf ,_begd :=range *_ab {_dfdb :="\u003c\u006e\u0069l\u003e";
|
||
if _begd !=nil {_dfdb =_begd .String ();};_ceb =append (_ceb ,_ae .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_cgf ,_dfdb ));};return _bb .Join (_ceb ,"\u000a");};func _fgged (_geggf ,_cadfe int )uint64 {return uint64 (_geggf )*0x1000000+uint64 (_cadfe )};
|
||
// _ceab reports whether the two points have exactly equal X and Y coordinates.
func _ceab (_gdbgf ,_dbfdc _g .Point )bool {return _gdbgf .X ==_dbfdc .X &&_gdbgf .Y ==_dbfdc .Y };
// log dumps the tiling for debugging and does nothing unless the _ddb flag is
// set. It logs the lengths of _fecg and _eceeed with the label _gbeg, prints
// both slices to standard output, and then, for every value of _eceeed that
// is a key of _bbfc, prints the entries of that row found under the values
// of _fecg.
func (_gdabc gridTiling )log (_gbeg string ){if !_ddb {return ;};_fc .Log .Info ("\u0074i\u006ci\u006e\u0067\u003a\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0071",len (_gdabc ._fecg ),len (_gdabc ._eceeed ),_gbeg );
|
||
_ae .Printf ("\u0020\u0020\u0020l\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a",_gdabc ._fecg );_ae .Printf ("\u0020\u0020\u0020l\u006c\u0079\u003d\u0025\u002e\u0032\u0066\u000a",_gdabc ._eceeed );for _befa ,_dgec :=range _gdabc ._eceeed {_fabc ,_fffd :=_gdabc ._bbfc [_dgec ];
|
||
if !_fffd {continue ;};_ae .Printf ("%\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_befa ,_dgec );for _daee ,_efba :=range _gdabc ._fecg {_efcb ,_ecff :=_fabc [_efba ];if !_ecff {continue ;};_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_daee ,_efcb .String ());
|
||
};};};
// _ffeed builds an RGBA image with the dimensions of _gfdgc. A pixel is set
// to color.Transparent when the red, green and blue components of the source
// pixel sum to zero, and to _cgag otherwise. Pixels whose color cannot be
// read are reported in the debug log and left at the RGBA zero value.
func _ffeed (_gfdgc *_aec .Image ,_cgag _eg .Color )_ec .Image {_ebdbg ,_eaega :=int (_gfdgc .Width ),int (_gfdgc .Height );_aefbb :=_ec .NewRGBA (_ec .Rect (0,0,_ebdbg ,_eaega ));for _bfge :=0;_bfge < _eaega ;_bfge ++{for _gcdaf :=0;_gcdaf < _ebdbg ;
|
||
_gcdaf ++{_cdacb ,_fgade :=_gfdgc .ColorAt (_gcdaf ,_bfge );if _fgade !=nil {_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063o\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0072\u0065\u0074\u0072\u0069\u0065v\u0065 \u0069\u006d\u0061\u0067\u0065\u0020m\u0061\u0073\u006b\u0020\u0076\u0061\u006cu\u0065\u0020\u0061\u0074\u0020\u0028\u0025\u0064\u002c\u0020\u0025\u0064\u0029\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006da\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063t\u002e",_gcdaf ,_bfge );
|
||
continue ;};_babc ,_dbdgc ,_cgacg ,_ :=_cdacb .RGBA ();var _cbfbe _eg .Color ;if _babc +_dbdgc +_cgacg ==0{_cbfbe =_eg .Transparent ;}else {_cbfbe =_cgag ;};_aefbb .Set (_gcdaf ,_bfge ,_cbfbe );};};return _aefbb ;};
// _bfa is the highest recursion level accepted by extractPageText; a level
// above it is rejected with a "form stack overflow" error.
const _bfa =20;
// getCurrentFont returns the font stored in the text state (_cbgd._cbad).
// When no font is set it logs the problem and returns model.DefaultFont().
func (_ffe *textObject )getCurrentFont ()*_aec .PdfFont {_bgda :=_ffe ._cbgd ._cbad ;
|
||
if _bgda ==nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _aec .DefaultFont ();
|
||
};return _bgda ;};
// rulingList is a slice of ruling pointers.
type rulingList []*ruling ;
// String describes the cell as its rectangle, its paragraph count and, when
// it has paragraphs, the text of the merged paragraphs passed through
// _bgfd(text, 50).
// NOTE(review): _bgfd presumably truncates the text to 50 characters - confirm.
func (_cffba compositeCell )String ()string {_eedd :="";if len (_cffba .paraList )> 0{_eedd =_bgfd (_cffba .paraList .merge ().text (),50);};return _ae .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0070\u0061\u0072a\u0073\u0020\u0025\u0071",_cffba .PdfRectangle ,len (_cffba .paraList ),_eedd );
|
||
};
// markCells sets the _dcddf flag on every non-nil cell returned by
// get(x, y) for 0 <= x < _afcga and 0 <= y < _agdc.
func (_dgeaa *textTable )markCells (){for _cdcbf :=0;_cdcbf < _dgeaa ._agdc ;_cdcbf ++{for _abagd :=0;_abagd < _dgeaa ._afcga ;_abagd ++{_bgga :=_dgeaa .get (_abagd ,_cdcbf );if _bgga !=nil {_bgga ._dcddf =true ;};};};};
// getFont returns the font registered under the resource name _gaff.
//
// When the cache _eegd._fba is non-nil it is consulted first, keyed by the
// String() of the font object returned by getFontDict; every lookup
// increments the counter _eegd._fec. On a miss the font is loaded with
// getFontDirect and stored in the cache. Before storing, if the cache already
// holds _gbgd or more entries, the entry with the smallest _egfcg value is
// deleted.
//
// Errors from getFontDict and getFontDirect are returned unchanged.
//
// NOTE(review): getFontDict returns (nil, nil) when there are no resources;
// calling String() on that nil object would panic - confirm this cannot
// happen while the cache is enabled.
func (_egfc *textObject )getFont (_gaff string )(*_aec .PdfFont ,error ){if _egfc ._eegd ._fba !=nil {_gegf ,_afge :=_egfc .getFontDict (_gaff );
|
||
if _afge !=nil {_fc .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0067\u0065\u0074\u0046\u006f\u006e\u0074:\u0020n\u0061m\u0065=\u0025\u0073\u002c\u0020\u0065\u0072\u0072\u006f\u0072\u003a\u0020\u0025\u0073",_gaff ,_afge .Error ());return nil ,_afge ;
|
||
// NOTE(review): _bfeg is a local copy if the map stores fontEntry by value (the
// store further down assigns a fontEntry value), so the _egfcg update below
// would not reach the cached entry - confirm.
};_egfc ._eegd ._fec ++;_bfeg ,_gcfd :=_egfc ._eegd ._fba [_gegf .String ()];if _gcfd {_bfeg ._egfcg =_egfc ._eegd ._fec ;return _bfeg ._gfab ,nil ;};};_efde ,_bfb :=_egfc .getFontDict (_gaff );if _bfb !=nil {return nil ,_bfb ;};_gee ,_bfb :=_egfc .getFontDirect (_gaff );
|
||
// Cache is full: sort the keys by their _egfcg value so that _beeg[0] is the entry to evict.
if _bfb !=nil {return nil ,_bfb ;};if _egfc ._eegd ._fba !=nil {_dbda :=fontEntry {_gee ,_egfc ._eegd ._fec };if len (_egfc ._eegd ._fba )>=_gbgd {var _beeg []string ;for _fede :=range _egfc ._eegd ._fba {_beeg =append (_beeg ,_fede );};_a .Slice (_beeg ,func (_fdg ,_begf int )bool {return _egfc ._eegd ._fba [_beeg [_fdg ]]._egfcg < _egfc ._eegd ._fba [_beeg [_begf ]]._egfcg ;
|
||
});delete (_egfc ._eegd ._fba ,_beeg [0]);};_egfc ._eegd ._fba [_efde .String ()]=_dbda ;};return _gee ,nil ;};
// _dcaa returns the result of _bae applied to the _bbbc fields of the two paragraphs.
func _dcaa (_gbgaa ,_aegc *textPara )bool {return _bae (_gbgaa ._bbbc ,_aegc ._bbbc )};
// _dbega returns the indices 0.._egdcb-1 sorted with _fagg as the "less"
// function, where _fagg compares two of those indices. The sort is not
// guaranteed to be stable (sort.Slice).
func _dbega (_egdcb int ,_fagg func (int ,int )bool )[]int {_cead :=make ([]int ,_egdcb );
|
||
for _gggb :=range _cead {_cead [_gggb ]=_gggb ;};_a .Slice (_cead ,func (_dffc ,_abfccd int )bool {return _fagg (_cead [_dffc ],_cead [_abfccd ])});return _cead ;};
// hasLines reports whether _fbdc returns true for the cell rectangle and the
// rectangle of at least one line in _gbcdc. When the _gbead flag is set,
// each comparison is printed to standard output.
func (_bgbca compositeCell )hasLines (_gbcdc []*textLine )bool {for _cbabc ,_cede :=range _gbcdc {_eecb :=_fbdc (_bgbca .PdfRectangle ,_cede .PdfRectangle );
|
||
if _gbead {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u005e\u005e\u005e\u0069\u006e\u0074\u0065\u0072\u0073e\u0063t\u0073\u003d\u0025\u0074\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u000a",_eecb ,_cbabc ,len (_gbcdc ));_ae .Printf ("\u0020\u0020\u0020\u0020 \u005e\u005e\u005e\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025s\u000a",_bgbca );
|
||
_ae .Printf ("\u0020 \u0020 \u0020\u0020\u0020\u006c\u0069\u006e\u0065\u003d\u0025\u0073\u000a",_cede );};if _eecb {return true ;};};return false ;};
// getFontDict looks up the font object named _agc in the resources _bcg.
// It returns (nil, nil) when there are no resources, and a
// "font not in resources" error when the name is not found.
func (_dgae *textObject )getFontDict (_agc string )(_bbef _bad .PdfObject ,_egfd error ){_abab :=_dgae ._bcg ;
|
||
if _abab ==nil {_fc .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_agc );return nil ,nil ;};_bbef ,_ccfd :=_abab .GetFontByName (_bad .PdfObjectName (_agc ));
|
||
if !_ccfd {_fc .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_agc );
|
||
return nil ,_d .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _bbef ,nil ;};
// setTextLeading stores _dba in the text state field _cbgd._gbff.
// It is a no-op on a nil receiver.
func (_cged *textObject )setTextLeading (_dba float64 ){if _cged ==nil {return ;};_cged ._cbgd ._gbff =_dba ;
|
||
};
// _gcda reports whether the X extents of the bag and the word overlap once
// each side of the bag is extended by _eeaca.
func _gcda (_cgba *wordBag ,_dgdb *textWord ,_eeaca float64 )bool {return _dgdb .Llx < _cgba .Urx +_eeaca &&_cgba .Llx -_eeaca < _dgdb .Urx ;};
// extractXObjectImage appends an ImageMark for the image XObject named *_fdf
// to _gef._bff and increments _gef._cdb.
//
// Decoded images are cached in _gef._cf, keyed by the XObject. On a cache
// miss the image is decoded; if it has an explicit mask (Mask) or, failing
// that, a soft mask (SMask), the mask is combined with the image through
// _edacb and the image is rebuilt from the result. A mask that cannot be
// loaded is only logged. The mark's size, angle and position are taken from
// the CTM of the graphics state _gbd.
//
// It returns nil without adding a mark when the name resolves to no XObject
// or to no image; decoding and conversion errors are returned.
func (_gef *imageExtractContext )extractXObjectImage (_fdf *_bad .PdfObjectName ,_gbd _ba .GraphicsState ,_ffb *_aec .PdfPageResources )error {_cda ,_ :=_ffb .GetXObjectByName (*_fdf );
|
||
if _cda ==nil {return nil ;};_cge ,_cbc :=_gef ._cf [_cda ];if !_cbc {_egae ,_fca :=_ffb .GetXObjectImageByName (*_fdf );if _fca !=nil {return _fca ;};if _egae ==nil {return nil ;};_acb ,_fca :=_egae .ToImage ();if _fca !=nil {return _fca ;};var _eaf _ec .Image ;
|
||
if _egae .Mask !=nil {if _eaf ,_fca =_gabfb (_egae .Mask ,_eg .Opaque );_fca !=nil {_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a \u0063\u006f\u0075\u006c\u0064 \u006eo\u0074\u0020\u0067\u0065\u0074\u0020\u0065\u0078\u0070\u006c\u0069\u0063\u0069\u0074\u0020\u0069\u006d\u0061\u0067e\u0020\u006d\u0061\u0073\u006b\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e");
|
||
};}else if _egae .SMask !=nil {_eaf ,_fca =_ddcae (_egae .SMask ,_eg .Opaque );if _fca !=nil {_fc .Log .Debug ("W\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u0075\u006c\u0064\u0020\u006e\u006f\u0074\u0020\u0067\u0065\u0074\u0020\u0073\u006f\u0066\u0074\u0020\u0069\u006da\u0067e\u0020\u006d\u0061\u0073k\u002e\u0020O\u0075\u0074\u0070\u0075\u0074\u0020\u006d\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063\u006f\u0072\u0072\u0065\u0063\u0074\u002e");
|
||
// Images in the "DeviceGray" or "Indexed" color spaces are rebuilt as gray images; all others as generic images.
};};if _eaf !=nil {_fad ,_eggf :=_acb .ToGoImage ();if _eggf !=nil {return _eggf ;};_fad =_edacb (_fad ,_eaf );switch _egae .ColorSpace .String (){case "\u0044\u0065\u0076\u0069\u0063\u0065\u0047\u0072\u0061\u0079","\u0049n\u0064\u0065\u0078\u0065\u0064":_acb ,_eggf =_aec .ImageHandling .NewGrayImageFromGoImage (_fad );
|
||
if _eggf !=nil {return _eggf ;};default:_acb ,_eggf =_aec .ImageHandling .NewImageFromGoImage (_fad );if _eggf !=nil {return _eggf ;};};};_cge =&cachedImage {_cb :_acb ,_ecf :_egae .ColorSpace };_gef ._cf [_cda ]=_cge ;};_fcd :=_cge ._cb ;_cgec :=_cge ._ecf ;
|
||
_gca ,_cdc :=_cgec .ImageToRGB (*_fcd );if _cdc !=nil {return _cdc ;};_fc .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_gbd .CTM .String ());_gcf :=ImageMark {Image :&_gca ,Width :_gbd .CTM .ScalingFactorX (),Height :_gbd .CTM .ScalingFactorY (),Angle :_gbd .CTM .Angle ()};
|
||
_gcf .X ,_gcf .Y =_gbd .CTM .Translation ();_gef ._bff =append (_gef ._bff ,_gcf );_gef ._cdb ++;return nil ;};
// _ggabg returns a ruling of kind _bfgb whose _gbgc is the rectangle's Ury
// and whose _fgad and _ababc are its Llx and Urx.
// NOTE(review): this looks like the ruling along the top edge of the rectangle - confirm.
func _ggabg (_gbbac _aec .PdfRectangle )*ruling {return &ruling {_egdf :_bfgb ,_gbgc :_gbbac .Ury ,_fgad :_gbbac .Llx ,_ababc :_gbbac .Urx };
|
||
};
// addPoint converts (_bcb, _dbcf) with devicePoint and adds the result to the
// subpath returned by establishSubpath. When there is no such subpath it sets
// _effc and records the point in _cgbcf instead.
func (_bcca *shapesState )addPoint (_bcb ,_dbcf float64 ){_adgbc :=_bcca .establishSubpath ();_ecac :=_bcca .devicePoint (_bcb ,_dbcf );if _adgbc ==nil {_bcca ._effc =true ;_bcca ._cgbcf =_ecac ;}else {_adgbc .add (_ecac );};};
// _fbacb returns, for each i in 0.._gaaag-1, the running total of
// len(_gbdd[j])+1 over all j < i, together with the grand total over all i.
func _fbacb (_gaaag int ,_gbdd map[int ][]float64 )([]int ,int ){_bgfb :=make ([]int ,_gaaag );
|
||
_dgfg :=0;for _acgag :=0;_acgag < _gaaag ;_acgag ++{_bgfb [_acgag ]=_dgfg ;_dgfg +=len (_gbdd [_acgag ])+1;};return _bgfb ,_dgfg ;};
// close appends the first point to the subpath when the last point differs
// from it, sets the _bbdf flag and then calls removeDuplicates. The first
// point is read unconditionally, so the subpath must not be empty.
func (_dceb *subpath )close (){if !_ceab (_dceb ._acfg [0],_dceb .last ()){_dceb .add (_dceb ._acfg [0]);};_dceb ._bbdf =true ;
|
||
_dceb .removeDuplicates ();};
// renderText converts the bytes _fbaa to character codes and strings using
// the current font and appends one text mark per character to _gcfe._fcee,
// advancing the text matrix _gcfe._acc after each appended mark.
//
// _gfa and _dgfa are passed through to newTextMark unchanged. The character
// and miss counts reported by the font are added to the text state counters
// _cgee and _aece.
//
// It returns nil without doing anything when _gcfe._bbcdf is set, and an
// error when the font has no metrics for one of the character codes.
func (_gcfe *textObject )renderText (_gfa _bad .PdfObject ,_fbaa []byte ,_dgfa int )error {if _gcfe ._bbcdf {_fc .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");
|
||
return nil ;};_gcef :=_gcfe .getCurrentFont ();_cbb :=_gcef .BytesToCharcodes (_fbaa );_ebad ,_aage ,_feea :=_gcef .CharcodesToStrings (_cbb );if _feea > 0{_fc .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_aage ,_feea );
|
||
// _cdbfc is the font size and _ggbc the horizontal scaling as a fraction (both are named in the log
// calls below). Glyph metrics are scaled by _cdab, except for Type3 fonts where the scale is 1.
};_gcfe ._cbgd ._cgee +=_aage ;_gcfe ._cbgd ._aece +=_feea ;_gbde :=_gcfe ._cbgd ;_cdbfc :=_gbde ._dgad ;_ggbc :=_gbde ._aafe /100.0;_acff :=_cdab ;if _gcef .Subtype ()=="\u0054\u0079\u0070e\u0033"{_acff =1;};_egac ,_fccc :=_gcef .GetRuneMetrics (' ');
|
||
// Space metrics fall back to character code 32 and then to the default font.
if !_fccc {_egac ,_fccc =_gcef .GetCharMetrics (32);};if !_fccc {_egac ,_ =_aec .DefaultFont ().GetRuneMetrics (' ');};_cbgc :=_egac .Wx *_acff ;_fc .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_cbgc ,_ebad ,_gcef ,_cdbfc );
|
||
_dcbd :=_g .NewMatrix (_cdbfc *_ggbc ,0,0,_cdbfc ,0,_gbde ._egc );if _eged {_fc .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_cbb ),_cbb ,_ebad );
|
||
// NOTE(review): the last argument is len(_ebad) but the verb is %q - the slice itself was probably intended.
};_fc .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_cbb ),_cbb ,len (_ebad ));_cdccc :=_gcfe .getFillColor ();
|
||
// Strings consisting of a single NUL rune are skipped. The word spacing (_ecd) is added only for a single space rune.
_gde :=_gcfe .getStrokeColor ();for _dbbf ,_gdae :=range _ebad {_gdce :=[]rune (_gdae );if len (_gdce )==1&&_gdce [0]=='\x00'{continue ;};_fbb :=_cbb [_dbbf ];_dbfd :=_gcfe ._ggd .CTM .Mult (_gcfe ._acc ).Mult (_dcbd );_gefb :=0.0;if len (_gdce )==1&&_gdce [0]==32{_gefb =_gbde ._ecd ;
|
||
};_afab ,_abd :=_gcef .GetCharMetrics (_fbb );if !_abd {_fc .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_fbb ,_gdce ,_gdce ,_gcef );
|
||
return _ae .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_gcef .String (),_fbb );};_eafg :=_g .Point {X :_afab .Wx *_acff ,Y :_afab .Wy *_acff };
|
||
// _fadb is the horizontal displacement without character spacing (_cga); _efd includes it.
_fadb :=_g .Point {X :(_eafg .X *_cdbfc +_gefb )*_ggbc };_efd :=_g .Point {X :(_eafg .X *_cdbfc +_gbde ._cga +_gefb )*_ggbc };if _eged {_fc .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_cdbfc ,_gbde ._cga ,_gbde ._ecd ,_ggbc );
|
||
_fc .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f",_eafg ,_fadb ,_efd );};_dbgcb :=_egf (_fadb );_ecfd :=_egf (_efd );_fadf :=_gcfe ._ggd .CTM .Mult (_gcfe ._acc ).Mult (_dbgcb );
|
||
if _egdce {_fc .Log .Info ("e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a"+"\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a"+"\u0009t\u0064\u0030\u003d\u0025s\u000a\u0009\u0020\u0020\u2192 \u0025s\u0020x\u006c\u0061\u0074\u003d\u0025\u0073",_gcfe ._ggd .CTM ,_gcfe ._acc ,_ecfd ,_eee (_gcfe ._ggd .CTM .Mult (_gcfe ._acc ).Mult (_ecfd )),_dbgcb ,_fadf ,_eee (_fadf ));
|
||
// A mark rejected by newTextMark is skipped; the text matrix is not advanced for it.
};_fcdfc ,_accb :=_gcfe .newTextMark (_bbg .ExpandLigatures (_gdce ),_dbfd ,_eee (_fadf ),_ea .Abs (_cbgc *_dbfd .ScalingFactorX ()),_gcef ,_gcfe ._cbgd ._cga ,_cdccc ,_gde ,_gfa ,_ebad ,_dbbf ,_dgfa );if !_accb {_fc .Log .Debug ("\u0054\u0065\u0078\u0074\u0020\u006d\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069d\u0065 \u0070\u0061\u0067\u0065\u002e\u0020\u0053\u006b\u0069\u0070\u0070\u0069\u006e\u0067");
|
||
// NOTE(review): _gcef has already been dereferenced above, so the nil check below cannot be the first line of defence.
continue ;};if _gcef ==nil {_fc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u002e");}else if _gcef .Encoder ()==nil {_fc .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020N\u006f\u0020\u0065\u006e\u0063\u006f\u0064\u0069\u006eg\u002e\u0020\u0066o\u006et\u003d\u0025\u0073",_gcef );
|
||
}else {if _bacf ,_efc :=_gcef .Encoder ().CharcodeToRune (_fbb );_efc {_fcdfc ._dcff =string (_bacf );};};_fc .Log .Trace ("i\u003d\u0025\u0064\u0020\u0063\u006fd\u0065\u003d\u0025\u0064\u0020\u006d\u0061\u0072\u006b=\u0025\u0073\u0020t\u0072m\u003d\u0025\u0073",_dbbf ,_fbb ,_fcdfc ,_dbfd );
|
||
_gcfe ._fcee =append (_gcfe ._fcee ,&_fcdfc );_gcfe ._acc .Concat (_ecfd );};return nil ;};
// fontsize returns the _fgcb field of the paragraph's first line. The first
// line is read unconditionally, so the paragraph must have at least one line.
func (_bfga *textPara )fontsize ()float64 {return _bfga ._bfagf [0]._fgcb };
// showTextAdjusted processes the elements of the array _bbbb in order:
// numbers shift the text matrix _acc by -number*0.001*_dgad along X, and
// strings are rendered with renderText (receiving _eabea as its last
// argument). Any other element type, or a string whose bytes cannot be read,
// yields core.ErrTypeError; a number that cannot be converted yields the
// conversion error.
func (_fbed *textObject )showTextAdjusted (_bbbb *_bad .PdfObjectArray ,_eabea int )error {_dda :=false ;
|
||
for _ ,_ceg :=range _bbbb .Elements (){switch _ceg .(type ){case *_bad .PdfObjectFloat ,*_bad .PdfObjectInteger :_dee ,_ddg :=_bad .GetNumberAsFloat (_ceg );if _ddg !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ceg ,_bbbb );
|
||
// _dda is never set to true in this function, so the X/Y swap below is never taken.
return _ddg ;};_cfad ,_fgf :=-_dee *0.001*_fbed ._cbgd ._dgad ,0.0;if _dda {_fgf ,_cfad =_cfad ,_fgf ;};_fdcb :=_egf (_g .Point {X :_cfad ,Y :_fgf });_fbed ._acc .Concat (_fdcb );case *_bad .PdfObjectString :_bed :=_bad .TraceToDirectObject (_ceg );_adga ,_cgcd :=_bad .GetStringBytes (_bed );
|
||
if !_cgcd {_fc .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ceg ,_bbbb );
|
||
// NOTE(review): the error returned by renderText is discarded here - confirm this is intentional.
return _bad .ErrTypeError ;};_fbed .renderText (_bed ,_adga ,_eabea );default:_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ceg ,_bbbb );
|
||
return _bad .ErrTypeError ;};};return nil ;};
// compositeRowCorridors returns, for each row index y from 1 to _agdc-1 that
// has at least one composite cell in _gaeb (looked up with the key
// _fgged(x, y) for 0 <= x < _afcga), the result of _bbbba applied to the
// composite cells of that row. Row 0 and rows without composite cells have no
// entry. Debug output is produced when the _gbead flag is set.
func (_fbgga *textTable )compositeRowCorridors ()map[int ][]float64 {_egaa :=make (map[int ][]float64 ,_fbgga ._agdc );if _gbead {_fc .Log .Info ("c\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0052\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073:\u0020h\u003d\u0025\u0064",_fbgga ._agdc );
|
||
};for _aggga :=1;_aggga < _fbgga ._agdc ;_aggga ++{var _eaefa []compositeCell ;for _bdcg :=0;_bdcg < _fbgga ._afcga ;_bdcg ++{if _gbagc ,_dedfb :=_fbgga ._gaeb [_fgged (_bdcg ,_aggga )];_dedfb {_eaefa =append (_eaefa ,_gbagc );};};if len (_eaefa )==0{continue ;
|
||
};_ccgc :=_bbbba (_eaefa );_egaa [_aggga ]=_ccgc ;if _gbead {_ae .Printf ("\u0020\u0020\u0020\u0025\u0032\u0064\u003a\u0020\u00256\u002e\u0032\u0066\u000a",_aggga ,_ccgc );};};return _egaa ;};
// textResult bundles a PageText with two int values.
// NOTE(review): the meaning of the two ints is not visible in this part of the file.
type textResult struct{_cgeb PageText ;_eefd int ;_cdbf int ;
|
||
};
// _efda first passes the path sections to _aeefg and then walks every
// subpath with at least two points, calling _dgde on each pair of
// consecutive points with the section's Color. The rulings for which _dgde
// reports success are collected and returned. Progress is logged when the
// _eceg flag is set.
func _efda (_ggdgea []pathSection )rulingList {_aeefg (_ggdgea );if _eceg {_fc .Log .Info ("\u006d\u0061k\u0065\u0053\u0074\u0072\u006f\u006b\u0065\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0064\u0020\u0073\u0074\u0072ok\u0065\u0073",len (_ggdgea ));
|
||
};var _fcccg rulingList ;for _ ,_dccf :=range _ggdgea {for _ ,_eebd :=range _dccf ._gbag {if len (_eebd ._acfg )< 2{continue ;};_bbde :=_eebd ._acfg [0];for _ ,_fecfg :=range _eebd ._acfg [1:]{if _efge ,_ddag :=_dgde (_bbde ,_fecfg ,_dccf .Color );_ddag {_fcccg =append (_fcccg ,_efge );
|
||
};_bbde =_fecfg ;};};};if _eceg {_fc .Log .Info ("m\u0061\u006b\u0065\u0053tr\u006fk\u0065\u0052\u0075\u006c\u0069n\u0067\u0073\u003a\u0020\u0025\u0073",_fcccg );};return _fcccg ;};
// paraList is a slice of textPara pointers.
type paraList []*textPara ;
// _fcea returns the keys of _eaadd sorted in increasing order.
func _fcea (_eaadd map[int ]intSet )[]int {_egdbb :=make ([]int ,0,len (_eaadd ));
|
||
for _bgfg :=range _eaadd {_egdbb =append (_egdbb ,_bgfg );};_a .Ints (_egdbb );return _egdbb ;};func (_eeff *Extractor )extractPageText (_dcd string ,_gdc *_aec .PdfPageResources ,_dbg _g .Matrix ,_caba int )(*PageText ,int ,int ,error ){_fc .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_caba );
|
||
_bcf :=&PageText {_babf :_eeff ._c ,_cegb :_eeff ._ga ,_dfgd :_eeff ._bd };_cc :=_gbeb (_eeff ._c );var _dab stateStack ;_eda :=_afgg (_eeff ,_gdc ,_ba .GraphicsState {},&_cc ,&_dab );_dea :=shapesState {_gdfbg :_dbg ,_affc :_g .IdentityMatrix (),_bbdg :_eda };
|
||
var _baa bool ;_gaf :=-1;if _caba > _bfa {_geg :=_d .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_caba ,_geg );
|
||
return _bcf ,_cc ._cgee ,_cc ._aece ,_geg ;};_gea :=_ba .NewContentStreamParser (_dcd );_fcf ,_dcg :=_gea .Parse ();if _dcg !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dcg );
|
||
return _bcf ,_cc ._cgee ,_cc ._aece ,_dcg ;};_bcf ._cafg =_fcf ;_agf :=_ba .NewContentStreamProcessor (*_fcf );_agf .AddHandler (_ba .HandlerConditionEnumAllOperands ,"",func (_ebbf *_ba .ContentStreamOperation ,_faa _ba .GraphicsState ,_edd *_aec .PdfPageResources )error {_dd :=_ebbf .Operand ;
|
||
if _ecab {_fc .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_ebbf );};switch _dd {case "\u0071":if _bcge {_fc .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_dea ._affc );};_dab .push (&_cc );case "\u0051":if !_dab .empty (){_cc =*_dab .pop ();
|
||
};_dea ._affc =_faa .CTM ;if _bcge {_fc .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_dea ._affc );};case "\u0042\u0044\u0043":_efa ,_bdg :=_bad .GetDict (_ebbf .Params [1]);if !_bdg {_fc .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0042D\u0043\u0020\u006f\u0070\u003d\u0025\u0073 \u0047\u0065\u0074\u0044\u0069\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ebbf );
|
||
return _dcg ;};_ffcf :=_efa .Get ("\u004d\u0043\u0049\u0044");if _ffcf !=nil {_cfd ,_fbd :=_bad .GetIntVal (_ffcf );if !_fbd {_fc .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0042\u0044C\u0020\u006f\u0070=\u0025\u0073\u002e\u0020\u0042\u0061\u0064\u0020\u006eum\u0065\u0072\u0069c\u0061\u006c \u006f\u0062\u006a\u0065\u0063\u0074.\u0020\u006f=\u0025\u0073",_ebbf ,_ffcf );
|
||
};_gaf =_cfd ;}else {_gaf =-1;};case "\u0045\u004d\u0043":_gaf =-1;case "\u0042\u0054":if _baa {_fc .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
||
_bcf ._dbfe =append (_bcf ._dbfe ,_eda ._fcee ...);};_baa =true ;_bab :=_faa ;_bab .CTM =_dbg .Mult (_bab .CTM );_eda =_afgg (_eeff ,_edd ,_bab ,&_cc ,&_dab );_dea ._bbdg =_eda ;case "\u0045\u0054":if !_baa {_fc .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
||
};_baa =false ;_bcf ._dbfe =append (_bcf ._dbfe ,_eda ._fcee ...);_eda .reset ();case "\u0054\u002a":_eda .nextLine ();case "\u0054\u0064":if _fcdd ,_fbdb :=_eda .checkOp (_ebbf ,2,true );!_fcdd {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fbdb );
|
||
return _fbdb ;};_gafa ,_bgd ,_dfg :=_fegba (_ebbf .Params );if _dfg !=nil {return _dfg ;};_eda .moveText (_gafa ,_bgd );case "\u0054\u0044":if _bgaf ,_gac :=_eda .checkOp (_ebbf ,2,true );!_bgaf {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gac );
|
||
return _gac ;};_dgge ,_ecg ,_cgc :=_fegba (_ebbf .Params );if _cgc !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cgc );return _cgc ;};_eda .moveTextSetLeading (_dgge ,_ecg );case "\u0054\u006a":if _fdb ,_afg :=_eda .checkOp (_ebbf ,1,true );
|
||
!_fdb {_fc .Log .Debug ("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076",_ebbf ,_afg );return _afg ;};_edgc :=_bad .TraceToDirectObject (_ebbf .Params [0]);_aae ,_gge :=_bad .GetStringBytes (_edgc );
|
||
if !_gge {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a\u0020T\u006a\u0020o\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074S\u0074\u0072\u0069\u006e\u0067\u0042\u0079\u0074\u0065\u0073\u0020\u0066a\u0069\u006c\u0065\u0064",_ebbf );return _bad .ErrTypeError ;
|
||
};return _eda .showText (_edgc ,_aae ,_gaf );case "\u0054\u004a":if _cafd ,_cgbc :=_eda .checkOp (_ebbf ,1,true );!_cafd {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cgbc );return _cgbc ;};
|
||
_gec ,_cff :=_bad .GetArray (_ebbf .Params [0]);if !_cff {_fc .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0054\u004a\u0020\u006f\u0070\u003d\u0025s\u0020G\u0065t\u0041r\u0072\u0061\u0079\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ebbf );
|
||
return _dcg ;};return _eda .showTextAdjusted (_gec ,_gaf );case "\u0027":if _ccf ,_bfg :=_eda .checkOp (_ebbf ,1,true );!_ccf {_fc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0027\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bfg );return _bfg ;};_baab :=_bad .TraceToDirectObject (_ebbf .Params [0]);
|
||
_dgf ,_fbeb :=_bad .GetStringBytes (_baab );if !_fbeb {_fc .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020'\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ebbf );return _bad .ErrTypeError ;
|
||
};_eda .nextLine ();return _eda .showText (_baab ,_dgf ,_gaf );case "\u0022":if _dgab ,_cfa :=_eda .checkOp (_ebbf ,3,true );!_dgab {_fc .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0022\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cfa );return _cfa ;
|
||
};_dff ,_afc ,_dcf :=_fegba (_ebbf .Params [:2]);if _dcf !=nil {return _dcf ;};_edgd :=_bad .TraceToDirectObject (_ebbf .Params [2]);_dcde ,_ggba :=_bad .GetStringBytes (_edgd );if !_ggba {_fc .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020\"\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ebbf );
|
||
return _bad .ErrTypeError ;};_eda .setCharSpacing (_dff );_eda .setWordSpacing (_afc );_eda .nextLine ();return _eda .showText (_edgd ,_dcde ,_gaf );case "\u0054\u004c":_bbb ,_ceed :=_cffd (_ebbf );if _ceed !=nil {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004c\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ceed );
|
||
return _ceed ;};_eda .setTextLeading (_bbb );case "\u0054\u0063":_bag ,_bcfg :=_cffd (_ebbf );if _bcfg !=nil {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0063\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bcfg );return _bcfg ;};_eda .setCharSpacing (_bag );
|
||
case "\u0054\u0066":if _aaed ,_afag :=_eda .checkOp (_ebbf ,2,true );!_aaed {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0066\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_afag );return _afag ;};_ccfc ,_bee :=_bad .GetNameVal (_ebbf .Params [0]);
|
||
if !_bee {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u004ea\u006d\u0065\u0056\u0061\u006c\u0020\u0066a\u0069\u006c\u0065\u0064",_ebbf );return _bad .ErrTypeError ;};_acg ,_bcd :=_bad .GetNumberAsFloat (_ebbf .Params [1]);
|
||
if !_bee {_fc .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u0046\u006c\u006f\u0061\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065d\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ebbf ,_bcd );
|
||
return _bcd ;};_bcd =_eda .setFont (_ccfc ,_acg );_eda ._bbcdf =_d .Is (_bcd ,_bad .ErrNotSupported );if _bcd !=nil &&!_eda ._bbcdf {return _bcd ;};case "\u0054\u006d":if _bfaa ,_fge :=_eda .checkOp (_ebbf ,6,true );!_bfaa {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u006d\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fge );
|
||
return _fge ;};_gbe ,_bdd :=_bad .GetNumbersAsFloat (_ebbf .Params );if _bdd !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bdd );return _bdd ;};_eda .setTextMatrix (_gbe );case "\u0054\u0072":if _gbc ,_dag :=_eda .checkOp (_ebbf ,1,true );
|
||
!_gbc {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0072\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dag );return _dag ;};_eddd ,_adf :=_bad .GetIntVal (_ebbf .Params [0]);if !_adf {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0072\u0020\u006f\u0070\u003d\u0025\u0073 \u0047e\u0074\u0049\u006e\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ebbf );
|
||
return _bad .ErrTypeError ;};_eda .setTextRenderMode (_eddd );case "\u0054\u0073":if _dagc ,_fdfa :=_eda .checkOp (_ebbf ,1,true );!_dagc {_fc .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0073\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fdfa );
|
||
return _fdfa ;};_fgg ,_dbd :=_bad .GetNumberAsFloat (_ebbf .Params [0]);if _dbd !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dbd );return _dbd ;};_eda .setTextRise (_fgg );case "\u0054\u0077":if _eccg ,_adec :=_eda .checkOp (_ebbf ,1,true );
|
||
!_eccg {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_adec );return _adec ;};_dfde ,_bgbe :=_bad .GetNumberAsFloat (_ebbf .Params [0]);if _bgbe !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bgbe );
|
||
return _bgbe ;};_eda .setWordSpacing (_dfde );case "\u0054\u007a":if _acga ,_gfdd :=_eda .checkOp (_ebbf ,1,true );!_acga {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gfdd );return _gfdd ;};_baad ,_eggfb :=_bad .GetNumberAsFloat (_ebbf .Params [0]);
|
||
if _eggfb !=nil {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_eggfb );return _eggfb ;};_eda .setHorizScaling (_baad );case "\u0063\u006d":_dea ._affc =_faa .CTM ;if _dea ._affc .Singular (){_aebe :=_g .IdentityMatrix ().Translate (_dea ._affc .Translation ());
|
||
_fc .Log .Debug ("S\u0069n\u0067\u0075\u006c\u0061\u0072\u0020\u0063\u0074m\u003d\u0025\u0073\u2192%s",_dea ._affc ,_aebe );_dea ._affc =_aebe ;};if _bcge {_fc .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_dea ._affc );};case "\u006d":if len (_ebbf .Params )!=2{_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006d\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_ac );
|
||
return nil ;};_ebg ,_gcaf :=_bad .GetNumbersAsFloat (_ebbf .Params );if _gcaf !=nil {return _gcaf ;};_dea .moveTo (_ebg [0],_ebg [1]);case "\u006c":if len (_ebbf .Params )!=2{_fc .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006c\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_ac );
|
||
return nil ;};_ace ,_fea :=_bad .GetNumbersAsFloat (_ebbf .Params );if _fea !=nil {return _fea ;};_dea .lineTo (_ace [0],_ace [1]);case "\u0063":if len (_ebbf .Params )!=6{return _ac ;};_ccg ,_bged :=_bad .GetNumbersAsFloat (_ebbf .Params );if _bged !=nil {return _bged ;
|
||
};_fc .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_ccg );_dea .cubicTo (_ccg [0],_ccg [1],_ccg [2],_ccg [3],_ccg [4],_ccg [5]);case "\u0076","\u0079":if len (_ebbf .Params )!=4{return _ac ;
|
||
};_efb ,_dcb :=_bad .GetNumbersAsFloat (_ebbf .Params );if _dcb !=nil {return _dcb ;};_fc .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_efb );_dea .quadraticTo (_efb [0],_efb [1],_efb [2],_efb [3]);
|
||
case "\u0068":_dea .closePath ();case "\u0072\u0065":if len (_ebbf .Params )!=4{return _ac ;};_ded ,_cba :=_bad .GetNumbersAsFloat (_ebbf .Params );if _cba !=nil {return _cba ;};_dea .drawRectangle (_ded [0],_ded [1],_ded [2],_ded [3]);_dea .closePath ();
|
||
case "\u0053":_dea .stroke (&_bcf ._afgd );_dea .clearPath ();case "\u0073":_dea .closePath ();_dea .stroke (&_bcf ._afgd );_dea .clearPath ();case "\u0046":_dea .fill (&_bcf ._ddae );_dea .clearPath ();case "\u0066","\u0066\u002a":_dea .closePath ();_dea .fill (&_bcf ._ddae );
|
||
_dea .clearPath ();case "\u0042","\u0042\u002a":_dea .fill (&_bcf ._ddae );_dea .stroke (&_bcf ._afgd );_dea .clearPath ();case "\u0062","\u0062\u002a":_dea .closePath ();_dea .fill (&_bcf ._ddae );_dea .stroke (&_bcf ._afgd );_dea .clearPath ();case "\u006e":_dea .clearPath ();
|
||
case "\u0044\u006f":if len (_ebbf .Params )==0{_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0078\u0070\u0065\u0063\u0074\u0065\u0064\u0020\u0058\u004fbj\u0065c\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006f\u0070\u0065\u0072\u0061n\u0064\u0020\u0066\u006f\u0072\u0020\u0044\u006f\u0020\u006f\u0070\u0065\u0072\u0061\u0074\u006f\u0072.\u0020\u0047\u006f\u0074\u0020\u0025\u002b\u0076\u002e",_ebbf .Params );
|
||
return _bad .ErrRangeError ;};_afde ,_acgg :=_bad .GetName (_ebbf .Params [0]);if !_acgg {_fc .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u0044\u006f\u0020\u006f\u0070e\u0072a\u0074\u006f\u0072\u0020\u0058\u004f\u0062\u006a\u0065\u0063\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006fp\u0065\u0072\u0061\u006e\u0064\u003a\u0020\u0025\u002b\u0076\u002e",_ebbf .Params [0]);
|
||
return _bad .ErrTypeError ;};_ ,_feb :=_edd .GetXObjectByName (*_afde );if _feb !=_aec .XObjectTypeForm {break ;};_adgc ,_acgg :=_eeff ._gfe [_afde .String ()];if !_acgg {_dbb ,_aebb :=_edd .GetXObjectFormByName (*_afde );if _aebb !=nil {_fc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_aebb );
|
||
return _aebb ;};_fcac ,_aebb :=_dbb .GetContentStream ();if _aebb !=nil {_fc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_aebb );return _aebb ;};_gff :=_dbb .Resources ;if _gff ==nil {_gff =_edd ;};_fae :=_faa .CTM ;if _dfdc ,_dgag :=_bad .GetArray (_dbb .Matrix );
|
||
_dgag {_ebc ,_gdd :=_dfdc .GetAsFloat64Slice ();if _gdd !=nil {return _gdd ;};if len (_ebc )!=6{return _ac ;};_gbcd :=_g .NewMatrix (_ebc [0],_ebc [1],_ebc [2],_ebc [3],_ebc [4],_ebc [5]);_fae =_faa .CTM .Mult (_gbcd );};_dedc ,_ege ,_bgfc ,_aebb :=_eeff .extractPageText (string (_fcac ),_gff ,_dbg .Mult (_fae ),_caba +1);
|
||
if _aebb !=nil {_fc .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_aebb );return _aebb ;};_adgc =textResult {*_dedc ,_ege ,_bgfc };_eeff ._gfe [_afde .String ()]=_adgc ;};_dea ._affc =_faa .CTM ;if _bcge {_fc .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_dea ._affc );
|
||
};_bcf ._dbfe =append (_bcf ._dbfe ,_adgc ._cgeb ._dbfe ...);_bcf ._afgd =append (_bcf ._afgd ,_adgc ._cgeb ._afgd ...);_bcf ._ddae =append (_bcf ._ddae ,_adgc ._cgeb ._ddae ...);_cc ._cgee +=_adgc ._eefd ;_cc ._aece +=_adgc ._cdbf ;case "\u0072\u0067","\u0067","\u006b","\u0063\u0073","\u0073\u0063","\u0073\u0063\u006e":_eda ._ggd .ColorspaceNonStroking =_faa .ColorspaceNonStroking ;
|
||
_eda ._ggd .ColorNonStroking =_faa .ColorNonStroking ;case "\u0052\u0047","\u0047","\u004b","\u0043\u0053","\u0053\u0043","\u0053\u0043\u004e":_eda ._ggd .ColorspaceStroking =_faa .ColorspaceStroking ;_eda ._ggd .ColorStroking =_faa .ColorStroking ;};return nil ;
|
||
});_dcg =_agf .Process (_gdc );return _bcf ,_cc ._cgee ,_cc ._aece ,_dcg ;};type imageExtractContext struct{_bff []ImageMark ;_aa int ;_cdb int ;_bgb int ;_cf map[*_bad .PdfObjectStream ]*cachedImage ;_gfc *ImageExtractOptions ;_ece bool ;};
|
||
|
||
// Text returns the extracted page text.
|
||
func (_fcgaf PageText )Text ()string {return _fcgaf ._eede };func _gffa (_edac []TextMark ,_fafda *int )[]TextMark {_aacgc :=_edac [len (_edac )-1];_aggf :=[]rune (_aacgc .Text );if len (_aggf )==1{_edac =_edac [:len (_edac )-1];_fgbd :=_edac [len (_edac )-1];
|
||
*_fafda =_fgbd .Offset +len (_fgbd .Text );}else {_gddc :=_gegca (_aacgc .Text );*_fafda +=len (_gddc )-len (_aacgc .Text );_aacgc .Text =_gddc ;};return _edac ;};func _dbeg (_ffabf map[int ][]float64 )string {_caabb :=_gged (_ffabf );_adfg :=make ([]string ,len (_ffabf ));
|
||
for _gegec ,_edceb :=range _caabb {_adfg [_gegec ]=_ae .Sprintf ("\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066",_edceb ,_ffabf [_edceb ]);};return _ae .Sprintf ("\u007b\u0025\u0073\u007d",_bb .Join (_adfg ,"\u002c\u0020"));};func (_gedcf rulingList )log (_acgcf string ){if !_eceg {return ;
|
||
};_fc .Log .Info ("\u0023\u0023\u0023\u0020\u0025\u0031\u0030\u0073\u003a\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_acgcf ,_gedcf .String ());for _fcef ,_fcbc :=range _gedcf {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fcef ,_fcbc .String ());
|
||
};};func (_cggee *textTable )putComposite (_ccebe ,_decbf int ,_efdgde paraList ,_dacd _aec .PdfRectangle ){if len (_efdgde )==0{_fc .Log .Error ("\u0074\u0065xt\u0054\u0061\u0062l\u0065\u0029\u0020\u0070utC\u006fmp\u006f\u0073\u0069\u0074\u0065\u003a\u0020em\u0070\u0074\u0079\u0020\u0070\u0061\u0072a\u0073");
|
||
return ;};_cdbabb :=compositeCell {PdfRectangle :_dacd ,paraList :_efdgde };if _gbead {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0070\u0075\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u003c\u002d\u0025\u0073\u000a",_ccebe ,_decbf ,_cdbabb .String ());
|
||
};_cdbabb .updateBBox ();_cggee ._gaeb [_fgged (_ccebe ,_decbf )]=_cdbabb ;};func (_cbcf *textTable )toTextTable ()TextTable {if _gbead {_fc .Log .Info ("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064",_cbcf ._afcga ,_cbcf ._agdc );
|
||
};_gecff :=make ([][]TableCell ,_cbcf ._agdc );for _beece :=0;_beece < _cbcf ._agdc ;_beece ++{_gecff [_beece ]=make ([]TableCell ,_cbcf ._afcga );for _ecfdd :=0;_ecfdd < _cbcf ._afcga ;_ecfdd ++{_dfad :=_cbcf .get (_ecfdd ,_beece );if _dfad ==nil {continue ;
|
||
};if _gbead {_ae .Printf ("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_ecfdd ,_beece ,_dfad );};_gecff [_beece ][_ecfdd ].Text =_dfad .text ();_fgfb :=0;_gecff [_beece ][_ecfdd ].Marks ._bade =_dfad .toTextMarks (&_fgfb );};};
|
||
_fegeg :=TextTable {W :_cbcf ._afcga ,H :_cbcf ._agdc ,Cells :_gecff };_fegeg .PdfRectangle =_cbcf .bbox ();return _fegeg ;};func (_cagg paraList )yNeighbours (_geaea float64 )map[*textPara ][]int {_aceg :=make ([]event ,2*len (_cagg ));if _geaea ==0{for _affce ,_eefeb :=range _cagg {_aceg [2*_affce ]=event {_eefeb .Lly ,true ,_affce };
|
||
_aceg [2*_affce +1]=event {_eefeb .Ury ,false ,_affce };};}else {for _effg ,_cdggd :=range _cagg {_aceg [2*_effg ]=event {_cdggd .Lly -_geaea *_cdggd .fontsize (),true ,_effg };_aceg [2*_effg +1]=event {_cdggd .Ury +_geaea *_cdggd .fontsize (),false ,_effg };
|
||
};};return _cagg .eventNeighbours (_aceg );};func _eagc (_ffgf *textLine ,_adabc []*textLine ,_eccga []float64 ,_fafe ,_ecaef float64 )[]*textLine {_bacd :=[]*textLine {};for _ ,_ggfb :=range _adabc {if _ggfb ._gaca >=_fafe {if _ecaef !=-1&&_ggfb ._gaca < _ecaef {if _ggfb .text ()!=_ffgf .text (){if _ea .Round (_ggfb .Llx )< _ea .Round (_ffgf .Llx ){break ;
|
||
};_bacd =append (_bacd ,_ggfb );};}else if _ecaef ==-1{if _ggfb ._gaca ==_ffgf ._gaca {if _ggfb .text ()!=_ffgf .text (){_bacd =append (_bacd ,_ggfb );};continue ;};_gcce :=_faadf (_ffgf ,_adabc ,_eccga );if _gcce !=-1&&_ggfb ._gaca <=_gcce {_bacd =append (_bacd ,_ggfb );
|
||
};};};};return _bacd ;};func (_aege *wordBag )depthRange (_aadb ,_gbbc int )[]int {var _abbg []int ;for _fbae :=range _aege ._gbbd {if _aadb <=_fbae &&_fbae <=_gbbc {_abbg =append (_abbg ,_fbae );};};if len (_abbg )==0{return nil ;};_a .Ints (_abbg );return _abbg ;
|
||
};func (_dcaf *wordBag )firstReadingIndex (_abbd int )int {_fafd :=_dcaf .firstWord (_abbd )._aeegf ;_ccbb :=float64 (_abbd +1)*_fdedc ;_ffdaf :=_ccbb +_dgce *_fafd ;_aagfd :=_abbd ;for _ ,_cdf :=range _dcaf .depthBand (_ccbb ,_ffdaf ){if _bcea (_dcaf .firstWord (_cdf ),_dcaf .firstWord (_aagfd ))< 0{_aagfd =_cdf ;
|
||
};};return _aagfd ;};func (_bcadd paraList )tables ()[]TextTable {var _adfdc []TextTable ;if _gbead {_fc .Log .Info ("\u0070\u0061\u0072\u0061\u0073\u002e\u0074\u0061\u0062\u006c\u0065\u0073\u003a");};for _ ,_dcdc :=range _bcadd {_cfdf :=_dcdc ._caaa ;
|
||
if _cfdf !=nil &&_cfdf .isExportable (){_adfdc =append (_adfdc ,_cfdf .toTextTable ());};};return _adfdc ;};type textObject struct{_eegd *Extractor ;_bcg *_aec .PdfPageResources ;_ggd _ba .GraphicsState ;_cbgd *textState ;_aed *stateStack ;_acc _g .Matrix ;
|
||
_dde _g .Matrix ;_fcee []*textMark ;_bbcdf bool ;};func (_begcf *textTable )newTablePara ()*textPara {_eccb :=_begcf .computeBbox ();_gcagb :=&textPara {PdfRectangle :_eccb ,_bbbc :_eccb ,_caaa :_begcf };if _gbead {_fc .Log .Info ("\u006e\u0065w\u0054\u0061\u0062l\u0065\u0050\u0061\u0072\u0061\u003a\u0020\u0025\u0073",_gcagb );
|
||
};return _gcagb ;};func (_afdee *textPara )getListLines ()[]*textLine {var _bced []*textLine ;_cgbf :=_dbgcg (_afdee ._bfagf );for _ ,_bdac :=range _afdee ._bfagf {_gaaa :=_bdac ._bfag [0]._eedc [0];if _bdce (_gaaa ){_bced =append (_bced ,_bdac );};};_bced =append (_bced ,_cgbf ...);
|
||
return _bced ;};func _cbabg (_cgeef []TextMark ,_ecdfd *TextTable )[]TextMark {var _ccagc []TextMark ;for _ ,_bccdf :=range _cgeef {_bccdf ._adgb =true ;_bccdf ._dfeb =_ecdfd ;_ccagc =append (_ccagc ,_bccdf );};return _ccagc ;};type intSet map[int ]struct{};
|
||
func (_bbca gridTile )numBorders ()int {_deba :=0;if _bbca ._cbfd {_deba ++;};if _bbca ._cbdbf {_deba ++;};if _bbca ._fcgc {_deba ++;};if _bbca ._fbbf {_deba ++;};return _deba ;};func _dfdf (_adfd ,_dafb _aec .PdfRectangle )bool {return _adfd .Llx <=_dafb .Llx &&_dafb .Urx <=_adfd .Urx &&_adfd .Lly <=_dafb .Lly &&_dafb .Ury <=_adfd .Ury ;
|
||
};
|
||
|
||
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
||
//
|
||
// Replace with a function like Extract() (*PageText, error)
|
||
func (_bbf *Extractor )ExtractPageText ()(*PageText ,int ,int ,error ){_daa ,_aeb ,_fcg ,_gda :=_bbf .extractPageText (_bbf ._ffd ,_bbf ._gf ,_g .IdentityMatrix (),0);if _gda !=nil &&_gda !=_aec .ErrColorOutOfRange {return nil ,0,0,_gda ;};if _bbf ._fg !=nil {_daa ._cdgg ._cgff =_bbf ._fg .UseSimplerExtractionProcess ;
|
||
};_daa .computeViews ();_gda =_cgfcc (_daa );if _gda !=nil {return nil ,0,0,_gda ;};if _bbf ._fg !=nil {if _bbf ._fg .ApplyCropBox &&_bbf ._af !=nil {_daa .ApplyArea (*_bbf ._af );};_daa ._cdgg ._cdec =_bbf ._fg .DisableDocumentTags ;};return _daa ,_aeb ,_fcg ,nil ;
|
||
};func (_bcac *shapesState )cubicTo (_fac ,_acea ,_dcdd ,_aeace ,_ede ,_ggcb float64 ){if _bcge {_fc .Log .Info ("\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a");};_bcac .addPoint (_ede ,_ggcb );};
|
||
|
||
// TextMarkArray is a collection of TextMarks.
|
||
type TextMarkArray struct{_bade []TextMark };func (_daedfd rulingList )primaries ()[]float64 {_cedeb :=make (map[float64 ]struct{},len (_daedfd ));for _ ,_bbgb :=range _daedfd {_cedeb [_bbgb ._gbgc ]=struct{}{};};_dddd :=make ([]float64 ,len (_cedeb ));
|
||
_cbeb :=0;for _cdcb :=range _cedeb {_dddd [_cbeb ]=_cdcb ;_cbeb ++;};_a .Float64s (_dddd );return _dddd ;};type textTable struct{_aec .PdfRectangle ;_afcga ,_agdc int ;_fbccb bool ;_bfdff map[uint64 ]*textPara ;_gaeb map[uint64 ]compositeCell ;};func (_fceg *subpath )clear (){*_fceg =subpath {}};
|
||
func (_bdebd rulingList )tidied (_dadd string )rulingList {_dgca :=_bdebd .removeDuplicates ();_dgca .log ("\u0075n\u0069\u0071\u0075\u0065\u0073");_ccd :=_dgca .snapToGroups ();if _ccd ==nil {return nil ;};_ccd .sort ();if _eceg {_fc .Log .Info ("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064",_dadd ,len (_bdebd ),len (_dgca ),len (_ccd ));
|
||
};_ccd .log ("\u0063o\u0061\u006c\u0065\u0073\u0063\u0065d");return _ccd ;};func _eabc (_bcgg string )(string ,bool ){_eefde :=[]rune (_bcgg );if len (_eefde )!=1{return "",false ;};_dgaac ,_fgcgb :=_abec [_eefde [0]];return _dgaac ,_fgcgb ;};func _ggb (_dc []Font ,_gfd string )bool {for _ ,_bfe :=range _dc {if _bfe .FontName ==_gfd {return true ;
|
||
};};return false ;};
|
||
|
||
// ToText returns the page text as a single string.
|
||
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
||
// Text() instead.
|
||
func (_bdgc PageText )ToText ()string {return _bdgc .Text ()};
|
||
|
||
// String returns a description of `k`.
|
||
func (_dgaec markKind )String ()string {_bbgdg ,_dfc :=_debga [_dgaec ];if !_dfc {return _ae .Sprintf ("\u004e\u006f\u0074\u0020\u0061\u0020\u006d\u0061\u0072k\u003a\u0020\u0025\u0064",_dgaec );};return _bbgdg ;};func (_ccge paraList )merge ()*textPara {_fc .Log .Trace ("\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_ccge ));
|
||
if len (_ccge )==0{return nil ;};_ccge .sortReadingOrder ();_ffcg :=_ccge [0].PdfRectangle ;_gggf :=_ccge [0]._bfagf ;for _ ,_cfgdf :=range _ccge [1:]{_ffcg =_agfb (_ffcg ,_cfgdf .PdfRectangle );_gggf =append (_gggf ,_cfgdf ._bfagf ...);};return _geaa (_ffcg ,_gggf );
|
||
};func _aeced (_dagg *list ,_ffcc *string )string {_dcdb :=_bb .Split (_dagg ._begg ,"\u000a");_fadbc :=&_bb .Builder {};for _ ,_ddbd :=range _dcdb {if _ddbd !=""{_fadbc .WriteString (*_ffcc );_fadbc .WriteString (_ddbd );_fadbc .WriteString ("\u000a");
|
||
};};return _fadbc .String ();};func (_gaga *textObject )moveLP (_bgaa ,_aee float64 ){_gaga ._dde .Concat (_g .NewMatrix (1,0,0,1,_bgaa ,_aee ));_gaga ._acc =_gaga ._dde ;};func _dbgcg (_bfae []*textLine )[]*textLine {_gae :=[]*textLine {};for _ ,_gcba :=range _bfae {_fafbg :=_gcba .text ();
|
||
_fccf :=_gdfd .Find ([]byte (_fafbg ));if _fccf !=nil {_gae =append (_gae ,_gcba );};};return _gae ;};func (_bgag paraList )log (_ggda string ){if !_beee {return ;};_fc .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_ggda ,len (_bgag ));
|
||
for _dacf ,_eadag :=range _bgag {if _eadag ==nil {continue ;};_caac :=_eadag .text ();_bbba :="\u0020\u0020";if _eadag ._caaa !=nil {_bbba =_ae .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_eadag ._caaa ._afcga ,_eadag ._caaa ._agdc );};_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_dacf ,_eadag .PdfRectangle ,_bbba ,_bgfd (_caac ,50));
|
||
};};type textState struct{_cga float64 ;_ecd float64 ;_aafe float64 ;_gbff float64 ;_dgad float64 ;_ffgba RenderMode ;_egc float64 ;_cbad *_aec .PdfFont ;_edfa _aec .PdfRectangle ;_cgee int ;_aece int ;};func (_eea *textObject )nextLine (){_eea .moveLP (0,-_eea ._cbgd ._gbff )};
|
||
const (_acgbb =false ;_baf =false ;_ecab =false ;_egdce =false ;_bcge =false ;_eged =false ;_fedc =false ;_beee =false ;_fbbd =false ;_ecda =_fbbd &&true ;_baag =_ecda &&false ;_gabga =_fbbd &&true ;_gbead =false ;_efcc =_gbead &&false ;_caee =_gbead &&true ;
|
||
_eceg =false ;_ggbbe =_eceg &&false ;_geda =_eceg &&false ;_ddb =_eceg &&true ;_bdf =_eceg &&false ;_dfbb =_eceg &&false ;);func (_cffge *textWord )toTextMarks (_cdffa *int )[]TextMark {var _daeffc []TextMark ;for _ ,_dbeca :=range _cffge ._ebfa {_daeffc =_fccge (_daeffc ,_cdffa ,_dbeca .ToTextMark ());
|
||
};return _daeffc ;};func (_dcdg *textLine )pullWord (_abdbf *wordBag ,_ecaca *textWord ,_dfgdb int ){_dcdg .appendWord (_ecaca );_abdbf .removeWord (_ecaca ,_dfgdb );};
|
||
|
||
// GetContentStreamOps returns the contentStreamOps field of `pt`.
|
||
func (_cbce *PageText )GetContentStreamOps ()*_ba .ContentStreamOperations {return _cbce ._cafg };func (_ggef *textPara )text ()string {_gdga :=new (_fe .Buffer );_ggef .writeText (_gdga );return _gdga .String ();};
|
||
|
||
// List returns all the list objects detected on the page.
|
||
// It detects all the bullet point Lists from a given pdf page and builds a slice of bullet list objects.
|
||
// A given bullet list object has a tree structure.
|
||
// Each bullet point list is extracted with the text content it contains and all the sub lists found under it as children in the tree.
|
||
// The rest content of the pdf is ignored and only text in the bullet point lists are extracted.
|
||
// The list extraction is done in two ways.
|
||
// 1. If the document is tagged then the lists are extracted using the tags provided in the document.
|
||
// 2. Otherwise the bullet lists are extracted from the raw text using regex matching.
|
||
// By default the document tag is used if available.
|
||
// However this can be disabled using `DisableDocumentTags` in the `Options` object.
|
||
// Sometimes disabling document tags option might give a better bullet list extraction if the document was tagged incorrectly.
|
||
//
|
||
// options := &Options{
|
||
// DisableDocumentTags: false, // this means use document tag if available
|
||
// }
|
||
// ex, err := NewWithOptions(page, options)
|
||
// // handle error
|
||
// pageText, _, _, err := ex.ExtractPageText()
|
||
// // handle error
|
||
// lists := pageText.List()
|
||
// txt := lists.Text()
|
||
func (_cabcb PageText )List ()lists {_fgcg :=!_cabcb ._cdgg ._cdec ;_ddcdg :=_cabcb .getParagraphs ();_abfd :=true ;if _cabcb ._cegb ==nil ||*_cabcb ._cegb ==nil {_abfd =false ;};_ggdb :=_ddcdg .list ();if _abfd &&_fgcg {_bbdb :=_fbbg (&_ddcdg );_fgedb :=&structTreeRoot {};
|
||
_fgedb .parseStructTreeRoot (*_cabcb ._cegb );if _fgedb ._bagdg ==nil {_fc .Log .Debug ("\u004c\u0069\u0073\u0074\u003a\u0020\u0073t\u0072\u0075\u0063\u0074\u0054\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0064\u006f\u0065\u0073\u006e'\u0074\u0020\u0068\u0061\u0076e\u0020\u0061\u006e\u0079\u0020\u0063\u006f\u006e\u0074e\u006e\u0074\u002c\u0020\u0075\u0073\u0069\u006e\u0067\u0020\u0074\u0065\u0078\u0074\u0020\u006d\u0061\u0074\u0063\u0068\u0069\u006e\u0067\u0020\u006d\u0065\u0074\u0068\u006f\u0064\u0020\u0069\u006e\u0073\u0074\u0065\u0061\u0064\u002e");
|
||
return _ggdb ;};_ggdb =_fgedb .buildList (_bbdb ,_cabcb ._dfgd );};return _ggdb ;};func (_bfcfg *textWord )computeText ()string {_cebaa :=make ([]string ,len (_bfcfg ._ebfa ));for _baed ,_ccdb :=range _bfcfg ._ebfa {_cebaa [_baed ]=_ccdb ._gded ;};return _bb .Join (_cebaa ,"");
|
||
};
|
||
|
||
// String returns a description of `l`.
|
||
func (_cgde *textLine )String ()string {return _ae .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_cgde ._gaca ,_cgde .PdfRectangle ,_cgde ._fgcb ,_cgde .text ());
|
||
};func _afgc (_baada []float64 ,_eegge ,_ebff float64 )[]float64 {_ebfdg ,_gbac :=_eegge ,_ebff ;if _gbac < _ebfdg {_ebfdg ,_gbac =_gbac ,_ebfdg ;};_cefa :=make ([]float64 ,0,len (_baada )+2);_cefa =append (_cefa ,_eegge );for _ ,_gfdaa :=range _baada {if _gfdaa <=_ebfdg {continue ;
|
||
}else if _gfdaa >=_gbac {break ;};_cefa =append (_cefa ,_gfdaa );};_cefa =append (_cefa ,_ebff );return _cefa ;};func (_bbbbg paraList )list ()[]*list {var _egaf []*textLine ;var _faae []*textLine ;for _ ,_dccca :=range _bbbbg {_feaf :=_dccca .getListLines ();
|
||
_egaf =append (_egaf ,_feaf ...);_faae =append (_faae ,_dccca ._bfagf ...);};_dafab :=_acfc (_egaf );_acde :=_dbdad (_faae ,_dafab );return _acde ;};func _cecg (_dcfc *wordBag ,_gbef *textWord ,_fffg float64 )bool {return _dcfc .Urx <=_gbef .Llx &&_gbef .Llx < _dcfc .Urx +_fffg ;
|
||
};func (_cccc *textPara )toCellTextMarks (_fcfdf *int )[]TextMark {var _cdbe []TextMark ;for _ffcab ,_fagf :=range _cccc ._bfagf {_ddbfg :=_fagf .toTextMarks (_fcfdf );_cfbd :=_bfab &&_fagf .endsInHyphen ()&&_ffcab !=len (_cccc ._bfagf )-1;if _cfbd {_ddbfg =_gffa (_ddbfg ,_fcfdf );
|
||
};_cdbe =append (_cdbe ,_ddbfg ...);if !(_cfbd ||_ffcab ==len (_cccc ._bfagf )-1){_cdbe =_ggcce (_cdbe ,_fcfdf ,_ecaa (_fagf ._gaca ,_cccc ._bfagf [_ffcab +1]._gaca ));};};return _cdbe ;};func (_eca *textObject )setTextRise (_gdf float64 ){if _eca ==nil {return ;
|
||
};_eca ._cbgd ._egc =_gdf ;};
|
||
|
||
// String returns a string describing `pt`.
|
||
func (_fde PageText )String ()string {_dbdd :=_ae .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_fde ._dbfe ));_efdd :=[]string {"\u002d"+_dbdd };for _ ,_gfcf :=range _fde ._dbfe {_efdd =append (_efdd ,_gfcf .String ());
|
||
};_efdd =append (_efdd ,"\u002b"+_dbdd );return _bb .Join (_efdd ,"\u000a");};func _affg (_gfbd map[float64 ]gridTile )[]float64 {_fega :=make ([]float64 ,0,len (_gfbd ));for _defd :=range _gfbd {_fega =append (_fega ,_defd );};_a .Float64s (_fega );return _fega ;
|
||
};
|
||
|
||
// Tables returns the tables extracted from the page.
|
||
func (_gbb PageText )Tables ()[]TextTable {if _gbead {_fc .Log .Info ("\u0054\u0061\u0062\u006c\u0065\u0073\u003a\u0020\u0025\u0064",len (_gbb ._gba ));};return _gbb ._gba ;};func _bae (_ecee ,_bfbe _aec .PdfRectangle )bool {return _bfbe .Llx <=_ecee .Urx &&_ecee .Llx <=_bfbe .Urx ;
|
||
};func (_ebcf *wordBag )text ()string {_gacdg :=_ebcf .allWords ();_fdea :=make ([]string ,len (_gacdg ));for _gcag ,_abdd :=range _gacdg {_fdea [_gcag ]=_abdd ._eedc ;};return _bb .Join (_fdea ,"\u0020");};func _bbcb (_ddcd _g .Point )*subpath {return &subpath {_acfg :[]_g .Point {_ddcd }}};
|
||
func _dfca (_fcgae string )bool {for _ ,_cdgdd :=range _fcgae {if !_be .IsSpace (_cdgdd ){return false ;};};return true ;};var (_ad =_d .New ("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072");_ac =_d .New ("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072");
|
||
);func (_fagc *wordBag )firstWord (_geef int )*textWord {return _fagc ._gbbd [_geef ][0]};func (_bbcc *stateStack )empty ()bool {return len (*_bbcc )==0};func (_acbd compositeCell )parasBBox ()(paraList ,_aec .PdfRectangle ){return _acbd .paraList ,_acbd .PdfRectangle ;
|
||
};
|
||
|
||
// Append appends `mark` to the mark array.
|
||
func (_abg *TextMarkArray )Append (mark TextMark ){_abg ._bade =append (_abg ._bade ,mark )};type structTreeRoot struct{_bagdg []structElement ;_gage string ;};func (_aeac *textObject )setHorizScaling (_fcdf float64 ){if _aeac ==nil {return ;};_aeac ._cbgd ._aafe =_fcdf ;
|
||
};func (_ecfb *wordBag )pullWord (_gafg *textWord ,_cbaa int ,_dcad map[int ]map[*textWord ]struct{}){_ecfb .PdfRectangle =_agfb (_ecfb .PdfRectangle ,_gafg .PdfRectangle );if _gafg ._aeegf > _ecfb ._aad {_ecfb ._aad =_gafg ._aeegf ;};_ecfb ._gbbd [_cbaa ]=append (_ecfb ._gbbd [_cbaa ],_gafg );
|
||
_dcad [_cbaa ][_gafg ]=struct{}{};};func (_eegg *TextMarkArray )getTextMarkAtOffset (_dfb int )*TextMark {for _ ,_bca :=range _eegg ._bade {if _bca .Offset ==_dfb {return &_bca ;};};return nil ;};func _aadg (_bdbg []*textLine ,_cffe string )string {var _fbf _bb .Builder ;
|
||
_aagag :=0.0;for _dgdd ,_ccgd :=range _bdbg {_eebb :=_ccgd .text ();_fccff :=_ccgd ._gaca ;if _dgdd < len (_bdbg )-1{_aagag =_bdbg [_dgdd +1]._gaca ;}else {_aagag =0.0;};_fbf .WriteString (_cffe );_fbf .WriteString (_eebb );if _aagag !=_fccff {_fbf .WriteString ("\u000a");
|
||
}else {_fbf .WriteString ("\u0020");};};return _fbf .String ();};
|
||
|
||
// String returns a string descibing `i`.
|
||
func (_efec gridTile )String ()string {_acfa :=func (_aeba bool ,_cgdd string )string {if _aeba {return _cgdd ;};return "\u005f";};return _ae .Sprintf ("\u00256\u002e2\u0066\u0020\u0025\u0031\u0073%\u0031\u0073%\u0031\u0073\u0025\u0031\u0073",_efec .PdfRectangle ,_acfa (_efec ._cbfd ,"\u004c"),_acfa (_efec ._cbdbf ,"\u0052"),_acfa (_efec ._fcgc ,"\u0042"),_acfa (_efec ._fbbf ,"\u0054"));
|
||
};func (_bbab *subpath )isQuadrilateral ()bool {if len (_bbab ._acfg )< 4||len (_bbab ._acfg )> 5{return false ;};if len (_bbab ._acfg )==5{_ddfc :=_bbab ._acfg [0];_bdff :=_bbab ._acfg [4];if _ddfc .X !=_bdff .X ||_ddfc .Y !=_bdff .Y {return false ;};
|
||
};return true ;};func (_abgc *shapesState )fill (_cdgb *[]pathSection ){_cdabf :=pathSection {_gbag :_abgc ._edc ,Color :_abgc ._bbdg .getFillColor ()};*_cdgb =append (*_cdgb ,_cdabf );if _eceg {_dagb :=_cdabf .bbox ();_ae .Printf ("\u0020 \u0020\u0020\u0046\u0049\u004c\u004c\u003a %\u0032\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006ee\u0077\u0029 \u0073\u0073\u003d%\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d\u0025\u0033\u0076\u0020\u0025\u0036\u002e\u0032f\u003d\u00256.\u0032\u0066\u0078%\u0036\u002e\u0032\u0066\u000a",len (*_cdgb ),len (_cdabf ._gbag ),_abgc ,_cdabf .Color ,_dagb ,_dagb .Width (),_dagb .Height ());
|
||
if _ggbbe {for _aefb ,_daeb :=range _cdabf ._gbag {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_aefb ,_daeb );if _aefb ==10{break ;};};};};};func _ddcae (_bbgfa _bad .PdfObject ,_cdced _eg .Color )(_ec .Image ,error ){_ccbg ,_aedfd :=_bad .GetStream (_bbgfa );
|
||
if !_aedfd {return nil ,nil ;};_dade ,_dfcb :=_aec .NewXObjectImageFromStream (_ccbg );if _dfcb !=nil {return nil ,_dfcb ;};_bbcec ,_dfcb :=_dade .ToImage ();if _dfcb !=nil {return nil ,_dfcb ;};return _ffeed (_bbcec ,_cdced ),nil ;};
|
||
|
||
// TextMark represents extracted text on a page with information regarding both textual content,
|
||
// formatting (font and size) and positioning.
|
||
// It is the smallest unit of text on a PDF page, typically a single character.
|
||
//
|
||
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
||
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
||
// `bbox` of substring `term` in `text`.
|
||
//
|
||
// ex, _ := New(page)
|
||
// // handle errors
|
||
// pageText, _, _, err := ex.ExtractPageText()
|
||
// // handle errors
|
||
// text := pageText.Text()
|
||
// textMarks := pageText.Marks()
|
||
//
|
||
// start := strings.Index(text, term)
|
||
// end := start + len(term)
|
||
// spanMarks, err := textMarks.RangeOffset(start, end)
|
||
// // handle errors
|
||
// bbox, ok := spanMarks.BBox()
|
||
// // handle errors
|
||
type TextMark struct{
|
||
|
||
// Text is the extracted text.
|
||
Text string ;
|
||
|
||
// Original is the text in the PDF. It has not been decoded like `Text`.
|
||
Original string ;
|
||
|
||
// BBox is the bounding box of the text.
|
||
BBox _aec .PdfRectangle ;
|
||
|
||
// Font is the font the text was drawn with.
|
||
Font *_aec .PdfFont ;
|
||
|
||
// FontSize is the font size the text was drawn with.
|
||
FontSize float64 ;
|
||
|
||
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
||
// text, textMarks := pageText.Text(), pageText.Marks()
|
||
// marks := textMarks.Elements()
|
||
// then marks[i].Offset is the offset of marks[i].Text in text.
|
||
Offset int ;
|
||
|
||
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
||
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
||
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
||
Meta bool ;
|
||
|
||
// FillColor is the fill color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
FillColor _eg .Color ;
|
||
|
||
// StrokeColor is the stroke color of the text.
|
||
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
||
StrokeColor _eg .Color ;
|
||
|
||
// Orientation is the text orientation
|
||
Orientation int ;
|
||
|
||
// DirectObject is the underlying PdfObject (Text Object) that represents the visible texts. This is introduced to get
|
||
// a simple access to the TextObject in case editing or replacment of some text is needed. E.g during redaction.
|
||
DirectObject _bad .PdfObject ;
|
||
|
||
// ObjString is a decoded string operand of a text-showing operator. It has the same value as `Text` attribute except
|
||
// when many glyphs are represented with the same Text Object that contains multiple length string operand in which case
|
||
// ObjString spans more than one character string that falls in different TextMark objects.
|
||
ObjString []string ;Tw float64 ;Th float64 ;Tc float64 ;Index int ;_adgb bool ;_dfeb *TextTable ;};var _aecab =_f .MustCompile ("\u005e\u005c\u0073\u002a\u0028\u005c\u0064\u002b\u005c\u002e\u003f|\u005b\u0049\u0069\u0076\u005d\u002b\u0029\u005c\u0073\u002a\\\u0029\u003f\u0024");
|
||
func (_acege *textWord )bbox ()_aec .PdfRectangle {return _acege .PdfRectangle };func (_caa *structTreeRoot )buildList (_ccfcb map[int ][]*textLine ,_fcdb _bad .PdfObject )[]*list {if _caa ==nil {_fc .Log .Debug ("\u0062\u0075\u0069\u006c\u0064\u004c\u0069\u0073\u0074\u003a\u0020t\u0072\u0065\u0065\u0052\u006f\u006f\u0074\u0020\u0069\u0073 \u006e\u0069\u006c");
|
||
return nil ;};var _cffdc *structElement ;_fadc :=[]structElement {};if len (_caa ._bagdg )==1{_ggg :=_caa ._bagdg [0]._aeff ;if _ggg =="\u0044\u006f\u0063\u0075\u006d\u0065\u006e\u0074"||_ggg =="\u0053\u0065\u0063\u0074"||_ggg =="\u0050\u0061\u0072\u0074"||_ggg =="\u0044\u0069\u0076"||_ggg =="\u0041\u0072\u0074"{_cffdc =&_caa ._bagdg [0];
|
||
};}else {_cffdc =&structElement {_ccaac :_caa ._bagdg ,_aeff :_caa ._gage };};if _cffdc ==nil {_fc .Log .Debug ("\u0062\u0075\u0069\u006cd\u004c\u0069\u0073\u0074\u003a\u0020\u0074\u006f\u0070\u0045l\u0065m\u0065\u006e\u0074\u0020\u0069\u0073\u0020n\u0069\u006c");
|
||
return nil ;};for _ ,_eccea :=range _cffdc ._ccaac {if _eccea ._aeff =="\u004c"{_fadc =append (_fadc ,_eccea );}else if _eccea ._aeff =="\u0054\u0061\u0062l\u0065"{_dfec :=_faff (_eccea );_fadc =append (_fadc ,_dfec ...);};};_ffdf :=_gfbf (_fadc ,_ccfcb ,_fcdb );
|
||
var _acca []*list ;for _ ,_agca :=range _ffdf {_bebd :=_edcb (_agca );_acca =append (_acca ,_bebd ...);};return _acca ;};func (_ccfb *wordBag )empty (_ebba int )bool {_ ,_ebgc :=_ccfb ._gbbd [_ebba ];return !_ebgc }; |