2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
// This is a commercial product and requires a license to operate.
|
|
|
|
|
// A trial license can be obtained at https://unidoc.io
|
|
|
|
|
//
|
|
|
|
|
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
|
|
|
|
//
|
|
|
|
|
// Use of this source code is governed by the UniDoc End User License Agreement
|
|
|
|
|
// terms that can be accessed at https://unidoc.io/eula/
|
2018-03-22 14:03:47 +00:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
|
|
|
|
// Currently offers functionality for extracting textual content and page images.
|
|
|
|
|
//
|
2020-10-05 19:28:24 +00:00
|
|
|
|
package extractor ;import (_fa "bytes";_g "errors";_ef "fmt";_gb "github.com/unidoc/unipdf/v3/common";_b "github.com/unidoc/unipdf/v3/common/license";_ba "github.com/unidoc/unipdf/v3/contentstream";_eff "github.com/unidoc/unipdf/v3/core";_ed "github.com/unidoc/unipdf/v3/internal/textencoding";_fe "github.com/unidoc/unipdf/v3/internal/transform";_dg "github.com/unidoc/unipdf/v3/model";_de "golang.org/x/text/unicode/norm";_ec "golang.org/x/xerrors";_afe "image/color";_d "io";_gc "math";_ad "regexp";_f "sort";_af "strings";_gf "unicode";_e "unicode/utf8";);func (_ccbc *shapesState )addPoint (_dfbe ,_cfca float64 ){_faa :=_ccbc .establishSubpath ();_cdgf :=_ccbc .devicePoint (_dfbe ,_cfca );if _faa ==nil {_ccbc ._egbb =true ;_ccbc ._bbede =_cdgf ;}else {_faa .add (_cdgf );};};func (_fdab rulingList )connections (_acca map[int ]intSet ,_dcce int )intSet {_egbc :=make (intSet );_gfdb :=make (intSet );var _eeed func (int );_eeed =func (_fbee int ){if !_gfdb .has (_fbee ){_gfdb .add (_fbee );for _ebda :=range _fdab {if _acca [_ebda ].has (_fbee ){_egbc .add (_ebda );};};for _bfda :=range _fdab {if _egbc .has (_bfda ){_eeed (_bfda );};};};};_eeed (_dcce );return _egbc ;};func _edad (_dfcba []*textMark ,_dadg _dg .PdfRectangle )[]*textWord {var _bdafa []*textWord ;var _ebafd *textWord ;_ffaeg :=func (){if _ebafd !=nil {_eeeb :=_ebafd .computeText ();if !_efaa (_eeeb ){_ebafd ._efdce =_eeeb ;_bdafa =append (_bdafa ,_ebafd );if _adfd {_gb .Log .Info ("\u0077o\u0072\u0064\u003d\u0025\u0073",_ebafd .String ());for _dcedd ,_cabd :=range _ebafd ._degbg {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_dcedd ,_cabd .String ());};};};_ebafd =nil ;};};for _ ,_aabg :=range _dfcba {if _bfc &&_ebafd !=nil &&len (_ebafd ._degbg )> 0{_dabd :=_ebafd ._degbg [len (_ebafd ._degbg )-1];_ffff ,_gdgg :=_fccb (_aabg ._daca );_ffaf ,_ceaa :=_fccb (_dabd ._daca );if _gdgg &&!_ceaa &&_dabd .inDiacriticArea (_aabg ){_ebafd .addDiacritic (_ffff );continue ;};if _ceaa &&!_gdgg &&_aabg 
.inDiacriticArea (_dabd ){_ebafd ._degbg =_ebafd ._degbg [:len (_ebafd ._degbg )-1];_ebafd .appendMark (_aabg ,_dadg );_ebafd .addDiacritic (_ffaf );continue ;};};_dfdbc :=_efaa (_aabg ._daca );if _dfdbc {_ffaeg ();continue ;};if _ebafd ==nil &&!_dfdbc {_ebafd =_egaa ([]*textMark {_aabg },_dadg );continue ;};_fdbbb :=_ebafd ._feeg ;_eace :=_gc .Abs (_gdda (_dadg ,_aabg )-_ebafd ._fadd )/_fdbbb ;_gdec :=_cagga (_aabg ,_ebafd )/_fdbbb ;if _gdec >=_dgbg ||!(-_ggcf <=_gdec &&_eace <=_egfg ){_ffaeg ();_ebafd =_egaa ([]*textMark {_aabg },_dadg );continue ;};_ebafd .appendMark (_aabg ,_dadg );};_ffaeg ();return _bdafa ;};func _bbbc (_ggacg []*subpath )[]rulingList {_efgf (_ggacg );var _cadacg rulingList ;for _ ,_aeca :=range _ggacg {if !_aeca .isQuadrilateral (){continue ;};if _baef ,_cebd :=_aeca .makeRectRuling ();_cebd {_cadacg =append (_cadacg ,_baef );};};_cadacg =_cadacg .tidied ("\u0066\u0069\u006cl\u0073");return _cadacg .toGrids ();};func _fddeg (_dffd ,_gcff float64 )rulingKind {if _dffd >=_ebdf &&_fcbfg (_gcff ,_dffd ){return _daea ;};if _gcff >=_ebdf &&_fcbfg (_dffd ,_gcff ){return _dbga ;};return _decfa ;};func _ecb (_fafe _fe .Point )_fe .Matrix {return _fe .TranslationMatrix (_fafe .X ,_fafe .Y )};func (_dgfa *textLine )markWordBoundaries (){_cgec :=_gdcaf *_dgfa ._dbcg ;for _bfce ,_acd :=range _dgfa ._bac [1:]{if _cagga (_acd ,_dgfa ._bac [_bfce ])>=_cgec {_acd ._gcgcb =true ;};};};const (_gfccf =1.0e-6;_gdcbb =1.0e-4;_dcbe =10;_cbfd =6;_cacc =0.5;_dgbg =0.11;_ggcf =0.19;_egfg =0.04;_bdgd =0.04;_bfegb =1.0;_efbf =0.04;_badd =0.4;_eega =0.7;_gac =1.0;_faae =0.1;_beed =1.4;_abdg =0.46;_gdcaf =0.02;_bdae =0.2;_begb =0.5;_gffd =4;_dafc =4.0;_feca =6;_fdc =0.3;_dada =0.01;_ddbae =0.02;_cbag =2;_ccfdg =2;_cadac =500;_ebdf =10.0;_adbf =0.05;_cggb =2.0;_eeea =2.0;_fdee =1.0;);const _ddf =20;func (_cae *textObject )getStrokeColor ()_afe .Color {return _agdbg (_cae ._fafd .ColorspaceStroking ,_cae ._fafd .ColorStroking );};type lineRuling struct{_ffec rulingKind 
;_fcgb ,_bcc _fe .Point ;};type subpath struct{_dfdc []_fe .Point ;_effe bool ;};func (_gecd intSet
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
func (_cacce *textMark )ToTextMark ()TextMark {return TextMark {Text :_cacce ._daca ,Original :_cacce ._daeg ,BBox :_cacce ._fdf ,Font :_cacce ._acea ,FontSize :_cacce ._bbdg ,FillColor :_cacce ._caec ,StrokeColor :_cacce ._dfce };};func (_ggcc *textPara )fontsize ()float64 {return _ggcc ._feac [0]._dbcg };type cachedImage struct{_ea *_dg .Image ;_efd _dg .PdfColorspace ;};const (_adfd =false ;_ccdf =false ;_dfec =false ;_geeca =false ;_dbc =false ;_gccaf =false ;_ddga =false ;_eeac =false ;_ffb =_eeac &&true ;_ebaa =_ffb &&false ;_gcge =_eeac &&true ;_acba =false ;_abff =_acba ||false ;_fgda =false ;);func (_ddg *textObject )moveLP (_facg ,_dbbc float64 ){_ddg ._bfeb .Concat (_fe .NewMatrix (1,0,0,1,_facg ,_dbbc ));_ddg ._fac =_ddg ._bfeb ;};func _eebf (_gbgc ,_ecee _dg .PdfRectangle )bool {return _gbgc .Llx <=_ecee .Llx &&_ecee .Urx <=_gbgc .Urx &&_gbgc .Lly <=_ecee .Lly &&_ecee .Ury <=_gbgc .Ury ;};func (_acbb *wordBag )applyRemovals (_gada map[int ]map[*textWord ]struct{}){for _fcef ,_aga :=range _gada {if len (_aga )==0{continue ;};_gcded :=_acbb ._dbf [_fcef ];_aaag :=len (_gcded )-len (_aga );if _aaag ==0{delete (_acbb ._dbf ,_fcef );continue ;};_ccd :=make ([]*textWord ,_aaag );_aef :=0;for _ ,_fcfa :=range _gcded {if _ ,_effa :=_aga [_fcfa ];!_effa {_ccd [_aef ]=_fcfa ;_aef ++;};};_acbb ._dbf [_fcef ]=_ccd ;};};func (_gcged *textTable )log (_afgf string ){if !_acba {return ;};_gb .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",_afgf ,_gcged ._acdaf ,_gcged ._dfdbe ,_gcged ._ggbf ,_gcged .PdfRectangle );for _ceae :=0;_ceae < _gcged ._dfdbe ;_ceae ++{for _geefb :=0;_geefb < _gcged ._acdaf ;_geefb ++{_abacb :=_gcged .get (_geefb ,_ceae );if _abacb ==nil {continue ;};_ef .Printf ("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",_geefb ,_ceae ,_abacb 
.PdfRectangle ,_cgdcd (_abacb .text (),50),_e .RuneCountInString (_abacb .text ()));};};};func _bbff (_ffgg string )bool {if _e .RuneCountInString (_ffgg )< _gffd {return false ;};_aacg ,_fad :=_e .DecodeLastRuneInString (_ffgg );if _fad <=0||!_gf .Is (_gf .Hyphen ,_aacg ){return false ;};_aacg ,_fad =_e .DecodeLastRuneInString (_ffgg [:len (_ffgg )-_fad ]);return _fad > 0&&!_gf .IsSpace (_aacg );};func (_efdf *wordBag )depthRange (_fbbg ,_ccca int )[]int {var _ecec []int ;for _ggga :=range _efdf ._dbf {if _fbbg <=_ggga &&_ggga <=_ccca {_ecec =append (_ecec ,_ggga );};};if len (_ecec )==0{return nil ;};_f .Ints (_ecec );return _ecec ;};func (_dcfc paraList )addNeighbours (){_cdcc :=func (_bcae []int ,_dccaa *textPara )([]*textPara ,[]*textPara ){_fabfa :=make ([]*textPara ,0,len (_bcae )-1);_gcbe :=make ([]*textPara ,0,len (_bcae )-1);for _ ,_bfbb :=range _bcae {_abbag :=_dcfc [_bfbb ];if _abbag .Urx <=_dccaa .Llx {_fabfa =append (_fabfa ,_abbag );}else if _abbag .Llx >=_dccaa .Urx {_gcbe =append (_gcbe ,_abbag );};};return _fabfa ,_gcbe ;};_gdde :=func (_fefc []int ,_acbaf *textPara )([]*textPara ,[]*textPara ){_facdc :=make ([]*textPara ,0,len (_fefc )-1);_aagg :=make ([]*textPara ,0,len (_fefc )-1);for _ ,_gcdea :=range _fefc {_adbc :=_dcfc [_gcdea ];if _adbc .Ury <=_acbaf .Lly {_aagg =append (_aagg ,_adbc );}else if _adbc .Lly >=_acbaf .Ury {_facdc =append (_facdc ,_adbc );};};return _facdc ,_aagg ;};_feaaa :=_dcfc .yNeighbours (_ddbae );for _ ,_bggab :=range _dcfc {_bbfa :=_feaaa [_bggab ];if len (_bbfa )==0{continue ;};_gaba ,_cbge :=_cdcc (_bbfa ,_bggab );if len (_gaba )==0&&len (_cbge )==0{continue ;};if len (_gaba )> 0{_dbbfc :=_gaba [0];for _ ,_dgbd :=range _gaba [1:]{if _dgbd .Urx >=_dbbfc .Urx {_dbbfc =_dgbd ;};};for _ ,_afcg :=range _gaba {if _afcg !=_dbbfc &&_afcg .Urx > _dbbfc .Llx {_dbbfc =nil ;break ;};};if _dbbfc !=nil &&_afbd (_bggab .PdfRectangle ,_dbbfc .PdfRectangle ){_bggab ._feacg =_dbbfc ;};};if len (_cbge )> 0{_dbcd :=_cbge [0];for _ 
,_faad :=range _cbge [1:]{if _faad .Llx <=_dbcd .Llx {_dbcd =_faad ;};};for _ ,_dbaa :=range _cbge {if _dbaa
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
func (_ddfg *TextMarkArray )Elements ()[]TextMark {return _ddfg ._ccbg };func (_ecc *stateStack )top ()*textState {if _ecc .empty (){return nil ;};return (*_ecc )[_ecc .size ()-1];};func (_bfada *wordBag )sort (){for _ ,_cca :=range _bfada ._dbf {_f .Slice (_cca ,func (_gcag ,_dbee int )bool {return _fddga (_cca [_gcag ],_cca [_dbee ])< 0});};};func (_cefgc paraList )writeText (_aaga _d .Writer ){for _baba ,_ffdb :=range _cefgc {if _ffdb ._dcge {continue ;};_ffdb .writeText (_aaga );if _baba !=len (_cefgc )-1{if _dgaf (_ffdb ,_cefgc [_baba +1]){_aaga .Write ([]byte ("\u0020"));}else {_aaga .Write ([]byte ("\u000a"));_aaga .Write ([]byte ("\u000a"));};};};_aaga .Write ([]byte ("\u000a"));_aaga .Write ([]byte ("\u000a"));};func (_ecca rulingList )aligned ()bool {if len (_ecca )< 2{return false ;};_gecef :=_ecca [0];for _ ,_abfa :=range _ecca [1:]{if !(_ffac (_abfa ._ecfg ,_gecef ._ecfg )&&_ffac (_abfa ._gbbb ,_gecef ._gbbb )){return false ;};};return true ;};func (_acbcf *wordBag )scanBand (_aage string ,_bgdd *wordBag ,_aae func (_egbf *wordBag ,_gdbd *textWord )bool ,_bcfg ,_acfa ,_cbgd float64 ,_efc ,_fcdd bool )int {_gead :=_bgdd ._bgd ;var _geec map[int ]map[*textWord ]struct{};if !_efc {_geec =_acbcf .makeRemovals ();};_bagg :=_cacc *_gead ;_cgdb :=0;var _fdea []*textWord ;for _ ,_dfdb :=range _acbcf .depthBand (_bcfg -_bagg ,_acfa +_bagg ){if len (_acbcf ._dbf [_dfdb ])==0{continue ;};for _ ,_fddf :=range _acbcf ._dbf [_dfdb ]{if !(_bcfg -_bagg <=_fddf ._fadd &&_fddf ._fadd <=_acfa +_bagg ){continue ;};if !_aae (_bgdd ,_fddf ){continue ;};_dffa :=2.0*_gc .Abs (_fddf ._feeg -_bgdd ._bgd )/(_fddf ._feeg +_bgdd ._bgd );_dggf :=_gc .Max (_fddf ._feeg /_bgdd ._bgd ,_bgdd ._bgd /_fddf ._feeg );_deaf :=_gc .Min (_dffa ,_dggf );if _cbgd > 0&&_deaf > _cbgd {continue ;};if _bgdd .blocked (_fddf ){continue ;};if !_efc {_bgdd .pullWord (_fddf ,_dfdb ,_geec );};_fdea =append (_fdea ,_fddf );_cgdb ++;if !_fcdd {if _fddf ._fadd < _bcfg {_bcfg =_fddf ._fadd ;};if _fddf ._fadd 
> _acfa {_acfa =_fddf ._fadd ;};};if _efc {break ;};};};if !_efc {_acbcf .applyRemovals (_geec );};return _cgdb ;};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
|
|
|
|
// and position, size information for each image.
|
|
|
|
|
// A set of options to control page image extraction can be passed in. The options
|
|
|
|
|
// parameter can be nil for the default options. By default, inline stencil masks
|
|
|
|
|
// are not extracted.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
func (_ab *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_bb :=&imageExtractContext {_ffe :options };_cf :=_bb .extractContentStreamImages (_ab ._eb ,_ab ._gg );if _cf !=nil {return nil ,_cf ;};return &PageImages {Images :_bb ._gcd },nil ;};func (_bfeg *textObject )setFont (_cagg string ,_cfde float64 )error {if _bfeg ==nil {return nil ;};_bfeg ._cfeg ._eecf =_cfde ;_ccf ,_gea :=_bfeg .getFont (_cagg );if _gea !=nil {return _gea ;};_bfeg ._cfeg ._ecdd =_ccf ;if _bfeg ._cgg .empty (){_bfeg ._cgg .push (_bfeg ._cfeg );}else {_bfeg ._cgg .top ()._ecdd =_bfeg ._cfeg ._ecdd ;};return nil ;};func (_cagda *subpath )isQuadrilateral ()bool {if len (_cagda ._dfdc )< 4||len (_cagda ._dfdc )> 5{return false ;};if len (_cagda ._dfdc )==5{_gdcfg :=_cagda ._dfdc [0];_defgc :=_cagda ._dfdc [4];if _gdcfg .X !=_defgc .X ||_gdcfg .Y !=_defgc .Y {return false ;};};return true ;};func _fddga (_abfd ,_dacbf bounded )float64 {return _abfd .bbox ().Llx -_dacbf .bbox ().Llx };func (_cgcgc *textTable )markCells (){for _dcefe :=0;_dcefe < _cgcgc ._dfdbe ;_dcefe ++{for _fceaf :=0;_fceaf < _cgcgc ._acdaf ;_fceaf ++{_eggfb :=_cgcgc .get (_fceaf ,_dcefe );_eggfb ._fdde =true ;};};};func (_efgg *shapesState )establishSubpath ()*subpath {_bggb ,_eceg :=_efgg .lastpointEstablished ();if !_eceg {_efgg ._dacd =append (_efgg ._dacd ,_dabf (_bggb ));};if len (_efgg ._dacd )==0{return nil ;};_efgg ._egbb =false ;return _efgg ._dacd [len (_efgg ._dacd )-1];};func _ddba (_ega []*textWord ,_gafd float64 ,_fdgg ,_bddc rulingList )*wordBag {_gccd :=_dbd (_ega [0],_gafd ,_fdgg ,_bddc );for _ ,_fccg :=range _ega [1:]{_ecccf :=_eefb (_fccg ._fadd );_gccd ._dbf [_ecccf ]=append (_gccd ._dbf [_ecccf ],_fccg );_gccd .PdfRectangle =_gafef (_gccd .PdfRectangle ,_fccg .PdfRectangle );};_gccd .sort ();return _gccd ;};func (_gadf rulingList )sortStrict (){_f .Slice (_gadf ,func (_degb ,_aafg int )bool {_gddca ,_dcged :=_gadf [_degb ],_gadf [_aafg ];_ccdff ,_afge :=_gddca ._eddf 
,_dcged ._eddf ;if _ccdff !=_afge {return _ccdff > _afge ;};_fcbc ,_gbccg :=_gddca ._dbdb ,_dcged ._dbdb ;if _fcbc !=_gbccg {return _fcbc < _gbccg ;};_fcbc ,_gbccg =_gddca ._ecfg ,_dcged ._ecfg ;if _fcbc !=_gbccg {return _fcbc < _gbccg ;};return _gddca ._gbbb < _dcged ._gbbb ;});};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `w`.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// String returns a description of `w`: its _fadd value, bounding rectangle,
// font size (_feeg) and text (_efdce, double-quoted).
func (_cgda *textWord) String() string {
	const layout = "\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022"
	return _ef.Sprintf(layout, _cgda._fadd, _cgda.PdfRectangle, _cgda._feeg, _cgda._efdce)
}

// firstWord returns the first word stored in bin `_faea`.
// The bin must exist and be non-empty; otherwise the index expression panics.
func (_dgc *wordBag) firstWord(_faea int) *textWord {
	bin := _dgc._dbf[_faea]
	return bin[0]
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// Tables returns the tables extracted from the page.
|
|
|
|
|
// Tables returns the tables extracted from the page.
// The returned slice is the one stored on the PageText, not a copy.
func (_ace PageText) Tables() []TextTable {
	tables := _ace._dabe
	return tables
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// Text returns the extracted page text.
|
|
|
|
|
// Text returns the extracted page text.
func (_fegf PageText) Text() string {
	text := _fegf._fffd
	return text
}

// Ruling kinds. The names in the trailing comments are the strings these
// values map to in the _egaf lookup table.
const (
	_decfa rulingKind = iota // "none"
	_daea                    // "horizontal"
	_dbga                    // "vertical"
)

// _dbde creates a textLine seeded from the first word of bin `_dabfb` in
// `_dgag`: the line starts with that word's rectangle, font size (_feeg) and
// _fadd value, and the word is then moved into the line via pullWord.
func _dbde(_dgag *wordBag, _dabfb int) *textLine {
	first := _dgag.firstWord(_dabfb)
	line := &textLine{
		PdfRectangle: first.PdfRectangle,
		_dbcg:        first._feeg,
		_edeb:        first._fadd,
	}
	line.pullWord(_dgag, first, _dabfb)
	return line
}

// Behaviour switches and numeric settings used throughout the package.
// Only the uses visible in this file are noted; the remaining constants are
// consumed elsewhere.
const (
	_adfc  = true // gates end-of-line hyphen handling in toCellTextMarks
	_ddgd  = true
	_bfc   = true // gates diacritic combining when marks are grouped into words
	_eefe  = false
	_eagg  = false
	_cbd   = 6
	_gddae = 3.0
	_eabf  = 200
	_becg  = true
	_fbbb  = true
	_aadg  = true // computeViews includes the grids built from _fgfe
	_bggc  = true // computeViews includes the grids built from _gge
)
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
// A new TextMarkArray is allocated on every call; it wraps the slice held by
// the PageText without copying it.
func (_cdc PageText) Marks() *TextMarkArray {
	marks := &TextMarkArray{_ccbg: _cdc._acbg}
	return marks
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
//
// It reads the page's content streams, resources and media box. A media box
// whose lower-left/upper-right coordinates are reversed on either axis is
// normalised (and the fact is logged) so that Llx <= Urx and Lly <= Ury.
//
// An error is returned when `page` is nil, when the content streams cannot be
// read, or when the media box cannot be obtained.
func New(page *_dg.PdfPage) (*Extractor, error) {
	// Without this guard a nil page panics (at the latest on page.Resources)
	// rather than producing an error the caller can handle.
	if page == nil {
		return nil, _g.New("extractor requires a non-nil page")
	}

	contents, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}
	mediaBox, err := page.GetMediaBox()
	if err != nil {
		return nil, _ef.Errorf("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076", err)
	}

	e := &Extractor{
		_eb:  contents,
		_gg:  page.Resources,
		_aff: *mediaBox,
		_dc:  map[string]fontEntry{},
		_fb:  map[string]textResult{},
	}

	// Normalise reversed X coordinates.
	if e._aff.Llx > e._aff.Urx {
		_gb.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", e._aff)
		e._aff.Llx, e._aff.Urx = e._aff.Urx, e._aff.Llx
	}
	// Normalise reversed Y coordinates.
	if e._aff.Lly > e._aff.Ury {
		_gb.Log.Info("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e", e._aff)
		e._aff.Lly, e._aff.Ury = e._aff.Ury, e._aff.Lly
	}
	return e, nil
}

// maxDepth returns the bag's _bcd value minus the Lly of its rectangle.
// NOTE(review): _bcd looks like the page height, which would make this the
// depth of the bag's bottom edge measured from the top of the page — confirm.
func (_fcba *wordBag) maxDepth() float64 {
	return _fcba._bcd - _fcba.Lly
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
// Len returns the number of TextMarks in `ma`. A nil array has length 0.
func (_gcgc *TextMarkArray) Len() int {
	if _gcgc != nil {
		return len(_gcgc._ccbg)
	}
	return 0
}

// _egaf maps each ruling kind to the name reported by rulingKind.String.
var _egaf = map[rulingKind]string{
	_decfa: "\u006e\u006f\u006e\u0065",
	_daea:  "\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",
	_dbga:  "\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",
}

// _cfbd classifies the segment between the two points by handing the absolute
// X and Y distances between them to _fddeg.
func _cfbd(_fegbg, _abce _fe.Point) rulingKind {
	return _fddeg(
		_gc.Abs(_fegbg.X-_abce.X),
		_gc.Abs(_fegbg.Y-_abce.Y),
	)
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `tm`.
|
|
|
|
|
func (_aadgb *textMark )String ()string {return _ef .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_aadgb .PdfRectangle ,_aadgb ._bbdg ,_aadgb ._daca );};func (_fcfe *wordBag )makeRemovals ()map[int ]map[*textWord ]struct{}{_ddff :=make (map[int ]map[*textWord ]struct{},len (_fcfe ._dbf ));for _gafe :=range _fcfe ._dbf {_ddff [_gafe ]=make (map[*textWord ]struct{});};return _ddff ;};func _fddfb (_egga []_eff .PdfObject )(_baea ,_fafdb float64 ,_agcae error ){if len (_egga )!=2{return 0,0,_ef .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_egga ));};_dbgb ,_agcae :=_eff .GetNumbersAsFloat (_egga );if _agcae !=nil {return 0,0,_agcae ;};return _dbgb [0],_dbgb [1],nil ;};func (_fda *stateStack )push (_bbe *textState ){_fgac :=*_bbe ;*_fda =append (*_fda ,&_fgac )};func (_edcbg *textObject )getCurrentFont ()*_dg .PdfFont {var _dgb *_dg .PdfFont ;if !_edcbg ._cgg .empty (){_dgb =_edcbg ._cgg .top ()._ecdd ;};if _dgb ==nil {_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _dg .DefaultFont ();};return _dgb ;};func (_eef *subpath )add (_dga ..._fe .Point ){_eef ._dfdc =append (_eef ._dfdc ,_dga ...)};func _cagga (_ggac ,_gbcc bounded )float64 {return _ggac .bbox ().Llx -_gbcc .bbox ().Urx };func _dcgb (_bfeebb []*textWord ,_adbcd *textWord )[]*textWord {for _egcf ,_daaa :=range _bfeebb {if _daaa ==_adbcd {return _egabf (_bfeebb ,_egcf );};};_gb .Log .Error 
("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_adbcd );return nil ;};func (_cafff paraList )log (_caeb string ){if !_ddga {return ;};_gb .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_caeb ,len (_cafff ));for _cacb ,_gagc :=range _cafff {if _gagc ==nil {continue ;};_dfgd :=_gagc .text ();_cgac :="\u0020\u0020";if _gagc ._dbff !=nil {_cgac =_ef .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_gagc ._dbff ._acdaf ,_gagc ._dbff ._dfdbe );};_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_cacb ,_gagc .PdfRectangle ,_cgac ,_cgdcd (_dfgd ,50));};};func (_bddd *textWord )absorb (_fdae *textWord ){_bddd .PdfRectangle =_gafef (_bddd .PdfRectangle ,_fdae .PdfRectangle );_bddd ._degbg =append (_bddd ._degbg ,_fdae ._degbg ...);};type shapesState struct{_bcgc _fe .Matrix ;_fbaa _fe .Matrix ;_dacd []*subpath ;_egbb bool ;_bbede _fe .Point ;};func (_cccae paraList )merge ()*textPara {_gb .Log .Trace ("\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_cccae ));if len (_cccae )==0{return nil ;};_cccae .sortReadingOrder ();_beee :=_cccae [0].PdfRectangle ;_baed :=_cccae [0]._feac ;for _ ,_ggce :=range _cccae [1:]{_beee =_gafef (_beee ,_ggce .PdfRectangle );_baed =append (_baed ,_ggce ._feac ...);};return _dfab (_beee ,_baed );};func (_bbfc *wordBag )absorb (_gdeb *wordBag ){_adba :=_gdeb .makeRemovals ();for _ada ,_eded :=range _gdeb ._dbf {for _ ,_bada :=range _eded {_bbfc .pullWord 
(_bada ,_ada ,_adba );};};_gdeb .applyRemovals (_adba );};func (_egdc *shapesState )devicePoint (_bag ,_gged float64 )_fe .Point {_bdag :=_egdc ._fbaa .Mult (_egdc ._bcgc );_bag ,_gged =_bdag .Transform (_bag ,_gged );return _fe .NewPoint (_bag ,_gged );};func (_edd *wordBag )blocked (_gde *textWord )bool {if _gde .Urx <
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
2020-10-05 19:28:24 +00:00
|
|
|
|
func (_adb *Extractor )ExtractTextWithStats ()(_cbf string ,_eebb int ,_fbg int ,_gabb error ){_dfd ,_eebb ,_fbg ,_gabb :=_adb .ExtractPageText ();if _gabb !=nil {return "",_eebb ,_fbg ,_gabb ;};return _dfd .Text (),_eebb ,_fbg ,nil ;};type wordBag struct{_dg .PdfRectangle ;_bgd float64 ;_gcdda ,_ccfd rulingList ;_bcd float64 ;_dbf map[int ][]*textWord ;};func (_defg *textObject )checkOp (_fdd *_ba .ContentStreamOperation ,_eab int ,_dgdf bool )(_dgg bool ,_gdd error ){if _defg ==nil {var _geb []_eff .PdfObject ;if _eab > 0{_geb =_fdd .Params ;if len (_geb )> _eab {_geb =_geb [:_eab ];};};_gb .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_fdd .Operand ,_geb );};if _eab >=0{if len (_fdd .Params )!=_eab {if _dgdf {_gdd =_g .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_fdd .Operand ,_eab ,len (_fdd .Params ),_fdd .Params );return false ,_gdd ;};};return true ,nil ;};func _fffa (_gggf func (*wordBag ,*textWord ,float64 )bool ,_fab float64 )func (*wordBag ,*textWord )bool {return func (_aadd *wordBag ,_fdbc *textWord )bool {return _gggf (_aadd ,_fdbc ,_fab )};};type ruling struct{_eddf rulingKind ;_dbdb float64 ;_ecfg float64 ;_gbbb float64 ;};func (_fec *subpath )removeDuplicates (){if len (_fec ._dfdc )==0{return ;};_gagb :=[]_fe .Point {_fec ._dfdc [0]};for _ ,_egdd :=range _fec ._dfdc [1:]{if !_bfgdb (_egdd ,_gagb [len (_gagb )-1]){_gagb =append (_gagb ,_egdd );};};_fec ._dfdc =_gagb ;};func (_acdf paraList )tables ()[]TextTable {var _eac []TextTable 
;for _ ,_badg :=range _acdf {_gfaba :=_badg ._dbff ;if _gfaba !=nil &&_gfaba .isExportable (){_eac =append (_eac ,_gfaba .toTextTable ());};};return _eac ;};func (_edaf *textTable )newTablePara ()*textPara {_agcc :=_edaf .computeBbox ();return &textPara {PdfRectangle :_agcc ,_ageda :_agcc ,_dbff :_edaf };};var (_cbae =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};);func (_fbbge *wordBag )firstReadingIndex (_geed int )int {_adg :=_fbbge .firstWord (_geed )._feeg ;_beef :=float64 (_geed +1)*_cbfd ;_ced :=_beef +_dafc *_adg ;_ecfb :=_geed ;for _ ,_efcg :=range _fbbge .depthBand (_beef ,_ced ){if _fddga (_fbbge .firstWord (_efcg ),_fbbge .firstWord (_ecfb ))< 0{_ecfb =_efcg ;};};return _ecfb ;};func (_bab *textObject )setCharSpacing (_fbfe float64 ){if _bab ==nil {return ;};_bab ._cfeg ._cfdb =_fbfe ;if _dbc {_gb .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_fbfe ,_bab ._cfeg .String ());};};func _dgae (_dagd _dg .PdfRectangle )rulingKind {_edbg :=_dagd .Width ();_gcab :=_dagd .Height ();return _fddeg (_edbg ,_gcab );};const _egee =10;func _agdbg (_aedfe _dg .PdfColorspace ,_eead _dg .PdfColor )_afe .Color {if _aedfe ==nil ||_eead ==nil {return _afe .Black ;};_fgegd ,_bbfaf :=_aedfe .ColorToRGB (_eead );if _bbfaf !=nil {_gb .Log .Debug 
("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
|
|
|
|
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
|
|
|
|
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
|
|
|
|
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
|
|
|
|
type RenderMode int ;func _afda (_ebbd float64 )bool {return _gc .Abs (_ebbd )< _gfccf };func _gbce (_bgf _dg .PdfRectangle )*textPara {_gaad :=0.5*(_bgf .Llx +_bgf .Urx );_beffe :=0.5*(_bgf .Lly +_bgf .Ury );const _fdef =1e-6;_addad :=_dg .PdfRectangle {Llx :_gaad -_fdef ,Urx :_gaad +_fdef ,Lly :_beffe -_fdef ,Ury :_beffe +_fdef };return &textPara {PdfRectangle :_addad ,_dcge :true };};func (_gae *textPara )toCellTextMarks (_cddg *int )[]TextMark {var _defa []TextMark ;for _aabfb ,_afbc :=range _gae ._feac {_afbcb :=_afbc .toTextMarks (_cddg );_fefb :=_adfc &&_afbc .endsInHyphen ()&&_aabfb !=len (_gae ._feac )-1;if _fefb {_afbcb =_feec (_afbcb ,_cddg );};_defa =append (_defa ,_afbcb ...);if !(_fefb ||_aabfb ==len (_gae ._feac )-1){_defa =_ddae (_defa ,_cddg ,_acbcd (_afbc ._edeb ,_gae ._feac [_aabfb +1]._edeb ));};};return _defa ;};func (_eebfd *textObject )newTextMark (_gfed string ,_dafad _fe .Matrix ,_ccg _fe .Point ,_fbef float64 ,_efed *_dg .PdfFont ,_fbd float64 ,_dced ,_fegbc _afe .Color )(textMark ,bool ){_abda :=_dafad .Angle ();_ebca :=_cdfc (_abda ,_dcbe );var _dfdcb float64 ;if _ebca %180!=90{_dfdcb =_dafad .ScalingFactorY ();}else {_dfdcb =_dafad .ScalingFactorX ();};_abfg :=_gag (_dafad );_fafa :=_dg .PdfRectangle {Llx :_abfg .X ,Lly :_abfg .Y ,Urx :_ccg .X ,Ury :_ccg .Y };switch _ebca %360{case 90:_fafa .Urx -=_dfdcb ;case 180:_fafa .Ury -=_dfdcb ;case 270:_fafa .Urx +=_dfdcb ;case 0:_fafa .Ury +=_dfdcb ;default:_ebca =0;_fafa .Ury +=_dfdcb ;};if _fafa .Llx > _fafa .Urx {_fafa .Llx ,_fafa .Urx =_fafa .Urx ,_fafa .Llx ;};if _fafa .Lly > _fafa .Ury {_fafa .Lly ,_fafa .Ury =_fafa .Ury ,_fafa .Lly ;};_facf ,_ebac :=_dcab (_fafa ,_eebfd ._abf ._aff );if !_ebac {_gb .Log .Debug 
("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_fafa ,_eebfd ._abf ._aff ,_gfed );};_fafa =_facf ;_bcbe :=_fafa ;_bfba :=_eebfd ._abf ._aff ;switch _ebca %360{case 90:_bfba .Urx ,_bfba .Ury =_bfba .Ury ,_bfba .Urx ;_bcbe =_dg .PdfRectangle {Llx :_bfba .Urx -_fafa .Ury ,Urx :_bfba .Urx -_fafa .Lly ,Lly :_fafa .Llx ,Ury :_fafa .Urx };case 180:_bcbe =_dg .PdfRectangle {Llx :_bfba .Urx -_fafa .Llx ,Urx :_bfba .Urx -_fafa .Urx ,Lly :_bfba .Ury -_fafa .Lly ,Ury :_bfba .Ury -_fafa .Ury };case 270:_bfba .Urx ,_bfba .Ury =_bfba .Ury ,_bfba .Urx ;_bcbe =_dg .PdfRectangle {Llx :_fafa .Ury ,Urx :_fafa .Lly ,Lly :_bfba .Ury -_fafa .Llx ,Ury :_bfba .Ury -_fafa .Urx };};if _bcbe .Llx > _bcbe .Urx {_bcbe .Llx ,_bcbe .Urx =_bcbe .Urx ,_bcbe .Llx ;};if _bcbe .Lly > _bcbe .Ury {_bcbe .Lly ,_bcbe .Ury =_bcbe .Ury ,_bcbe .Lly ;};_ddec :=textMark {_daca :_gfed ,PdfRectangle :_bcbe ,_fdf :_fafa ,_acea :_efed ,_bbdg :_dfdcb ,_caea :_fbd ,_afg :_dafad ,_bgcd :_ccg ,_eggf :_ebca ,_caec :_dced ,_dfce :_fegbc };if _dfec {_gb .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_abfg ,_ccg ,_ddec .String ());};return _ddec ,_ebac ;};func (_faf *imageExtractContext )extractXObjectImage (_ebge *_eff .PdfObjectName ,_abb _ba .GraphicsState ,_fcf *_dg .PdfPageResources )error {_bd ,_ :=_fcf .GetXObjectByName (*_ebge );if _bd ==nil {return nil ;};_fcg ,_fbe :=_faf ._ggf [_bd ];if !_fbe {_ae ,_aea :=_fcf .GetXObjectImageByName (*_ebge );if _aea !=nil {return _aea ;};if _ae ==nil {return nil ;};_fea ,_aea :=_ae .ToImage ();if _aea !=nil {return _aea ;};_fcg =&cachedImage {_ea :_fea ,_efd :_ae .ColorSpace };_faf 
._ggf [_bd ]=_fcg ;};_gab :=_fcg ._ea ;_bfg :=_fcg ._efd ;_eed ,_bg :=_bfg .ImageToRGB (*_gab );if _bg !=nil {return _bg ;};_gb .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_abb .CTM .String ());_dcg :=ImageMark {Image :&_eed ,Width :_abb .CTM .ScalingFactorX (),Height :_abb .CTM .ScalingFactorY (),Angle :_abb .CTM .Angle ()};_dcg .X ,_dcg .Y =_a
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// String returns a description of `k`.
|
|
|
|
|
func (_cdcf rulingKind )String ()string {_ffbe ,_efeg :=_egaf [_cdcf ];if !_efeg {return _ef .Sprintf ("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064",_cdcf );};return _ffbe ;};func (_abbbe rectRuling )asRuling ()(*ruling ,bool ){_gaef :=ruling {_eddf :_abbbe ._bbaf };switch _abbbe ._bbaf {case _dbga :_gaef ._dbdb =0.5*(_abbbe .Llx +_abbbe .Urx );_gaef ._ecfg =_abbbe .Lly ;_gaef ._gbbb =_abbbe .Ury ;case _daea :_gaef ._dbdb =0.5*(_abbbe .Lly +_abbbe .Ury );_gaef ._ecfg =_abbbe .Llx ;_gaef ._gbbb =_abbbe .Urx ;default:_gb .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_abbbe ._bbaf );return nil ,false ;};return &_gaef ,true ;};func (_ddc *stateStack )size ()int {return len (*_ddc )};func (_aec *PageText )computeViews (){_bgga :=_bfeeb (_aec ._fgfe );_abg :=_bbbc (_aec ._gge );var _age []rulingList ;if _aadg {_age =append (_age ,_bgga ...);};if _bggc {_age =append (_age ,_abg ...);};if _fgda {if len (_bgga )> 0{_gb .Log .Info ("S\u0074\u0072\u006f\u006b\u0065\u0073\u003a\u0020\u0025\u0064",len (_aec ._fgfe ));_gb .Log .Info ("\u0053\u0074r\u006f\u006b\u0065 \u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0064",len (_bgga ));for _eba ,_eccc :=range _bgga {_ef .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_eba ,len (_eccc ));for _gbfa ,_aecc :=range _eccc {_ef .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_gbfa ,_aecc );};};};if len (_abg )> 0{_gb .Log .Info ("\u0046i\u006c\u006c\u0073\u003a\u0020\u0025d",len (_aec ._gge ));_gb .Log .Info ("\u0046\u0069\u006c\u006c\u0020\u0047\u0072\u0069\u0064s\u003a\u0020\u0025\u0064",len (_abg ));for _eccd ,_gabg :=range _abg {_ef .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_eccd ,len (_gabg ));for _bfa ,_cda :=range _gabg {_ef .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_bfa ,_cda 
);};};};};var _aca paraList ;_cga :=len (_aec ._abcb );for _gabge :=0;_gabge < 360&&_cga > 0;_gabge +=90{_dgf :=make ([]*textMark ,0,len (_aec ._abcb )-_cga );for _ ,_fbfd :=range _aec ._abcb {if _fbfd ._eggf ==_gabge {_dgf =append (_dgf ,_fbfd );};};if len (_dgf )> 0{_bbgg :=_dcgf (_dgf ,_aec ._dfgg ,_age );_aca =append (_aca ,_bbgg ...);_cga -=len (_dgf );};};_effd :=new (_fa .Buffer );_aca .writeText (_effd );_aec ._fffd =_effd .String ();_aec ._acbg =_aca .toTextMarks ();_aec ._dabe =_aca .tables ();};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
|
|
|
|
// These are the TextMarks tm that satisfy `start` < tm.Offset + len(tm.Text) && tm.Offset < `end`, where
|
|
|
|
|
// `start` and `end` are offsets in the extracted text.
|
|
|
|
|
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
|
|
|
|
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
2020-10-05 19:28:24 +00:00
|
|
|
|
func (_aacc *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _aacc ==nil {return nil ,_g .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_ef .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );};_ceag :=len (_aacc ._ccbg );if _ceag ==0{return _aacc ,nil ;};if start < _aacc ._ccbg [0].Offset {start =_aacc ._ccbg [0].Offset ;};if end > _aacc ._ccbg [_ceag -1].Offset +1{end =_aacc ._ccbg [_ceag -1].Offset +1;};_fcab :=_f .Search (_ceag ,func (_fbb int )bool {return _aacc ._ccbg [_fbb ].Offset +len (_aacc ._ccbg [_fbb ].Text )-1>=start });if !(0<=_fcab &&_fcab < _ceag ){_ddbf :=_ef .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_fcab ,_ceag ,_aacc ._ccbg [0],_aacc ._ccbg [_ceag -1]);return nil ,_ddbf ;};_geef :=_f .Search (_ceag ,func (_cbg int )bool {return _aacc ._ccbg [_cbg ].Offset > end -1});if !(0<=_geef &&_geef < _ceag ){_dbgd :=_ef .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_geef ,_ceag ,_aacc ._ccbg [0],_aacc ._ccbg [_ceag -1]);return nil ,_dbgd ;};if _geef <=_fcab {return nil ,_ef .Errorf 
("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_fcab ,_geef );};return &TextMarkArray {_ccbg :_aacc ._ccbg [_fcab :_geef ]},nil ;};func (_fadc *textTable )depth ()float64 {_cdcg :=1e10;for _eegd :=0;_eegd < _fadc ._acdaf ;_eegd ++{_gda :=_fadc .get (_eegd ,0);if _gda ._dcge {continue ;};_cdcg =_gc .Min (_cdcg ,_gda .depth ());};return _cdcg ;};func (_babd *wordBag )allWords ()[]*textWord {var _eebe []*textWord ;for _ ,_ebc :=range _babd ._dbf {_eebe =append (_eebe ,_ebc ...);};return _eebe ;};func _gdda (_bdcd _dg .PdfRectangle ,_gfab bounded )float64 {return _bdcd .Ury -_gfab .bbox ().Lly };func _afbd (_dgef ,_dfae _dg .PdfRectangle )bool {return _dgef .Lly <=_dfae .Ury &&_dfae .Lly <=_dgef .Ury ;};func (_ceda *subpath )makeRectRuling ()(*ruling ,bool ){if _fgda {_gb .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_ceda );};_fbdf :=_ceda ._dfdc [:4];_aecf :=make (map[int ]rulingKind ,len (_fbdf ));for _ddda ,_gdee :=range _fbdf {_fbffdf :=_ceda ._dfdc [(_ddda +1)%4];_aecf [_ddda ]=_cfbd (_gdee ,_fbffdf );};if _fgda {_ef .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_aecf );};var _eeeac ,_eeag []int ;for _fbdc ,_fbge :=range _aecf {switch _fbge {case _daea :_eeag =append (_eeag ,_fbdc );case _dbga :_eeeac =append (_eeeac ,_fbdc );};};if _fgda {_ef .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_eeag ),_eeag );_ef .Printf ("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_eeeac ),_eeeac );};_gcdf :=(len (_eeag )==2&&len (_eeeac )==2)||(len (_eeag )==2&&len (_eeeac )==0&&_gcbg (_fbdf [_eeag 
[0]],_fbdf [_eeag [1]]))||(len (_eeeac )==2&&len (_eeag )==0&&_agbc (_fbdf [_eeeac [0]],_fbdf [_eeeac [1]]));if _fgda {_ef .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct{
|
|
|
|
|
|
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
|
|
|
|
|
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// BBox is the bounding box of the text.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
BBox _dg .PdfRectangle ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Font is the font the text was drawn with.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
Font *_dg .PdfFont ;
|
2018-12-27 20:51:34 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64 ;
|
2018-11-28 18:06:03 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int ;
|
|
|
|
|
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool ;
|
|
|
|
|
|
|
|
|
|
// FillColor is the fill color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-10-05 19:28:24 +00:00
|
|
|
|
FillColor _afe .Color ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// StrokeColor is the stroke color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-10-05 19:28:24 +00:00
|
|
|
|
StrokeColor _afe .Color ;};func _bafd (_dfffg _dg .PdfRectangle )*ruling {return &ruling {_eddf :_daea ,_dbdb :_dfffg .Lly ,_ecfg :_dfffg .Llx ,_gbbb :_dfffg .Urx };};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
type PageText struct{_abcb []*textMark ;_fffd string ;_acbg []TextMark ;_dabe []TextTable ;_dfgg _dg .PdfRectangle ;_fgfe []*subpath ;_gge []*subpath ;};func (_fge *textPara )toTextMarks (_ffaaa *int )[]TextMark {if _fge ._dbff ==nil {return _fge .toCellTextMarks (_ffaaa );};var _cdb []TextMark ;for _abccc :=0;_abccc < _fge ._dbff ._dfdbe ;_abccc ++{for _ecef :=0;_ecef < _fge ._dbff ._acdaf ;_ecef ++{_bffea :=_fge ._dbff .get (_ecef ,_abccc );if _bffea ==nil {_cdb =_ddae (_cdb ,_ffaaa ,"\u0009");}else {_cedcf :=_bffea .toCellTextMarks (_ffaaa );_cdb =append (_cdb ,_cedcf ...);};_cdb =_ddae (_cdb ,_ffaaa ,"\u0020");};if _abccc < _fge ._dbff ._dfdbe -1{_cdb =_ddae (_cdb ,_ffaaa ,"\u000a");};};return _cdb ;};type rectRuling struct{_bbaf rulingKind ;_dg .PdfRectangle ;};func (_afeb rulingList )tidied (_bfegc string )rulingList {_adbgb :=_afeb .removeDuplicates ();_eebc :=_adbgb .coalesce ();if _eebc ==nil {return nil ;};_eebc .sort ();if _fgda {_gb .Log .Info ("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064",_bfegc ,len (_afeb ),len (_adbgb ),len (_eebc ));for _fagg ,_abee :=range _eebc {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fagg ,_abee );};};return _eebc ;};func (_bbc paraList )inRect (_cddda _dg .PdfRectangle )paraList {var _fddgb ,_ffae paraList ;for _ ,_fecfb :=range _bbc {_ddge :=_fecfb .PdfRectangle ;_ddge .Lly +=_fdee ;_ddge .Ury -=_fdee ;if _eebf (_cddda ,_ddge ){_fddgb =append (_fddgb ,_fecfb );};if _abbf (_cddda ,_ddge ){_ffae =append (_ffae ,_fecfb );};};if _acba {_gb .Log .Info ("\u0069\u006e\u0052e\u0063\u0074\u003a\u0020\u0025\u002e\u0031\u0066",_cddda );for _gabgb ,_dfea :=range _ffae {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gabgb ,_dfea );};};return _fddgb ;};func (_fce *shapesState )drawRectangle (_dafb 
,_dedc ,_gcfa ,_bea float64 ){if _geeca {_dcbg :=_fce .devicePoint (_dafb ,_dedc );_fafde :=_fce .devicePoint (_dafb +_gcfa ,_dedc +_bea );_gad :=_dg .PdfRectangle {Llx :_dcbg .X ,Lly :_dcbg .Y ,Urx :_fafde .X ,Ury :_fafde .Y };_gb .Log .Info ("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066",_gad );};_fce .newSubPath ();_fce .moveTo (_dafb ,_dedc );_fce .lineTo (_dafb +_gcfa ,_dedc );_fce .lineTo (_dafb +_gcfa ,_dedc +_bea );_fce .lineTo (_dafb ,_dedc +_bea );_fce .closePath ();};func (_afce paraList )xNeighbours (_gggfa float64 )map[*textPara ][]int {_aadf :=make ([]event ,2*len (_afce ));if _gggfa ==0{for _eege ,_gcaa :=range _afce {_aadf [2*_eege ]=event {_gcaa .Llx ,true ,_eege };_aadf [2*_eege +1]=event {_gcaa .Urx ,false ,_eege };};}else {for _ecfbf ,_ebfa :=range _afce {_aadf [2*_ecfbf ]=event {_ebfa .Llx -_gggfa *_ebfa .fontsize (),true ,_ecfbf };_aadf [2*_ecfbf +1]=event {_ebfa .Urx +_gggfa *_ebfa .fontsize (),false ,_ecfbf };};};return _afce .eventNeighbours (_aadf );};
|
|
|
|
|
|
|
|
|
|
// TableCell is a cell in a TextTable.
|
|
|
|
|
type TableCell struct{
|
|
|
|
|
|
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
|
|
|
|
|
|
|
|
|
// Marks returns the TextMarks corresponding to the text in Text.
|
|
|
|
|
Marks TextMarkArray ;};
|
|
|
|
|
|
|
|
|
|
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
|
|
|
|
// All coordinates are in device coordinates.
|
|
|
|
|
type ImageMark struct{Image *_dg .Image ;
|
|
|
|
|
|
|
|
|
|
// Dimensions of the image as displayed in the PDF.
|
|
|
|
|
Width float64 ;Height float64 ;
|
|
|
|
|
|
|
|
|
|
// Position of the image in PDF coordinates (lower left corner).
|
|
|
|
|
X float64 ;Y float64 ;
|
|
|
|
|
|
|
|
|
|
// Angle in degrees, if rotated.
|
|
|
|
|
Angle float64 ;};func (_bedg *wordBag )removeDuplicates (){if _gcge {_gb .Log .Info ("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071",_bedg .text ());};for _ ,_fbac :=range _bedg .depthIndexes (){if len (_bedg ._dbf [_fbac ])==0{continue ;};_acda :=_bedg ._dbf [_fbac ][0];_bfcee :=_bdae *_acda ._feeg ;_bdgg :=_acda ._fadd ;for _ ,_bege :=range _bedg .depthBand (_bdgg ,_bdgg +_bfcee ){_cdcd :=map[*textWord ]struct{}{};_agag :=_bedg ._dbf [_bege ];for _ ,_facga :=range _agag {if _ ,_ccec :=_cdcd [_facga ];_ccec {continue ;};for _ ,_badf :=range _agag {if _ ,_bbeg :=_cdcd [_badf ];_bbeg {continue ;};if _badf !=_facga &&_badf ._efdce ==_facga ._efdce &&_gc .Abs (_badf .Llx -_facga .Llx )< _bfcee &&_gc .Abs (_badf .Urx -_facga .Urx )< _bfcee &&_gc .Abs (_badf .Lly -_facga .Lly )< _bfcee &&_gc .Abs (_badf .Ury -_facga .Ury )< _bfcee {_cdcd [_badf ]=struct{}{};};};};if len (_cdcd )> 0{_efbe :=0;for _ ,_egfe :=range _agag {if _ ,_dcef :=_cdcd [_egfe ];!_dcef {_agag [_efbe ]=_egfe ;_efbe ++;};};_bedg ._dbf [_bege ]=_agag [:len (_agag )-len (_cdcd )];if len (_bedg ._dbf [_bege ])==0{delete (_bedg ._dbf ,_bege );};};};};};func _feeaa (_acaa int ,_fdbce func (int ,int )bool )[]int {_bdage :=make ([]int ,_acaa );for _dcba :=range _bdage {_bdage [_dcba ]=_dcba ;};_f .Slice (_bdage ,func (_gbgcb ,_bdgbg int )bool {return _fdbce (_bdage [_gbgcb ],_bdage [_bdgbg ])});return _bdage ;};func _ebdc (_ceee *_ba .ContentStreamOperation )(float64 ,error ){if len (_ceee .Params )!=1{_aedf :=_g .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_ceee .Operand ,1,len 
(_ceee .Params ),_ceee .Params );return 0.0,_aedf ;};return _eff .GetNumberAsFloat (_ceee .Params [0]);};
|
|
|
|
|
|
|
|
|
|
// String returns a string describing the current state of the textState stack.
|
|
|
|
|
// String lists the stack: a header line with the number of entries, followed
// by one tab-indented line per entry ("<nil>" for nil entries), joined with
// line feeds.
func (_gfgb *stateStack) String() string {
	lines := make([]string, 0, len(*_gfgb)+1)
	lines = append(lines, _ef.Sprintf("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064", len(*_gfgb)))
	for i, state := range *_gfgb {
		desc := "\u003c\u006e\u0069l\u003e"
		if state != nil {
			desc = state.String()
		}
		lines = append(lines, _ef.Sprintf("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073", i, desc))
	}
	return _af.Join(lines, "\u000a")
}

// appendWord adds `_afae` to the line, merges the word's rectangle into the
// line's rectangle via `_gafef`, and raises the line's `_dbcg` and `_edeb` to
// the word's `_feeg` and `_fadd` when those are larger.
func (_edefg *textLine) appendWord(_afae *textWord) {
	_edefg._bac = append(_edefg._bac, _afae)
	_edefg.PdfRectangle = _gafef(_edefg.PdfRectangle, _afae.PdfRectangle)
	if size := _afae._feeg; size > _edefg._dbcg {
		_edefg._dbcg = size
	}
	if depth := _afae._fadd; depth > _edefg._edeb {
		_edefg._edeb = depth
	}
}

// text returns everything writeText produces for this paragraph as a string.
func (_cagge *textPara) text() string {
	out := new(_fa.Buffer)
	_cagge.writeText(out)
	return out.String()
}

// setHorizScaling stores `_ceeb` in the text state's `_ffde` field.
// It is a no-op on a nil text object.
func (_defb *textObject) setHorizScaling(_ceeb float64) {
	if _defb != nil {
		_defb._cfeg._ffde = _ceeb
	}
}
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `state`.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// String formats the `_cfdb`, `_dda` and `_eecf` values of the text state
// together with the base font name of `_ecdd`, or a "[NOT SET]" marker when
// no font is set.
func (_cab *textState) String() string {
	fontName := "\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]"
	if font := _cab._ecdd; font != nil {
		fontName = font.BaseFont()
	}
	return _ef.Sprintf("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071", _cab._cfdb, _cab._dda, _cab._eecf, fontName)
}

// depth returns the `_edeb` value of the paragraph's first line, or the depth
// of its table when the paragraph has no lines.
func (_ecddd *textPara) depth() float64 {
	if len(_ecddd._feac) == 0 {
		return _ecddd._dbff.depth()
	}
	return _ecddd._feac[0]._edeb
}

// _ffac reports whether the two values differ by at most `_cggb`.
func _ffac(_gcgaa, _ecfca float64) bool {
	diff := _gc.Abs(_gcgaa - _ecfca)
	return diff <= _cggb
}

// _bfgdb reports whether the two points have exactly equal coordinates.
func _bfgdb(_dgba, _dbgac _fe.Point) bool {
	if _dgba.X != _dbgac.X {
		return false
	}
	return _dgba.Y == _dbgac.Y
}

// _gag returns the translation component of matrix `_dbg` as a point.
func _gag(_dbg _fe.Matrix) _fe.Point {
	tx, ty := _dbg.Translation()
	return _fe.Point{X: tx, Y: ty}
}

// _fabfd returns nil when a licensed key is installed or the package-level
// flag `_ff` is set. Otherwise it prints an unlicensed notice to standard
// output and returns a "license code required" error. The `_gfeff` argument
// is not used.
func _fabfd(_gfeff *PageText) error {
	key := _b.GetLicenseKey()
	licensed := key != nil && key.IsLicensed()
	if licensed || _ff {
		return nil
	}
	_ef.Printf("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a")
	_ef.Println("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f")
	return _g.New("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064")
}

// RenderMode flag values. Each constant is a distinct bit (1, 2, 4), so they
// can be combined with bitwise OR.
const (
	RenderModeStroke RenderMode = 1 << iota
	RenderModeFill
	RenderModeClip
)
|
|
|
|
|
|
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
func (_babf *TextMarkArray )BBox ()(_dg .PdfRectangle ,bool ){var _ged _dg .PdfRectangle ;_acc :=false ;for _ ,_egf :=range _babf ._ccbg {if _egf .Meta ||_efaa (_egf .Text ){continue ;};if _acc {_ged =_gafef (_ged ,_egf .BBox );}else {_ged =_egf .BBox ;_acc =true ;};};return _ged ,_acc ;};func _efgf (_bbdee []*subpath ){if _gdcbb < 0.0{return ;};if _fgda {_gb .Log .Info ("\u0067r\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020%\u0064\u0020\u0073\u0075\u0062\u0070\u0061\u0074\u0068\u0073",len (_bbdee ));};for _gded ,_gaed :=range _bbdee {for _edab ,_facc :=range _gaed ._dfdc {_gaed ._dfdc [_edab ]=_fe .Point {X :_cec (_facc .X ),Y :_cec (_facc .Y )};if _fgda {_cead :=_gaed ._dfdc [_edab ];if !_bfgdb (_facc ,_cead ){_gaadc :=_fe .Point {X :_cead .X -_facc .X ,Y :_cead .Y -_facc .Y };_ef .Printf ("\u0025\u0034\u0064\u0020\u002d\u0020\u0025\u0034\u0064\u003a \u0025\u002e\u0032\u0066\u0020\u2192\u0020%\u002e\u0032\u0066\u0020\u0028\u0025\u0067\u0029\u000a",_gded ,_edab ,_facc ,_cead ,_gaadc );};};};};};func (_fgbe paraList )findTextTables ()[]*textTable {var _efcgd []*textTable ;for _ ,_afbb :=range _fgbe {if _afbb .taken ()||_afbb .Width ()==0{continue ;};_dfcd :=_afbb .isAtom ();if _dfcd ==nil {continue ;};_dfcd .growTable ();if _dfcd ._acdaf *_dfcd ._dfdbe < _feca {continue ;};_dfcd .markCells ();_dfcd .log ("\u0067\u0072\u006fw\u006e");_efcgd =append (_efcgd ,_dfcd );};return _efcgd ;};func (_cade *textTable )growTable (){_gcegc :=func (_adeb paraList ){_cade ._dfdbe ++;for _ecccfa :=0;_ecccfa < _cade ._acdaf ;_ecccfa ++{_cbde :=_adeb [_ecccfa ];_cade .put (_ecccfa ,_cade ._dfdbe -1,_cbde );};};_adbfa :=func (_dbdd paraList ){_cade ._acdaf ++;for _ededg :=0;_ededg < _cade ._dfdbe ;_ededg ++{_adaf :=_dbdd [_ededg ];_cade .put (_cade ._acdaf -1,_ededg ,_adaf );};};for {_dffcd :=false ;_bfgf :=_cade .getDown ();_bgff :=_cade .getRight ();if _bfgf !=nil &&_bgff !=nil {_ebafb :=_bfgf [len (_bfgf )-1];if _ebafb !=nil &&!_ebafb ._fdde &&_ebafb ==_bgff 
[len (_bgff )-1]{_gcegc (_bfgf );if _bgff =_cade .getRight ();_bgff !=nil {_adbfa (_bgff );_cade .put (_cade ._acdaf -1,_cade ._dfdbe -1,_ebafb );};_dffcd =true ;};};if !_dffcd &&_bfgf !=nil {_gcegc (_bfgf );_dffcd =true ;};if !_dffcd &&_bgff !=nil {_adbfa (_bgff );_dffcd =true ;};if !_dffcd {break ;};};};func (_aeccc *shapesState )lastpointEstablished ()(_fe .Point ,bool ){if _aeccc ._egbb {return _aeccc ._bbede ,false ;};_dffc :=len (_aeccc ._dacd );if _dffc > 0&&_aeccc ._dacd [_dffc -1]._effe {return _aeccc ._dacd [_dffc -1].last (),false ;};return _fe .Point {},true ;};var _defad =_ad .MustCompile ("\u005c\u0064\u002b\u005c\u002e\u003f");func (_bgbe *textObject )getFontDirect (_acga string )(*_dg .PdfFont ,error ){_cfae ,_cfed :=_bgbe .getFontDict (_acga );if _cfed !=nil {return nil ,_cfed ;};_egff ,_cfed :=_dg .NewPdfFontFromPdfObject (_cfae );if _cfed !=nil {_gb .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_acga ,_cfed );};return _egff ,_cfed ;};
|
|
|
|
|
|
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
|
|
|
|
// CharcodeBytesToUnicode.
|
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
|
|
|
|
func (_gdc *Extractor )ExtractText ()(string ,error ){_ebgb ,_ ,_ ,_caf :=_gdc .ExtractTextWithStats ();return _ebgb ,_caf ;};func (_babb *stateStack )empty ()bool {return len (*_babb )==0};func _dgaf (_ccda ,_cbdc *textPara )bool {if _ccda ._dcge ||_cbdc ._dcge {return true ;};return _afda (_ccda .depth ()-_cbdc .depth ());};func (_gabc *subpath )last ()_fe .Point {return _gabc ._dfdc [len (_gabc ._dfdc )-1]};func (_agca paraList )findGridTables (_gbbdd []rulingList )[]*textTable {if _acba {_gb .Log .Info ("\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073",len (_agca ));for _bgfg ,_dbbf :=range _agca {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bgfg ,_dbbf );};};var _bcfd []*textTable ;for _fafc ,_ececa :=range _gbbdd {_accd ,_afef :=_agca .findTableGrid (_ececa );if _accd !=nil {_accd .log (_ef .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_fafc ));_bcfd =append (_bcfd ,_accd );_accd .markCells ();};if _afef !=nil {for _efede :=range _afef {_efede ._fdde =true ;};};};return _bcfd ;};func (_cagd *wordBag )text ()string {_cedf :=_cagd .allWords ();_faab :=make ([]string ,len (_cedf ));for _bcgb ,_fgb :=range _cedf {_faab [_bcgb ]=_fgb ._efdce ;};return _af .Join (_faab ,"\u0020");};func (_dfecde *textTable )put (_cegcb ,_aagd int ,_geba *textPara ){_dfecde ._cbabg [_facd (_cegcb ,_aagd )]=_geba ;};func (_ebfe *textObject )setTextRenderMode (_eca int ){if _ebfe ==nil {return ;};_ebfe ._cfeg ._ece =RenderMode (_eca );};func (_cadd *shapesState )cubicTo (_bcgf ,_agb ,_ccfg ,_dgdb ,_fddb ,_dfad float64 ){if _geeca {_gb .Log .Info ("\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a");};_cadd .addPoint (_fddb ,_dfad );};func (_gddc *shapesState )lineTo (_gbbd ,_dacg float64 ){if _geeca {_gb .Log .Info 
("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_gbbd ,_dacg ,_gddc .devicePoint (_gbbd ,_dacg ));};_gddc .addPoint (_gbbd ,_dacg );};
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `path`.
|
|
|
|
|
// String formats the subpath as its point count followed by its points.
// Subpaths with more than five points are abbreviated to the first two points
// and the last one.
func (_afd *subpath) String() string {
	pts := _afd._dfdc
	count := len(pts)
	if count > 5 {
		return _ef.Sprintf("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f", count, pts[0], pts[1], pts[count-1])
	}
	return _ef.Sprintf("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f", count, pts)
}
|
|
|
|
|
|
|
|
|
|
// ImageExtractOptions contains options for controlling image extraction from
|
|
|
|
|
// PDF pages.
|
|
|
|
|
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func (_ceb *shapesState )clearPath (){_ceb ._dacd =nil ;_ceb ._egbb =false ;if _geeca {_gb .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_ceb );};};func (_eeef *wordBag )empty (_aceb int )bool {_ ,_fead :=_eeef ._dbf [_aceb ];return !_fead };func (_cecd paraList )yNeighbours (_dacbd float64 )map[*textPara ][]int {_bcbg :=make ([]event ,2*len (_cecd ));if _dacbd ==0{for _fedf ,_aeecg :=range _cecd {_bcbg [2*_fedf ]=event {_aeecg .Lly ,true ,_fedf };_bcbg [2*_fedf +1]=event {_aeecg .Ury ,false ,_fedf };};}else {for _bgbeg ,_edfae :=range _cecd {_bcbg [2*_bgbeg ]=event {_edfae .Lly -_dacbd *_edfae .fontsize (),true ,_bgbeg };_bcbg [2*_bgbeg +1]=event {_edfae .Ury +_dacbd *_edfae .fontsize (),false ,_bgbeg };};};return _cecd .eventNeighbours (_bcbg );};func (_egbfe paraList )eventNeighbours (_cggc []event )map[*textPara ][]int {_f .Slice (_cggc ,func (_fece ,_dccc int )bool {_egcb ,_gfdf :=_cggc [_fece ],_cggc [_dccc ];_efde ,_gdcd :=_egcb ._bfgda ,_gfdf ._bfgda ;if _efde !=_gdcd {return _efde < _gdcd ;};if _egcb ._cddc !=_gfdf ._cddc {return _egcb ._cddc ;};return _fece < _dccc ;});_fcga :=make (map[int ]intSet );_bbdf :=make (intSet );for _ ,_abfgg :=range _cggc {if _abfgg ._cddc {_fcga [_abfgg ._acgee ]=make (intSet );for _afaf :=range _bbdf {if _afaf !=_abfgg ._acgee {_fcga [_abfgg ._acgee ].add (_afaf );_fcga [_afaf ].add (_abfgg ._acgee );};};_bbdf .add (_abfgg ._acgee );}else {_bbdf .del (_abfgg ._acgee );};};_dee :=map[*textPara ][]int {};for _agdec ,_abaf :=range _fcga {_dfca :=_egbfe [_agdec ];if len (_abaf )==0{_dee [_dfca ]=nil ;continue ;};_cafag :=make ([]int ,len (_abaf ));_eccb :=0;for _cace :=range _abaf {_cafag [_eccb ]=_cace ;_eccb ++;};_dee [_dfca ]=_cafag ;};return _dee ;};func _gcae (_ecfab *wordBag ,_deaa float64 ,_cgcd ,_bafc rulingList )[]*wordBag {var _eabg []*wordBag ;for _ ,_bffe :=range _ecfab .depthIndexes (){_abgg :=false ;for !_ecfab 
.empty (_bffe ){_bgdf :=_ecfab .firstReadingIndex (_bffe );_bgbcb :=_ecfab .firstWord (_bgdf );_cgca :=_dbd (_bgbcb ,_deaa ,_cgcd ,_bafc );_ecfab .removeWord (_bgbcb ,_bgdf );if _gccaf {_gb .Log .Info ("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073",_bgbcb .String ());};for _gcgb :=true ;_gcgb ;_gcgb =_abgg {_abgg =false ;_dadb :=_gac *_cgca ._bgd ;_cfec :=_badd *_cgca ._bgd ;_afbdc :=_bfegb *_cgca ._bgd ;if _gccaf {_gb .Log .Info ("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066",_cgca .minDepth (),_cgca .maxDepth (),_afbdc ,_cfec );};if _ecfab .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_cgca ,_fffa (_gbgg ,0),_cgca .minDepth ()-_afbdc ,_cgca .maxDepth ()+_afbdc ,_efbf ,false ,false )> 0{_abgg =true ;};if _ecfab .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_cgca ,_fffa (_gbgg ,_cfec ),_cgca .minDepth (),_cgca .maxDepth (),_eega ,false ,false )> 0{_abgg =true ;};if _abgg {continue ;};_bdeb :=_ecfab .scanBand ("",_cgca ,_fffa (_aecb ,_dadb ),_cgca .minDepth (),_cgca .maxDepth (),_faae ,true ,false );if _bdeb > 0{_fcbaf :=(_cgca .maxDepth ()-_cgca .minDepth ())/_cgca ._bgd ;if (_bdeb > 1&&float64 (_bdeb )> 0.3*_fcbaf )||_bdeb <=10{if _ecfab .scanBand ("\u006f\u0074\u0068e\u0072",_cgca ,_fffa (_aecb ,_dadb ),_cgca .minDepth (),_cgca .maxDepth (),_faae ,false ,true )> 0{_abgg =true ;};};};};_eabg =append (_eabg ,_cgca );};};return _eabg ;};type intSet map[int ]struct{};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `p`.
|
|
|
|
|
func (_fafeb *textPara )String ()string {_bgfc :="";if _fafeb ._dbff !=nil {_bgfc =_ef .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_fafeb ._dbff ._acdaf ,_fafeb ._dbff ._dfdbe );};return _ef .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_fafeb .PdfRectangle ,_bgfc ,len (_fafeb ._feac ),_cgdcd (_fafeb .text (),50));};func _gecbe (_ccfba float64 )bool {return _gc .Abs (_ccfba )< _eeea };func (_agcg paraList )toTextMarks ()[]TextMark {_adac :=0;var _abfgc []TextMark ;for _ecae ,_ffcc :=range _agcg {if _ffcc ._dcge {continue ;};_befgf :=_ffcc .toTextMarks (&_adac );_abfgc =append (_abfgc ,_befgf ...);if _ecae !=len (_agcg )-1{if _dgaf (_ffcc ,_agcg [_ecae +1]){_abfgc =_ddae (_abfgc ,&_adac ,"\u0020");}else {_abfgc =_ddae (_abfgc ,&_adac ,"\u000a");_abfgc =_ddae (_abfgc ,&_adac ,"\u000a");};};};_abfgc =_ddae (_abfgc ,&_adac ,"\u000a");_abfgc =_ddae (_abfgc ,&_adac ,"\u000a");return _abfgc ;};func (_aaf *textObject )reset (){_aaf ._fac =_fe .IdentityMatrix ();_aaf ._bfeb =_fe .IdentityMatrix ();_aaf ._cgc =nil ;};func (_dbgfg rulingList )blocks (_ebad ,_bdba *ruling )bool {if _ebad ._ecfg > _bdba ._gbbb ||_bdba ._ecfg > _ebad ._gbbb {return false ;};_cegc :=_gc .Max (_ebad ._ecfg ,_bdba ._ecfg );_dcaf :=_gc .Min (_ebad ._gbbb ,_bdba ._gbbb );if _ebad ._dbdb > _bdba ._dbdb {_ebad ,_bdba =_bdba ,_ebad ;};for _ ,_cfdc :=range _dbgfg {if _ebad ._dbdb <=_cfdc ._dbdb +_eeea &&_cfdc ._dbdb <=_bdba ._dbdb +_eeea &&_cfdc ._ecfg <=_dcaf &&_cegc <=_cfdc ._gbbb {return true ;};};return false ;};type textWord struct{_dg .PdfRectangle ;_fadd float64 ;_efdce string ;_degbg []*textMark ;_feeg float64 ;_gcgcb bool ;};func (_dcbcf rulingList )cells ()(int ,int ,[]_dg .PdfRectangle ){_dcbcf .sortStrict ();_beda ,_gega :=_dcbcf .vertsHorzs ();_ecefg :=len (_beda )-1;_fbaf :=len (_gega )-1;if _fgda {_gb .Log .Info 
("\u0072\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002ec\u0065\u006c\u006c\u0073\u003a\u0020\u0076\u0065\u0072\u0074s\u003d\u0025\u0064",len (_beda ));for _bacg ,_ecddb :=range _beda {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bacg ,_ecddb );};_gb .Log .Info ("\u0072\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002ec\u0065\u006c\u006c\u0073\u003a\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064",len (_gega ));for _efaba ,_ddgf :=range _gega {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_efaba ,_ddgf );};_gb .Log .Info ("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0063\u0065\u006cl\u0073\u003a\u0020\u0076\u0065\u0063\u0073=\u0025\u0064\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078%\u0064",len (_dcbcf ),_ecefg ,_fbaf );};_cegf :=make ([]_dg .PdfRectangle ,_ecefg *_fbaf );for _bddfb :=0;_bddfb < _fbaf ;_bddfb ++{_ffbfg :=_gega [_bddfb ]._dbdb ;_dgcd :=_gega [_bddfb +1]._dbdb ;for _fgafc :=0;_fgafc < _ecefg ;_fgafc ++{_bcbb :=_beda [_fgafc ]._dbdb ;_aggf :=_beda [_fgafc +1]._dbdb ;_cegf [_bddfb *_ecefg +_fgafc ]=_dg .PdfRectangle {Llx :_bcbb ,Urx :_aggf ,Lly :_ffbfg ,Ury :_dgcd };};};return _ecefg ,_fbaf ,_cegf ;};func (_dgbb *textPara )writeCellText (_daaf _d .Writer ){for _gfdg ,_aedfc :=range _dgbb ._feac {_badae :=_aedfc .text ();_cbgdg :=_adfc &&_aedfc .endsInHyphen ()&&_gfdg !=len (_dgbb ._feac )-1;if _cbgdg {_badae =_cfee (_badae );};_daaf .Write ([]byte (_badae ));if !(_cbgdg ||_gfdg ==len (_dgbb ._feac )-1){_daaf .Write ([]byte (_acbcd (_aedfc ._edeb ,_dgbb ._feac [_gfdg +1]._edeb )));};};};func _agfc (_bcdb ,_aeadf ,_bbegb ,_cdea *textPara )*textTable {_dbeg :=&textTable {_acdaf :2,_dfdbe :2,_cbabg :make (map[uint64 ]*textPara ,4)};_dbeg .put (0,0,_bcdb );_dbeg .put (1,0,_aeadf );_dbeg .put (0,1,_bbegb );_dbeg .put (1,1,_cdea );return _dbeg ;};func (_dea *shapesState )quadraticTo (_dedfa ,_egc ,_ecaa ,_egbg float64 ){if _geeca {_gb .Log .Info 
("\u0071\u0075\u0061d\u0072\u0061\u0074\u0069\u0063\u0054\u006f\u003a");};_dea .addPoint (_ecaa ,_egbg );};func (_fdfb *textTable )get (_cddf ,_beeef int )*textPara {return _fdfb ._cbabg [_facd (_cddf ,_beeef )]};func _fcbfg (_
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
// String returns a short description of the array: the number of marks plus
// the first and the last mark, or a fixed placeholder when there are none.
func (_egd TextMarkArray) String() string {
	marks := _egd._ccbg
	if len(marks) == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	return _ef.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", len(marks), marks[0], marks[len(marks)-1])
}

// textResult bundles a PageText with two integer values.
type textResult struct {
	_fef  PageText
	_cfa  int
	_gcdd int
}

// _gbgg reports whether the x-extent of `_aee` overlaps the x-extent of
// `_fbad` after the latter has been widened by `_cbce` on both sides.
func _gbgg(_fbad *wordBag, _aee *textWord, _cbce float64) bool {
	if !(_aee.Llx < _fbad.Urx+_cbce) {
		return false
	}
	return _fbad.Llx-_cbce < _aee.Urx
}

// text concatenates the `_efdce` strings of the line's words, inserting a
// single space before every word whose `_gcgcb` flag is set.
func (_fcbg *textLine) text() string {
	var sb _af.Builder
	for _, word := range _fcbg._bac {
		if word._gcgcb {
			sb.WriteString("\u0020")
		}
		sb.WriteString(word._efdce)
	}
	return sb.String()
}

// comp is a less-function over the elements at indexes `_gagfd` and `_ceagc`.
// Elements are ordered by descending `_eddf` first. Within the same `_eddf`
// they are compared on `_dbdb`, then `_ecfg`, then `_gbbb`; every one of those
// comparisons is inverted unless `_eddf` equals `_daea`. Two elements whose
// `_eddf` is `_decfa` never compare as less.
func (_aedb rulingList) comp(_gagfd, _ceagc int) bool {
	a, b := _aedb[_gagfd], _aedb[_ceagc]
	kind := a._eddf
	if kind != b._eddf {
		return kind > b._eddf
	}
	if kind == _decfa {
		return false
	}
	// invert is true for every kind other than `_daea`.
	invert := kind != _daea
	switch {
	case a._dbdb != b._dbdb:
		return (a._dbdb > b._dbdb) != invert
	case a._ecfg != b._ecfg:
		return (a._ecfg < b._ecfg) != invert
	default:
		return (a._gbbb < b._gbbb) != invert
	}
}

// extractContentStreamImages parses the content stream `_gfae` and runs a
// content stream processor over it with processOperand registered for all
// operands. The `_ggf` cache and the `_ffe` options are lazily initialised
// first. Parse and processing errors are returned unchanged.
func (_dd *imageExtractContext) extractContentStreamImages(_gfae string, _ebg *_dg.PdfPageResources) error {
	ops, err := _ba.NewContentStreamParser(_gfae).Parse()
	if err != nil {
		return err
	}
	if _dd._ggf == nil {
		_dd._ggf = map[*_eff.PdfObjectStream]*cachedImage{}
	}
	if _dd._ffe == nil {
		_dd._ffe = &ImageExtractOptions{}
	}
	processor := _ba.NewContentStreamProcessor(*ops)
	processor.AddHandler(_ba.HandlerConditionEnumAllOperands, "", _dd.processOperand)
	return processor.Process(_ebg)
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `v`.
|
|
|
|
|
// String formats the ruling as "<kind> a=<_dbdb> b=<_ecfg> - <_gbbb> (<length>)".
// The axis labels a/b are "y"/"x" when `_eddf` is `_daea` and "x"/"y"
// otherwise. A ruling whose `_eddf` is `_decfa` yields a fixed placeholder.
func (_gadac *ruling) String() string {
	switch _gadac._eddf {
	case _decfa:
		return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047"
	}
	first, second := "\u0078", "\u0079"
	if _gadac._eddf == _daea {
		first, second = "\u0079", "\u0078"
	}
	length := _gadac._gbbb - _gadac._ecfg
	return _ef.Sprintf("\u0025\u0031\u0030\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d \u0025\u0036\u002e\u0032\u0066 \u0028\u00256\u002e\u0032\u0066\u0029", _gadac._eddf, first, _gadac._dbdb, second, _gadac._ecfg, _gadac._gbbb, length)
}

// xMean returns the mean of the X coordinates of the two points `_fcgb` and `_bcc`.
func (_fcac lineRuling) xMean() float64 {
	sum := _fcac._fcgb.X + _fcac._bcc.X
	return 0.5 * sum
}

// _cec rounds `_cdabg` to the nearest multiple of `_gdcbb`.
func _cec(_cdabg float64) float64 {
	steps := _gc.Round(_cdabg / _gdcbb)
	return _gdcbb * steps
}

// _babdc builds a ruling of kind `_daea` from the top edge of `_decb`:
// `_dbdb` is the rectangle's Ury and `_ecfg`..`_gbbb` span Llx..Urx.
func _babdc(_decb _dg.PdfRectangle) *ruling {
	r := new(ruling)
	r._eddf = _daea
	r._dbdb = _decb.Ury
	r._ecfg = _decb.Llx
	r._gbbb = _decb.Urx
	return r
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `t`.
|
|
|
|
|
func (_fdefe *textTable )String ()string {return _ef .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_fdefe ._acdaf ,_fdefe ._dfdbe ,_fdefe ._ggbf );};const _edcb =1.0/1000.0;func _eaed (_effg []int )[]int {_afcd :=make ([]int ,len (_effg ));for _ggdde ,_efad :=range _effg {_afcd [len (_effg )-1-_ggdde ]=_efad ;};return _afcd ;};type rulingList []*ruling ;func _fccb (_ccbe string )(string ,bool ){_bfga :=[]rune (_ccbe );if len (_bfga )!=1{return "",false ;};_bfdf ,_bced :=_cbae [_bfga [0]];return _bfdf ,_bced ;};func _bfeeb (_daafb []*subpath )[]rulingList {_efgf (_daafb );var _ggbc rulingList ;for _ ,_cfecc :=range _daafb {if len (_cfecc ._dfdc )< 2{continue ;};_efca :=_cfecc ._dfdc [0];for _ ,_bcef :=range _cfecc ._dfdc [1:]{if _fbffd ,_aced :=_addg (_efca ,_bcef );_aced {_ggbc =append (_ggbc ,_fbffd );};_efca =_bcef ;};};_ggbc =_ggbc .tidied ("\u0073t\u0072\u006f\u006b\u0065\u0073");return _ggbc .toGrids ();};func (_fdbg *textLine )toTextMarks (_aafb *int )[]TextMark {var _fabc []TextMark ;for _ ,_dafe :=range _fdbg ._bac {if _dafe ._gcgcb {_fabc =_ddae (_fabc ,_aafb ,"\u0020");};_dgdag :=_dafe .toTextMarks (_aafb );_fabc =append (_fabc ,_dgdag ...);};return _fabc ;};func (_dccb *textTable )isExportable ()bool {_gebg :=func (_adfe int )bool {_fege :=_dccb .get (0,_adfe );_acdg :=_fege .text ();_edcc :=_e .RuneCountInString (_acdg );_fgbb :=_defad .MatchString (_acdg );return _edcc <=1||_fgbb ;};for _eddfg :=0;_eddfg < _dccb ._dfdbe ;_eddfg ++{if !_gebg (_eddfg ){return true ;};};return false ;};func _cbaf (_egab ,_bdec _dg .PdfRectangle )bool {return _bdec .Llx <=_egab .Urx &&_egab .Llx <=_bdec .Urx ;};func (_ccb *textObject )setTextRise (_febc float64 ){if _ccb ==nil {return ;};_ccb ._cfeg ._begf =_febc ;};var _ff =false ;func (_acbgb *textTable )getDown ()paraList {_cdfd :=make (paraList ,_acbgb ._acdaf );for _ddgb :=0;_ddgb < _acbgb ._acdaf ;_ddgb ++{_cgb :=_acbgb .get (_ddgb ,_acbgb ._dfdbe -1)._caff ;if _cgb ==nil ||_cgb ._fdde 
{return nil ;};_cdfd [_ddgb ]=_cgb ;};for _ecbf :=0;_ecbf < _acbgb ._acdaf -1;_ecbf ++{if _cdfd [_ecbf ]._cbb !=_cdfd [_ecbf +1]{return nil ;};};return _cdfd ;};
|
|
|
|
|
|
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial information:
|
|
|
|
|
// display position and size.
|
|
|
|
|
type PageImages struct{Images []ImageMark ;};func (_egggc paraList )llyRange (_ffbf []int ,_fadf ,_ddbd float64 )[]int {_dacae :=len (_egggc );if _ddbd < _egggc [_ffbf [0]].Lly ||_fadf > _egggc [_ffbf [_dacae -1]].Lly {return nil ;};_afdde :=_f .Search (_dacae ,func (_begbb int )bool {return _egggc [_ffbf [_begbb ]].Lly >=_fadf });_bedd :=_f .Search (_dacae ,func (_faed int )bool {return _egggc [_ffbf [_faed ]].Lly > _ddbd });return _ffbf [_afdde :_bedd ];};func (_gbd *Extractor )extractPageText (_ffa string ,_dfb *_dg .PdfPageResources ,_efa _fe .Matrix ,_dba int )(*PageText ,int ,int ,error ){_gb .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_dba );_cad :=&PageText {_dfgg :_gbd ._aff };_cag :=_gbf (_gbd ._aff );var _gee stateStack ;_ccc :=_agdb (_gbd ,_dfb ,_ba .GraphicsState {},&_cag ,&_gee );_egb :=shapesState {_fbaa :_efa ,_bcgc :_fe .IdentityMatrix ()};var _dgd bool ;if _dba > _ddf {_dfba :=_g .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_dba ,_dfba );return _cad ,_cag ._fbga ,_cag ._bdgf ,_dfba ;};_dec :=_ba .NewContentStreamParser (_ffa );_bef ,_dfg :=_dec .Parse ();if _dfg !=nil {_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dfg );return _cad ,_cag ._fbga ,_cag ._bdgf ,_dfg ;};_eeg :=_ba .NewContentStreamProcessor (*_bef );_eeg .AddHandler (_ba .HandlerConditionEnumAllOperands ,"",func (_bbag *_ba 
.ContentStreamOperation ,_gdbe _ba .GraphicsState ,_gfe *_dg .PdfPageResources )error {_ac :=_bbag .Operand ;if _ccdf {_gb .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_bbag );};switch _ac {case "\u0071":if _geeca {_gb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_egb ._bcgc );};_gee .push (&_cag );case "\u0051":if !_gee .empty (){_cag =*_gee .top ();if len (_gee )>=2{_gee .pop ();};};_egb ._bcgc =_gdbe .CTM ;if _geeca {_gb .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_egb ._bcgc );};case "\u0042\u0054":if _dgd {_gb .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");_cad ._abcb =append (_cad ._abcb ,_ccc ._cgc ...);};_dgd =true ;_bbd :=_gdbe ;_bbd .CTM =_efa .Mult (_bbd .CTM );_ccc =_agdb (_gbd ,_gfe ,_bbd ,&_cag ,&_gee );case "\u0045\u0054":if !_dgd {_gb .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");};_dgd =false ;_cad ._abcb =append (_cad ._abcb ,_ccc ._cgc ...);_ccc .reset ();case "\u0054\u002a":_ccc .nextLine ();case "\u0054\u0064":if _ce ,_gbbg :=_ccc .checkOp (_bbag ,2,true );!_ce {_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gbbg );return _gbbg ;};_bgg ,_gfad ,_fde :=_fddfb (_bbag .Params );if _fde !=nil {return _fde ;};_ccc .moveText (_bgg ,_gfad );case "\u0054\u0044":if _gcdb ,_ggd :=_ccc .checkOp (_bbag ,2,true );!_gcdb {_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ggd );return _ggd ;};_acb ,_bgb ,_adf :=_fddfb (_bbag .Params );if _adf !=nil {_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_adf );return _adf ;};_ccc .moveTextSetLeading (_acb ,_bgb );case "\u0054\u006a":if _feb ,_aad 
:=_ccc .checkOp (_bbag ,1,true );!_feb {_gb .Log .Debug ("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006
|
|
|
|
|
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
// All of its fields are unexported.
type Extractor struct {
	_eb  string
	_gg  *_dg.PdfPageResources
	_aff _dg.PdfRectangle // copied into PageText._dfgg by extractPageText
	_dc  map[string]fontEntry
	_fb  map[string]textResult
	_da  int64
	_c   int
}
|
|
|
|
|
|
|
|
|
|
// TextTable represents a table.
|
|
|
|
|
// Cells are ordered top-to-bottom, left-to-right.
|
|
|
|
|
// Cells[y] is the (0-offset) y'th row in the table.
|
|
|
|
|
// Cells[y][x] is the (0-offset) x'th column in the table.
|
|
|
|
|
type TextTable struct{W ,H int ;Cells [][]TableCell ;};func (_dfdd rulingList )isActualGrid ()bool {_edgb ,_cebf :=_dfdd .vertsHorzs ();if _fgda {_gb .Log .Info ("\u0069\u0073A\u0063\u0074\u0075\u0061\u006cG\u0072\u0069\u0064\u003a\u0020n\u0075\u006d\u0056\u0065\u0072\u0074\u003d\u0025\u0064\u0020\u006e\u0075\u006d\u0048\u006f\u0072\u007a\u003d\u0025\u0064\u0020\u003a\u0020\u0025\u0074\u0020\u0026\u0020\u0025\u0074\u0020\u2192\u0020\u0025\u0074",len (_edgb ),len (_cebf ),len (_edgb )>=2,len (_cebf )>=2,len (_edgb )>=2&&len (_cebf )>=2);for _fffab ,_fdbb :=range _dfdd {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a",_fffab ,_fdbb );};};if !(len (_edgb )>=_cbag +1&&len (_cebf )>=_ccfdg +1){return false ;};if !(_edgb .aligned ()&&_cebf .aligned ()){return false ;};_abac ,_gebbd :=_edgb [0],_edgb [len (_edgb )-1];_ddbc ,_abcec :=_cebf [0],_cebf [len (_cebf )-1];return _gecbe (_abac ._dbdb -_ddbc ._ecfg )&&_gecbe (_gebbd ._dbdb -_ddbc ._gbbb )&&_gecbe (_ddbc ._dbdb -_abac ._gbbb )&&_gecbe (_abcec ._dbdb -_abac ._ecfg );};func (_eecfa *textWord )toTextMarks (_defbc *int )[]TextMark {var _dfdff []TextMark ;for _ ,_cbgeg :=range _eecfa ._degbg {_dfdff =_bedb (_dfdff ,_defbc ,_cbgeg .ToTextMark ());};return _dfdff ;};func _cbef (_eaf ,_adda bounded )float64 {_dgfg :=_fddga (_eaf ,_adda );if !_afda (_dgfg ){return _dgfg ;};return _edef (_eaf ,_adda );};type bounded interface{bbox ()_dg .PdfRectangle };func _efaa (_ecdcb string )bool {for _ ,_gdac :=range _ecdcb {if !_gf .IsSpace (_gdac ){return false ;};};return true ;};func (_gacf *textMark )inDiacriticArea (_abffd *textMark )bool {_ecac :=_gacf .Llx -_abffd .Llx ;_ddfbg :=_gacf .Urx -_abffd .Urx ;_edg :=_gacf .Lly -_abffd .Lly ;return _gc .Abs (_ecac +_ddfbg )< _gacf .Width ()*_begb &&_gc .Abs (_edg )< _gacf .Height ()*_begb ;};func (_cbcb *textPara )bbox ()_dg .PdfRectangle {return _cbcb .PdfRectangle };func (_fcfeg intSet )del (_adga int ){delete (_fcfeg ,_adga )};func _facd (_ebdd ,_fddge int )uint64 
{return uint64 (_ebdd )*0x1000000+uint64 (_fddge )};type textObject struct{_abf *Extractor ;_dcga *_dg .PdfPageResources ;_fafd _ba .GraphicsState ;_cfeg *textState ;_cgg *stateStack ;_fac _fe .Matrix ;_bfeb _fe .Matrix ;_cgc []*textMark ;_gabd bool ;};
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
// String returns a one-line description of the mark: offset, quoted text, the
// text's runes in hex, the bounding box corners, the font description
// (cut to 50 bytes plus an ellipsis when longer) and a marker when Meta is set.
func (_bbf TextMark) String() string {
	fontDesc := ""
	if _bbf.Font != nil {
		if fontDesc = _bbf.Font.String(); len(fontDesc) > 50 {
			fontDesc = fontDesc[:50] + "\u002e\u002e\u002e"
		}
	}
	metaTag := ""
	if _bbf.Meta {
		metaTag = "\u0020\u002a\u004d\u002a"
	}
	box := _bbf.BBox
	return _ef.Sprintf("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d", _bbf.Offset, _bbf.Text, []rune(_bbf.Text), box.Llx, box.Lly, box.Urx, box.Ury, fontDesc, metaTag)
}

// bbox returns the mark's embedded PdfRectangle.
func (_ddffd *textMark) bbox() _dg.PdfRectangle {
	return _ddffd.PdfRectangle
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `l`.
|
|
|
|
|
func (_dedg *textLine )String ()string {return _ef .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_dedg ._edeb ,_dedg .PdfRectangle ,_dedg ._dbcg ,_dedg .text ());};func (_dcfe *stateStack )pop ()*textState {if _dcfe .empty (){return nil ;};_ede :=*(*_dcfe )[len (*_dcfe )-1];*_dcfe =(*_dcfe )[:len (*_dcfe )-1];return &_ede ;};func (_fddg *shapesState )newSubPath (){_fddg .clearPath ();if _geeca {_gb .Log .Info ("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073",_fddg );};};func (_ecbb *textPara )taken ()bool {return _ecbb ==nil ||_ecbb ._fdde };func (_bdc *textObject )getFillColor ()_afe .Color {return _agdbg (_bdc ._fafd .ColorspaceNonStroking ,_bdc ._fafd .ColorNonStroking );};type rulingKind int ;func (_adde *textObject )moveText (_cfe ,_gcg float64 ){_adde .moveLP (_cfe ,_gcg )};func (_dad *textObject )setTextLeading (_bcfc float64 ){if _dad ==nil {return ;};_dad ._cfeg ._bbda =_bcfc ;};func (_ggeg *textPara )writeText (_gdfg _d .Writer ){if _ggeg ._dbff ==nil {_ggeg .writeCellText (_gdfg );return ;};for _gfda :=0;_gfda < _ggeg ._dbff ._dfdbe ;_gfda ++{for _cfdf :=0;_cfdf < _ggeg ._dbff ._acdaf ;_cfdf ++{_beb :=_ggeg ._dbff .get (_cfdf ,_gfda );if _beb ==nil {_gdfg .Write ([]byte ("\u0009"));}else {_beb .writeCellText (_gdfg );};_gdfg .Write ([]byte ("\u0020"));};if _gfda < _ggeg ._dbff ._dfdbe -1{_gdfg .Write ([]byte ("\u000a"));};};};func _edef (_fafb ,_ddfdd bounded )float64 {return _ffab (_fafb )-_ffab (_ddfdd )};func _cgea (_afca map[int ]intSet )[]int {_eebbd :=make ([]int ,0,len (_afca ));for _fefbc :=range _afca {_eebbd =append (_eebbd ,_fefbc );};_f .Ints (_eebbd );return _eebbd ;};type event struct{_bfgda float64 ;_cddc bool ;_acgee int ;};func (_fccd *wordBag )depthBand (_dgdfc ,_aead float64 )[]int {if len (_fccd ._dbf )==0{return nil ;};return _fccd .depthRange (_fccd .getDepthIdx (_dgdfc 
),_fccd .getDepthIdx (_aead ));};func (_cg *imageExtractContext )extractFormImages (_dddc *_eff .PdfObjectName ,_gaf _ba .GraphicsState ,_ded *_dg .PdfPageResources )error {_fdg ,_eag :=_ded .GetXObjectFormByName (*_dddc );if _eag !=nil {return _eag ;};if _fdg ==nil {return nil ;};_eeb ,_eag :=_fdg .GetContentStream ();if _eag !=nil {return _eag ;};_fcb :=_fdg .Resources ;if _fcb ==nil {_fcb =_ded ;};_eag =_cg .extractContentStreamImages (string (_eeb ),_fcb );if _eag !=nil {return _eag ;};_cg ._ge ++;return nil ;};func _dabf (_cac _fe .Point )*subpath {return &subpath {_dfdc :[]_fe .Point {_cac }}};func _ecdc (_fdag _dg .PdfRectangle )*ruling {return &ruling {_eddf :_dbga ,_dbdb :_fdag .Urx ,_ecfg :_fdag .Lly ,_gbbb :_fdag .Ury };};func (_gagd paraList )readBefore (_abdf []int ,_dfff ,_gfbb int )bool {_ffee ,_egea :=_gagd [_dfff ],_gagd [_gfbb ];if _fbadb (_ffee ,_egea )&&_ffee .Lly > _egea .Lly {return true ;};if !(_ffee ._ageda .Urx < _egea ._ageda .Llx ){return false ;};_gdf ,_afdg :=_ffee .Lly ,_egea .Lly ;if _gdf > _afdg {_afdg ,_gdf =_gdf ,_afdg ;};_fgge :=_gc .Max (_ffee ._ageda .Llx ,_egea ._ageda .Llx );_eefa :=_gc .Min (_ffee ._ageda .Urx ,_egea ._ageda .Urx );_ffggc :=_gagd .llyRange (_abdf ,_gdf ,_afdg );for _ ,_cgag :=range _ffggc {if _cgag ==_dfff ||_cgag ==_gfbb {continue ;};_gbgd :=_gagd [_cgag ];if _gbgd ._ageda .Llx <=_eefa &&_fgge <=_gbgd ._ageda .Urx {return false ;};};return true ;};func _addg (_becgc ,_gcgdd _fe .Point )(*ruling ,bool ){_eda :=lineRuling {_fcgb :_becgc ,_bcc :_gcgdd ,_ffec :_cfbd (_becgc ,_gcgdd )};if _eda ._ffec ==_decfa {return nil ,false ;};return _eda .asRuling ();};type textMark struct{_dg .PdfRectangle ;_eggf int ;_daca string ;_daeg string ;_acea *_dg .PdfFont ;_bbdg float64 ;_caea float64 ;_afg _fe .Matrix ;_bgcd _fe .Point ;_fdf _dg .PdfRectangle ;_caec _afe .Color ;_dfce _afe .Color ;};func (_fcde *wordBag )arrangeText ()*textPara {_fcde .sort ();if _ddgd {_fcde .removeDuplicates ();};var _dbcc []*textLine ;for _ 
,_fcdg :=range _fcde .depthIndexes (){for !_fcde .empty (_fcdg ){_bfdgf :=_fcde .firstReadingIndex (_fcd
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `ss`.
|
|
|
|
|
// String reports the number of stored subpaths and the value of the `_egbb` flag.
func (_cef *shapesState) String() string {
	count := len(_cef._dacd)
	return _ef.Sprintf("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d", count, _cef._egbb)
}

// getRight collects, for each row, the `_cbb` link of the cell in the last
// column. It returns nil if any of those links is nil or has `_fdde` set, or
// if the collected paragraphs are not chained to each other through `_caff`.
func (_dcad *textTable) getRight() paraList {
	right := make(paraList, _dcad._dfdbe)
	lastCol := _dcad._acdaf - 1
	for row := range right {
		next := _dcad.get(lastCol, row)._cbb
		if next == nil || next._fdde {
			return nil
		}
		right[row] = next
	}
	for row := 1; row < len(right); row++ {
		if right[row-1]._caff != right[row] {
			return nil
		}
	}
	return right
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a string describing `pt`.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// String lists every element of `_abcb` on its own line, framed by a header
// line prefixed with "-" and an identical footer line prefixed with "+".
func (_fbed PageText) String() string {
	summary := _ef.Sprintf("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073", len(_fbed._abcb))
	lines := make([]string, 0, len(_fbed._abcb)+2)
	lines = append(lines, "\u002d"+summary)
	for i := range _fbed._abcb {
		lines = append(lines, _fbed._abcb[i].String())
	}
	lines = append(lines, "\u002b"+summary)
	return _af.Join(lines, "\u000a")
}

// _cdfc rounds `_ecbd` to the nearest multiple of `_feade` and returns it as
// an int. A zero `_feade` is treated as 1.
func _cdfc(_ecbd float64, _feade int) int {
	step := float64(_feade)
	if _feade == 0 {
		step = float64(1)
	}
	return int(_gc.Round(_ecbd/step) * step)
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `b`.
|
|
|
|
|
// String returns the bag's bounding box, its `_bgd` value, the number of words
// and the words' `_efdce` strings, visited in depthIndexes() order.
func (_bega *wordBag) String() string {
	var _fcd []string
	for _, _adca := range _bega.depthIndexes() {
		// Indexing a missing key yields a nil slice, so no presence check is needed.
		for _, _gbee := range _bega._dbf[_adca] {
			_fcd = append(_fcd, _gbee._efdce)
		}
	}
	return _ef.Sprintf("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071", _bega.PdfRectangle, _bega._bgd, len(_fcd), _fcd)
}

// yDelta returns the absolute difference between the Y coordinates of the
// ruling's two points `_fcgb` and `_bcc`.
//
// The previous implementation subtracted `_bcc.Y` from itself and therefore
// always returned 0 for finite coordinates.
func (_dcbca lineRuling) yDelta() float64 {
	return _gc.Abs(_dcbca._bcc.Y - _dcbca._fcgb.Y)
}

// close appends the first point again unless `_bfgdb` reports that the last
// point already matches it, sets the `_effe` flag and removes duplicate
// points. The subpath must contain at least one point.
func (_edff *subpath) close() {
	if !_bfgdb(_edff._dfdc[0], _edff.last()) {
		_edff.add(_edff._dfdc[0])
	}
	_edff._effe = true
	_edff.removeDuplicates()
}
|
2020-09-28 23:18:17 +00:00
|
|
|
|
|
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
func (_abe PageText )ToText ()string {return _abe .Text ()};func (_bfadd rulingList )toGrids ()[]rulingList {if len (_bfadd )==0{return nil ;};_eabc :=_bfadd .intersections ();if _fgda {_gb .Log .Info ("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020",len (_bfadd ),len (_eabc ));for _ ,_feaa :=range _cgea (_eabc ){_ef .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_feaa ,_eabc [_feaa ]);};};_fabf :=make (map[int ]intSet ,len (_bfadd ));for _fbcg :=range _bfadd {_edbgd :=_bfadd .connections (_eabc ,_fbcg );if len (_edbgd )> 0{_fabf [_fbcg ]=_edbgd ;};};if _fgda {_gb .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064",len (_fabf ));for _ ,_dfed :=range _cgea (_fabf ){_ef .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_dfed ,_fabf [_dfed ]);};};_egdfe :=_feeaa (len (_bfadd ),func (_egcg ,_gagf int )bool {_gbcb ,_bece :=len (_fabf [_egcg ]),len (_fabf [_gagf ]);if _gbcb !=_bece {return _gbcb > _bece ;};return _bfadd .comp (_egcg ,_gagf );});if _fgda {_gb .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076",_egdfe );};_ffgd :=[][]int {{_egdfe [0]}};_fgca :for _ ,_ceac :=range _egdfe [1:]{for _gcgcf ,_gbggd :=range _ffgd {for _ ,_bdcg :=range _gbggd {if _fabf [_bdcg ].has (_ceac ){_ffgd [_gcgcf ]=append (_gbggd ,_ceac );continue _fgca ;};};};_ffgd =append (_ffgd ,[]int {_ceac });};if _fgda {_gb .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076",_ffgd );};_f .SliceStable (_ffgd ,func (_dfbd ,_dgafb int )bool {return len (_ffgd [_dfbd ])> len (_ffgd [_dgafb ])});for _ ,_cbcec :=range _ffgd {_f .Slice (_cbcec ,func (_faggd ,_cbgf int )bool {return _bfadd .comp (_cbcec [_faggd ],_cbcec [_cbgf ])});};_cgdcf :=make ([]rulingList ,len (_ffgd ));for _gace ,_dcae :=range 
_ffgd {_agac :=make (rulingList ,len (_dcae ));for _fcedc ,_bfge :=range _dcae {_agac [_fcedc ]=_bfadd [_bfge ];};_cgdcf [_gace ]=_agac ;};if _fgda {_gb .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069d\u0073\u003d\u0025\u0076",_cgdcf );};var _dcbgd []rulingList ;for _ ,_beca :=range _cgdcf {if _beca .isActualGrid (){_dcbgd =append (_dcbgd ,_beca );};};if _fgda {_gb .Log .Info ("\u0074\u006f\u0047ri\u0064\u0073\u003a\u0020\u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0076",_dcbgd );_gb .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064",len (_cgdcf ),len (_dcbgd ));};return _dcbgd ;};func (_gabbc *textObject )moveTextSetLeading (_agd ,_abde float64 ){_gabbc ._cfeg ._bbda =-_abde ;_gabbc .moveLP (_agd ,_abde );};func (_bbdcgb *textWord )computeText ()string {_geee :=make ([]string ,len (_bbdcgb ._degbg ));for _cbgg ,_aeb :=range _bbdcgb ._degbg {_geee [_cbgg ]=_aeb ._daca ;};return _af .Join (_geee ,"");};func (_faef lineRuling )yMean ()float64 {return 0.5*(_faef ._fcgb .Y +_faef ._bcc .Y )};type textLine struct{_dg .PdfRectangle ;_edeb float64 ;_bac []*textWord ;_dbcg float64 ;};type stateStack []*textState ;func (_beff *textObject )renderText (_fcaa []byte )error {if _beff ._gabd {_gb .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");return nil ;};_bdgb :=_beff .getCurrentFont ();_fgd :=_bdgb .BytesToCharcodes (_fcaa );_edc ,_bgc ,_aeae :=_bdgb .CharcodesToStrings (_fgd );if _aeae > 0{_gb .Log .Debug 
("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_bgc ,_aeae );};_beff ._cfeg ._fbga +=_bgc ;_beff ._cfeg ._bdgf +=_aeae ;_bbed :=_beff ._c
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `s`.
|
|
|
|
|
// String returns the members of the set, sorted ascending, formatted with "%+v".
func (_geced intSet) String() string {
	var members []int
	for v := range _geced {
		if !_geced.has(v) {
			continue
		}
		members = append(members, v)
	}
	_f.Ints(members)
	return _ef.Sprintf("\u0025\u002b\u0076", members)
}
|
|
|
|
|
|
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
// Append appends `mark` to the mark array.
func (_eadc *TextMarkArray) Append(mark TextMark) {
	_eadc._ccbg = append(_eadc._ccbg, mark)
}

// closePath closes the most recent subpath. If the `_egbb` flag is set, a new
// single-point subpath is first started at `_bbede` and the flag is cleared.
// If the flag is clear and there are no subpaths, nothing is closed.
func (_eee *shapesState) closePath() {
	switch {
	case _eee._egbb:
		_eee._dacd = append(_eee._dacd, _dabf(_eee._bbede))
		_eee._egbb = false
	case len(_eee._dacd) == 0:
		if _geeca {
			_gb.Log.Debug("\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068")
		}
		_eee._egbb = false
		return
	}
	last := len(_eee._dacd) - 1
	_eee._dacd[last].close()
	if _geeca {
		_gb.Log.Info("\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073", _eee)
	}
}

// isAtom returns the 2x2 table made (via `_agfc`) of the receiver, its `_cbb`
// link, its `_caff` link and the paragraph reached through `_cbb._caff`.
// It returns nil when any of those three linked paragraphs is nil or has
// `_fdde` set, or when `_cbb._caff` differs from `_caff._cbb`.
func (_gadfe *textPara) isAtom() *textTable {
	viaCbb, viaCaff := _gadfe._cbb, _gadfe._caff
	if viaCbb == nil || viaCbb._fdde || viaCaff == nil || viaCaff._fdde {
		return nil
	}
	diagonal := viaCbb._caff
	if diagonal == nil || diagonal._fdde || diagonal != viaCaff._cbb {
		return nil
	}
	return _agfc(_gadfe, viaCbb, viaCaff, diagonal)
}

// bbox returns the line's embedded PdfRectangle.
func (_edfa *textLine) bbox() _dg.PdfRectangle {
	return _edfa.PdfRectangle
}
|
2020-09-28 23:18:17 +00:00
|
|
|
|
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
2020-10-05 19:28:24 +00:00
|
|
|
|
type TextMarkArray struct{_ccbg []TextMark };func (_aaba lineRuling )xDelta ()float64 {return _gc .Abs (_aaba ._bcc .X -_aaba ._bcc .X )};func _dcgf (_gbfb []*textMark ,_bga _dg .PdfRectangle ,_agad []rulingList )paraList {_gb .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_gbfb ),_bga );if len (_gbfb )==0{return nil ;};_cgdc :=_edad (_gbfb ,_bga );if len (_cgdc )==0{return nil ;};_ebcb ,_gbfaf :=_efaf (_agad );_egfgf :=_ddba (_cgdc ,_bga .Ury ,_ebcb ,_gbfaf );_aefe :=_gcae (_egfgf ,_bga .Ury ,_ebcb ,_gbfaf );_aefe =_dcbc (_aefe );_gfdd :=make (paraList ,0,len (_aefe ));for _ ,_cedc :=range _aefe {_dgeg :=_cedc .arrangeText ();if _dgeg !=nil {_gfdd =append (_gfdd ,_dgeg );};};if len (_gfdd )>=_feca {_gfdd =_gfdd .extractTables (_agad );};_gfdd .sortReadingOrder ();_gfdd .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _gfdd ;};func (_gdcba *shapesState )fill (_bdb *[]*subpath ){*_bdb =append (*_bdb ,_gdcba ._dacd ...);if _fgda {_gb .Log .Info ("\u0046\u0049L\u004c\u003a\u0020\u0025\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006e\u0065\u0077\u0029\u0020\u0073s=\u0025\u0073",len (*_bdb ),len (_gdcba ._dacd ),_gdcba );for _bdab ,_gecb :=range _gdcba ._dacd {_ef .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bdab ,_gecb );if _bdab ==10{break ;};};};};func (_ecffe *textWord )addDiacritic (_facee string ){_dgff :=_ecffe ._degbg [len (_ecffe ._degbg )-1];_dgff ._daca +=_facee ;_dgff ._daca =_de .NFKC .String (_dgff ._daca );};func _gbca (_feef ,_cacca int )int {if _feef < _cacca {return _feef ;};return _cacca ;};func _gdcf (_decfe ,_ecff bounded )float64 {_aacb :=_edef (_decfe ,_ecff );if !_afda (_aacb ){return _aacb ;};return _fddga (_decfe 
,_ecff );};func (_fff *imageExtractContext )processOperand (_cb *_ba .ContentStreamOperation ,_gd _ba .GraphicsState ,_ga *_dg .PdfPageResources )error {if _cb .Operand =="\u0042\u0049"&&len (_cb .Params )==1{_add ,_fc :=_cb .Params [0].(*_ba .ContentStreamInlineImage );if !_fc {return nil ;};if _cfc ,_fd :=_eff .GetBoolVal (_add .ImageMask );_fd {if _cfc &&!_fff ._ffe .IncludeInlineStencilMasks {return nil ;};};return _fff .extractInlineImage (_add ,_gd ,_ga );}else if _cb .Operand =="\u0044\u006f"&&len (_cb .Params )==1{_addf ,_abd :=_eff .GetName (_cb .Params [0]);if !_abd {_gb .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _bc ;};_ ,_bee :=_ga .GetXObjectByName (*_addf );switch _bee {case _dg .XObjectTypeImage :return _fff .extractXObjectImage (_addf ,_gd ,_ga );case _dg .XObjectTypeForm :return _fff .extractFormImages (_addf ,_gd ,_ga );};};return nil ;};
|
2020-09-28 23:18:17 +00:00
|
|
|
|
|
2020-10-05 19:28:24 +00:00
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
|
|
|
|
// Replace with a function like Extract() (*PageText, error)
|
|
|
|
|
// ExtractPageText returns the text contents of the extractor's page as a
// PageText, together with the two integer statistics produced by
// extractPageText. The contents and resources stored on the extractor are
// processed starting from the identity matrix at level 0. On any error the
// PageText is nil and both statistics are 0.
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
	pageText, statA, statB, err := e.extractPageText(e._eb, e._gg, _fe.IdentityMatrix(), 0)
	if err != nil {
		return nil, 0, 0, err
	}

	pageText.computeViews()
	if err = _fabfd(pageText); err != nil {
		return nil, 0, 0, err
	}
	return pageText, statA, statB, nil
}

// _fbadb reports the result of `_cbaf` applied to the `_ageda` fields of the
// two paragraphs.
func _fbadb(_fdbd, _ddcd *textPara) bool {
	first, second := _fdbd._ageda, _ddcd._ageda
	return _cbaf(first, second)
}

// moveTo records (x, y), converted with devicePoint, as the pending point of
// the state and marks a point as pending.
func (ss *shapesState) moveTo(_gfcg, _bad float64) {
	ss._egbb = true
	ss._bbede = ss.devicePoint(_gfcg, _bad)
	if !_geeca {
		return
	}
	_gb.Log.Info("\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066", _gfcg, _bad, ss._bbede)
}

// clear resets the subpath to its zero value.
func (sp *subpath) clear() {
	var zero subpath
	*sp = zero
}
|
|
|
|
|
|
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
|
|
|
|
|
// Each time ApplyArea is called, it updates the result set in `pt`.
|
|
|
|
|
// Can be called multiple times in a row with different bounding boxes.
|
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
// It keeps the marks of `_abcb` for which `_abbf(mark.bbox(), bbox)` is true,
// then, for each of the angles 0, 90, 180 and 270, builds paragraphs (via
// `_dcgf`, with no rulings) from the kept marks whose `_eggf` equals that
// angle. `_ffaa` counts the kept marks not yet assigned to an angle so the
// loop can stop early. Finally the text, text marks and tables stored on the
// PageText (`_fffd`, `_acbg`, `_dabe`) are replaced with the new results.
func (_cgee *PageText )ApplyArea (bbox _dg .PdfRectangle ){_dgda :=make ([]*textMark ,0,len (_cgee ._abcb ));for _ ,_ddfa :=range _cgee ._abcb {if _abbf (_ddfa .bbox (),bbox ){_dgda =append (_dgda ,_ddfa );};};var _dgdd paraList ;_ffaa :=len (_dgda );for _bfdg :=0;_bfdg < 360&&_ffaa > 0;_bfdg +=90{_ggee :=make ([]*textMark ,0,len (_dgda )-_ffaa );for _ ,_cce :=range _dgda {if _cce ._eggf ==_bfdg {_ggee =append (_ggee ,_cce );};};if len (_ggee )> 0{_cdab :=_dcgf (_ggee ,_cgee ._dfgg ,nil );_dgdd =append (_dgdd ,_cdab ...);_ffaa -=len (_ggee );};};_abba :=new (_fa .Buffer );_dgdd .writeText (_abba );_cgee ._fffd =_abba .String ();_cgee ._acbg =_dgdd .toTextMarks ();_cgee ._dabe =_dgdd .tables ();};
// fontEntry pairs a PDF font with an int64 value.
// NOTE(review): the int64 is presumably an access counter/timestamp for the
// extractor's font cache - confirm against its users.
type fontEntry struct{_fffc *_dg .PdfFont ;_cadg int64 ;};
// minDepth returns `_bcd - (Ury - _bgd)` for the bag.
func (_gbc *wordBag )minDepth ()float64 {return _gbc ._bcd -(_gbc .Ury -_gbc ._bgd )};
// removeWord removes `_bbge` (via `_dcgb`) from the bucket stored under key
// `_bead` in `_dbf`. The bucket is deleted from the map once it is empty.
func (_dae *wordBag )removeWord (_bbge *textWord ,_bead int ){_fced :=_dae ._dbf [_bead ];_fced =_dcgb (_fced ,_bbge );if len (_fced )==0{delete (_dae ._dbf ,_bead );}else {_dae ._dbf [_bead ]=_fced ;};};
// endsInHyphen reports whether the line ends in a hyphen that satisfies
// `_bbff`. The last rune of the last word's text must be in unicode.Hyphen;
// if the word has `_gcgcb` set and `_bbff` accepts the word text the answer is
// true, otherwise `_bbff` is applied to the text of the whole line.
// The line must contain at least one word; indexing `_bac` panics otherwise.
func (_fgg *textLine )endsInHyphen ()bool {_ccde :=_fgg ._bac [len (_fgg ._bac )-1];_dedd :=_ccde ._efdce ;_bdce ,_ddfab :=_e .DecodeLastRuneInString (_dedd );if _ddfab <=0||!_gf .Is (_gf .Hyphen ,_bdce ){return false ;};if _ccde ._gcgcb &&_bbff (_dedd ){return true ;};return _bbff (_fgg .text ());};
// intersections returns, for each ruling index, the set of indices of the
// rulings it intersects. Rulings are first split by kind (`_dbga` vs `_daea`)
// and only pairs made of one ruling of each kind are tested. It returns nil
// when either kind has fewer than `_cbag`+1 / `_ccfdg`+1 rulings, or when the
// two kinds together exceed `_cadac` rulings. Indices without any intersection
// have no entry in the returned map.
func (_fdgad rulingList )intersections ()map[int ]intSet {var _gcgec ,_dgdfg []int ;for _efbff ,_bgdg :=range _fdgad {switch _bgdg ._eddf {case _dbga :_gcgec =append (_gcgec ,_efbff );case _daea :_dgdfg =append (_dgdfg ,_efbff );};};if len (_gcgec )< _cbag +1||len (_dgdfg )< _ccfdg +1{return nil ;};if len (_gcgec )+len (_dgdfg )> _cadac {_gb .Log .Debug ("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064",len (_fdgad 
),len (_gcgec ),len (_dgdfg ));return nil ;};_aafa :=make (map[int ]intSet ,len (_gcgec )+len (_dgdfg ));for _ ,_bgac :=range _gcgec {for _ ,_bded :=range _dgdfg {if _fdgad [_bgac ].intersects (_fdgad [_bded ]){if _ ,_bfde :=_aafa [_bgac ];!_bfde {_aafa [_bgac ]=make (intSet );};if _ ,_aaeg :=_aafa [_bded ];!_aaeg {_aafa [_bded ]=make (intSet );};_aafa [_bgac ].add (_bded );_aafa [_bded ].add (_bgac );};};};return _aafa ;};
// showTextAdjusted processes the elements of the array `_baa` in order.
// Numbers are converted to a translation of -value*0.001*`_cfeg._eecf` along
// X (the X/Y swap is guarded by `_afee`, which is always false here) and
// concatenated onto `_fac`. Strings are passed to renderText; the error
// returned by renderText is not checked. A number that cannot be read returns
// the conversion error, and a bad string or any other element type returns
// ErrTypeError.
func (_adc *textObject )showTextAdjusted (_baa *_eff .PdfObjectArray )error {_afee :=false ;for _ ,_dce :=range _baa .Elements (){switch _dce .(type ){case *_eff .PdfObjectFloat ,*_eff .PdfObjectInteger :_fbce ,_fga :=_eff .GetNumberAsFloat (_dce );if _fga !=nil {_gb .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_dce ,_baa );return _fga ;};_cgf ,_dcf :=-_fbce *0.001*_adc ._cfeg ._eecf ,0.0;if _afee {_dcf ,_cgf =_cgf ,_dcf ;};_def :=_ecb (_fe .Point {X :_cgf ,Y :_dcf });_adc ._fac .Concat (_def );case *_eff .PdfObjectString :_ebf ,_ceec :=_eff .GetStringBytes (_dce );if !_ceec {_gb .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_dce ,_baa );return _eff .ErrTypeError ;};_adc .renderText (_ebf );default:_gb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_dce ,_baa );return _eff 
.ErrTypeError ;};};return nil ;};
// NOTE(review): the definition below is cut off in this view of the file and
// is left untouched.
func (_ecdda paraList )findTableGrid (_ccgg rulingList )(*textTable ,map[
|
|
|
|
|
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
// The extractor starts with empty `_dc` and `_fb` maps. The returned error is
// always nil.
func NewFromContents(contents string, resources *_dg.PdfPageResources) (*Extractor, error) {
	_ade := &Extractor{
		_eb: contents,
		_gg: resources,
		_dc: map[string]fontEntry{},
		_fb: map[string]textResult{},
	}
	return _ade, nil
}

var (
	// _bc is the "type check error" sentinel.
	_bc = _g.New("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072")
	// _df is the "range check error" sentinel.
	_df = _g.New("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072")
)

// _feec removes the last rune from the last mark of `_eagb` and updates the
// running text offset `*_cdbd` to match. It returns the (possibly shortened)
// slice.
//
//   - If the last mark holds exactly one rune the whole mark is dropped and
//     the offset is moved to the end of the new last mark.
//   - Otherwise the mark's text is trimmed with `_cfee` and the offset is
//     adjusted by the change in byte length.
//
// Fixes over the previous version:
//   - The trimmed text is now written back to the slice element. Previously
//     it was assigned to a local copy of the TextMark and silently lost, so the
//     offset shrank while the mark kept its last rune.
//   - An empty slice is returned unchanged instead of panicking.
//   - Dropping the only mark no longer indexes the emptied slice; the offset is
//     reduced by the byte length of the removed text, mirroring the adjustment
//     made in the trimming branch.
func _feec(_eagb []TextMark, _cdbd *int) []TextMark {
	if len(_eagb) == 0 {
		return _eagb
	}

	_last := len(_eagb) - 1
	_fbag := _eagb[_last]

	if _e.RuneCountInString(_fbag.Text) == 1 {
		_eagb = _eagb[:_last]
		if len(_eagb) == 0 {
			*_cdbd -= len(_fbag.Text)
			return _eagb
		}
		_fdaa := _eagb[len(_eagb)-1]
		*_cdbd = _fdaa.Offset + len(_fdaa.Text)
		return _eagb
	}

	_cbed := _cfee(_fbag.Text)
	*_cdbd += len(_cbed) - len(_fbag.Text)
	// Assign through the index: `_fbag` is only a copy of the element.
	_eagb[_last].Text = _cbed
	return _eagb
}

// equals reports whether the two rulings have the same kind (`_eddf`) and
// whether `_ffac` holds for each of the value pairs `_dbdb`, `_ecfg` and
// `_gbbb`.
func (_babaa *ruling) equals(_baae *ruling) bool {
	return _babaa._eddf == _baae._eddf &&
		_ffac(_babaa._dbdb, _baae._dbdb) &&
		_ffac(_babaa._ecfg, _baae._ecfg) &&
		_ffac(_babaa._gbbb, _baae._gbbb)
}
|