2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
// This is a commercial product and requires a license to operate.
|
|
|
|
|
// A trial license can be obtained at https://unidoc.io
|
|
|
|
|
//
|
|
|
|
|
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
|
|
|
|
//
|
|
|
|
|
// Use of this source code is governed by the UniDoc End User License Agreement
|
|
|
|
|
// terms that can be accessed at https://unidoc.io/eula/
|
2018-03-22 14:03:47 +00:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
|
|
|
|
// Currently offers functionality for extracting textual content.
|
|
|
|
|
//
|
2020-09-28 23:18:17 +00:00
|
|
|
|
package extractor ;import (_dfc "bytes";_d "errors";_aae "fmt";_ca "github.com/unidoc/unipdf/v3/common";_cb "github.com/unidoc/unipdf/v3/common/license";_be "github.com/unidoc/unipdf/v3/contentstream";_f "github.com/unidoc/unipdf/v3/core";_ba "github.com/unidoc/unipdf/v3/internal/textencoding";_cd "github.com/unidoc/unipdf/v3/internal/transform";_ag "github.com/unidoc/unipdf/v3/model";_b "golang.org/x/text/unicode/norm";_de "golang.org/x/xerrors";_aa "image/color";_df "io";_ed "math";_ec "regexp";_db "sort";_ecb "strings";_c "unicode";_a "unicode/utf8";);
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// String returns a human readable description of `ss`.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a human readable description of `ss`: the number of
// sub-paths it currently holds and the value of its `_ccd` flag.
func (_geac *shapesState) String() string {
	subpathCount := len(_geac._eff)
	return _aae.Sprintf("%\u0064 \u0073\u0075\u0062\u0070\u0061\u0074\u0068\u0073 \u0066\u0072\u0065\u0073h=\u0025\u0074", subpathCount, _geac._ccd)
}

// showText hands the string operand `_eed` to renderText and returns its
// error unchanged.
func (_fddb *textObject) showText(_eed []byte) error {
	return _fddb.renderText(_eed)
}

// getFontDirect looks up the font object named `_bff` via getFontDict and
// builds a PdfFont from it.
//
// A lookup failure returns (nil, err). A construction failure is logged at
// debug level and then returned together with whatever value
// NewPdfFontFromPdfObject produced.
func (_gcf *textObject) getFontDirect(_bff string) (*_ag.PdfFont, error) {
	fontObj, err := _gcf.getFontDict(_bff)
	if err != nil {
		return nil, err
	}

	font, err := _ag.NewPdfFontFromPdfObject(fontObj)
	if err != nil {
		_ca.Log.Debug("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076", _bff, err)
	}
	return font, err
}

// newSubPath clears the current path and, when the `_feec` debug flag is
// set, logs the resulting state.
func (_bgcf *shapesState) newSubPath() {
	_bgcf.clearPath()
	if !_feec {
		return
	}
	_ca.Log.Info("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073", _bgcf)
}

// _gffed builds a textWord from the marks in `_bbe`.
//
// The word's rectangle is the `_eegb` combination of all mark rectangles,
// `_bbgd` is the largest `_cbbg` of any mark, and `_fgfab` is measured from
// the top of `_egeg` down to the bottom of the word's rectangle.
// `_bbe` must contain at least one mark; an empty slice panics.
func _gffed(_bbe []*textMark, _egeg _ag.PdfRectangle) *textWord {
	first := _bbe[0]
	bbox := first.PdfRectangle
	maxSize := first._cbbg

	for _, mark := range _bbe[1:] {
		bbox = _eegb(bbox, mark.PdfRectangle)
		if mark._cbbg > maxSize {
			maxSize = mark._cbbg
		}
	}

	return &textWord{
		PdfRectangle: bbox,
		_eeda:        _bbe,
		_fgfab:       _egeg.Ury - bbox.Lly,
		_bbgd:        maxSize,
	}
}

// Render mode bit flags. The values are successive powers of two starting at
// 1 (RenderModeStroke=1, RenderModeFill=2, RenderModeClip=4).
const (
	RenderModeStroke RenderMode = 1 << iota
	RenderModeFill
	RenderModeClip
)

// textObject holds the state used while processing text operators.
type textObject struct {
	_ccf  *Extractor            // owning extractor
	_ecgg *_ag.PdfPageResources // resources used to resolve font names (see getFontDict)
	_deef _be.GraphicsState     // graphics state; supplies the non-stroking colour in getFillColor
	_gdb  *textState            // current text state
	_egd  *stateStack           // stack of saved text states
	_gdg  _cd.Matrix
	_beab _cd.Matrix
	_bad  []*textMark
	_feab bool
}

// cubicTo records a cubic curve segment. Only the end point (`_fada`,
// `_cdbb`) is added to the path; the four control-point coordinates are
// ignored.
func (_acdf *shapesState) cubicTo(_abdd, _aca, _faff, _cbcb, _fada, _cdbb float64) {
	_acdf.addPoint(_fada, _cdbb)
}

// empty reports whether the stack holds no states.
func (_dce *stateStack) empty() bool {
	depth := len(*_dce)
	return depth == 0
}

// nextLine moves the line position by (0, -`_egf`) using the current text
// state's `_egf` value.
func (_dfac *textObject) nextLine() {
	dy := -_dfac._gdb._egf
	_dfac.moveLP(0, dy)
}

const _ggf = 20

// setWordSpacing stores `_fage` in the text state's `_fdb` field. It is a
// no-op on a nil receiver.
func (_dfgb *textObject) setWordSpacing(_fage float64) {
	if _dfgb != nil {
		_dfgb._gdb._fdb = _fage
	}
}

// _cadb creates a 2x2 textTable and places the four paragraphs at cells
// (0,0), (1,0), (0,1) and (1,1) in that order.
func _cadb(_efbdg, _bgba, _becd, _cgbcd *textPara) *textTable {
	table := &textTable{
		_acgb: 2,
		_dbff: 2,
		_eagd: make(map[uint64]*textPara, 4),
	}
	table.put(0, 0, _efbdg)
	table.put(1, 0, _bgba)
	table.put(0, 1, _becd)
	table.put(1, 1, _cgbcd)
	return table
}

// depthRange returns, in ascending order, the keys of `_aabed` that lie in
// the closed interval [`_aacga`, `_cebga`]. It returns nil when no key falls
// in that interval.
func (_dgab *wordBag) depthRange(_aacga, _cebga int) []int {
	var keys []int
	for depthIdx := range _dgab._aabed {
		if depthIdx < _aacga || depthIdx > _cebga {
			continue
		}
		keys = append(keys, depthIdx)
	}
	if len(keys) == 0 {
		return nil
	}
	_db.Ints(keys)
	return keys
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
type PageText struct{_gef []*textMark ;_bfbe string ;_ebd []TextMark ;_feaf []TextTable ;_bdc _ag .PdfRectangle ;_eag []*subpath ;_aacc []*subpath ;};func (_gafcd *wordBag )allWords ()[]*textWord {var _bdda []*textWord ;for _ ,_fbae :=range _gafcd ._aabed {_bdda =append (_bdda ,_fbae ...);};return _bdda ;};type fontEntry struct{_cdg *_ag .PdfFont ;_ccg int64 ;};func (_baaff *textPara )toCellTextMarks (_edca *int )[]TextMark {var _bcace []TextMark ;for _dbdc ,_abcc :=range _baaff ._adga {_ffec :=_abcc .toTextMarks (_edca );_gcfg :=_edfb &&_abcc .endsInHyphen ()&&_dbdc !=len (_baaff ._adga )-1;if _gcfg {_ffec =_dgbbc (_ffec ,_edca );};_bcace =append (_bcace ,_ffec ...);if !(_gcfg ||_dbdc ==len (_baaff ._adga )-1){_bcace =_eegf (_bcace ,_edca ,_dbgf (_abcc ._eagf ,_baaff ._adga [_dbdc +1]._eagf ));};};return _bcace ;};func (_bcaeb *textPara )text ()string {_gaef :=new (_dfc .Buffer );_bcaeb .writeText (_gaef );return _gaef .String ();};type rectRuling struct{_cdea rulingKind ;_ag .PdfRectangle ;};func (_adg *textObject )setTextRise (_ecaf float64 ){if _adg ==nil {return ;};_adg ._gdb ._eaff =_ecaf ;};func (_dcab *textObject )checkOp (_geec *_be .ContentStreamOperation ,_bed int ,_cdeb bool )(_aad bool ,_dfe error ){if _dcab ==nil {var _dgbf []_f .PdfObject ;if _bed > 0{_dgbf =_geec .Params ;if len (_dgbf )> _bed {_dgbf =_dgbf [:_bed ];};};_ca .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_geec .Operand ,_dgbf );};if _bed >=0{if len (_geec .Params )!=_bed {if _cdeb {_dfe =_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_ca .Log .Debug 
("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_geec .Operand ,_bed ,len (_geec .Params ),_geec .Params );return false ,_dfe ;};};return true ,nil ;};func (_cgad *wordBag )absorb (_cgab *wordBag ){_begc :=_cgab .makeRemovals ();for _ddbd ,_dbcgb :=range _cgab ._aabed {for _ ,_bccd :=range _dbcgb {_cgad .pullWord (_bccd ,_ddbd ,_begc );};};_cgab .applyRemovals (_begc );};func (_fabb *wordBag )firstWord (_fgb int )*textWord {return _fabb ._aabed [_fgb ][0]};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `tm`.
|
|
|
|
|
func (_bggd *textMark )String ()string {return _aae .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_bggd .PdfRectangle ,_bggd ._cbbg ,_bggd ._ddbg );};type textResult struct{_gea PageText ;_cgf int ;_beag int ;};func (_bdge paraList )findTables (_gebf []rulingList )[]*textTable {_bdge .addNeighbours ();_db .Slice (_bdge ,func (_facc ,_degfe int )bool {return _defg (_bdge [_facc ],_bdge [_degfe ])< 0});var _gbbf []*textTable ;if _dff {_bdec :=_bdge .findGridTables (_gebf );_gbbf =append (_gbbf ,_bdec ...);};if _cbbd {_dcac :=_bdge .findTextTables ();_gbbf =append (_gbbf ,_dcac ...);};return _gbbf ;};func (_aadcd paraList )topoOrder ()[]int {if _bffgd {_ca .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_baafa :=len (_aadcd );_dgff :=make ([]bool ,_baafa );_daafg :=make ([]int ,0,_baafa );_gcbgd :=_aadcd .llyOrdering ();var _efegf func (_addg int );_efegf =func (_dgcf int ){_dgff [_dgcf ]=true ;for _cfdad :=0;_cfdad < _baafa ;_cfdad ++{if !_dgff [_cfdad ]{if _aadcd .readBefore (_gcbgd ,_dgcf ,_cfdad ){_efegf (_cfdad );};};};_daafg =append (_daafg ,_dgcf );};for _cedcb :=0;_cedcb < _baafa ;_cedcb ++{if !_dgff [_cedcb ]{_efegf (_cedcb );};};return _bdac (_daafg );};func (_ggfad *PageText )computeViews (){_aadc :=_aged (_ggfad ._eag );_ggcb :=_ccfe (_ggfad ._aacc );var _gff []rulingList ;if _egbb {_gff =append (_gff ,_aadc ...);};if _gdgdd {_gff =append (_gff ,_ggcb ...);};if _cfg {if len (_aadc )> 0{_ca .Log .Info ("S\u0074\u0072\u006f\u006b\u0065\u0073\u003a\u0020\u0025\u0064",len (_ggfad ._eag ));_ca .Log .Info ("\u0053\u0074r\u006f\u006b\u0065 \u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0064",len (_aadc ));for _affg ,_gbcc :=range _aadc {_aae .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_affg ,len (_gbcc ));for _fedb ,_dgf :=range _gbcc {_aae .Printf 
("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_fedb ,_dgf );};};};if len (_ggcb )> 0{_ca .Log .Info ("\u0046i\u006c\u006c\u0073\u003a\u0020\u0025d",len (_ggfad ._aacc ));_ca .Log .Info ("\u0046\u0069\u006c\u006c\u0020\u0047\u0072\u0069\u0064s\u003a\u0020\u0025\u0064",len (_ggcb ));for _deae ,_gdff :=range _ggcb {_aae .Printf ("\u0025\u0034d\u003a\u0020\u0025d\u0020\u0072\u0075\u006c\u0069\u006e\u0067\u0073\u000a",_deae ,len (_gdff ));for _bage ,_ageg :=range _gdff {_aae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_bage ,_ageg );};};};};var _bbcg paraList ;_eegg :=len (_ggfad ._gef );for _bcg :=0;_bcg < 360&&_eegg > 0;_bcg +=90{_egg :=make ([]*textMark ,0,len (_ggfad ._gef )-_eegg );for _ ,_bgg :=range _ggfad ._gef {if _bgg ._bgga ==_bcg {_egg =append (_egg ,_bgg );};};if len (_egg )> 0{_gbec :=_edfg (_egg ,_ggfad ._bdc ,_gff );_bbcg =append (_bbcg ,_gbec ...);_eegg -=len (_egg );};};_gdc :=new (_dfc .Buffer );_bbcg .writeText (_gdc );_ggfad ._bfbe =_gdc .String ();_ggfad ._ebd =_bbcg .toTextMarks ();_ggfad ._feaf =_bbcg .tables ();};var _beagg =TextMark {Text :"\u005b\u0058\u005d",Original :"\u0020",Meta :true ,FillColor :_aa .White ,StrokeColor :_aa .White };func (_afea *textTable )getDown ()paraList {_aaee :=make (paraList ,_afea ._acgb );for _eaed :=0;_eaed < _afea ._acgb ;_eaed ++{_ecdc :=_afea .get (_eaed ,_afea ._dbff -1)._gbbgf ;if _ecdc ==nil ||_ecdc ._ddadc {return nil ;};_aaee [_eaed ]=_ecdc ;};for _bbcf :=0;_bbcf < _afea ._acgb -1;_bbcf ++{if _aaee [_bbcf ]._afga !=_aaee [_bbcf +1]{return nil ;};};return _aaee ;};func _gccc (_fgeb _ag .PdfRectangle ,_gdce []*textLine )*textPara {return &textPara {PdfRectangle :_fgeb ,_adga :_gdce };};func (_abde *stateStack )push (_ceag *textState ){_dbaf :=*_ceag ;*_abde =append (*_abde ,&_dbaf )};func _ecfbe (_dbdd _ag .PdfColorspace ,_bfcef _ag .PdfColor )_aa .Color {if _dbdd ==nil ||_bfcef ==nil {return _aa .Black ;};_bbgf ,_feaa :=_dbdd .ColorToRGB (_bfcef );if _feaa !=nil {_ca .Log .Debug 
("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Text returns the extracted page text.
|
|
|
|
|
func (_agf PageText )Text ()string {return _agf ._bfbe };func (_aea paraList )computeEBBoxes (){if _dbb {_ca .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_faec :=range _aea {_faec ._bbba =_faec .PdfRectangle ;};_effc :=_aea .yNeighbours (0);for _daeb ,_gfbg :=range _aea {_dcdaa :=_gfbg ._bbba ;_cdca ,_gcbea :=-1.0e9,+1.0e9;for _ ,_bebb :=range _effc [_gfbg ]{_defa :=_aea [_bebb ]._bbba ;if _defa .Urx < _dcdaa .Llx {_cdca =_ed .Max (_cdca ,_defa .Urx );}else if _dcdaa .Urx < _defa .Llx {_gcbea =_ed .Min (_gcbea ,_defa .Llx );};};for _bffgf ,_cegb :=range _aea {_bfag :=_cegb ._bbba ;if _daeb ==_bffgf ||_bfag .Ury > _dcdaa .Lly {continue ;};if _cdca <=_bfag .Llx &&_bfag .Llx < _dcdaa .Llx {_dcdaa .Llx =_bfag .Llx ;}else if _bfag .Urx <=_gcbea &&_dcdaa .Urx < _bfag .Urx {_dcdaa .Urx =_bfag .Urx ;};};if _dbb {_aae .Printf ("\u0025\u0034\u0064\u003a %\u0036\u002e\u0032\u0066\u2192\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0071\u000a",_daeb ,_gfbg ._bbba ,_dcdaa ,_aecb (_gfbg .text (),50));};_gfbg ._bbba =_dcdaa ;};if _dbce {for _ ,_baff :=range _aea {_baff .PdfRectangle =_baff ._bbba ;};};};func _feca (_baeg float64 )bool {return _ed .Abs (_baeg )< _dfgg };func (_dddb *wordBag )sort (){for _ ,_gfgfc :=range _dddb ._aabed {_db .Slice (_gfgfc ,func (_gebe ,_eebc int )bool {return _ccc (_gfgfc [_gebe ],_gfgfc [_eebc ])< 0});};};func (_decg *shapesState )stroke (_dddg *[]*subpath ){*_dddg =append (*_dddg ,_decg ._eff ...);if _cfg {_ca .Log .Info ("\u0053T\u0052\u004f\u004b\u0045\u003a\u0020\u0025\u0064\u0020\u0073\u0074r\u006f\u006b\u0065\u0073\u0020\u0073\u0073\u003d\u0025\u0073",len (*_dddg ),_decg );for _aacb ,_cba :=range _decg ._eff {_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_aacb ,_cba );if _aacb ==10{break ;};};};};func (_gaab *subpath )last ()_cd .Point {return _gaab ._aacge [len (_gaab ._aacge )-1]};func _cdcg (_agcb ,_aaeb _ag .PdfRectangle )bool {return _agcb .Llx <=_aaeb .Llx 
&&_aaeb .Urx <=_agcb .Urx &&_agcb .Lly <=_aaeb .Lly &&_aaeb .Ury <=_agcb .Ury ;};func _ccgf (_efbc ,_fece _ag .PdfRectangle )bool {return _daad (_efbc ,_fece )&&_cbbc (_efbc ,_fece )};func (_dbea *textObject )getFillColor ()_aa .Color {return _ecfbe (_dbea ._deef .ColorspaceNonStroking ,_dbea ._deef .ColorNonStroking );};func (_dgbd *textLine )bbox ()_ag .PdfRectangle {return _dgbd .PdfRectangle };func (_acdbg *subpath )isQuadrilateral ()bool {if len (_acdbg ._aacge )< 4||len (_acdbg ._aacge )> 5{return false ;};if len (_acdbg ._aacge )==5{_gffa :=_acdbg ._aacge [0];_efbf :=_acdbg ._aacge [4];if _gffa .X !=_efbf .X ||_gffa .Y !=_efbf .Y {return false ;};};return true ;};func (_cbed *subpath )add (_cdge ..._cd .Point ){_cbed ._aacge =append (_cbed ._aacge ,_cdge ...)};func (_fgad *textPara )depth ()float64 {if len (_fgad ._adga )> 0{return _fgad ._adga [0]._eagf ;};return _fgad ._cgaa .get (0,0).depth ();};type bounded interface{bbox ()_ag .PdfRectangle };func (_ggdd rulingList )tidied (_badc string )rulingList {_edfgg :=_ggdd .removeDuplicates ();_gefe :=_edfgg .coalesce ();if _gefe ==nil {return nil ;};_gefe .sort ();if _cfg {_ca .Log .Info ("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064",_badc ,len (_ggdd ),len (_edfgg ),len (_gefe ));for _gbdb ,_bgec :=range _gefe {_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gbdb ,_bgec );};};return _gefe ;};type imageExtractContext struct{_eb []ImageMark ;_bafe int ;_fed int ;_bg int ;_bb map[*_f .PdfObjectStream ]*cachedImage ;_cdb *ImageExtractOptions ;};func _ebbg (_gaba ,_aaefa _cd .Point )rulingKind {_cfga :=_ed .Abs (_gaba .X -_aaefa .X );_cfea :=_ed .Abs (_gaba .Y -_aaefa .Y );return _bbced (_cfga ,_cfea );};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Elements returns the TextMarks in `ma`. The returned slice is the
// array's own backing slice, not a copy.
func (_ecdd *TextMarkArray) Elements() []TextMark {
	return _ecdd._ggd
}

// _bfacc packs two ints into one uint64 key: `_aead` is multiplied by
// 0x1000000 and `_befa` is added to it.
func _bfacc(_aead, _befa int) uint64 {
	high := uint64(_aead) * 0x1000000
	return high + uint64(_befa)
}

// drawRectangle starts a new sub-path and traces the rectangle with corner
// (`_afe`, `_fdf`), width `_daa` and height `_defd`, visiting the corners
// (x,y), (x+w,y), (x+w,y+h), (x,y+h) before closing the path. With the
// `_feec` debug flag set the device-space rectangle is logged first.
func (_fge *shapesState) drawRectangle(_afe, _fdf, _daa, _defd float64) {
	xFar := _afe + _daa
	yFar := _fdf + _defd

	if _feec {
		ll := _fge.devicePoint(_afe, _fdf)
		ur := _fge.devicePoint(xFar, yFar)
		rect := _ag.PdfRectangle{Llx: ll.X, Lly: ll.Y, Urx: ur.X, Ury: ur.Y}
		_ca.Log.Info("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066", rect)
	}

	_fge.newSubPath()
	_fge.moveTo(_afe, _fdf)
	_fge.lineTo(xFar, _fdf)
	_fge.lineTo(xFar, yFar)
	_fge.lineTo(_afe, yFar)
	_fge.closePath()
}

// text returns the `_gbcg` strings of all words in the bag joined by single
// spaces, in the order produced by allWords.
func (_dadd *wordBag) text() string {
	words := _dadd.allWords()
	parts := make([]string, 0, len(words))
	for _, word := range words {
		parts = append(parts, word._gbcg)
	}
	return _ecb.Join(parts, "\u0020")
}

// Compile-time debug switches. Every one of them evaluates to false.
const (
	_dbb   = false
	_dface = false
	_dcff  = false
	_feec  = false
	_fgff  = false
	_egad  = false
	_bffgd = false
	_becfg = false
	_efdf  = _becfg && true
	_ggb   = _efdf && false
	_acdad = _becfg && true
	_bege  = false
	_edfc  = _bege || false
	_cfg   = false
)

// _dbgf picks the separator placed between two lines: a space when `_fafdc`
// accepts the difference of the two values, otherwise a newline.
func _dbgf(_dcae, _eaag float64) string {
	if _fafdc(_dcae - _eaag) {
		return "\u0020"
	}
	return "\u000a"
}
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `path`.
|
|
|
|
|
func (_abgg *subpath )String ()string {_eace :=_abgg ._aacge ;_gebb :=len (_eace );if _gebb <=5{return _aae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f",_gebb ,_eace );};return _aae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f",_gebb ,_eace [0],_eace [1],_eace [_gebb -1]);};type textTable struct{_ag .PdfRectangle ;_acgb ,_dbff int ;_gdcfd bool ;_eagd map[uint64 ]*textPara ;};type subpath struct{_aacge []_cd .Point ;_ffea bool ;};const (_edfb =true ;_fabd =true ;_gbg =true ;_dbce =false ;_aaga =false ;_cagg =6;_ggfb =3.0;_adace =200;_dff =true ;_cbbd =true ;_egbb =true ;_gdgdd =true ;);func (_fgec *wordBag )removeWord (_cfda *textWord ,_bdfef int ){_gegg :=_fgec ._aabed [_bdfef ];_gegg =_bcaf (_gegg ,_cfda );if len (_gegg )==0{delete (_fgec ._aabed ,_bdfef );}else {_fgec ._aabed [_bdfef ]=_gegg ;};};func (_bcab *shapesState )addPoint (_eacd ,_fgf float64 ){_addc :=_bcab .establishSubpath ();_gegbg :=_bcab .devicePoint (_eacd ,_fgf );if _addc ==nil {_bcab ._ccd =true ;_bcab ._gadf =_gegbg ;}else {_addc .add (_gegbg );};};func (_bdfefd paraList )log (_gafac string ){if !_bffgd {return ;};_ca .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_gafac ,len (_bdfefd ));for _gaac ,_ddeae :=range _bdfefd {if _ddeae ==nil {continue ;};_dccbg :=_ddeae .text ();_debf :="\u0020\u0020";if _ddeae ._cgaa !=nil {_debf =_aae .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_ddeae ._cgaa ._acgb ,_ddeae ._cgaa ._dbff );};_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_gaac ,_ddeae .PdfRectangle ,_debf ,_aecb (_dccbg ,50));};};func (_gaec intSet )del (_dgfe int ){delete (_gaec ,_dgfe )};func (_ddff *textPara )taken ()bool {return _ddff ==nil ||_ddff ._ddadc };func (_gae 
*subpath )removeDuplicates (){if len (_gae ._aacge )==0{return ;};_ccdb :=[]_cd .Point {_gae ._aacge [0]};for _ ,_fca :=range _gae ._aacge [1:]{if !_gdad (_fca ,_ccdb [len (_ccdb )-1]){_ccdb =append (_ccdb ,_fca );};};_gae ._aacge =_ccdb ;};func (_defed *subpath )makeRectRuling ()(*ruling ,bool ){if _cfg {_ca .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_defed );};_feaff :=_defed ._aacge [:4];_badg :=make (map[int ]rulingKind ,len (_feaff ));for _ebfd ,_eaeg :=range _feaff {_cabgg :=_defed ._aacge [(_ebfd +1)%4];_badg [_ebfd ]=_ebbg (_eaeg ,_cabgg );};if _cfg {_aae .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_badg );};var _ecgc ,_fbgcf []int ;for _egeca ,_fgffb :=range _badg {switch _fgffb {case _gbef :_fbgcf =append (_fbgcf ,_egeca );case _fdaf :_ecgc =append (_ecgc ,_egeca );};};if _cfg {_aae .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_fbgcf ),_fbgcf );_aae .Printf ("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_ecgc ),_ecgc );};_gege :=(len (_fbgcf )==2&&len (_ecgc )==2)||(len (_fbgcf )==2&&len (_ecgc )==0&&_adfa (_feaff [_fbgcf [0]],_feaff [_fbgcf [1]]))||(len (_ecgc )==2&&len (_fbgcf )==0&&_aefec (_feaff [_ecgc [0]],_feaff [_ecgc [1]]));if _cfg {_aae .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_fbgcf ),len (_ecgc ),_gege );};if !_gege {return &ruling {},false ;};if len (_ecgc )==0{for _gbed ,_agef :=range _badg {if _agef !=_gbef {_ecgc =append (_ecgc ,_gbed );};};};if len (_fbgcf )==0{for _aebg ,_cfdf :=range _badg {if _cfdf !=_fdaf {_fbgcf =append (_fbgcf ,_aebg );};};};if _cfg {_ca .Log .Info ("\u0020\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025\u0064 
\u0076\u0065\u0072\u0074\u0073\u003d\u0025d\u0020\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
// String returns a string describing `ma`: the element count plus the first
// and last marks, or a fixed placeholder when the array is empty.
func (_bdcc TextMarkArray) String() string {
	marks := _bdcc._ggd
	count := len(marks)
	if count == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	first, last := marks[0], marks[count-1]
	return _aae.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", count, first, last)
}

// writeText writes the paragraph's text to `_dfdd`.
//
// A paragraph without a `_cgaa` table writes its own cell text. Otherwise
// the table is written row by row: each cell writes its cell text (a tab if
// the cell is nil) followed by a space, and rows are separated by newlines.
// Errors from Write are not checked.
func (_fefd *textPara) writeText(_dfdd _df.Writer) {
	table := _fefd._cgaa
	if table == nil {
		_fefd.writeCellText(_dfdd)
		return
	}

	lastRow := table._dbff - 1
	for row := 0; row < table._dbff; row++ {
		for col := 0; col < table._acgb; col++ {
			if cell := table.get(col, row); cell != nil {
				cell.writeCellText(_dfdd)
			} else {
				_dfdd.Write([]byte("\u0009"))
			}
			_dfdd.Write([]byte("\u0020"))
		}
		if row < lastRow {
			_dfdd.Write([]byte("\u000a"))
		}
	}
}

// appendWord adds `_ggcfb` to the line, combines the word's rectangle into
// the line's rectangle via `_eegb`, and raises the line's `_edef` and
// `_eagf` to the word's `_bbgd` and `_fgfab` when those are larger.
func (_gabg *textLine) appendWord(_ggcfb *textWord) {
	_gabg._decag = append(_gabg._decag, _ggcfb)
	_gabg.PdfRectangle = _eegb(_gabg.PdfRectangle, _ggcfb.PdfRectangle)

	if size := _ggcfb._bbgd; size > _gabg._edef {
		_gabg._edef = size
	}
	if depth := _ggcfb._fgfab; depth > _gabg._eagf {
		_gabg._eagf = depth
	}
}
|
|
|
|
|
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
// The extractor starts with empty `_fb` and `_fe` maps. The returned error
// is always nil.
func NewFromContents(contents string, resources *_ag.PdfPageResources) (*Extractor, error) {
	extractor := &Extractor{
		_ece: contents,
		_cde: resources,
		_fb:  map[string]fontEntry{},
		_fe:  map[string]textResult{},
	}
	return extractor, nil
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
|
|
|
|
// Replace with a function like Extract() (*PageText, error)
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page)
// as a PageText, together with the two integer statistics reported by
// extractPageText.
//
// After extraction the page views are computed and the page is passed to
// `_cgda`. An error from either extraction or `_cgda` yields
// (nil, 0, 0, err).
func (_bc *Extractor) ExtractPageText() (*PageText, int, int, error) {
	pageText, statA, statB, err := _bc.extractPageText(_bc._ece, _bc._cde, _cd.IdentityMatrix(), 0)
	if err != nil {
		return nil, 0, 0, err
	}

	pageText.computeViews()

	if err = _cgda(pageText); err != nil {
		return nil, 0, 0, err
	}
	return pageText, statA, statB, nil
}

// _eged returns the distance from the top of `_gage` down to the bottom of
// the bounding box of `_gefd`.
func _eged(_gage _ag.PdfRectangle, _gefd bounded) float64 {
	bottom := _gefd.bbox().Lly
	return _gage.Ury - bottom
}

// text returns the line's words concatenated in order, with a single space
// inserted before every word whose `_ebbgb` flag is set.
func (_gcbg *textLine) text() string {
	var sb _ecb.Builder
	for _, word := range _gcbg._decag {
		if word._ebbgb {
			sb.WriteString("\u0020")
		}
		sb.WriteString(word._gbcg)
	}
	return sb.String()
}

// _cbbc reports whether the vertical extents of the two rectangles overlap
// or touch.
func _cbbc(_dcgg, _egc _ag.PdfRectangle) bool {
	if _dcgg.Lly > _egc.Ury {
		return false
	}
	return _egc.Lly <= _dcgg.Ury
}

// addDiacritic appends `_befd` to the text of the word's last mark and
// replaces that text with its NFKC-normalised form. It panics if the word
// has no marks.
func (_bacec *textWord) addDiacritic(_befd string) {
	marks := _bacec._eeda
	lastMark := marks[len(marks)-1]
	lastMark._ddbg = _b.NFKC.String(lastMark._ddbg + _befd)
}

// depthIndexes returns the keys of `_aabed` in ascending order, or nil when
// the bag has no bins.
func (_fdce *wordBag) depthIndexes() []int {
	binCount := len(_fdce._aabed)
	if binCount == 0 {
		return nil
	}

	keys := make([]int, 0, binCount)
	for depthIdx := range _fdce._aabed {
		keys = append(keys, depthIdx)
	}
	_db.Ints(keys)
	return keys
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
|
|
|
|
func (_dgec PageText )Marks ()*TextMarkArray {return &TextMarkArray {_ggd :_dgec ._ebd }};type cachedImage struct{_fc *_ag .Image ;_ea _ag .PdfColorspace ;};func (_bga *stateStack )size ()int {return len (*_bga )};func (_baafc *textTable )growTable (){_gbbe :=func (_beaag paraList ){_baafc ._dbff ++;for _efead :=0;_efead < _baafc ._acgb ;_efead ++{_dgcc :=_beaag [_efead ];_baafc .put (_efead ,_baafc ._dbff -1,_dgcc );};};_agec :=func (_bdaa paraList ){_baafc ._acgb ++;for _abdba :=0;_abdba < _baafc ._dbff ;_abdba ++{_dbdaa :=_bdaa [_abdba ];_baafc .put (_baafc ._acgb -1,_abdba ,_dbdaa );};};for {_ceba :=false ;_gfff :=_baafc .getDown ();_ggbg :=_baafc .getRight ();if _gfff !=nil &&_ggbg !=nil {_cef :=_gfff [len (_gfff )-1];if _cef !=nil &&!_cef ._ddadc &&_cef ==_ggbg [len (_ggbg )-1]{_gbbe (_gfff );if _ggbg =_baafc .getRight ();_ggbg !=nil {_agec (_ggbg );_baafc .put (_baafc ._acgb -1,_baafc ._dbff -1,_cef );};_ceba =true ;};};if !_ceba &&_gfff !=nil {_gbbe (_gfff );_ceba =true ;};if !_ceba &&_ggbg !=nil {_agec (_ggbg );_ceba =true ;};if !_ceba {break ;};};};func (_ecfb *shapesState )clearPath (){_ecfb ._eff =nil ;_ecfb ._ccd =false ;if _feec {_ca .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_ecfb );};};func (_fbbe *textObject )newTextMark (_cedb string ,_aafc _cd .Matrix ,_geag _cd .Point ,_ccec float64 ,_caec *_ag .PdfFont ,_dfff float64 ,_gbga ,_fabc _aa .Color )(textMark ,bool ){_gfbc :=_aafc .Angle ();_egcg :=_dbeed (_gfbc ,_ffb );var _cfcfb float64 ;if _egcg %180!=90{_cfcfb =_aafc .ScalingFactorY ();}else {_cfcfb =_aafc .ScalingFactorX ();};_baed :=_eaaa (_aafc );_dffe :=_ag .PdfRectangle {Llx :_baed .X ,Lly :_baed .Y ,Urx :_geag .X ,Ury :_geag .Y };switch _egcg %360{case 90:_dffe .Urx -=_cfcfb ;case 180:_dffe .Ury -=_cfcfb ;case 270:_dffe .Urx +=_cfcfb ;case 0:_dffe .Ury +=_cfcfb ;default:_egcg =0;_dffe .Ury +=_cfcfb ;};if _dffe .Llx > _dffe .Urx {_dffe .Llx ,_dffe .Urx =_dffe .Urx ,_dffe .Llx ;};if _dffe .Lly > _dffe 
.Ury {_dffe .Lly ,_dffe .Ury =_dffe .Ury ,_dffe .Lly ;};_adff ,_gccda :=_ddf (_dffe ,_fbbe ._ccf ._bea );if !_gccda {_ca .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_dffe ,_fbbe ._ccf ._bea ,_cedb );};_dffe =_adff ;_fcda :=_dffe ;_ecde :=_fbbe ._ccf ._bea ;switch _egcg %360{case 90:_ecde .Urx ,_ecde .Ury =_ecde .Ury ,_ecde .Urx ;_fcda =_ag .PdfRectangle {Llx :_ecde .Urx -_dffe .Ury ,Urx :_ecde .Urx -_dffe .Lly ,Lly :_dffe .Llx ,Ury :_dffe .Urx };case 180:_fcda =_ag .PdfRectangle {Llx :_ecde .Urx -_dffe .Llx ,Urx :_ecde .Urx -_dffe .Urx ,Lly :_ecde .Ury -_dffe .Lly ,Ury :_ecde .Ury -_dffe .Ury };case 270:_ecde .Urx ,_ecde .Ury =_ecde .Ury ,_ecde .Urx ;_fcda =_ag .PdfRectangle {Llx :_dffe .Ury ,Urx :_dffe .Lly ,Lly :_ecde .Ury -_dffe .Llx ,Ury :_ecde .Ury -_dffe .Urx };};if _fcda .Llx > _fcda .Urx {_fcda .Llx ,_fcda .Urx =_fcda .Urx ,_fcda .Llx ;};if _fcda .Lly > _fcda .Ury {_fcda .Lly ,_fcda .Ury =_fcda .Ury ,_fcda .Lly ;};_efeg :=textMark {_ddbg :_cedb ,PdfRectangle :_fcda ,_cbff :_dffe ,_aaged :_caec ,_cbbg :_cfcfb ,_dfbd :_dfff ,_ggcba :_aafc ,_bgef :_geag ,_bgga :_egcg ,_cae :_gbga ,_gdcc :_fabc };if _dcff {_ca .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_baed ,_geag ,_efeg .String ());};return _efeg ,_gccda ;};type textLine struct{_ag .PdfRectangle ;_eagf float64 ;_decag []*textWord ;_edef float64 ;};func (_fefc paraList )toTextMarks ()[]TextMark {_ffgf :=0;var _dcgd []TextMark ;for _fgee ,_abce :=range _fefc {_aaegb :=_abce .toTextMarks (&_ffgf );_dcgd =append (_dcgd ,_aaegb ...);if _fgee !=len (_fefc )-1{if _cfgf (_abce ,_fefc [_fgee +1]){_dcgd 
=_eegf (_dcgd ,&_ffgf ,"\u0020");}else {_dcgd =_eegf (_dcgd ,&_ffgf ,"\u000a");_dcgd =_eegf (_dcgd ,&_f
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
|
|
|
|
// and position, size information for each image.
|
|
|
|
|
// A set of options to control page image extraction can be passed in. The options
|
|
|
|
|
// parameter can be nil for the default options. By default, inline stencil masks
|
|
|
|
|
// are not extracted.
|
|
|
|
|
func (_gb *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_cbf :=&imageExtractContext {_cdb :options };_bf :=_cbf .extractContentStreamImages (_gb ._ece ,_gb ._cde );if _bf !=nil {return nil ,_bf ;};return &PageImages {Images :_cbf ._eb },nil ;};func (_dd *imageExtractContext )processOperand (_eab *_be .ContentStreamOperation ,_cf _be .GraphicsState ,_fef *_ag .PdfPageResources )error {if _eab .Operand =="\u0042\u0049"&&len (_eab .Params )==1{_adf ,_fee :=_eab .Params [0].(*_be .ContentStreamInlineImage );if !_fee {return nil ;};if _aaa ,_dec :=_f .GetBoolVal (_adf .ImageMask );_dec {if _aaa &&!_dd ._cdb .IncludeInlineStencilMasks {return nil ;};};return _dd .extractInlineImage (_adf ,_cf ,_fef );}else if _eab .Operand =="\u0044\u006f"&&len (_eab .Params )==1{_ade ,_ga :=_f .GetName (_eab .Params [0]);if !_ga {_ca .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _ef ;};_ ,_fa :=_fef .GetXObjectByName (*_ade );switch _fa {case _ag .XObjectTypeImage :return _dd .extractXObjectImage (_ade ,_cf ,_fef );case _ag .XObjectTypeForm :return _dd .extractFormImages (_ade ,_cf ,_fef );};};return nil ;};func _eeac (_ebcb _ag .PdfRectangle )rulingKind {_bdgf :=_ebcb .Width ();_ceec :=_ebcb .Height ();return _bbced (_bdgf ,_ceec );};func _facg (_gbd _ag .PdfRectangle )textState {return textState {_cca :100,_ffe :RenderModeFill ,_afgf :_gbd };};func (_eddb *textObject )getFontDict (_bgff string )(_bda _f .PdfObject ,_baa error ){_gebc :=_eddb ._ecgg ;if _gebc ==nil {_ca .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_bgff );return nil ,nil ;};_bda ,_gffg :=_gebc .GetFontByName (_f .PdfObjectName (_bgff ));if !_gffg {_ca .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et 
\u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_bgff );return nil ,_d .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _bda ,nil ;};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a human readable description of `s`.
|
|
|
|
|
// String lists the members of the set in ascending order using %+v formatting.
func (set intSet) String() string {
	var members []int
	for member := range set {
		if !set.has(member) {
			continue
		}
		members = append(members, member)
	}
	_db.Ints(members)
	return _aae.Sprintf("\u0025\u002b\u0076", members)
}

// _ccc returns the left edge (Llx) of `a`'s bounding box minus the left edge
// of `b`'s bounding box.
func _ccc(a, b bounded) float64 {
	left := a.bbox().Llx
	return left - b.bbox().Llx
}

// _fafdc reports whether the magnitude of `x` is below the tolerance _cfcd.
func _fafdc(x float64) bool {
	magnitude := _ed.Abs(x)
	return magnitude < _cfcd
}

// pullWord adds `word` to the bag under depth index `depthIdx`, growing the
// bag's bounding box and maximum font size as needed, and records the word in
// `removals` so that it can later be removed from the bag it came from.
func (bag *wordBag) pullWord(word *textWord, depthIdx int, removals map[int]map[*textWord]struct{}) {
	bag.PdfRectangle = _eegb(bag.PdfRectangle, word.PdfRectangle)
	if bag._fdg < word._bbgd {
		bag._fdg = word._bbgd
	}
	bucket := append(bag._aabed[depthIdx], word)
	bag._aabed[depthIdx] = bucket
	removals[depthIdx][word] = struct{}{}
}

// lastpointEstablished returns a point together with a flag. The flag is
// false when the state is fresh (the stored point `_gadf` is returned) or when
// the most recent subpath is closed (its last point is returned); otherwise a
// zero point and true are returned.
func (ss *shapesState) lastpointEstablished() (_cd.Point, bool) {
	if ss._ccd {
		return ss._gadf, false
	}
	if count := len(ss._eff); count > 0 {
		if tail := ss._eff[count-1]; tail._ffea {
			return tail.last(), false
		}
	}
	return _cd.Point{}, true
}

// _gedg returns the matrix that translates by the coordinates of `p`.
func _gedg(p _cd.Point) _cd.Matrix {
	return _cd.TranslationMatrix(p.X, p.Y)
}

// _daad reports whether the horizontal extents of `a` and `b` overlap
// (touching edges count as overlapping).
func _daad(a, b _ag.PdfRectangle) bool {
	if b.Llx > a.Urx {
		return false
	}
	return a.Llx <= b.Urx
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `w`.
|
|
|
|
|
// String returns a one-line description of the word: its `_fgfab` value, its
// bounding box, its font size and its text.
func (w *textWord) String() string {
	const layout = "\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022"
	return _aae.Sprintf(layout, w._fgfab, w.PdfRectangle, w._bbgd, w._gbcg)
}

// top returns the state at the top of the stack without removing it, or nil
// when the stack is empty.
func (stack *stateStack) top() *textState {
	if !stack.empty() {
		return (*stack)[stack.size()-1]
	}
	return nil
}

// paraList is a sequence of paragraphs.
type paraList []*textPara

// intersections maps the index of every ruling that takes part in a
// crossing between a `_fdaf` ruling and a `_gbef` ruling to the set of ruling
// indexes it crosses. It returns nil when there are too few rulings of either
// kind (fewer than _dfdf+1 / _dgea+1) or more than _gbaa rulings of both
// kinds combined.
func (rulings rulingList) intersections() map[int]intSet {
	var first, second []int
	for idx, r := range rulings {
		switch r._bfeeb {
		case _fdaf:
			first = append(first, idx)
		case _gbef:
			second = append(second, idx)
		}
	}
	if len(first) < _dfdf+1 || len(second) < _dgea+1 {
		return nil
	}
	total := len(first) + len(second)
	if total > _gbaa {
		_ca.Log.Debug("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064", len(rulings), len(first), len(second))
		return nil
	}

	crossings := make(map[int]intSet, total)
	// link records that ruling `from` crosses ruling `to`.
	link := func(from, to int) {
		if _, exists := crossings[from]; !exists {
			crossings[from] = make(intSet)
		}
		crossings[from].add(to)
	}
	for _, i := range first {
		for _, j := range second {
			if !rulings[i].intersects(rulings[j]) {
				continue
			}
			link(i, j)
			link(j, i)
		}
	}
	return crossings
}

// event is a small record holding a float64, a bool and an int.
type event struct {
	_fadaa float64
	_efgdb bool
	_cbeba int
}

// _aefec applies _agae to the absolute x and y distances between `p` and `q`.
func _aefec(p, q _cd.Point) bool {
	return _agae(_ed.Abs(p.X-q.X), _ed.Abs(p.Y-q.Y))
}

// _dbeed rounds `x` to the nearest multiple of `step`. A step of 0 is treated
// as 1.
func _dbeed(x float64, step int) int {
	if step == 0 {
		step = 1
	}
	unit := float64(step)
	multiples := _ed.Round(x / unit)
	return int(multiples * unit)
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
|
|
|
|
// CharcodeBytesToUnicode.
|
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
|
|
|
|
// ExtractText returns the text produced by ExtractTextWithStats, discarding
// the character statistics.
func (e *Extractor) ExtractText() (string, error) {
	text, _, _, err := e.ExtractTextWithStats()
	return text, err
}

// toTextMarks returns the text marks of every word on the line. A space mark
// is inserted (through _eegf) in front of each word flagged `_ebbgb`.
// `offset` is passed on to the helpers that build the marks.
func (line *textLine) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for _, word := range line._decag {
		if word._ebbgb {
			marks = _eegf(marks, offset, "\u0020")
		}
		marks = append(marks, word.toTextMarks(offset)...)
	}
	return marks
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `p`.
|
|
|
|
|
// String returns a short description of the paragraph: bounding box, table
// dimensions (when `_cgaa` is set), number of lines and the first part of its
// text.
func (p *textPara) String() string {
	var tableDims string
	if table := p._cgaa; table != nil {
		tableDims = _aae.Sprintf("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020", table._acgb, table._dbff)
	}
	preview := _aecb(p.text(), 50)
	return _aae.Sprintf("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071", p.PdfRectangle, tableDims, len(p._adga), preview)
}

// setTextLeading stores `leading` in the text state. It is a no-op on a nil
// text object.
func (to *textObject) setTextLeading(leading float64) {
	if to != nil {
		to._gdb._egf = leading
	}
}

// pop removes the top state from the stack and returns a pointer to a copy of
// it, or nil when the stack is empty.
func (stack *stateStack) pop() *textState {
	if stack.empty() {
		return nil
	}
	last := len(*stack) - 1
	popped := *(*stack)[last]
	*stack = (*stack)[:last]
	return &popped
}

// clear resets the subpath to its zero value.
func (sp *subpath) clear() {
	var zero subpath
	*sp = zero
}

// connections returns the set of ruling indexes reachable from ruling
// `start` by following the crossings recorded in `crossings`.
func (rulings rulingList) connections(crossings map[int]intSet, start int) intSet {
	reached := make(intSet)
	seen := make(intSet)

	var visit func(int)
	visit = func(current int) {
		if seen.has(current) {
			return
		}
		seen.add(current)
		for idx := range rulings {
			if crossings[idx].has(current) {
				reached.add(idx)
			}
		}
		for idx := range rulings {
			if reached.has(idx) {
				visit(idx)
			}
		}
	}

	visit(start)
	return reached
}

// makeRemovals returns a map holding an empty word set for every depth index
// currently present in the bag.
func (bag *wordBag) makeRemovals() map[int]map[*textWord]struct{} {
	removals := make(map[int]map[*textWord]struct{}, len(bag._aabed))
	for depthIdx := range bag._aabed {
		removals[depthIdx] = map[*textWord]struct{}{}
	}
	return removals
}

// close marks the subpath as closed, first appending its starting point when
// the last point differs from it, and finally dropping duplicate points.
func (sp *subpath) close() {
	start := sp._aacge[0]
	if !_gdad(start, sp.last()) {
		sp.add(sp._aacge[0])
	}
	sp._ffea = true
	sp.removeDuplicates()
}

// _cdggc looks up a single-rune string in the _cfaa table. It returns
// ("", false) when `s` does not consist of exactly one rune.
func _cdggc(s string) (string, bool) {
	runes := []rune(s)
	if len(runes) != 1 {
		return "", false
	}
	replacement, found := _cfaa[runes[0]]
	return replacement, found
}

// equals reports whether both rulings have identical kind and coordinates.
func (r *ruling) equals(other *ruling) bool {
	if r._bfeeb != other._bfeeb || r._cgbb != other._cgbb {
		return false
	}
	return r._gcad == other._gcad && r._fdcd == other._fdcd
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `v`.
|
|
|
|
|
// String returns a description of the ruling: its kind, its primary
// coordinate, its extent along the other axis and the length of that extent.
// Rulings of kind `_aebf` are reported with a fixed string.
func (r *ruling) String() string {
	primary, secondary := "\u0078", "\u0079"
	switch r._bfeeb {
	case _aebf:
		return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047"
	case _gbef:
		primary, secondary = "\u0079", "\u0078"
	}
	length := r._fdcd - r._gcad
	return _aae.Sprintf("\u0025\u0031\u0030\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d \u0025\u0036\u002e\u0032\u0066 \u0028\u00256\u002e\u0032\u0066\u0029", r._bfeeb, primary, r._cgbb, secondary, r._gcad, r._fdcd, length)
}

// _defg returns _ccc(a, b) unless that value is within tolerance of zero
// (per _fafdc), in which case it falls back to _bfaca(a, b).
func _defg(a, b bounded) float64 {
	if delta := _ccc(a, b); !_fafdc(delta) {
		return delta
	}
	return _bfaca(a, b)
}

// isAtom checks whether the paragraph, its `_afga` neighbour, its `_gbbgf`
// neighbour and the neighbour those two share all exist and are not yet
// flagged `_ddadc`. If so the four paragraphs are handed to _cadb and its
// result is returned; otherwise nil is returned.
func (p *textPara) isAtom() *textTable {
	first, second := p._afga, p._gbbgf
	if first == nil || first._ddadc || second == nil || second._ddadc {
		return nil
	}
	shared := first._gbbgf
	if shared == nil || shared._ddadc || shared != second._afga {
		return nil
	}
	return _cadb(p, first, second, shared)
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial information:
|
|
|
|
|
// display position and size.
|
|
|
|
|
// PageImages holds the images extracted from a page.
type PageImages struct{Images []ImageMark ;};
// add inserts `_eecf` into the set.
func (_faba intSet )add (_eecf int ){_faba [_eecf ]=struct{}{}};
// closePath closes the most recent subpath. When the state is flagged `_ccd`
// a new subpath built from `_gadf` is appended first; when there is no subpath
// at all an error is logged and nothing is closed.
func (_cfcg *shapesState )closePath (){if _cfcg ._ccd {_cfcg ._eff =append (_cfcg ._eff ,_feb (_cfcg ._gadf ));_cfcg ._ccd =false ;}else if len (_cfcg ._eff )==0{_ca .Log .Error ("\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068");_cfcg ._ccd =false ;return ;};_cfcg ._eff [len (_cfcg ._eff )-1].close ();if _feec {_ca .Log .Info ("\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073",_cfcg );};};
// arrangeText turns the words in the bag into a paragraph. After sorting (and
// removing duplicates when `_fabd` is set) it walks the depth indexes and, while
// an index still holds words, starts a line from the first word in reading
// order (via _acfe) and keeps pulling words into that line from the depth band
// [_fgfab - _fagg*fontsize, _fgfab + _fagg*fontsize] of that first word.
// For each depth index in the band the candidate is highestWord; its gap
// (_beda) to the line's last word decides what happens:
//   - gap < -_adagd*fontsize: the line is finished (labelled break),
//   - gap > _bfddb*fontsize: the candidate is skipped,
//   - otherwise it replaces the current choice if _ccc puts it to the left of it.
// Finished lines get their word boundaries marked, are ordered with _geeb and
// are passed to _gccc together with the bag's bounding box. Returns nil when
// no line was produced. The trailing Printf calls are debug output gated by
// `_becfg`, `_efdf` and `_ggb`.
func (_cdef *wordBag )arrangeText ()*textPara {_cdef .sort ();if _fabd {_cdef .removeDuplicates ();};var _dgbc []*textLine ;for _ ,_gadd :=range _cdef .depthIndexes (){for !_cdef .empty (_gadd ){_daac :=_cdef .firstReadingIndex (_gadd );_ggcc :=_cdef .firstWord (_daac );_dggg :=_acfe (_cdef ,_daac );_geea :=_ggcc ._bbgd ;_cgg :=_ggcc ._fgfab -_fagg *_geea ;_fcbe :=_ggcc ._fgfab +_fagg *_geea ;_dddd :=_bfddb *_geea ;_abbe :=_adagd *_geea ;_fdbe :for {var _faffb *textWord ;_bfaf :=0;for _ ,_acdg :=range _cdef .depthBand (_cgg ,_fcbe ){_cdfa :=_cdef .highestWord (_acdg ,_cgg ,_fcbe );if _cdfa ==nil {continue ;};_bcfdb :=_beda (_cdfa ,_dggg ._decag [len (_dggg ._decag )-1]);if _bcfdb < -_abbe {break _fdbe ;};if _bcfdb > _dddd {continue ;};if _faffb !=nil &&_ccc (_cdfa ,_faffb )>=0{continue ;};_faffb =_cdfa ;_bfaf =_acdg ;};if _faffb ==nil {break ;};_dggg .pullWord (_cdef ,_faffb ,_bfaf );};_dggg .markWordBoundaries ();_dgbc =append (_dgbc ,_dggg );};};if len (_dgbc )==0{return nil ;};_db .Slice (_dgbc ,func (_dcdf ,_cecc int )bool {return _geeb (_dgbc [_dcdf ],_dgbc [_cecc ])< 0});_ecfd :=_gccc (_cdef .PdfRectangle ,_dgbc );if _becfg {_ca .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_ecfd .String ());if _efdf {for _fegce ,_fbde :=range _ecfd ._adga {_aae .Printf 
("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fegce ,_fbde .String ());if _ggb {for _gadb ,_efcdf :=range _fbde ._decag {_aae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_gadb ,_efcdf .String ());for _ffbb ,_fcea :=range _efcdf ._eeda {_aae .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_ffbb ,_fcea .String ());};};};};};};return _ecfd ;};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `b`.
|
|
|
|
|
// String describes the bag: bounding box, font size (`_fdg`), number of words
// and the word texts, visited in depthIndexes order.
func (_fcbc *wordBag )String ()string {var _adba []string ;for _ ,_dacf :=range _fcbc .depthIndexes (){_dcf ,_ :=_fcbc ._aabed [_dacf ];for _ ,_befe :=range _dcf {_adba =append (_adba ,_befe ._gbcg );};};return _aae .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_fcbc .PdfRectangle ,_fcbc ._fdg ,len (_adba ),_adba );};
// scanBand counts the words of `_cbdb` that qualify for `_fddba` and returns
// that count. A word qualifies when
//   - its `_fgfab` lies in [_cfd-pad, _aaab+pad] with pad = _fagg * _fddba._fdg
//     (the font size of `_fddba` on entry),
//   - the predicate `_beg` accepts it, and
//   - `_eggc` <= 0, or the smaller of its two font size mismatch measures
//     (relative difference and max ratio) does not exceed `_eggc`.
// Unless `_cade` is set, qualifying words are pulled into `_fddba` and the
// recorded removals are handed to applyRemovals at the end. When `_cade` is
// set nothing is moved and the scan of a depth index stops at its first
// qualifying word. Unless `_ebcf` is set, [_cfd, _aaab] is widened to include
// the `_fgfab` of every qualifying word, which affects the range test for the
// words that follow. `_gacb` is not used by the body.
func (_cbdb *wordBag )scanBand (_gacb string ,_fddba *wordBag ,_beg func (_gcbe *wordBag ,_gda *textWord )bool ,_cfd ,_aaab ,_eggc float64 ,_cade ,_ebcf bool )int {_bacf :=_fddba ._fdg ;var _deca map[int ]map[*textWord ]struct{};if !_cade {_deca =_cbdb .makeRemovals ();};_bcc :=_fagg *_bacf ;_cabg :=0;var _cdga []*textWord ;for _ ,_geeg :=range _cbdb .depthBand (_cfd -_bcc ,_aaab +_bcc ){if len (_cbdb ._aabed [_geeg ])==0{continue ;};for _ ,_ebda :=range _cbdb ._aabed [_geeg ]{if !(_cfd -_bcc <=_ebda ._fgfab &&_ebda ._fgfab <=_aaab +_bcc ){continue ;};if !_beg (_fddba ,_ebda ){continue ;};_dgd :=2.0*_ed .Abs (_ebda ._bbgd -_fddba ._fdg )/(_ebda ._bbgd +_fddba ._fdg );_abbc :=_ed .Max (_ebda ._bbgd /_fddba ._fdg ,_fddba ._fdg /_ebda ._bbgd );_ggeg :=_ed .Min (_dgd ,_abbc );if _eggc > 0&&_ggeg > _eggc {continue ;};if !_cade {_fddba .pullWord (_ebda ,_geeg ,_deca );};_cdga =append (_cdga ,_ebda );_cabg ++;if !_ebcf {if _ebda ._fgfab < _cfd {_cfd =_ebda ._fgfab ;};if _ebda ._fgfab > _aaab {_aaab =_ebda ._fgfab ;};};if _cade {break ;};};};if !_cade {_cbdb .applyRemovals (_deca );};return _cabg ;};
// getCurrentFont returns the font (`_agce`) of the state on top of the `_egd`
// stack. When the stack is empty or that font is nil it logs a message and
// returns the model package's DefaultFont().
func (_dcabe *textObject )getCurrentFont ()*_ag .PdfFont {var _ecf *_ag .PdfFont ;if !_dcabe ._egd .empty (){_ecf =_dcabe ._egd .top ()._agce ;};if _ecf ==nil {_ca .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _ag .DefaultFont ();};return _ecf ;};
// _eegb returns the smallest rectangle that contains both `_aaaae` and `_defe`.
func _eegb (_aaaae ,_defe _ag 
.PdfRectangle )_ag .PdfRectangle {return _ag .PdfRectangle {Llx :_ed .Min (_aaaae .Llx ,_defe .Llx ),Lly :_ed .Min (_aaaae .Lly ,_defe .Lly ),Urx :_ed .Max (_aaaae .Urx ,_defe .Urx ),Ury :_ed .Max (_aaaae .Ury ,_defe .Ury )};};
// _ef and _ab are the package's sentinel errors for type check and range
// check failures.
var (_ef =_d .New ("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072");_ab =_d .New ("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072"););
// lineTo adds the point (`_cag`, `_bece`) to the current path and, when `_feec`
// is set, logs it together with its device-space position.
func (_gdfa *shapesState )lineTo (_cag ,_bece float64 ){_gdfa .addPoint (_cag ,_bece );if _feec {_ca .Log .Info ("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_cag ,_bece ,_gdfa .devicePoint (_cag ,_bece ));};};
// _dccc returns `_ccda` without its last rune.
// NOTE(review): an empty string makes the slice expression panic - confirm
// that callers never pass one.
func _dccc (_ccda string )string {_dacef :=[]rune (_ccda );return string (_dacef [:len (_dacef )-1])};
// findTableGrid builds a table from the cells reported by `_aadbe`.cells().
// Every cell rectangle must be matched by inRect to a paragraph; the paragraph
// is stored at column `_edac` with the row index flipped (`_bebd`-1-`_gfbgc`).
// It returns nil as soon as one cell has no matching paragraph.
func (_cdaa paraList )findTableGrid (_aadbe rulingList )*textTable {_bffb ,_bebd ,_gcbd :=_aadbe .cells ();_efcf :=textTable {_acgb :_bffb ,_dbff :_bebd ,_gdcfd :true ,_eagd :make (map[uint64 ]*textPara )};for _gfbgc :=0;_gfbgc < _bebd ;_gfbgc ++{for _edac :=0;_edac < _bffb ;_edac ++{_gcbf :=_gcbd [_gfbgc *_bffb +_edac ];_gbgac :=_cdaa .inRect (_gcbf );if _gbgac !=nil {_efcf .put (_edac ,_bebd -1-_gfbgc ,_gbgac );}else {return nil ;};};};return &_efcf ;};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
|
|
|
|
// ExtractTextWithStats extracts the page text through ExtractPageText and
// returns it as a string together with the two counters reported by
// ExtractPageText. On error the text is empty and the counters and error are
// passed through unchanged.
func (e *Extractor) ExtractTextWithStats() (_bac string, _cadd int, _gge int, _fg error) {
	pageText, _cadd, _gge, _fg := e.ExtractPageText()
	if _fg != nil {
		return "", _cadd, _gge, _fg
	}
	return pageText.Text(), _cadd, _gge, nil
}

// Ruling kinds. The names used for them in log output are listed in _dcec.
const (
	_aebf rulingKind = iota
	_gbef
	_fdaf
)

// markCells sets the `_ddadc` flag on every paragraph stored in the table.
// Empty cells are skipped: toTextTable treats a nil result from get as an
// empty cell, so this method must not dereference it either.
func (t *textTable) markCells() {
	for row := 0; row < t._dbff; row++ {
		for col := 0; col < t._acgb; col++ {
			if cell := t.get(col, row); cell != nil {
				cell._ddadc = true
			}
		}
	}
}

// _dcec maps each ruling kind to the name used when printing it.
var _dcec = map[rulingKind]string{
	_aebf: "\u006e\u006f\u006e\u0065",
	_gbef: "\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",
	_fdaf: "\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",
}

// extractContentStreamImages parses `contents` and runs every operation in it
// through processOperand, using `resources` to resolve names. The image cache
// and the options are initialised to empty defaults when unset.
func (ctx *imageExtractContext) extractContentStreamImages(contents string, resources *_ag.PdfPageResources) error {
	operations, err := _be.NewContentStreamParser(contents).Parse()
	if err != nil {
		return err
	}
	if ctx._bb == nil {
		ctx._bb = map[*_f.PdfObjectStream]*cachedImage{}
	}
	if ctx._cdb == nil {
		ctx._cdb = &ImageExtractOptions{}
	}

	processor := _be.NewContentStreamProcessor(*operations)
	handler := func(op *_be.ContentStreamOperation, gs _be.GraphicsState, res *_ag.PdfPageResources) error {
		return ctx.processOperand(op, gs, res)
	}
	processor.AddHandler(_be.HandlerConditionEnumAllOperands, "", handler)
	return processor.Process(resources)
}

// lineRuling is a ruling candidate described by its kind and two end points.
type lineRuling struct {
	_fcgc         rulingKind
	_dfaca, _afcc _cd.Point
}

// _efdc rounds `x` to the nearest multiple of _efcd.
func _efdc(x float64) float64 {
	return _efcd * _ed.Round(x/_efcd)
}

// _cfac binds `limit` as the third argument of `pred`, producing a
// two-argument predicate.
func _cfac(pred func(*wordBag, *textWord, float64) bool, limit float64) func(*wordBag, *textWord) bool {
	return func(bag *wordBag, word *textWord) bool {
		return pred(bag, word, limit)
	}
}
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `k`.
|
|
|
|
|
// String returns the name registered for the kind in _dcec, or a message
// containing the numeric value when the kind is unknown.
func (k rulingKind) String() string {
	if name, known := _dcec[k]; known {
		return name
	}
	return _aae.Sprintf("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064", k)
}

// _eebd reports whether the `_bbba` rectangles of the two paragraphs overlap
// horizontally.
func _eebd(a, b *textPara) bool {
	return _daad(a._bbba, b._bbba)
}

// _ccfe converts filled subpaths to rulings and groups them into grids. Only
// quadrilateral subpaths for which makeRectRuling succeeds contribute.
func _ccfe(fills []*subpath) []rulingList {
	_fffcd(fills)
	var rulings rulingList
	for _, path := range fills {
		if !path.isQuadrilateral() {
			continue
		}
		r, ok := path.makeRectRuling()
		if !ok {
			continue
		}
		rulings = append(rulings, r)
	}
	tidy := rulings.tidied("\u0066\u0069\u006cl\u0073")
	return tidy.toGrids()
}

// wordBag is a bounding box plus a collection of words keyed by an integer
// depth index. `_fdg` holds a font size (see wordBag.String) and `_bbcee` a
// float64 set when the bag is created.
type wordBag struct {
	_ag.PdfRectangle
	_fdg   float64
	_bbcee float64
	_aabed map[int][]*textWord
}

// asRuling converts the line to a ruling. For the two supported kinds the
// primary coordinate is the mean along the fixed axis and the extent is the
// ordered pair of end point coordinates along the other axis. Any other kind
// is logged as an error and yields (nil, false).
func (line lineRuling) asRuling() (*ruling, bool) {
	result := ruling{_bfeeb: line._fcgc}
	from, to := line._dfaca, line._afcc
	switch line._fcgc {
	case _fdaf:
		result._cgbb = line.xMean()
		result._gcad, result._fdcd = _ed.Min(from.Y, to.Y), _ed.Max(from.Y, to.Y)
	case _gbef:
		result._cgbb = line.yMean()
		result._gcad, result._fdcd = _ed.Min(from.X, to.X), _ed.Max(from.X, to.X)
	default:
		_ca.Log.Error("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064", line._fcgc)
		return nil, false
	}
	return &result, true
}

// _bcgd returns the negated lower edge (Lly) of the bounding box of `b`.
func _bcgd(b bounded) float64 {
	box := b.bbox()
	return -box.Lly
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
// BBox returns the union of the bounding boxes of the marks in the array.
// Marks flagged Meta and marks whose text is rejected by _ecdca are skipped.
// The bool is false when no mark contributed.
func (_aeb *TextMarkArray )BBox ()(_ag .PdfRectangle ,bool ){var _bfgg _ag .PdfRectangle ;_egge :=false ;for _ ,_bcgb :=range _aeb ._ggd {if _bcgb .Meta ||_ecdca (_bcgb .Text ){continue ;};if _egge {_bfgg =_eegb (_bfgg ,_bcgb .BBox );}else {_bfgg =_bcgb .BBox ;_egge =true ;};};return _bfgg ,_egge ;};
// _bgb returns the single parameter of `_bbbd` as a float64. It logs and
// returns an error when the operation does not have exactly one parameter.
func _bgb (_bbbd *_be .ContentStreamOperation )(float64 ,error ){if len (_bbbd .Params )!=1{_aff :=_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_ca .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_bbbd .Operand ,1,len (_bbbd .Params ),_bbbd .Params );return 0.0,_aff ;};return _f .GetNumberAsFloat (_bbbd .Params [0]);};
// _ded is a package-level flag, off by default.
var _ded =false ;
// _fdba returns the larger of its two arguments.
func _fdba (_ddgd ,_agccb int )int {if _ddgd > _agccb {return _ddgd ;};return _agccb ;};
// markWordBoundaries flags (`_ebbgb`) every word after the first whose gap
// (_beda) to the preceding word is at least _ffg times the line's font size.
// NOTE(review): slicing `_decag[1:]` panics on a line without words - this
// presumably relies on lines always holding at least one word.
func (_aedg *textLine )markWordBoundaries (){_ced :=_ffg *_aedg ._edef ;for _bbgc ,_cccb :=range _aedg ._decag [1:]{if _beda (_cccb ,_aedg ._decag [_bbgc ])>=_ced {_cccb ._ebbgb =true ;};};};
// _ceef builds a ruling from the segment `_cdaf`-`_cdgg`. It returns
// (nil, false) when _ebbg classifies the segment as kind `_aebf`.
func _ceef (_cdaf ,_cdgg _cd .Point )(*ruling ,bool ){_deaaf :=lineRuling {_dfaca :_cdaf ,_afcc :_cdgg ,_fcgc :_ebbg (_cdaf ,_cdgg )};if _deaaf ._fcgc ==_aebf {return nil ,false ;};return _deaaf .asRuling ();};
// textWord is a run of text marks with its bounding box. `_gbcg` is the text
// and `_bbgd` the font size (both printed by textWord.String); `_ebbgb` is the
// flag set by markWordBoundaries.
type textWord struct{_ag .PdfRectangle ;_fgfab float64 ;_gbcg string ;_eeda []*textMark ;_bbgd float64 ;_ebbgb bool ;};
// extractTables finds tables among the paragraphs with the help of the
// ruling grids `_dgee` and returns the list produced by applyTables. Lists
// with fewer than _bfacd paragraphs are returned unchanged. Logging is gated
// by `_bege`.
func (_afde paraList )extractTables (_dgee []rulingList )paraList {if _bege {_ca .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_afde ));};if len (_afde )< _bfacd {return _afde ;};_aadb :=_afde 
.findTables (_dgee );if _bege {_ca .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_aadb ));for _gega ,_dbef :=range _aadb {_dbef .log (_aae .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_gega ));};};return _afde .applyTables (_aadb );};
// maxDepth returns `_bbcee` minus the lower edge of the bag's bounding box.
func (_fege *wordBag )maxDepth ()float64 {return _fege ._bbcee -_fege .Lly };
// showTextAdjusted processes the elements of `_cbb` in order. A number x
// translates the matrix `_gdg` horizontally by -x*0.001*`_adb`; a string is
// rendered with renderText; any other element type is an error. `_egb` is
// never set to true in this function, so the x/y swap branch does not run.
// NOTE(review): the error returned by renderText is discarded here, whereas
// showText returns it - confirm whether that is intentional.
func (_eac *textObject )showTextAdjusted (_cbb *_f .PdfObjectArray )error {_egb :=false ;for _ ,_gfa :=range _cbb .Elements (){switch _gfa .(type ){case *_f .PdfObjectFloat ,*_f .PdfObjectInteger :_cfcf ,_ccb :=_f .GetNumberAsFloat (_gfa );if _ccb !=nil {_ca .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfa ,_cbb );return _ccb ;};_dgc ,_bfc :=-_cfcf *0.001*_eac ._gdb ._adb ,0.0;if _egb {_bfc ,_dgc =_dgc ,_bfc ;};_gaff :=_gedg (_cd .Point {X :_dgc ,Y :_bfc });_eac ._gdg .Concat (_gaff );case *_f .PdfObjectString :_egeb ,_cfa :=_f .GetStringBytes (_gfa );if !_cfa {_ca .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfa ,_cbb );return _f .ErrTypeError ;};_eac .renderText (_egeb );default:_ca .Log .Debug 
("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_gfa ,_cbb );return _f .ErrTypeError ;};};return nil ;};
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
// Append adds `mark` to the end of the mark array.
func (ma *TextMarkArray) Append(mark TextMark) {
	ma._ggd = append(ma._ggd, mark)
}

// fontsize returns the font size (`_edef`) of the paragraph's first line.
func (p *textPara) fontsize() float64 {
	firstLine := p._adga[0]
	return firstLine._edef
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
|
|
|
|
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
|
|
|
|
// `start` and `end` are offsets in the extracted text.
|
|
|
|
|
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
|
|
|
|
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
|
|
|
|
func (_efg *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _efg ==nil {return nil ,_d .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_aae .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );};_efbe :=len (_efg ._ggd );if _efbe ==0{return _efg ,nil ;};if start < _efg ._ggd [0].Offset {start =_efg ._ggd [0].Offset ;};if end > _efg ._ggd [_efbe -1].Offset +1{end =_efg ._ggd [_efbe -1].Offset +1;};_bdfe :=_db .Search (_efbe ,func (_ebc int )bool {return _efg ._ggd [_ebc ].Offset +len (_efg ._ggd [_ebc ].Text )-1>=start });if !(0<=_bdfe &&_bdfe < _efbe ){_dfcb :=_aae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_bdfe ,_efbe ,_efg ._ggd [0],_efg ._ggd [_efbe -1]);return nil ,_dfcb ;};_gad :=_db .Search (_efbe ,func (_cgb int )bool {return _efg ._ggd [_cgb ].Offset > end -1});if !(0<=_gad &&_gad < _efbe ){_abaf :=_aae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_gad ,_efbe ,_efg ._ggd [0],_efg ._ggd [_efbe -1]);return nil ,_abaf ;};if _gad <=_bdfe {return nil ,_aae .Errorf 
("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_bdfe ,_gad );};return &TextMarkArray {_ggd :_efg ._ggd [_bdfe :_gad ]},nil ;};func _gdad (_ddddg ,_eaagf _cd .Point )bool {return _ddddg .X ==_eaagf .X &&_ddddg .Y ==_eaagf .Y };func (_ebf *shapesState )devicePoint (_cgbc ,_daaf float64 )_cd .Point {_feg :=_ebf ._gcfa .Mult (_ebf ._daec );_cgbc ,_daaf =_feg .Transform (_cgbc ,_daaf );return _cd .NewPoint (_cgbc ,_daaf );};func (_agb *shapesState )fill (_bfdd *[]*subpath ){*_bfdd =append (*_bfdd ,_agb ._eff ...);if _cfg {_ca .Log .Info ("\u0046\u0049L\u004c\u003a\u0020\u0025\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006e\u0065\u0077\u0029\u0020\u0073s=\u0025\u0073",len (*_bfdd ),len (_agb ._eff ),_agb );for _bbce ,_agde :=range _agb ._eff {_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bbce ,_agde );if _bbce ==10{break ;};};};};type ruling struct{_bfeeb rulingKind ;_cgbb float64 ;_gcad float64 ;_fdcd float64 ;};func _dede (_gefea int ,_fcgf func (int ,int )bool )[]int {_affa :=make ([]int ,_gefea );for _edeg :=range _affa {_affa [_edeg ]=_edeg ;};_db .Slice (_affa ,func (_edeb ,_agcga int )bool {return _fcgf (_affa [_edeb ],_affa [_agcga ])});return _affa ;};func (_bfde rulingList )removeDuplicates ()rulingList {if len (_bfde )==0{return nil ;};_bfde .sort ();_effd :=rulingList {_bfde [0]};for _ ,_dbcf :=range _bfde [1:]{if _dbcf .equals (_effd [len (_effd )-1]){continue ;};_effd =append (_effd ,_dbcf );};return _effd ;};const (_cfcd =1.0e-6;_efcd =1.0e-4;_ffb =10;_gebd =6;_fagg =0.5;_abea =0.11;_fecga =0.19;_ddcd =0.04;_dada =0.04;_gdfg =1.0;_ceab =0.04;_eeec =0.4;_feea =0.7;_cdbd =1.0;_deaf =0.1;_bfddb =1.4;_adagd =0.46;_ffg =0.02;_deg =0.2;_bfbf =0.5;_bgbg =4;_cdee =4.0;_bfacd 
=6;_gbff =0.01;_fbeeb =0.02;_dfdf =2;_dgea =2;_gbaa =500;_ffbf =10.0;_gegd =0.05;_ebea =0.3;_dfgg =1.0;_eada =1.0;);
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a string describing the current state of the textState stack.
|
|
|
|
|
// String returns a multi-line dump of the stack: a header with the number of
// entries followed by one indexed line per state (nil entries are shown as a
// placeholder).
func (stack *stateStack) String() string {
	lines := make([]string, len(*stack)+1)
	lines[0] = _aae.Sprintf("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064", len(*stack))
	for i, state := range *stack {
		desc := "\u003c\u006e\u0069l\u003e"
		if state != nil {
			desc = state.String()
		}
		lines[i+1] = _aae.Sprintf("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073", i, desc)
	}
	return _ecb.Join(lines, "\u000a")
}
|
2020-09-21 01:20:10 +00:00
|
|
|
|
|
|
|
|
|
// String returns a description of `l`.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
func (_gegdc *textLine )String ()string {return _aae .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_gegdc ._eagf ,_gegdc .PdfRectangle ,_gegdc ._edef ,_gegdc .text ());};func (_dagae *textLine )endsInHyphen ()bool {_agg :=_dagae ._decag [len (_dagae ._decag )-1];_fefa :=[]rune (_agg ._gbcg );if !_c .Is (_c .Hyphen ,_fefa [len (_fefa )-1]){return false ;};if _agg ._ebbgb &&_fdfa (_fefa ){return true ;};return _fdfa ([]rune (_dagae .text ()));};type shapesState struct{_daec _cd .Matrix ;_gcfa _cd .Matrix ;_eff []*subpath ;_ccd bool ;_gadf _cd .Point ;};func (_edcf paraList )writeText (_gbac _df .Writer ){for _ddef ,_dbbf :=range _edcf {_dbbf .writeText (_gbac );if _ddef !=len (_edcf )-1{if _cfgf (_dbbf ,_edcf [_ddef +1]){_gbac .Write ([]byte ("\u0020"));}else {_gbac .Write ([]byte ("\u000a"));_gbac .Write ([]byte ("\u000a"));};};};_gbac .Write ([]byte ("\u000a"));_gbac .Write ([]byte ("\u000a"));};func (_egece *textWord )toTextMarks (_cfgaa *int )[]TextMark {var _eegc []TextMark ;for _ ,_egega :=range _egece ._eeda {_eegc =_acdfe (_eegc ,_cfgaa ,_egega .ToTextMark ());};return _eegc ;};func (_gecd *textTable )getRight ()paraList {_fcdaf :=make (paraList ,_gecd ._dbff );for _dcde :=0;_dcde < _gecd ._dbff ;_dcde ++{_eegae :=_gecd .get (_gecd ._acgb -1,_dcde )._afga ;if _eegae ==nil ||_eegae ._ddadc {return nil ;};_fcdaf [_dcde ]=_eegae ;};for _eebcc :=0;_eebcc < _gecd ._dbff -1;_eebcc ++{if _fcdaf [_eebcc ]._gbbgf !=_fcdaf [_eebcc +1]{return nil ;};};return _fcdaf ;};func (_bfce *textMark )inDiacriticArea (_decb *textMark )bool {_dfea :=_bfce .Llx -_decb .Llx ;_bfbc :=_bfce .Urx -_decb .Urx ;_fga :=_bfce .Lly -_decb .Lly ;return _ed .Abs (_dfea +_bfbc )< _bfce .Width ()*_bfbf &&_ed .Abs (_fga )< _bfce .Height ()*_bfbf ;};func (_aaf *textObject )moveLP (_feac ,_dgg float64 ){_aaf ._beab .Concat (_cd .NewMatrix (1,0,0,1,_feac ,_dgg ));_aaf ._gdg 
=_aaf ._beab ;};func (_fbbab *textTable )toTextTable ()TextTable {if _bege {_ca .Log .Info ("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064",_fbbab ._acgb ,_fbbab ._dbff );};_gbgaa :=make ([][]TableCell ,_fbbab ._dbff );for _ebegg :=0;_ebegg < _fbbab ._dbff ;_ebegg ++{_gbgaa [_ebegg ]=make ([]TableCell ,_fbbab ._acgb );for _babc :=0;_babc < _fbbab ._acgb ;_babc ++{_gbagd :=_fbbab .get (_babc ,_ebegg );if _bege {_aae .Printf ("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_babc ,_ebegg ,_gbagd );};if _gbagd ==nil {continue ;};_gbgaa [_ebegg ][_babc ].Text =_gbagd .text ();_gccdf :=0;_gbgaa [_ebegg ][_babc ].Marks ._ggd =_gbagd .toTextMarks (&_gccdf );};};return TextTable {W :_fbbab ._acgb ,H :_fbbab ._dbff ,Cells :_gbgaa };};func _ddf (_bgfg ,_aada _ag .PdfRectangle )(_ag .PdfRectangle ,bool ){if !_ccgf (_bgfg ,_aada ){return _ag .PdfRectangle {},false ;};return _ag .PdfRectangle {Llx :_ed .Max (_bgfg .Llx ,_aada .Llx ),Urx :_ed .Min (_bgfg .Urx ,_aada .Urx ),Lly :_ed .Max (_bgfg .Lly ,_aada .Lly ),Ury :_ed .Min (_bgfg .Ury ,_aada .Ury )},true ;};func _aecb (_fabbc string ,_fgaf int )string {if len (_fabbc )< _fgaf {return _fabbc ;};return _fabbc [:_fgaf ];};func _ebec (_gag *textWord ,_deag float64 )*wordBag {_gcgb :=_efa (_gag ._fgfab );_cga :=[]*textWord {_gag };_gebbc :=wordBag {_aabed :map[int ][]*textWord {_gcgb :_cga },PdfRectangle :_gag .PdfRectangle ,_fdg :_gag ._bbgd ,_bbcee :_deag };return &_gebbc ;};func (_aegac *wordBag )minDepth ()float64 {return _aegac ._bbcee -(_aegac .Ury -_aegac ._fdg )};func (_fcdf paraList )inRect (_aabg _ag .PdfRectangle )*textPara {var _efdbd paraList ;for _ ,_abf :=range _fcdf {_aced :=_abf .PdfRectangle ;_aced .Lly +=_eada ;_aced .Ury -=_eada ;if _cdcg (_aabg ,_aced ){_efdbd =append (_efdbd ,_abf );};};if len (_efdbd )!=1{return nil ;};return _efdbd [0];};func _dgbbc (_dfeac []TextMark ,_efgd *int )[]TextMark {_bacd :=_dfeac [len (_dfeac )-1];_ggfe 
:=[]rune (_bacd .Text );if len (_ggfe )==1{_dfeac =_dfeac [:len (_dfeac )-1];_gcdc :=_dfeac [len (_dfea
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
|
|
|
|
// All coordinates are in device coordinates.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
type ImageMark struct{Image *_ag .Image ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Dimensions of the image as displayed in the PDF.
|
|
|
|
|
Width float64 ;Height float64 ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Position of the image in PDF coordinates (lower left corner).
|
|
|
|
|
X float64 ;Y float64 ;
|
|
|
|
|
|
|
|
|
|
// Angle in degrees, if rotated.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
Angle float64 ;};func (_fadf rulingList )comp (_bfef ,_dgae int )bool {_ddcab ,_befb :=_fadf [_bfef ],_fadf [_dgae ];_faede ,_bbaf :=_ddcab ._bfeeb ,_befb ._bfeeb ;if _faede !=_bbaf {return _faede > _bbaf ;};if _faede ==_aebf {return false ;};_dbdg :=func (_ddfc bool )bool {if _faede ==_gbef {return _ddfc ;};return !_ddfc ;};_edbc ,_ddedd :=_ddcab ._cgbb ,_befb ._cgbb ;if _edbc !=_ddedd {return _dbdg (_edbc > _ddedd );};_edbc ,_ddedd =_ddcab ._gcad ,_befb ._gcad ;if _edbc !=_ddedd {return _dbdg (_edbc < _ddedd );};return _dbdg (_ddcab ._fdcd < _befb ._fdcd );};func (_ebbf *textPara )bbox ()_ag .PdfRectangle {return _ebbf .PdfRectangle };func _eaaa (_dcb _cd .Matrix )_cd .Point {_dadb ,_eeb :=_dcb .Translation ();return _cd .Point {X :_dadb ,Y :_eeb };};func _acdfe (_gggc []TextMark ,_abcg *int ,_ecegc TextMark )[]TextMark {_ecegc .Offset =*_abcg ;_gggc =append (_gggc ,_ecegc );*_abcg +=len (_ecegc .Text );return _gggc ;};func (_ceb *textObject )setCharSpacing (_dda float64 ){if _ceb ==nil {return ;};_ceb ._gdb ._bcf =_dda ;if _fgff {_ca .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_dda ,_ceb ._gdb .String ());};};func _degc (_abag ,_ggbda int )int {if _abag < _ggbda {return _abag ;};return _ggbda ;};
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// ImageExtractOptions contains options for controlling image extraction from
|
|
|
|
|
// PDF pages.
|
|
|
|
|
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func (_bdbc *textWord )computeText ()string {_cgbg :=make ([]string ,len (_bdbc ._eeda ));for _dbdbe ,_dfde :=range _bdbc ._eeda {_cgbg [_dbdbe ]=_dfde ._ddbg ;};return _ecb .Join (_cgbg ,"");};
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Tables returns the tables extracted from the page.
|
|
|
|
|
func (_aee PageText )Tables ()[]TextTable {return _aee ._feaf };func (_fffc rulingList )aligned ()bool {if len (_fffc )< 2{return false ;};_ggbf :=_fffc [0];for _ ,_cgaaa :=range _fffc [1:]{if !(_fafdc (_cgaaa ._gcad -_ggbf ._gcad )&&_fafdc (_cgaaa ._fdcd -_ggbf ._fdcd )){return false ;};};return true ;};func (_ccffc paraList )findGridTables (_fdfg []rulingList )[]*textTable {if _bege {_ca .Log .Info ("\u0066\u0069\u006e\u0064T\u0061\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072i\u0064s\u003a\u0020\u0025\u0064\u0020\u0070\u0061r\u0061\u0073",len (_ccffc ));for _bfage ,_agac :=range _ccffc {_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bfage ,_agac );};};var _eegge []*textTable ;for _efdb ,_fdfe :=range _fdfg {_aec :=_ccffc .findTableGrid (_fdfe );if _aec !=nil {_aec .log (_aae .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_efdb ));_eegge =append (_eegge ,_aec );_aec .markCells ();};};return _eegge ;};func _fffcd (_fbbabg []*subpath ){if _efcd < 0.0{return ;};for _gecbf ,_daee :=range _fbbabg {for _fdad ,_acec :=range _daee ._aacge {_daee ._aacge [_fdad ]=_cd .Point {X :_efdc (_acec .X ),Y :_efdc (_acec .Y )};if _cfg {_eaaad :=_daee ._aacge [_fdad ];if !_gdad (_acec ,_eaaad ){_efaba :=_cd .Point {X :_eaaad .X -_acec .X ,Y :_eaaad .Y -_acec .Y };_ca .Log .Info ("\u0020\u0025\u0064\u0020-\u0020\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066\u0020→\u0020%\u002e\u0032\u0066\u0020\u0028\u0025\u0067)",_gecbf ,_fdad ,_acec ,_eaaad ,_efaba );};};};};};func (_fgca *textLine )pullWord (_fgge *wordBag ,_gaeg *textWord ,_accb int ){_fgca .appendWord (_gaeg );_fgge .removeWord (_gaeg ,_accb );};func _agae (_febg ,_dedg float64 )bool {return _febg /_ed .Max (1.0,_dedg )< _gegd };type rulingKind int ;func _efa (_egbf float64 )int {var _ggg int ;if _egbf >=0{_ggg =int (_egbf /_gebd );}else {_ggg =int (_egbf /_gebd )-1;};return _ggg ;};func _edfg (_ceed []*textMark ,_agdd 
_ag .PdfRectangle ,_dbfe []rulingList )paraList {_ca .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_ceed ),_agdd );if len (_ceed )==0{return nil ;};_geed :=_bfbfc (_ceed ,_agdd );if len (_geed )==0{return nil ;};_accd :=_dead (_geed ,_agdd .Ury );_faeb :=_fdbf (_accd ,_agdd .Ury );_faeb =_cbeb (_faeb );_cdbbd :=make (paraList ,0,len (_faeb ));for _ ,_fdac :=range _faeb {_adbb :=_fdac .arrangeText ();if _adbb !=nil {_cdbbd =append (_cdbbd ,_adbb );};};if len (_cdbbd )>=_bfacd {_cdbbd =_cdbbd .extractTables (_dbfe );};_cdbbd .sortReadingOrder ();_cdbbd .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _cdbbd ;};var (_cfaa =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};);func (_eaab rulingList )sortStrict (){_db .Slice (_eaab ,func (_fbgf ,_fbba int )bool {_ccge ,_eec :=_eaab [_fbgf ],_eaab [_fbba ];_eabf ,_bdce :=_ccge ._bfeeb ,_eec ._bfeeb ;if _eabf !=_bdce {return _eabf > _bdce ;};_geee ,_ggae :=_ccge ._cgbb ,_eec ._cgbb ;if _geee !=_ggae {return _geee < _ggae ;};_geee ,_ggae =_ccge ._gcad ,_eec ._gcad ;if _geee !=_ggae {return _geee < _ggae ;};return _ccge ._fdcd < _eec ._fdcd 
;});};func (_cfeab lineRuling )yMean ()float64 {return 0.5*(_cfeab ._dfaca .Y +_cfeab ._afcc .Y
|
|
|
|
|
|
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
func (_dffg *textMark )ToTextMark ()TextMark {return TextMark {Text :_dffg ._ddbg ,Original :_dffg ._bada ,BBox :_dffg ._cbff ,Font :_dffg ._aaged ,FontSize :_dffg ._cbbg ,FillColor :_dffg ._cae ,StrokeColor :_dffg ._gdcc };};const _bace =10;func (_ffecb *textTable )get (_ccfd ,_cdad int )*textPara {return _ffecb ._eagd [_bfacc (_ccfd ,_cdad )]};func (_dag *imageExtractContext )extractFormImages (_dcc *_f .PdfObjectName ,_fd _be .GraphicsState ,_dee *_ag .PdfPageResources )error {_bae ,_beb :=_dee .GetXObjectFormByName (*_dcc );if _beb !=nil {return _beb ;};if _bae ==nil {return nil ;};_cad ,_beb :=_bae .GetContentStream ();if _beb !=nil {return _beb ;};_dfb :=_bae .Resources ;if _dfb ==nil {_dfb =_dee ;};_beb =_dag .extractContentStreamImages (string (_cad ),_dfb );if _beb !=nil {return _beb ;};_dag ._bg ++;return nil ;};
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// TableCell is a cell in a TextTable.
|
|
|
|
|
type TableCell struct{
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// Marks holds the TextMarks corresponding to the text in Text.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
Marks TextMarkArray ;};type intSet map[int ]struct{};func _bdac (_afac []int )[]int {_ace :=make ([]int ,len (_afac ));for _cgaf ,_egba :=range _afac {_ace [len (_afac )-1-_cgaf ]=_egba ;};return _ace ;};func (_eeggd *shapesState )establishSubpath ()*subpath {_agfb ,_edec :=_eeggd .lastpointEstablished ();if !_edec {_eeggd ._eff =append (_eeggd ._eff ,_feb (_agfb ));};if len (_eeggd ._eff )==0{return nil ;};_eeggd ._ccd =false ;return _eeggd ._eff [len (_eeggd ._eff )-1];};func (_cfffc *textTable )newTablePara ()*textPara {_ddde :=_cfffc .computeBbox ();return &textPara {PdfRectangle :_ddde ,_bbba :_ddde ,_cgaa :_cfffc };};func (_ce *imageExtractContext )extractXObjectImage (_fac *_f .PdfObjectName ,_af _be .GraphicsState ,_dg *_ag .PdfPageResources )error {_fbdf ,_ :=_dg .GetXObjectByName (*_fac );if _fbdf ==nil {return nil ;};_ecg ,_eca :=_ce ._bb [_fbdf ];if !_eca {_dbg ,_bfd :=_dg .GetXObjectImageByName (*_fac );if _bfd !=nil {return _bfd ;};if _dbg ==nil {return nil ;};_cfb ,_bfd :=_dbg .ToImage ();if _bfd !=nil {return _bfd ;};_ecg =&cachedImage {_fc :_cfb ,_ea :_dbg .ColorSpace };_ce ._bb [_fbdf ]=_ecg ;};_ac :=_ecg ._fc ;_ge :=_ecg ._ea ;_fba ,_bfdg :=_ge .ImageToRGB (*_ac );if _bfdg !=nil {return _bfdg ;};_ca .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_af .CTM .String ());_afa :=ImageMark {Image :&_fba ,Width :_af .CTM .ScalingFactorX (),Height :_af .CTM .ScalingFactorY (),Angle :_af .CTM .Angle ()};_afa .X ,_afa .Y =_af .CTM .Translation ();_ce ._eb =append (_ce ._eb ,_afa );_ce ._fed ++;return nil ;};func _fdfa (_aage []rune )bool {return len (_aage )>=_bgbg &&_c .Is (_c .Hyphen ,_aage [len (_aage )-1])&&!_c .IsSpace (_aage [len (_aage )-2]);};func (_dbdb *Extractor )extractPageText (_agc string ,_bfgc *_ag .PdfPageResources ,_dcd _cd .Matrix ,_gc int )(*PageText ,int ,int ,error ){_ca .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a 
\u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_gc );_bde :=&PageText {_bdc :_dbdb ._bea };_cc :=_facg (_dbdb ._bea );var _gce stateStack ;_dea :=_bdd (_dbdb ,_bfgc ,_be .GraphicsState {},&_cc ,&_gce );_gcd :=shapesState {_gcfa :_dcd };var _dad bool ;if _gc > _ggf {_acc :=_d .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_ca .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_gc ,_acc );return _bde ,_cc ._geb ,_cc ._dge ,_acc ;};_deed :=_be .NewContentStreamParser (_agc );_ead ,_dcdg :=_deed .Parse ();if _dcdg !=nil {_ca .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dcdg );return _bde ,_cc ._geb ,_cc ._dge ,_dcdg ;};_eee :=_be .NewContentStreamProcessor (*_ead );_eee .AddHandler (_be .HandlerConditionEnumAllOperands ,"",func (_gfcf *_be .ContentStreamOperation ,_afd _be .GraphicsState ,_aaeg *_ag .PdfPageResources )error {_cg :=_gfcf .Operand ;if _dface {_ca .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_gfcf );};switch _cg {case "\u0071":_gce .push (&_cc );case "\u0051":if !_gce .empty (){_cc =*_gce .top ();if len (_gce )>=2{_gce .pop ();};};_gcd ._daec =_afd .CTM ;case "\u0042\u0054":if _dad {_ca .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");_bde ._gef =append (_bde ._gef ,_dea ._bad ...);};_dad =true ;_gac :=_afd ;_gac .CTM =_dcd .Mult (_gac .CTM );_dea =_bdd (_dbdb ,_aaeg ,_gac ,&_cc ,&_gce 
);case "\u0045\u0054":if !_dad {_ca .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
func (_def TextMark )String ()string {_fecg :=_def .BBox ;var _gdgd string ;if _def .Font !=nil {_gdgd =_def .Font .String ();if len (_gdgd )> 50{_gdgd =_gdgd [:50]+"\u002e\u002e\u002e";};};var _dafg string ;if _def .Meta {_dafg ="\u0020\u002a\u004d\u002a";};return _aae .Sprintf ("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",_def .Offset ,_def .Text ,[]rune (_def .Text ),_fecg .Llx ,_fecg .Lly ,_fecg .Urx ,_fecg .Ury ,_gdgd ,_dafg );};func (_dfbe rulingList )coalesce ()rulingList {if len (_dfbe )==0{return nil ;};_dfbe .sortStrict ();_gaag :=_dfbe [0];var _ddcb rulingList ;for _ ,_agcg :=range _dfbe [1:]{_caecf :=_gaag ._bfeeb ==_agcg ._bfeeb &&_gaag ._cgbb ==_agcg ._cgbb &&_agcg ._gcad <=_gaag ._fdcd +1.0;if _caecf {_ebcbc :=*_gaag ;_gaag ._fdcd =_agcg ._fdcd ;if _gaag ._fdcd < _gaag ._gcad {_ca .Log .Error ("\u0076\u0030\u002ehi\u0020\u003c\u0020\u0076\u0030\u002e\u006c\u006f\u000a\t\u00760\u003d%\u0073\n\u0009\u0020\u0076\u003d\u0025\u0073\u000a\u0009\u0020\u002d\u003e\u0025\u0073",_ebcbc .String (),_agcg .String (),_gaag .String ());return nil ;};}else {_ddcb =append (_ddcb ,_gaag );_gaag =_agcg ;};};_ddcb =append (_ddcb ,_gaag );return _ddcb ;};func (_ebaf *ruling )intersects (_afec *ruling )bool {_ddca :=(_ebaf ._bfeeb ==_fdaf &&_afec ._bfeeb ==_gbef )||(_afec ._bfeeb ==_fdaf &&_ebaf ._bfeeb ==_gbef );_aaac :=func (_dcbg ,_badf *ruling )bool {return _dcbg ._gcad <=_badf ._cgbb +_ebea &&_badf ._cgbb -_ebea <=_dcbg ._fdcd ;};_fdff :=_aaac (_ebaf ,_afec );_agbd :=_aaac (_afec ,_ebaf );if _cfg {_ca .Log .Info 
("\u0069\u006e\u0074\u0065r\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006ft\u0068\u006f\u0067\u006f\u006e\u0061\u006c\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u002d>\u0020\u0025\u0074\u000a\u0009\u0020\u0076\u003d\u0025\u0073\u000a\u0009\u0076\u0032\u003d\u0025\u0073",_ddca ,_fdff ,_agbd ,_ddca &&_fdff &&_agbd ,_ebaf ,_afec );};return _ddca &&_fdff &&_agbd ;};func (_cedd paraList )llyRange (_gbbc []int ,_feecf ,_bgbd float64 )[]int {_dgafb :=len (_cedd );if _bgbd < _cedd [_gbbc [0]].Lly ||_feecf > _cedd [_gbbc [_dgafb -1]].Lly {return nil ;};_ccdf :=_db .Search (_dgafb ,func (_dfbf int )bool {return _cedd [_gbbc [_dfbf ]].Lly >=_feecf });_bgfdf :=_db .Search (_dgafb ,func (_abaa int )bool {return _cedd [_gbbc [_abaa ]].Lly > _bgbd });return _gbbc [_ccdf :_bgfdf ];};
|
2020-08-31 21:12:07 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
|
func New (page *_ag .PdfPage )(*Extractor ,error ){_ee ,_g :=page .GetAllContentStreams ();if _g !=nil {return nil ,_g ;};_bef ,_g :=page .GetMediaBox ();if _g !=nil {return nil ,_aae .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_g );};_ad :=&Extractor {_ece :_ee ,_cde :page .Resources ,_bea :*_bef ,_fb :map[string ]fontEntry {},_fe :map[string ]textResult {}};if _ad ._bea .Llx > _ad ._bea .Urx {_ca .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_ad ._bea );_ad ._bea .Llx ,_ad ._bea .Urx =_ad ._bea .Urx ,_ad ._bea .Llx ;};if _ad ._bea .Lly > _ad ._bea .Ury {_ca .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_ad ._bea );_ad ._bea .Lly ,_ad ._bea .Ury =_ad ._bea .Ury ,_ad ._bea .Lly ;};return _ad ,nil ;};func (_feda rectRuling )asRuling ()(*ruling ,bool ){_bfdab :=ruling {_bfeeb :_feda ._cdea };switch _feda ._cdea {case _fdaf :_bfdab ._cgbb =0.5*(_feda .Llx +_feda .Urx );_bfdab ._gcad =_feda .Lly ;_bfdab ._fdcd =_feda .Ury ;case _gbef :_bfdab ._cgbb =0.5*(_feda .Lly +_feda .Ury );_bfdab ._gcad =_feda .Llx ;_bfdab ._fdcd =_feda .Urx ;default:_ca .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_feda ._cdea );return nil ,false ;};return &_bfdab ,true ;};func (_cgbbe *textTable )isExportable ()bool {_bcdd :=func (_dcdfe int )bool {_cdfe :=_cgbbe .get (0,_dcdfe );_geaf :=_cdfe .text ();_fdga 
:=_a .RuneCountInString (_geaf );_gedb :=_bfbed .MatchString (_geaf );return _fdga <=1||_gedb ;};for _gcgg :=0;_gcgg < _cgbbe ._dbff ;_gcgg ++{if !_bcdd (_gcgg ){return true ;};};return false ;};func (_afcg paraList )readBefore (_gbge []int ,_begb ,_eddee int )bool {_dfeb ,_abdbc :=_afcg [_begb ],_afcg [_eddee ];if _eebd (_dfeb ,_abdbc )&&_dfeb .Lly > _abdbc .Lly {return true ;};if !(_dfeb ._bbba .Urx < _abdbc ._bbba .Llx ){return false ;};_cfbb ,_ecff :=_dfeb .Lly ,_abdbc .Lly ;if _cfbb > _ecff {_ecff ,_cfbb =_cfbb ,_ecff ;};_eafc :=_ed .Max (_dfeb ._bbba .Llx ,_abdbc ._bbba .Llx );_dgaf :=_ed .Min (_dfeb ._bbba .Urx ,_abdbc ._bbba .Urx );_gcgd :=_afcg .llyRange (_gbge ,_cfbb ,_ecff );for _ ,_abeb :=range _gcgd {if _abeb ==_begb ||_abeb ==_eddee {continue ;};_dbge :=_afcg [_abeb ];if _dbge ._bbba .Llx <=_dgaf &&_eafc <=_dbge ._bbba .Urx {return false ;};};return true ;};func (_aagf paraList )tables ()[]TextTable {var _ccbfg []TextTable ;for _ ,_gga :=range _aagf {_dacb :=_gga ._cgaa ;if _dacb !=nil &&_dacb .isExportable (){_ccbfg =append (_ccbfg ,_dacb .toTextTable ());};};return _ccbfg ;};
|
2020-08-31 21:12:07 +00:00
|
|
|
|
|
2020-09-21 01:20:10 +00:00
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
|
|
|
|
|
// Each time ApplyArea is called, it updates the result set in `pt`.
|
|
|
|
|
// Can be called multiple times in a row with different bounding boxes.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
func (_dcaa *PageText )ApplyArea (bbox _ag .PdfRectangle ){_gccd :=make ([]*textMark ,0,len (_dcaa ._gef ));for _ ,_geae :=range _dcaa ._gef {if _ccgf (_geae .bbox (),bbox ){_gccd =append (_gccd ,_geae );};};var _cfeg paraList ;_bcba :=len (_gccd );for _ega :=0;_ega < 360&&_bcba > 0;_ega +=90{_accc :=make ([]*textMark ,0,len (_gccd )-_bcba );for _ ,_ccff :=range _gccd {if _ccff ._bgga ==_ega {_accc =append (_accc ,_ccff );};};if len (_accc )> 0{_faed :=_edfg (_accc ,_dcaa ._bdc ,nil );_cfeg =append (_cfeg ,_faed ...);_bcba -=len (_accc );};};_bgac :=new (_dfc .Buffer );_cfeg .writeText (_bgac );_dcaa ._bfbe =_bgac .String ();_dcaa ._ebd =_cfeg .toTextMarks ();_dcaa ._feaf =_cfeg .tables ();};func (_ggbd *textPara )toTextMarks (_cced *int )[]TextMark {if _ggbd ._cgaa ==nil {return _ggbd .toCellTextMarks (_cced );};var _egfg []TextMark ;for _bgbb :=0;_bgbb < _ggbd ._cgaa ._dbff ;_bgbb ++{for _aagef :=0;_aagef < _ggbd ._cgaa ._acgb ;_aagef ++{_adfde :=_ggbd ._cgaa .get (_aagef ,_bgbb );if _adfde ==nil {_egfg =_eegf (_egfg ,_cced ,"\u0009");}else {_ecddg :=_adfde .toCellTextMarks (_cced );_egfg =append (_egfg ,_ecddg ...);};_egfg =_eegf (_egfg ,_cced ,"\u0020");};if _bgbb < _ggbd ._cgaa ._dbff -1{_egfg =_eegf (_egfg ,_cced ,"\u000a");};};return _egfg ;};func _ebg (_gafe *wordBag ,_bcaba *textWord ,_aabb float64 )bool {return _bcaba .Llx < _gafe .Urx +_aabb &&_gafe .Llx -_aabb < _bcaba .Urx ;};func (_aagc *wordBag )firstReadingIndex (_bbcgb int )int {_abbg :=_aagc .firstWord (_bbcgb )._bbgd ;_fceg :=float64 (_bbcgb +1)*_gebd ;_fbee :=_fceg +_cdee *_abbg ;_baaf :=_bbcgb ;for _ ,_fbgd :=range _aagc .depthBand (_fceg ,_fbee ){if _ccc (_aagc .firstWord (_fbgd ),_aagc .firstWord (_baaf ))< 0{_baaf =_fbgd ;};};return _baaf ;};func _bdd (_cec *Extractor ,_beec *_ag .PdfPageResources ,_fff _be .GraphicsState ,_dbgc *textState ,_bged *stateStack )*textObject {return &textObject {_ccf :_cec ,_ecgg :_beec ,_deef :_fff ,_egd :_bged ,_gdb :_dbgc ,_gdg :_cd .IdentityMatrix (),_beab 
:_cd .IdentityMatrix ()};};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-14 09:32:45 +00:00
|
|
|
|
// TextTable represents a table.
|
|
|
|
|
// Cells are ordered top-to-bottom, left-to-right.
|
|
|
|
|
// Cells[y] is the (0-offset) y'th row in the table.
|
|
|
|
|
// Cells[y][x] is the (0-offset) x'th column in the table.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
type TextTable struct{W ,H int ;Cells [][]TableCell ;};func (_aacab *textTable )put (_defaf ,_acef int ,_fdfcg *textPara ){_aacab ._eagd [_bfacc (_defaf ,_acef )]=_fdfcg ;};type textMark struct{_ag .PdfRectangle ;_bgga int ;_ddbg string ;_bada string ;_aaged *_ag .PdfFont ;_cbbg float64 ;_dfbd float64 ;_ggcba _cd .Matrix ;_bgef _cd .Point ;_cbff _ag .PdfRectangle ;_cae _aa .Color ;_gdcc _aa .Color ;};func (_gceb *textObject )setTextRenderMode (_ggfa int ){if _gceb ==nil {return ;};_gceb ._gdb ._ffe =RenderMode (_ggfa );};func (_dcgda *textWord )bbox ()_ag .PdfRectangle {return _dcgda .PdfRectangle };
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
func (_ddd *TextMarkArray )Len ()int {if _ddd ==nil {return 0;};return len (_ddd ._ggd );};func (_geg *textObject )setHorizScaling (_dcaf float64 ){if _geg ==nil {return ;};_geg ._gdb ._cca =_dcaf ;};func (_bdb *wordBag )highestWord (_fegc int ,_ggcf ,_bcac float64 )*textWord {for _ ,_deaa :=range _bdb ._aabed [_fegc ]{if _ggcf <=_deaa ._fgfab &&_deaa ._fgfab <=_bcac {return _deaa ;};};return nil ;};func _bfaca (_eced ,_dab bounded )float64 {return _bcgd (_eced )-_bcgd (_dab )};
|
2020-09-07 00:23:12 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
|
|
|
|
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
|
|
|
|
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
|
|
|
|
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
|
|
|
|
// The named values of RenderMode (RenderModeStroke, RenderModeFill, RenderModeClip) are declared
// elsewhere in this file with 1<<iota, so each of them occupies its own bit.
type RenderMode int ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct{
|
|
|
|
|
|
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
|
|
|
|
|
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// BBox is the bounding box of the text.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
BBox _ag .PdfRectangle ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Font is the font the text was drawn with.
|
2020-09-28 23:18:17 +00:00
|
|
|
|
Font *_ag .PdfFont ;
|
2018-12-27 20:51:34 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64 ;
|
2018-11-28 18:06:03 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int ;
|
|
|
|
|
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool ;
|
|
|
|
|
|
|
|
|
|
// FillColor is the fill color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-09-28 23:18:17 +00:00
|
|
|
|
FillColor _aa .Color ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// StrokeColor is the stroke color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-09-28 23:18:17 +00:00
|
|
|
|
StrokeColor _aa .Color ;};func (_efbd *textObject )setTextMatrix (_feee []float64 ){if len (_feee )!=6{_ca .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_feee ));return ;};_gaf ,_ggc ,_bfb ,_aaea ,_aaca ,_adag :=_feee [0],_feee [1],_feee [2],_feee [3],_feee [4],_feee [5];_efbd ._gdg =_cd .NewMatrix (_gaf ,_ggc ,_bfb ,_aaea ,_aaca ,_adag );_efbd ._beab =_efbd ._gdg ;};func (_eecd paraList )findTextTables ()[]*textTable {var _gfe []*textTable ;for _ ,_babf :=range _eecd {if _babf .taken ()||_babf .Width ()==0{continue ;};_fgffbe :=_babf .isAtom ();if _fgffbe ==nil {continue ;};_fgffbe .growTable ();if _fgffbe ._acgb *_fgffbe ._dbff < _bfacd {continue ;};_fgffbe .markCells ();_fgffbe .log ("\u0067\u0072\u006fw\u006e");_gfe =append (_gfe ,_fgffbe );};return _gfe ;};func (_eef lineRuling )xDelta ()float64 {return _ed .Abs (_eef ._afcc .X -_eef ._afcc .X )};func (_ceaea *textTable )computeBbox ()_ag .PdfRectangle {_bfcd :=_ceaea .get (0,0).PdfRectangle ;for _gbgc :=1;_gbgc < _ceaea ._acgb ;_gbgc ++{_bfcd =_eegb (_bfcd ,_ceaea .get (_gbgc ,0).PdfRectangle );};for _cgec :=1;_cgec < _ceaea ._dbff ;_cgec ++{for _aadd :=0;_aadd < _ceaea ._acgb ;_aadd ++{_ebbgg :=_ceaea .get (_aadd ,_cgec );if _ebbgg !=nil {_bfcd =_eegb (_bfcd ,_ebbgg .PdfRectangle );};};};return _bfcd ;};func _bfbfc (_gefa []*textMark ,_dbfbg _ag .PdfRectangle )[]*textWord {var _aefb []*textWord ;var _acba *textWord ;_cabee :=func (){if _acba !=nil {_gdgg :=_acba .computeText ();if !_ecdca (_gdgg ){_acba ._gbcg =_gdgg ;_aefb =append (_aefb ,_acba );if _dbb {_ca .Log .Info ("\u0077o\u0072\u0064\u003d\u0025\u0073",_acba .String ());for _cefge ,_acbc :=range _acba ._eeda {_aae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cefge ,_acbc .String ());};};};_acba =nil ;};};for _ ,_egdf :=range _gefa {if _gbg &&_acba !=nil &&len (_acba ._eeda )> 0{_gbba :=_acba ._eeda [len (_acba ._eeda )-1];_eaabc 
,_gbffa :=_cdggc (_egdf ._ddbg );_gfda ,_gddb :=_cdggc (_gbba ._ddbg );if _gbffa &&!_gddb &&_gbba .inDiacriticArea (_egdf ){_acba .addDiacritic (_eaabc );continue ;};if _gddb &&!_gbffa &&_egdf .inDiacriticArea (_gbba ){_acba ._eeda =_acba ._eeda [:len (_acba ._eeda )-1];_acba .appendMark (_egdf ,_dbfbg );_acba .addDiacritic (_gfda );continue ;};};_fgab :=_ecdca (_egdf ._ddbg );if _fgab {_cabee ();continue ;};if _acba ==nil &&!_fgab {_acba =_gffed ([]*textMark {_egdf },_dbfbg );continue ;};_gdbg :=_acba ._bbgd ;_cgfd :=_ed .Abs (_eged (_dbfbg ,_egdf )-_acba ._fgfab )/_gdbg ;_bccg :=_beda (_egdf ,_acba )/_gdbg ;if _bccg >=_abea ||!(-_fecga <=_bccg &&_cgfd <=_ddcd ){_cabee ();_acba =_gffed ([]*textMark {_egdf },_dbfbg );continue ;};_acba .appendMark (_egdf ,_dbfbg );};_cabee ();return _aefb ;};func (_eae *textObject )moveText (_ecec ,_ceae float64 ){_eae .moveLP (_ecec ,_ceae )};func (_eggbb intSet )has (_adgg int )bool {_ ,_gced :=_eggbb [_adgg ];return _gced };
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `t`.
|
|
|
|
|
func (_fead *textTable )String ()string {return _aae .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_fead ._acgb ,_fead ._dbff ,_fead ._gdcfd );};func (_dafaa paraList )llyOrdering ()[]int {_efcb :=make ([]int ,len (_dafaa ));for _cccf :=range _dafaa {_efcb [_cccf ]=_cccf ;};_db .SliceStable (_efcb ,func (_bcfd ,_bgfd int )bool {_aabec ,_agaf :=_efcb [_bcfd ],_efcb [_bgfd ];return _dafaa [_aabec ].Lly < _dafaa [_agaf ].Lly ;});return _efcb ;};func _ggaee (_dbcb []_f .PdfObject )(_bgea ,_efbdb float64 ,_gbeg error ){if len (_dbcb )!=2{return 0,0,_aae .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_dbcb ));};_efde ,_gbeg :=_f .GetNumbersAsFloat (_dbcb );if _gbeg !=nil {return 0,0,_gbeg ;};return _efde [0],_efde [1],nil ;};
|
2020-09-14 09:32:45 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a description of `state`.
|
|
|
|
|
func (_gdf *textState )String ()string {_ebeg :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _gdf ._agce !=nil {_ebeg =_gdf ._agce .BaseFont ();};return _aae .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_gdf ._bcf ,_gdf ._fdb ,_gdf ._adb ,_ebeg );};const _bbc =1.0/1000.0;func _beda (_bfab ,_cce bounded )float64 {return _bfab .bbox ().Llx -_cce .bbox ().Urx };var _bfbed =_ec .MustCompile ("\u005c\u0064\u002b\u005c\u002e\u003f");func (_caga *textTable )bbox ()_ag .PdfRectangle {return _caga .PdfRectangle };func _aged (_eeag []*subpath )[]rulingList {_fffcd (_eeag );var _bbbae rulingList ;for _ ,_gefb :=range _eeag {if len (_gefb ._aacge )< 2{continue ;};_afee :=_gefb ._aacge [0];for _ ,_beff :=range _gefb ._aacge [1:]{if _acge ,_ebfbc :=_ceef (_afee ,_beff );_ebfbc {_bbbae =append (_bbbae ,_acge );};_afee =_beff ;};};_bbbae =_bbbae .tidied ("\u0073t\u0072\u006f\u006b\u0065\u0073");return _bbbae .toGrids ();};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-09-28 23:18:17 +00:00
|
|
|
|
// String returns a string describing `pt`.
|
|
|
|
|
func (_bdef PageText )String ()string {_dafa :=_aae .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_bdef ._gef ));_gbcf :=[]string {"\u002d"+_dafa };for _ ,_bfbg :=range _bdef ._gef {_gbcf =append (_gbcf ,_bfbg .String ());};_gbcf =append (_gbcf ,"\u002b"+_dafa );return _ecb .Join (_gbcf ,"\u000a");};
|
|
|
|
|
|
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
|
|
|
|
func (_efd PageText )ToText ()string {return _efd .Text ()};func _dfcgd (_gabb []*textWord ,_dfgc int )[]*textWord {_bebe :=len (_gabb );copy (_gabb [_dfgc :],_gabb [_dfgc +1:]);return _gabb [:_bebe -1];};func _feb (_bbd _cd .Point )*subpath {return &subpath {_aacge :[]_cd .Point {_bbd }}};func (_agfe *wordBag )empty (_dcef int )bool {_ ,_fgcc :=_agfe ._aabed [_dcef ];return !_fgcc };func (_gbgg lineRuling )xMean ()float64 {return 0.5*(_gbgg ._dfaca .X +_gbgg ._afcc .X )};func _acfe (_dccf *wordBag ,_ccfa int )*textLine {_bbg :=_dccf .firstWord (_ccfa );_fcac :=textLine {PdfRectangle :_bbg .PdfRectangle ,_edef :_bbg ._bbgd ,_eagf :_bbg ._fgfab };_fcac .pullWord (_dccf ,_bbg ,_ccfa );return &_fcac ;};func (_cdc *textObject )renderText (_fccg []byte )error {if _cdc ._feab {_ca .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");return nil ;};_dcdge :=_cdc .getCurrentFont ();_cff :=_dcdge .BytesToCharcodes (_fccg );_gegb ,_fcca ,_fae :=_dcdge .CharcodesToStrings (_cff );if _fae > 0{_ca .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_fcca ,_fae );};_cdc ._gdb ._geb +=_fcca ;_cdc ._gdb ._dge +=_fae ;_dbc :=_cdc ._gdb ;_bagf :=_dbc ._adb ;_ecad :=_dbc ._cca /100.0;_cege ,_dgba :=_dcdge .GetRuneMetrics (' ');if !_dgba {_cege ,_dgba =_dcdge .GetCharMetrics (32);};if !_dgba {_cege ,_ =_ag .DefaultFont ().GetRuneMetrics (' ');};_abba :=_cege .Wx *_bbc ;_ca .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_abba ,_gegb ,_dcdge ,_bagf );_fagd :=_cd 
.NewMatrix (_bagf *_ecad ,0,0,_bagf ,0,_dbc ._eaff );if _fgff {_ca .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_cff ),_cff ,_gegb );};_ca .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_cff ),_cff ,len (_gegb ));_fad :=_cdc .getFillColor ();_bcb :=_cdc .getStrokeColor ();for _gfb ,_bedc :=range _gegb {_fcb :=[]rune (_bedc );if len (_fcb )==1&&_fcb [0]=='\x00'{continue ;};_egde :=_cff [_gfb ];_age :=_cdc ._deef .CTM .Mult (_cdc ._gdg ).Mult (_fagd );_fgc :=0.0;if len (_fcb )==1&&_fcb [0]==32{_fgc =_dbc ._fdb ;};_eaec ,_ceaa :=_dcdge .GetCharMetrics (_egde );if !_ceaa {_ca .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_egde ,_fcb ,_fcb ,_dcdge );return _aae .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_dcdge .String (),_egde );};_gbe :=_cd .Point {X :_eaec .Wx *_bbc ,Y :_eaec .Wy *_bbc };_cfff :=_cd .Point {X :(_gbe .X *_bagf +_fgc )*_ecad };_ffad :=_cd .Point {X :(_gbe .X *_bagf +_dbc ._bcf +_fgc )*_ecad };if _fgff {_ca .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_bagf ,_dbc ._bcf ,_dbc ._fdb ,_ecad );_ca .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f",_gbe ,_cfff ,_ffad );};_gcea :=_gedg 
(_cfff );_bede :=_gedg (_ffad );_cge :=_cdc ._deef .CTM .Mult (_cdc ._gdg ).Mult (_gcea );if _dcff {_ca
|
|
|
|
|
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
|
|
|
|
type TextMarkArray struct {
	// _ggd holds the marks in the collection.
	_ggd []TextMark
}

// getFont returns the font named `_gafa`, consulting the extractor's font cache when one exists.
//
// When the cache (`_ccf._fb`) is non-nil, every call advances the access counter `_ccf._baf`.
// A cache hit stamps the entry with the current counter value and returns the cached font.
// On a miss the font is loaded with getFontDirect; any error is returned unchanged and nothing
// is cached. If the cache already holds `_bace` or more entries, the entry with the smallest
// access stamp is evicted before the new font is stored.
func (_bfac *textObject) getFont(_gafa string) (*_ag.PdfFont, error) {
	ext := _bfac._ccf
	if ext._fb != nil {
		ext._baf++
		if entry, ok := ext._fb[_gafa]; ok {
			// The map stores fontEntry by value, so `entry` is a copy: the refreshed access
			// stamp must be written back or eviction would never see it.
			entry._ccg = ext._baf
			ext._fb[_gafa] = entry
			return entry._cdg, nil
		}
	}

	font, err := _bfac.getFontDirect(_gafa)
	if err != nil {
		return nil, err
	}

	if ext._fb != nil {
		if len(ext._fb) >= _bace {
			// Evict the least recently used entry. A single pass finds the minimum stamp;
			// the `found` guard also keeps an empty cache from being indexed.
			var (
				victim string
				found  bool
			)
			for name, entry := range ext._fb {
				if !found || entry._ccg < ext._fb[victim]._ccg {
					victim, found = name, true
				}
			}
			if found {
				delete(ext._fb, victim)
			}
		}
		ext._fb[_gafa] = fontEntry{font, ext._baf}
	}
	return font, nil
}

// sort orders `_aeda` in place using the ordering defined by its comp method.
func (_aeda rulingList) sort() {
	_db.Slice(_aeda, func(_eabee, _ggegd int) bool { return _aeda.comp(_eabee, _ggegd) })
}

// log writes a description of the table, labelled `_gdd`, followed by one line per cell.
// It does nothing unless the package flag `_bege` is set. The header goes to the common logger;
// the per-cell lines are printed to standard output with fmt.Printf.
func (_caac *textTable) log(_gdd string) {
	if !_bege {
		return
	}
	// The header format decodes to "~~~ %s: %d x %d grid=%t\n      %6.2f".
	_ca.Log.Info("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",
		_gdd, _caac._acgb, _caac._dbff, _caac._gdcfd, _caac.PdfRectangle)
	// The outer loop runs over `_dbff`, the inner loop over `_acgb`.
	for _cfcbg := 0; _cfcbg < _caac._dbff; _cfcbg++ {
		for _gecb := 0; _gecb < _caac._acgb; _gecb++ {
			_bgag := _caac.get(_gecb, _cfcbg)
			// The cell format decodes to "%4d %2d: %6.2f %q %d\n". The cell text is passed
			// through _aecb with the argument 50 (NOTE(review): presumably a truncation length —
			// confirm), and its rune count is printed last.
			_aae.Printf("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",
				_gecb, _cfcbg, _bgag.PdfRectangle, _aecb(_bgag.text(), 50), _a.RuneCountInString(_bgag.text()))
		}
	}
}

// _bba reports whether the left edge of `_cgbe` lies in the half-open interval
// [`_acdff`.Urx, `_acdff`.Urx + `_fdae`), i.e. the word starts at or to the right of the bag's
// right edge and within `_fdae` of it.
func _bba(_acdff *wordBag, _cgbe *textWord, _fdae float64) bool {
	return _acdff.Urx <= _cgbe.Llx && _cgbe.Llx < _acdff.Urx+_fdae
}
|
|
|
|
|
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
|
type Extractor struct {
	// NOTE(review): presumably the page's content stream — confirm against the constructor.
	_ece string
	// NOTE(review): presumably the page resources used to resolve fonts and XObjects — confirm.
	_cde *_ag.PdfPageResources
	// NOTE(review): presumably the page's media box — confirm.
	_bea _ag.PdfRectangle

	// _fb caches fonts by name; textObject.getFont reads it and evicts entries from it.
	_fb map[string]fontEntry
	// NOTE(review): presumably caches text results by name — confirm.
	_fe map[string]textResult

	// _baf is the access counter that textObject.getFont advances when the font cache is in use.
	_baf int64
	// NOTE(review): purpose not visible from this part of the file.
	_dfa int
}

// getStrokeColor resolves the stroking color of the current graphics state by passing its
// stroking colorspace and stroking color to _ecfbe.
func (_aacg *textObject) getStrokeColor() _aa.Color {
	gs := &_aacg._deef
	return _ecfbe(gs.ColorspaceStroking, gs.ColorStroking)
}
|