2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
// This is a commercial product and requires a license to operate.
|
|
|
|
|
// A trial license can be obtained at https://unidoc.io
|
|
|
|
|
//
|
|
|
|
|
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
|
|
|
|
//
|
|
|
|
|
// Use of this source code is governed by the UniDoc End User License Agreement
|
|
|
|
|
// terms that can be accessed at https://unidoc.io/eula/
|
2018-03-22 14:03:47 +00:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
|
|
|
|
// Currently offers functionality for extracting textual content.
|
|
|
|
|
//
|
2020-08-31 21:12:07 +00:00
|
|
|
|
package extractor ;import (_f "bytes";_d "errors";_bf "fmt";_bb "github.com/unidoc/unipdf/v3/common";_gdf "github.com/unidoc/unipdf/v3/common/license";_gb "github.com/unidoc/unipdf/v3/contentstream";_dg "github.com/unidoc/unipdf/v3/core";_bfc "github.com/unidoc/unipdf/v3/internal/textencoding";_ae "github.com/unidoc/unipdf/v3/internal/transform";_ad "github.com/unidoc/unipdf/v3/model";_fd "golang.org/x/text/unicode/norm";_cb "golang.org/x/xerrors";_e "image/color";_bc "io";_gd "math";_g "sort";_c "strings";_a "unicode";);
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
|
|
|
|
// Elements returns the TextMarks in `ma`.
func (_adce *TextMarkArray) Elements() []TextMark {
	marks := _adce._cfb
	return marks
}

// maxDepth returns the bag's `_ccggc` value minus the lower-left Y of its
// bounding box.
func (_ace *wordBag) maxDepth() float64 {
	base := _ace._ccggc
	return base - _ace.Lly
}

// getFillColor converts the non-stroking colour of the graphics state to a
// color.Color via _adbdc.
func (_ddc *textObject) getFillColor() _e.Color {
	gs := _ddc._begd
	return _adbdc(gs.ColorspaceNonStroking, gs.ColorNonStroking)
}

// fontEntry pairs a font with an int64 stamp.
// NOTE(review): the stamp looks like a last-access counter for the font
// cache; confirm against getFont.
type fontEntry struct {
	_ebfe *_ad.PdfFont
	_ddbc int64
}

// setTextLeading stores `_bdef` in the text state. It does nothing when the
// receiver or its text state is nil.
func (_bdg *textObject) setTextLeading(_bdef float64) {
	if _bdg != nil && _bdg._ccg != nil {
		_bdg._ccg._fgdg = _bdef
	}
}

// bbox returns the mark's bounding rectangle.
func (_dgbg *textMark) bbox() _ad.PdfRectangle {
	return _dgbg.PdfRectangle
}

// size returns the number of states on the stack.
func (_bfd *stateStack) size() int {
	stack := *_bfd
	return len(stack)
}

// _dbaf returns the larger of its two arguments.
func _dbaf(_dggc, _bccc int) int {
	if _dggc <= _bccc {
		return _bccc
	}
	return _dggc
}

// arrangeText drains the words of the bag into textLines and wraps the lines
// in a textPara. It returns nil when no line could be built.
//
// For every depth index it repeatedly starts a line at the first reading
// word, then keeps pulling the best candidate word from the depth band around
// that word until no candidate is left or a candidate lies too far to the
// left of the current end of the line.
func (_cge *wordBag) arrangeText() *textPara {
	_cge.sort()
	if _fccg {
		_cge.removeDuplicates()
	}

	var lines []*textLine
	for _, depthIdx := range _cge.depthIndexes() {
		for !_cge.empty(depthIdx) {
			readingIdx := _cge.firstReadingIndex(depthIdx)
			first := _cge.firstWord(readingIdx)
			line := _decec(_cge, readingIdx)

			// All tolerances scale with `_befa` of the first word.
			scale := first._befa
			lo := first._efde - _bcgb*scale
			hi := first._efde + _bcgb*scale
			maxGap := _cbbd * scale
			maxOverlap := _eabb * scale

		grow:
			for {
				var best *textWord
				bestIdx := 0
				for _, idx := range _cge.depthBand(lo, hi) {
					cand := _cge.highestWord(idx, lo, hi)
					if cand == nil {
						continue
					}
					gap := _cfee(cand, line._ffge[len(line._ffge)-1])
					if gap < -maxOverlap {
						// Candidate starts too far left of the line end: the line is done.
						break grow
					}
					if gap > maxGap {
						continue
					}
					if best != nil && _ddab(cand, best) >= 0 {
						continue
					}
					best = cand
					bestIdx = idx
				}
				if best == nil {
					break
				}
				line.pullWord(_cge, best, bestIdx)
			}

			line.markWordBoundaries()
			lines = append(lines, line)
		}
	}
	if len(lines) == 0 {
		return nil
	}

	_g.Slice(lines, func(i, j int) bool {
		return _adge(lines[i], lines[j]) < 0
	})
	para := _affea(_cge.PdfRectangle, lines)

	// Optional debug dump, compiled in only when the debug constants are set.
	if _bgcb {
		// "arrangeText !!! para=%s"
		_bb.Log.Info("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073", para.String())
		if _cgc {
			for i, ln := range para._cbf {
				_bf.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a", i, ln.String())
				if !_deac {
					continue
				}
				for j, word := range ln._ffge {
					_bf.Printf("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a", j, word.String())
					for k, mark := range word._efga {
						_bf.Printf("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n", k, mark.String())
					}
				}
			}
		}
	}
	return para
}

// log prints a one-line summary per non-nil paragraph, prefixed by `_fcbd`.
// It is a no-op unless the `_fdfg` debug constant is true.
func (_fed paraList) log(_fcbd string) {
	if !_fdfg {
		return
	}
	_bb.Log.Info("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d", _fcbd, len(_fed))
	for i, para := range _fed {
		if para == nil {
			continue
		}
		text := para.text()
		// Two blanks when the paragraph has no table attached, otherwise
		// the table dimensions.
		tag := "\u0020\u0020"
		if table := para._ddfbc; table != nil {
			tag = _bf.Sprintf("\u005b%\u0064\u0078\u0025\u0064\u005d", table._gffc, table._fgbe)
		}
		_bf.Printf("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a", i, para.PdfRectangle, tag, _gcgd(text, 50))
	}
}

// _cadf returns nil when a licensed key is installed or `_gdb` is set.
// Otherwise it prints a notice to stdout and returns an error. The PageText
// argument is not inspected.
func _cadf(_eefg *PageText) error {
	key := _gdf.GetLicenseKey()
	licensed := key != nil && key.IsLicensed()
	if licensed || _gdb {
		return nil
	}
	_bf.Printf("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a")
	_bf.Println("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f")
	return _d.New("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064")
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `tm`.
|
|
|
|
|
func (_dgf *textMark )String ()string {return _bf .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_dgf .PdfRectangle ,_dgf ._cegg ,_dgf ._ddabf );};func _adbdc (_dbag _ad .PdfColorspace ,_dfcaf _ad .PdfColor )_e .Color {if _dbag ==nil ||_dfcaf ==nil {return _e .Black ;};_fedad ,_gggd :=_dbag .ColorToRGB (_dfcaf );if _gggd !=nil {_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_dfcaf ,_dbag ,_gggd );return _e .Black ;};_bdbe ,_bfagf :=_fedad .(*_ad .PdfColorDeviceRGB );if !_bfagf {_bb .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_fedad );return _e .Black ;};return _e .NRGBA {R :uint8 (_bdbe .R ()*255),G :uint8 (_bdbe .G ()*255),B :uint8 (_bdbe .B ()*255),A :uint8 (255)};};func (_fba *wordBag )firstReadingIndex (_eeaf int )int {_eccag :=_fba .firstWord (_eeaf )._befa ;_ffdb :=float64 (_eeaf +1)*_gbab ;_fcacd :=_ffdb +_bcbd *_eccag ;_aacg :=_eeaf ;for _ ,_bdfe :=range _fba .depthBand (_ffdb ,_fcacd ){if _ddab (_fba .firstWord (_bdfe ),_fba .firstWord (_aacg ))< 0{_aacg =_bdfe ;};};return _aacg ;};func (_eadac paraList )writeText (_cege _bc .Writer ){for _cgfg ,_aaea :=range _eadac {_aaea .writeText (_cege );if _cgfg !=len (_eadac )-1{if _bdba (_aaea ,_eadac [_cgfg +1]){_cege .Write ([]byte ("\u0020"));}else {_cege .Write ([]byte ("\u000a"));_cege .Write ([]byte ("\u000a"));};};};_cege .Write ([]byte ("\u000a"));_cege .Write ([]byte ("\u000a"));};const (RenderModeStroke RenderMode 
=1<<iota ;RenderModeFill ;RenderModeClip ;);
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
|
|
|
|
// Replace with a function like Extract() (*PageText, error)
|
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page)
// as a PageText, together with the two integer statistics reported by
// extractPageText. On any error, including a failed licence check, it returns
// (nil, 0, 0, err).
func (_fbd *Extractor) ExtractPageText() (*PageText, int, int, error) {
	pageText, statA, statB, err := _fbd.extractPageText(_fbd._fda, _fbd._be, _ae.IdentityMatrix(), 0)
	if err != nil {
		return nil, 0, 0, err
	}
	pageText.computeViews()
	if err = _cadf(pageText); err != nil {
		return nil, 0, 0, err
	}
	return pageText, statA, statB, nil
}

// _abce maps `_adb` to an integer band index of width _gbab. Values that are
// not >= 0 get the truncated quotient minus one.
func _abce(_adb float64) int {
	idx := int(_adb / _gbab)
	if !(_adb >= 0) {
		idx--
	}
	return idx
}

// _dbdd packs two ints into one uint64 key: `_dcad` is shifted up by 24 bits
// and `_gcga` is added.
func _dbdd(_dcad, _gcga int) uint64 {
	high := uint64(_dcad) << 24
	return high + uint64(_gcga)
}

// textResult bundles a PageText with two integer counters.
type textResult struct {
	_aefa PageText
	_gbf  int
	_gag  int
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `state`.
|
|
|
|
|
// String returns a description of `state`: three numeric fields followed by
// the base font name, or a placeholder when no font is set.
func (_ecd *textState) String() string {
	var fontName string
	if _ecd._dafd == nil {
		fontName = "\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]"
	} else {
		fontName = _ecd._dafd.BaseFont()
	}
	return _bf.Sprintf("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071", _ecd._eebc, _ecd._dbg, _ecd._ecf, fontName)
}

// _eb is the sentinel error returned when an operand has an unexpected type.
var _eb = _d.New("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072")
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Marks returns the TextMark collection for a page. The returned array wraps
// the page's `_agd` slice without copying it.
func (_dcbf PageText) Marks() *TextMarkArray {
	arr := new(TextMarkArray)
	arr._cfb = _dcbf._agd
	return arr
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
// The font and text-result caches start out empty; the returned error is
// always nil.
func NewFromContents(contents string, resources *_ad.PdfPageResources) (*Extractor, error) {
	ext := &Extractor{
		_fda: contents,
		_be:  resources,
		_bbe: make(map[string]fontEntry),
		_df:  make(map[string]textResult),
	}
	return ext, nil
}

// extractContentStreamImages parses the content stream `_gc` and runs every
// operation through processOperand with `_fa` as resources. The image cache
// and the options are lazily initialised after a successful parse.
func (_db *imageExtractContext) extractContentStreamImages(_gc string, _fa *_ad.PdfPageResources) error {
	ops, err := _gb.NewContentStreamParser(_gc).Parse()
	if err != nil {
		return err
	}

	if _db._dcb == nil {
		_db._dcb = make(map[*_dg.PdfObjectStream]*cachedImage)
	}
	if _db._bg == nil {
		_db._bg = new(ImageExtractOptions)
	}

	handler := func(op *_gb.ContentStreamOperation, gs _gb.GraphicsState, res *_ad.PdfPageResources) error {
		return _db.processOperand(op, gs, res)
	}
	processor := _gb.NewContentStreamProcessor(*ops)
	processor.AddHandler(_gb.HandlerConditionEnumAllOperands, "", handler)
	return processor.Process(_fa)
}

// _dcab returns the translation component of `_egg` as a Point.
func _dcab(_egg _ae.Matrix) _ae.Point {
	var pt _ae.Point
	pt.X, pt.Y = _egg.Translation()
	return pt
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a string describing `pt`.
|
|
|
|
|
// String returns a string describing `pt`: a "-" header line, one line per
// element of `_gbbc`, and a matching "+" footer line.
func (_fgcc PageText) String() string {
	_aed := _bf.Sprintf("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073", len(_fgcc._gbbc))
	// The final length is known: header + one entry per element + footer.
	_daec := make([]string, 0, len(_fgcc._gbbc)+2)
	_daec = append(_daec, "\u002d"+_aed)
	for _, _bae := range _fgcc._gbbc {
		_daec = append(_daec, _bae.String())
	}
	_daec = append(_daec, "\u002b"+_aed)
	return _c.Join(_daec, "\u000a")
}

// _dgcb reports whether the horizontal extent of `_eadb` overlaps that of
// `_efdf` after widening the bag by `_dece` on both sides.
func _dgcb(_efdf *wordBag, _eadb *textWord, _dece float64) bool {
	return _eadb.Llx < _efdf.Urx+_dece && _efdf.Llx-_dece < _eadb.Urx
}

// _gadb removes element `_feda` from `_ddfg` in place, preserving order, and
// returns the shortened slice. The backing array is reused.
func _gadb(_ddfg []*textWord, _feda int) []*textWord {
	_acdbd := len(_ddfg)
	copy(_ddfg[_feda:], _ddfg[_feda+1:])
	return _ddfg[:_acdbd-1]
}

// setTextRenderMode stores `_adc` as the RenderMode of the text state. It
// does nothing when the receiver or its text state is nil, matching the
// guard used by setTextLeading.
func (_edd *textObject) setTextRenderMode(_adc int) {
	if _edd == nil || _edd._ccg == nil {
		return
	}
	_edd._ccg._cbg = RenderMode(_adc)
}

// moveText forwards to moveTo with the same offsets.
func (_abf *textObject) moveText(_bded, _ecc float64) {
	_abf.moveTo(_bded, _ecc)
}

// toTextMarks returns the TextMarks of every word in the line. A single-space
// mark is inserted (via _bgda) before each word whose `_agbb` flag is set.
// `_dgebf` is passed through to the helpers that build the marks.
func (_bcea *textLine) toTextMarks(_dgebf *int) []TextMark {
	var _abfga []TextMark
	for _, _fbfg := range _bcea._ffge {
		if _fbfg._agbb {
			_abfga = _bgda(_abfga, _dgebf, "\u0020")
		}
		_abfga = append(_abfga, _fbfg.toTextMarks(_dgebf)...)
	}
	return _abfga
}

// _eefe returns "\n" when _gacb rejects the difference of its arguments and
// " " otherwise.
func _eefe(_cafa, _eefc float64) string {
	if !_gacb(_cafa - _eefc) {
		return "\u000a"
	}
	return "\u0020"
}

// _ggb is the maximum number of entries kept in the extractor's font cache
// before getFont evicts one.
const _ggb = 10
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `l`.
|
|
|
|
|
func (_fgaa *textLine )String ()string {return _bf .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_fgaa ._fbffd ,_fgaa .PdfRectangle ,_fgaa ._gge ,_fgaa .text ());};func (_eadg *wordBag )minDepth ()float64 {return _eadg ._ccggc -(_eadg .Ury -_eadg ._deb )};func (_fdaa paraList )llyRange (_bddb []int ,_efec ,_fgag float64 )[]int {_ccffe :=len (_fdaa );if _fgag < _fdaa [_bddb [0]].Lly ||_efec > _fdaa [_bddb [_ccffe -1]].Lly {return nil ;};_ddeb :=_g .Search (_ccffe ,func (_gcec int )bool {return _fdaa [_bddb [_gcec ]].Lly >=_efec });_bfcb :=_g .Search (_ccffe ,func (_cgbf int )bool {return _fdaa [_bddb [_cgbf ]].Lly > _fgag });return _bddb [_ddeb :_bfcb ];};type textLine struct{_ad .PdfRectangle ;_fbffd float64 ;_ffge []*textWord ;_gge float64 ;};func (_eedc paraList )topoOrder ()[]int {if _fdfg {_bb .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_gefaa :=len (_eedc );_dgaad :=make ([]bool ,_gefaa );_geeg :=make ([]int ,0,_gefaa );_fbag :=_eedc .llyOrdering ();var _feg func (_bee int );_feg =func (_aceb int ){_dgaad [_aceb ]=true ;for _ccggb :=0;_ccggb < _gefaa ;_ccggb ++{if !_dgaad [_ccggb ]{if _eedc .readBefore (_fbag ,_aceb ,_ccggb ){_feg (_ccggb );};};};_geeg =append (_geeg ,_aceb );};for _adaf :=0;_adaf < _gefaa ;_adaf ++{if !_dgaad [_adaf ]{_feg (_adaf );};};return _bccf (_geeg );};func (_faecg paraList )findTables ()[]*textTable {_faecg .addNeighbours ();_g .Slice (_faecg ,func (_fabc ,_bfega int )bool {return _geac (_faecg [_fabc ],_faecg [_bfega ])< 0});var _bbee []*textTable ;for _ ,_cbac :=range _faecg {if _cbac ._eebca {continue ;};_fbdgb :=_cbac .isAtom ();if _fbdgb ==nil {continue ;};_fbdgb .growTable ();if _fbdgb ._gffc *_fbdgb ._fgbe < _ceac {continue ;};_fbdgb .markCells ();_fbdgb .log ("\u0067\u0072\u006fw\u006e");_bbee =append (_bbee ,_fbdgb );};return _bbee ;};func _ggc (_ggga ,_bcfc _ad 
.PdfRectangle )bool {return _ggga .Llx <=_bcfc .Llx &&_bcfc .Urx <=_ggga .Urx &&_ggga .Lly <=_bcfc .Lly &&_bcfc .Ury <=_ggga .Ury ;};type stateStack []*textState ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
|
|
|
|
// ToText returns the page text as a single string.
//
// Deprecated: This function is deprecated and will be removed in a future
// major version. Please use Text() instead.
func (_afc PageText) ToText() string {
	return _afc.Text()
}

// setHorizScaling stores `_becc` in the text state. It does nothing when the
// receiver or its text state is nil, matching the guard used by
// setTextLeading.
func (_dgbe *textObject) setHorizScaling(_becc float64) {
	if _dgbe == nil || _dgbe._ccg == nil {
		return
	}
	_dgbe._ccg._dbgb = _becc
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial information:
|
|
|
|
|
// display position and size.
|
|
|
|
|
type PageImages struct{Images []ImageMark ;};func (_gagd *textObject )newTextMark (_fdcf string ,_becb _ae .Matrix ,_egff _ae .Point ,_feee float64 ,_acdce *_ad .PdfFont ,_gee float64 ,_aafb ,_adgde _e .Color )(textMark ,bool ){_cabe :=_becb .Angle ();_fdeb :=_bcde (_cabe ,_ebbf );var _ege float64 ;if _fdeb %180!=90{_ege =_becb .ScalingFactorY ();}else {_ege =_becb .ScalingFactorX ();};_cdd :=_dcab (_becb );_edb :=_ad .PdfRectangle {Llx :_cdd .X ,Lly :_cdd .Y ,Urx :_egff .X ,Ury :_egff .Y };switch _fdeb %360{case 90:_edb .Urx -=_ege ;case 180:_edb .Ury -=_ege ;case 270:_edb .Urx +=_ege ;case 0:_edb .Ury +=_ege ;default:_fdeb =0;_edb .Ury +=_ege ;};if _edb .Llx > _edb .Urx {_edb .Llx ,_edb .Urx =_edb .Urx ,_edb .Llx ;};if _edb .Lly > _edb .Ury {_edb .Lly ,_edb .Ury =_edb .Ury ,_edb .Lly ;};_dbdac ,_abcd :=_egbc (_edb ,_gagd ._fcd ._fc );if !_abcd {_bb .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_edb ,_gagd ._fcd ._fc ,_fdcf );};_edb =_dbdac ;_bfbcd :=_edb ;_bedc :=_gagd ._fcd ._fc ;switch _fdeb %360{case 90:_bedc .Urx ,_bedc .Ury =_bedc .Ury ,_bedc .Urx ;_bfbcd =_ad .PdfRectangle {Llx :_bedc .Urx -_edb .Ury ,Urx :_bedc .Urx -_edb .Lly ,Lly :_edb .Llx ,Ury :_edb .Urx };case 180:_bfbcd =_ad .PdfRectangle {Llx :_bedc .Urx -_edb .Llx ,Urx :_bedc .Urx -_edb .Urx ,Lly :_bedc .Ury -_edb .Lly ,Ury :_bedc .Ury -_edb .Ury };case 270:_bedc .Urx ,_bedc .Ury =_bedc .Ury ,_bedc .Urx ;_bfbcd =_ad .PdfRectangle {Llx :_edb .Ury ,Urx :_edb .Lly ,Lly :_bedc .Ury -_edb .Llx ,Ury :_bedc .Ury -_edb .Urx };};if _bfbcd .Llx > _bfbcd .Urx {_bfbcd .Llx ,_bfbcd .Urx =_bfbcd .Urx ,_bfbcd .Llx ;};if _bfbcd .Lly > _bfbcd .Ury {_bfbcd .Lly ,_bfbcd .Ury =_bfbcd .Ury ,_bfbcd .Lly ;};_cgg :=textMark {_ddabf :_fdcf ,PdfRectangle :_bfbcd 
,_bfagag :_edb ,_gbbg :_acdce ,_cegg :_ege ,_faec :_gee ,_acde :_becb ,_acb :_egff ,_dbda :_fdeb ,_adcd :_aafb ,_edc :_adgde };if _bdeg {_bb .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_cdd ,_egff ,_cgg .String ());};return _cgg ,_abcd ;};func _fea (_fbff ,_ddda _ad .PdfRectangle )bool {return _agdc (_fbff ,_ddda )&&_fdd (_fbff ,_ddda )};func (_eaaea *textWord )absorb (_fbbg *textWord ){_eaaea .PdfRectangle =_fbed (_eaaea .PdfRectangle ,_fbbg .PdfRectangle );_eaaea ._efga =append (_eaaea ._efga ,_fbbg ._efga ...);};func (_defa *textTable )get (_bcdg ,_bbfd int )*textPara {return _defa ._cgbeb [_dbdd (_bcdg ,_bbfd )]};func (_gdc *textObject )getCurrentFont ()*_ad .PdfFont {var _ddb *_ad .PdfFont ;if !_gdc ._gcfa .empty (){_ddb =_gdc ._gcfa .top ()._dafd ;};if _ddb ==nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");return _ad .DefaultFont ();};return _ddb ;};func (_bgdc *wordBag )pullWord (_ggfg *wordBag ,_fcdd *textWord ,_caaa int ){_bgdc .PdfRectangle =_fbed (_bgdc .PdfRectangle ,_fcdd .PdfRectangle );if _fcdd ._befa > _bgdc ._deb {_bgdc ._deb =_fcdd ._befa ;};_bgdc ._ddd [_caaa ]=append (_bgdc ._ddd [_caaa ],_fcdd );_ggfg .removeWord (_fcdd ,_caaa );};func (_fafa *textLine )endsInHyphen ()bool {_eedb :=_fafa ._ffge [len (_fafa ._ffge )-1];_abg :=[]rune (_eedb ._fagfb );if !_a .Is (_a .Hyphen ,_abg [len (_abg )-1]){return false ;};if _eedb ._agbb &&_agdac (_abg ){return true ;};return _agdac ([]rune (_fafa .text ()));};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
type TextMarkArray struct {
	_cfb []TextMark
}

// llyOrdering returns the indexes of the paragraphs sorted by ascending
// lower-left Y. The sort is stable, so paragraphs with equal Lly keep their
// list order.
func (_bbaf paraList) llyOrdering() []int {
	n := len(_bbaf)
	order := make([]int, n)
	for i := 0; i < n; i++ {
		order[i] = i
	}
	byLly := func(a, b int) bool {
		return _bbaf[order[a]].Lly < _bbaf[order[b]].Lly
	}
	_g.SliceStable(order, byLly)
	return order
}

// depthRange returns the bag's depth indexes that lie in the closed interval
// [`_dafg`, `_bcff`], in the order depthIndexes reports them.
func (_ddeg *wordBag) depthRange(_dafg, _bcff int) []int {
	var picked []int
	for _, idx := range _ddeg.depthIndexes() {
		if idx < _dafg || idx > _bcff {
			continue
		}
		picked = append(picked, idx)
	}
	return picked
}

// getDepthIdx converts `_caa` to a depth index with _abce and clamps it to
// the range spanned by the first and last entries of depthIndexes.
func (_bdbd *wordBag) getDepthIdx(_caa float64) int {
	indexes := _bdbd.depthIndexes()
	idx := _abce(_caa)
	if first := indexes[0]; idx < first {
		return first
	}
	if last := indexes[len(indexes)-1]; idx > last {
		return last
	}
	return idx
}

// _fbed returns the smallest rectangle containing both arguments.
func _fbed(_deae, _gef _ad.PdfRectangle) _ad.PdfRectangle {
	var union _ad.PdfRectangle
	union.Llx = _gd.Min(_deae.Llx, _gef.Llx)
	union.Lly = _gd.Min(_deae.Lly, _gef.Lly)
	union.Urx = _gd.Max(_deae.Urx, _gef.Urx)
	union.Ury = _gd.Max(_deae.Ury, _gef.Ury)
	return union
}

// _bdafe reports whether `_bbfe` starts at or right of the bag's right edge
// and less than `_gdae` beyond it.
func _bdafe(_dbe *wordBag, _bbfe *textWord, _gdae float64) bool {
	right := _dbe.Urx
	start := _bbfe.Llx
	return right <= start && start < right+_gdae
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// TextTable represents a table.
|
|
|
|
|
// Cells are ordered top-to-bottom, left-to-right.
|
|
|
|
|
// Cells[y] is the (0-offset) y'th row in the table.
|
|
|
|
|
// Cells[y][x] is the (0-offset) x'th column in the table.
|
|
|
|
|
type TextTable struct{W ,H int ;Cells [][]TableCell ;};func _gcgd (_cdfe string ,_defgf int )string {if len (_cdfe )< _defgf {return _cdfe ;};return _cdfe [:_defgf ];};func (_dfdd *textPara )writeText (_eaae _bc .Writer ){if _dfdd ._ddfbc ==nil {_dfdd .writeCellText (_eaae );return ;};for _cefbe :=0;_cefbe < _dfdd ._ddfbc ._fgbe ;_cefbe ++{for _bfgg :=0;_bfgg < _dfdd ._ddfbc ._gffc ;_bfgg ++{_bdbdb :=_dfdd ._ddfbc .get (_bfgg ,_cefbe );if _bdbdb ==nil {_eaae .Write ([]byte ("\u0009"));}else {_bdbdb .writeCellText (_eaae );};_eaae .Write ([]byte ("\u0020"));};if _cefbe < _dfdd ._ddfbc ._fgbe -1{_eaae .Write ([]byte ("\u000a"));};};};func _adcc (_gaee _ad .PdfRectangle ,_eaef bounded )float64 {return _gaee .Ury -_eaef .bbox ().Lly };func _ddfb (_bcba []TextMark ,_afcg *int ,_dgff TextMark )[]TextMark {_dgff .Offset =*_afcg ;_bcba =append (_bcba ,_dgff );*_afcg +=len (_dgff .Text );return _bcba ;};func (_fgb *textObject )setTextRise (_gdec float64 ){if _fgb ==nil {return ;};_fgb ._ccg ._ccff =_gdec ;};func (_cgae *stateStack )empty ()bool {return len (*_cgae )==0};func (_defg *textObject )getFont (_ccgd string )(*_ad .PdfFont ,error ){if _defg ._fcd ._bbe !=nil {_defg ._fcd ._gde ++;_gfbda ,_dgdg :=_defg ._fcd ._bbe [_ccgd ];if _dgdg {_gfbda ._ddbc =_defg ._fcd ._gde ;return _gfbda ._ebfe ,nil ;};};_dfca ,_fgee :=_defg .getFontDirect (_ccgd );if _fgee !=nil {return nil ,_fgee ;};if _defg ._fcd ._bbe !=nil {_aca :=fontEntry {_dfca ,_defg ._fcd ._gde };if len (_defg ._fcd ._bbe )>=_ggb {var _bcbb []string ;for _aab :=range _defg ._fcd ._bbe {_bcbb =append (_bcbb ,_aab );};_g .Slice (_bcbb ,func (_cdbd ,_abcc int )bool {return _defg ._fcd ._bbe [_bcbb [_cdbd ]]._ddbc < _defg ._fcd ._bbe [_bcbb [_abcc ]]._ddbc ;});delete (_defg ._fcd ._bbe ,_bcbb [0]);};_defg ._fcd ._bbe [_ccgd ]=_aca ;};return _dfca ,nil ;};const (_dbce =false ;_bdeg =false ;_fdfg =false ;_bgcb =false ;_cgc =_bgcb &&false ;_deac =_cgc &&false ;_bbff =false ;);func _agdc (_egfd ,_dgeb _ad .PdfRectangle 
)bool {return _dgeb .Llx <=_egfd .Urx &&_egfd .Llx <=_dgeb .Urx ;};func _cfee (_dfge ,_aedd bounded )float64 {return _dfge .bbox ().Llx -_aedd .bbox ().Urx };func _bdba (_afdc ,_feea *textPara )bool {return _gacb (_afdc .depth ()-_feea .depth ())};func (_gbeg *textTable )bbox ()_ad .PdfRectangle {return _gbeg .PdfRectangle };
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ImageExtractOptions contains options for controlling image extraction from
|
|
|
|
|
// PDF pages.
|
|
|
|
|
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func _cagb (_gba *Extractor ,_eea *_ad .PdfPageResources ,_gfe _gb .GraphicsState ,_ffgd *textState ,_bedd *stateStack )*textObject {return &textObject {_fcd :_gba ,_adgb :_eea ,_begd :_gfe ,_gcfa :_bedd ,_ccg :_ffgd ,_fee :_ae .IdentityMatrix (),_gadg :_ae .IdentityMatrix ()};};const _fbf =1.0/1000.0;func (_bga *textObject )setCharSpacing (_fgf float64 ){if _bga ==nil {return ;};_bga ._ccg ._eebc =_fgf ;if _bdeg {_bb .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_fgf ,_bga ._ccg .String ());};};func (_acab *textTable )put (_bagd ,_ecdf int ,_fgca *textPara ){_acab ._cgbeb [_dbdd (_bagd ,_ecdf )]=_fgca ;};func _ebbg (_cda *textWord ,_dcd float64 )*wordBag {_dcaaa :=_abce (_cda ._efde );_gadc :=[]*textWord {_cda };_bgaa :=wordBag {_ddd :map[int ][]*textWord {_dcaaa :_gadc },PdfRectangle :_cda .PdfRectangle ,_deb :_cda ._befa ,_ccggc :_dcd };return &_bgaa ;};func _ggf (_agda []*textWord ,_caed float64 )*wordBag {_bfda :=_ebbg (_agda [0],_caed );for _ ,_aaa :=range _agda [1:]{_edg :=_abce (_aaa ._efde );_bfda ._ddd [_edg ]=append (_bfda ._ddd [_edg ],_aaa );};_bfda .sort ();return _bfda ;};func _fef (_cggd *wordBag ,_eeega float64 )[]*wordBag {var _dabc []*wordBag ;for _ ,_geaca :=range _cggd .depthIndexes (){_ggcf :=false ;for !_cggd .empty (_geaca ){_adedd :=_cggd .firstReadingIndex (_geaca );_dddf :=_cggd .firstWord (_adedd );_dcbgf :=_ebbg (_dddf ,_eeega );_cggd .removeWord (_dddf ,_adedd );if _fdfg {_bb .Log .Info ("w\u006f\u0072\u0064\u0073\u005b\u0030\u005d\u003d\u0025\u0073",_dddf .String ());};_eddg :=_ffbe *_dcbgf ._deb ;_eegc :=_geafa *_dcbgf ._deb ;_ggea :=_bddg *_dcbgf ._deb ;for _dddfc :=true ;_dddfc ;_dddfc =_ggcf {_ggcf =false ;if _fdfg {_bb .Log .Info 
("\u0070\u0061\u0072\u0061\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068\u0020\u0025\u002e2\u0066\u0020\u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065p\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020",_dcbgf .minDepth (),_dcbgf .maxDepth (),_ggea );};if _cggd .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_dcbgf ,_caaae (_dgcb ,0),_dcbgf .minDepth ()-_ggea ,_dcbgf .maxDepth ()+_ggea ,_fcbe ,false ,false )> 0{_ggcf =true ;};if _cggd .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_dcbgf ,_caaae (_dgcb ,_eegc ),_dcbgf .minDepth (),_dcbgf .maxDepth (),_dad ,false ,false )> 0{_ggcf =true ;};if _ggcf {continue ;};_bgfe :=_cggd .scanBand ("",_dcbgf ,_caaae (_bdafe ,_eddg ),_dcbgf .minDepth (),_dcbgf .maxDepth (),_eeaff ,true ,false );if _bgfe > 0{_bdcd :=(_dcbgf .maxDepth ()-_dcbgf .minDepth ())/_dcbgf ._deb ;if (_bgfe > 1&&float64 (_bgfe )> 0.3*_bdcd )||_bgfe <=10{if _cggd .scanBand ("\u006f\u0074\u0068e\u0072",_dcbgf ,_caaae (_bdafe ,_eddg ),_dcbgf .minDepth (),_dcbgf .maxDepth (),_eeaff ,false ,true )> 0{_ggcf =true ;};};};};_dabc =append (_dabc ,_dcbgf );};};return _dabc ;};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
// Definitions packed on the following physical lines, in order:
//
//   - TextMarkArray.Len: number of marks; a nil receiver yields 0.
//   - wordBag.removeWord: removes a word from stratum `_cfga` via _acdac and
//     deletes the stratum's map key once it is empty.
//   - paraList.eventNeighbours: sorts the events by `_egcb` (events with
//     `_baggg` set first on ties), sweeps them while tracking the set of
//     currently open paragraph indexes, records a symmetric neighbour
//     relation between every paragraph that opens and those already open,
//     and returns the relation keyed by paragraph. The order of each
//     neighbour slice follows map iteration and is therefore unspecified.
//   - _afef: _agdc applied to the `_bgfba` rectangles of two paragraphs.
//   - imageExtractContext.extractInlineImage: decodes an inline image,
//     converts it to RGB (DeviceGray when no colorspace is given), appends an
//     ImageMark positioned from the CTM to `_dc` and increments `_ed`.
//   - wordBag.firstWord: first word of stratum `_efeg`; the stratum must
//     exist and be non-empty.
//   - textObject.setTextMatrix: sets both `_fee` and `_gadg` from six floats;
//     logs and returns when the slice length is not 6.
//   - wordBag.scanBand: visits the words whose `_efde` lies in the band
//     [`_dbf`, `_bfbc`] widened by _bcgb times the target bag's `_deb`,
//     keeps those accepted by `_dffg` and by the relative-size limit `_dbc`
//     (ignored when not > 0), and returns how many matched. When `_faa` is
//     false each match is pulled into `_ggdb`; when `_faa` is true nothing is
//     pulled and the scan of a stratum stops at its first match. Unless
//     `_eab` is set, the local band limits are widened to cover each match
//     (this only affects the debug log).
//   - stateStack.push: pushes a copy of `*_ccb`.
//
// NOTE(review): the text ends with a truncated `fun` token; the rest of that
// declaration is not present here. The hard line breaks inside the
// `firstWord` declaration also trigger Go's automatic semicolon insertion,
// so these lines need re-joining before they compile.

// Len returns the number of TextMarks in `ma`.
func (_gfef *TextMarkArray )Len ()int {if _gfef ==nil {return 0;};return len (_gfef ._cfb );};func (_efba *wordBag )removeWord (_bdac *textWord ,_cfga int ){_bedf :=_acdac (_efba .stratum (_cfga ),_bdac );if len (_bedf )==0{delete (_efba ._ddd ,_cfga );}else {_efba ._ddd [_cfga ]=_bedf ;};};func (_ddgf paraList )eventNeighbours (_cfce []event )map[*textPara ][]int {_g .Slice (_cfce ,func (_effb ,_eggf int )bool {_cbcdd ,_ebea :=_cfce [_effb ],_cfce [_eggf ];_accc ,_dcgc :=_cbcdd ._egcb ,_ebea ._egcb ;if _accc !=_dcgc {return _accc < _dcgc ;};if _cbcdd ._baggg !=_ebea ._baggg {return _cbcdd ._baggg ;};return _effb < _eggf ;});_gdac :=map[int ]map[int ]struct{}{};_baec :=map[int ]struct{}{};for _ ,_geea :=range _cfce {if _geea ._baggg {_gdac [_geea ._eac ]=map[int ]struct{}{};for _fcbde :=range _baec {if _fcbde !=_geea ._eac {_gdac [_geea ._eac ][_fcbde ]=struct{}{};_gdac [_fcbde ][_geea ._eac ]=struct{}{};};};_baec [_geea ._eac ]=struct{}{};}else {delete (_baec ,_geea ._eac );};};_dfege :=map[*textPara ][]int {};for _abed ,_caeg :=range _gdac {_feaa :=_ddgf [_abed ];_dfad :=make ([]int ,len (_caeg ));_adgf :=0;for _acg :=range _caeg {_dfad [_adgf ]=_acg ;_adgf ++;};_dfege [_feaa ]=_dfad ;};return _dfege ;};func _afef (_adbb ,_gece *textPara )bool {return _agdc (_adbb ._bgfba ,_gece ._bgfba )};func (_bec *imageExtractContext )extractInlineImage (_dbb *_gb .ContentStreamInlineImage ,_ade _gb .GraphicsState ,_ebf *_ad .PdfPageResources )error {_ba ,_gfa :=_dbb .ToImage (_ebf );if _gfa !=nil {return _gfa ;};_gdg ,_gfa :=_dbb .GetColorSpace (_ebf );if _gfa !=nil {return _gfa ;};if _gdg ==nil {_gdg =_ad .NewPdfColorspaceDeviceGray ();};_dgb ,_gfa :=_gdg .ImageToRGB (*_ba );if _gfa !=nil {return _gfa ;};_gbg :=ImageMark {Image :&_dgb ,Width :_ade .CTM .ScalingFactorX (),Height :_ade .CTM .ScalingFactorY (),Angle :_ade .CTM .Angle ()};_gbg .X ,_gbg .Y =_ade .CTM .Translation ();_bec ._dc =append (_bec ._dc ,_gbg );_bec ._ed ++;return nil ;};func (_cgbe *wordBag )firstWord 
(_efeg int )*textWord {return _cgbe ._ddd [_efeg ][0]};func (_gdfg *textObject )setTextMatrix (_gbb []float64 ){if len (_gbb )!=6{_bb .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_gbb ));return ;};_efcg ,_ceg ,_abec ,_gfbd ,_bddf ,_eeb :=_gbb [0],_gbb [1],_gbb [2],_gbb [3],_gbb [4],_gbb [5];_gdfg ._fee =_ae .NewMatrix (_efcg ,_ceg ,_abec ,_gfbd ,_bddf ,_eeb );_gdfg ._gadg =_gdfg ._fee ;};func (_bfdf *wordBag )scanBand (_gaa string ,_ggdb *wordBag ,_dffg func (_eee *wordBag ,_cbcd *textWord )bool ,_dbf ,_bfbc ,_dbc float64 ,_faa ,_eab bool )int {_fadd :=_ggdb ._deb ;_efb :=_bcgb *_fadd ;_cgf :=0;_abdeb ,_dbd :=_dbf ,_bfbc ;var _dfg []*textWord ;for _ ,_dbde :=range _bfdf .depthBand (_dbf -_efb ,_bfbc +_efb ){for _ ,_becd :=range _bfdf ._ddd [_dbde ]{if !(_dbf -_efb <=_becd ._efde &&_becd ._efde <=_bfbc +_efb ){continue ;};if !_dffg (_ggdb ,_becd ){continue ;};_debc :=_gd .Abs (_becd ._befa -_fadd )/_fadd ;_bdbfe :=_becd ._befa /_fadd ;_aeff :=_gd .Min (_debc ,_bdbfe );if _dbc > 0{if _aeff > _dbc {continue ;};};if !_faa {_ggdb .pullWord (_bfdf ,_becd ,_dbde );};_dfg =append (_dfg ,_becd );_cgf ++;if !_eab {if _becd ._efde < _dbf {_dbf =_becd ._efde ;};if _becd ._efde > _bfbc {_bfbc =_becd ._efde ;};};if _faa {break ;};};};if _dbce {if len (_gaa )> 0{_bb .Log .Info ("\u0073\u0063\u0061\u006e\u0042\u0061\u006e\u0064\u003a\u0020\u0025\u0073\u0020\u005b\u0025\u002e\u0032f\u0020\u0025\u002e\u0032\u0066\u005d\u002d\u003e\u005b\u0025.\u0032\u0066\u0020\u0025\u002e\u0032\u0066\u005d\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u002e\u0032\u0066 \u0066\u006f\u006e\u0074\u0073\u0069z\u0065\u003d%\u002e\u0032f\u0020%\u0071",_gaa ,_abdeb ,_dbd ,_dbf ,_bfbc ,_ggdb .PdfRectangle ,_ggdb ._deb ,_gcgd (_ggdb .text (),20));for _beb ,_ffda :=range _dfg {_bf .Printf ("\u0020\u0020\u0025\u0071",_ffda ._fagfb );if _beb >=5{break ;};};if len (_dfg )> 0{_bf .Println ();};};};return _cgf 
;};func (_bdgb *stateStack )push (_ccb *textState ){_bdb :=*_ccb ;*_bdgb =append (*_bdgb ,&_bdb )};fun
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Text returns the extracted page text.
|
|
|
|
|
// Text returns the text extracted from the page.
func (pt PageText) Text() string {
	return pt._bdbf
}

// Numeric tuning constants used by the text layout code.
// Only the uses visible in this file are noted; the rest are undocumented here.
const (
	_ebbf  = 10
	_gbab  = 6
	_bcgb  = 0.5 // depth band half-width, as a multiple of font size (arrangeText, scanBand)
	_ddff  = 0.11
	_fbdg  = 0.19
	_ceba  = 0.04
	_eef   = 0.04
	_bddg  = 1.0
	_fcbe  = 0.04
	_geafa = 0.4
	_dad   = 0.7
	_ffbe  = 1.0
	_eeaff = 0.1
	_cbbd  = 1.4  // upper gap limit, as a multiple of font size (arrangeText)
	_eabb  = 0.46 // lower (negative) gap limit, as a multiple of font size (arrangeText)
	_abcf  = 0.02 // word boundary gap, as a multiple of line font size (markWordBoundaries)
	_ddea  = 0.2  // duplicate tolerance, as a multiple of font size (removeDuplicates)
	_dedcf = 0.5  // diacritic overlap fraction (inDiacriticArea)
	_egca  = 4    // minimum rune count for the trailing-hyphen test (_agdac)
	_bcbd  = 4.0
	_ceac  = 6
)

// processOperand handles a single content stream operation for image extraction.
// Inline images ("BI") and XObject invocations ("Do") are dispatched to the
// matching extract* method; every other operation is ignored.
func (ctx *imageExtractContext) processOperand(op *_gb.ContentStreamOperation, gs _gb.GraphicsState, resources *_ad.PdfPageResources) error {
	switch {
	case op.Operand == "\u0042\u0049" && len(op.Params) == 1:
		inline, isInline := op.Params[0].(*_gb.ContentStreamInlineImage)
		if !isInline {
			return nil
		}
		// Stencil masks are skipped unless the options explicitly ask for them.
		if isMask, known := _dg.GetBoolVal(inline.ImageMask); known && isMask && !ctx._bg.IncludeInlineStencilMasks {
			return nil
		}
		return ctx.extractInlineImage(inline, gs, resources)

	case op.Operand == "\u0044\u006f" && len(op.Params) == 1:
		name, isName := _dg.GetName(op.Params[0])
		if !isName {
			_bb.Log.Debug("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065")
			return _eb
		}
		_, xobjType := resources.GetXObjectByName(*name)
		switch xobjType {
		case _ad.XObjectTypeImage:
			return ctx.extractXObjectImage(name, gs, resources)
		case _ad.XObjectTypeForm:
			return ctx.extractFormImages(name, gs, resources)
		}
	}
	return nil
}

// log writes a debug dump of the table, one line per cell, prefixed by `title`.
// It does nothing unless the package debug flag _bbff is set.
func (t *textTable) log(title string) {
	if !_bbff {
		return
	}
	_bb.Log.Info("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0020 \u0020\u0020\u0020 \u00256\u002e\u0032\u0066", title, t._gffc, t._fgbe, t.PdfRectangle)
	for y := 0; y < t._fgbe; y++ {
		for x := 0; x < t._gffc; x++ {
			cell := t.get(x, y)
			_bf.Printf("\u00254\u0064 \u0025\u0032\u0064\u003a\u0020%\u0036\u002e2\u0066\u0020\u0025\u0071\u000a", x, y, cell.PdfRectangle, _gcgd(cell.text(), 50))
		}
	}
}

// getStrokeColor converts the current stroking colorspace/color of the graphics
// state into a color.Color via _adbdc.
func (to *textObject) getStrokeColor() _e.Color {
	gs := to._begd
	return _adbdc(gs.ColorspaceStroking, gs.ColorStroking)
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// TableCell is a cell in a TextTable.
|
|
|
|
|
type TableCell struct{
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Marks returns the TextMarks corresponding to the text in Text.
|
|
|
|
|
Marks TextMarkArray ;};type wordBag struct{_ad .PdfRectangle ;_deb float64 ;_ccggc float64 ;_ddd map[int ][]*textWord ;};func (_gfad *textWord )computeText ()string {_caef :=make ([]string ,len (_gfad ._efga ));for _cbeeb ,_abcda :=range _gfad ._efga {_caef [_cbeeb ]=_abcda ._ddabf ;};return _c .Join (_caef ,"");};
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
|
|
|
|
// All coordinates are in device coordinates.
|
|
|
|
|
type ImageMark struct{Image *_ad .Image ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Dimensions of the image as displayed in the PDF.
|
|
|
|
|
Width float64 ;Height float64 ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Position of the image in PDF coordinates (lower left corner).
|
|
|
|
|
X float64 ;Y float64 ;
|
|
|
|
|
|
|
|
|
|
// Angle in degrees, if rotated.
|
|
|
|
|
Angle float64 ;};func (_ffaf *textObject )moveTextSetLeading (_gbgc ,_dge float64 ){_ffaf ._ccg ._fgdg =-_dge ;_ffaf .moveTo (_gbgc ,_dge );};func (_aaggg *textLine )markWordBoundaries (){_accd :=_abcf *_aaggg ._gge ;for _dggb ,_bfdac :=range _aaggg ._ffge [1:]{if _cfee (_bfdac ,_aaggg ._ffge [_dggb ])>=_accd {_bfdac ._agbb =true ;};};};
|
|
|
|
|
|
|
|
|
|
// Tables returns the tables extracted from the page.
|
|
|
|
|
// Tables returns the tables extracted from the page.
func (pt PageText) Tables() []TextTable {
	return pt._egaa
}

// bbox returns the bounding box of the paragraph.
func (para *textPara) bbox() _ad.PdfRectangle {
	return para.PdfRectangle
}

// xNeighbours builds one "enter" event at Llx and one "leave" event at Urx for
// every paragraph and hands them to eventNeighbours.
func (paras paraList) xNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for i, para := range paras {
		events = append(events,
			event{para.Llx, true, i},
			event{para.Urx, false, i},
		)
	}
	return paras.eventNeighbours(events)
}

// toTextMarks converts the marks of the word to public TextMarks, appending
// them through _ddfb, which is also passed `offset`.
func (w *textWord) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for _, tm := range w._efga {
		marks = _ddfb(marks, offset, tm.ToTextMark())
	}
	return marks
}

// _gacb reports whether |x| is below the package tolerance TOL.
func _gacb(x float64) bool {
	return _gd.Abs(x) < TOL
}

// _caaae binds the float argument `arg` to the three-argument predicate `pred`,
// returning a two-argument predicate.
func _caaae(pred func(*wordBag, *textWord, float64) bool, arg float64) func(*wordBag, *textWord) bool {
	return func(bag *wordBag, word *textWord) bool {
		return pred(bag, word, arg)
	}
}

// _ddab returns the difference between the left edges (Llx) of `a` and `b`.
func _ddab(a, b bounded) float64 {
	return a.bbox().Llx - b.bbox().Llx
}

// _decec starts a new textLine from the first word of depth bucket `depthIdx`
// in `bag`, pulling that word out of the bag into the line.
func _decec(bag *wordBag, depthIdx int) *textLine {
	first := bag.firstWord(depthIdx)
	line := &textLine{
		PdfRectangle: first.PdfRectangle,
		_gge:         first._befa,
		_fbffd:       first._efde,
	}
	line.pullWord(bag, first, depthIdx)
	return line
}

// getDown returns the `_gbde` neighbour of every cell in the last row of the
// table. It returns nil if any neighbour is missing or has _eebca set, or if
// the neighbours are not linked to each other through `_gbac`.
func (t *textTable) getDown() paraList {
	row := make(paraList, t._gffc)
	lastY := t._fgbe - 1
	for x := range row {
		below := t.get(x, lastY)._gbde
		if below == nil || below._eebca {
			return nil
		}
		row[x] = below
	}
	for x := 0; x+1 < t._gffc; x++ {
		if row[x]._gbac != row[x+1] {
			return nil
		}
	}
	return row
}

// _bgda appends a copy of the template mark _gffa, with its Text set to
// `text`, to `marks` via _ddfb.
func _bgda(marks []TextMark, offset *int, text string) []TextMark {
	mark := _gffa
	mark.Text = text
	return _ddfb(marks, offset, mark)
}
|
|
|
|
|
|
|
|
|
|
// String returns a string describing the current state of the textState stack.
|
|
|
|
|
func (_dga *stateStack )String ()string {_gdgd :=[]string {_bf .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_dga ))};for _cfge ,_gcbd :=range *_dga {_ecca :="\u003c\u006e\u0069l\u003e";if _gcbd !=nil {_ecca =_gcbd .String ();};_gdgd =append (_gdgd ,_bf .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_cfge ,_ecca ));};return _c .Join (_gdgd ,"\u000a");};type textWord struct{_ad .PdfRectangle ;_efde float64 ;_fagfb string ;_efga []*textMark ;_befa float64 ;_agbb bool ;};func (_bad *textTable )getRight ()paraList {_cdbf :=make (paraList ,_bad ._fgbe );for _fcda :=0;_fcda < _bad ._fgbe ;_fcda ++{_aeddg :=_bad .get (_bad ._gffc -1,_fcda )._gbac ;if _aeddg ==nil ||_aeddg ._eebca {return nil ;};_cdbf [_fcda ]=_aeddg ;};for _defe :=0;_defe < _bad ._fgbe -1;_defe ++{if _cdbf [_defe ]._gbde !=_cdbf [_defe +1]{return nil ;};};return _cdbf ;};func (_cdf *textObject )getFontDict (_cagg string )(_dgg _dg .PdfObject ,_aaf error ){_egf :=_cdf ._adgb ;if _egf ==nil {_bb .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_cagg );return nil ,nil ;};_dgg ,_aeef :=_egf .GetFontByName (_dg .PdfObjectName (_cagg ));if !_aeef {_bb .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_cagg );return nil ,_d .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _dgg ,nil ;};func (_ecgb *textWord )addDiacritic (_ggef string ){_dccb :=_ecgb ._efga [len (_ecgb ._efga )-1];_dccb ._ddabf =_dccb ._ddabf +_ggef ;_dccb ._ddabf =_fd .NFKC .String (_dccb ._ddabf );};func (_abda *textObject )nextLine (){_abda .moveTo (0,-_abda 
._ccg ._fgdg )};func _affea (_cedd _ad .PdfRectangle ,_eedf []*textLine )*textPara {return &textPara {PdfRectangle :_cedd ,_cbf :_eedf };};func (_cbef *textPara )depth ()float64 {if len (_cbef ._cbf )> 0{return _cbef ._cbf [0]._fbffd ;};return _cbef ._ddfbc .get (0,0).depth ();};type bounded interface{bbox ()_ad .PdfRectangle };func (_adag *textPara )toTextMarks (_egge *int )[]TextMark {if _adag ._ddfbc ==nil {return _adag .toCellTextMarks (_egge );};var _fdca []TextMark ;for _gafb :=0;_gafb < _adag ._ddfbc ._fgbe ;_gafb ++{for _ddegd :=0;_ddegd < _adag ._ddfbc ._gffc ;_ddegd ++{_acee :=_adag ._ddfbc .get (_ddegd ,_gafb );if _acee ==nil {_fdca =_bgda (_fdca ,_egge ,"\u0009");}else {_acae :=_acee .toCellTextMarks (_egge );_fdca =append (_fdca ,_acae ...);};_fdca =_bgda (_fdca ,_egge ,"\u0020");};if _gafb < _adag ._ddfbc ._fgbe -1{_fdca =_bgda (_fdca ,_egge ,"\u000a");};};return _fdca ;};type textMark struct{_ad .PdfRectangle ;_dbda int ;_ddabf string ;_cbce string ;_gbbg *_ad .PdfFont ;_cegg float64 ;_faec float64 ;_acde _ae .Matrix ;_acb _ae .Point ;_bfagag _ad .PdfRectangle ;_adcd _e .Color ;_edc _e .Color ;};func (_edee *textLine )appendWord (_eaab *textWord ){_edee ._ffge =append (_edee ._ffge ,_eaab );_edee .PdfRectangle =_fbed (_edee .PdfRectangle ,_eaab .PdfRectangle );if _eaab ._befa > _edee ._gge {_edee ._gge =_eaab ._befa ;};if _eaab ._efde > _edee ._fbffd {_edee ._fbffd =_eaab ._efde ;};};
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
// String returns a string describing `tm`.
func (tm TextMark) String() string {
	box := tm.BBox
	var font string
	if tm.Font != nil {
		font = tm.Font.String()
		if len(font) > 50 {
			font = font[:50] + "\u002e\u002e\u002e"
		}
	}
	var meta string
	if tm.Meta {
		meta = "\u0020\u002a\u004d\u002a"
	}
	return _bf.Sprintf("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d", tm.Offset, tm.Text, []rune(tm.Text), box.Llx, box.Lly, box.Urx, box.Ury, font, meta)
}

// paraList is a sequence of textPara.
type paraList []*textPara

// inDiacriticArea reports whether `tm` lies close enough to `base` to be
// treated as its diacritic: the summed offsets of the left and right edges
// must be under _dedcf of tm's width and the offset of the lower edges under
// _dedcf of tm's height.
func (tm *textMark) inDiacriticArea(base *textMark) bool {
	dLeft := tm.Llx - base.Llx
	dRight := tm.Urx - base.Urx
	dBottom := tm.Lly - base.Lly
	return _gd.Abs(dLeft+dRight) < tm.Width()*_dedcf &&
		_gd.Abs(dBottom) < tm.Height()*_dedcf
}

// growTable repeatedly extends the table by one row and/or one column for as
// long as getDown / getRight supply a complete neighbouring row / column.
// Each round tries, in order: row and column together, row only, column only.
func (t *textTable) growTable() {
	// addRow appends `row` as a new last row.
	addRow := func(row paraList) {
		t._fgbe++
		for x, cell := range row {
			t.put(x, t._fgbe-1, cell)
		}
	}
	// addCol appends `col` as a new last column. It iterates over `col`
	// rather than over the current row count: in the combined branch below
	// addRow has already incremented the row count, so `col` is one element
	// shorter than the table is tall and indexing up to the row count would
	// run off the end of the slice. The missing corner cell is put
	// explicitly by the caller.
	addCol := func(col paraList) {
		t._gffc++
		for y, cell := range col {
			t.put(t._gffc-1, y, cell)
		}
	}

	for {
		grown := false
		down := t.getDown()
		right := t.getRight()
		if down != nil && right != nil {
			corner := down[len(down)-1]
			if corner != nil && !corner._eebca && corner == right[len(right)-1] {
				addRow(down)
				addCol(right)
				t.put(t._gffc-1, t._fgbe-1, corner)
				grown = true
			}
		}
		if !grown && down != nil {
			addRow(down)
			grown = true
		}
		if !grown && right != nil {
			addCol(right)
			grown = true
		}
		if !grown {
			break
		}
	}
}

// highestWord returns the first word in depth bucket `depthIdx` whose depth
// lies in [minDepth, maxDepth], or nil if there is none.
func (bag *wordBag) highestWord(depthIdx int, minDepth, maxDepth float64) *textWord {
	for _, word := range bag._ddd[depthIdx] {
		if minDepth <= word._efde && word._efde <= maxDepth {
			return word
		}
	}
	return nil
}

// computeBbox returns the union (via _fbed) of the bounding boxes of all cells.
func (t *textTable) computeBbox() _ad.PdfRectangle {
	bbox := t.get(0, 0).PdfRectangle
	for x := 1; x < t._gffc; x++ {
		bbox = _fbed(bbox, t.get(x, 0).PdfRectangle)
	}
	for y := 1; y < t._fgbe; y++ {
		for x := 0; x < t._gffc; x++ {
			bbox = _fbed(bbox, t.get(x, y).PdfRectangle)
		}
	}
	return bbox
}

// removeDuplicates drops words that duplicate the first word of a depth
// bucket: same text and all four bounding box edges within _ddea times the
// first word's font size. Buckets left empty are deleted from the bag.
func (bag *wordBag) removeDuplicates() {
	for _, depthIdx := range bag.depthIndexes() {
		// The bucket may have been emptied or deleted by an earlier pass.
		if len(bag._ddd[depthIdx]) == 0 {
			continue
		}
		ref := bag._ddd[depthIdx][0]
		tol := _ddea * ref._befa
		depth := ref._efde
		for _, idx := range bag.depthBand(depth, depth+tol) {
			dups := map[*textWord]struct{}{}
			words := bag._ddd[idx]
			for _, w := range words {
				if w != ref && w._fagfb == ref._fagfb &&
					_gd.Abs(w.Llx-ref.Llx) < tol &&
					_gd.Abs(w.Urx-ref.Urx) < tol &&
					_gd.Abs(w.Lly-ref.Lly) < tol &&
					_gd.Abs(w.Ury-ref.Ury) < tol {
					dups[w] = struct{}{}
				}
			}
			if len(dups) == 0 {
				continue
			}
			// Compact the bucket in place, keeping only non-duplicates.
			kept := 0
			for _, w := range words {
				if _, isDup := dups[w]; !isDup {
					words[kept] = w
					kept++
				}
			}
			bag._ddd[idx] = words[:len(words)-len(dups)]
			if len(bag._ddd[idx]) == 0 {
				delete(bag._ddd, idx)
			}
		}
	}
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct{
|
|
|
|
|
|
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
|
|
|
|
|
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// BBox is the bounding box of the text.
|
2020-08-31 21:12:07 +00:00
|
|
|
|
BBox _ad .PdfRectangle ;
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Font is the font the text was drawn with.
|
2020-08-31 21:12:07 +00:00
|
|
|
|
Font *_ad .PdfFont ;
|
2018-12-27 20:51:34 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64 ;
|
2018-11-28 18:06:03 +11:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int ;
|
|
|
|
|
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool ;
|
|
|
|
|
|
|
|
|
|
// FillColor is the fill color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-08-31 21:12:07 +00:00
|
|
|
|
FillColor _e .Color ;
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// StrokeColor is the stroke color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2020-08-31 21:12:07 +00:00
|
|
|
|
StrokeColor _e .Color ;};func (_accf *textObject )showText (_aee []byte )error {return _accf .renderText (_aee )};func _bcg (_eccd *_gb .ContentStreamOperation )(float64 ,error ){if len (_eccd .Params )!=1{_fbgc :=_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_eccd .Operand ,1,len (_eccd .Params ),_eccd .Params );return 0.0,_fbgc ;};return _dg .GetNumberAsFloat (_eccd .Params [0]);};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
|
|
|
|
// CharcodeBytesToUnicode.
|
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
|
|
|
|
// ExtractText returns the text of ExtractTextWithStats, discarding the
// statistics it reports.
func (e *Extractor) ExtractText() (string, error) {
	text, _, _, err := e.ExtractTextWithStats()
	return text, err
}

// textTable is a grid of paragraphs. Cells are stored in a map and accessed
// through get / put with (x, y) coordinates.
type textTable struct {
	_ad.PdfRectangle
	_gffc, _fgbe int // number of columns (x range) and rows (y range)
	_cgbeb       map[uint64]*textPara
}

// setFont records `size` in the text state, resolves the font called `name`
// and makes it current. The font is also stored on the state stack: pushed as
// a new entry when the stack is empty, otherwise written to the top entry.
// A nil receiver is a no-op.
func (to *textObject) setFont(name string, size float64) error {
	if to == nil {
		return nil
	}
	to._ccg._ecf = size
	font, err := to.getFont(name)
	if err != nil {
		return err
	}
	to._ccg._dafd = font
	if !to._gcfa.empty() {
		to._gcfa.top()._dafd = to._ccg._dafd
		return nil
	}
	to._gcfa.push(to._ccg)
	return nil
}

// _adge orders `a` and `b` by _efge first and, when that difference is within
// tolerance (_gacb), by their left edges (_ddab).
func _adge(a, b bounded) float64 {
	if diff := _efge(a, b); !_gacb(diff) {
		return diff
	}
	return _ddab(a, b)
}

// moveTo translates the line matrix _gadg by (tx, ty) and resets the text
// matrix _fee to it.
func (to *textObject) moveTo(tx, ty float64) {
	translation := _ae.NewMatrix(1, 0, 0, 1, tx, ty)
	to._gadg.Concat(translation)
	to._fee = to._gadg
}
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
// ToTextMark returns the public view of `tm`.
// Offset and Meta are left at their zero values.
func (tm *textMark) ToTextMark() TextMark {
	var out TextMark
	out.Text = tm._ddabf
	out.Original = tm._cbce
	out.BBox = tm._bfagag
	out.Font = tm._gbbg
	out.FontSize = tm._cegg
	out.FillColor = tm._adcd
	out.StrokeColor = tm._edc
	return out
}

// _efge returns the difference of _cefg applied to `a` and to `b`.
func _efge(a, b bounded) float64 {
	da := _cefg(a)
	db := _cefg(b)
	return da - db
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
// String returns a string describing `ma`: the element count plus the first
// and last marks, or a fixed marker for an empty array.
func (ma TextMarkArray) String() string {
	marks := ma._cfb
	count := len(marks)
	if count == 0 {
		return "\u0045\u004d\u0050T\u0059"
	}
	first, last := marks[0], marks[count-1]
	return _bf.Sprintf("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d", count, first, last)
}

// empty reports whether the bag has no bucket for depth index `depthIdx`.
func (bag *wordBag) empty(depthIdx int) bool {
	if _, present := bag._ddd[depthIdx]; present {
		return false
	}
	return true
}

// _agdac reports whether `runes` has at least _egca runes, ends in a
// unicode.Hyphen character and that hyphen is not preceded by white space.
func _agdac(runes []rune) bool {
	n := len(runes)
	if n < _egca {
		return false
	}
	if !_a.Is(_a.Hyphen, runes[n-1]) {
		return false
	}
	return !_a.IsSpace(runes[n-2])
}

// text returns the text of the paragraph as produced by writeText.
func (para *textPara) text() string {
	var buf _f.Buffer
	para.writeText(&buf)
	return buf.String()
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
|
func New (page *_ad .PdfPage )(*Extractor ,error ){_bbc ,_ab :=page .GetAllContentStreams ();if _ab !=nil {return nil ,_ab ;};_ec ,_ab :=page .GetMediaBox ();if _ab !=nil {return nil ,_bf .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_ab );};_ag :=&Extractor {_fda :_bbc ,_be :page .Resources ,_fc :*_ec ,_bbe :map[string ]fontEntry {},_df :map[string ]textResult {}};return _ag ,nil ;};func _bccf (_efbg []int )[]int {_efff :=make ([]int ,len (_efbg ));for _acbb ,_gecb :=range _efbg {_efff [len (_efbg )-1-_acbb ]=_gecb ;};return _efff ;};func (_dacd *textObject )reset (){_dacd ._fee =_ae .IdentityMatrix ();_dacd ._gadg =_ae .IdentityMatrix ();_dacd ._gfba =nil ;};func (_ffee *textTable )newTablePara ()*textPara {_egdef :=_ffee .computeBbox ();return &textPara {PdfRectangle :_egdef ,_bgfba :_egdef ,_ddfbc :_ffee };};func _dfff (_geafac ,_debcd ,_fafe ,_aeca *textPara )*textTable {_ffab :=&textTable {_gffc :2,_fgbe :2,_cgbeb :map[uint64 ]*textPara {}};_ffab .put (0,0,_geafac );_ffab .put (1,0,_debcd );_ffab .put (0,1,_fafe );_ffab .put (1,1,_aeca );return _ffab ;};type cachedImage struct{_gf *_ad .Image ;_caf _ad .PdfColorspace ;};func _cbcg (_cedc []*wordBag )[]*wordBag {if len (_cedc )<=1{return _cedc ;};if _dbce {_bb .Log .Info ("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a");};_g .Slice (_cedc ,func (_dace ,_fcbca int )bool {_aafa ,_cbba :=_cedc [_dace ],_cedc [_fcbca ];_ddg :=_aafa .Width ()*_aafa .Height ();_daff :=_cbba .Width ()*_cbba .Height ();if _ddg !=_daff {return _ddg > _daff ;};if _aafa .Height ()!=_cbba .Height (){return _aafa .Height ()> _cbba .Height ();};return _dace < _fcbca ;});var _fde []*wordBag ;_dcea :=map[int ]struct{}{};for _dcaad :=0;_dcaad < len (_cedc );_dcaad ++{if _ ,_aded :=_dcea [_dcaad ];_aded {continue ;};_fabd :=_cedc [_dcaad ];for _bfaga :=_dcaad +1;_bfaga < len 
(_cedc );_bfaga ++{if _ ,_bagg :=_dcea [_dcaad ];_bagg {continue ;};_bffgf :=_cedc [_bfaga ];_bgc :=_fabd .PdfRectangle ;_bgc .Llx -=_fabd ._deb ;if _ggc (_bgc ,_bffgf .PdfRectangle ){_fabd .absorb (_bffgf );_dcea [_bfaga ]=struct{}{};};};_fde =append (_fde ,_fabd );};if len (_cedc )!=len (_fde )+len (_dcea ){_bb .Log .Error ("\u006d\u0065\u0072\u0067\u0065\u0057o\u0072\u0064\u0042\u0061\u0067\u0073\u003a\u0020\u0025\u0064\u002d\u003e\u0025d\u0020\u0061\u0062\u0073\u006f\u0072\u0062e\u0064\u003d\u0025\u0064",len (_cedc ),len (_fde ),len (_dcea ));};return _fde ;};func (_agf *Extractor )extractPageText (_bce string ,_gcf *_ad .PdfPageResources ,_gea _ae .Matrix ,_bde int )(*PageText ,int ,int ,error ){_bb .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_bde );_efa :=&PageText {_gfab :_agf ._fc };_fgc :=_bcc (_agf ._fc );var _gad stateStack ;_cefb :=_cagb (_agf ,_gcf ,_gb .GraphicsState {},&_fgc ,&_gad );var _bda bool ;if _bde > _ee {_fcac :=_d .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_bde ,_fcac );return _efa ,_fgc ._ccffa ,_fgc ._cdc ,_fcac ;};_efc :=_gb .NewContentStreamParser (_bce );_cbc ,_dff :=_efc .Parse ();if _dff !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dff );return _efa ,_fgc ._ccffa ,_fgc ._cdc ,_dff ;};_ged :=_gb .NewContentStreamProcessor (*_cbc );_ged .AddHandler (_gb 
.HandlerConditionEnumAllOperands ,"",func (_dcg *_gb .ContentStreamOperation ,_cgd _gb .GraphicsState ,_aea *_
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `w`.
|
|
|
|
|
func (_egcc *textWord )String ()string {return _bf .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_egcc ._efde ,_egcc .PdfRectangle ,_egcc ._befa ,_egcc ._fagfb );};func (_cbee paraList )computeEBBoxes (){if _dbce {_bb .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_bbbd :=range _cbee {_bbbd ._bgfba =_bbbd .PdfRectangle ;};_bbcf :=_cbee .yNeighbours ();for _gead ,_bced :=range _cbee {_eegf :=_bced ._bgfba ;_fcdf ,_fceg :=-1.0e9,+1.0e9;for _ ,_dfde :=range _bbcf [_bced ]{_caca :=_cbee [_dfde ]._bgfba ;if _caca .Urx < _eegf .Llx {_fcdf =_gd .Max (_fcdf ,_caca .Urx );}else if _eegf .Urx < _caca .Llx {_fceg =_gd .Min (_fceg ,_caca .Llx );};};for _facg ,_bef :=range _cbee {_affe :=_bef ._bgfba ;if _gead ==_facg ||_affe .Ury > _eegf .Lly {continue ;};if _fcdf <=_affe .Llx &&_affe .Llx < _eegf .Llx {_eegf .Llx =_affe .Llx ;}else if _affe .Urx <=_fceg &&_eegf .Urx < _affe .Urx {_eegf .Urx =_affe .Urx ;};};if _dbce {_bf .Printf ("%\u0034\u0064\u003a\u0020%6\u002e2\u0066\u002d\u003e\u0025\u0036.\u0032\u0066\u0020\u0025\u0071\u000a",_gead ,_bced ._bgfba ,_eegf ,_gcgd (_bced .text (),50));};_bced ._bgfba =_eegf ;};if _eece {for _ ,_egea :=range _cbee {_egea .PdfRectangle =_egea ._bgfba ;};};};func (_aagg *wordBag )allWords ()[]*textWord {var _dcee []*textWord ;for _ ,_ccbdd :=range _aagg ._ddd {_dcee =append (_dcee ,_ccbdd ...);};return _dcee ;};func (_bbd *wordBag )sort (){for _ ,_ccbd :=range _bbd ._ddd {_g .Slice (_ccbd ,func (_gaag ,_gfeb int )bool {return _ddab (_ccbd [_gaag ],_ccbd [_gfeb ])< 0});};};func (_bcgg *textPara )toCellTextMarks (_ecgd *int )[]TextMark {var _gbeba []TextMark ;for _egbd ,_dgbf :=range _bcgg ._cbf {_fbfb :=_dgbf .toTextMarks (_ecgd );_ffbd :=_gdgdb &&_dgbf .endsInHyphen ()&&_egbd !=len (_bcgg ._cbf )-1;if _ffbd {_fbfb =_eabdb (_fbfb ,_ecgd );};_gbeba =append (_gbeba 
,_fbfb ...);if !(_ffbd ||_egbd ==len (_bcgg ._cbf )-1){_gbeba =_bgda (_gbeba ,_ecgd ,_eefe (_dgbf ._fbffd ,_bcgg ._cbf [_egbd +1]._fbffd ));};};return _gbeba ;};func (_cafae paraList )yNeighbours ()map[*textPara ][]int {_cdagb :=make ([]event ,2*len (_cafae ));for _gfg ,_gbabf :=range _cafae {_cdagb [2*_gfg ]=event {_gbabf .Lly ,true ,_gfg };_cdagb [2*_gfg +1]=event {_gbabf .Ury ,false ,_gfg };};return _cafae .eventNeighbours (_cdagb );};func (_egde *PageText )computeViews (){var _bbfgd paraList ;_gbeb :=len (_egde ._gbbc );for _deeb :=0;_deeb < 360&&_gbeb > 0;_deeb +=90{_dbae :=make ([]*textMark ,0,len (_egde ._gbbc )-_gbeb );for _ ,_ddag :=range _egde ._gbbc {if _ddag ._dbda ==_deeb {_dbae =append (_dbae ,_ddag );};};if len (_dbae )> 0{_abc :=_dgaa (_dbae ,_egde ._gfab );_bbfgd =append (_bbfgd ,_abc ...);_gbeb -=len (_dbae );};};_dcba :=new (_f .Buffer );_bbfgd .writeText (_dcba );_egde ._bdbf =_dcba .String ();_egde ._agd =_bbfgd .toTextMarks ();_egde ._egaa =_bbfgd .tables ();};func _bcc (_gce _ad .PdfRectangle )textState {return textState {_dbgb :100,_cbg :RenderModeFill ,_gga :_gce };};
|
2020-06-16 21:19:10 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `t`.
|
|
|
|
|
func (_aeea *textTable )String ()string {return _bf .Sprintf ("\u0025d\u0020\u0078\u0020\u0025\u0064",_aeea ._gffc ,_aeea ._fgbe );};const (_gdgdb =true ;_fccg =true ;_deef =true ;_eece =false ;);func (_bdfb *textLine )bbox ()_ad .PdfRectangle {return _bdfb .PdfRectangle };var _gdb =false ;func _acdac (_dfag []*textWord ,_aafdf *textWord )[]*textWord {for _efbf ,_caeea :=range _dfag {if _caeea ==_aafdf {return _gadb (_dfag ,_efbf );};};_bb .Log .Error ("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_aafdf );return nil ;};type textState struct{_eebc float64 ;_dbg float64 ;_dbgb float64 ;_fgdg float64 ;_ecf float64 ;_cbg RenderMode ;_ccff float64 ;_dafd *_ad .PdfFont ;_gga _ad .PdfRectangle ;_ccffa int ;_cdc int ;};func _bacff (_ggcdf string )bool {for _ ,_cee :=range _ggcdf {if !_a .IsSpace (_cee ){return false ;};};return true ;};func (_fgba paraList )addNeighbours (){_faca :=_fgba .yNeighbours ();for _ ,_cfde :=range _fgba {var _cbfb *textPara ;_fgea :=false ;for _ ,_gcfge :=range _faca [_cfde ]{_abfc :=_fgba [_gcfge ];if _abfc .Urx <=_cfde .Llx {if _cbfb ==nil {_cbfb =_abfc ;}else {if _abfc .Llx > _cbfb .Llx {_cbfb =_abfc ;_fgea =false ;}else if _abfc .Llx ==_cbfb .Llx {_fgea =true ;};};};};if !_fgea {_cfde ._dfe =_cbfb ;};};for _ ,_eafc :=range _fgba {var _dgebc *textPara ;_fgcae :=false ;for _ ,_dbee :=range _faca [_eafc ]{_bcbbc :=_fgba [_dbee ];if _bcbbc .Llx >=_eafc .Urx {if _dgebc ==nil {_dgebc =_bcbbc ;}else {if _bcbbc .Llx < _dgebc .Llx {_dgebc =_bcbbc ;_fgcae =false ;}else if _bcbbc .Llx ==_dgebc .Llx {_fgcae =true ;};};};};if !_fgcae {_eafc ._gbac =_dgebc ;};};_faca =_fgba .xNeighbours ();for _ ,_cbgb :=range _fgba {var _gacbd *textPara ;_eeaae :=false ;for _ ,_aeba :=range _faca [_cbgb ]{_bgcc :=_fgba [_aeba ];if _bgcc .Lly >=_cbgb .Ury {if _gacbd ==nil {_gacbd 
=_bgcc ;}else {if _bgcc .Ury < _gacbd .Ury {_gacbd =_bgcc ;_eeaae =false ;}else if _bgcc .Ury ==_gacbd .Ury {_eeaae =true ;};};};};if !_eeaae {_cbgb ._aeda =_gacbd ;};};for _ ,_ccca :=range _fgba {var _cegc *textPara ;_dabf :=false ;for _ ,_fdcg :=range _faca [_ccca ]{_gbaa :=_fgba [_fdcg ];if _gbaa .Ury <=_ccca .Lly {if _cegc ==nil {_cegc =_gbaa ;}else {if _gbaa .Ury > _cegc .Ury {_cegc =_gbaa ;_dabf =false ;}else if _gbaa .Ury ==_cegc .Ury {_dabf =true ;};};};};if !_dabf {_ccca ._gbde =_cegc ;};};};func (_aec *textObject )setWordSpacing (_aace float64 ){if _aec ==nil {return ;};_aec ._ccg ._dbg =_aace ;};func (_fag *stateStack )top ()*textState {if _fag .empty (){return nil ;};return (*_fag )[_fag .size ()-1];};func (_eba paraList )sortReadingOrder (){_bb .Log .Trace ("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_eba ));if len (_eba )<=1{return ;};_eba .computeEBBoxes ();_g .Slice (_eba ,func (_ffcd ,_cde int )bool {return _adge (_eba [_ffcd ],_eba [_cde ])<=0});_dgfa :=_eba .topoOrder ();_eba .reorder (_dgfa );};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
|
|
|
|
// and position, size information for each image.
|
|
|
|
|
// A set of options to control page image extraction can be passed in. The options
|
|
|
|
|
// parameter can be nil for the default options. By default, inline stencil masks
|
|
|
|
|
// are not extracted.
|
|
|
|
|
func (_cg *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_aef :=&imageExtractContext {_bg :options };_eg :=_aef .extractContentStreamImages (_cg ._fda ,_cg ._be );if _eg !=nil {return nil ,_eg ;};return &PageImages {Images :_aef ._dc },nil ;};func (_acdc *wordBag )depthBand (_edf ,_agb float64 )[]int {if len (_acdc ._ddd )==0{return nil ;};return _acdc .depthRange (_acdc .getDepthIdx (_edf ),_acdc .getDepthIdx (_agb ));};func (_abde *textObject )getFontDirect (_baac string )(*_ad .PdfFont ,error ){_bag ,_bcaf :=_abde .getFontDict (_baac );if _bcaf !=nil {return nil ,_bcaf ;};_febe ,_bcaf :=_ad .NewPdfFontFromPdfObject (_bag );if _bcaf !=nil {_bb .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_baac ,_bcaf );};return _febe ,_bcaf ;};type textPara struct{_ad .PdfRectangle ;_bgfba _ad .PdfRectangle ;_cbf []*textLine ;_ddfbc *textTable ;_eebca bool ;_dfe *textPara ;_gbac *textPara ;_aeda *textPara ;_gbde *textPara ;};func (_efed *textPara )isAtom ()*textTable {_gcda :=_efed ;_ccfa :=_efed ._gbac ;_adcb :=_efed ._gbde ;if !(_ccfa !=nil &&!_ccfa ._eebca &&_adcb !=nil &&!_adcb ._eebca ){return nil ;};_dfeg :=_ccfa ._gbde ;if !(_dfeg !=nil &&!_dfeg ._eebca &&_dfeg ==_adcb ._gbac ){return nil ;};if _ccfa ._dfe !=_gcda ||_adcb ._aeda !=_gcda ||_dfeg ._dfe !=_adcb ||_dfeg ._aeda !=_ccfa {return nil ;};return _dfff (_gcda ,_ccfa ,_adcb ,_dfeg );};func _gafd (_fdfgc string )(string ,bool ){_dggd :=[]rune (_fdfgc );if len (_dggd )!=1{return "",false ;};_fcde ,_gaga :=_dacb [_dggd [0]];return _fcde ,_gaga ;};func (_bgfb *textLine )text ()string {var _fcbf []string ;for _ ,_ddge :=range _bgfb ._ffge {if _ddge ._agbb {_fcbf =append 
(_fcbf ,"\u0020");};_fcbf =append (_fcbf ,_ddge ._fagfb );};return _c .Join (_fcbf ,"");};func (_bfdg *textPara )fontsize ()float64 {return _bfdg ._cbf [0]._gge };type imageExtractContext struct{_dc []ImageMark ;_ed int ;_ca int ;_ce int ;_dcb map[*_dg .PdfObjectStream ]*cachedImage ;_bg *ImageExtractOptions ;};func (_bcf *textObject )renderText (_dega []byte )error {if _bcf ._dec {_bb .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");return nil ;};_affd :=_bcf .getCurrentFont ();_egb :=_affd .BytesToCharcodes (_dega );_gbc ,_ded ,_efg :=_affd .CharcodesToStrings (_egb );if _efg > 0{_bb .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_ded ,_efg );};_bcf ._ccg ._ccffa +=_ded ;_bcf ._ccg ._cdc +=_efg ;_fdbg :=_bcf ._ccg ;_cba :=_fdbg ._ecf ;_fagd :=_fdbg ._dbgb /100.0;_gddg ,_ega :=_affd .GetRuneMetrics (' ');if !_ega {_gddg ,_ega =_affd .GetCharMetrics (32);};if !_ega {_gddg ,_ =_ad .DefaultFont ().GetRuneMetrics (' ');};_cgag :=_gddg .Wx *_fbf ;_bb .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_cgag ,_gbc ,_affd ,_cba );_fcca :=_ae .NewMatrix (_cba *_fagd ,0,0,_cba ,0,_fdbg ._ccff );if _bdeg {_bb .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_egb ),_egb ,_gbc );};_bb .Log .Trace 
("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_egb ),_egb
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
type Extractor struct {
	// _fda and _be are the pair handed to extractContentStreamImages by
	// ExtractPageImages (presumably the page content stream and its resources).
	_fda string
	_be  *_ad.PdfPageResources

	_fc _ad.PdfRectangle

	// NOTE(review): the two maps look like per-extractor caches keyed by
	// name; confirm against their users, which are not visible here.
	_bbe map[string]fontEntry
	_df  map[string]textResult

	_gde int64
	_cf  int
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
// PageText represents the layout of text on a device page.
type PageText struct {
	_gbbc []*textMark      // source marks; filtered by ApplyArea
	_bdbf string           // extracted text, rebuilt by ApplyArea
	_agd  []TextMark       // text marks matching the text, rebuilt by ApplyArea
	_egaa []TextTable      // tables, rebuilt by ApplyArea
	_gfab _ad.PdfRectangle // rectangle passed to the paragraph builder by ApplyArea
}

// _gffa is a meta TextMark with text "[X]", original text " " and white fill
// and stroke colors.
var _gffa = TextMark{
	Text:        "\u005b\u0058\u005d",
	Original:    "\u0020",
	Meta:        true,
	FillColor:   _e.White,
	StrokeColor: _e.White,
}

// _ccgg returns the translation matrix that moves by (`_ebe`.X, `_ebe`.Y).
func _ccgg(_ebe _ae.Point) _ae.Matrix { return _ae.TranslationMatrix(_ebe.X, _ebe.Y) }

// _cefg returns the negated lower y coordinate of the bounding box of `_egce`.
func _cefg(_egce bounded) float64 { return -_egce.bbox().Lly }

// _dbfg returns `_efegg` with its last rune removed. The empty string is
// returned unchanged instead of panicking on an out-of-range slice.
func _dbfg(_efegg string) string {
	runes := []rune(_efegg)
	if len(runes) == 0 {
		return ""
	}
	return string(runes[:len(runes)-1])
}

// pullWord appends `_dgcg` to the line and removes it from bin `_ggag` of
// the word bag `_bgb`.
func (_bbce *textLine) pullWord(_bgb *wordBag, _dgcg *textWord, _ggag int) {
	_bbce.appendWord(_dgcg)
	_bgb.removeWord(_dgcg, _ggag)
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
func (_ebd *TextMarkArray )Append (mark TextMark ){_ebd ._cfb =append (_ebd ._cfb ,mark )};func _acdf (_fdgb []*textMark ,_cbfg _ad .PdfRectangle )*textWord {_ccbg :=_fdgb [0].PdfRectangle ;_gedd :=_fdgb [0]._cegg ;for _ ,_febgf :=range _fdgb [1:]{_ccbg =_fbed (_ccbg ,_febgf .PdfRectangle );if _febgf ._cegg > _gedd {_gedd =_febgf ._cegg ;};};return &textWord {PdfRectangle :_ccbg ,_efga :_fdgb ,_efde :_cbfg .Ury -_ccbg .Lly ,_befa :_gedd };};func (_aagf *textObject )showTextAdjusted (_cbde *_dg .PdfObjectArray )error {_bbcg :=false ;for _ ,_bcag :=range _cbde .Elements (){switch _bcag .(type ){case *_dg .PdfObjectFloat ,*_dg .PdfObjectInteger :_gff ,_fdb :=_dg .GetNumberAsFloat (_bcag );if _fdb !=nil {_bb .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_bcag ,_cbde );return _fdb ;};_cgb ,_fdf :=-_gff *0.001*_aagf ._ccg ._ecf ,0.0;if _bbcg {_fdf ,_cgb =_cgb ,_fdf ;};_eega :=_ccgg (_ae .Point {X :_cgb ,Y :_fdf });_aagf ._fee .Concat (_eega );case *_dg .PdfObjectString :_cag ,_ebc :=_dg .GetStringBytes (_bcag );if !_ebc {_bb .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_bcag ,_cbde );return _dg .ErrTypeError ;};_aagf .renderText (_cag );default:_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_bcag ,_cbde 
);return _dg .ErrTypeError ;};};return nil ;};func (_fcea *wordBag )absorb (_ddegc *wordBag ){for _bdgfa ,_fbad :=range _ddegc ._ddd {for _ ,_ffafc :=range _fbad {_fcea .pullWord (_ddegc ,_ffafc ,_bdgfa );};};};func _eabdb (_cgbc []TextMark ,_cfea *int )[]TextMark {_efdc :=_cgbc [len (_cgbc )-1];_caee :=[]rune (_efdc .Text );if len (_caee )==1{_cgbc =_cgbc [:len (_cgbc )-1];_eeggb :=_cgbc [len (_cgbc )-1];*_cfea =_eeggb .Offset +len (_eeggb .Text );}else {_dbec :=_dbfg (_efdc .Text );*_cfea +=len (_dbec )-len (_efdc .Text );_efdc .Text =_dbec ;};return _cgbc ;};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `b`.
|
|
|
|
|
func (_dfd *wordBag )String ()string {var _dacg []string ;for _ ,_deebd :=range _dfd .depthIndexes (){_dde ,_ :=_dfd ._ddd [_deebd ];for _ ,_cdfa :=range _dde {_dacg =append (_dacg ,_cdfa ._fagfb );};};return _bf .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_dfd .PdfRectangle ,_dfd ._deb ,len (_dacg ),_dacg );};type event struct{_egcb float64 ;_baggg bool ;_eac int ;};func (_face paraList )applyTables (_dgdc []*textTable )paraList {_eegeb :=map[*textPara ]struct{}{};var _gedcc paraList ;for _ ,_fbb :=range _dgdc {for _ ,_affa :=range _fbb ._cgbeb {_eegeb [_affa ]=struct{}{};};_gedcc =append (_gedcc ,_fbb .newTablePara ());};for _ ,_affc :=range _face {if _ ,_bdbdd :=_eegeb [_affc ];!_bdbdd {_gedcc =append (_gedcc ,_affc );};};return _gedcc ;};func _dgaa (_gfee []*textMark ,_abgg _ad .PdfRectangle )paraList {_bb .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_gfee ),_abgg );if len (_gfee )==0{return nil ;};_cacb :=_faabf (_gfee ,_abgg );if len (_cacb )==0{return nil ;};_cgfb :=_ggf (_cacb ,_abgg .Ury );_dfgc :=_fef (_cgfb ,_abgg .Ury );_dfgc =_cbcg (_dfgc );_bbeb :=make (paraList ,0,len (_dfgc ));for _ ,_dcc :=range _dfgc {_gedc :=_dcc .arrangeText ();if _gedc !=nil {_bbeb =append (_bbeb ,_gedc );};};if len (_bbeb )>=_ceac {_bbeb =_bbeb .extractTables ();};_bbeb .sortReadingOrder ();_bbeb .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _bbeb ;};func _edaa (_daegb []_dg .PdfObject )(_gddd ,_aecg float64 ,_fdefg error ){if len (_daegb )!=2{return 0,0,_bf .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 
\u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_daegb ));};_fbagb ,_fdefg :=_dg .GetNumbersAsFloat (_daegb );if _fdefg !=nil {return 0,0,_fdefg ;};return _fbagb [0],_fbagb [1],nil ;};func (_bgad paraList )extractTables ()paraList {if _bbff {_bb .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_bgad ));};if len (_bgad )< _ceac {return _bgad ;};_abdaa :=_bgad .findTables ();if _bbff {_bb .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_abdaa ));for _cabc ,_degad :=range _abdaa {_degad .log (_bf .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_cabc ));};};return _bgad .applyTables (_abdaa );};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but also returns the number of
// characters in the output and the number of characters that were not
// decoded. On error the text is empty and the counts reported by
// ExtractPageText are passed through together with the error.
func (_baa *Extractor) ExtractTextWithStats() (_fdc string, _dag int, _acd int, _fe error) {
	pageText, _dag, _acd, _fe := _baa.ExtractPageText()
	if _fe == nil {
		return pageText.Text(), _dag, _acd, nil
	}
	return "", _dag, _acd, _fe
}

// textObject carries the state used while the text operators of a content
// stream are processed.
type textObject struct {
	_fcd  *Extractor
	_adgb *_ad.PdfPageResources
	_begd _gb.GraphicsState // supplies the non-stroking color for getFillColor
	_ccg  *textState        // current text state
	_gcfa *stateStack
	_fee  _ae.Matrix // matrix advanced by showTextAdjusted
	_gadg _ae.Matrix
	_gfba []*textMark
	_dec  bool // when set, renderText logs "Invalid font" and does nothing
}

// pop removes the last element of the stack and returns a pointer to a copy
// of it, or nil if the stack is empty.
func (_efd *stateStack) pop() *textState {
	if _efd.empty() {
		return nil
	}
	stack := *_efd
	last := len(stack) - 1
	popped := *stack[last]
	*_efd = stack[:last]
	return &popped
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
|
|
|
|
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
|
|
|
|
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
|
|
|
|
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode).
type RenderMode int

// _geac compares `_fcfc` and `_gfcb` with `_ddab`; when `_gacb` accepts that
// result, the value of `_efge` is returned instead.
func _geac(_fcfc, _gfcb bounded) float64 {
	primary := _ddab(_fcfc, _gfcb)
	if _gacb(primary) {
		return _efge(_fcfc, _gfcb)
	}
	return primary
}

// _fdd reports whether the y ranges [Lly, Ury] of the two rectangles overlap
// (touching edges count as overlapping).
func _fdd(_dbca, _cea _ad.PdfRectangle) bool {
	if !(_dbca.Lly <= _cea.Ury) {
		return false
	}
	return _cea.Lly <= _dbca.Ury
}
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// String returns a description of `p`.
|
|
|
|
|
func (_faab *textPara )String ()string {_cebf :="";if _faab ._ddfbc !=nil {_cebf =_bf .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_faab ._ddfbc ._gffc ,_faab ._ddfbc ._fgbe );};return _bf .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_faab .PdfRectangle ,_cebf ,len (_faab ._cbf ),_gcgd (_faab .text (),50));};func (_gbdgd paraList )reorder (_afce []int ){_bdad :=make (paraList ,len (_gbdgd ));for _cafe ,_adcee :=range _afce {_bdad [_cafe ]=_gbdgd [_adcee ];};copy (_gbdgd ,_bdad );};func (_eagf paraList )readBefore (_efaad []int ,_gcc ,_abdc int )bool {_gade ,_ccgf :=_eagf [_gcc ],_eagf [_abdc ];if _afef (_gade ,_ccgf )&&_gade .Lly > _ccgf .Lly {return true ;};if !(_gade ._bgfba .Urx < _ccgf ._bgfba .Llx ){return false ;};_dcggg ,_adgea :=_gade .Lly ,_ccgf .Lly ;if _dcggg > _adgea {_adgea ,_dcggg =_dcggg ,_adgea ;};_bdae :=_gd .Max (_gade ._bgfba .Llx ,_ccgf ._bgfba .Llx );_ddfc :=_gd .Min (_gade ._bgfba .Urx ,_ccgf ._bgfba .Urx );_bedb :=_eagf .llyRange (_efaad ,_dcggg ,_adgea );for _ ,_gfd :=range _bedb {if _gfd ==_gcc ||_gfd ==_abdc {continue ;};_ccgbg :=_eagf [_gfd ];if _ccgbg ._bgfba .Llx <=_ddfc &&_bdae <=_ccgbg ._bgfba .Urx {return false ;};};return true ;};func (_aega *wordBag )text ()string {_fbga :=_aega .allWords ();_degb :=make ([]string ,len (_fbga ));for _deag ,_aafd :=range _fbga {_degb [_deag ]=_aafd ._fagfb ;};return _c .Join (_degb ,"\u0020");};var (_dacb =map[rune ]string 
{0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};);func _abge (_effe ,_feag int )int {if _effe < _feag {return _effe ;};return _feag ;};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
func (_bbfc *TextMarkArray )BBox ()(_ad .PdfRectangle ,bool ){var _baad _ad .PdfRectangle ;_fga :=false ;for _ ,_bcfb :=range _bbfc ._cfb {if _bcfb .Meta ||_bacff (_bcfb .Text ){continue ;};if _fga {_baad =_fbed (_baad ,_bcfb .BBox );}else {_baad =_bcfb .BBox ;_fga =true ;};};return _baad ,_fga ;};func (_cd *textObject )checkOp (_eegg *_gb .ContentStreamOperation ,_fgdc int ,_ccc bool )(_fgec bool ,_egdf error ){if _cd ==nil {var _bbfg []_dg .PdfObject ;if _fgdc > 0{_bbfg =_eegg .Params ;if len (_bbfg )> _fgdc {_bbfg =_bbfg [:_fgdc ];};};_bb .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_eegg .Operand ,_bbfg );};if _fgdc >=0{if len (_eegg .Params )!=_fgdc {if _ccc {_egdf =_d .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_bb .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_eegg .Operand ,_fgdc ,len (_eegg .Params ),_eegg .Params );return false ,_egdf ;};};return true ,nil ;};const TOL =1.0e-6;func (_dgcgd *textWord )appendMark (_cdaf *textMark ,_aefc _ad .PdfRectangle ){_dgcgd ._efga =append (_dgcgd ._efga ,_cdaf );_dgcgd .PdfRectangle =_fbed (_dgcgd .PdfRectangle ,_cdaf .PdfRectangle );if _cdaf ._cegg > _dgcgd ._befa {_dgcgd ._befa =_cdaf ._cegg ;};_dgcgd ._efde =_aefc .Ury -_dgcgd .PdfRectangle .Lly ;};func (_gcaa *textTable )markCells (){for _cgcg :=0;_cgcg < _gcaa ._fgbe ;_cgcg ++{for _dbeg :=0;_dbeg < _gcaa ._gffc ;_dbeg ++{_bcbc :=_gcaa .get (_dbeg ,_cgcg );_bcbc ._eebca =true ;};};};func (_abff paraList )toTextMarks ()[]TextMark {_daef :=0;var _adgeb []TextMark ;for _gegb ,_eaf :=range _abff {_gefa 
:=_eaf .toTextMarks (&_daef );_adgeb =append (_adgeb ,_gefa ...);if _gegb !=len (_abff )-1{if _bdba (_eaf ,_abff [_gegb +1]){_adgeb =_bgda (_adgeb ,&_daef ,"\u0020");}else {_adgeb =_bgda (_adgeb ,&_daef ,"\u000a");_adgeb =_bgda (_adgeb ,&_daef ,"\u000a");};};};_adgeb =_bgda (_adgeb ,&_daef ,"\u000a");_adgeb =_bgda (_adgeb ,&_daef ,"\u000a");return _adgeb ;};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
|
|
|
|
// These are tm: `start` < tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
|
|
|
|
// `start` and `end` are offsets in the extracted text.
|
|
|
|
|
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
|
|
|
|
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
|
|
|
|
func (_fec *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _fec ==nil {return nil ,_d .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_bf .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );};_bba :=len (_fec ._cfb );if _bba ==0{return _fec ,nil ;};if start < _fec ._cfb [0].Offset {start =_fec ._cfb [0].Offset ;};if end > _fec ._cfb [_bba -1].Offset +1{end =_fec ._cfb [_bba -1].Offset +1;};_bfea :=_g .Search (_bba ,func (_ebda int )bool {return _fec ._cfb [_ebda ].Offset +len (_fec ._cfb [_ebda ].Text )-1>=start });if !(0<=_bfea &&_bfea < _bba ){_bcd :=_bf .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_bfea ,_bba ,_fec ._cfb [0],_fec ._cfb [_bba -1]);return nil ,_bcd ;};_cdb :=_g .Search (_bba ,func (_agg int )bool {return _fec ._cfb [_agg ].Offset > end -1});if !(0<=_cdb &&_cdb < _bba ){_add :=_bf .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_cdb ,_bba ,_fec ._cfb [0],_fec ._cfb [_bba -1]);return nil ,_add ;};if _cdb <=_bfea {return nil ,_bf .Errorf 
("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_bfea ,_cdb );};return &TextMarkArray {_cfb :_fec ._cfb [_bfea :_cdb ]},nil ;};func (_cdbe *wordBag )depthIndexes ()[]int {if len (_cdbe ._ddd )==0{return nil ;};_ffe :=make ([]int ,len (_cdbe ._ddd ));_acff :=0;for _bac :=range _cdbe ._ddd {_ffe [_acff ]=_bac ;_acff ++;};_g .Ints (_ffe );return _ffe ;};func (_bdgda *textPara )writeCellText (_egdd _bc .Writer ){for _fbae ,_aeaa :=range _bdgda ._cbf {_feef :=_aeaa .text ();_aegf :=_gdgdb &&_aeaa .endsInHyphen ()&&_fbae !=len (_bdgda ._cbf )-1;if _aegf {_feef =_dbfg (_feef );};_egdd .Write ([]byte (_feef ));if !(_aegf ||_fbae ==len (_bdgda ._cbf )-1){_egdd .Write ([]byte (_eefe (_aeaa ._fbffd ,_bdgda ._cbf [_fbae +1]._fbffd )));};};};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
|
|
|
|
|
// Each time ApplyArea is called, it updates the result set in `pt`.
|
|
|
|
|
// Can be called multiple times in a row with different bounding boxes.
|
2020-08-31 21:12:07 +00:00
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
// Each call rebuilds the text, the text marks and the tables of the receiver
// from the marks selected by `_fea` for `bbox`, so it can be called
// repeatedly with different bounding boxes.
func (_eada *PageText) ApplyArea(bbox _ad.PdfRectangle) {
	selected := make([]*textMark, 0, len(_eada._gbbc))
	for _, mark := range _eada._gbbc {
		if _fea(mark.bbox(), bbox) {
			selected = append(selected, mark)
		}
	}

	// Build paragraphs separately for each `_dbda` value 0, 90, 180 and 270,
	// stopping early once every selected mark has been consumed.
	var paras paraList
	remaining := len(selected)
	for orient := 0; orient < 360 && remaining > 0; orient += 90 {
		// At most `remaining` marks can still match, so use that as the
		// capacity hint (the previous hint was 0 on the first pass).
		group := make([]*textMark, 0, remaining)
		for _, mark := range selected {
			if mark._dbda == orient {
				group = append(group, mark)
			}
		}
		if len(group) > 0 {
			paras = append(paras, _dgaa(group, _eada._gfab)...)
			remaining -= len(group)
		}
	}

	buf := new(_f.Buffer)
	paras.writeText(buf)
	_eada._bdbf = buf.String()
	_eada._agd = paras.toTextMarks()
	_eada._egaa = paras.tables()
}
|