2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Copyright 2020 FoxyUtils ehf. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
// This is a commercial product and requires a license to operate.
|
|
|
|
|
// A trial license can be obtained at https://unidoc.io
|
|
|
|
|
//
|
|
|
|
|
// DO NOT EDIT: generated by unitwist Go source code obfuscator.
|
|
|
|
|
//
|
|
|
|
|
// Use of this source code is governed by the UniDoc End User License Agreement
|
|
|
|
|
// terms that can be accessed at https://unidoc.io/eula/
|
2018-03-22 14:03:47 +00:00
|
|
|
|
|
2020-08-27 21:45:09 +00:00
|
|
|
|
//
|
|
|
|
|
// Package extractor is used for quickly extracting PDF content through a simple interface.
|
|
|
|
|
// Currently offers functionality for extracting textual content.
|
|
|
|
|
//
|
2021-05-11 00:01:27 +00:00
|
|
|
|
package extractor ;import (_ce "bytes";_dd "errors";_ae "fmt";_gd "github.com/unidoc/unipdf/v3/common";_de "github.com/unidoc/unipdf/v3/contentstream";_eb "github.com/unidoc/unipdf/v3/core";_fe "github.com/unidoc/unipdf/v3/internal/license";_daf "github.com/unidoc/unipdf/v3/internal/textencoding";
|
|
|
|
|
_ff "github.com/unidoc/unipdf/v3/internal/transform";_bc "github.com/unidoc/unipdf/v3/model";_af "golang.org/x/text/unicode/norm";_dc "golang.org/x/xerrors";_fd "image/color";_e "io";_f "math";_g "regexp";_b "sort";_a "strings";_c "unicode";_da "unicode/utf8";
|
|
|
|
|
)

// quadraticTo logs its own name when _egfa is set and then passes
// (_gaac, _bcd) to addPoint. The first two arguments (_ddcd, _fbgc) are
// accepted but not used.
func (_bff *shapesState) quadraticTo(_ddcd, _fbgc, _gaac, _bcd float64) {
	if _egfa {
		_gd.Log.Info("\u0071\u0075\u0061d\u0072\u0061\u0074\u0069\u0063\u0054\u006f\u003a")
	}
	_bff.addPoint(_gaac, _bcd)
}

// _fddf is compiled once at package scope. The escaped literal decodes to
// the pattern `^\s*(\d+\.?|[Iiv]+)\s*\)?$`.
var _fddf = _g.MustCompile("\u005e\u005c\u0073\u002a\u0028\u005c\u0064\u002b\u005c\u002e\u003f|\u005b\u0049\u0069\u0076\u005d\u002b\u0029\u005c\u0073\u002a\\\u0029\u003f\u0024")
|
|
|
|
|
func _cgdg (_bfgg func (*wordBag ,*textWord ,float64 )bool ,_ccce float64 )func (*wordBag ,*textWord )bool {return func (_gced *wordBag ,_abc *textWord )bool {return _bfgg (_gced ,_abc ,_ccce )};};type subpath struct{_cfac []_ff .Point ;_abe bool ;};func (_dacd *wordBag )pullWord (_bfcf *textWord ,_acae int ,_gfd map[int ]map[*textWord ]struct{}){_dacd .PdfRectangle =_deb (_dacd .PdfRectangle ,_bfcf .PdfRectangle );
|
|
|
|
|
if _bfcf ._edega > _dacd ._cfec {_dacd ._cfec =_bfcf ._edega ;};_dacd ._aaaf [_acae ]=append (_dacd ._aaaf [_acae ],_bfcf );_gfd [_acae ][_bfcf ]=struct{}{};};func (_becg *wordBag )minDepth ()float64 {return _becg ._agadg -(_becg .Ury -_becg ._cfec )};
|
2020-08-27 21:45:09 +00:00
|
|
|
|
|
2021-04-23 20:28:14 +00:00
|
|
|
|
|
|
|
|
|
// String returns a description of `p`.
|
2021-05-11 00:01:27 +00:00
|
|
|
|
func (_dcfe *textPara )String ()string {if _dcfe ._fbed {return _ae .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u005b\u0045\u004d\u0050\u0054\u0059\u005d",_dcfe .PdfRectangle );};_fced :="";if _dcfe ._egea !=nil {_fced =_ae .Sprintf ("\u005b\u0025\u0064\u0078\u0025\u0064\u005d\u0020",_dcfe ._egea ._cacec ,_dcfe ._egea ._bfbba );
|
|
|
|
|
};return _ae .Sprintf ("\u0025\u0036\u002e\u0032f \u0025\u0073\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u0020\u0025\u0071",_dcfe .PdfRectangle ,_fced ,len (_dcfe ._bcca ),_eaed (_dcfe .text (),50));};
|
2021-03-23 23:12:52 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
|
|
|
|
func (_gcda *TextMarkArray )Elements ()[]TextMark {return _gcda ._fad };func _gaaae (_cfgg []TextMark ,_eedaa *int ,_eaff TextMark )[]TextMark {_eaff .Offset =*_eedaa ;_cfgg =append (_cfgg ,_eaff );*_eedaa +=len (_eaff .Text );return _cfgg ;};func (_adb *textObject )showTextAdjusted (_dcce *_eb .PdfObjectArray )error {_ffb :=false ;
|
|
|
|
|
for _ ,_ecg :=range _dcce .Elements (){switch _ecg .(type ){case *_eb .PdfObjectFloat ,*_eb .PdfObjectInteger :_cad ,_efcg :=_eb .GetNumberAsFloat (_ecg );if _efcg !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004fR\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078t\u0041\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0042\u0061\u0064\u0020\u006e\u0075\u006d\u0065r\u0069\u0063\u0061\u006c\u0020a\u0072\u0067\u002e\u0020\u006f\u003d\u0025\u0073\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ecg ,_dcce );
|
|
|
|
|
return _efcg ;};_gbee ,_cgd :=-_cad *0.001*_adb ._efb ._abfe ,0.0;if _ffb {_cgd ,_gbee =_gbee ,_cgd ;};_cgbd :=_gcac (_ff .Point {X :_gbee ,Y :_cgd });_adb ._cdf .Concat (_cgbd );case *_eb .PdfObjectString :_fda ,_fcgd :=_eb .GetStringBytes (_ecg );if !_fcgd {_gd .Log .Trace ("s\u0068\u006f\u0077\u0054\u0065\u0078\u0074\u0041\u0064j\u0075\u0073\u0074\u0065\u0064\u003a\u0020Ba\u0064\u0020\u0073\u0074r\u0069\u006e\u0067\u0020\u0061\u0072\u0067\u002e\u0020o=\u0025\u0073 \u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ecg ,_dcce );
|
|
|
|
|
return _eb .ErrTypeError ;};_adb .renderText (_fda );default:_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0073\u0068\u006f\u0077\u0054\u0065\u0078\u0074A\u0064\u006a\u0075\u0073\u0074\u0065\u0064\u002e\u0020\u0055\u006e\u0065\u0078p\u0065\u0063\u0074\u0065\u0064\u0020\u0074\u0079\u0070\u0065\u0020\u0028%T\u0029\u0020\u0061\u0072\u0067\u0073\u003d\u0025\u002b\u0076",_ecg ,_dcce );
|
|
|
|
|
return _eb .ErrTypeError ;};};return nil ;};
|
2021-03-23 23:12:52 +00:00
|
|
|
|
|
2021-04-23 20:28:14 +00:00
|
|
|
|
// String returns a human readable description of `path`.
|
2021-05-11 00:01:27 +00:00
|
|
|
|
func (_feec *subpath )String ()string {_ddebg :=_feec ._cfac ;_fbbf :=len (_ddebg );if _fbbf <=5{return _ae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036\u002e\u0032f",_fbbf ,_ddebg );};return _ae .Sprintf ("\u0025d\u003a\u0020\u0025\u0036.\u0032\u0066\u0020\u0025\u0036.\u0032f\u0020.\u002e\u002e\u0020\u0025\u0036\u002e\u0032f",_fbbf ,_ddebg [0],_ddebg [1],_ddebg [_fbbf -1]);
|
|
|
|
|
};func (_aaeaf paraList )yNeighbours (_gccb float64 )map[*textPara ][]int {_dbdg :=make ([]event ,2*len (_aaeaf ));if _gccb ==0{for _ffdcg ,_afddd :=range _aaeaf {_dbdg [2*_ffdcg ]=event {_afddd .Lly ,true ,_ffdcg };_dbdg [2*_ffdcg +1]=event {_afddd .Ury ,false ,_ffdcg };
|
|
|
|
|
};}else {for _feccd ,_caea :=range _aaeaf {_dbdg [2*_feccd ]=event {_caea .Lly -_gccb *_caea .fontsize (),true ,_feccd };_dbdg [2*_feccd +1]=event {_caea .Ury +_gccb *_caea .fontsize (),false ,_feccd };};};return _aaeaf .eventNeighbours (_dbdg );};func (_ecgg *textWord )addDiacritic (_efbd string ){_dggfa :=_ecgg ._eeefd [len (_ecgg ._eeefd )-1];
|
|
|
|
|
_dggfa ._ecfa +=_efbd ;_dggfa ._ecfa =_af .NFKC .String (_dggfa ._ecfa );};func _agbd (_cdfcf []pathSection ){if _ebdf < 0.0{return ;};if _acea {_gd .Log .Info ("\u0067\u0072\u0061\u006e\u0075\u006c\u0061\u0072\u0069\u007a\u0065\u003a\u0020\u0025\u0064 \u0073u\u0062\u0070\u0061\u0074\u0068\u0020\u0073\u0065\u0063\u0074\u0069\u006f\u006e\u0073",len (_cdfcf ));
|
|
|
|
|
};for _cgda ,_cgdda :=range _cdfcf {for _gacfc ,_ggegf :=range _cgdda ._bbd {for _ddgc ,_ffgeg :=range _ggegf ._cfac {_ggegf ._cfac [_ddgc ]=_ff .Point {X :_fbgca (_ffgeg .X ),Y :_fbgca (_ffgeg .Y )};if _acea {_dggf :=_ggegf ._cfac [_ddgc ];if !_fgbd (_ffgeg ,_dggf ){_egdb :=_ff .Point {X :_dggf .X -_ffgeg .X ,Y :_dggf .Y -_ffgeg .Y };
|
|
|
|
|
_ae .Printf ("\u0025\u0034d \u002d\u0020\u00254\u0064\u0020\u002d\u0020%4d\u003a %\u002e\u0032\u0066\u0020\u2192\u0020\u0025.2\u0066\u0020\u0028\u0025\u0067\u0029\u000a",_cgda ,_gacfc ,_ddgc ,_ffgeg ,_dggf ,_egdb );};};};};};};func _bbfce (_cecg ,_cceaea int )int {if _cecg > _cceaea {return _cecg ;
|
|
|
|
|
};return _cceaea ;};func (_ebb *stateStack )top ()*textState {if _ebb .empty (){return nil ;};return (*_ebb )[_ebb .size ()-1];};func (_gfef *wordBag )blocked (_cfbc *textWord )bool {if _cfbc .Urx < _gfef .Llx {_ffgd :=_eaaa (_cfbc .PdfRectangle );_gbage :=_caag (_gfef .PdfRectangle );
|
|
|
|
|
if _gfef ._fedd .blocks (_ffgd ,_gbage ){if _eagfda {_gd .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0078\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_cfbc ,_gfef );};return true ;};}else if _gfef .Urx < _cfbc .Llx {_acdf :=_eaaa (_gfef .PdfRectangle );
|
|
|
|
|
_afg :=_caag (_cfbc .PdfRectangle );if _gfef ._fedd .blocks (_acdf ,_afg ){if _eagfda {_gd .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0078\u2192\u0020:\u0020\u0025\u0073\u0020%s",_cfbc ,_gfef );};return true ;};};if _cfbc .Ury < _gfef .Lly {_ccg :=_aedac (_cfbc .PdfRectangle );
|
|
|
|
|
_dedg :=_ebag (_gfef .PdfRectangle );if _gfef ._gec .blocks (_ccg ,_dedg ){if _eagfda {_gd .Log .Info ("\u0062\u006c\u006f\u0063ke\u0064\u0020\u2190\u0079\u003a\u0020\u0025\u0073\u0020\u0025\u0073",_cfbc ,_gfef );};return true ;};}else if _gfef .Ury < _cfbc .Lly {_ecdb :=_aedac (_gfef .PdfRectangle );
|
|
|
|
|
_cccf :=_ebag (_cfbc .PdfRectangle );if _gfef ._gec .blocks (_ecdb ,_cccf ){if _eagfda {_gd .Log .Info ("b\u006co\u0063\u006b\u0065\u0064\u0020\u0079\u2192\u0020:\u0020\u0025\u0073\u0020%s",_cfbc ,_gfef );};return true ;};};return false ;};
|
2021-03-23 23:12:52 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a string descibing `i`.
|
|
|
|
|
func (_fdag gridTile )String ()string {_abeba :=func (_egcbf bool ,_bffc string )string {if _egcbf {return _bffc ;};return "\u005f";};return _ae .Sprintf ("\u00256\u002e2\u0066\u0020\u0025\u0031\u0073%\u0031\u0073%\u0031\u0073\u0025\u0031\u0073",_fdag .PdfRectangle ,_abeba (_fdag ._cgaag ,"\u004c"),_abeba (_fdag ._fbfa ,"\u0052"),_abeba (_fdag ._ggfc ,"\u0042"),_abeba (_fdag ._adfgd ,"\u0054"));
|
|
|
|
|
};type compositeCell struct{_bc .PdfRectangle ;paraList ;};func _bdgbe (_fgdb *PageText )error {_edafed :=_fe .GetLicenseKey ();if _edafed !=nil &&_edafed .IsLicensed ()||_ad {return nil ;};_ae .Printf ("\u0055\u006e\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0064\u0020c\u006f\u0070\u0079\u0020\u006f\u0066\u0020\u0055\u006e\u0069P\u0044\u0046\u000a");
|
|
|
|
|
_ae .Println ("-\u0020\u0047\u0065\u0074\u0020\u0061\u0020\u0066\u0072e\u0065\u0020\u0074\u0072\u0069\u0061\u006c l\u0069\u0063\u0065\u006es\u0065\u0020\u006f\u006e\u0020\u0068\u0074\u0074\u0070s:\u002f\u002fu\u006e\u0069\u0064\u006f\u0063\u002e\u0069\u006f");
|
|
|
|
|
return _dd .New ("\u0075\u006e\u0069\u0070d\u0066\u0020\u006c\u0069\u0063\u0065\u006e\u0073\u0065\u0020c\u006fd\u0065\u0020\u0072\u0065\u0071\u0075\u0069r\u0065\u0064");};func (_cfef *textTable )growTable (){_bcagg :=func (_fceab paraList ){_cfef ._bfbba ++;
|
|
|
|
|
for _cdede :=0;_cdede < _cfef ._cacec ;_cdede ++{_dgdf :=_fceab [_cdede ];_cfef .put (_cdede ,_cfef ._bfbba -1,_dgdf );};};_abgee :=func (_edaa paraList ){_cfef ._cacec ++;for _fbcg :=0;_fbcg < _cfef ._bfbba ;_fbcg ++{_aaead :=_edaa [_fbcg ];_cfef .put (_cfef ._cacec -1,_fbcg ,_aaead );
|
|
|
|
|
};};if _egeg {_cfef .log ("\u0067r\u006f\u0077\u0054\u0061\u0062\u006ce");};for _cdace :=0;;_cdace ++{_eedab :=false ;_ebdg :=_cfef .getDown ();_babc :=_cfef .getRight ();if _egeg {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cdace ,_cfef );
|
|
|
|
|
_ae .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0020\u0064\u006f\u0077\u006e\u003d\u0025\u0073\u000a",_ebdg );_ae .Printf ("\u0020\u0020 \u0020\u0020\u0020 \u0072\u0069\u0067\u0068\u0074\u003d\u0025\u0073\u000a",_babc );};if _ebdg !=nil &&_babc !=nil {_bbccb :=_ebdg [len (_ebdg )-1];
|
|
|
|
|
if _bbccb !=nil &&!_bbccb ._abgd &&_bbccb ==_babc [len (_babc )-1]{_bcagg (_ebdg );if _babc =_cfef .getRight ();_babc !=nil {_abgee (_babc );_cfef .put (_cfef ._cacec -1,_cfef ._bfbba -1,_bbccb );};_eedab =true ;};};if !_eedab &&_ebdg !=nil {_bcagg (_ebdg );
|
|
|
|
|
_eedab =true ;};if !_eedab &&_babc !=nil {_abgee (_babc );_eedab =true ;};if !_eedab {break ;};};};func (_daeb *wordBag )text ()string {_dfge :=_daeb .allWords ();_cggb :=make ([]string ,len (_dfge ));for _bafe ,_bgf :=range _dfge {_cggb [_bafe ]=_bgf ._dgcdg ;
|
|
|
|
|
};return _a .Join (_cggb ,"\u0020");};func _gdaec (_edfd string )bool {if _da .RuneCountInString (_edfd )< _gdcd {return false ;};_cfg ,_effe :=_da .DecodeLastRuneInString (_edfd );if _effe <=0||!_c .Is (_c .Hyphen ,_cfg ){return false ;};_cfg ,_effe =_da .DecodeLastRuneInString (_edfd [:len (_edfd )-_effe ]);
|
|
|
|
|
return _effe > 0&&!_c .IsSpace (_cfg );};func (_fba *textLine )text ()string {var _cfab []string ;for _ ,_bcfcg :=range _fba ._dee {if _bcfcg ._cfcd {_cfab =append (_cfab ,"\u0020");};_cfab =append (_cfab ,_bcfcg ._dgcdg );};return _a .Join (_cfab ,"");
|
|
|
|
|
};func (_gfeb *wordBag )getDepthIdx (_bbbg float64 )int {_cbe :=_gfeb .depthIndexes ();_gagd :=_faaa (_bbbg );if _gagd < _cbe [0]{return _cbe [0];};if _gagd > _cbe [len (_cbe )-1]{return _cbe [len (_cbe )-1];};return _gagd ;};func _cdea (_aged ,_aegb *textPara )bool {if _aged ._fbed ||_aegb ._fbed {return true ;
|
|
|
|
|
};return _gfbdg (_aged .depth ()-_aegb .depth ());};func _cfcf (_feff *wordBag ,_cgfc *textWord ,_dgdb float64 )bool {return _feff .Urx <=_cgfc .Llx &&_cgfc .Llx < _feff .Urx +_dgdb ;};func (_ege *textObject )setCharSpacing (_eedd float64 ){if _ege ==nil {return ;
|
|
|
|
|
};_ege ._efb ._aba =_eedd ;if _aagd {_gd .Log .Info ("\u0073\u0065t\u0043\u0068\u0061\u0072\u0053\u0070\u0061\u0063\u0069\u006e\u0067\u003a\u0020\u0025\u002e\u0032\u0066\u0020\u0073\u0074\u0061\u0074e=\u0025\u0073",_eedd ,_ege ._efb .String ());};};func _cafc (_ccd ,_ceag _bc .PdfRectangle )(_bc .PdfRectangle ,bool ){if !_bafa (_ccd ,_ceag ){return _bc .PdfRectangle {},false ;
|
|
|
|
|
};return _bc .PdfRectangle {Llx :_f .Max (_ccd .Llx ,_ceag .Llx ),Urx :_f .Min (_ccd .Urx ,_ceag .Urx ),Lly :_f .Max (_ccd .Lly ,_ceag .Lly ),Ury :_f .Min (_ccd .Ury ,_ceag .Ury )},true ;};
|
2021-04-23 20:28:14 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a description of `v`.
|
|
|
|
|
func (_fagg *ruling )String ()string {if _fagg ._beb ==_dgcb {return "\u004e\u004f\u0054\u0020\u0052\u0055\u004c\u0049\u004e\u0047";};_egege ,_bbba :="\u0078","\u0079";if _fagg ._beb ==_dffd {_egege ,_bbba ="\u0079","\u0078";};_eeffa :="";if _fagg ._cgdca !=0.0{_eeffa =_ae .Sprintf (" \u0077\u0069\u0064\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_fagg ._cgdca );
|
|
|
|
|
};return _ae .Sprintf ("\u0025\u00310\u0073\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0020\u0025\u0076\u0025\u0073",_fagg ._beb ,_egege ,_fagg ._aadb ,_bbba ,_fagg ._eacb ,_fagg ._cbf ,_fagg ._cbf -_fagg ._eacb ,_fagg ._fafbe ,_fagg .Color ,_eeffa );
|
|
|
|
|
};func _aeec (_bcb float64 ,_aecg int )int {if _aecg ==0{_aecg =1;};_bgdd :=float64 (_aecg );return int (_f .Round (_bcb /_bgdd )*_bgdd );};func (_afege *textTable )getDown ()paraList {_bcbba :=make (paraList ,_afege ._cacec );for _ggbec :=0;_ggbec < _afege ._cacec ;
|
|
|
|
|
_ggbec ++{_fceaa :=_afege .get (_ggbec ,_afege ._bfbba -1)._eebd ;if _fceaa ==nil ||_fceaa ._abgd {return nil ;};_bcbba [_ggbec ]=_fceaa ;};for _agda :=0;_agda < _afege ._cacec -1;_agda ++{if _bcbba [_agda ]._fefc !=_bcbba [_agda +1]{return nil ;};};return _bcbba ;
|
|
|
|
|
};type cachedImage struct{_ef *_bc .Image ;_ga _bc .PdfColorspace ;};func _abgc (_cgc _bc .PdfRectangle )textState {return textState {_aeea :100,_adbd :RenderModeFill ,_ddaa :_cgc };};type textState struct{_aba float64 ;_dccc float64 ;_aeea float64 ;_fgdf float64 ;
|
|
|
|
|
_abfe float64 ;_adbd RenderMode ;_cea float64 ;_afc *_bc .PdfFont ;_ddaa _bc .PdfRectangle ;_cdd int ;_ceab int ;};func (_adfe rulingList )isActualGrid ()(rulingList ,bool ){_gfgc ,_fbgf :=_adfe .augmentGrid ();if !(len (_gfgc )>=_gfcc +1&&len (_fbgf )>=_abcc +1){if _acea {_gd .Log .Info ("\u0069s\u0041\u0063t\u0075\u0061\u006c\u0047r\u0069\u0064\u003a \u004e\u006f\u0074\u0020\u0061\u006c\u0069\u0067\u006eed\u002e\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u003c\u0020\u0025d\u0020\u0078 \u0025\u0064",len (_gfgc ),len (_fbgf ),_gfcc +1,_abcc +1);
|
|
|
|
|
};return nil ,false ;};if _acea {_gd .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075a\u006c\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0073\u0020\u003a\u0020\u0025t\u0020\u0026\u0020\u0025\u0074\u0020\u2192 \u0025\u0074",_adfe ,len (_gfgc )>=2,len (_fbgf )>=2,len (_gfgc )>=2&&len (_fbgf )>=2);
|
|
|
|
|
for _acdfbf ,_fcab :=range _adfe {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0076\u000a",_acdfbf ,_fcab );};};if _agcb {_fbfg ,_fga :=_gfgc [0],_gfgc [len (_gfgc )-1];_fgbg ,_bacbc :=_fbgf [0],_fbgf [len (_fbgf )-1];if !(_efgd (_fbfg ._aadb -_fgbg ._eacb )&&_efgd (_fga ._aadb -_fgbg ._cbf )&&_efgd (_fgbg ._aadb -_fbfg ._cbf )&&_efgd (_bacbc ._aadb -_fbfg ._eacb )){if _acea {_gd .Log .Info ("\u0069\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069d\u003a\u0020\u0020N\u006f\u0074 \u0061\u006c\u0069\u0067\u006e\u0065d\u002e\n\t\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0076\u0031\u003d\u0025\u0073\u000a\u0009\u0068\u0030\u003d\u0025\u0073\u000a\u0009\u0068\u0031\u003d\u0025\u0073",_fbfg ,_fga ,_fgbg ,_bacbc );
|
|
|
|
|
};return nil ,false ;};}else {if !_gfgc .aligned (){if _gbfc {_gd .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0076\u0065\u0072\u0074\u0073\u002e\u0020%\u0064",len (_gfgc ));
|
|
|
|
|
};return nil ,false ;};if !_fbgf .aligned (){if _acea {_gd .Log .Info ("i\u0073\u0041\u0063\u0074\u0075\u0061l\u0047\u0072\u0069\u0064\u003a\u0020N\u006f\u0074\u0020\u0061\u006c\u0069\u0067n\u0065\u0064\u0020\u0068\u006f\u0072\u007a\u0073\u002e\u0020%\u0064",len (_fbgf ));
|
|
|
|
|
};return nil ,false ;};};_eggaa :=append (_gfgc ,_fbgf ...);return _eggaa ,true ;};func (_ebg *textObject )showText (_ccb []byte )error {return _ebg .renderText (_ccb )};func (_fbd *stateStack )size ()int {return len (*_fbd )};func (_cada paraList )findTables (_eedca []gridTiling )[]*textTable {_cada .addNeighbours ();
|
|
|
|
|
_b .Slice (_cada ,func (_gfgcf ,_ggbcf int )bool {return _ggec (_cada [_gfgcf ],_cada [_ggbcf ])< 0});var _cgbde []*textTable ;if _addf {_ggbd :=_cada .findGridTables (_eedca );_cgbde =append (_cgbde ,_ggbd ...);};if _decc {_cdabe :=_cada .findTextTables ();
|
|
|
|
|
_cgbde =append (_cgbde ,_cdabe ...);};return _cgbde ;};func _gcac (_ceba _ff .Point )_ff .Matrix {return _ff .TranslationMatrix (_ceba .X ,_ceba .Y )};func (_facg paraList )sortReadingOrder (){_gd .Log .Trace ("\u0073\u006fr\u0074\u0052\u0065\u0061\u0064i\u006e\u0067\u004f\u0072\u0064e\u0072\u003a\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_facg ));
|
|
|
|
|
if len (_facg )<=1{return ;};_facg .computeEBBoxes ();_b .Slice (_facg ,func (_bge ,_aecgd int )bool {return _dfac (_facg [_bge ],_facg [_aecgd ])<=0});_adeg :=_facg .topoOrder ();_facg .reorder (_adeg );};
|
2021-01-07 14:20:10 +00:00
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct{
|
2020-11-23 22:15:56 +00:00
|
|
|
|
|
2020-12-06 13:03:03 +00:00
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
2020-11-23 22:15:56 +00:00
|
|
|
|
|
2021-01-07 14:20:10 +00:00
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string ;
|
2020-11-23 22:15:56 +00:00
|
|
|
|
|
2021-01-07 14:20:10 +00:00
|
|
|
|
// BBox is the bounding box of the text.
|
2021-05-11 00:01:27 +00:00
|
|
|
|
BBox _bc .PdfRectangle ;
|
2021-01-07 14:20:10 +00:00
|
|
|
|
|
|
|
|
|
// Font is the font the text was drawn with.
|
2021-05-11 00:01:27 +00:00
|
|
|
|
Font *_bc .PdfFont ;
|
2021-01-07 14:20:10 +00:00
|
|
|
|
|
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64 ;
|
|
|
|
|
|
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int ;
|
|
|
|
|
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool ;
|
|
|
|
|
|
|
|
|
|
// FillColor is the fill color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2021-05-11 00:01:27 +00:00
|
|
|
|
FillColor _fd .Color ;
|
2021-01-07 14:20:10 +00:00
|
|
|
|
|
|
|
|
|
// StrokeColor is the stroke color of the text.
|
|
|
|
|
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
|
2021-05-11 00:01:27 +00:00
|
|
|
|
StrokeColor _fd .Color ;
|
2021-01-07 14:20:10 +00:00
|
|
|
|
|
|
|
|
|
// Orientation is the text orientation
|
2021-05-11 00:01:27 +00:00
|
|
|
|
Orientation int ;};func (_gada *wordBag )firstReadingIndex (_eccb int )int {_gdaed :=_gada .firstWord (_eccb )._edega ;_eeff :=float64 (_eccb +1)*_gbeg ;_afdfa :=_eeff +_fbfe *_gdaed ;_fedeb :=_eccb ;for _ ,_ddbc :=range _gada .depthBand (_eeff ,_afdfa ){if _gfdd (_gada .firstWord (_ddbc ),_gada .firstWord (_fedeb ))< 0{_fedeb =_ddbc ;
|
|
|
|
|
};};return _fedeb ;};func (_egb *subpath )add (_gbeb ..._ff .Point ){_egb ._cfac =append (_egb ._cfac ,_gbeb ...)};func (_dcccc *wordBag )depthBand (_fcee ,_efag float64 )[]int {if len (_dcccc ._aaaf )==0{return nil ;};return _dcccc .depthRange (_dcccc .getDepthIdx (_fcee ),_dcccc .getDepthIdx (_efag ));
|
|
|
|
|
};func (_dgea *textTable )depth ()float64 {_edcf :=1e10;for _fcgeb :=0;_fcgeb < _dgea ._cacec ;_fcgeb ++{_eeeac :=_dgea .get (_fcgeb ,0);if _eeeac ==nil ||_eeeac ._fbed {continue ;};_edcf =_f .Min (_edcf ,_eeeac .depth ());};return _edcf ;};func _fbf (_affe *wordBag ,_bgbc *textWord ,_gcgf float64 )bool {return _bgbc .Llx < _affe .Urx +_gcgf &&_affe .Llx -_gcgf < _bgbc .Urx ;
|
|
|
|
|
};const _daga =20;func (_dfgfc paraList )eventNeighbours (_cgeca []event )map[*textPara ][]int {_b .Slice (_cgeca ,func (_ggfba ,_bgdf int )bool {_affeg ,_gcaaf :=_cgeca [_ggfba ],_cgeca [_bgdf ];_geecc ,_fefd :=_affeg ._agff ,_gcaaf ._agff ;if _geecc !=_fefd {return _geecc < _fefd ;
|
|
|
|
|
};if _affeg ._bbdd !=_gcaaf ._bbdd {return _affeg ._bbdd ;};return _ggfba < _bgdf ;});_adbe :=make (map[int ]intSet );_eedgb :=make (intSet );for _ ,_fdfga :=range _cgeca {if _fdfga ._bbdd {_adbe [_fdfga ._fbdcc ]=make (intSet );for _aaafe :=range _eedgb {if _aaafe !=_fdfga ._fbdcc {_adbe [_fdfga ._fbdcc ].add (_aaafe );
|
|
|
|
|
_adbe [_aaafe ].add (_fdfga ._fbdcc );};};_eedgb .add (_fdfga ._fbdcc );}else {_eedgb .del (_fdfga ._fbdcc );};};_cfcg :=map[*textPara ][]int {};for _afeaf ,_agggf :=range _adbe {_fdcfg :=_dfgfc [_afeaf ];if len (_agggf )==0{_cfcg [_fdcfg ]=nil ;continue ;
|
|
|
|
|
};_fdfdd :=make ([]int ,len (_agggf ));_bdbab :=0;for _bfeff :=range _agggf {_fdfdd [_bdbab ]=_bfeff ;_bdbab ++;};_cfcg [_fdcfg ]=_fdfdd ;};return _cfcg ;};func (_bga *textObject )moveText (_gdg ,_fdgd float64 ){_bga .moveLP (_gdg ,_fdgd )};func (_gdd *wordBag )maxDepth ()float64 {return _gdd ._agadg -_gdd .Lly };
|
|
|
|
|
func (_acbg *shapesState )moveTo (_dggb ,_geg float64 ){_acbg ._bbfc =true ;_acbg ._egd =_acbg .devicePoint (_dggb ,_geg );if _egfa {_gd .Log .Info ("\u006d\u006fv\u0065\u0054\u006f\u003a\u0020\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0064\u0065\u0076\u0069\u0063\u0065\u003d%.\u0032\u0066",_dggb ,_geg ,_acbg ._egd );
|
|
|
|
|
};};
|
|
|
|
|
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
|
|
|
|
type TextMarkArray struct{_fad []TextMark };func (_bgbg *textTable )compositeRowCorridors ()map[int ][]float64 {_eefa :=make (map[int ][]float64 ,_bgbg ._bfbba );if _gcdc {_gd .Log .Info ("c\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0052\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073:\u0020h\u003d\u0025\u0064",_bgbg ._bfbba );
|
|
|
|
|
};for _cgag :=1;_cgag < _bgbg ._bfbba ;_cgag ++{var _bfef []compositeCell ;for _ffba :=0;_ffba < _bgbg ._cacec ;_ffba ++{if _cafd ,_fcfc :=_bgbg ._ffgge [_daceb (_ffba ,_cgag )];_fcfc {_bfef =append (_bfef ,_cafd );};};if len (_bfef )==0{continue ;};_adef :=_aaacd (_bfef );
|
|
|
|
|
_eefa [_cgag ]=_adef ;if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u0025\u0032\u0064\u003a\u0020\u00256\u002e\u0032\u0066\u000a",_cgag ,_adef );};};return _eefa ;};func _ddfbc (_bfgd []int )[]int {_dcbe :=make ([]int ,len (_bfgd ));for _ecge ,_dbgf :=range _bfgd {_dcbe [len (_bfgd )-1-_ecge ]=_dbgf ;
|
|
|
|
|
};return _dcbe ;};func (_ffgfc *textTable )put (_dabb ,_dfdb int ,_feca *textPara ){_ffgfc ._egfdd [_daceb (_dabb ,_dfdb )]=_feca ;};func (_ffad *subpath )last ()_ff .Point {return _ffad ._cfac [len (_ffad ._cfac )-1]};
|
|
|
|
|
|
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
func (_gdaea *TextMarkArray )Len ()int {if _gdaea ==nil {return 0;};return len (_gdaea ._fad );};type ruling struct{_beb rulingKind ;_fafbe markKind ;_fd .Color ;_aadb float64 ;_eacb float64 ;_cbf float64 ;_cgdca float64 ;};func (_acdfb *wordBag )allWords ()[]*textWord {var _aggb []*textWord ;
|
|
|
|
|
for _ ,_dacf :=range _acdfb ._aaaf {_aggb =append (_aggb ,_dacf ...);};return _aggb ;};func (_afga rulingList )snapToGroups ()rulingList {_ddefd ,_ggbbc :=_afga .vertsHorzs ();if len (_ddefd )> 0{_ddefd =_ddefd .snapToGroupsDirection ();};if len (_ggbbc )> 0{_ggbbc =_ggbbc .snapToGroupsDirection ();
|
|
|
|
|
};_efdcd :=append (_ddefd ,_ggbbc ...);_efdcd .log ("\u0073\u006e\u0061p\u0054\u006f\u0047\u0072\u006f\u0075\u0070\u0073");return _efdcd ;};func (_ccceb *textTable )putComposite (_degfa ,_gfad int ,_gffc paraList ,_dgdc _bc .PdfRectangle ){if len (_gffc )==0{_gd .Log .Error ("\u0074\u0065xt\u0054\u0061\u0062l\u0065\u0029\u0020\u0070utC\u006fmp\u006f\u0073\u0069\u0074\u0065\u003a\u0020em\u0070\u0074\u0079\u0020\u0070\u0061\u0072a\u0073");
|
|
|
|
|
return ;};_bbgg :=compositeCell {_dgdc ,_gffc };if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0070\u0075\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u003c\u002d\u0025\u0073\u000a",_degfa ,_gfad ,_bbgg .String ());
|
|
|
|
|
};_bbgg .updateBBox ();_ccceb ._ffgge [_daceb (_degfa ,_gfad )]=_bbgg ;};func _cdgg (_dfdc bounded )float64 {return -_dfdc .bbox ().Lly };func (_eabde *textTable )emptyColumn (_ddagc int )bool {for _eede :=0;_eede < _eabde ._bfbba ;_eede ++{_cdbca :=_eabde .get (_ddagc ,_eede );
|
|
|
|
|
if _cdbca !=nil &&_cdbca .text ()!=""{return false ;};};return true ;};func (_faad *textObject )newTextMark (_cdac string ,_degf _ff .Matrix ,_aeag _ff .Point ,_afefe float64 ,_fbdb *_bc .PdfFont ,_deab float64 ,_ggf ,_ebaa _fd .Color )(textMark ,bool ){_dfgff :=_degf .Angle ();
|
|
|
|
|
_egac :=_aeec (_dfgff ,_gaga );var _bead float64 ;if _egac %180!=90{_bead =_degf .ScalingFactorY ();}else {_bead =_degf .ScalingFactorX ();};_gcef :=_adba (_degf );_dce :=_bc .PdfRectangle {Llx :_gcef .X ,Lly :_gcef .Y ,Urx :_aeag .X ,Ury :_aeag .Y };switch _egac %360{case 90:_dce .Urx -=_bead ;
|
|
|
|
|
case 180:_dce .Ury -=_bead ;case 270:_dce .Urx +=_bead ;case 0:_dce .Ury +=_bead ;default:_egac =0;_dce .Ury +=_bead ;};if _dce .Llx > _dce .Urx {_dce .Llx ,_dce .Urx =_dce .Urx ,_dce .Llx ;};if _dce .Lly > _dce .Ury {_dce .Lly ,_dce .Ury =_dce .Ury ,_dce .Lly ;
|
|
|
|
|
};_edag ,_fdcf :=_cafc (_dce ,_faad ._aac ._dag );if !_fdcf {_gd .Log .Debug ("\u0054\u0065\u0078\u0074\u0020m\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069\u0064\u0065\u0020\u0070a\u0067\u0065\u002e\u0020\u0062\u0062\u006f\u0078\u003d\u0025\u0067\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u003d\u0025\u0067\u0020\u0074\u0065\u0078\u0074\u003d\u0025q",_dce ,_faad ._aac ._dag ,_cdac );
|
|
|
|
|
};_dce =_edag ;_daef :=_dce ;_cca :=_faad ._aac ._dag ;switch _egac %360{case 90:_cca .Urx ,_cca .Ury =_cca .Ury ,_cca .Urx ;_daef =_bc .PdfRectangle {Llx :_cca .Urx -_dce .Ury ,Urx :_cca .Urx -_dce .Lly ,Lly :_dce .Llx ,Ury :_dce .Urx };case 180:_daef =_bc .PdfRectangle {Llx :_cca .Urx -_dce .Llx ,Urx :_cca .Urx -_dce .Urx ,Lly :_cca .Ury -_dce .Lly ,Ury :_cca .Ury -_dce .Ury };
|
|
|
|
|
case 270:_cca .Urx ,_cca .Ury =_cca .Ury ,_cca .Urx ;_daef =_bc .PdfRectangle {Llx :_dce .Ury ,Urx :_dce .Lly ,Lly :_cca .Ury -_dce .Llx ,Ury :_cca .Ury -_dce .Urx };};if _daef .Llx > _daef .Urx {_daef .Llx ,_daef .Urx =_daef .Urx ,_daef .Llx ;};if _daef .Lly > _daef .Ury {_daef .Lly ,_daef .Ury =_daef .Ury ,_daef .Lly ;
|
|
|
|
|
};_cgbe :=textMark {_ecfa :_cdac ,PdfRectangle :_daef ,_fgfd :_dce ,_aacba :_fbdb ,_ebgc :_bead ,_afged :_deab ,_fcgdb :_degf ,_aafb :_aeag ,_ggbe :_egac ,_dcggg :_ggf ,_aggg :_ebaa };if _dbbf {_gd .Log .Info ("n\u0065\u0077\u0054\u0065\u0078\u0074M\u0061\u0072\u006b\u003a\u0020\u0073t\u0061\u0072\u0074\u003d\u0025\u002e\u0032f\u0020\u0065\u006e\u0064\u003d\u0025\u002e\u0032\u0066\u0020%\u0073",_gcef ,_aeag ,_cgbe .String ());
|
|
|
|
|
};return _cgbe ,_fdcf ;};func (_edd *shapesState )stroke (_efgc *[]pathSection ){_ffa :=pathSection {_bbd :_edd ._cbdd ,Color :_edd ._eeg .getStrokeColor ()};*_efgc =append (*_efgc ,_ffa );if _acea {_ae .Printf ("\u0020 \u0020\u0020S\u0054\u0052\u004fK\u0045\u003a\u0020\u0025\u0064\u0020\u0073t\u0072\u006f\u006b\u0065\u0073\u0020s\u0073\u003d\u0025\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d%\u002b\u0076\u0020\u0025\u0036\u002e\u0032\u0066\u000a",len (*_efgc ),_edd ,_edd ._eeg .getStrokeColor (),_ffa .bbox ());
|
|
|
|
|
if _dfe {for _dfgb ,_cbbf :=range _edd ._cbdd {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_dfgb ,_cbbf );if _dfgb ==10{break ;};};};};};func (_bbbb *wordBag )absorb (_cccb *wordBag ){_cgae :=_cccb .makeRemovals ();for _gacge ,_ddef :=range _cccb ._aaaf {for _ ,_gdccd :=range _ddef {_bbbb .pullWord (_gdccd ,_gacge ,_cgae );
|
|
|
|
|
};};_cccb .applyRemovals (_cgae );};func (_ggddd *wordBag )makeRemovals ()map[int ]map[*textWord ]struct{}{_gabg :=make (map[int ]map[*textWord ]struct{},len (_ggddd ._aaaf ));for _cdda :=range _ggddd ._aaaf {_gabg [_cdda ]=make (map[*textWord ]struct{});
|
|
|
|
|
};return _gabg ;};func (_befd *textObject )setHorizScaling (_bbgd float64 ){if _befd ==nil {return ;};_befd ._efb ._aeea =_bbgd ;};var _afcd =map[markKind ]string {_bfgf :"\u0073\u0074\u0072\u006f\u006b\u0065",_cfgb :"\u0066\u0069\u006c\u006c",_ggfd :"\u0061u\u0067\u006d\u0065\u006e\u0074"};
|
|
|
|
|
func _bfad (_cafg *_de .ContentStreamOperation )(float64 ,error ){if len (_cafg .Params )!=1{_gaeg :=_dd .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_cafg .Operand ,1,len (_cafg .Params ),_cafg .Params );
|
|
|
|
|
return 0.0,_gaeg ;};return _eb .GetNumberAsFloat (_cafg .Params [0]);};func _cfagb (_afgea []*textMark ,_efaf _bc .PdfRectangle )[]*textWord {var _ccfd []*textWord ;var _agea *textWord ;if _dbbf {_gd .Log .Info ("\u006d\u0061\u006beT\u0065\u0078\u0074\u0057\u006f\u0072\u0064\u0073\u003a\u0020\u0025\u0064\u0020\u006d\u0061\u0072\u006b\u0073",len (_afgea ));
|
|
|
|
|
};_acaea :=func (){if _agea !=nil {_gbbec :=_agea .computeText ();if !_eafg (_gbbec ){_agea ._dgcdg =_gbbec ;_ccfd =append (_ccfd ,_agea );if _dbbf {_gd .Log .Info ("\u0061\u0064\u0064Ne\u0077\u0057\u006f\u0072\u0064\u003a\u0020\u0025\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",len (_ccfd )-1,_agea .String ());
|
|
|
|
|
for _fceg ,_fgdae :=range _agea ._eeefd {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fceg ,_fgdae .String ());};};};_agea =nil ;};};for _ ,_cbgdf :=range _afgea {if _edaf &&_agea !=nil &&len (_agea ._eeefd )> 0{_bcce :=_agea ._eeefd [len (_agea ._eeefd )-1];
|
|
|
|
|
_bacbd ,_ggaf :=_ffaeg (_cbgdf ._ecfa );_efdb ,_edgg :=_ffaeg (_bcce ._ecfa );if _ggaf &&!_edgg &&_bcce .inDiacriticArea (_cbgdf ){_agea .addDiacritic (_bacbd );continue ;};if _edgg &&!_ggaf &&_cbgdf .inDiacriticArea (_bcce ){_agea ._eeefd =_agea ._eeefd [:len (_agea ._eeefd )-1];
|
|
|
|
|
_agea .appendMark (_cbgdf ,_efaf );_agea .addDiacritic (_efdb );continue ;};};_deff :=_eafg (_cbgdf ._ecfa );if _deff {_acaea ();continue ;};if _agea ==nil &&!_deff {_agea =_cbfaa ([]*textMark {_cbgdf },_efaf );continue ;};_fcfdc :=_agea ._edega ;_adbad :=_f .Abs (_gebcc (_efaf ,_cbgdf )-_agea ._gdce )/_fcfdc ;
|
|
|
|
|
_geab :=_eccg (_cbgdf ,_agea )/_fcfdc ;if _geab >=_dfgfb ||!(-_fgea <=_geab &&_adbad <=_cfbg ){_acaea ();_agea =_cbfaa ([]*textMark {_cbgdf },_efaf );continue ;};_agea .appendMark (_cbgdf ,_efaf );};_acaea ();return _ccfd ;};func _gbbeg (_dgcf []_eb .PdfObject )(_fceca ,_fgafc float64 ,_badee error ){if len (_dgcf )!=2{return 0,0,_ae .Errorf ("\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u006e\u0075\u006d\u0062\u0065\u0072\u0020o\u0066 \u0070\u0061\u0072\u0061\u006d\u0073\u003a \u0025\u0064",len (_dgcf ));
|
|
|
|
|
};_cacga ,_badee :=_eb .GetNumbersAsFloat (_dgcf );if _badee !=nil {return 0,0,_badee ;};return _cacga [0],_cacga [1],nil ;};func (_acf *wordBag )depthRange (_fcbdc ,_bdbdf int )[]int {var _bdbf []int ;for _gee :=range _acf ._aaaf {if _fcbdc <=_gee &&_gee <=_bdbdf {_bdbf =append (_bdbf ,_gee );
|
|
|
|
|
};};if len (_bdbf )==0{return nil ;};_b .Ints (_bdbf );return _bdbf ;};func (_gdbd paraList )computeEBBoxes (){if _cbge {_gd .Log .Info ("\u0063o\u006dp\u0075\u0074\u0065\u0045\u0042\u0042\u006f\u0078\u0065\u0073\u003a");};for _ ,_dbdfd :=range _gdbd {_dbdfd ._bcgb =_dbdfd .PdfRectangle ;
|
|
|
|
|
};_eefb :=_gdbd .yNeighbours (0);for _dfgfg ,_ecbc :=range _gdbd {_fbbg :=_ecbc ._bcgb ;_eac ,_baaa :=-1.0e9,+1.0e9;for _ ,_afaf :=range _eefb [_ecbc ]{_fdff :=_gdbd [_afaf ]._bcgb ;if _fdff .Urx < _fbbg .Llx {_eac =_f .Max (_eac ,_fdff .Urx );}else if _fbbg .Urx < _fdff .Llx {_baaa =_f .Min (_baaa ,_fdff .Llx );
|
|
|
|
|
};};for _eeab ,_bgbe :=range _gdbd {_bcbf :=_bgbe ._bcgb ;if _dfgfg ==_eeab ||_bcbf .Ury > _fbbg .Lly {continue ;};if _eac <=_bcbf .Llx &&_bcbf .Llx < _fbbg .Llx {_fbbg .Llx =_bcbf .Llx ;}else if _bcbf .Urx <=_baaa &&_fbbg .Urx < _bcbf .Urx {_fbbg .Urx =_bcbf .Urx ;
|
|
|
|
|
};};if _cbge {_ae .Printf ("\u0025\u0034\u0064\u003a %\u0036\u002e\u0032\u0066\u2192\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0071\u000a",_dfgfg ,_ecbc ._bcgb ,_fbbg ,_eaed (_ecbc .text (),50));};_ecbc ._bcgb =_fbbg ;};if _gdfd {for _ ,_fdfd :=range _gdbd {_fdfd .PdfRectangle =_fdfd ._bcgb ;
|
|
|
|
|
};};};func (_geef paraList )llyOrdering ()[]int {_dccf :=make ([]int ,len (_geef ));for _egge :=range _geef {_dccf [_egge ]=_egge ;};_b .SliceStable (_dccf ,func (_bcfef ,_fafb int )bool {_efccd ,_ebea :=_dccf [_bcfef ],_dccf [_fafb ];return _geef [_efccd ].Lly < _geef [_ebea ].Lly ;
|
|
|
|
|
});return _dccf ;};func _aada (_ebeg []TextMark ,_gfdc *int ,_fdf string )[]TextMark {_edae :=_eebg ;_edae .Text =_fdf ;return _gaaae (_ebeg ,_gfdc ,_edae );};func _abfba (_dfd _ff .Point )*subpath {return &subpath {_cfac :[]_ff .Point {_dfd }}};func (_bda *textLine )endsInHyphen ()bool {_abd :=_bda ._dee [len (_bda ._dee )-1];
|
|
|
|
|
_aabd :=_abd ._dgcdg ;_bfeg ,_gabe :=_da .DecodeLastRuneInString (_aabd );if _gabe <=0||!_c .Is (_c .Hyphen ,_bfeg ){return false ;};if _abd ._cfcd &&_gdaec (_aabd ){return true ;};return _gdaec (_bda .text ());};func (_cagaf rulingList )snapToGroupsDirection ()rulingList {_cagaf .sortStrict ();
|
|
|
|
|
_gbgf :=make (map[*ruling ]rulingList ,len (_cagaf ));_ffea :=_cagaf [0];_deeg :=func (_abbb *ruling ){_ffea =_abbb ;_gbgf [_ffea ]=rulingList {_abbb }};_deeg (_cagaf [0]);for _ ,_dcaf :=range _cagaf [1:]{if _dcaf ._aadb < _ffea ._aadb -_bdcdb {_gd .Log .Error ("\u0073\u006e\u0061\u0070T\u006f\u0047\u0072\u006f\u0075\u0070\u0073\u0044\u0069r\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0057\u0072\u006f\u006e\u0067\u0020\u0070\u0072\u0069\u006da\u0072\u0079\u0020\u006f\u0072d\u0065\u0072\u002e\u000a\u0009\u0076\u0030\u003d\u0025\u0073\u000a\u0009\u0020\u0076\u003d\u0025\u0073",_ffea ,_dcaf );
|
|
|
|
|
};if _dcaf ._aadb > _ffea ._aadb +_gdff {_deeg (_dcaf );}else {_gbgf [_ffea ]=append (_gbgf [_ffea ],_dcaf );};};_dffg :=make (map[*ruling ]float64 ,len (_gbgf ));_gggcg :=make (map[*ruling ]*ruling ,len (_cagaf ));for _dbdd ,_geec :=range _gbgf {_dffg [_dbdd ]=_geec .mergePrimary ();
|
|
|
|
|
for _ ,_gccd :=range _geec {_gggcg [_gccd ]=_dbdd ;};};for _ ,_cbbcg :=range _cagaf {_cbbcg ._aadb =_dffg [_gggcg [_cbbcg ]];};_aacbc :=make (rulingList ,0,len (_cagaf ));for _ ,_aacc :=range _gbgf {_ggdf :=_aacc .splitSec ();for _dfcb ,_gdfdg :=range _ggdf {_dbee :=_gdfdg .merge ();
|
|
|
|
|
if len (_aacbc )> 0{_fccdf :=_aacbc [len (_aacbc )-1];if _fccdf .alignsPrimary (_dbee )&&_fccdf .alignsSec (_dbee ){_gd .Log .Error ("\u0073\u006e\u0061\u0070\u0054\u006fG\u0072\u006f\u0075\u0070\u0073\u0044\u0069\u0072\u0065\u0063\u0074\u0069\u006f\u006e\u003a\u0020\u0044\u0075\u0070\u006ci\u0063\u0061\u0074\u0065\u0020\u0069\u003d\u0025\u0064\u000a\u0009\u0077\u003d\u0025s\u000a\t\u0076\u003d\u0025\u0073",_dfcb ,_fccdf ,_dbee );
|
|
|
|
|
continue ;};};_aacbc =append (_aacbc ,_dbee );};};_aacbc .sortStrict ();return _aacbc ;};func (_ccgc *wordBag )removeWord (_dcbbb *textWord ,_cdbc int ){_bacd :=_ccgc ._aaaf [_cdbc ];_bacd =_fbgcg (_bacd ,_dcbbb );if len (_bacd )==0{delete (_ccgc ._aaaf ,_cdbc );
|
|
|
|
|
}else {_ccgc ._aaaf [_cdbc ]=_bacd ;};};func _begea (_aefcd _bc .PdfRectangle ,_eacg ,_efgf ,_fcfa ,_bfaf *ruling )gridTile {_afega :=_aefcd .Llx ;_ddee :=_aefcd .Urx ;_feaf :=_aefcd .Lly ;_efbc :=_aefcd .Ury ;return gridTile {PdfRectangle :_aefcd ,_cgaag :_eacg !=nil &&_eacg .encloses (_feaf ,_efbc ),_fbfa :_efgf !=nil &&_efgf .encloses (_feaf ,_efbc ),_ggfc :_fcfa !=nil &&_fcfa .encloses (_afega ,_ddee ),_adfgd :_bfaf !=nil &&_bfaf .encloses (_afega ,_ddee )};
|
|
|
|
|
};func _eaaa (_aceb _bc .PdfRectangle )*ruling {return &ruling {_beb :_dddb ,_aadb :_aceb .Urx ,_eacb :_aceb .Lly ,_cbf :_aceb .Ury };};func (_ddefdd *textTable )isExportable ()bool {if _ddefdd ._dcbbg {return true ;};_ebbg :=func (_dabaf int )bool {_dbdae :=_ddefdd .get (0,_dabaf );
|
|
|
|
|
if _dbdae ==nil {return false ;};_addfc :=_dbdae .text ();_aaac :=_da .RuneCountInString (_addfc );_deaec :=_fddf .MatchString (_addfc );return _aaac <=1||_deaec ;};for _beadc :=0;_beadc < _ddefdd ._bfbba ;_beadc ++{if !_ebbg (_beadc ){return true ;};};
|
|
|
|
|
return false ;};func (_eaeb *textObject )nextLine (){_eaeb .moveLP (0,-_eaeb ._efb ._fgdf )};func (_fgae *textWord )computeText ()string {_dggdf :=make ([]string ,len (_fgae ._eeefd ));for _dbfcg ,_bgfbe :=range _fgae ._eeefd {_dggdf [_dbfcg ]=_bgfbe ._ecfa ;
|
|
|
|
|
};return _a .Join (_dggdf ,"");};const (_cbge =false ;_dbbf =false ;_acge =false ;_fdad =false ;_egfa =false ;_aagd =false ;_afa =false ;_gaca =false ;_gagb =false ;_fcag =_gagb &&true ;_face =_fcag &&false ;_eaebg =_gagb &&true ;_gcdc =false ;_egeg =_gcdc &&false ;
|
|
|
|
|
_cfadb =_gcdc &&true ;_acea =false ;_dfe =_acea &&false ;_gbfc =_acea &&false ;_gcace =_acea &&true ;_dcgde =_acea &&false ;_eagfda =_acea &&false ;);var _ad =false ;
|
|
|
|
|
|
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
|
func New (page *_bc .PdfPage )(*Extractor ,error ){const _cg ="\u0065\u0078\u0074\u0072\u0061\u0063\u0074\u006f\u0072\u002e\u004e\u0065\u0077";_ea ,_cda :=page .GetAllContentStreams ();if _cda !=nil {return nil ,_cda ;};_bb ,_cda :=page .GetMediaBox ();
|
|
|
|
|
if _cda !=nil {return nil ,_ae .Errorf ("\u0065\u0078\u0074r\u0061\u0063\u0074\u006fr\u0020\u0072\u0065\u0071\u0075\u0069\u0072e\u0073\u0020\u006d\u0065\u0064\u0069\u0061\u0042\u006f\u0078\u002e\u0020\u0025\u0076",_cda );};_eaa :=&Extractor {_ec :_ea ,_ffe :page .Resources ,_dag :*_bb ,_ca :map[string ]fontEntry {},_dg :map[string ]textResult {}};
|
|
|
|
|
if _eaa ._dag .Llx > _eaa ._dag .Urx {_gd .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0058\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_eaa ._dag );
|
|
|
|
|
_eaa ._dag .Llx ,_eaa ._dag .Urx =_eaa ._dag .Urx ,_eaa ._dag .Llx ;};if _eaa ._dag .Lly > _eaa ._dag .Ury {_gd .Log .Info ("\u004d\u0065\u0064\u0069\u0061\u0042o\u0078\u0020\u0068\u0061\u0073\u0020\u0059\u0020\u0063\u006f\u006f\u0072\u0064\u0069\u006e\u0061\u0074\u0065\u0073\u0020r\u0065\u0076\u0065\u0072\u0073\u0065\u0064\u002e\u0020\u0025\u002e\u0032\u0066\u0020F\u0069x\u0069\u006e\u0067\u002e",_eaa ._dag );
|
|
|
|
|
_eaa ._dag .Lly ,_eaa ._dag .Ury =_eaa ._dag .Ury ,_eaa ._dag .Lly ;};_fe .TrackUse (_cg );return _eaa ,nil ;};func (_bfgb paraList )lines ()[]*textLine {var _gdef []*textLine ;for _ ,_aegg :=range _bfgb {_gdef =append (_gdef ,_aegg ._bcca ...);};return _gdef ;
|
|
|
|
|
};func _aedac (_aaga _bc .PdfRectangle )*ruling {return &ruling {_beb :_dffd ,_aadb :_aaga .Ury ,_eacb :_aaga .Llx ,_cbf :_aaga .Urx };};func (_gfa *textObject )setTextRenderMode (_ecf int ){if _gfa ==nil {return ;};_gfa ._efb ._adbd =RenderMode (_ecf );
|
|
|
|
|
};func (_gade *textLine )markWordBoundaries (){_dagc :=_ageg *_gade ._cgdd ;for _gcaa ,_aeg :=range _gade ._dee [1:]{if _eccg (_aeg ,_gade ._dee [_gcaa ])>=_dagc {_aeg ._cfcd =true ;};};};func (_fagd *textMark )bbox ()_bc .PdfRectangle {return _fagd .PdfRectangle };
|
|
|
|
|
func _dcea (_cdbfd []*textWord ,_fbdeg int )[]*textWord {_ceeeg :=len (_cdbfd );copy (_cdbfd [_fbdeg :],_cdbfd [_fbdeg +1:]);return _cdbfd [:_ceeeg -1];};type rulingKind int ;func _eafg (_ffcf string )bool {for _ ,_cdcb :=range _ffcf {if !_c .IsSpace (_cdcb ){return false ;
|
|
|
|
|
};};return true ;};func _bdcdg (_daba []TextMark ,_abed *int )[]TextMark {_bccaf :=_daba [len (_daba )-1];_cgfa :=[]rune (_bccaf .Text );if len (_cgfa )==1{_daba =_daba [:len (_daba )-1];_dbde :=_daba [len (_daba )-1];*_abed =_dbde .Offset +len (_dbde .Text );
|
|
|
|
|
}else {_cafab :=_ebfa (_bccaf .Text );*_abed +=len (_cafab )-len (_bccaf .Text );_bccaf .Text =_cafab ;};return _daba ;};func (_bee *imageExtractContext )extractInlineImage (_df *_de .ContentStreamInlineImage ,_fdd _de .GraphicsState ,_gb *_bc .PdfPageResources )error {_ddfb ,_gfec :=_df .ToImage (_gb );
|
|
|
|
|
if _gfec !=nil {return _gfec ;};_bbe ,_gfec :=_df .GetColorSpace (_gb );if _gfec !=nil {return _gfec ;};if _bbe ==nil {_bbe =_bc .NewPdfColorspaceDeviceGray ();};_cdcf ,_gfec :=_bbe .ImageToRGB (*_ddfb );if _gfec !=nil {return _gfec ;};_aa :=ImageMark {Image :&_cdcf ,Width :_fdd .CTM .ScalingFactorX (),Height :_fdd .CTM .ScalingFactorY (),Angle :_fdd .CTM .Angle ()};
|
|
|
|
|
_aa .X ,_aa .Y =_fdd .CTM .Translation ();_bee ._ee =append (_bee ._ee ,_aa );_bee ._cdc ++;return nil ;};const (_dgcb rulingKind =iota ;_dffd ;_dddb ;);
|
|
|
|
|
|
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
func (_fac *TextMarkArray )Append (mark TextMark ){_fac ._fad =append (_fac ._fad ,mark )};func (_dedb paraList )readBefore (_dafa []int ,_cedgb ,_ebefg int )bool {_aaea ,_cfbcb :=_dedb [_cedgb ],_dedb [_ebefg ];if _dbfdc (_aaea ,_cfbcb )&&_aaea .Lly > _cfbcb .Lly {return true ;
|
|
|
|
|
};if !(_aaea ._bcgb .Urx < _cfbcb ._bcgb .Llx ){return false ;};_ccee ,_beea :=_aaea .Lly ,_cfbcb .Lly ;if _ccee > _beea {_beea ,_ccee =_ccee ,_beea ;};_eagg :=_f .Max (_aaea ._bcgb .Llx ,_cfbcb ._bcgb .Llx );_gdee :=_f .Min (_aaea ._bcgb .Urx ,_cfbcb ._bcgb .Urx );
|
|
|
|
|
_gfda :=_dedb .llyRange (_dafa ,_ccee ,_beea );for _ ,_acgeg :=range _gfda {if _acgeg ==_cedgb ||_acgeg ==_ebefg {continue ;};_dgcd :=_dedb [_acgeg ];if _dgcd ._bcgb .Llx <=_gdee &&_eagg <=_dgcd ._bcgb .Urx {return false ;};};return true ;};func (_cbg *textObject )setTextRise (_cegb float64 ){if _cbg ==nil {return ;
|
|
|
|
|
};_cbg ._efb ._cea =_cegb ;};func _efbb (_eedg *wordBag ,_aadad float64 ,_fagf ,_gaad rulingList )[]*wordBag {var _adad []*wordBag ;for _ ,_acdfd :=range _eedg .depthIndexes (){_fffd :=false ;for !_eedg .empty (_acdfd ){_eccgf :=_eedg .firstReadingIndex (_acdfd );
|
|
|
|
|
_ddbe :=_eedg .firstWord (_eccgf );_acba :=_acbgd (_ddbe ,_aadad ,_fagf ,_gaad );_eedg .removeWord (_ddbe ,_eccgf );if _afa {_gd .Log .Info ("\u0066\u0069\u0072\u0073\u0074\u0057\u006f\u0072\u0064\u0020\u005e\u005e^\u005e\u0020\u0025\u0073",_ddbe .String ());
|
|
|
|
|
};for _bdgeb :=true ;_bdgeb ;_bdgeb =_fffd {_fffd =false ;_decg :=_cede *_acba ._cfec ;_aaed :=_ffgec *_acba ._cfec ;_aeff :=_geea *_acba ._cfec ;if _afa {_gd .Log .Info ("\u0070a\u0072a\u0057\u006f\u0072\u0064\u0073\u0020\u0064\u0065\u0070\u0074\u0068 \u0025\u002e\u0032\u0066 \u002d\u0020\u0025\u002e\u0032f\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061\u0044\u0065\u0070\u0074\u0068\u0047\u0061\u0070\u003d\u0025\u002e\u0032\u0066\u0020\u006d\u0061\u0078\u0049\u006e\u0074\u0072\u0061R\u0065\u0061\u0064\u0069\u006e\u0067\u0047\u0061p\u003d\u0025\u002e\u0032\u0066",_acba .minDepth (),_acba .maxDepth (),_aeff ,_aaed );
|
|
|
|
|
};if _eedg .scanBand ("\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c",_acba ,_cgdg (_fbf ,0),_acba .minDepth ()-_aeff ,_acba .maxDepth ()+_aeff ,_febe ,false ,false )> 0{_fffd =true ;};if _eedg .scanBand ("\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_acba ,_cgdg (_fbf ,_aaed ),_acba .minDepth (),_acba .maxDepth (),_defgd ,false ,false )> 0{_fffd =true ;
|
|
|
|
|
};if _fffd {continue ;};_ddce :=_eedg .scanBand ("",_acba ,_cgdg (_cfcf ,_decg ),_acba .minDepth (),_acba .maxDepth (),_bfff ,true ,false );if _ddce > 0{_gdab :=(_acba .maxDepth ()-_acba .minDepth ())/_acba ._cfec ;if (_ddce > 1&&float64 (_ddce )> 0.3*_gdab )||_ddce <=10{if _eedg .scanBand ("\u006f\u0074\u0068e\u0072",_acba ,_cgdg (_cfcf ,_decg ),_acba .minDepth (),_acba .maxDepth (),_bfff ,false ,true )> 0{_fffd =true ;
|
|
|
|
|
};};};};_adad =append (_adad ,_acba );};};return _adad ;};var _abff =map[rulingKind ]string {_dgcb :"\u006e\u006f\u006e\u0065",_dffd :"\u0068\u006f\u0072\u0069\u007a\u006f\u006e\u0074\u0061\u006c",_dddb :"\u0076\u0065\u0072\u0074\u0069\u0063\u0061\u006c"};
|
|
|
|
|
func (_eagb *textObject )getFontDict (_acdc string )(_gaaa _eb .PdfObject ,_cebc error ){_gebc :=_eagb ._dcgb ;if _gebc ==nil {_gd .Log .Debug ("g\u0065\u0074\u0046\u006f\u006e\u0074D\u0069\u0063\u0074\u002e\u0020\u004eo\u0020\u0072\u0065\u0073\u006f\u0075\u0072c\u0065\u0073\u002e\u0020\u006e\u0061\u006d\u0065\u003d\u0025#\u0071",_acdc );
|
|
|
|
|
return nil ,nil ;};_gaaa ,_faab :=_gebc .GetFontByName (_eb .PdfObjectName (_acdc ));if !_faab {_gd .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u0067\u0065t\u0046\u006f\u006et\u0044\u0069\u0063\u0074\u003a\u0020\u0046\u006f\u006et \u006e\u006f\u0074 \u0066\u006fu\u006e\u0064\u003a\u0020\u006e\u0061m\u0065\u003d%\u0023\u0071",_acdc );
|
|
|
|
|
return nil ,_dd .New ("f\u006f\u006e\u0074\u0020no\u0074 \u0069\u006e\u0020\u0072\u0065s\u006f\u0075\u0072\u0063\u0065\u0073");};return _gaaa ,nil ;};type textPara struct{_bc .PdfRectangle ;_bcgb _bc .PdfRectangle ;_bcca []*textLine ;_egea *textTable ;
|
|
|
|
|
_abgd bool ;_fbed bool ;_dbfb *textPara ;_fefc *textPara ;_ffeb *textPara ;_eebd *textPara ;};func (_ecag *textTable )subdivide ()*textTable {_ecag .logComposite ("\u0073u\u0062\u0064\u0069\u0076\u0069\u0064e");_afcf :=_ecag .compositeRowCorridors ();_dfdgg :=_ecag .compositeColCorridors ();
|
|
|
|
|
if _gcdc {_gd .Log .Info ("\u0073u\u0062\u0064i\u0076\u0069\u0064\u0065:\u000a\u0009\u0072o\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s=\u0025\u0073\u000a\t\u0063\u006fl\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073=\u0025\u0073",_eebdg (_afcf ),_eebdg (_dfdgg ));
|
|
|
|
|
};if len (_afcf )==0||len (_dfdgg )==0{return _ecag ;};_bcfgg (_afcf );_bcfgg (_dfdgg );if _gcdc {_gd .Log .Info ("\u0073\u0075\u0062\u0064\u0069\u0076\u0069\u0064\u0065\u0020\u0066\u0069\u0078\u0065\u0064\u003a\u000a\u0009r\u006f\u0077\u0043\u006f\u0072\u0072\u0069d\u006f\u0072\u0073\u003d\u0025\u0073\u000a\u0009\u0063\u006f\u006cC\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d\u0025\u0073",_eebdg (_afcf ),_eebdg (_dfdgg ));
|
|
|
|
|
};_acbdg ,_gcaf :=_acgefa (_ecag ._bfbba ,_afcf );_gagc ,_fcgfe :=_acgefa (_ecag ._cacec ,_dfdgg );_ggddb :=make (map[uint64 ]*textPara ,_fcgfe *_gcaf );_bggeg :=&textTable {PdfRectangle :_ecag .PdfRectangle ,_dcbbg :_ecag ._dcbbg ,_bfbba :_gcaf ,_cacec :_fcgfe ,_egfdd :_ggddb };
|
|
|
|
|
if _gcdc {_gd .Log .Info ("\u0073\u0075b\u0064\u0069\u0076\u0069\u0064\u0065\u003a\u0020\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0020\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0063\u0065\u006c\u006c\u0073\u003d\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072s\u003d\u0025\u0073\u000a"+"\u0009\u0079\u004f\u0066\u0066\u0073\u0065\u0074\u0073=\u0025\u002b\u0076\u000a"+"\u0009\u0078\u004f\u0066\u0066\u0073\u0065\u0074\u0073\u003d\u0025\u002b\u0076",_ecag ._cacec ,_ecag ._bfbba ,_fcgfe ,_gcaf ,_eebdg (_afcf ),_eebdg (_dfdgg ),_acbdg ,_gagc );
|
|
|
|
|
};for _cbcc :=0;_cbcc < _ecag ._bfbba ;_cbcc ++{_aefe :=_acbdg [_cbcc ];for _fgeb :=0;_fgeb < _ecag ._cacec ;_fgeb ++{_bddbe :=_gagc [_fgeb ];if _gcdc {_ae .Printf ("\u0025\u0036\u0064\u002c %\u0032\u0064\u003a\u0020\u0078\u0030\u003d\u0025\u0064\u0020\u0079\u0030\u003d\u0025d\u000a",_fgeb ,_cbcc ,_bddbe ,_aefe );
|
|
|
|
|
};_ggddg ,_afeb :=_ecag ._ffgge [_daceb (_fgeb ,_cbcc )];if !_afeb {continue ;};_gegc :=_ggddg .split (_afcf [_cbcc ],_dfdgg [_fgeb ]);for _gegb :=0;_gegb < _gegc ._bfbba ;_gegb ++{for _adda :=0;_adda < _gegc ._cacec ;_adda ++{_bcdb :=_gegc .get (_adda ,_gegb );
|
|
|
|
|
_bggeg .put (_bddbe +_adda ,_aefe +_gegb ,_bcdb );if _gcdc {_ae .Printf ("\u0025\u0038\u0064\u002c\u0020\u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_bddbe +_adda ,_aefe +_gegb ,_bcdb );};};};};};return _bggeg ;};func (_gfedc rulingList )findPrimSec (_bgce ,_ffcd float64 )*ruling {for _ ,_adffd :=range _gfedc {if _gfbdg (_adffd ._aadb -_bgce )&&_adffd ._eacb -_cdbcd <=_ffcd &&_ffcd <=_adffd ._cbf +_cdbcd {return _adffd ;
|
|
|
|
|
};};return nil ;};func (_cdgf paraList )reorder (_adca []int ){_aeecg :=make (paraList ,len (_cdgf ));for _dgae ,_caga :=range _adca {_aeecg [_dgae ]=_cdgf [_caga ];};copy (_cdgf ,_aeecg );};func (_acfb *textTable )toTextTable ()TextTable {if _gcdc {_gd .Log .Info ("t\u006fT\u0065\u0078\u0074\u0054\u0061\u0062\u006c\u0065:\u0020\u0025\u0064\u0020x \u0025\u0064",_acfb ._cacec ,_acfb ._bfbba );
|
|
|
|
|
};_dagg :=make ([][]TableCell ,_acfb ._bfbba );for _dace :=0;_dace < _acfb ._bfbba ;_dace ++{_dagg [_dace ]=make ([]TableCell ,_acfb ._cacec );for _gage :=0;_gage < _acfb ._cacec ;_gage ++{_eceae :=_acfb .get (_gage ,_dace );if _eceae ==nil {continue ;
|
|
|
|
|
};if _gcdc {_ae .Printf ("\u0025\u0034\u0064 \u0025\u0032\u0064\u003a\u0020\u0025\u0073\u000a",_gage ,_dace ,_eceae );};_dagg [_dace ][_gage ].Text =_eceae .text ();_feabc :=0;_dagg [_dace ][_gage ].Marks ._fad =_eceae .toTextMarks (&_feabc );};};return TextTable {W :_acfb ._cacec ,H :_acfb ._bfbba ,Cells :_dagg };
|
|
|
|
|
};type imageExtractContext struct{_ee []ImageMark ;_cdc int ;_be int ;_dcc int ;_ed map[*_eb .PdfObjectStream ]*cachedImage ;_cgf *ImageExtractOptions ;};
|
|
|
|
|
|
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
|
|
|
|
// CharcodeBytesToUnicode.
|
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
|
|
|
|
func (_dde *Extractor )ExtractText ()(string ,error ){_fce ,_ ,_ ,_ccc :=_dde .ExtractTextWithStats ();return _fce ,_ccc ;};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `k`.
|
|
|
|
|
func (_cceb markKind )String ()string {_gecb ,_acdeb :=_afcd [_cceb ];if !_acdeb {return _ae .Sprintf ("\u004e\u006f\u0074\u0020\u0061\u0020\u006d\u0061\u0072k\u003a\u0020\u0025\u0064",_cceb );};return _gecb ;};
|
|
|
|
|
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
|
type Extractor struct{_ec string ;_ffe *_bc .PdfPageResources ;_dag _bc .PdfRectangle ;_ca map[string ]fontEntry ;_dg map[string ]textResult ;_ceg int64 ;_cc int ;};func (_bdba *textTable )computeBbox ()_bc .PdfRectangle {var _afceg _bc .PdfRectangle ;
|
|
|
|
|
_faag :=false ;for _dgad :=0;_dgad < _bdba ._bfbba ;_dgad ++{for _adae :=0;_adae < _bdba ._cacec ;_adae ++{_cebcb :=_bdba .get (_adae ,_dgad );if _cebcb ==nil {continue ;};if !_faag {_afceg =_cebcb .PdfRectangle ;_faag =true ;}else {_afceg =_deb (_afceg ,_cebcb .PdfRectangle );
|
|
|
|
|
};};};return _afceg ;};func (_ggbab *textLine )toTextMarks (_addbg *int )[]TextMark {var _ggcc []TextMark ;for _ ,_cecf :=range _ggbab ._dee {if _cecf ._cfcd {_ggcc =_aada (_ggcc ,_addbg ,"\u0020");};_abfcf :=_cecf .toTextMarks (_addbg );_ggcc =append (_ggcc ,_abfcf ...);
|
|
|
|
|
};return _ggcc ;};func (_aacb *textLine )pullWord (_edbd *wordBag ,_fdb *textWord ,_gbbe int ){_aacb .appendWord (_fdb );_edbd .removeWord (_fdb ,_gbbe );};func _aeac (_dgda *wordBag ,_gdge int )*textLine {_bdbea :=_dgda .firstWord (_gdge );_egca :=textLine {PdfRectangle :_bdbea .PdfRectangle ,_cgdd :_bdbea ._edega ,_cedfb :_bdbea ._gdce };
|
|
|
|
|
_egca .pullWord (_dgda ,_bdbea ,_gdge );return &_egca ;};func (_eeb *textObject )moveTextSetLeading (_cbd ,_cfc float64 ){_eeb ._efb ._fgdf =-_cfc ;_eeb .moveLP (_cbd ,_cfc );};func (_eabb *textObject )moveLP (_cef ,_ecea float64 ){_eabb ._ggdd .Concat (_ff .NewMatrix (1,0,0,1,_cef ,_ecea ));
|
|
|
|
|
_eabb ._cdf =_eabb ._ggdd ;};func (_dbbfg *textPara )toCellTextMarks (_fgg *int )[]TextMark {var _gbd []TextMark ;for _ggbbg ,_dcfc :=range _dbbfg ._bcca {_daae :=_dcfc .toTextMarks (_fgg );_bfcg :=_gacf &&_dcfc .endsInHyphen ()&&_ggbbg !=len (_dbbfg ._bcca )-1;
|
|
|
|
|
if _bfcg {_daae =_bdcdg (_daae ,_fgg );};_gbd =append (_gbd ,_daae ...);if !(_bfcg ||_ggbbg ==len (_dbbfg ._bcca )-1){_gbd =_aada (_gbd ,_fgg ,_gfebf (_dcfc ._cedfb ,_dbbfg ._bcca [_ggbbg +1]._cedfb ));};};return _gbd ;};func _cddc (_ceae ,_gfff _bc .PdfRectangle )bool {return _ceae .Llx <=_gfff .Llx &&_gfff .Urx <=_ceae .Urx &&_ceae .Lly <=_gfff .Lly &&_gfff .Ury <=_ceae .Ury ;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
|
|
|
|
func (_dbbd PageText )Marks ()*TextMarkArray {return &TextMarkArray {_fad :_dbbd ._dcbg }};func (_bdcd *textObject )setTextLeading (_aec float64 ){if _bdcd ==nil {return ;};_bdcd ._efb ._fgdf =_aec ;};func _acbgd (_acgf *textWord ,_gacg float64 ,_bcfe ,_ffab rulingList )*wordBag {_ebc :=_faaa (_acgf ._gdce );
|
|
|
|
|
_fgba :=[]*textWord {_acgf };_acbc :=wordBag {_aaaf :map[int ][]*textWord {_ebc :_fgba },PdfRectangle :_acgf .PdfRectangle ,_cfec :_acgf ._edega ,_agadg :_gacg ,_fedd :_bcfe ,_gec :_ffab };return &_acbc ;};func (_gffg rulingList )blocks (_abfg ,_fefa *ruling )bool {if _abfg ._eacb > _fefa ._cbf ||_fefa ._eacb > _abfg ._cbf {return false ;
|
|
|
|
|
};_aabb :=_f .Max (_abfg ._eacb ,_fefa ._eacb );_afbf :=_f .Min (_abfg ._cbf ,_fefa ._cbf );if _abfg ._aadb > _fefa ._aadb {_abfg ,_fefa =_fefa ,_abfg ;};for _ ,_dgbca :=range _gffg {if _abfg ._aadb <=_dgbca ._aadb +_gdff &&_dgbca ._aadb <=_fefa ._aadb +_gdff &&_dgbca ._eacb <=_afbf &&_aabb <=_dgbca ._cbf {return true ;
|
|
|
|
|
};};return false ;};func (_gcca *subpath )removeDuplicates (){if len (_gcca ._cfac )==0{return ;};_cedf :=[]_ff .Point {_gcca ._cfac [0]};for _ ,_gdebe :=range _gcca ._cfac [1:]{if !_fgbd (_gdebe ,_cedf [len (_cedf )-1]){_cedf =append (_cedf ,_gdebe );
|
|
|
|
|
};};_gcca ._cfac =_cedf ;};func (_egdg *textWord )appendMark (_aegae *textMark ,_cagf _bc .PdfRectangle ){_egdg ._eeefd =append (_egdg ._eeefd ,_aegae );_egdg .PdfRectangle =_deb (_egdg .PdfRectangle ,_aegae .PdfRectangle );if _aegae ._ebgc > _egdg ._edega {_egdg ._edega =_aegae ._ebgc ;
|
|
|
|
|
};_egdg ._gdce =_cagf .Ury -_egdg .PdfRectangle .Lly ;};func _ffaeg (_aaeafd string )(string ,bool ){_fdee :=[]rune (_aaeafd );if len (_fdee )!=1{return "",false ;};_efddb ,_bafc :=_ddcda [_fdee [0]];return _efddb ,_bafc ;};func (_dfdd rulingList )tidied (_ddcg string )rulingList {_cagd :=_dfdd .removeDuplicates ();
|
|
|
|
|
_cagd .log ("\u0075n\u0069\u0071\u0075\u0065\u0073");_efead :=_cagd .snapToGroups ();if _efead ==nil {return nil ;};_efead .sort ();if _acea {_gd .Log .Info ("\u0074\u0069\u0064i\u0065\u0064\u003a\u0020\u0025\u0071\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u0075\u006e\u0069\u0071\u0075\u0065\u0073\u003d\u0025\u0064\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0064\u003d\u0025\u0064",_ddcg ,len (_dfdd ),len (_cagd ),len (_efead ));
|
|
|
|
|
};_efead .log ("\u0063o\u0061\u006c\u0065\u0073\u0063\u0065d");return _efead ;};type gridTiling struct{_bc .PdfRectangle ;_defe []float64 ;_afbc []float64 ;_gdgc map[float64 ]map[float64 ]gridTile ;};func _cecfg (_adccd ,_cbed int )int {if _adccd < _cbed {return _adccd ;
|
|
|
|
|
};return _cbed ;};func _bcgdg (_dacb ,_eee _bc .PdfRectangle )bool {return _eee .Llx <=_dacb .Urx &&_dacb .Llx <=_eee .Urx };func (_gggd rulingList )merge ()*ruling {_gbbb :=_gggd [0]._aadb ;_adbg :=_gggd [0]._eacb ;_fffe :=_gggd [0]._cbf ;for _ ,_ddea :=range _gggd [1:]{_gbbb +=_ddea ._aadb ;
|
|
|
|
|
if _ddea ._eacb < _adbg {_adbg =_ddea ._eacb ;};if _ddea ._cbf > _fffe {_fffe =_ddea ._cbf ;};};_dagd :=&ruling {_beb :_gggd [0]._beb ,_fafbe :_gggd [0]._fafbe ,Color :_gggd [0].Color ,_aadb :_gbbb /float64 (len (_gggd )),_eacb :_adbg ,_cbf :_fffe };if _gbfc {_gd .Log .Info ("\u006de\u0072g\u0065\u003a\u0020\u0025\u0032d\u0020\u0076e\u0063\u0073\u0020\u0025\u0073",len (_gggd ),_dagd );
|
|
|
|
|
for _edef ,_gegf :=range _gggd {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_edef ,_gegf );};};return _dagd ;};func _gfbdg (_dccfa float64 )bool {return _f .Abs (_dccfa )< _bdcdb };func (_fdfdf *textTable )markCells (){for _bffb :=0;
|
|
|
|
|
_bffb < _fdfdf ._bfbba ;_bffb ++{for _eabd :=0;_eabd < _fdfdf ._cacec ;_eabd ++{_aadg :=_fdfdf .get (_eabd ,_bffb );if _aadg !=nil {_aadg ._abgd =true ;};};};};
|
|
|
|
|
|
|
|
|
|
// String returns a string describing the current state of the textState stack.
|
|
|
|
|
func (_faga *stateStack )String ()string {_gafe :=[]string {_ae .Sprintf ("\u002d\u002d\u002d\u002d f\u006f\u006e\u0074\u0020\u0073\u0074\u0061\u0063\u006b\u003a\u0020\u0025\u0064",len (*_faga ))};for _ecd ,_fede :=range *_faga {_feb :="\u003c\u006e\u0069l\u003e";
|
|
|
|
|
if _fede !=nil {_feb =_fede .String ();};_gafe =append (_gafe ,_ae .Sprintf ("\u0009\u0025\u0032\u0064\u003a\u0020\u0025\u0073",_ecd ,_feb ));};return _a .Join (_gafe ,"\u000a");};func _efeef (_fdadb ,_edbc _ff .Point )rulingKind {_acfd :=_f .Abs (_fdadb .X -_edbc .X );
|
|
|
|
|
_bfac :=_f .Abs (_fdadb .Y -_edbc .Y );return _bgeb (_acfd ,_bfac ,_cbged );};func _gaee (_dcff []pathSection )rulingList {_agbd (_dcff );if _acea {_gd .Log .Info ("\u006da\u006b\u0065\u0046\u0069l\u006c\u0052\u0075\u006c\u0069n\u0067s\u003a \u0025\u0064\u0020\u0066\u0069\u006c\u006cs",len (_dcff ));
|
|
|
|
|
};var _dgdeg rulingList ;for _ ,_eeca :=range _dcff {for _ ,_eaac :=range _eeca ._bbd {if !_eaac .isQuadrilateral (){if _acea {_gd .Log .Error ("!\u0069s\u0051\u0075\u0061\u0064\u0072\u0069\u006c\u0061t\u0065\u0072\u0061\u006c: \u0025\u0073",_eaac );};
|
|
|
|
|
continue ;};if _eagfa ,_adgc :=_eaac .makeRectRuling (_eeca .Color );_adgc {_dgdeg =append (_dgdeg ,_eagfa );}else {if _dcgde {_gd .Log .Error ("\u0021\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0073",_eaac );
|
|
|
|
|
};};};};if _acea {_gd .Log .Info ("\u006d\u0061\u006b\u0065Fi\u006c\u006c\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0073",_dgdeg .String ());};return _dgdeg ;};type textLine struct{_bc .PdfRectangle ;_cedfb float64 ;_dee []*textWord ;
|
|
|
|
|
_cgdd float64 ;};
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `vecs`.
|
|
|
|
|
func (_gaff rulingList )String ()string {if len (_gaff )==0{return "\u007b \u0045\u004d\u0050\u0054\u0059\u0020}";};_ddgd ,_edcg :=_gaff .vertsHorzs ();_dbac :=len (_ddgd );_bedab :=len (_edcg );if _dbac ==0||_bedab ==0{return _ae .Sprintf ("\u007b%\u0064\u0020\u0078\u0020\u0025\u0064}",_dbac ,_bedab );
|
|
|
|
|
};_edga :=_bc .PdfRectangle {Llx :_ddgd [0]._aadb ,Urx :_ddgd [_dbac -1]._aadb ,Lly :_edcg [_bedab -1]._aadb ,Ury :_edcg [0]._aadb };return _ae .Sprintf ("\u007b\u0025d\u0020\u0078\u0020%\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u007d",_dbac ,_bedab ,_edga );
|
|
|
|
|
};func _fbgca (_dgdd float64 )float64 {return _ebdf *_f .Round (_dgdd /_ebdf )};func (_afef *shapesState )clearPath (){_afef ._cbdd =nil ;_afef ._bbfc =false ;if _egfa {_gd .Log .Info ("\u0043\u004c\u0045A\u0052\u003a\u0020\u0073\u0073\u003d\u0025\u0073",_afef );
|
|
|
|
|
};};
|
|
|
|
|
|
|
|
|
|
// ImageMark represents an image drawn on a page and its position in device coordinates.
|
|
|
|
|
// All coordinates are in device coordinates.
|
|
|
|
|
type ImageMark struct{Image *_bc .Image ;
|
|
|
|
|
|
|
|
|
|
// Dimensions of the image as displayed in the PDF.
|
|
|
|
|
Width float64 ;Height float64 ;
|
|
|
|
|
|
|
|
|
|
// Position of the image in PDF coordinates (lower left corner).
|
|
|
|
|
X float64 ;Y float64 ;
|
|
|
|
|
|
|
|
|
|
// Angle in degrees, if rotated.
|
|
|
|
|
Angle float64 ;};
|
|
|
|
|
|
|
|
|
|
// Tables returns the tables extracted from the page.
|
|
|
|
|
func (_dgbf PageText )Tables ()[]TextTable {if _gcdc {_gd .Log .Info ("\u0054\u0061\u0062\u006c\u0065\u0073\u003a\u0020\u0025\u0064",len (_dgbf ._dfc ));};return _dgbf ._dfc ;};func _ccec (_dabgg []rulingList )(rulingList ,rulingList ){var _cbgd rulingList ;
|
|
|
|
|
for _ ,_caad :=range _dabgg {_cbgd =append (_cbgd ,_caad ...);};return _cbgd .vertsHorzs ();};func _gge (_ddeb *Extractor ,_acb *_bc .PdfPageResources ,_geb _de .GraphicsState ,_bcag *textState ,_dcgd *stateStack )*textObject {return &textObject {_aac :_ddeb ,_dcgb :_acb ,_faf :_geb ,_dcd :_dcgd ,_efb :_bcag ,_cdf :_ff .IdentityMatrix (),_ggdd :_ff .IdentityMatrix ()};
|
|
|
|
|
};func (_ccf *shapesState )cubicTo (_bcc ,_fbg ,_gcdb ,_agad ,_gab ,_fcda float64 ){if _egfa {_gd .Log .Info ("\u0063\u0075\u0062\u0069\u0063\u0054\u006f\u003a");};_ccf .addPoint (_gab ,_fcda );};func (_bbbf rulingList )comp (_bag ,_bfbb int )bool {_fdgccd ,_adec :=_bbbf [_bag ],_bbbf [_bfbb ];
|
|
|
|
|
_cbdb ,_gcgbc :=_fdgccd ._beb ,_adec ._beb ;if _cbdb !=_gcgbc {return _cbdb > _gcgbc ;};if _cbdb ==_dgcb {return false ;};_cgec :=func (_aade bool )bool {if _cbdb ==_dffd {return _aade ;};return !_aade ;};_gcefc ,_bbea :=_fdgccd ._aadb ,_adec ._aadb ;if _gcefc !=_bbea {return _cgec (_gcefc > _bbea );
|
|
|
|
|
};_gcefc ,_bbea =_fdgccd ._eacb ,_adec ._eacb ;if _gcefc !=_bbea {return _cgec (_gcefc < _bbea );};return _cgec (_fdgccd ._cbf < _adec ._cbf );};
|
|
|
|
|
|
|
|
|
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
|
|
|
|
// and position, size information for each image.
|
|
|
|
|
// A set of options to control page image extraction can be passed in. The options
|
|
|
|
|
// parameter can be nil for the default options. By default, inline stencil masks
|
|
|
|
|
// are not extracted.
|
|
|
|
|
func (_bf *Extractor )ExtractPageImages (options *ImageExtractOptions )(*PageImages ,error ){_ba :=&imageExtractContext {_cgf :options };_fb :=_ba .extractContentStreamImages (_bf ._ec ,_bf ._ffe );if _fb !=nil {return nil ,_fb ;};return &PageImages {Images :_ba ._ee },nil ;
|
|
|
|
|
};func (_efee compositeCell )hasLines (_dgde []*textLine )bool {for _dgee ,_deca :=range _dgde {_bdac :=_bafa (_efee .PdfRectangle ,_deca .PdfRectangle );if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u005e\u005e\u005e\u0069\u006e\u0074\u0065\u0072\u0073e\u0063t\u0073\u003d\u0025\u0074\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u000a",_bdac ,_dgee ,len (_dgde ));
|
|
|
|
|
_ae .Printf ("\u0020\u0020\u0020\u0020 \u005e\u005e\u005e\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025s\u000a",_efee );_ae .Printf ("\u0020 \u0020 \u0020\u0020\u0020\u006c\u0069\u006e\u0065\u003d\u0025\u0073\u000a",_deca );};if _bdac {return true ;
|
|
|
|
|
};};return false ;};func (_dbgc *shapesState )lineTo (_efe ,_edad float64 ){if _egfa {_gd .Log .Info ("\u006c\u0069\u006eeT\u006f\u0028\u0025\u002e\u0032\u0066\u002c\u0025\u002e\u0032\u0066\u0020\u0070\u003d\u0025\u002e\u0032\u0066",_efe ,_edad ,_dbgc .devicePoint (_efe ,_edad ));
|
|
|
|
|
};_dbgc .addPoint (_efe ,_edad );};func _cgea (_beaa _bc .PdfRectangle )rulingKind {_daaea :=_beaa .Width ();_ggecc :=_beaa .Height ();if _daaea > _ggecc {if _daaea >=_cbged {return _dffd ;};}else {if _ggecc >=_cbged {return _dddb ;};};return _dgcb ;};
|
|
|
|
|
func (_def *shapesState )lastpointEstablished ()(_ff .Point ,bool ){if _def ._bbfc {return _def ._egd ,false ;};_effgd :=len (_def ._cbdd );if _effgd > 0&&_def ._cbdd [_effgd -1]._abe {return _def ._cbdd [_effgd -1].last (),false ;};return _ff .Point {},true ;
|
|
|
|
|
};func (_edfa *wordBag )empty (_gege int )bool {_ ,_egfe :=_edfa ._aaaf [_gege ];return !_egfe };type markKind int ;func (_fecc compositeCell )parasBBox ()(paraList ,_bc .PdfRectangle ){return _fecc .paraList ,_fecc .PdfRectangle ;};func (_ebdad *textTable )reduce ()*textTable {_egdc :=make ([]int ,0,_ebdad ._bfbba );
|
|
|
|
|
_dafg :=make ([]int ,0,_ebdad ._cacec );for _cdae :=0;_cdae < _ebdad ._bfbba ;_cdae ++{if !_ebdad .emptyRow (_cdae ){_egdc =append (_egdc ,_cdae );};};for _dgedd :=0;_dgedd < _ebdad ._cacec ;_dgedd ++{if !_ebdad .emptyColumn (_dgedd ){_dafg =append (_dafg ,_dgedd );
|
|
|
|
|
};};if len (_egdc )==_ebdad ._bfbba &&len (_dafg )==_ebdad ._cacec {return _ebdad ;};_cdgea :=textTable {_dcbbg :_ebdad ._dcbbg ,_cacec :len (_dafg ),_bfbba :len (_egdc ),_egfdd :make (map[uint64 ]*textPara ,len (_dafg )*len (_egdc ))};if _gcdc {_gd .Log .Info ("\u0072\u0065\u0064\u0075ce\u003a\u0020\u0025\u0064\u0078\u0025\u0064\u0020\u002d\u003e\u0020\u0025\u0064\u0078%\u0064",_ebdad ._cacec ,_ebdad ._bfbba ,len (_dafg ),len (_egdc ));
|
|
|
|
|
_gd .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_dafg );_gd .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_egdc );};for _fffa ,_aeef :=range _egdc {for _fcecd ,_cdebb :=range _dafg {_bgaef :=_ebdad .get (_cdebb ,_aeef );
|
|
|
|
|
if _bgaef ==nil {continue ;};if _gcdc {_ae .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_fcecd ,_fffa ,_cdebb ,_aeef ,_eaed (_bgaef .text (),50));};_cdgea .put (_fcecd ,_fffa ,_bgaef );
|
|
|
|
|
};};return &_cdgea ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-04-23 20:28:14 +00:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
2021-05-11 00:01:27 +00:00
|
|
|
|
func (_cge *Extractor )ExtractTextWithStats ()(_eg string ,_bfb int ,_aeb int ,_egf error ){_fae ,_bfb ,_aeb ,_egf :=_cge .ExtractPageText ();if _egf !=nil {return "",_bfb ,_aeb ,_egf ;};return _fae .Text (),_bfb ,_aeb ,nil ;};func (_ddcc paraList )writeText (_cafa _e .Writer ){for _cdeeb ,_bdaf :=range _ddcc {if _bdaf ._fbed {continue ;
|
|
|
|
|
};_bdaf .writeText (_cafa );if _cdeeb !=len (_ddcc )-1{if _cdea (_bdaf ,_ddcc [_cdeeb +1]){_cafa .Write ([]byte ("\u0020"));}else {_cafa .Write ([]byte ("\u000a"));_cafa .Write ([]byte ("\u000a"));};};};_cafa .Write ([]byte ("\u000a"));_cafa .Write ([]byte ("\u000a"));
|
|
|
|
|
};func (_fgaf *textTable )getComposite (_febf ,_cdeea int )(paraList ,_bc .PdfRectangle ){_abcb ,_dffa :=_fgaf ._ffgge [_daceb (_febf ,_cdeea )];if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0067\u0065\u0074\u0043\u006f\u006d\u0070o\u0073i\u0074\u0065\u0028\u0025\u0064\u002c\u0025\u0064\u0029\u002d\u003e\u0025\u0073\u000a",_febf ,_cdeea ,_abcb .String ());
|
|
|
|
|
};if !_dffa {return nil ,_bc .PdfRectangle {};};return _abcb .parasBBox ();};type gridTile struct{_bc .PdfRectangle ;_adfgd ,_cgaag ,_ggfc ,_fbfa bool ;};func _gdfbd (_fdade _bc .PdfRectangle ,_cded []*textLine )*textPara {return &textPara {PdfRectangle :_fdade ,_bcca :_cded };
|
|
|
|
|
};func _eccg (_bcdg ,_aebga bounded )float64 {return _bcdg .bbox ().Llx -_aebga .bbox ().Urx };func _acgefa (_aeeae int ,_dffc map[int ][]float64 )([]int ,int ){_efga :=make ([]int ,_aeeae );_cgdf :=0;for _fbaf :=0;_fbaf < _aeeae ;_fbaf ++{_efga [_fbaf ]=_cgdf ;
|
|
|
|
|
_cgdf +=len (_dffc [_fbaf ])+1;};return _efga ,_cgdf ;};func _adedc (_bcgbe string ,_ddag []rulingList ){_gd .Log .Info ("\u0024\u0024 \u0025\u0064\u0020g\u0072\u0069\u0064\u0073\u0020\u002d\u0020\u0025\u0073",len (_ddag ),_bcgbe );for _gfffc ,_fedda :=range _ddag {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gfffc ,_fedda .String ());
|
|
|
|
|
};};func (_feg *textObject )reset (){_feg ._cdf =_ff .IdentityMatrix ();_feg ._ggdd =_ff .IdentityMatrix ();_feg ._ddeg =nil ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
func (_gbag TextMark )String ()string {_eebe :=_gbag .BBox ;var _dae string ;if _gbag .Font !=nil {_dae =_gbag .Font .String ();if len (_dae )> 50{_dae =_dae [:50]+"\u002e\u002e\u002e";};};var _gafa string ;if _gbag .Meta {_gafa ="\u0020\u002a\u004d\u002a";
|
|
|
|
|
};return _ae .Sprintf ("\u007b\u0054\u0065\u0078t\u004d\u0061\u0072\u006b\u003a\u0020\u0025\u0064\u0020%\u0071\u003d\u0025\u0030\u0032\u0078\u0020\u0028\u0025\u0036\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e2\u0066\u0029\u0020\u0028\u00256\u002e\u0032\u0066\u002c\u0020\u0025\u0036\u002e\u0032\u0066\u0029\u0020\u0025\u0073\u0025\u0073\u007d",_gbag .Offset ,_gbag .Text ,[]rune (_gbag .Text ),_eebe .Llx ,_eebe .Lly ,_eebe .Urx ,_eebe .Ury ,_dae ,_gafa );
|
|
|
|
|
};func (_dgedc *textTable )get (_baae ,_bedc int )*textPara {return _dgedc ._egfdd [_daceb (_baae ,_bedc )]};
|
|
|
|
|
|
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
func (_cdee *textMark )ToTextMark ()TextMark {return TextMark {Text :_cdee ._ecfa ,Original :_cdee ._gfg ,BBox :_cdee ._fgfd ,Font :_cdee ._aacba ,FontSize :_cdee ._ebgc ,FillColor :_cdee ._dcggg ,StrokeColor :_cdee ._aggg ,Orientation :_cdee ._ggbe };
|
|
|
|
|
};func _eaed (_dfaca string ,_ccaf int )string {if len (_dfaca )< _ccaf {return _dfaca ;};return _dfaca [:_ccaf ];};func _efgd (_babd float64 )bool {return _f .Abs (_babd )< _gdff };func _caag (_agegb _bc .PdfRectangle )*ruling {return &ruling {_beb :_dddb ,_aadb :_agegb .Llx ,_eacb :_agegb .Lly ,_cbf :_agegb .Ury };
|
|
|
|
|
};func (_gabed *textWord )toTextMarks (_eddc *int )[]TextMark {var _ccgb []TextMark ;for _ ,_bgbgd :=range _gabed ._eeefd {_ccgb =_gaaae (_ccgb ,_eddc ,_bgbgd .ToTextMark ());};return _ccgb ;};func (_eggg gridTile )complete ()bool {return _eggg .numBorders ()==4};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful.
|
|
|
|
|
// Replace with a function like Extract() (*PageText, error)
|
|
|
|
|
func (_ge *Extractor )ExtractPageText ()(*PageText ,int ,int ,error ){_bcg ,_fgd ,_cab ,_cfa :=_ge .extractPageText (_ge ._ec ,_ge ._ffe ,_ff .IdentityMatrix (),0);if _cfa !=nil {return nil ,0,0,_cfa ;};_bcg .computeViews ();_cfa =_bdgbe (_bcg );if _cfa !=nil {return nil ,0,0,_cfa ;
|
|
|
|
|
};return _bcg ,_fgd ,_cab ,nil ;};func _dedeg (_adfg ,_cdbb bounded )float64 {return _cdgg (_adfg )-_cdgg (_cdbb )};func (_aeead *textPara )writeText (_adbf _e .Writer ){if _aeead ._egea ==nil {_aeead .writeCellText (_adbf );return ;};for _dbda :=0;_dbda < _aeead ._egea ._bfbba ;
|
|
|
|
|
_dbda ++{for _dfea :=0;_dfea < _aeead ._egea ._cacec ;_dfea ++{_fadc :=_aeead ._egea .get (_dfea ,_dbda );if _fadc ==nil {_adbf .Write ([]byte ("\u0009"));}else {_fadc .writeCellText (_adbf );};_adbf .Write ([]byte ("\u0020"));};if _dbda < _aeead ._egea ._bfbba -1{_adbf .Write ([]byte ("\u000a"));
|
|
|
|
|
};};};func _dfac (_agaeg ,_bfg bounded )float64 {_fcfb :=_dedeg (_agaeg ,_bfg );if !_gfbdg (_fcfb ){return _fcfb ;};return _gfdd (_agaeg ,_bfg );};func _dbfdc (_dgbe ,_bdbeac *textPara )bool {return _bcgdg (_dgbe ._bcgb ,_bdbeac ._bcgb )};func _cgbf (_cgdde ,_edeg _ff .Point )bool {_dcbeb :=_f .Abs (_cgdde .X -_edeg .X );
|
|
|
|
|
_bcaea :=_f .Abs (_cgdde .Y -_edeg .Y );return _dbfcb (_bcaea ,_dcbeb );};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `state`.
|
|
|
|
|
func (_agae *textState )String ()string {_dbc :="\u005bN\u004f\u0054\u0020\u0053\u0045\u0054]";if _agae ._afc !=nil {_dbc =_agae ._afc .BaseFont ();};return _ae .Sprintf ("\u0074\u0063\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0077\u003d\u0025\u002e\u0032\u0066 \u0074f\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u003d\u0025\u0071",_agae ._aba ,_agae ._dccc ,_agae ._abfe ,_dbc );
|
|
|
|
|
};func (_ddccb gridTile )numBorders ()int {_bcddc :=0;if _ddccb ._cgaag {_bcddc ++;};if _ddccb ._fbfa {_bcddc ++;};if _ddccb ._ggfc {_bcddc ++;};if _ddccb ._adfgd {_bcddc ++;};return _bcddc ;};type fontEntry struct{_faef *_bc .PdfFont ;_gdbe int64 ;};func _cadc (_dfbe int ,_cgga func (int ,int )bool )[]int {_beeea :=make ([]int ,_dfbe );
|
|
|
|
|
for _ceaebc :=range _beeea {_beeea [_ceaebc ]=_ceaebc ;};_b .Slice (_beeea ,func (_bgca ,_bcbff int )bool {return _cgga (_beeea [_bgca ],_beeea [_bcbff ])});return _beeea ;};func (_gedee *textObject )renderText (_fedc []byte )error {if _gedee ._cde {_gd .Log .Debug ("\u0072\u0065\u006e\u0064\u0065r\u0054\u0065\u0078\u0074\u003a\u0020\u0049\u006e\u0076\u0061\u006c\u0069\u0064 \u0066\u006f\u006e\u0074\u002e\u0020\u004e\u006f\u0074\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u002e");
|
|
|
|
|
return nil ;};_fdac :=_gedee .getCurrentFont ();_acg :=_fdac .BytesToCharcodes (_fedc );_cfd ,_ddb ,_ddcb :=_fdac .CharcodesToStrings (_acg );if _ddcb > 0{_gd .Log .Debug ("\u0072\u0065nd\u0065\u0072\u0054e\u0078\u0074\u003a\u0020num\u0043ha\u0072\u0073\u003d\u0025\u0064\u0020\u006eum\u004d\u0069\u0073\u0073\u0065\u0073\u003d%\u0064",_ddb ,_ddcb );
|
|
|
|
|
};_gedee ._efb ._cdd +=_ddb ;_gedee ._efb ._ceab +=_ddcb ;_dca :=_gedee ._efb ;_cbag :=_dca ._abfe ;_bfe :=_dca ._aeea /100.0;_bgg :=_afdf ;if _fdac .Subtype ()=="\u0054\u0079\u0070e\u0033"{_bgg =1;};_edee ,_egcb :=_fdac .GetRuneMetrics (' ');if !_egcb {_edee ,_egcb =_fdac .GetCharMetrics (32);
|
|
|
|
|
};if !_egcb {_edee ,_ =_bc .DefaultFont ().GetRuneMetrics (' ');};_ecaca :=_edee .Wx *_bgg ;_gd .Log .Trace ("\u0073p\u0061\u0063e\u0057\u0069\u0064t\u0068\u003d\u0025\u002e\u0032\u0066\u0020t\u0065\u0078\u0074\u003d\u0025\u0071 \u0066\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0066\u006f\u006et\u0053\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066",_ecaca ,_cfd ,_fdac ,_cbag );
|
|
|
|
|
_eff :=_ff .NewMatrix (_cbag *_bfe ,0,0,_cbag ,0,_dca ._cea );if _aagd {_gd .Log .Info ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0074\u0065\u0078t\u0073\u003d\u0025\u0071",len (_acg ),_acg ,_cfd );
|
|
|
|
|
};_gd .Log .Trace ("\u0072\u0065\u006e\u0064\u0065\u0072T\u0065\u0078\u0074\u003a\u0020\u0025\u0064\u0020\u0063\u006f\u0064\u0065\u0073=\u0025\u002b\u0076\u0020\u0072\u0075\u006ee\u0073\u003d\u0025\u0071",len (_acg ),_acg ,len (_cfd ));_gcce :=_gedee .getFillColor ();
|
|
|
|
|
_ecgb :=_gedee .getStrokeColor ();for _bbf ,_adag :=range _cfd {_deg :=[]rune (_adag );if len (_deg )==1&&_deg [0]=='\x00'{continue ;};_dgfd :=_acg [_bbf ];_cfe :=_gedee ._faf .CTM .Mult (_gedee ._cdf ).Mult (_eff );_edf :=0.0;if len (_deg )==1&&_deg [0]==32{_edf =_dca ._dccc ;
|
|
|
|
|
};_baee ,_cag :=_fdac .GetCharMetrics (_dgfd );if !_cag {_gd .Log .Debug ("\u0045R\u0052\u004fR\u003a\u0020\u004e\u006f \u006d\u0065\u0074r\u0069\u0063\u0020\u0066\u006f\u0072\u0020\u0063\u006fde\u003d\u0025\u0064 \u0072\u003d0\u0078\u0025\u0030\u0034\u0078\u003d%\u002b\u0071 \u0025\u0073",_dgfd ,_deg ,_deg ,_fdac );
|
|
|
|
|
return _ae .Errorf ("\u006e\u006f\u0020\u0063\u0068\u0061\u0072\u0020\u006d\u0065\u0074\u0072\u0069\u0063\u0073:\u0020f\u006f\u006e\u0074\u003d\u0025\u0073\u0020\u0063\u006f\u0064\u0065\u003d\u0025\u0064",_fdac .String (),_dgfd );};_dfba :=_ff .Point {X :_baee .Wx *_bgg ,Y :_baee .Wy *_bgg };
|
|
|
|
|
_gbcg :=_ff .Point {X :(_dfba .X *_cbag +_edf )*_bfe };_cbdg :=_ff .Point {X :(_dfba .X *_cbag +_dca ._aba +_edf )*_bfe };if _aagd {_gd .Log .Info ("\u0074\u0066\u0073\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0063\u003d\u0025\u002e\u0032f\u0020t\u0077\u003d\u0025\u002e\u0032\u0066\u0020\u0074\u0068\u003d\u0025\u002e\u0032\u0066",_cbag ,_dca ._aba ,_dca ._dccc ,_bfe );
|
|
|
|
|
_gd .Log .Info ("\u0064x\u002c\u0064\u0079\u003d%\u002e\u0033\u0066\u0020\u00740\u003d%\u002e3\u0066\u0020\u0074\u003d\u0025\u002e\u0033f",_dfba ,_gbcg ,_cbdg );};_egg :=_gcac (_gbcg );_afd :=_gcac (_cbdg );_fgde :=_gedee ._faf .CTM .Mult (_gedee ._cdf ).Mult (_egg );
|
|
|
|
|
if _fdad {_gd .Log .Info ("e\u006e\u0064\u003a\u000a\tC\u0054M\u003d\u0025\u0073\u000a\u0009 \u0074\u006d\u003d\u0025\u0073\u000a"+"\u0009\u0020t\u0064\u003d\u0025s\u0020\u0078\u006c\u0061\u0074\u003d\u0025\u0073\u000a"+"\u0009t\u0064\u0030\u003d\u0025s\u000a\u0009\u0020\u0020\u2192 \u0025s\u0020x\u006c\u0061\u0074\u003d\u0025\u0073",_gedee ._faf .CTM ,_gedee ._cdf ,_afd ,_adba (_gedee ._faf .CTM .Mult (_gedee ._cdf ).Mult (_afd )),_egg ,_fgde ,_adba (_fgde ));
|
|
|
|
|
};_dcac ,_gbf :=_gedee .newTextMark (_daf .ExpandLigatures (_deg ),_cfe ,_adba (_fgde ),_f .Abs (_ecaca *_cfe .ScalingFactorX ()),_fdac ,_gedee ._efb ._aba ,_gcce ,_ecgb );if !_gbf {_gd .Log .Debug ("\u0054\u0065\u0078\u0074\u0020\u006d\u0061\u0072\u006b\u0020\u006f\u0075\u0074\u0073\u0069d\u0065 \u0070\u0061\u0067\u0065\u002e\u0020\u0053\u006b\u0069\u0070\u0070\u0069\u006e\u0067");
|
|
|
|
|
continue ;};if _fdac ==nil {_gd .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u002e");}else if _fdac .Encoder ()==nil {_gd .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020N\u006f\u0020\u0065\u006e\u0063\u006f\u0064\u0069\u006eg\u002e\u0020\u0066o\u006et\u003d\u0025\u0073",_fdac );
|
|
|
|
|
}else {if _dgc ,_eggb :=_fdac .Encoder ().CharcodeToRune (_dgfd );_eggb {_dcac ._gfg =string (_dgc );};};_gd .Log .Trace ("i\u003d\u0025\u0064\u0020\u0063\u006fd\u0065\u003d\u0025\u0064\u0020\u006d\u0061\u0072\u006b=\u0025\u0073\u0020t\u0072m\u003d\u0025\u0073",_bbf ,_dgfd ,_dcac ,_cfe );
|
|
|
|
|
_gedee ._ddeg =append (_gedee ._ddeg ,&_dcac );_gedee ._cdf .Concat (_afd );};return nil ;};func (_fg *imageExtractContext )extractXObjectImage (_dfa *_eb .PdfObjectName ,_baa _de .GraphicsState ,_gff *_bc .PdfPageResources )error {_eca ,_ :=_gff .GetXObjectByName (*_dfa );
|
|
|
|
|
if _eca ==nil {return nil ;};_cdcg ,_gcb :=_fg ._ed [_eca ];if !_gcb {_bae ,_gg :=_gff .GetXObjectImageByName (*_dfa );if _gg !=nil {return _gg ;};if _bae ==nil {return nil ;};_ceb ,_gg :=_bae .ToImage ();if _gg !=nil {return _gg ;};_cdcg =&cachedImage {_ef :_ceb ,_ga :_bae .ColorSpace };
|
|
|
|
|
_fg ._ed [_eca ]=_cdcg ;};_fec :=_cdcg ._ef ;_bac :=_cdcg ._ga ;_dfb ,_cf :=_bac .ImageToRGB (*_fec );if _cf !=nil {return _cf ;};_gd .Log .Debug ("@\u0044\u006f\u0020\u0043\u0054\u004d\u003a\u0020\u0025\u0073",_baa .CTM .String ());_dac :=ImageMark {Image :&_dfb ,Width :_baa .CTM .ScalingFactorX (),Height :_baa .CTM .ScalingFactorY (),Angle :_baa .CTM .Angle ()};
|
|
|
|
|
_dac .X ,_dac .Y =_baa .CTM .Translation ();_fg ._ee =append (_fg ._ee ,_dac );_fg ._be ++;return nil ;};func (_gaec *wordBag )arrangeText ()*textPara {_gaec .sort ();if _acec {_gaec .removeDuplicates ();};var _dacbf []*textLine ;for _ ,_cbbc :=range _gaec .depthIndexes (){for !_gaec .empty (_cbbc ){_agfg :=_gaec .firstReadingIndex (_cbbc );
|
|
|
|
|
_edafe :=_gaec .firstWord (_agfg );_agdg :=_aeac (_gaec ,_agfg );_daeg :=_edafe ._edega ;_bbaa :=_edafe ._gdce -_bbfa *_daeg ;_bcfa :=_edafe ._gdce +_bbfa *_daeg ;_ggag :=_bdbg *_daeg ;_cdfg :=_edg *_daeg ;_ececb :for {var _aefc *textWord ;_eegd :=0;for _ ,_efcga :=range _gaec .depthBand (_bbaa ,_bcfa ){_bgbb :=_gaec .highestWord (_efcga ,_bbaa ,_bcfa );
|
|
|
|
|
if _bgbb ==nil {continue ;};_gaba :=_eccg (_bgbb ,_agdg ._dee [len (_agdg ._dee )-1]);if _gaba < -_cdfg {break _ececb ;};if _gaba > _ggag {continue ;};if _aefc !=nil &&_gfdd (_bgbb ,_aefc )>=0{continue ;};_aefc =_bgbb ;_eegd =_efcga ;};if _aefc ==nil {break ;
|
|
|
|
|
};_agdg .pullWord (_gaec ,_aefc ,_eegd );};_agdg .markWordBoundaries ();_dacbf =append (_dacbf ,_agdg );};};if len (_dacbf )==0{return nil ;};_b .Slice (_dacbf ,func (_feea ,_cfba int )bool {return _dfac (_dacbf [_feea ],_dacbf [_cfba ])< 0});_fafa :=_gdfbd (_gaec .PdfRectangle ,_dacbf );
|
|
|
|
|
if _gagb {_gd .Log .Info ("\u0061\u0072\u0072an\u0067\u0065\u0054\u0065\u0078\u0074\u0020\u0021\u0021\u0021\u0020\u0070\u0061\u0072\u0061\u003d\u0025\u0073",_fafa .String ());if _fcag {for _acdg ,_abb :=range _fafa ._bcca {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_acdg ,_abb .String ());
|
|
|
|
|
if _face {for _cecfb ,_gfgf :=range _abb ._dee {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_cecfb ,_gfgf .String ());for _gebf ,_ffdd :=range _gfgf ._eeefd {_ae .Printf ("\u00251\u0032\u0064\u003a\u0020\u0025\u0073\n",_gebf ,_ffdd .String ());
|
|
|
|
|
};};};};};};return _fafa ;};func _gfdd (_ffdf ,_agge bounded )float64 {return _ffdf .bbox ().Llx -_agge .bbox ().Llx };var (_fa =_dd .New ("\u0074\u0079p\u0065\u0020\u0063h\u0065\u0063\u006b\u0020\u0065\u0072\u0072\u006f\u0072");_cd =_dd .New ("\u0072\u0061\u006e\u0067\u0065\u0020\u0063\u0068\u0065\u0063\u006b\u0020e\u0072\u0072\u006f\u0072");
|
|
|
|
|
);func _eaaca (_aece map[float64 ]map[float64 ]gridTile )[]float64 {_fedf :=make ([]float64 ,0,len (_aece ));for _cdacb :=range _aece {_fedf =append (_fedf ,_cdacb );};_b .Float64s (_fedf );_fbfae :=len (_fedf );for _bgebg :=0;_bgebg < _fbfae /2;_bgebg ++{_fedf [_bgebg ],_fedf [_fbfae -1-_bgebg ]=_fedf [_fbfae -1-_bgebg ],_fedf [_bgebg ];
|
|
|
|
|
};return _fedf ;};func _ccac (_befde ,_dgcc _ff .Point )bool {_bedag :=_f .Abs (_befde .X -_dgcc .X );_edfe :=_f .Abs (_befde .Y -_dgcc .Y );return _dbfcb (_bedag ,_edfe );};func (_gafc *stateStack )pop ()*textState {if _gafc .empty (){return nil ;};_fcbd :=*(*_gafc )[len (*_gafc )-1];
|
|
|
|
|
*_gafc =(*_gafc )[:len (*_gafc )-1];return &_fcbd ;};func _gfbd (_afge []*wordBag )[]*wordBag {if len (_afge )<=1{return _afge ;};if _gagb {_gd .Log .Info ("\u006d\u0065\u0072\u0067\u0065\u0057\u006f\u0072\u0064B\u0061\u0067\u0073\u003a");};_b .Slice (_afge ,func (_ffbg ,_dgbc int )bool {_bgb ,_gedg :=_afge [_ffbg ],_afge [_dgbc ];
|
|
|
|
|
_ebf :=_bgb .Width ()*_bgb .Height ();_ebbf :=_gedg .Width ()*_gedg .Height ();if _ebf !=_ebbf {return _ebf > _ebbf ;};if _bgb .Height ()!=_gedg .Height (){return _bgb .Height ()> _gedg .Height ();};return _ffbg < _dgbc ;});var _daa []*wordBag ;_bfbf :=make (intSet );
|
|
|
|
|
for _bggc :=0;_bggc < len (_afge );_bggc ++{if _bfbf .has (_bggc ){continue ;};_agd :=_afge [_bggc ];for _dga :=_bggc +1;_dga < len (_afge );_dga ++{if _bfbf .has (_bggc ){continue ;};_dfff :=_afge [_dga ];_fde :=_agd .PdfRectangle ;_fde .Llx -=_agd ._cfec ;
|
|
|
|
|
if _cddc (_fde ,_dfff .PdfRectangle ){_agd .absorb (_dfff );_bfbf .add (_dga );};};_daa =append (_daa ,_agd );};if len (_afge )!=len (_daa )+len (_bfbf ){_gd .Log .Error ("\u006d\u0065\u0072ge\u0057\u006f\u0072\u0064\u0042\u0061\u0067\u0073\u003a \u0025d\u2192%\u0064 \u0061\u0062\u0073\u006f\u0072\u0062\u0065\u0064\u003d\u0025\u0064",len (_afge ),len (_daa ),len (_bfbf ));
|
|
|
|
|
};return _daa ;};type wordBag struct{_bc .PdfRectangle ;_cfec float64 ;_fedd ,_gec rulingList ;_agadg float64 ;_aaaf map[int ][]*textWord ;};func (_cdcga *textObject )setWordSpacing (_gca float64 ){if _cdcga ==nil {return ;};_cdcga ._efb ._dccc =_gca ;
|
|
|
|
|
};func (_gdcc *shapesState )fill (_ddcdc *[]pathSection ){_aaf :=pathSection {_bbd :_gdcc ._cbdd ,Color :_gdcc ._eeg .getFillColor ()};*_ddcdc =append (*_ddcdc ,_aaf );if _acea {_eafd :=_aaf .bbox ();_ae .Printf ("\u0020 \u0020\u0020\u0046\u0049\u004c\u004c\u003a %\u0032\u0064\u0020\u0066\u0069\u006c\u006c\u0073\u0020\u0028\u0025\u0064\u0020\u006ee\u0077\u0029 \u0073\u0073\u003d%\u0073\u0020\u0063\u006f\u006c\u006f\u0072\u003d\u0025\u0033\u0076\u0020\u0025\u0036\u002e\u0032f\u003d\u00256.\u0032\u0066\u0078%\u0036\u002e\u0032\u0066\u000a",len (*_ddcdc ),len (_aaf ._bbd ),_gdcc ,_aaf .Color ,_eafd ,_eafd .Width (),_eafd .Height ());
|
|
|
|
|
if _dfe {for _gffe ,_cac :=range _aaf ._bbd {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_gffe ,_cac );if _gffe ==10{break ;};};};};};func (_ggba *textObject )getFillColor ()_fd .Color {return _cggac (_ggba ._faf .ColorspaceNonStroking ,_ggba ._faf .ColorNonStroking );
|
|
|
|
|
};func (_fcaa *textTable )log (_gdfbg string ){if !_gcdc {return ;};_gd .Log .Info ("~\u007e\u007e\u0020\u0025\u0073\u003a \u0025\u0064\u0020\u0078\u0020\u0025d\u0020\u0067\u0072\u0069\u0064\u003d\u0025t\u000a\u0020\u0020\u0020\u0020\u0020\u0020\u0025\u0036\u002e2\u0066",_gdfbg ,_fcaa ._cacec ,_fcaa ._bfbba ,_fcaa ._dcbbg ,_fcaa .PdfRectangle );
|
|
|
|
|
for _gdfdf :=0;_gdfdf < _fcaa ._bfbba ;_gdfdf ++{for _dgff :=0;_dgff < _fcaa ._cacec ;_dgff ++{_ebcbc :=_fcaa .get (_dgff ,_gdfdf );if _ebcbc ==nil {continue ;};_ae .Printf ("%\u0034\u0064\u0020\u00252d\u003a \u0025\u0036\u002e\u0032\u0066 \u0025\u0071\u0020\u0025\u0064\u000a",_dgff ,_gdfdf ,_ebcbc .PdfRectangle ,_eaed (_ebcbc .text (),50),_da .RuneCountInString (_ebcbc .text ()));
|
|
|
|
|
};};};func (_gacd paraList )extractTables (_faeff []gridTiling )paraList {if _gcdc {_gd .Log .Debug ("\u0065\u0078\u0074r\u0061\u0063\u0074\u0054\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_gacd ));
|
|
|
|
|
};if len (_gacd )< _ceaeb {return _gacd ;};_fffb :=_gacd .findTables (_faeff );if _gcdc {_gd .Log .Info ("c\u006f\u006d\u0062\u0069\u006e\u0065d\u0020\u0074\u0061\u0062\u006c\u0065s\u0020\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_fffb ));
|
|
|
|
|
for _dfga ,_aaeb :=range _fffb {_aaeb .log (_ae .Sprintf ("c\u006f\u006d\u0062\u0069\u006e\u0065\u0064\u0020\u0025\u0064",_dfga ));};};return _gacd .applyTables (_fffb );};func (_dfab rectRuling )checkWidth (_gfag ,_adbaf float64 )(float64 ,bool ){_bege :=_adbaf -_gfag ;
|
|
|
|
|
_agga :=_bege <=_gdff ;return _bege ,_agga ;};func _cggac (_fbdgf _bc .PdfColorspace ,_eedef _bc .PdfColor )_fd .Color {if _fbdgf ==nil ||_eedef ==nil {return _fd .Black ;};_eceaed ,_efbbc :=_fbdgf .ColorToRGB (_eedef );if _efbbc !=nil {_gd .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006fu\u006c\u0064\u0020no\u0074\u0020\u0063\u006f\u006e\u0076e\u0072\u0074\u0020\u0063\u006f\u006c\u006f\u0072\u0020\u0025\u0076\u0020\u0028\u0025\u0076)\u0020\u0074\u006f\u0020\u0052\u0047\u0042\u003a \u0025\u0073",_eedef ,_fbdgf ,_efbbc );
|
|
|
|
|
return _fd .Black ;};_dbgff ,_gbegb :=_eceaed .(*_bc .PdfColorDeviceRGB );if !_gbegb {_gd .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0063\u006f\u006e\u0076\u0065\u0072\u0074\u0065\u0064 \u0063\u006f\u006c\u006f\u0072\u0020\u0069\u0073\u0020\u006e\u006f\u0074\u0020i\u006e\u0020\u0074\u0068\u0065\u0020\u0052\u0047\u0042\u0020\u0063\u006flo\u0072\u0073\u0070\u0061\u0063\u0065\u003a\u0020\u0025\u0076",_eceaed );
|
|
|
|
|
return _fd .Black ;};return _fd .NRGBA {R :uint8 (_dbgff .R ()*255),G :uint8 (_dbgff .G ()*255),B :uint8 (_dbgff .B ()*255),A :uint8 (255)};};func (_cbea rulingList )augmentGrid ()(rulingList ,rulingList ){_dggc ,_gbdc :=_cbea .vertsHorzs ();if len (_dggc )==0||len (_gbdc )==0{return _dggc ,_gbdc ;
|
|
|
|
|
};_bfbc ,_cbaa :=_dggc ,_gbdc ;_debf :=_dggc .bbox ();_egec :=_gbdc .bbox ();if _acea {_gd .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0056\u003d\u0025\u0036\u002e\u0032\u0066",_debf );_gd .Log .Info ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0047\u0072\u0069\u0064\u003a\u0020b\u0062\u006f\u0078\u0048\u003d\u0025\u0036\u002e\u0032\u0066",_egec );
|
|
|
|
|
};var _dfeg ,_dbaf ,_afbe ,_cbcb *ruling ;if _egec .Llx < _debf .Llx -_cdbcd {_dfeg =&ruling {_fafbe :_ggfd ,_beb :_dddb ,_aadb :_egec .Llx ,_eacb :_debf .Lly ,_cbf :_debf .Ury };_dggc =append (rulingList {_dfeg },_dggc ...);};if _egec .Urx > _debf .Urx +_cdbcd {_dbaf =&ruling {_fafbe :_ggfd ,_beb :_dddb ,_aadb :_egec .Urx ,_eacb :_debf .Lly ,_cbf :_debf .Ury };
|
|
|
|
|
_dggc =append (_dggc ,_dbaf );};if _debf .Lly < _egec .Lly -_cdbcd {_afbe =&ruling {_fafbe :_ggfd ,_beb :_dffd ,_aadb :_debf .Lly ,_eacb :_egec .Llx ,_cbf :_egec .Urx };_gbdc =append (rulingList {_afbe },_gbdc ...);};if _debf .Ury > _egec .Ury +_cdbcd {_cbcb =&ruling {_fafbe :_ggfd ,_beb :_dffd ,_aadb :_debf .Ury ,_eacb :_egec .Llx ,_cbf :_egec .Urx };
|
|
|
|
|
_gbdc =append (_gbdc ,_cbcb );};if len (_dggc )+len (_gbdc )==len (_cbea ){return _bfbc ,_cbaa ;};_fded :=append (_dggc ,_gbdc ...);_cbea .log ("u\u006e\u0061\u0075\u0067\u006d\u0065\u006e\u0074\u0065\u0064");_fded .log ("\u0061u\u0067\u006d\u0065\u006e\u0074\u0065d");
|
|
|
|
|
return _dggc ,_gbdc ;};
|
|
|
|
|
|
|
|
|
|
// PageImages represents extracted images on a PDF page with spatial information:
|
|
|
|
|
// display position and size.
|
|
|
|
|
type PageImages struct{Images []ImageMark ;};
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `ss`.
|
|
|
|
|
func (_bgad *shapesState )String ()string {return _ae .Sprintf ("\u007b\u0025\u0064\u0020su\u0062\u0070\u0061\u0074\u0068\u0073\u0020\u0066\u0072\u0065\u0073\u0068\u003d\u0025t\u007d",len (_bgad ._cbdd ),_bgad ._bbfc );};func _bafa (_bdge ,_gef _bc .PdfRectangle )bool {return _bcgdg (_bdge ,_gef )&&_gece (_bdge ,_gef )};
|
|
|
|
|
func (_efeg *textTable )getRight ()paraList {_gfdb :=make (paraList ,_efeg ._bfbba );for _bgcg :=0;_bgcg < _efeg ._bfbba ;_bgcg ++{_ccacc :=_efeg .get (_efeg ._cacec -1,_bgcg )._fefc ;if _ccacc ==nil ||_ccacc ._abgd {return nil ;};_gfdb [_bgcg ]=_ccacc ;
|
|
|
|
|
};for _dggg :=0;_dggg < _efeg ._bfbba -1;_dggg ++{if _gfdb [_dggg ]._eebd !=_gfdb [_dggg +1]{return nil ;};};return _gfdb ;};
|
|
|
|
|
|
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
|
|
|
func NewFromContents (contents string ,resources *_bc .PdfPageResources )(*Extractor ,error ){const _cb ="\u0065x\u0074\u0072\u0061\u0063t\u006f\u0072\u002e\u004e\u0065w\u0046r\u006fm\u0043\u006f\u006e\u0074\u0065\u006e\u0074s";_gdf :=&Extractor {_ec :contents ,_ffe :resources ,_ca :map[string ]fontEntry {},_dg :map[string ]textResult {}};
|
|
|
|
|
_fe .TrackUse (_cb );return _gdf ,nil ;};type textTable struct{_bc .PdfRectangle ;_cacec ,_bfbba int ;_dcbbg bool ;_egfdd map[uint64 ]*textPara ;_ffgge map[uint64 ]compositeCell ;};func (_fca *shapesState )closePath (){if _fca ._bbfc {_fca ._cbdd =append (_fca ._cbdd ,_abfba (_fca ._egd ));
|
|
|
|
|
_fca ._bbfc =false ;}else if len (_fca ._cbdd )==0{if _egfa {_gd .Log .Debug ("\u0063\u006c\u006f\u0073eP\u0061\u0074\u0068\u0020\u0077\u0069\u0074\u0068\u0020\u006e\u006f\u0020\u0070\u0061t\u0068");};_fca ._bbfc =false ;return ;};_fca ._cbdd [len (_fca ._cbdd )-1].close ();
|
|
|
|
|
if _egfa {_gd .Log .Info ("\u0063\u006c\u006f\u0073\u0065\u0050\u0061\u0074\u0068\u003a\u0020\u0025\u0073",_fca );};};func (_faec *textWord )bbox ()_bc .PdfRectangle {return _faec .PdfRectangle };func (_abdb *subpath )makeRectRuling (_bgae _fd .Color )(*ruling ,bool ){if _dcgde {_gd .Log .Info ("\u006d\u0061\u006beR\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0070\u0061\u0074\u0068\u003d\u0025\u0076",_abdb );
|
|
|
|
|
};_cdggf :=_abdb ._cfac [:4];_adde :=make (map[int ]rulingKind ,len (_cdggf ));for _bbcc ,_adgcd :=range _cdggf {_cege :=_abdb ._cfac [(_bbcc +1)%4];_adde [_bbcc ]=_cdega (_adgcd ,_cege );if _dcgde {_ae .Printf ("\u0025\u0034\u0064: \u0025\u0073\u0020\u003d\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u002d\u0020\u0025\u0036\u002e\u0032\u0066",_bbcc ,_adde [_bbcc ],_adgcd ,_cege );
|
|
|
|
|
};};if _dcgde {_ae .Printf ("\u0020\u0020\u0020\u006b\u0069\u006e\u0064\u0073\u003d\u0025\u002b\u0076\u000a",_adde );};var _gfeff ,_fgce []int ;for _abga ,_bfga :=range _adde {switch _bfga {case _dffd :_fgce =append (_fgce ,_abga );case _dddb :_gfeff =append (_gfeff ,_abga );
|
|
|
|
|
};};if _dcgde {_ae .Printf ("\u0020\u0020 \u0068\u006f\u0072z\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_fgce ),_fgce );_ae .Printf ("\u0020\u0020 \u0076\u0065\u0072t\u0073\u003d\u0025\u0064\u0020\u0025\u002b\u0076\u000a",len (_gfeff ),_gfeff );
|
|
|
|
|
};_acgb :=(len (_fgce )==2&&len (_gfeff )==2)||(len (_fgce )==2&&len (_gfeff )==0&&_cgbf (_cdggf [_fgce [0]],_cdggf [_fgce [1]]))||(len (_gfeff )==2&&len (_fgce )==0&&_ccac (_cdggf [_gfeff [0]],_cdggf [_gfeff [1]]));if _dcgde {_ae .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_fgce ),len (_gfeff ),_acgb );
|
|
|
|
|
};if !_acgb {if _dcgde {_gd .Log .Error ("\u0021!\u006d\u0061\u006b\u0065R\u0065\u0063\u0074\u0052\u0075l\u0069n\u0067:\u0020\u0070\u0061\u0074\u0068\u003d\u0025v",_abdb );_ae .Printf (" \u0020\u0020\u0068\u006f\u0072\u007as\u003d\u0025\u0064\u0020\u0076\u0065\u0072\u0074\u0073=\u0025\u0064\u0020o\u006b=\u0025\u0074\u000a",len (_fgce ),len (_gfeff ),_acgb );
|
|
|
|
|
};return &ruling {},false ;};if len (_gfeff )==0{for _gafca ,_baecg :=range _adde {if _baecg !=_dffd {_gfeff =append (_gfeff ,_gafca );};};};if len (_fgce )==0{for _cbgc ,_ccgcc :=range _adde {if _ccgcc !=_dddb {_fgce =append (_fgce ,_cbgc );};};};if _dcgde {_gd .Log .Info ("\u006da\u006b\u0065R\u0065\u0063\u0074\u0052u\u006c\u0069\u006eg\u003a\u0020\u0068\u006f\u0072\u007a\u0073\u003d\u0025d \u0076\u0065\u0072t\u0073\u003d%\u0064\u0020\u0070\u006f\u0069\u006et\u0073\u003d%\u0064\u000a"+"\u0009\u0020\u0068o\u0072\u007a\u0073\u003d\u0025\u002b\u0076\u000a"+"\u0009\u0020\u0076e\u0072\u0074\u0073\u003d\u0025\u002b\u0076\u000a"+"\t\u0070\u006f\u0069\u006e\u0074\u0073\u003d\u0025\u002b\u0076",len (_fgce ),len (_gfeff ),len (_cdggf ),_fgce ,_gfeff ,_cdggf );
|
|
|
|
|
};var _dceef ,_agcg ,_baeg ,_gfdg _ff .Point ;if _cdggf [_fgce [0]].Y > _cdggf [_fgce [1]].Y {_baeg ,_gfdg =_cdggf [_fgce [0]],_cdggf [_fgce [1]];}else {_baeg ,_gfdg =_cdggf [_fgce [1]],_cdggf [_fgce [0]];};if _cdggf [_gfeff [0]].X > _cdggf [_gfeff [1]].X {_dceef ,_agcg =_cdggf [_gfeff [0]],_cdggf [_gfeff [1]];
|
|
|
|
|
}else {_dceef ,_agcg =_cdggf [_gfeff [1]],_cdggf [_gfeff [0]];};_gebg :=_bc .PdfRectangle {Llx :_dceef .X ,Urx :_agcg .X ,Lly :_gfdg .Y ,Ury :_baeg .Y };if _gebg .Llx > _gebg .Urx {_gebg .Llx ,_gebg .Urx =_gebg .Urx ,_gebg .Llx ;};if _gebg .Lly > _gebg .Ury {_gebg .Lly ,_gebg .Ury =_gebg .Ury ,_gebg .Lly ;
|
|
|
|
|
};_gfed :=rectRuling {PdfRectangle :_gebg ,_gggf :_cgea (_gebg ),Color :_bgae };if _gfed ._gggf ==_dgcb {if _dcgde {_gd .Log .Error ("\u006da\u006b\u0065\u0052\u0065\u0063\u0074\u0052\u0075\u006c\u0069\u006eg\u003a\u0020\u006b\u0069\u006e\u0064\u003d\u006e\u0069\u006c");
|
|
|
|
|
};return nil ,false ;};_cfea ,_ccdd :=_gfed .asRuling ();if !_ccdd {if _dcgde {_gd .Log .Error ("\u006da\u006b\u0065\u0052\u0065c\u0074\u0052\u0075\u006c\u0069n\u0067:\u0020!\u0069\u0073\u0052\u0075\u006c\u0069\u006eg");};return nil ,false ;};if _acea {_ae .Printf ("\u0020\u0020\u0020\u0072\u003d\u0025\u0073\u000a",_cfea .String ());
|
|
|
|
|
};return _cfea ,true ;};func _adba (_egga _ff .Matrix )_ff .Point {_eef ,_bba :=_egga .Translation ();return _ff .Point {X :_eef ,Y :_bba };};func _gaegc (_ebef []*textWord ,_dbfg float64 ,_becc ,_fegd rulingList )*wordBag {_eggc :=_acbgd (_ebef [0],_dbfg ,_becc ,_fegd );
|
|
|
|
|
for _ ,_dcaa :=range _ebef [1:]{_fbge :=_faaa (_dcaa ._gdce );_eggc ._aaaf [_fbge ]=append (_eggc ._aaaf [_fbge ],_dcaa );_eggc .PdfRectangle =_deb (_eggc .PdfRectangle ,_dcaa .PdfRectangle );};_eggc .sort ();return _eggc ;};func (_fea *textObject )setFont (_caf string ,_fbc float64 )error {if _fea ==nil {return nil ;
|
|
|
|
|
};_fea ._efb ._abfe =_fbc ;_eag ,_fgc :=_fea .getFont (_caf );if _fgc !=nil {return _fgc ;};_fea ._efb ._afc =_eag ;return nil ;};
|
|
|
|
|
|
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
|
|
|
|
func (_ecc PageText )ToText ()string {return _ecc .Text ()};func (_fbdfg paraList )findTableGrid (_ceabc gridTiling )(*textTable ,map[*textPara ]struct{}){_fggd :=len (_ceabc ._defe );_gbbc :=len (_ceabc ._afbc );_beae :=textTable {_dcbbg :true ,_cacec :_fggd ,_bfbba :_gbbc ,_egfdd :make (map[uint64 ]*textPara ,_fggd *_gbbc ),_ffgge :make (map[uint64 ]compositeCell ,_fggd *_gbbc )};
|
|
|
|
|
_beeab :=make (map[*textPara ]struct{});_degb :=int ((1.0-_gcfa )*float64 (_fggd *_gbbc ));_cegbd :=0;if _gcace {_gd .Log .Info ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0047\u0072\u0069\u0064\u003a\u0020\u0025\u0064\u0020\u0078\u0020%\u0064",_fggd ,_gbbc );
|
|
|
|
|
};for _fged ,_cccebb :=range _ceabc ._afbc {_gebga ,_gccab :=_ceabc ._gdgc [_cccebb ];if !_gccab {continue ;};for _geebf ,_babdb :=range _ceabc ._defe {_cbega ,_dbcc :=_gebga [_babdb ];if !_dbcc {continue ;};_ceee :=_fbdfg .inTile (_cbega );if len (_ceee )==0{_cegbd ++;
|
|
|
|
|
if _cegbd > _degb {if _gcace {_gd .Log .Info ("\u0021\u006e\u0075m\u0045\u006d\u0070\u0074\u0079\u003d\u0025\u0064",_cegbd );};return nil ,nil ;};}else {_beae .putComposite (_geebf ,_fged ,_ceee ,_cbega .PdfRectangle );for _ ,_geeae :=range _ceee {_beeab [_geeae ]=struct{}{};
|
|
|
|
|
};};};};_agcgc :=0;for _eddf :=0;_eddf < _fggd ;_eddf ++{_eeac :=_beae .get (_eddf ,0);if _eeac ==nil ||!_eeac ._fbed {_agcgc ++;};};if _agcgc ==0{if _gcace {_gd .Log .Info ("\u0021\u006e\u0075m\u0048\u0065\u0061\u0064\u0065\u0072\u003d\u0030");};return nil ,nil ;
|
|
|
|
|
};_cdedc :=_beae .reduceTiling (_ceabc ,_cedd );_cdedc =_cdedc .subdivide ();return _cdedc ,_beeab ;};func _bcfgg (_ccaa map[int ][]float64 ){if len (_ccaa )<=1{return ;};_ccbc :=_cdag (_ccaa );if _gcdc {_gd .Log .Info ("\u0066i\u0078C\u0065\u006c\u006c\u0073\u003a \u006b\u0065y\u0073\u003d\u0025\u002b\u0076",_ccbc );
|
|
|
|
|
};var _gfcdf ,_gefc int ;for _gfcdf ,_gefc =range _ccbc {if _ccaa [_gefc ]!=nil {break ;};};for _gefce ,_badec :=range _ccbc [_gfcdf :]{_bgfc :=_ccaa [_badec ];if _bgfc ==nil {continue ;};if _gcdc {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u006b\u0030\u003d\u0025\u0064\u0020\u006b1\u003d\u0025\u0064\u000a",_gfcdf +_gefce ,_gefc ,_badec );
|
|
|
|
|
};_eacbc :=_ccaa [_badec ];if _eacbc [len (_eacbc )-1]> _bgfc [0]{_eacbc [len (_eacbc )-1]=_bgfc [0];_ccaa [_gefc ]=_eacbc ;};_gefc =_badec ;};};type bounded interface{bbox ()_bc .PdfRectangle };func (_ffcdb *ruling )intersects (_ecaefe *ruling )bool {_fdabg :=(_ffcdb ._beb ==_dddb &&_ecaefe ._beb ==_dffd )||(_ecaefe ._beb ==_dddb &&_ffcdb ._beb ==_dffd );
|
|
|
|
|
_dacdg :=func (_bddb ,_baaac *ruling )bool {return _bddb ._eacb -_cdbcd <=_baaac ._aadb &&_baaac ._aadb <=_bddb ._cbf +_cdbcd ;};_gfga :=_dacdg (_ffcdb ,_ecaefe );_acfgd :=_dacdg (_ecaefe ,_ffcdb );if _acea {_ae .Printf ("\u0020\u0020\u0020\u0020\u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003a\u0020\u0020\u006fr\u0074\u0068\u006f\u0067\u006f\u006e\u0061l\u003d\u0025\u0074\u0020\u006f\u0031\u003d\u0025\u0074\u0020\u006f2\u003d\u0025\u0074\u0020\u2192\u0020\u0025\u0074\u000a"+"\u0020\u0020\u0020 \u0020\u0020\u0020\u0076\u003d\u0025\u0073\u000a"+" \u0020\u0020\u0020\u0020\u0020\u0077\u003d\u0025\u0073\u000a",_fdabg ,_gfga ,_acfgd ,_fdabg &&_gfga &&_acfgd ,_ffcdb ,_ecaefe );
|
|
|
|
|
};return _fdabg &&_gfga &&_acfgd ;};func (_deae rulingList )intersections ()map[int ]intSet {var _badc ,_cfce []int ;for _dcfb ,_cebag :=range _deae {switch _cebag ._beb {case _dddb :_badc =append (_badc ,_dcfb );case _dffd :_cfce =append (_cfce ,_dcfb );
|
|
|
|
|
};};if len (_badc )< _gfcc +1||len (_cfce )< _abcc +1{return nil ;};if len (_badc )+len (_cfce )> _gedf {_gd .Log .Debug ("\u0069\u006e\u0074\u0065\u0072\u0073e\u0063\u0074\u0069\u006f\u006e\u0073\u003a\u0020\u0054\u004f\u004f\u0020\u004d\u0041\u004e\u0059\u0020\u0072\u0075\u006ci\u006e\u0067\u0073\u0020\u0076\u0065\u0063\u0073\u003d\u0025\u0064\u0020\u003d\u0020%\u0064 \u0078\u0020\u0025\u0064",len (_deae ),len (_badc ),len (_cfce ));
|
|
|
|
|
return nil ;};_fdgcc :=make (map[int ]intSet ,len (_badc )+len (_cfce ));for _ ,_ffdea :=range _badc {for _ ,_aaae :=range _cfce {if _deae [_ffdea ].intersects (_deae [_aaae ]){if _ ,_abbd :=_fdgcc [_ffdea ];!_abbd {_fdgcc [_ffdea ]=make (intSet );};if _ ,_afca :=_fdgcc [_aaae ];
|
|
|
|
|
!_afca {_fdgcc [_aaae ]=make (intSet );};_fdgcc [_ffdea ].add (_aaae );_fdgcc [_aaae ].add (_ffdea );};};};return _fdgcc ;};func _cdag (_ffce map[int ][]float64 )[]int {_geeaa :=make ([]int ,len (_ffce ));_gffgf :=0;for _cgcdbd :=range _ffce {_geeaa [_gffgf ]=_cgcdbd ;
|
|
|
|
|
_gffgf ++;};_b .Ints (_geeaa );return _geeaa ;};func (_ccgcg *textPara )text ()string {_dgba :=new (_ce .Buffer );_ccgcg .writeText (_dgba );return _dgba .String ();};func (_eabc gridTiling )complete ()bool {for _ ,_eeeg :=range _eabc ._gdgc {for _ ,_cfeag :=range _eeeg {if !_cfeag .complete (){return false ;
|
|
|
|
|
};};};return true ;};func _deb (_ebce ,_egfd _bc .PdfRectangle )_bc .PdfRectangle {return _bc .PdfRectangle {Llx :_f .Min (_ebce .Llx ,_egfd .Llx ),Lly :_f .Min (_ebce .Lly ,_egfd .Lly ),Urx :_f .Max (_ebce .Urx ,_egfd .Urx ),Ury :_f .Max (_ebce .Ury ,_egfd .Ury )};
|
|
|
|
|
};func (_cbbfe *textLine )bbox ()_bc .PdfRectangle {return _cbbfe .PdfRectangle };func (_aaaa paraList )xNeighbours (_dbgb float64 )map[*textPara ][]int {_cdead :=make ([]event ,2*len (_aaaa ));if _dbgb ==0{for _egcc ,_ebdde :=range _aaaa {_cdead [2*_egcc ]=event {_ebdde .Llx ,true ,_egcc };
|
|
|
|
|
_cdead [2*_egcc +1]=event {_ebdde .Urx ,false ,_egcc };};}else {for _gacag ,_aaff :=range _aaaa {_cdead [2*_gacag ]=event {_aaff .Llx -_dbgb *_aaff .fontsize (),true ,_gacag };_cdead [2*_gacag +1]=event {_aaff .Urx +_dbgb *_aaff .fontsize (),false ,_gacag };
|
|
|
|
|
};};return _aaaa .eventNeighbours (_cdead );};func (_gbbee rulingList )log (_febea string ){if !_acea {return ;};_gd .Log .Info ("\u0023\u0023\u0023\u0020\u0025\u0031\u0030\u0073\u003a\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_febea ,_gbbee .String ());
|
|
|
|
|
for _faada ,_acdgf :=range _gbbee {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_faada ,_acdgf .String ());};};func _fafda (_gaag map[float64 ]map[float64 ]gridTile )[]float64 {_cabd :=make ([]float64 ,0,len (_gaag ));_bedf :=make (map[float64 ]struct{},len (_gaag ));
|
|
|
|
|
for _ ,_egda :=range _gaag {for _dcfg :=range _egda {if _ ,_bfcc :=_bedf [_dcfg ];_bfcc {continue ;};_cabd =append (_cabd ,_dcfg );_bedf [_dcfg ]=struct{}{};};};_b .Float64s (_cabd );return _cabd ;};const _age =10;type rulingList []*ruling ;func (_cegd *textPara )fontsize ()float64 {return _cegd ._bcca [0]._cgdd };
|
|
|
|
|
func (_afaeb *ruling )encloses (_cfeea ,_ffcg float64 )bool {return _afaeb ._eacb -_cdbcd <=_cfeea &&_ffcg <=_afaeb ._cbf +_cdbcd ;};func (_gceb rulingList )bbox ()_bc .PdfRectangle {var _agdgb _bc .PdfRectangle ;if len (_gceb )==0{_gd .Log .Error ("r\u0075\u006c\u0069\u006e\u0067\u004ci\u0073\u0074\u002e\u0062\u0062\u006f\u0078\u003a\u0020n\u006f\u0020\u0072u\u006ci\u006e\u0067\u0073");
|
|
|
|
|
return _bc .PdfRectangle {};};if _gceb [0]._beb ==_dffd {_agdgb .Llx ,_agdgb .Urx =_gceb .secMinMax ();_agdgb .Lly ,_agdgb .Ury =_gceb .primMinMax ();}else {_agdgb .Llx ,_agdgb .Urx =_gceb .primMinMax ();_agdgb .Lly ,_agdgb .Ury =_gceb .secMinMax ();};
|
|
|
|
|
return _agdgb ;};func (_gegg rulingList )toGrids ()[]rulingList {if _acea {_gd .Log .Info ("t\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0025\u0073",_gegg );};_dfbbd :=_gegg .intersections ();if _acea {_gd .Log .Info ("\u0074\u006f\u0047r\u0069\u0064\u0073\u003a \u0076\u0065\u0063\u0073\u003d\u0025\u0064 \u0069\u006e\u0074\u0065\u0072\u0073\u0065\u0063\u0074\u0073\u003d\u0025\u0064\u0020",len (_gegg ),len (_dfbbd ));
|
|
|
|
|
for _ ,_adee :=range _gecgd (_dfbbd ){_ae .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_adee ,_dfbbd [_adee ]);};};_geag :=make (map[int ]intSet ,len (_gegg ));for _dfbgb :=range _gegg {_egfg :=_gegg .connections (_dfbbd ,_dfbgb );if len (_egfg )> 0{_geag [_dfbgb ]=_egfg ;
|
|
|
|
|
};};if _acea {_gd .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0063\u006fn\u006e\u0065\u0063\u0074s=\u0025\u0064",len (_geag ));for _ ,_gfaf :=range _gecgd (_geag ){_ae .Printf ("\u00254\u0064\u003a\u0020\u0025\u002b\u0076\n",_gfaf ,_geag [_gfaf ]);
|
|
|
|
|
};};_fbdg :=_cadc (len (_gegg ),func (_dbgg ,_bcbe int )bool {_bebd ,_fafd :=len (_geag [_dbgg ]),len (_geag [_bcbe ]);if _bebd !=_fafd {return _bebd > _fafd ;};return _gegg .comp (_dbgg ,_bcbe );});if _acea {_gd .Log .Info ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u006f\u0072d\u0065\u0072\u0069\u006eg=\u0025\u0076",_fbdg );
|
|
|
|
|
};_efdd :=[][]int {{_fbdg [0]}};_dgcdb :for _ ,_affd :=range _fbdg [1:]{for _cbbfc ,_gcdg :=range _efdd {for _ ,_cacg :=range _gcdg {if _geag [_cacg ].has (_affd ){_efdd [_cbbfc ]=append (_gcdg ,_affd );continue _dgcdb ;};};};_efdd =append (_efdd ,[]int {_affd });
|
|
|
|
|
};if _acea {_gd .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020i\u0067\u0072i\u0064\u0073\u003d\u0025\u0076",_efdd );};_b .SliceStable (_efdd ,func (_cdad ,_bfd int )bool {return len (_efdd [_cdad ])> len (_efdd [_bfd ])});for _ ,_afcb :=range _efdd {_b .Slice (_afcb ,func (_cfgd ,_abcgb int )bool {return _gegg .comp (_afcb [_cfgd ],_afcb [_abcgb ])});
|
|
|
|
|
};_agdgg :=make ([]rulingList ,len (_efdd ));for _bdbb ,_gfbc :=range _efdd {_gagf :=make (rulingList ,len (_gfbc ));for _dgab ,_fegb :=range _gfbc {_gagf [_dgab ]=_gegg [_fegb ];};_agdgg [_bdbb ]=_gagf ;};if _acea {_gd .Log .Info ("\u0074o\u0047r\u0069\u0064\u0073\u003a\u0020g\u0072\u0069d\u0073\u003d\u0025\u002b\u0076",_agdgg );
|
|
|
|
|
};var _abad []rulingList ;for _ ,_eeba :=range _agdgg {if _bcde ,_aaaff :=_eeba .isActualGrid ();_aaaff {_eeba =_bcde ;_eeba =_eeba .snapToGroups ();_abad =append (_abad ,_eeba );};};if _acea {_adedc ("t\u006fG\u0072\u0069\u0064\u0073\u003a\u0020\u0061\u0063t\u0075\u0061\u006c\u0047ri\u0064\u0073",_abad );
|
|
|
|
|
_gd .Log .Info ("\u0074\u006f\u0047\u0072\u0069\u0064\u0073\u003a\u0020\u0067\u0072\u0069\u0064\u0073\u003d%\u0064 \u0061\u0063\u0074\u0075\u0061\u006c\u0047\u0072\u0069\u0064\u0073\u003d\u0025\u0064",len (_agdgg ),len (_abad ));};return _abad ;};func (_gdc *stateStack )push (_dede *textState ){_ddcf :=*_dede ;
|
|
|
|
|
*_gdc =append (*_gdc ,&_ddcf )};func (_bfeb intSet )has (_egeca int )bool {_ ,_gaab :=_bfeb [_egeca ];return _gaab };func (_abfce *shapesState )devicePoint (_ggbag ,_dafcf float64 )_ff .Point {_edda :=_abfce ._aea .Mult (_abfce ._ffc );_ggbag ,_dafcf =_edda .Transform (_ggbag ,_dafcf );
|
|
|
|
|
return _ff .NewPoint (_ggbag ,_dafcf );};func (_dgbgc paraList )topoOrder ()[]int {if _gaca {_gd .Log .Info ("\u0074\u006f\u0070\u006f\u004f\u0072\u0064\u0065\u0072\u003a");};_gccae :=len (_dgbgc );_deac :=make ([]bool ,_gccae );_dcdb :=make ([]int ,0,_gccae );
|
|
|
|
|
_eec :=_dgbgc .llyOrdering ();var _bfca func (_bcbc int );_bfca =func (_cfbe int ){_deac [_cfbe ]=true ;for _abfcea :=0;_abfcea < _gccae ;_abfcea ++{if !_deac [_abfcea ]{if _dgbgc .readBefore (_eec ,_cfbe ,_abfcea ){_bfca (_abfcea );};};};_dcdb =append (_dcdb ,_cfbe );
|
|
|
|
|
};for _cffg :=0;_cffg < _gccae ;_cffg ++{if !_deac [_cffg ]{_bfca (_cffg );};};return _ddfbc (_dcdb );};func (_bcdd *textPara )bbox ()_bc .PdfRectangle {return _bcdd .PdfRectangle };func (_addd rulingList )primaries ()[]float64 {_eggcd :=make (map[float64 ]struct{},len (_addd ));
|
|
|
|
|
for _ ,_accd :=range _addd {_eggcd [_accd ._aadb ]=struct{}{};};_fffc :=make ([]float64 ,len (_eggcd ));_dcgfab :=0;for _deee :=range _eggcd {_fffc [_dcgfab ]=_deee ;_dcgfab ++;};_b .Float64s (_fffc );return _fffc ;};const (_bdcdb =1.0e-6;_ebdf =1.0e-4;
|
|
|
|
|
_gaga =10;_gbeg =6;_bbfa =0.5;_dfgfb =0.12;_fgea =0.19;_cfbg =0.04;_adbb =0.04;_geea =1.0;_febe =0.04;_ffgec =0.4;_defgd =0.7;_cede =1.0;_bfff =0.1;_bdbg =1.4;_edg =0.46;_ageg =0.02;_gcbde =0.2;_gacc =0.5;_gdcd =4;_fbfe =4.0;_ceaeb =6;_gcfa =0.3;_eadd =0.01;
|
|
|
|
|
_dcae =0.02;_gfcc =2;_abcc =2;_gedf =500;_cbged =4.0;_aae =4.0;_cccd =0.05;_gbgd =0.1;_cdbcd =2.0;_gdff =2.0;_fcaf =1.5;_cedd =3.0;_ebda =0.25;);var (_ddcda =map[rune ]string {0x0060:"\u0300",0x02CB:"\u0300",0x0027:"\u0301",0x00B4:"\u0301",0x02B9:"\u0301",0x02CA:"\u0301",0x005E:"\u0302",0x02C6:"\u0302",0x007E:"\u0303",0x02DC:"\u0303",0x00AF:"\u0304",0x02C9:"\u0304",0x02D8:"\u0306",0x02D9:"\u0307",0x00A8:"\u0308",0x00B0:"\u030a",0x02DA:"\u030a",0x02BA:"\u030b",0x02DD:"\u030b",0x02C7:"\u030c",0x02C8:"\u030d",0x0022:"\u030e",0x02BB:"\u0312",0x02BC:"\u0313",0x0486:"\u0313",0x055A:"\u0313",0x02BD:"\u0314",0x0485:"\u0314",0x0559:"\u0314",0x02D4:"\u031d",0x02D5:"\u031e",0x02D6:"\u031f",0x02D7:"\u0320",0x02B2:"\u0321",0x00B8:"\u0327",0x02CC:"\u0329",0x02B7:"\u032b",0x02CD:"\u0331",0x005F:"\u0332",0x204E:"\u0359"};
|
|
|
|
|
);func (_eeda *shapesState )addPoint (_agb ,_dcf float64 ){_caca :=_eeda .establishSubpath ();_ecb :=_eeda .devicePoint (_agb ,_dcf );if _caca ==nil {_eeda ._bbfc =true ;_eeda ._egd =_ecb ;}else {_caca .add (_ecb );};};func (_fadd paraList )findGridTables (_dded []gridTiling )[]*textTable {if _gcdc {_gd .Log .Info ("\u0066i\u006e\u0064\u0047\u0072\u0069\u0064\u0054\u0061\u0062\u006c\u0065s\u003a\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073",len (_fadd ));
|
|
|
|
|
for _afec ,_deag :=range _fadd {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_afec ,_deag );};};var _gbce []*textTable ;for _eddd ,_begbd :=range _dded {_ffcgb ,_bdfe :=_fadd .findTableGrid (_begbd );if _ffcgb !=nil {_ffcgb .log (_ae .Sprintf ("\u0066\u0069\u006e\u0064Ta\u0062\u006c\u0065\u0057\u0069\u0074\u0068\u0047\u0072\u0069\u0064\u0073\u003a\u0020%\u0064",_eddd ));
|
|
|
|
|
_gbce =append (_gbce ,_ffcgb );_ffcgb .markCells ();};for _bdgeg :=range _bdfe {_bdgeg ._abgd =true ;};};if _gcdc {_gd .Log .Info ("\u0066i\u006e\u0064\u0047\u0072i\u0064\u0054\u0061\u0062\u006ce\u0073:\u0020%\u0064\u0020\u0074\u0061\u0062\u006c\u0065s",len (_gbce ));
|
|
|
|
|
};return _gbce ;};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `w`.
|
|
|
|
|
func (_ecfc *textWord )String ()string {return _ae .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_ecfc ._gdce ,_ecfc .PdfRectangle ,_ecfc ._edega ,_ecfc ._dgcdg );
|
|
|
|
|
};func _gece (_dcgfa ,_dege _bc .PdfRectangle )bool {return _dcgfa .Lly <=_dege .Ury &&_dege .Lly <=_dcgfa .Ury ;};func (_adfd *wordBag )removeDuplicates (){if _eaebg {_gd .Log .Info ("r\u0065m\u006f\u0076\u0065\u0044\u0075\u0070\u006c\u0069c\u0061\u0074\u0065\u0073: \u0025\u0071",_adfd .text ());
|
|
|
|
|
};for _ ,_ffef :=range _adfd .depthIndexes (){if len (_adfd ._aaaf [_ffef ])==0{continue ;};_eebc :=_adfd ._aaaf [_ffef ][0];_gdccc :=_gcbde *_eebc ._edega ;_dgac :=_eebc ._gdce ;for _ ,_bgfb :=range _adfd .depthBand (_dgac ,_dgac +_gdccc ){_ggbc :=map[*textWord ]struct{}{};
|
|
|
|
|
_afab :=_adfd ._aaaf [_bgfb ];for _ ,_bdafg :=range _afab {if _ ,_eggae :=_ggbc [_bdafg ];_eggae {continue ;};for _ ,_aedf :=range _afab {if _ ,_fdaf :=_ggbc [_aedf ];_fdaf {continue ;};if _aedf !=_bdafg &&_aedf ._dgcdg ==_bdafg ._dgcdg &&_f .Abs (_aedf .Llx -_bdafg .Llx )< _gdccc &&_f .Abs (_aedf .Urx -_bdafg .Urx )< _gdccc &&_f .Abs (_aedf .Lly -_bdafg .Lly )< _gdccc &&_f .Abs (_aedf .Ury -_bdafg .Ury )< _gdccc {_ggbc [_aedf ]=struct{}{};
|
|
|
|
|
};};};if len (_ggbc )> 0{_abeg :=0;for _ ,_bffa :=range _afab {if _ ,_bdf :=_ggbc [_bffa ];!_bdf {_afab [_abeg ]=_bffa ;_abeg ++;};};_adfd ._aaaf [_bgfb ]=_afab [:len (_afab )-len (_ggbc )];if len (_adfd ._aaaf [_bgfb ])==0{delete (_adfd ._aaaf ,_bgfb );
|
|
|
|
|
};};};};};func (_cggdf *textTable )compositeColCorridors ()map[int ][]float64 {_ageb :=make (map[int ][]float64 ,_cggdf ._cacec );if _gcdc {_gd .Log .Info ("\u0063\u006f\u006d\u0070o\u0073\u0069\u0074\u0065\u0043\u006f\u006c\u0043\u006f\u0072r\u0069d\u006f\u0072\u0073\u003a\u0020\u0077\u003d%\u0064\u0020",_cggdf ._cacec );
|
|
|
|
|
};for _ddgg :=0;_ddgg < _cggdf ._cacec ;_ddgg ++{_ageb [_ddgg ]=nil ;};return _ageb ;};type shapesState struct{_ffc _ff .Matrix ;_aea _ff .Matrix ;_cbdd []*subpath ;_bbfc bool ;_egd _ff .Point ;_eeg *textObject ;};func _fafg (_cgff ,_eeed float64 )bool {return _f .Abs (_cgff -_eeed )<=_cdbcd };
|
|
|
|
|
func (_gddb *ruling )alignsPrimary (_abfgc *ruling )bool {return _gddb ._beb ==_abfgc ._beb &&_f .Abs (_gddb ._aadb -_abfgc ._aadb )< _gdff *0.5;};func (_cecc *compositeCell )updateBBox (){for _ ,_bbbbe :=range _cecc .paraList {_cecc .PdfRectangle =_deb (_cecc .PdfRectangle ,_bbbbe .PdfRectangle );
|
|
|
|
|
};};const (_gacf =true ;_acec =true ;_edaf =true ;_gdfd =false ;_egcbb =false ;_fdab =6;_facb =3.0;_fcbe =200;_addf =true ;_decc =true ;_gbaa =true ;_cbae =true ;_agcb =false ;);func (_gafd *textObject )getFontDirect (_bdcc string )(*_bc .PdfFont ,error ){_cgdc ,_fdgc :=_gafd .getFontDict (_bdcc );
|
|
|
|
|
if _fdgc !=nil {return nil ,_fdgc ;};_fab ,_fdgc :=_bc .NewPdfFontFromPdfObject (_cgdc );if _fdgc !=nil {_gd .Log .Debug ("\u0067\u0065\u0074\u0046\u006f\u006e\u0074\u0044\u0069\u0072\u0065\u0063\u0074\u003a\u0020\u004e\u0065\u0077Pd\u0066F\u006f\u006e\u0074\u0046\u0072\u006f\u006d\u0050\u0064\u0066\u004f\u0062j\u0065\u0063\u0074\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u006e\u0061\u006d\u0065\u003d%\u0023\u0071\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bdcc ,_fdgc );
|
|
|
|
|
};return _fab ,_fdgc ;};func (_aeeg paraList )findTextTables ()[]*textTable {var _acad []*textTable ;for _ ,_dfdg :=range _aeeg {if _dfdg .taken ()||_dfdg .Width ()==0{continue ;};_gfcd :=_dfdg .isAtom ();if _gfcd ==nil {continue ;};_gfcd .growTable ();
|
|
|
|
|
if _gfcd ._cacec *_gfcd ._bfbba < _ceaeb {continue ;};_gfcd .markCells ();_gfcd .log ("\u0067\u0072\u006fw\u006e");_acad =append (_acad ,_gfcd );};return _acad ;};type lineRuling struct{_cceae rulingKind ;_ddda markKind ;_fd .Color ;_agcee ,_fgga _ff .Point ;
|
|
|
|
|
};func (_ced *textObject )checkOp (_fag *_de .ContentStreamOperation ,_ggg int ,_bcga bool )(_dgb bool ,_ddae error ){if _ced ==nil {var _ccbf []_eb .PdfObject ;if _ggg > 0{_ccbf =_fag .Params ;if len (_ccbf )> _ggg {_ccbf =_ccbf [:_ggg ];};};_gd .Log .Debug ("\u0025\u0023q \u006f\u0070\u0065r\u0061\u006e\u0064\u0020out\u0073id\u0065\u0020\u0074\u0065\u0078\u0074\u002e p\u0061\u0072\u0061\u006d\u0073\u003d\u0025+\u0076",_fag .Operand ,_ccbf );
|
|
|
|
|
};if _ggg >=0{if len (_fag .Params )!=_ggg {if _bcga {_ddae =_dd .New ("\u0069n\u0063\u006f\u0072\u0072e\u0063\u0074\u0020\u0070\u0061r\u0061m\u0065t\u0065\u0072\u0020\u0063\u006f\u0075\u006et");};_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0025\u0023\u0071\u0020\u0073\u0068\u006f\u0075\u006c\u0064\u0020h\u0061\u0076\u0065\u0020\u0025\u0064\u0020i\u006e\u0070\u0075\u0074\u0020\u0070\u0061\u0072\u0061\u006d\u0073,\u0020\u0067\u006f\u0074\u0020\u0025\u0064\u0020\u0025\u002b\u0076",_fag .Operand ,_ggg ,len (_fag .Params ),_fag .Params );
|
|
|
|
|
return false ,_ddae ;};};return true ,nil ;};func _gfebf (_ddbd ,_dged float64 )string {_fgfc :=!_gfbdg (_ddbd -_dged );if _fgfc {return "\u000a";};return "\u0020";};
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
func (_gac TextMarkArray )String ()string {_dbce :=len (_gac ._fad );if _dbce ==0{return "\u0045\u004d\u0050T\u0059";};_ggbb :=_gac ._fad [0];_agf :=_gac ._fad [_dbce -1];return _ae .Sprintf ("\u007b\u0054\u0045\u0058\u0054\u004d\u0041\u0052K\u0041\u0052\u0052AY\u003a\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025s\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d%\u0073\u007d",_dbce ,_ggbb ,_agf );
|
|
|
|
|
};func _faaa (_dea float64 )int {var _abab int ;if _dea >=0{_abab =int (_dea /_gbeg );}else {_abab =int (_dea /_gbeg )-1;};return _abab ;};func (_faeb *textObject )setTextMatrix (_gcbd []float64 ){if len (_gcbd )!=6{_gd .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u006c\u0065\u006e\u0028\u0066\u0029\u0020\u0021\u003d\u0020\u0036\u0020\u0028\u0025\u0064\u0029",len (_gcbd ));
|
|
|
|
|
return ;};_bgaf ,_cgg ,_gbb ,_aeda ,_dgga ,_addb :=_gcbd [0],_gcbd [1],_gcbd [2],_gcbd [3],_gcbd [4],_gcbd [5];_faeb ._cdf =_ff .NewMatrix (_bgaf ,_cgg ,_gbb ,_aeda ,_dgga ,_addb );_faeb ._ggdd =_faeb ._cdf ;};func (_ffgb paraList )toTextMarks ()[]TextMark {_eabf :=0;
|
|
|
|
|
var _fage []TextMark ;for _gffd ,_addc :=range _ffgb {if _addc ._fbed {continue ;};_feab :=_addc .toTextMarks (&_eabf );_fage =append (_fage ,_feab ...);if _gffd !=len (_ffgb )-1{if _cdea (_addc ,_ffgb [_gffd +1]){_fage =_aada (_fage ,&_eabf ,"\u0020");
|
|
|
|
|
}else {_fage =_aada (_fage ,&_eabf ,"\u000a");_fage =_aada (_fage ,&_eabf ,"\u000a");};};};_fage =_aada (_fage ,&_eabf ,"\u000a");_fage =_aada (_fage ,&_eabf ,"\u000a");return _fage ;};func (_ffadc *wordBag )highestWord (_cgcdb int ,_bgd ,_agcd float64 )*textWord {for _ ,_ccbg :=range _ffadc ._aaaf [_cgcdb ]{if _bgd <=_ccbg ._gdce &&_ccbg ._gdce <=_agcd {return _ccbg ;
|
|
|
|
|
};};return nil ;};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `b`.
|
|
|
|
|
func (_bbdc *wordBag )String ()string {var _gabb []string ;for _ ,_aff :=range _bbdc .depthIndexes (){_edc :=_bbdc ._aaaf [_aff ];for _ ,_fge :=range _edc {_gabb =append (_gabb ,_fge ._dgcdg );};};return _ae .Sprintf ("\u0025.\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065=\u0025\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0025\u0071",_bbdc .PdfRectangle ,_bbdc ._cfec ,len (_gabb ),_gabb );
|
|
|
|
|
};func (_dcee *textPara )toTextMarks (_caa *int )[]TextMark {if _dcee ._egea ==nil {return _dcee .toCellTextMarks (_caa );};var _gdda []TextMark ;for _ffgbb :=0;_ffgbb < _dcee ._egea ._bfbba ;_ffgbb ++{for _bafbg :=0;_bafbg < _dcee ._egea ._cacec ;_bafbg ++{_acdcb :=_dcee ._egea .get (_bafbg ,_ffgbb );
|
|
|
|
|
if _acdcb ==nil {_gdda =_aada (_gdda ,_caa ,"\u0009");}else {_begb :=_acdcb .toCellTextMarks (_caa );_gdda =append (_gdda ,_begb ...);};_gdda =_aada (_gdda ,_caa ,"\u0020");};if _ffgbb < _dcee ._egea ._bfbba -1{_gdda =_aada (_gdda ,_caa ,"\u000a");};};
|
|
|
|
|
return _gdda ;};func _fbgcg (_fccg []*textWord ,_aaad *textWord )[]*textWord {for _febb ,_egdcf :=range _fccg {if _egdcf ==_aaad {return _dcea (_fccg ,_febb );};};_gd .Log .Error ("\u0072\u0065\u006d\u006f\u0076e\u0057\u006f\u0072\u0064\u003a\u0020\u0077\u006f\u0072\u0064\u0073\u0020\u0064o\u0065\u0073\u006e\u0027\u0074\u0020\u0063\u006f\u006e\u0074\u0061\u0069\u006e\u0020\u0077\u006f\u0072\u0064\u003d\u0025\u0073",_aaad );
|
|
|
|
|
return nil ;};func (_aaba lineRuling )asRuling ()(*ruling ,bool ){_ceda :=ruling {_beb :_aaba ._cceae ,Color :_aaba .Color ,_fafbe :_bfgf };switch _aaba ._cceae {case _dddb :_ceda ._aadb =_aaba .xMean ();_ceda ._eacb =_f .Min (_aaba ._agcee .Y ,_aaba ._fgga .Y );
|
|
|
|
|
_ceda ._cbf =_f .Max (_aaba ._agcee .Y ,_aaba ._fgga .Y );case _dffd :_ceda ._aadb =_aaba .yMean ();_ceda ._eacb =_f .Min (_aaba ._agcee .X ,_aaba ._fgga .X );_ceda ._cbf =_f .Max (_aaba ._agcee .X ,_aaba ._fgga .X );default:_gd .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_aaba ._cceae );
|
|
|
|
|
return nil ,false ;};return &_ceda ,true ;};type intSet map[int ]struct{};var _eebg =TextMark {Text :"\u005b\u0058\u005d",Original :"\u0020",Meta :true ,FillColor :_fd .White ,StrokeColor :_fd .White };func (_eebed *shapesState )drawRectangle (_dcgbb ,_aad ,_eagfd ,_cefb float64 ){if _egfa {_gaega :=_eebed .devicePoint (_dcgbb ,_aad );
|
|
|
|
|
_egaa :=_eebed .devicePoint (_dcgbb +_eagfd ,_aad +_cefb );_afb :=_bc .PdfRectangle {Llx :_gaega .X ,Lly :_gaega .Y ,Urx :_egaa .X ,Ury :_egaa .Y };_gd .Log .Info ("d\u0072a\u0077\u0052\u0065\u0063\u0074\u0061\u006e\u0067l\u0065\u003a\u0020\u00256.\u0032\u0066",_afb );
|
|
|
|
|
};_eebed .newSubPath ();_eebed .moveTo (_dcgbb ,_aad );_eebed .lineTo (_dcgbb +_eagfd ,_aad );_eebed .lineTo (_dcgbb +_eagfd ,_aad +_cefb );_eebed .lineTo (_dcgbb ,_aad +_cefb );_eebed .closePath ();};func (_gf *imageExtractContext )extractContentStreamImages (_ddf string ,_fef *_bc .PdfPageResources )error {_gae :=_de .NewContentStreamParser (_ddf );
|
|
|
|
|
_gfe ,_adf :=_gae .Parse ();if _adf !=nil {return _adf ;};if _gf ._ed ==nil {_gf ._ed =map[*_eb .PdfObjectStream ]*cachedImage {};};if _gf ._cgf ==nil {_gf ._cgf =&ImageExtractOptions {};};_dcg :=_de .NewContentStreamProcessor (*_gfe );_dcg .AddHandler (_de .HandlerConditionEnumAllOperands ,"",_gf .processOperand );
|
|
|
|
|
return _dcg .Process (_fef );};func (_bcbcb rulingList )splitSec ()[]rulingList {_b .Slice (_bcbcb ,func (_dbfgg ,_cccdc int )bool {_fgffc ,_cfgbc :=_bcbcb [_dbfgg ],_bcbcb [_cccdc ];if _fgffc ._eacb !=_cfgbc ._eacb {return _fgffc ._eacb < _cfgbc ._eacb ;
|
|
|
|
|
};return _fgffc ._cbf < _cfgbc ._cbf ;});_adaf :=make (map[*ruling ]struct{},len (_bcbcb ));_edfcc :=func (_ebbb *ruling )rulingList {_dafd :=rulingList {_ebbb };_adaf [_ebbb ]=struct{}{};for _ ,_cebff :=range _bcbcb {if _ ,_bdec :=_adaf [_cebff ];_bdec {continue ;
|
|
|
|
|
};for _ ,_fgag :=range _dafd {if _cebff .alignsSec (_fgag ){_dafd =append (_dafd ,_cebff );_adaf [_cebff ]=struct{}{};break ;};};};return _dafd ;};_bfgac :=[]rulingList {_edfcc (_bcbcb [0])};for _ ,_dcca :=range _bcbcb [1:]{if _ ,_bgba :=_adaf [_dcca ];
|
|
|
|
|
_bgba {continue ;};_bfgac =append (_bfgac ,_edfcc (_dcca ));};return _bfgac ;};func (_aab *textObject )getFont (_fbef string )(*_bc .PdfFont ,error ){if _aab ._aac ._ca !=nil {_aab ._aac ._ceg ++;_aecc ,_acgc :=_aab ._aac ._ca [_fbef ];if _acgc {_aecc ._gdbe =_aab ._aac ._ceg ;
|
|
|
|
|
return _aecc ._faef ,nil ;};};_ddfg ,_bgga :=_aab .getFontDirect (_fbef );if _bgga !=nil {return nil ,_bgga ;};if _aab ._aac ._ca !=nil {_egae :=fontEntry {_ddfg ,_aab ._aac ._ceg };if len (_aab ._aac ._ca )>=_age {var _cgcd []string ;for _ade :=range _aab ._aac ._ca {_cgcd =append (_cgcd ,_ade );
|
|
|
|
|
};_b .Slice (_cgcd ,func (_aggc ,_eaag int )bool {return _aab ._aac ._ca [_cgcd [_aggc ]]._gdbe < _aab ._aac ._ca [_cgcd [_eaag ]]._gdbe ;});delete (_aab ._aac ._ca ,_cgcd [0]);};_aab ._aac ._ca [_fbef ]=_egae ;};return _ddfg ,nil ;};
|
|
|
|
|
|
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
|
|
|
|
|
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
|
|
|
|
|
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
|
|
|
|
|
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
|
|
|
|
|
type RenderMode int ;func (_gdcb *textPara )isAtom ()*textTable {_dcafe :=_gdcb ;_aage :=_gdcb ._fefc ;_cdfd :=_gdcb ._eebd ;if !(_aage !=nil &&!_aage ._abgd &&_cdfd !=nil &&!_cdfd ._abgd ){return nil ;};_effc :=_aage ._eebd ;if !(_effc !=nil &&!_effc ._abgd &&_effc ==_cdfd ._fefc ){return nil ;
|
|
|
|
|
};return _efdf (_dcafe ,_aage ,_cdfd ,_effc );};func (_caaf rulingList )aligned ()bool {if len (_caaf )< 2{return false ;};_dbge :=make (map[*ruling ]int );_dbge [_caaf [0]]=0;for _ ,_aebd :=range _caaf [1:]{_dgdg :=false ;for _dggd :=range _dbge {if _aebd .gridIntersecting (_dggd ){_dbge [_dggd ]++;
|
|
|
|
|
_dgdg =true ;break ;};};if !_dgdg {_dbge [_aebd ]=0;};};_dfgeg :=0;for _ ,_bgfff :=range _dbge {if _bgfff ==0{_dfgeg ++;};};_dbfbc :=float64 (_dfgeg )/float64 (len (_caaf ));_gcebd :=_dbfbc <=1.0-_ebda ;if _acea {_gd .Log .Info ("\u0061\u006c\u0069\u0067\u006e\u0065\u0064\u003d\u0025\u0074\u0020\u0075\u006em\u0061\u0074\u0063\u0068\u0065\u0064=\u0025\u002e\u0032\u0066\u003d\u0025\u0064\u002f\u0025\u0064\u0020\u0076\u0065c\u0073\u003d\u0025\u0073",_gcebd ,_dbfbc ,_dfgeg ,len (_caaf ),_caaf .String ());
|
|
|
|
|
};return _gcebd ;};func _ebag (_afeg _bc .PdfRectangle )*ruling {return &ruling {_beb :_dffd ,_aadb :_afeg .Lly ,_eacb :_afeg .Llx ,_cbf :_afeg .Urx };};func _daceb (_bdbad ,_bebf int )uint64 {return uint64 (_bdbad )*0x1000000+uint64 (_bebf )};
|
|
|
|
|
|
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
type PageText struct{_ddgb []*textMark ;_fdaa string ;_dcbg []TextMark ;_dfc []TextTable ;_fcd _bc .PdfRectangle ;_bade []pathSection ;_efg []pathSection ;};const (_cccc markKind =iota ;_bfgf ;_cfgb ;_ggfd ;);func (_bcaee pathSection )bbox ()_bc .PdfRectangle {_fff :=_bcaee ._bbd [0]._cfac [0];
|
|
|
|
|
_abfc :=_bc .PdfRectangle {Llx :_fff .X ,Urx :_fff .X ,Lly :_fff .Y ,Ury :_fff .Y };_dfbb :=func (_bea _ff .Point ){if _bea .X < _abfc .Llx {_abfc .Llx =_bea .X ;}else if _bea .X > _abfc .Urx {_abfc .Urx =_bea .X ;};if _bea .Y < _abfc .Lly {_abfc .Lly =_bea .Y ;
|
|
|
|
|
}else if _bea .Y > _abfc .Ury {_abfc .Ury =_bea .Y ;};};for _ ,_eea :=range _bcaee ._bbd [0]._cfac [1:]{_dfbb (_eea );};for _ ,_bbac :=range _bcaee ._bbd [1:]{for _ ,_dbca :=range _bbac ._cfac {_dfbb (_dbca );};};return _abfc ;};func _cbfaa (_adga []*textMark ,_dbea _bc .PdfRectangle )*textWord {_ggbcg :=_adga [0].PdfRectangle ;
|
|
|
|
|
_gdag :=_adga [0]._ebgc ;for _ ,_fbac :=range _adga [1:]{_ggbcg =_deb (_ggbcg ,_fbac .PdfRectangle );if _fbac ._ebgc > _gdag {_gdag =_fbac ._ebgc ;};};return &textWord {PdfRectangle :_ggbcg ,_eeefd :_adga ,_gdce :_dbea .Ury -_ggbcg .Lly ,_edega :_gdag };
|
|
|
|
|
};func (_gbcd paraList )log (_ccea string ){if !_gaca {return ;};_gd .Log .Info ("%\u0038\u0073\u003a\u0020\u0025\u0064 \u0070\u0061\u0072\u0061\u0073\u0020=\u003d\u003d\u003d\u003d\u003d\u003d\u002d-\u002d\u002d\u002d\u002d\u002d\u003d\u003d\u003d\u003d\u003d=\u003d",_ccea ,len (_gbcd ));
|
|
|
|
|
for _ggcb ,_cffgd :=range _gbcd {if _cffgd ==nil {continue ;};_cbac :=_cffgd .text ();_afae :="\u0020\u0020";if _cffgd ._egea !=nil {_afae =_ae .Sprintf ("\u005b%\u0064\u0078\u0025\u0064\u005d",_cffgd ._egea ._cacec ,_cffgd ._egea ._bfbba );};_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0025s\u0020\u0025\u0071\u000a",_ggcb ,_cffgd .PdfRectangle ,_afae ,_eaed (_cbac ,50));
|
|
|
|
|
};};func (_bagg paraList )applyTables (_bccd []*textTable )paraList {var _ecef paraList ;for _ ,_decgf :=range _bccd {_ecef =append (_ecef ,_decgf .newTablePara ());};for _ ,_gagdf :=range _bagg {if _gagdf ._abgd {continue ;};_ecef =append (_ecef ,_gagdf );
|
|
|
|
|
};return _ecef ;};func (_gfeca paraList )tables ()[]TextTable {var _ebegd []TextTable ;if _gcdc {_gd .Log .Info ("\u0070\u0061\u0072\u0061\u0073\u002e\u0074\u0061\u0062\u006c\u0065\u0073\u003a");};for _ ,_cfee :=range _gfeca {_bfggf :=_cfee ._egea ;if _bfggf !=nil &&_bfggf .isExportable (){_ebegd =append (_ebegd ,_bfggf .toTextTable ());
|
|
|
|
|
};};return _ebegd ;};func _eebdg (_gcgd map[int ][]float64 )string {_acgegc :=_cdag (_gcgd );_bbbc :=make ([]string ,len (_gcgd ));for _gaeb ,_aeecd :=range _acgegc {_bbbc [_gaeb ]=_ae .Sprintf ("\u0025\u0064\u003a\u0020\u0025\u002e\u0032\u0066",_aeecd ,_gcgd [_aeecd ]);
|
|
|
|
|
};return _ae .Sprintf ("\u007b\u0025\u0073\u007d",_a .Join (_bbbc ,"\u002c\u0020"));};func _fcbee (_dbdc map[float64 ]gridTile )[]float64 {_ebefga :=make ([]float64 ,0,len (_dbdc ));for _fade :=range _dbdc {_ebefga =append (_ebefga ,_fade );};_b .Float64s (_ebefga );
|
|
|
|
|
return _ebefga ;};func (_dgec *textTable )newTablePara ()*textPara {_fbff :=_dgec .computeBbox ();_ebcea :=&textPara {PdfRectangle :_fbff ,_bcgb :_fbff ,_egea :_dgec };if _gcdc {_gd .Log .Info ("\u006e\u0065w\u0054\u0061\u0062l\u0065\u0050\u0061\u0072\u0061\u003a\u0020\u0025\u0073",_ebcea );
|
|
|
|
|
};return _ebcea ;};
|
|
|
|
|
|
|
|
|
|
// ImageExtractOptions contains options for controlling image extraction from
|
|
|
|
|
// PDF pages.
|
|
|
|
|
type ImageExtractOptions struct{IncludeInlineStencilMasks bool ;};func (_feccf rulingList )primMinMax ()(float64 ,float64 ){_bgcc ,_ceeb :=_feccf [0]._aadb ,_feccf [0]._aadb ;for _ ,_beed :=range _feccf [1:]{if _beed ._aadb < _bgcc {_bgcc =_beed ._aadb ;
|
|
|
|
|
}else if _beed ._aadb > _ceeb {_ceeb =_beed ._aadb ;};};return _bgcc ,_ceeb ;};type event struct{_agff float64 ;_bbdd bool ;_fbdcc int ;};func (_acd *textObject )getCurrentFont ()*_bc .PdfFont {_fbde :=_acd ._efb ._afc ;if _fbde ==nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u004e\u006f\u0020\u0066\u006f\u006e\u0074\u0020\u0064\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020U\u0073\u0069\u006e\u0067\u0020d\u0065\u0066a\u0075\u006c\u0074\u002e");
|
|
|
|
|
return _bc .DefaultFont ();};return _fbde ;};func _ebfa (_deef string )string {_efef :=[]rune (_deef );return string (_efef [:len (_efef )-1])};
|
|
|
|
|
|
|
|
|
|
// String returns a human readable description of `s`.
|
|
|
|
|
func (_cbgede intSet )String ()string {var _aedd []int ;for _cdaacg :=range _cbgede {if _cbgede .has (_cdaacg ){_aedd =append (_aedd ,_cdaacg );};};_b .Ints (_aedd );return _ae .Sprintf ("\u0025\u002b\u0076",_aedd );};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-04-23 20:28:14 +00:00
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
|
|
|
|
|
// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
|
|
|
|
|
// `start` and `end` are offsets in the extracted text.
|
|
|
|
|
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and
|
|
|
|
|
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
|
2021-05-11 00:01:27 +00:00
|
|
|
|
func (_aefg *TextMarkArray )RangeOffset (start ,end int )(*TextMarkArray ,error ){if _aefg ==nil {return nil ,_dd .New ("\u006da\u003d\u003d\u006e\u0069\u006c");};if end < start {return nil ,_ae .Errorf ("\u0065\u006e\u0064\u0020\u003c\u0020\u0073\u0074\u0061\u0072\u0074\u002e\u0020\u0052\u0061n\u0067\u0065\u004f\u0066\u0066\u0073\u0065\u0074\u0020\u006e\u006f\u0074\u0020d\u0065\u0066\u0069\u006e\u0065\u0064\u002e\u0020\u0073\u0074\u0061\u0072t=\u0025\u0064\u0020\u0065\u006e\u0064\u003d\u0025\u0064\u0020",start ,end );
|
|
|
|
|
};_bced :=len (_aefg ._fad );if _bced ==0{return _aefg ,nil ;};if start < _aefg ._fad [0].Offset {start =_aefg ._fad [0].Offset ;};if end > _aefg ._fad [_bced -1].Offset +1{end =_aefg ._fad [_bced -1].Offset +1;};_gcff :=_b .Search (_bced ,func (_edfc int )bool {return _aefg ._fad [_edfc ].Offset +len (_aefg ._fad [_edfc ].Text )-1>=start });
|
|
|
|
|
if !(0<=_gcff &&_gcff < _bced ){_fgdd :=_ae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020\u0072\u0061\u006e\u0067\u0065\u002e\u0020\u0073\u0074\u0061\u0072\u0074\u003d%\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009 \u006c\u0061\u0073\u0074\u003d%\u0076",start ,_gcff ,_bced ,_aefg ._fad [0],_aefg ._fad [_bced -1]);
|
|
|
|
|
return nil ,_fgdd ;};_efce :=_b .Search (_bced ,func (_ddd int )bool {return _aefg ._fad [_ddd ].Offset > end -1});if !(0<=_efce &&_efce < _bced ){_dabg :=_ae .Errorf ("\u004f\u0075\u0074\u0020\u006f\u0066\u0020r\u0061\u006e\u0067e\u002e\u0020\u0065n\u0064\u003d%\u0064\u0020\u0069\u0045\u006e\u0064=\u0025d \u006c\u0065\u006e\u003d\u0025\u0064\u000a\u0009\u0066\u0069\u0072\u0073\u0074\u003d\u0025\u0076\u000a\u0009\u0020\u006c\u0061\u0073\u0074\u003d\u0025\u0076",end ,_efce ,_bced ,_aefg ._fad [0],_aefg ._fad [_bced -1]);
|
|
|
|
|
return nil ,_dabg ;};if _efce <=_gcff {return nil ,_ae .Errorf ("\u0069\u0045\u006e\u0064\u0020\u003c=\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003a\u0020\u0073\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020\u0065\u006ed\u003d\u0025\u0064\u0020\u0069\u0053\u0074\u0061\u0072\u0074\u003d\u0025\u0064\u0020i\u0045n\u0064\u003d\u0025\u0064",start ,end ,_gcff ,_efce );
|
|
|
|
|
};return &TextMarkArray {_fad :_aefg ._fad [_gcff :_efce ]},nil ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a description of `l`.
|
|
|
|
|
func (_efd *textLine )String ()string {return _ae .Sprintf ("\u0025\u002e2\u0066\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\"%\u0073\u0022",_efd ._cedfb ,_efd .PdfRectangle ,_efd ._cgdd ,_efd .text ());
|
|
|
|
|
};func (_gefa rulingList )sortStrict (){_b .Slice (_gefa ,func (_geff ,_dcdg int )bool {_bbcbf ,_accc :=_gefa [_geff ],_gefa [_dcdg ];_beaf ,_facc :=_bbcbf ._beb ,_accc ._beb ;if _beaf !=_facc {return _beaf > _facc ;};_ceff ,_faee :=_bbcbf ._aadb ,_accc ._aadb ;
|
|
|
|
|
if !_gfbdg (_ceff -_faee ){return _ceff < _faee ;};_ceff ,_faee =_bbcbf ._eacb ,_accc ._eacb ;if _ceff !=_faee {return _ceff < _faee ;};return _bbcbf ._cbf < _accc ._cbf ;});};func (_fcge *textLine )appendWord (_dbdf *textWord ){_fcge ._dee =append (_fcge ._dee ,_dbdf );
|
|
|
|
|
_fcge .PdfRectangle =_deb (_fcge .PdfRectangle ,_dbdf .PdfRectangle );if _dbdf ._edega > _fcge ._cgdd {_fcge ._cgdd =_dbdf ._edega ;};if _dbdf ._gdce > _fcge ._cedfb {_fcge ._cedfb =_dbdf ._gdce ;};};func (_aeece *ruling )alignsSec (_acag *ruling )bool {const _bceg =_gdff +1.0;
|
|
|
|
|
return _aeece ._eacb -_bceg <=_acag ._cbf &&_acag ._eacb -_bceg <=_aeece ._cbf ;};type pathSection struct{_bbd []*subpath ;_fd .Color ;};func (_ddbf rulingList )vertsHorzs ()(rulingList ,rulingList ){var _bgebd ,_cdfa rulingList ;for _ ,_badg :=range _ddbf {switch _badg ._beb {case _dddb :_bgebd =append (_bgebd ,_badg );
|
|
|
|
|
case _dffd :_cdfa =append (_cdfa ,_badg );};};return _bgebd ,_cdfa ;};func (_bgbec lineRuling )xMean ()float64 {return 0.5*(_bgbec ._agcee .X +_bgbec ._fgga .X )};func (_ggeg *wordBag )applyRemovals (_eagd map[int ]map[*textWord ]struct{}){for _aebg ,_efa :=range _eagd {if len (_efa )==0{continue ;
|
|
|
|
|
};_fabd :=_ggeg ._aaaf [_aebg ];_agce :=len (_fabd )-len (_efa );if _agce ==0{delete (_ggeg ._aaaf ,_aebg );continue ;};_ecaef :=make ([]*textWord ,_agce );_ccbb :=0;for _ ,_gegd :=range _fabd {if _ ,_dbga :=_efa [_gegd ];!_dbga {_ecaef [_ccbb ]=_gegd ;
|
|
|
|
|
_ccbb ++;};};_ggeg ._aaaf [_aebg ]=_ecaef ;};};func (_fdffb intSet )add (_fbafeg int ){_fdffb [_fbafeg ]=struct{}{}};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// TableCell is a cell in a TextTable.
|
|
|
|
|
type TableCell struct{
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// Text is the extracted text.
|
|
|
|
|
Text string ;
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// Marks returns the TextMarks corresponding to the text in Text.
|
|
|
|
|
Marks TextMarkArray ;};func (_gggc *textObject )getStrokeColor ()_fd .Color {return _cggac (_gggc ._faf .ColorspaceStroking ,_gggc ._faf .ColorStroking );};func (_cba *stateStack )empty ()bool {return len (*_cba )==0};func (_eedc *textMark )inDiacriticArea (_aded *textMark )bool {_edcb :=_eedc .Llx -_aded .Llx ;
|
|
|
|
|
_beccd :=_eedc .Urx -_aded .Urx ;_ebga :=_eedc .Lly -_aded .Lly ;return _f .Abs (_edcb +_beccd )< _eedc .Width ()*_gacc &&_f .Abs (_ebga )< _eedc .Height ()*_gacc ;};func (_ebdb *textTable )logComposite (_aebgc string ){if !_gcdc {return ;};_gd .Log .Info ("\u007e~\u007eP\u0061\u0072\u0061\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_ebdb ._cacec ,_ebdb ._bfbba ,_aebgc );
|
|
|
|
|
_ae .Printf ("\u0025\u0035\u0073 \u007c","");for _fgda :=0;_fgda < _ebdb ._cacec ;_fgda ++{_ae .Printf ("\u0025\u0033\u0064 \u007c",_fgda );};_ae .Println ("");_ae .Printf ("\u0025\u0035\u0073 \u002b","");for _cgge :=0;_cgge < _ebdb ._cacec ;_cgge ++{_ae .Printf ("\u0025\u0033\u0073 \u002b","\u002d\u002d\u002d");
|
|
|
|
|
};_ae .Println ("");for _cdec :=0;_cdec < _ebdb ._bfbba ;_cdec ++{_ae .Printf ("\u0025\u0035\u0064 \u007c",_cdec );for _dfae :=0;_dfae < _ebdb ._cacec ;_dfae ++{_ffee ,_ :=_ebdb ._ffgge [_daceb (_dfae ,_cdec )].parasBBox ();_ae .Printf ("\u0025\u0033\u0064 \u007c",len (_ffee ));
|
|
|
|
|
};_ae .Println ("");};_gd .Log .Info ("\u007e~\u007eT\u0065\u0078\u0074\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0073",_ebdb ._cacec ,_ebdb ._bfbba ,_aebgc );_ae .Printf ("\u0025\u0035\u0073 \u007c","");for _abdf :=0;_abdf < _ebdb ._cacec ;_abdf ++{_ae .Printf ("\u0025\u0031\u0032\u0064\u0020\u007c",_abdf );
|
|
|
|
|
};_ae .Println ("");_ae .Printf ("\u0025\u0035\u0073 \u002b","");for _ggfdb :=0;_ggfdb < _ebdb ._cacec ;_ggfdb ++{_ae .Print ("\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d\u002d-\u002d\u002d\u002d\u002b");};_ae .Println ("");for _abbdb :=0;_abbdb < _ebdb ._bfbba ;
|
|
|
|
|
_abbdb ++{_ae .Printf ("\u0025\u0035\u0064 \u007c",_abbdb );for _cebe :=0;_cebe < _ebdb ._cacec ;_cebe ++{_gfde ,_ :=_ebdb ._ffgge [_daceb (_cebe ,_abbdb )].parasBBox ();_degd :="";_geac :=_gfde .merge ();if _geac !=nil {_degd =_geac .text ();};_degd =_ae .Sprintf ("\u0025\u0071",_eaed (_degd ,12));
|
|
|
|
|
_degd =_degd [1:len (_degd )-1];_ae .Printf ("\u0025\u0031\u0032\u0073\u0020\u007c",_degd );};_ae .Println ("");};};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// Text returns the extracted page text.
|
|
|
|
|
func (_fcbg PageText )Text ()string {return _fcbg ._fdaa };func (_gcf *Extractor )extractPageText (_bbg string ,_cff *_bc .PdfPageResources ,_aaa _ff .Matrix ,_afe int )(*PageText ,int ,int ,error ){_gd .Log .Trace ("\u0065x\u0074\u0072\u0061\u0063t\u0050\u0061\u0067\u0065\u0054e\u0078t\u003a \u006c\u0065\u0076\u0065\u006c\u003d\u0025d",_afe );
|
|
|
|
|
_fbb :=&PageText {_fcd :_gcf ._dag };_dda :=_abgc (_gcf ._dag );var _gbg stateStack ;_bcfg :=_gge (_gcf ,_cff ,_de .GraphicsState {},&_dda ,&_gbg );_bfa :=shapesState {_aea :_aaa ,_ffc :_ff .IdentityMatrix (),_eeg :_bcfg };var _cdcgc bool ;if _afe > _daga {_gcc :=_dd .New ("\u0066\u006f\u0072\u006d s\u0074\u0061\u0063\u006b\u0020\u006f\u0076\u0065\u0072\u0066\u006c\u006f\u0077");
|
|
|
|
|
_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0065\u0078\u0074\u0072\u0061\u0063\u0074\u0050\u0061\u0067\u0065\u0054\u0065\u0078\u0074\u002e\u0020\u0072\u0065\u0063u\u0072\u0073\u0069\u006f\u006e\u0020\u006c\u0065\u0076\u0065\u006c\u003d\u0025\u0064 \u0065r\u0072\u003d\u0025\u0076",_afe ,_gcc );
|
|
|
|
|
return _fbb ,_dda ._cdd ,_dda ._ceab ,_gcc ;};_gbe :=_de .NewContentStreamParser (_bbg );_fcb ,_bdc :=_gbe .Parse ();if _bdc !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020e\u0078\u0074\u0072a\u0063\u0074\u0050\u0061g\u0065\u0054\u0065\u0078\u0074\u0020\u0070\u0061\u0072\u0073\u0065\u0020\u0066\u0061\u0069\u006c\u0065\u0064\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bdc );
|
|
|
|
|
return _fbb ,_dda ._cdd ,_dda ._ceab ,_bdc ;};_eed :=_de .NewContentStreamProcessor (*_fcb );_eed .AddHandler (_de .HandlerConditionEnumAllOperands ,"",func (_ead *_de .ContentStreamOperation ,_fdg _de .GraphicsState ,_egc *_bc .PdfPageResources )error {_cfad :=_ead .Operand ;
|
|
|
|
|
if _acge {_gd .Log .Info ("\u0026&\u0026\u0020\u006f\u0070\u003d\u0025s",_ead );};switch _cfad {case "\u0071":if _egfa {_gd .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_bfa ._ffc );};_gbg .push (&_dda );case "\u0051":if !_gbg .empty (){_dda =*_gbg .pop ();
|
|
|
|
|
};_bfa ._ffc =_fdg .CTM ;if _egfa {_gd .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_bfa ._ffc );};case "\u0042\u0054":if _cdcgc {_gd .Log .Debug ("\u0042\u0054\u0020\u0063\u0061\u006c\u006c\u0065\u0064\u0020\u0077\u0068\u0069\u006c\u0065 \u0069n\u0020\u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
|
|
|
|
_fbb ._ddgb =append (_fbb ._ddgb ,_bcfg ._ddeg ...);};_cdcgc =true ;_gccg :=_fdg ;_gccg .CTM =_aaa .Mult (_gccg .CTM );_bcfg =_gge (_gcf ,_egc ,_gccg ,&_dda ,&_gbg );_bfa ._eeg =_bcfg ;case "\u0045\u0054":if !_cdcgc {_gd .Log .Debug ("\u0045\u0054\u0020ca\u006c\u006c\u0065\u0064\u0020\u006f\u0075\u0074\u0073i\u0064e\u0020o\u0066 \u0061\u0020\u0074\u0065\u0078\u0074\u0020\u006f\u0062\u006a\u0065\u0063\u0074");
|
|
|
|
|
};_cdcgc =false ;_fbb ._ddgb =append (_fbb ._ddgb ,_bcfg ._ddeg ...);_bcfg .reset ();case "\u0054\u002a":_bcfg .nextLine ();case "\u0054\u0064":if _dbf ,_baec :=_bcfg .checkOp (_ead ,2,true );!_dbf {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_baec );
|
|
|
|
|
return _baec ;};_fbe ,_ddc ,_gce :=_gbbeg (_ead .Params );if _gce !=nil {return _gce ;};_bcfg .moveText (_fbe ,_ddc );case "\u0054\u0044":if _dcb ,_ffge :=_bcfg .checkOp (_ead ,2,true );!_dcb {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ffge );
|
|
|
|
|
return _ffge ;};_bec ,_dab ,_bdg :=_gbbeg (_ead .Params );if _bdg !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bdg );return _bdg ;};_bcfg .moveTextSetLeading (_bec ,_dab );case "\u0054\u006a":if _ded ,_dcgg :=_bcfg .checkOp (_ead ,1,true );
|
|
|
|
|
!_ded {_gd .Log .Debug ("\u0045\u0052\u0052\u004fR:\u0020\u0054\u006a\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0065\u0072\u0072\u003d%\u0076",_ead ,_dcgg );return _dcgg ;};_bdcg ,_bdgd :=_eb .GetStringBytes (_ead .Params [0]);if !_bdgd {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a\u0020T\u006a\u0020o\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074S\u0074\u0072\u0069\u006e\u0067\u0042\u0079\u0074\u0065\u0073\u0020\u0066a\u0069\u006c\u0065\u0064",_ead );
|
|
|
|
|
return _eb .ErrTypeError ;};return _bcfg .showText (_bdcg );case "\u0054\u004a":if _bcfc ,_ac :=_bcfg .checkOp (_ead ,1,true );!_bcfc {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ac );return _ac ;
|
|
|
|
|
};_ggd ,_aed :=_eb .GetArray (_ead .Params [0]);if !_aed {_gd .Log .Debug ("\u0045\u0052\u0052OR\u003a\u0020\u0054\u004a\u0020\u006f\u0070\u003d\u0025s\u0020G\u0065t\u0041r\u0072\u0061\u0079\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ead );
|
|
|
|
|
return _bdc ;};return _bcfg .showTextAdjusted (_ggd );case "\u0027":if _dbfd ,_cae :=_bcfg .checkOp (_ead ,1,true );!_dbfd {_gd .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0027\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cae );return _cae ;};_dgg ,_eab :=_eb .GetStringBytes (_ead .Params [0]);
|
|
|
|
|
if !_eab {_gd .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020'\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ead );return _eb .ErrTypeError ;};_bcfg .nextLine ();return _bcfg .showText (_dgg );
|
|
|
|
|
case "\u0022":if _gbc ,_ddfa :=_bcfg .checkOp (_ead ,3,true );!_gbc {_gd .Log .Debug ("\u0045R\u0052O\u0052\u003a\u0020\u0022\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ddfa );return _ddfa ;};_bad ,_gcge ,_dba :=_gbbeg (_ead .Params [:2]);if _dba !=nil {return _dba ;
|
|
|
|
|
};_adff ,_dafc :=_eb .GetStringBytes (_ead .Params [2]);if !_dafc {_gd .Log .Debug ("\u0045\u0052RO\u0052\u003a\u0020\"\u0020\u006f\u0070\u003d%s \u0047et\u0053\u0074\u0072\u0069\u006e\u0067\u0042yt\u0065\u0073\u0020\u0066\u0061\u0069\u006ce\u0064",_ead );
|
|
|
|
|
return _eb .ErrTypeError ;};_bcfg .setCharSpacing (_bad );_bcfg .setWordSpacing (_gcge );_bcfg .nextLine ();return _bcfg .showText (_adff );case "\u0054\u004c":_gcga ,_add :=_bfad (_ead );if _add !=nil {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u004c\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_add );
|
|
|
|
|
return _add ;};_bcfg .setTextLeading (_gcga );case "\u0054\u0063":_dff ,_ddg :=_bfad (_ead );if _ddg !=nil {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0063\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ddg );return _ddg ;};_bcfg .setCharSpacing (_dff );
|
|
|
|
|
case "\u0054\u0066":if _gea ,_gfc :=_bcfg .checkOp (_ead ,2,true );!_gea {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0066\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gfc );return _gfc ;};_dbb ,_ab :=_eb .GetNameVal (_ead .Params [0]);if !_ab {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a \u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u004ea\u006d\u0065\u0056\u0061\u006c\u0020\u0066a\u0069\u006c\u0065\u0064",_ead );
|
|
|
|
|
return _eb .ErrTypeError ;};_dfg ,_eabg :=_eb .GetNumberAsFloat (_ead .Params [1]);if !_ab {_gd .Log .Debug ("\u0045\u0052\u0052O\u0052\u003a\u0020\u0054\u0066\u0020\u006f\u0070\u003d\u0025\u0073\u0020\u0047\u0065\u0074\u0046\u006c\u006f\u0061\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065d\u002e\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_ead ,_eabg );
|
|
|
|
|
return _eabg ;};_eabg =_bcfg .setFont (_dbb ,_dfg );_bcfg ._cde =_dc .Is (_eabg ,_eb .ErrNotSupported );if _eabg !=nil &&!_bcfg ._cde {return _eabg ;};case "\u0054\u006d":if _cdab ,_baf :=_bcfg .checkOp (_ead ,6,true );!_cdab {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u006d\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_baf );
|
|
|
|
|
return _baf ;};_ggb ,_geae :=_eb .GetNumbersAsFloat (_ead .Params );if _geae !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_geae );return _geae ;};_bcfg .setTextMatrix (_ggb );case "\u0054\u0072":if _aef ,_cgfe :=_bcfg .checkOp (_ead ,1,true );
|
|
|
|
|
!_aef {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0072\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_cgfe );return _cgfe ;};_agc ,_gfb :=_eb .GetIntVal (_ead .Params [0]);if !_gfb {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0072\u0020\u006f\u0070\u003d\u0025\u0073 \u0047e\u0074\u0049\u006e\u0074\u0056\u0061\u006c\u0020\u0066\u0061\u0069\u006c\u0065\u0064",_ead );
|
|
|
|
|
return _eb .ErrTypeError ;};_bcfg .setTextRenderMode (_agc );case "\u0054\u0073":if _fcf ,_dgd :=_bcfg .checkOp (_ead ,1,true );!_fcf {_gd .Log .Debug ("\u0045\u0052R\u004f\u0052\u003a \u0054\u0073\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_dgd );return _dgd ;
|
|
|
|
|
};_dgf ,_gdb :=_eb .GetNumberAsFloat (_ead .Params [0]);if _gdb !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_gdb );return _gdb ;};_bcfg .setTextRise (_dgf );case "\u0054\u0077":if _abf ,_bca :=_bcfg .checkOp (_ead ,1,true );
|
|
|
|
|
!_abf {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bca );return _bca ;};_bdb ,_bfc :=_eb .GetNumberAsFloat (_ead .Params [0]);if _bfc !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_bfc );
|
|
|
|
|
return _bfc ;};_bcfg .setWordSpacing (_bdb );case "\u0054\u007a":if _bg ,_efc :=_bcfg .checkOp (_ead ,1,true );!_bg {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_efc );return _efc ;};_aga ,_fed :=_eb .GetNumberAsFloat (_ead .Params [0]);
|
|
|
|
|
if _fed !=nil {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0072\u0072\u003d\u0025\u0076",_fed );return _fed ;};_bcfg .setHorizScaling (_aga );case "\u0063\u006d":_bfa ._ffc =_fdg .CTM ;if _bfa ._ffc .Singular (){_bdbd :=_ff .IdentityMatrix ().Translate (_bfa ._ffc .Translation ());
|
|
|
|
|
_gd .Log .Debug ("S\u0069n\u0067\u0075\u006c\u0061\u0072\u0020\u0063\u0074m\u003d\u0025\u0073\u2192%s",_bfa ._ffc ,_bdbd );_bfa ._ffc =_bdbd ;};if _egfa {_gd .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_bfa ._ffc );};case "\u006d":if len (_ead .Params )!=2{_gd .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006d\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_cd );
|
|
|
|
|
return nil ;};_faa ,_ebd :=_eb .GetNumbersAsFloat (_ead .Params );if _ebd !=nil {return _ebd ;};_gd .Log .Debug ("\u004d\u006f\u0076\u0065\u0020\u0074\u006f\u003a\u0020\u0025\u002e\u0032\u0066",_faa );_bfa .moveTo (_faa [0],_faa [1]);case "\u006c":if len (_ead .Params )!=2{_gd .Log .Debug ("\u0057\u0041\u0052\u004e\u003a\u0020\u0065\u0072\u0072o\u0072\u0020\u0077\u0068\u0069\u006c\u0065\u0020\u0070\u0072\u006f\u0063\u0065\u0073\u0073\u0069\u006e\u0067\u0020\u0060\u006c\u0060\u0020o\u0070\u0065r\u0061\u0074o\u0072\u003a\u0020\u0025\u0076\u002e\u0020\u004f\u0075\u0074\u0070\u0075\u0074 m\u0061\u0079\u0020\u0062\u0065\u0020\u0069\u006e\u0063o\u0072\u0072\u0065\u0063\u0074\u002e",_cd );
|
|
|
|
|
return nil ;};_abg ,_bef :=_eb .GetNumbersAsFloat (_ead .Params );if _bef !=nil {return _bef ;};_bfa .lineTo (_abg [0],_abg [1]);case "\u0063":if len (_ead .Params )!=6{return _cd ;};_bcae ,_cfaf :=_eb .GetNumbersAsFloat (_ead .Params );if _cfaf !=nil {return _cfaf ;
|
|
|
|
|
};_gd .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_bcae );_bfa .cubicTo (_bcae [0],_bcae [1],_bcae [2],_bcae [3],_bcae [4],_bcae [5]);case "\u0076","\u0079":if len (_ead .Params )!=4{return _cd ;
|
|
|
|
|
};_aee ,_fee :=_eb .GetNumbersAsFloat (_ead .Params );if _fee !=nil {return _fee ;};_gd .Log .Debug ("\u0043u\u0062\u0069\u0063\u0020b\u0065\u007a\u0069\u0065\u0072 \u0070a\u0072a\u006d\u0073\u003a\u0020\u0025\u002e\u0032f",_aee );_bfa .quadraticTo (_aee [0],_aee [1],_aee [2],_aee [3]);
|
|
|
|
|
case "\u0068":_bfa .closePath ();case "\u0072\u0065":if len (_ead .Params )!=4{return _cd ;};_bbb ,_fcgf :=_eb .GetNumbersAsFloat (_ead .Params );if _fcgf !=nil {return _fcgf ;};_bfa .drawRectangle (_bbb [0],_bbb [1],_bbb [2],_bbb [3]);_bfa .closePath ();
|
|
|
|
|
case "\u0053":_bfa .stroke (&_fbb ._bade );_bfa .clearPath ();case "\u0073":_bfa .closePath ();_bfa .stroke (&_fbb ._bade );_bfa .clearPath ();case "\u0046":_bfa .fill (&_fbb ._efg );_bfa .clearPath ();case "\u0066","\u0066\u002a":_bfa .closePath ();_bfa .fill (&_fbb ._efg );
|
|
|
|
|
_bfa .clearPath ();case "\u0042","\u0042\u002a":_bfa .fill (&_fbb ._efg );_bfa .stroke (&_fbb ._bade );_bfa .clearPath ();case "\u0062","\u0062\u002a":_bfa .closePath ();_bfa .fill (&_fbb ._efg );_bfa .stroke (&_fbb ._bade );_bfa .clearPath ();case "\u006e":_bfa .clearPath ();
|
|
|
|
|
case "\u0044\u006f":if len (_ead .Params )==0{_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0065\u0078\u0070\u0065\u0063\u0074\u0065\u0064\u0020\u0058\u004fbj\u0065c\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006f\u0070\u0065\u0072\u0061n\u0064\u0020\u0066\u006f\u0072\u0020\u0044\u006f\u0020\u006f\u0070\u0065\u0072\u0061\u0074\u006f\u0072.\u0020\u0047\u006f\u0074\u0020\u0025\u002b\u0076\u002e",_ead .Params );
|
|
|
|
|
return _eb .ErrRangeError ;};_dfad ,_aca :=_eb .GetName (_ead .Params [0]);if !_aca {_gd .Log .Debug ("\u0045\u0052\u0052\u004f\u0052\u003a\u0020\u0069\u006e\u0076\u0061l\u0069\u0064\u0020\u0044\u006f\u0020\u006f\u0070e\u0072a\u0074\u006f\u0072\u0020\u0058\u004f\u0062\u006a\u0065\u0063\u0074\u0020\u006e\u0061\u006d\u0065\u0020\u006fp\u0065\u0072\u0061\u006e\u0064\u003a\u0020\u0025\u002b\u0076\u002e",_ead .Params [0]);
|
|
|
|
|
return _eb .ErrTypeError ;};_ ,_ged :=_egc .GetXObjectByName (*_dfad );if _ged !=_bc .XObjectTypeForm {break ;};_bce ,_aca :=_gcf ._dg [_dfad .String ()];if !_aca {_agg ,_eda :=_egc .GetXObjectFormByName (*_dfad );if _eda !=nil {_gd .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_eda );
|
|
|
|
|
return _eda ;};_dbfa ,_eda :=_agg .GetContentStream ();if _eda !=nil {_gd .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_eda );return _eda ;};_gde :=_agg .Resources ;if _gde ==nil {_gde =_egc ;};_gcd ,_cgb ,_gede ,_eda :=_gcf .extractPageText (string (_dbfa ),_gde ,_aaa .Mult (_fdg .CTM ),_afe +1);
|
|
|
|
|
if _eda !=nil {_gd .Log .Debug ("\u0045R\u0052\u004f\u0052\u003a\u0020\u0025v",_eda );return _eda ;};_bce =textResult {*_gcd ,_cgb ,_gede };_gcf ._dg [_dfad .String ()]=_bce ;};_bfa ._ffc =_fdg .CTM ;if _egfa {_gd .Log .Info ("\u0063\u0074\u006d\u003d\u0025\u0073",_bfa ._ffc );
|
|
|
|
|
};_fbb ._ddgb =append (_fbb ._ddgb ,_bce ._dcgf ._ddgb ...);_fbb ._bade =append (_fbb ._bade ,_bce ._dcgf ._bade ...);_fbb ._efg =append (_fbb ._efg ,_bce ._dcgf ._efg ...);_dda ._cdd +=_bce ._gdfe ;_dda ._ceab +=_bce ._ecac ;case "\u0072\u0067","\u0067","\u006b","\u0063\u0073","\u0073\u0063","\u0073\u0063\u006e":_bcfg ._faf .ColorspaceNonStroking =_fdg .ColorspaceNonStroking ;
|
|
|
|
|
_bcfg ._faf .ColorNonStroking =_fdg .ColorNonStroking ;case "\u0052\u0047","\u0047","\u004b","\u0043\u0053","\u0053\u0043","\u0053\u0043\u004e":_bcfg ._faf .ColorspaceStroking =_fdg .ColorspaceStroking ;_bcfg ._faf .ColorStroking =_fdg .ColorStroking ;
|
|
|
|
|
};return nil ;});_bdc =_eed .Process (_cff );return _fbb ,_dda ._cdd ,_dda ._ceab ,_bdc ;};func _cdgd (_abge []*textMark ,_bbcb _bc .PdfRectangle ,_dge rulingList ,_ecbe []gridTiling )paraList {_gd .Log .Trace ("\u006d\u0061\u006b\u0065\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065\u003a \u0025\u0064\u0020\u0065\u006c\u0065m\u0065\u006e\u0074\u0073\u0020\u0070\u0061\u0067\u0065\u0053\u0069\u007a\u0065=\u0025\u002e\u0032\u0066",len (_abge ),_bbcb );
|
|
|
|
|
if len (_abge )==0{return nil ;};_bed :=_cfagb (_abge ,_bbcb );if len (_bed )==0{return nil ;};_dge .log ("\u006d\u0061\u006be\u0054\u0065\u0078\u0074\u0050\u0061\u0067\u0065");_eeee ,_bafb :=_dge .vertsHorzs ();_efcc :=_gaegc (_bed ,_bbcb .Ury ,_eeee ,_bafb );
|
|
|
|
|
_fcfd :=_efbb (_efcc ,_bbcb .Ury ,_eeee ,_bafb );_fcfd =_gfbd (_fcfd );_afgg :=make (paraList ,0,len (_fcfd ));for _ ,_cdeg :=range _fcfd {_cabe :=_cdeg .arrangeText ();if _cabe !=nil {_afgg =append (_afgg ,_cabe );};};if len (_afgg )>=_ceaeb {_afgg =_afgg .extractTables (_ecbe );
|
|
|
|
|
};_afgg .sortReadingOrder ();_afgg .log ("\u0073\u006f\u0072te\u0064\u0020\u0069\u006e\u0020\u0072\u0065\u0061\u0064\u0069\u006e\u0067\u0020\u006f\u0072\u0064\u0065\u0072");return _afgg ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a string describing `pt`.
|
|
|
|
|
func (_bcea PageText )String ()string {_ebgg :=_ae .Sprintf ("P\u0061\u0067\u0065\u0054ex\u0074:\u0020\u0025\u0064\u0020\u0065l\u0065\u006d\u0065\u006e\u0074\u0073",len (_bcea ._ddgb ));_gaa :=[]string {"\u002d"+_ebgg };for _ ,_ccbe :=range _bcea ._ddgb {_gaa =append (_gaa ,_ccbe .String ());
|
|
|
|
|
};_gaa =append (_gaa ,"\u002b"+_ebgg );return _a .Join (_gaa ,"\u000a");};type textObject struct{_aac *Extractor ;_dcgb *_bc .PdfPageResources ;_faf _de .GraphicsState ;_efb *textState ;_dcd *stateStack ;_cdf _ff .Matrix ;_ggdd _ff .Matrix ;_ddeg []*textMark ;
|
|
|
|
|
_cde bool ;};func (_degc rulingList )connections (_gabf map[int ]intSet ,_ddab int )intSet {_afdd :=make (intSet );_cebf :=make (intSet );var _badd func (int );_badd =func (_gbcgb int ){if !_cebf .has (_gbcgb ){_cebf .add (_gbcgb );for _ceagg :=range _degc {if _gabf [_ceagg ].has (_gbcgb ){_afdd .add (_ceagg );
|
|
|
|
|
};};for _ffgef :=range _degc {if _afdd .has (_ffgef ){_badd (_ffgef );};};};};_badd (_ddab );return _afdd ;};func (_fc *imageExtractContext )processOperand (_gc *_de .ContentStreamOperation ,_gcg _de .GraphicsState ,_db *_bc .PdfPageResources )error {if _gc .Operand =="\u0042\u0049"&&len (_gc .Params )==1{_bcf ,_gdfb :=_gc .Params [0].(*_de .ContentStreamInlineImage );
|
|
|
|
|
if !_gdfb {return nil ;};if _ffd ,_bd :=_eb .GetBoolVal (_bcf .ImageMask );_bd {if _ffd &&!_fc ._cgf .IncludeInlineStencilMasks {return nil ;};};return _fc .extractInlineImage (_bcf ,_gcg ,_db );}else if _gc .Operand =="\u0044\u006f"&&len (_gc .Params )==1{_ag ,_gda :=_eb .GetName (_gc .Params [0]);
|
|
|
|
|
if !_gda {_gd .Log .Debug ("E\u0052\u0052\u004f\u0052\u003a\u0020\u0054\u0079\u0070\u0065");return _fa ;};_ ,_fcc :=_db .GetXObjectByName (*_ag );switch _fcc {case _bc .XObjectTypeImage :return _fc .extractXObjectImage (_ag ,_gcg ,_db );case _bc .XObjectTypeForm :return _fc .extractFormImages (_ag ,_gcg ,_db );
|
|
|
|
|
};};return nil ;};func (_decb *subpath )isQuadrilateral ()bool {if len (_decb ._cfac )< 4||len (_decb ._cfac )> 5{return false ;};if len (_decb ._cfac )==5{_ebec :=_decb ._cfac [0];_cdbg :=_decb ._cfac [4];if _ebec .X !=_cdbg .X ||_ebec .Y !=_cdbg .Y {return false ;
|
|
|
|
|
};};return true ;};func _fgbd (_aaacg ,_fdfg _ff .Point )bool {return _aaacg .X ==_fdfg .X &&_aaacg .Y ==_fdfg .Y };func (_dabe *textPara )depth ()float64 {if _dabe ._fbed {return -1.0;};if len (_dabe ._bcca )> 0{return _dabe ._bcca [0]._cedfb ;};return _dabe ._egea .depth ();
|
|
|
|
|
};func (_cee *subpath )clear (){*_cee =subpath {}};func (_edba paraList )addNeighbours (){_dcgfe :=func (_feccb []int ,_gdea *textPara )([]*textPara ,[]*textPara ){_afcbg :=make ([]*textPara ,0,len (_feccb )-1);_ffdb :=make ([]*textPara ,0,len (_feccb )-1);
|
|
|
|
|
for _ ,_fdfda :=range _feccb {_gddbb :=_edba [_fdfda ];if _gddbb .Urx <=_gdea .Llx {_afcbg =append (_afcbg ,_gddbb );}else if _gddbb .Llx >=_gdea .Urx {_ffdb =append (_ffdb ,_gddbb );};};return _afcbg ,_ffdb ;};_gdcba :=func (_ceebd []int ,_cgbb *textPara )([]*textPara ,[]*textPara ){_cdebe :=make ([]*textPara ,0,len (_ceebd )-1);
|
|
|
|
|
_cfbgc :=make ([]*textPara ,0,len (_ceebd )-1);for _ ,_febfa :=range _ceebd {_aceaa :=_edba [_febfa ];if _aceaa .Ury <=_cgbb .Lly {_cfbgc =append (_cfbgc ,_aceaa );}else if _aceaa .Lly >=_cgbb .Ury {_cdebe =append (_cdebe ,_aceaa );};};return _cdebe ,_cfbgc ;
|
|
|
|
|
};_cabb :=_edba .yNeighbours (_dcae );for _ ,_ddagd :=range _edba {_bedg :=_cabb [_ddagd ];if len (_bedg )==0{continue ;};_bdca ,_acaef :=_dcgfe (_bedg ,_ddagd );if len (_bdca )==0&&len (_acaef )==0{continue ;};if len (_bdca )> 0{_eacbe :=_bdca [0];for _ ,_ebgd :=range _bdca [1:]{if _ebgd .Urx >=_eacbe .Urx {_eacbe =_ebgd ;
|
|
|
|
|
};};for _ ,_afced :=range _bdca {if _afced !=_eacbe &&_afced .Urx > _eacbe .Llx {_eacbe =nil ;break ;};};if _eacbe !=nil &&_gece (_ddagd .PdfRectangle ,_eacbe .PdfRectangle ){_ddagd ._dbfb =_eacbe ;};};if len (_acaef )> 0{_cdaac :=_acaef [0];for _ ,_gdfdga :=range _acaef [1:]{if _gdfdga .Llx <=_cdaac .Llx {_cdaac =_gdfdga ;
|
|
|
|
|
};};for _ ,_cbcd :=range _acaef {if _cbcd !=_cdaac &&_cbcd .Llx < _cdaac .Urx {_cdaac =nil ;break ;};};if _cdaac !=nil &&_gece (_ddagd .PdfRectangle ,_cdaac .PdfRectangle ){_ddagd ._fefc =_cdaac ;};};};_cabb =_edba .xNeighbours (_eadd );for _ ,_bbaaa :=range _edba {_cagdd :=_cabb [_bbaaa ];
|
|
|
|
|
if len (_cagdd )==0{continue ;};_feba ,_dedc :=_gdcba (_cagdd ,_bbaaa );if len (_feba )==0&&len (_dedc )==0{continue ;};if len (_dedc )> 0{_dcbbc :=_dedc [0];for _ ,_ddba :=range _dedc [1:]{if _ddba .Ury >=_dcbbc .Ury {_dcbbc =_ddba ;};};for _ ,_cbcag :=range _dedc {if _cbcag !=_dcbbc &&_cbcag .Ury > _dcbbc .Lly {_dcbbc =nil ;
|
|
|
|
|
break ;};};if _dcbbc !=nil &&_bcgdg (_bbaaa .PdfRectangle ,_dcbbc .PdfRectangle ){_bbaaa ._eebd =_dcbbc ;};};if len (_feba )> 0{_feaa :=_feba [0];for _ ,_dfcc :=range _feba [1:]{if _dfcc .Lly <=_feaa .Lly {_feaa =_dfcc ;};};for _ ,_adagc :=range _feba {if _adagc !=_feaa &&_adagc .Lly < _feaa .Ury {_feaa =nil ;
|
|
|
|
|
break ;};};if _feaa !=nil &&_bcgdg (_bbaaa .PdfRectangle ,_feaa .PdfRectangle ){_bbaaa ._ffeb =_feaa ;};};};for _ ,_gdfa :=range _edba {if _gdfa ._dbfb !=nil &&_gdfa ._dbfb ._fefc !=_gdfa {_gdfa ._dbfb =nil ;};if _gdfa ._ffeb !=nil &&_gdfa ._ffeb ._eebd !=_gdfa {_gdfa ._ffeb =nil ;
|
|
|
|
|
};if _gdfa ._fefc !=nil &&_gdfa ._fefc ._dbfb !=_gdfa {_gdfa ._fefc =nil ;};if _gdfa ._eebd !=nil &&_gdfa ._eebd ._ffeb !=_gdfa {_gdfa ._eebd =nil ;};};};func _egfb (_gcgaf []pathSection )rulingList {_agbd (_gcgaf );if _acea {_gd .Log .Info ("\u006d\u0061k\u0065\u0053\u0074\u0072\u006f\u006b\u0065\u0052\u0075\u006c\u0069\u006e\u0067\u0073\u003a\u0020\u0025\u0064\u0020\u0073\u0074\u0072ok\u0065\u0073",len (_gcgaf ));
|
|
|
|
|
};var _cfdf rulingList ;for _ ,_eebgf :=range _gcgaf {for _ ,_efab :=range _eebgf ._bbd {if len (_efab ._cfac )< 2{continue ;};_ebgcf :=_efab ._cfac [0];for _ ,_eaba :=range _efab ._cfac [1:]{if _gacgf ,_befc :=_agfc (_ebgcf ,_eaba ,_eebgf .Color );_befc {_cfdf =append (_cfdf ,_gacgf );
|
|
|
|
|
};_ebgcf =_eaba ;};};};if _acea {_gd .Log .Info ("m\u0061\u006b\u0065\u0053tr\u006fk\u0065\u0052\u0075\u006c\u0069n\u0067\u0073\u003a\u0020\u0025\u0073",_cfdf );};return _cfdf ;};func _aaacd (_dcgdd []compositeCell )[]float64 {var _affb []*textLine ;_dccag :=0;
|
|
|
|
|
for _ ,_gabgg :=range _dcgdd {_dccag +=len (_gabgg .paraList );_affb =append (_affb ,_gabgg .lines ()...);};_b .Slice (_affb ,func (_cdfgc ,_befff int )bool {_dacbfe ,_aecda :=_affb [_cdfgc ],_affb [_befff ];_eacgb ,_gfac :=_dacbfe ._cedfb ,_aecda ._cedfb ;
|
|
|
|
|
if !_gfbdg (_eacgb -_gfac ){return _eacgb < _gfac ;};return _dacbfe .Llx < _aecda .Llx ;});if _gcdc {_ae .Printf ("\u0020\u0020\u0020 r\u006f\u0077\u0042\u006f\u0072\u0064\u0065\u0072\u0073:\u0020%\u0064 \u0070a\u0072\u0061\u0073\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",_dccag ,len (_affb ));
|
|
|
|
|
for _dcdf ,_ccccc :=range _affb {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_dcdf ,_ccccc );};};var _dbef []float64 ;_cfda :=_affb [0];var _cefg [][]*textLine ;_dead :=[]*textLine {_cfda };for _cbaed ,_eeaad :=range _affb [1:]{if _eeaad .Ury < _cfda .Lly {_ebdd :=0.5*(_eeaad .Ury +_cfda .Lly );
|
|
|
|
|
if _gcdc {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u0020\u003c\u0020\u0025\u0036.\u0032f\u0020\u0062\u006f\u0072\u0064\u0065\u0072\u003d\u0025\u0036\u002e\u0032\u0066\u000a"+"\u0009\u0020\u0071\u003d\u0025\u0073\u000a\u0009\u0020p\u003d\u0025\u0073\u000a",_cbaed ,_eeaad .Ury ,_cfda .Lly ,_ebdd ,_cfda ,_eeaad );
|
|
|
|
|
};_dbef =append (_dbef ,_ebdd );_cefg =append (_cefg ,_dead );_dead =nil ;};_dead =append (_dead ,_eeaad );if _eeaad .Lly < _cfda .Lly {_cfda =_eeaad ;};};if len (_dead )> 0{_cefg =append (_cefg ,_dead );};if _gcdc {_ae .Printf (" \u0020\u0020\u0020\u0020\u0020\u0020 \u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064o\u0072\u0073\u003d%\u0036.\u0032\u0066\u000a",_dbef );
|
|
|
|
|
};if _gcdc {_gd .Log .Info ("\u0072\u006f\u0077\u003d\u0025\u0064",len (_dcgdd ));for _abca ,_edeeb :=range _dcgdd {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_abca ,_edeeb );};_gd .Log .Info ("\u0067r\u006f\u0075\u0070\u0073\u003d\u0025d",len (_cefg ));
|
|
|
|
|
for _abfed ,_ffgfe :=range _cefg {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0064\u000a",_abfed ,len (_ffgfe ));for _aafg ,_gabag :=range _ffgfe {_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_aafg ,_gabag );};};};_bbca :=true ;
|
|
|
|
|
for _fbafe ,_bacbcd :=range _cefg {_gecf :=true ;for _defec ,_afgc :=range _dcgdd {if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u007e\u007e\u007e\u0067\u0072\u006f\u0075\u0070\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064\u0020\u0063\u0065\u006cl\u0020\u0025\u0064\u0020\u006ff\u0020\u0025d\u0020\u0025\u0073\u000a",_fbafe ,len (_cefg ),_defec ,len (_dcgdd ),_afgc );
|
|
|
|
|
};if !_afgc .hasLines (_bacbcd ){if _gcdc {_ae .Printf ("\u0020\u0020\u0020\u0021\u0021\u0021\u0067\u0072\u006f\u0075\u0070\u0020\u0025d\u0020\u006f\u0066\u0020\u0025\u0064 \u0063\u0065\u006c\u006c\u0020\u0025\u0064\u0020\u006f\u0066\u0020\u0025\u0064 \u004f\u0055\u0054\u000a",_fbafe ,len (_cefg ),_defec ,len (_dcgdd ));
|
|
|
|
|
};_gecf =false ;break ;};};if !_gecf {_bbca =false ;break ;};};if !_bbca {if _gcdc {_gd .Log .Info ("\u0072\u006f\u0077\u0020\u0063o\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u0020\u0064\u006f\u006e\u0027\u0074 \u0073\u0070\u0061\u006e\u0020\u0061\u006c\u006c\u0020\u0063\u0065\u006c\u006c\u0073\u0020\u0069\u006e\u0020\u0072\u006f\u0077\u002e\u0020\u0069\u0067\u006e\u006f\u0072\u0069\u006eg");
|
|
|
|
|
};_dbef =nil ;};if _gcdc &&_dbef !=nil {_ae .Printf ("\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u002a\u002a*\u0072\u006f\u0077\u0043\u006f\u0072\u0072i\u0064\u006f\u0072\u0073\u003d\u0025\u0036\u002e\u0032\u0066\u000a",_dbef );};return _dbef ;};
|
|
|
|
|
func (_bcfd compositeCell )String ()string {_cbcf :="";if len (_bcfd .paraList )> 0{_cbcf =_eaed (_bcfd .paraList .merge ().text (),50);};return _ae .Sprintf ("\u0025\u0036\u002e\u0032\u0066\u0020\u0025\u0064\u0020\u0070\u0061\u0072a\u0073\u0020\u0025\u0071",_bcfd .PdfRectangle ,len (_bcfd .paraList ),_cbcf );
|
|
|
|
|
};type textResult struct{_dcgf PageText ;_gdfe int ;_ecac int ;};func _aagb (_fbefb []float64 ,_caaff ,_effgc float64 )[]float64 {_fdef ,_ebgce :=_caaff ,_effgc ;if _ebgce < _fdef {_fdef ,_ebgce =_ebgce ,_fdef ;};_edadf :=make ([]float64 ,0,len (_fbefb )+2);
|
|
|
|
|
_edadf =append (_edadf ,_caaff );for _ ,_egged :=range _fbefb {if _egged <=_fdef {continue ;}else if _egged >=_ebgce {break ;};_edadf =append (_edadf ,_egged );};_edadf =append (_edadf ,_effgc );return _edadf ;};func (_cdfb *shapesState )establishSubpath ()*subpath {_effg ,_bbc :=_cdfb .lastpointEstablished ();
|
|
|
|
|
if !_bbc {_cdfb ._cbdd =append (_cdfb ._cbdd ,_abfba (_effg ));};if len (_cdfb ._cbdd )==0{return nil ;};_cdfb ._bbfc =false ;return _cdfb ._cbdd [len (_cdfb ._cbdd )-1];};func (_egde *textPara )writeCellText (_ggfb _e .Writer ){for _gedd ,_edec :=range _egde ._bcca {_eaga :=_edec .text ();
|
|
|
|
|
_ecdd :=_gacf &&_edec .endsInHyphen ()&&_gedd !=len (_egde ._bcca )-1;if _ecdd {_eaga =_ebfa (_eaga );};_ggfb .Write ([]byte (_eaga ));if !(_ecdd ||_gedd ==len (_egde ._bcca )-1){_ggfb .Write ([]byte (_gfebf (_edec ._cedfb ,_egde ._bcca [_gedd +1]._cedfb )));
|
|
|
|
|
};};};type rectRuling struct{_gggf rulingKind ;_cdfc markKind ;_fd .Color ;_bc .PdfRectangle ;};func _gecgd (_abbda map[int ]intSet )[]int {_cbce :=make ([]int ,0,len (_abbda ));for _ddbde :=range _abbda {_cbce =append (_cbce ,_ddbde );};_b .Ints (_cbce );
|
|
|
|
|
return _cbce ;};func (_begbe *textTable )reduceTiling (_adcg gridTiling ,_gfee float64 )*textTable {_efefe :=make ([]int ,0,_begbe ._bfbba );_dccac :=make ([]int ,0,_begbe ._cacec );_efcd :=_adcg ._defe ;_gcde :=_adcg ._afbc ;for _gbdeg :=0;_gbdeg < _begbe ._bfbba ;
|
|
|
|
|
_gbdeg ++{_fdcc :=_gbdeg > 0&&_f .Abs (_gcde [_gbdeg -1]-_gcde [_gbdeg ])< _gfee &&_begbe .emptyRow (_gbdeg );if !_fdcc {_efefe =append (_efefe ,_gbdeg );};};for _geed :=0;_geed < _begbe ._cacec ;_geed ++{_eacgd :=_geed < _begbe ._cacec -1&&_f .Abs (_efcd [_geed +1]-_efcd [_geed ])< _gfee &&_begbe .emptyColumn (_geed );
|
|
|
|
|
if !_eacgd {_dccac =append (_dccac ,_geed );};};if len (_efefe )==_begbe ._bfbba &&len (_dccac )==_begbe ._cacec {return _begbe ;};_cebg :=textTable {_dcbbg :_begbe ._dcbbg ,_cacec :len (_dccac ),_bfbba :len (_efefe ),_ffgge :make (map[uint64 ]compositeCell ,len (_dccac )*len (_efefe ))};
|
|
|
|
|
if _gcdc {_gd .Log .Info ("\u0072\u0065\u0064\u0075c\u0065\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0025d\u0078%\u0064\u0020\u002d\u003e\u0020\u0025\u0064x\u0025\u0064",_begbe ._cacec ,_begbe ._bfbba ,len (_dccac ),len (_efefe ));_gd .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0043\u006f\u006c\u0073\u003a\u0020\u0025\u002b\u0076",_dccac );
|
|
|
|
|
_gd .Log .Info ("\u0072\u0065d\u0075\u0063\u0065d\u0052\u006f\u0077\u0073\u003a\u0020\u0025\u002b\u0076",_efefe );};for _abgf ,_bacc :=range _efefe {for _gcabe ,_cgcg :=range _dccac {_efgb ,_fbfb :=_begbe .getComposite (_cgcg ,_bacc );if len (_efgb )==0{continue ;
|
|
|
|
|
};if _gcdc {_ae .Printf ("\u0020 \u0025\u0032\u0064\u002c \u0025\u0032\u0064\u0020\u0028%\u0032d\u002c \u0025\u0032\u0064\u0029\u0020\u0025\u0071\n",_gcabe ,_abgf ,_cgcg ,_bacc ,_eaed (_efgb .merge ().text (),50));};_cebg .putComposite (_gcabe ,_abgf ,_efgb ,_fbfb );
|
|
|
|
|
};};return &_cebg ;};func (_acfg *ruling )equals (_cgbeb *ruling )bool {return _acfg ._beb ==_cgbeb ._beb &&_fafg (_acfg ._aadb ,_cgbeb ._aadb )&&_fafg (_acfg ._eacb ,_cgbeb ._eacb )&&_fafg (_acfg ._cbf ,_cgbeb ._cbf );};func (_fgbb rulingList )asTiling ()gridTiling {if _gcace {_gd .Log .Info ("r\u0075\u006ci\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0063s\u003d\u0025\u0064\u0020\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u002b\u002b\u002b\u0020\u003d\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d=\u003d",len (_fgbb ));
|
|
|
|
|
};for _dbe ,_bdfc :=range _fgbb [1:]{_ddcbe :=_fgbb [_dbe ];if _ddcbe .alignsPrimary (_bdfc )&&_ddcbe .alignsSec (_bdfc ){_gd .Log .Error ("a\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0044\u0075\u0070\u006c\u0069\u0063\u0061\u0074\u0065 \u0072\u0075\u006c\u0069\u006e\u0067\u0073\u002e\u000a\u0009v=\u0025\u0073\u000a\t\u0077=\u0025\u0073",_bdfc ,_ddcbe );
|
|
|
|
|
};};_fgbb .sortStrict ();_fgbb .log ("\u0073n\u0061\u0070\u0070\u0065\u0064");_ebaad ,_dagcg :=_fgbb .vertsHorzs ();_edgab :=_ebaad .primaries ();_gged :=_dagcg .primaries ();_egef :=len (_edgab )-1;_cgcdf :=len (_gged )-1;if _egef ==0||_cgcdf ==0{return gridTiling {};
|
|
|
|
|
};_cfgf :=_bc .PdfRectangle {Llx :_edgab [0],Urx :_edgab [_egef ],Lly :_gged [0],Ury :_gged [_cgcdf ]};if _gcace {_gd .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0076\u0065\u0072\u0074s=\u0025\u0064",len (_ebaad ));
|
|
|
|
|
for _bbgf ,_bagf :=range _ebaad {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_bbgf ,_bagf );};_gd .Log .Info ("\u0072\u0075l\u0069\u006e\u0067\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067\u003a\u0020\u0068\u006f\u0072\u007as=\u0025\u0064",len (_dagcg ));
|
|
|
|
|
for _fcce ,_ebcb :=range _dagcg {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_fcce ,_ebcb );};_gd .Log .Info ("\u0072\u0075\u006c\u0069\u006eg\u004c\u0069\u0073\u0074\u002e\u0061\u0073\u0054\u0069\u006c\u0069\u006e\u0067:\u0020\u0020\u0077\u0078\u0068\u003d\u0025\u0064\u0078\u0025\u0064\u000a\u0009\u006c\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a\u0009\u006c\u006c\u0079\u003d\u0025\u002e\u0032f",_egef ,_cgcdf ,_edgab ,_gged );
|
|
|
|
|
};_cdbcg :=make ([]gridTile ,_egef *_cgcdf );for _dacbc :=_cgcdf -1;_dacbc >=0;_dacbc --{_cega :=_gged [_dacbc ];_fbda :=_gged [_dacbc +1];for _begeb :=0;_begeb < _egef ;_begeb ++{_bdaa :=_edgab [_begeb ];_egab :=_edgab [_begeb +1];_caee :=_ebaad .findPrimSec (_bdaa ,_cega );
|
|
|
|
|
_deba :=_ebaad .findPrimSec (_egab ,_cega );_adce :=_dagcg .findPrimSec (_cega ,_bdaa );_fdfa :=_dagcg .findPrimSec (_fbda ,_bdaa );_bcdea :=_bc .PdfRectangle {Llx :_bdaa ,Urx :_egab ,Lly :_cega ,Ury :_fbda };_gfba :=_begea (_bcdea ,_caee ,_deba ,_adce ,_fdfa );
|
|
|
|
|
_cdbcg [_dacbc *_egef +_begeb ]=_gfba ;if _gcace {_ae .Printf ("\u0020\u0020\u0078\u003d\u0025\u0032\u0064\u0020\u0079\u003d\u0025\u0032\u0064\u003a\u0020%\u0073 \u0025\u0036\u002e\u0032\u0066\u0020\u0078\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_begeb ,_dacbc ,_gfba .String (),_gfba .Width (),_gfba .Height ());
|
|
|
|
|
};};};if _gcace {_gd .Log .Info ("r\u0075\u006c\u0069\u006e\u0067\u004c\u0069\u0073\u0074.\u0061\u0073\u0054\u0069\u006c\u0069\u006eg:\u0020\u0063\u006f\u0061l\u0065\u0073\u0063\u0065\u0020\u0068\u006f\u0072\u0069zo\u006e\u0074a\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_cfgf );
|
|
|
|
|
};_cbef :=make ([]map[float64 ]gridTile ,_cgcdf );for _cdcd :=_cgcdf -1;_cdcd >=0;_cdcd --{if _gcace {_ae .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_cdcd );};_cbef [_cdcd ]=make (map[float64 ]gridTile ,_egef );for _gbfcb :=0;_gbfcb < _egef ;
|
|
|
|
|
_gbfcb ++{_aebc :=_cdbcg [_cdcd *_egef +_gbfcb ];if _gcace {_ae .Printf ("\u0020\u0020\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_gbfcb ,_aebc );};if !_aebc ._cgaag {continue ;};_bfdb :=_gbfcb ;for _edbda :=_gbfcb +1;!_aebc ._fbfa &&_edbda < _egef ;
|
|
|
|
|
_edbda ++{_cadb :=_cdbcg [_cdcd *_egef +_edbda ];_aebc .Urx =_cadb .Urx ;_aebc ._adfgd =_aebc ._adfgd ||_cadb ._adfgd ;_aebc ._ggfc =_aebc ._ggfc ||_cadb ._ggfc ;_aebc ._fbfa =_cadb ._fbfa ;if _gcace {_ae .Printf ("\u0020 \u0020%\u0034\u0064\u003a\u0020\u0025s\u0020\u2192 \u0025\u0073\u000a",_edbda ,_cadb ,_aebc );
|
|
|
|
|
};_bfdb =_edbda ;};if _gcace {_ae .Printf (" \u0020 \u0025\u0032\u0064\u0020\u002d\u0020\u0025\u0032d\u0020\u2192\u0020\u0025s\n",_gbfcb ,_bfdb ,_aebc );};_gbfcb =_bfdb ;_cbef [_cdcd ][_aebc .Llx ]=_aebc ;};};_ggeda :=make (map[float64 ]map[float64 ]gridTile ,_cgcdf );
|
|
|
|
|
_gfge :=make (map[float64 ]map[float64 ]struct{},_cgcdf );for _gddd :=_cgcdf -1;_gddd >=0;_gddd --{_ffgf :=_cdbcg [_gddd *_egef ].Lly ;_ggeda [_ffgf ]=make (map[float64 ]gridTile ,_egef );_gfge [_ffgf ]=make (map[float64 ]struct{},_egef );};if _gcace {_gd .Log .Info ("\u0072u\u006c\u0069n\u0067\u004c\u0069s\u0074\u002e\u0061\u0073\u0054\u0069\u006ci\u006e\u0067\u003a\u0020\u0063\u006fa\u006c\u0065\u0073\u0063\u0065\u0020\u0076\u0065\u0072\u0074\u0069c\u0061\u006c\u002e\u0020\u0025\u0036\u002e\u0032\u0066",_cfgf );
|
|
|
|
|
};for _dafe :=_cgcdf -1;_dafe >=0;_dafe --{_geeg :=_cdbcg [_dafe *_egef ].Lly ;_cgee :=_cbef [_dafe ];if _gcace {_ae .Printf ("\u0020\u0020\u0079\u003d\u0025\u0032\u0064\u000a",_dafe );};for _ ,_cdeb :=range _fcbee (_cgee ){if _ ,_eeaa :=_gfge [_geeg ][_cdeb ];
|
|
|
|
|
_eeaa {continue ;};_ggbf :=_cgee [_cdeb ];if _gcace {_ae .Printf (" \u0020\u0020\u0020\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_ggbf .String ());};for _ddaf :=_dafe -1;_ddaf >=0;_ddaf --{if _ggbf ._ggfc {break ;};_ccda :=_cbef [_ddaf ];_ffac ,_efec :=_ccda [_cdeb ];
|
|
|
|
|
if !_efec {break ;};if _ffac .Urx !=_ggbf .Urx {break ;};_ggbf ._ggfc =_ffac ._ggfc ;_ggbf .Lly =_ffac .Lly ;if _gcace {_ae .Printf ("\u0020\u0020\u0020\u0020 \u0020\u0020\u0076\u003d\u0025\u0073\u0020\u0076\u0030\u003d\u0025\u0073\u000a",_ffac .String (),_ggbf .String ());
|
|
|
|
|
};_gfge [_ffac .Lly ][_ffac .Llx ]=struct{}{};};if _dafe ==0{_ggbf ._ggfc =true ;};if _ggbf .complete (){_ggeda [_geeg ][_cdeb ]=_ggbf ;};};};_bgee :=gridTiling {PdfRectangle :_cfgf ,_defe :_fafda (_ggeda ),_afbc :_eaaca (_ggeda ),_gdgc :_ggeda };_bgee .log ("\u0043r\u0065\u0061\u0074\u0065\u0064");
|
|
|
|
|
return _bgee ;};func _bgeb (_dfbg ,_abdc ,_cecb float64 )rulingKind {if _dfbg >=_cecb &&_dbfcb (_abdc ,_dfbg ){return _dffd ;};if _abdc >=_cecb &&_dbfcb (_dfbg ,_abdc ){return _dddb ;};return _dgcb ;};func (_ccba rulingList )secMinMax ()(float64 ,float64 ){_bfec ,_ebcf :=_ccba [0]._eacb ,_ccba [0]._cbf ;
|
|
|
|
|
for _ ,_gedc :=range _ccba [1:]{if _gedc ._eacb < _bfec {_bfec =_gedc ._eacb ;};if _gedc ._cbf > _ebcf {_ebcf =_gedc ._cbf ;};};return _bfec ,_ebcf ;};func _dbfcb (_agba ,_eefd float64 )bool {return _agba /_f .Max (_gbgd ,_eefd )< _cccd };type stateStack []*textState ;
|
|
|
|
|
func (_dgbg *wordBag )scanBand (_eba string ,_ddfaa *wordBag ,_gag func (_cfeb *wordBag ,_cadf *textWord )bool ,_fcbb ,_caba ,_bdbe float64 ,_cdg ,_bdgb bool )int {_acbd :=_ddfaa ._cfec ;var _fafe map[int ]map[*textWord ]struct{};if !_cdg {_fafe =_dgbg .makeRemovals ();
|
|
|
|
|
};_cffe :=_bbfa *_acbd ;_cgbc :=0;for _ ,_efea :=range _dgbg .depthBand (_fcbb -_cffe ,_caba +_cffe ){if len (_dgbg ._aaaf [_efea ])==0{continue ;};for _ ,_beg :=range _dgbg ._aaaf [_efea ]{if !(_fcbb -_cffe <=_beg ._gdce &&_beg ._gdce <=_caba +_cffe ){continue ;
|
|
|
|
|
};if !_gag (_ddfaa ,_beg ){continue ;};_cdb :=2.0*_f .Abs (_beg ._edega -_ddfaa ._cfec )/(_beg ._edega +_ddfaa ._cfec );_dfgf :=_f .Max (_beg ._edega /_ddfaa ._cfec ,_ddfaa ._cfec /_beg ._edega );_fdgf :=_f .Min (_cdb ,_dfgf );if _bdbe > 0&&_fdgf > _bdbe {continue ;
|
|
|
|
|
};if _ddfaa .blocked (_beg ){continue ;};if !_cdg {_ddfaa .pullWord (_beg ,_efea ,_fafe );};_cgbc ++;if !_bdgb {if _beg ._gdce < _fcbb {_fcbb =_beg ._gdce ;};if _beg ._gdce > _caba {_caba =_beg ._gdce ;};};if _cdg {break ;};};};if !_cdg {_dgbg .applyRemovals (_fafe );
|
|
|
|
|
};return _cgbc ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
func (_eaf *TextMarkArray )BBox ()(_bc .PdfRectangle ,bool ){var _fgdfb _bc .PdfRectangle ;_ace :=false ;for _ ,_ebe :=range _eaf ._fad {if _ebe .Meta ||_eafg (_ebe .Text ){continue ;};if _ace {_fgdfb =_deb (_fgdfb ,_ebe .BBox );}else {_fgdfb =_ebe .BBox ;
|
|
|
|
|
_ace =true ;};};return _fgdfb ,_ace ;};func _cdega (_fbcf ,_fbdf _ff .Point )rulingKind {_dffe :=_f .Abs (_fbcf .X -_fbdf .X );_gdbb :=_f .Abs (_fbcf .Y -_fbdf .Y );return _bgeb (_dffe ,_gdbb ,_cccd );};func (_ecgbb rulingList )sort (){_b .Slice (_ecgbb ,_ecgbb .comp )};
|
|
|
|
|
func _agfc (_abeb ,_cbba _ff .Point ,_fbgd _fd .Color )(*ruling ,bool ){_cdge :=lineRuling {_agcee :_abeb ,_fgga :_cbba ,_cceae :_efeef (_abeb ,_cbba ),Color :_fbgd };if _cdge ._cceae ==_dgcb {return nil ,false ;};return _cdge .asRuling ();};func (_beee *shapesState )newSubPath (){_beee .clearPath ();
|
|
|
|
|
if _egfa {_gd .Log .Info ("\u006e\u0065\u0077\u0053\u0075\u0062\u0050\u0061\u0074h\u003a\u0020\u0025\u0073",_beee );};};func (_cbeg *ruling )gridIntersecting (_bcad *ruling )bool {return _fafg (_cbeg ._eacb ,_bcad ._eacb )&&_fafg (_cbeg ._cbf ,_bcad ._cbf );
|
|
|
|
|
};func (_ffae rulingList )removeDuplicates ()rulingList {if len (_ffae )==0{return nil ;};_ffae .sort ();_ffdcd :=rulingList {_ffae [0]};for _ ,_bbdf :=range _ffae [1:]{if _bbdf .equals (_ffdcd [len (_ffdcd )-1]){continue ;};_ffdcd =append (_ffdcd ,_bbdf );
|
|
|
|
|
};return _ffdcd ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// ApplyArea processes the page text only within the specified area `bbox`.
|
|
|
|
|
// Each time ApplyArea is called, it updates the result set in `pt`.
|
|
|
|
|
// Can be called multiple times in a row with different bounding boxes.
|
|
|
|
|
func (_adc *PageText )ApplyArea (bbox _bc .PdfRectangle ){_ecae :=make ([]*textMark ,0,len (_adc ._ddgb ));for _ ,_afdc :=range _adc ._ddgb {if _bafa (_afdc .bbox (),bbox ){_ecae =append (_ecae ,_afdc );};};var _fcec paraList ;_caec :=len (_ecae );for _gcfd :=0;
|
|
|
|
|
_gcfd < 360&&_caec > 0;_gcfd +=90{_dbd :=make ([]*textMark ,0,len (_ecae )-_caec );for _ ,_bgc :=range _ecae {if _bgc ._ggbe ==_gcfd {_dbd =append (_dbd ,_bgc );};};if len (_dbd )> 0{_eagf :=_cdgd (_dbd ,_adc ._fcd ,nil ,nil );_fcec =append (_fcec ,_eagf ...);
|
|
|
|
|
_caec -=len (_dbd );};};_aecd :=new (_ce .Buffer );_fcec .writeText (_aecd );_adc ._fdaa =_aecd .String ();_adc ._dcbg =_fcec .toTextMarks ();_adc ._dfc =_fcec .tables ();};func (_adcc *wordBag )sort (){for _ ,_bceae :=range _adcc ._aaaf {_b .Slice (_bceae ,func (_dcad ,_fdeae int )bool {return _gfdd (_bceae [_dcad ],_bceae [_fdeae ])< 0});
|
|
|
|
|
};};func (_fdca paraList )inTile (_fdgg gridTile )paraList {var _debc paraList ;for _ ,_agbg :=range _fdca {if _fdgg .contains (_agbg .PdfRectangle ){_debc =append (_debc ,_agbg );};};if _gcdc {_ae .Printf ("\u0020 \u0020\u0069\u006e\u0054i\u006c\u0065\u003a\u0020\u0020%\u0073 \u0069n\u0073\u0069\u0064\u0065\u003d\u0025\u0064\n",_fdgg ,len (_debc ));
|
|
|
|
|
for _adgg ,_dbcd :=range _debc {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_adgg ,_dbcd );};_ae .Println ("");};return _debc ;};
|
2021-04-06 22:35:37 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// TextTable represents a table.
|
|
|
|
|
// Cells are ordered top-to-bottom, left-to-right.
|
|
|
|
|
// Cells[y] is the (0-offset) y'th row in the table.
|
|
|
|
|
// Cells[y][x] is the (0-offset) x'th column in the table.
|
|
|
|
|
type TextTable struct{W ,H int ;Cells [][]TableCell ;};func (_cgcdff gridTiling )log (_bgade string ){if !_gcace {return ;};_gd .Log .Info ("\u0074i\u006ci\u006e\u0067\u003a\u0020\u0025d\u0020\u0078 \u0025\u0064\u0020\u0025\u0071",len (_cgcdff ._defe ),len (_cgcdff ._afbc ),_bgade );
|
|
|
|
|
_ae .Printf ("\u0020\u0020\u0020l\u006c\u0078\u003d\u0025\u002e\u0032\u0066\u000a",_cgcdff ._defe );_ae .Printf ("\u0020\u0020\u0020l\u006c\u0079\u003d\u0025\u002e\u0032\u0066\u000a",_cgcdff ._afbc );for _ddcgd ,_begc :=range _cgcdff ._afbc {_cagac ,_ddbcd :=_cgcdff ._gdgc [_begc ];
|
|
|
|
|
if !_ddbcd {continue ;};_ae .Printf ("%\u0034\u0064\u003a\u0020\u0025\u0036\u002e\u0032\u0066\u000a",_ddcgd ,_begc );for _egecd ,_gfegf :=range _cgcdff ._defe {_facd ,_feddd :=_cagac [_gfegf ];if !_feddd {continue ;};_ae .Printf ("\u0025\u0038\u0064\u003a\u0020\u0025\u0073\u000a",_egecd ,_facd .String ());
|
|
|
|
|
};};};func _gebcc (_dec _bc .PdfRectangle ,_ddca bounded )float64 {return _dec .Ury -_ddca .bbox ().Lly };func (_fgdeb *subpath )close (){if !_fgbd (_fgdeb ._cfac [0],_fgdeb .last ()){_fgdeb .add (_fgdeb ._cfac [0]);};_fgdeb ._abe =true ;_fgdeb .removeDuplicates ();
|
|
|
|
|
};func _ggec (_aeeb ,_fdec bounded )float64 {_acgca :=_gfdd (_aeeb ,_fdec );if !_gfbdg (_acgca ){return _acgca ;};return _dedeg (_aeeb ,_fdec );};func (_adfa paraList )merge ()*textPara {_gd .Log .Trace ("\u006d\u0065\u0072\u0067\u0065:\u0020\u0070\u0061\u0072\u0061\u0073\u003d\u0025\u0064\u0020\u003d\u003d\u003d=\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u0078\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d",len (_adfa ));
|
|
|
|
|
if len (_adfa )==0{return nil ;};_adfa .sortReadingOrder ();_aega :=_adfa [0].PdfRectangle ;_cfdg :=_adfa [0]._bcca ;for _ ,_abea :=range _adfa [1:]{_aega =_deb (_aega ,_abea .PdfRectangle );_cfdg =append (_cfdg ,_abea ._bcca ...);};return _gdfbd (_aega ,_cfdg );
|
|
|
|
|
};func (_dbg *PageText )computeViews (){var _gba rulingList ;if _gbaa {_gdeb :=_egfb (_dbg ._bade );_gba =append (_gba ,_gdeb ...);};if _cbae {_fbea :=_gaee (_dbg ._efg );_gba =append (_gba ,_fbea ...);};_gba ,_beecb :=_gba .toTilings ();var _fgb paraList ;
|
|
|
|
|
_gdae :=len (_dbg ._ddgb );for _gcgb :=0;_gcgb < 360&&_gdae > 0;_gcgb +=90{_fgcf :=make ([]*textMark ,0,len (_dbg ._ddgb )-_gdae );for _ ,_ega :=range _dbg ._ddgb {if _ega ._ggbe ==_gcgb {_fgcf =append (_fgcf ,_ega );};};if len (_fgcf )> 0{_abfb :=_cdgd (_fgcf ,_dbg ._fcd ,_gba ,_beecb );
|
|
|
|
|
_fgb =append (_fgb ,_abfb ...);_gdae -=len (_fgcf );};};_beff :=new (_ce .Buffer );_fgb .writeText (_beff );_dbg ._fdaa =_beff .String ();_dbg ._dcbg =_fgb .toTextMarks ();_dbg ._dfc =_fgb .tables ();if _gcdc {_gd .Log .Info ("\u0063\u006f\u006dpu\u0074\u0065\u0056\u0069\u0065\u0077\u0073\u003a\u0020\u0074\u0061\u0062\u006c\u0065\u0073\u003d\u0025\u0064",len (_dbg ._dfc ));
|
|
|
|
|
};};func (_fbfag *textPara )taken ()bool {return _fbfag ==nil ||_fbfag ._abgd };func (_beadb *textWord )absorb (_fbcc *textWord ){_beadb .PdfRectangle =_deb (_beadb .PdfRectangle ,_fbcc .PdfRectangle );_beadb ._eeefd =append (_beadb ._eeefd ,_fbcc ._eeefd ...);
|
|
|
|
|
};const (RenderModeStroke RenderMode =1<<iota ;RenderModeFill ;RenderModeClip ;);func (_egacc gridTile )contains (_bdde _bc .PdfRectangle )bool {if _egacc .numBorders ()< 3{return false ;};if _egacc ._cgaag &&_bdde .Llx < _egacc .Llx -_fcaf {return false ;
|
|
|
|
|
};if _egacc ._fbfa &&_bdde .Urx > _egacc .Urx +_fcaf {return false ;};if _egacc ._ggfc &&_bdde .Lly < _egacc .Lly -_fcaf {return false ;};if _egacc ._adfgd &&_bdde .Ury > _egacc .Ury +_fcaf {return false ;};return true ;};func (_ddggb *textTable )emptyRow (_cefba int )bool {for _bbgfa :=0;
|
|
|
|
|
_bbgfa < _ddggb ._cacec ;_bbgfa ++{_ececd :=_ddggb .get (_bbgfa ,_cefba );if _ececd !=nil &&_ececd .text ()!=""{return false ;};};return true ;};func (_ggc *imageExtractContext )extractFormImages (_eae *_eb .PdfObjectName ,_ede _de .GraphicsState ,_cce *_bc .PdfPageResources )error {_beec ,_ece :=_cce .GetXObjectFormByName (*_eae );
|
|
|
|
|
if _ece !=nil {return _ece ;};if _beec ==nil {return nil ;};_fcg ,_ece :=_beec .GetContentStream ();if _ece !=nil {return _ece ;};_ffgg :=_beec .Resources ;if _ffgg ==nil {_ffgg =_cce ;};_ece =_ggc .extractContentStreamImages (string (_fcg ),_ffgg );if _ece !=nil {return _ece ;
|
|
|
|
|
};_ggc ._dcc ++;return nil ;};func (_ccfe rectRuling )asRuling ()(*ruling ,bool ){_bcba :=ruling {_beb :_ccfe ._gggf ,Color :_ccfe .Color ,_fafbe :_cfgb };switch _ccfe ._gggf {case _dddb :_bcba ._aadb =0.5*(_ccfe .Llx +_ccfe .Urx );_bcba ._eacb =_ccfe .Lly ;
|
|
|
|
|
_bcba ._cbf =_ccfe .Ury ;_eaec ,_gcfb :=_ccfe .checkWidth (_ccfe .Llx ,_ccfe .Urx );if !_gcfb {if _dcgde {_gd .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067V\u0065\u0072\u0074\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_ccfe );
|
|
|
|
|
};return nil ,false ;};_bcba ._cgdca =_eaec ;case _dffd :_bcba ._aadb =0.5*(_ccfe .Lly +_ccfe .Ury );_bcba ._eacb =_ccfe .Llx ;_bcba ._cbf =_ccfe .Urx ;_cdba ,_cfdfd :=_ccfe .checkWidth (_ccfe .Lly ,_ccfe .Ury );if !_cfdfd {if _dcgde {_gd .Log .Error ("\u0072\u0065\u0063\u0074\u0052\u0075l\u0069\u006e\u0067\u002e\u0061\u0073\u0052\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0072\u0075\u006c\u0069\u006e\u0067H\u006f\u0072\u007a\u0020\u0021\u0063\u0068\u0065\u0063\u006b\u0057\u0069\u0064\u0074h\u0020v\u003d\u0025\u002b\u0076",_ccfe );
|
|
|
|
|
};return nil ,false ;};_bcba ._cgdca =_cdba ;default:_gd .Log .Error ("\u0062\u0061\u0064\u0020pr\u0069\u006d\u0061\u0072\u0079\u0020\u006b\u0069\u006e\u0064\u003d\u0025\u0064",_ccfe ._gggf );return nil ,false ;};return &_bcba ,true ;};func _efdf (_cbfa ,_cbgb ,_fdba ,_gbab *textPara )*textTable {_cagdf :=&textTable {_cacec :2,_bfbba :2,_egfdd :make (map[uint64 ]*textPara ,4)};
|
|
|
|
|
_cagdf .put (0,0,_cbfa );_cagdf .put (1,0,_cbgb );_cagdf .put (0,1,_fdba );_cagdf .put (1,1,_gbab );return _cagdf ;};type textWord struct{_bc .PdfRectangle ;_gdce float64 ;_dgcdg string ;_eeefd []*textMark ;_edega float64 ;_cfcd bool ;};
|
2021-04-17 13:46:54 +00:00
|
|
|
|
|
2021-05-11 00:01:27 +00:00
|
|
|
|
// String returns a description of `t`.
|
|
|
|
|
func (_ddfab *textTable )String ()string {return _ae .Sprintf ("\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u0020\u0025\u0074",_ddfab ._cacec ,_ddfab ._bfbba ,_ddfab ._dcbbg );};const _afdf =1.0/1000.0;func (_abgab rulingList )toTilings ()(rulingList ,[]gridTiling ){_abgab .log ("\u0074o\u0054\u0069\u006c\u0069\u006e\u0067s");
|
|
|
|
|
if len (_abgab )==0{return nil ,nil ;};_abgab =_abgab .tidied ("\u0061\u006c\u006c");_abgab .log ("\u0074\u0069\u0064\u0069\u0065\u0064");_gagbc :=_abgab .toGrids ();_dgbcg :=make ([]gridTiling ,len (_gagbc ));for _ebgb ,_cggd :=range _gagbc {_dgbcg [_ebgb ]=_cggd .asTiling ();
|
|
|
|
|
};return _abgab ,_dgbcg ;};type textMark struct{_bc .PdfRectangle ;_ggbe int ;_ecfa string ;_gfg string ;_aacba *_bc .PdfFont ;_ebgc float64 ;_afged float64 ;_fcgdb _ff .Matrix ;_aafb _ff .Point ;_fgfd _bc .PdfRectangle ;_dcggg _fd .Color ;_aggg _fd .Color ;
|
|
|
|
|
};func (_gcbf paraList )llyRange (_dcgdf []int ,_cfff ,_cace float64 )[]int {_begg :=len (_gcbf );if _cace < _gcbf [_dcgdf [0]].Lly ||_cfff > _gcbf [_dcgdf [_begg -1]].Lly {return nil ;};_bgcd :=_b .Search (_begg ,func (_bbfad int )bool {return _gcbf [_dcgdf [_bbfad ]].Lly >=_cfff });
|
|
|
|
|
_bgff :=_b .Search (_begg ,func (_gfab int )bool {return _gcbf [_dcgdf [_gfab ]].Lly > _cace });return _dcgdf [_bgcd :_bgff ];};func (_ccgd *textTable )bbox ()_bc .PdfRectangle {return _ccgd .PdfRectangle };func (_cabc *wordBag )firstWord (_ebaf int )*textWord {return _cabc ._aaaf [_ebaf ][0]};
|
|
|
|
|
// del removes _ebbc from the set. Removing a key that is not present is a
// no-op.
func (s intSet) del(_ebbc int) {
	delete(s, _ebbc)
}

// mergePrimary returns the arithmetic mean of the _aadb values of the rulings
// in the list. The list must not be empty: an empty list panics on the first
// element access.
func (rl rulingList) mergePrimary() float64 {
	total := rl[0]._aadb
	for i := 1; i < len(rl); i++ {
		total += rl[i]._aadb
	}
	return total / float64(len(rl))
}
|
|
|
|
|
|
|
|
|
|
// String returns a description of `tm`.
|
|
|
|
|
func (_dafcb *textMark )String ()string {return _ae .Sprintf ("\u0025\u002e\u0032f \u0066\u006f\u006e\u0074\u0073\u0069\u007a\u0065\u003d\u0025\u002e\u0032\u0066\u0020\u0022\u0025\u0073\u0022",_dafcb .PdfRectangle ,_dafcb ._ebgc ,_dafcb ._ecfa );};func (_gega compositeCell )split (_dad ,_agag []float64 )*textTable {_cbca :=len (_dad )+1;
|
|
|
|
|
_bgge :=len (_agag )+1;if _gcdc {_gd .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069t\u0065\u0043\u0065l\u006c\u002e\u0073\u0070l\u0069\u0074\u003a\u0020\u0025\u0064\u0020\u0078\u0020\u0025\u0064\u000a\u0009\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u003d\u0025\u0073\u000a"+"\u0009\u0072\u006f\u0077\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073=\u0025\u0036\u002e\u0032\u0066\u000a\t\u0063\u006f\u006c\u0043\u006f\u0072\u0072\u0069\u0064\u006f\u0072\u0073\u003d%\u0036\u002e\u0032\u0066",_bgge ,_cbca ,_gega ,_dad ,_agag );
|
|
|
|
|
_ae .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u0070\u0061\u0072\u0061\u0073\u000a",len (_gega .paraList ));for _cfeg ,_fcea :=range _gega .paraList {_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_cfeg ,_fcea .String ());};_ae .Printf ("\u0020\u0020\u0020\u0020\u0025\u0064\u0020\u006c\u0069\u006e\u0065\u0073\u000a",len (_gega .lines ()));
|
|
|
|
|
for _acc ,_acaf :=range _gega .lines (){_ae .Printf ("\u0025\u0034\u0064\u003a\u0020\u0025\u0073\u000a",_acc ,_acaf );};};_dad =_aagb (_dad ,_gega .Ury ,_gega .Lly );_agag =_aagb (_agag ,_gega .Llx ,_gega .Urx );_adg :=make (map[uint64 ]*textPara ,_bgge *_cbca );
|
|
|
|
|
_bcec :=textTable {_cacec :_bgge ,_bfbba :_cbca ,_egfdd :_adg };_dcda :=_gega .paraList ;_b .Slice (_dcda ,func (_dcba ,_afff int )bool {_ccbgd ,_fabdb :=_dcda [_dcba ],_dcda [_afff ];_acgef ,_cdaa :=_ccbgd .Lly ,_fabdb .Lly ;if _acgef !=_cdaa {return _acgef < _cdaa ;
|
|
|
|
|
};return _ccbgd .Llx < _fabdb .Llx ;});_cacf :=make (map[uint64 ]_bc .PdfRectangle ,_bgge *_cbca );for _afbg ,_ggga :=range _dad [1:]{_bde :=_dad [_afbg ];for _gbff ,_daad :=range _agag [1:]{_bacb :=_agag [_gbff ];_cacf [_daceb (_gbff ,_afbg )]=_bc .PdfRectangle {Llx :_bacb ,Urx :_daad ,Lly :_ggga ,Ury :_bde };
|
|
|
|
|
};};if _gcdc {_gd .Log .Info ("\u0063\u006f\u006d\u0070\u006f\u0073\u0069\u0074\u0065\u0043\u0065l\u006c\u002e\u0073\u0070\u006c\u0069\u0074\u003a\u0020\u0072e\u0063\u0074\u0073");_ae .Printf ("\u0020\u0020\u0020\u0020");for _bcfec :=0;_bcfec < _bgge ;
|
|
|
|
|
_bcfec ++{_ae .Printf ("\u0025\u0033\u0030\u0064\u002c\u0020",_bcfec );};_ae .Println ();for _bgcb :=0;_bgcb < _cbca ;_bgcb ++{_ae .Printf ("\u0020\u0020\u0025\u0032\u0064\u003a",_bgcb );for _ecab :=0;_ecab < _bgge ;_ecab ++{_ae .Printf ("\u00256\u002e\u0032\u0066\u002c\u0020",_cacf [_daceb (_ecab ,_bgcb )]);
|
|
|
|
|
};_ae .Println ();};};_eace :=func (_dedbb *textLine )(int ,int ){for _ccgf :=0;_ccgf < _cbca ;_ccgf ++{for _efdc :=0;_efdc < _bgge ;_efdc ++{if _cddc (_cacf [_daceb (_efdc ,_ccgf )],_dedbb .PdfRectangle ){return _efdc ,_ccgf ;};};};return -1,-1;};_deeb :=make (map[uint64 ][]*textLine ,_bgge *_cbca );
|
|
|
|
|
for _ ,_bdcge :=range _dcda .lines (){_egcd ,_fccd :=_eace (_bdcge );if _egcd < 0{continue ;};_deeb [_daceb (_egcd ,_fccd )]=append (_deeb [_daceb (_egcd ,_fccd )],_bdcge );};for _aabg :=0;_aabg < len (_dad )-1;_aabg ++{_ffdc :=_dad [_aabg ];_bcbb :=_dad [_aabg +1];
|
|
|
|
|
for _fdbd :=0;_fdbd < len (_agag )-1;_fdbd ++{_eeea :=_agag [_fdbd ];_abcg :=_agag [_fdbd +1];_cggf :=_bc .PdfRectangle {Llx :_eeea ,Urx :_abcg ,Lly :_bcbb ,Ury :_ffdc };_fgcb :=_deeb [_daceb (_fdbd ,_aabg )];if len (_fgcb )==0{continue ;};_bgafa :=_gdfbd (_cggf ,_fgcb );
|
|
|
|
|
_bcec .put (_fdbd ,_aabg ,_bgafa );};};return &_bcec ;};
|
|
|
|
|
|
|
|
|
|
// String returns a description of `k`.
|
|
|
|
|
func (_gdba rulingKind )String ()string {_gfeg ,_afce :=_abff [_gdba ];if !_afce {return _ae .Sprintf ("\u004e\u006ft\u0020\u0061\u0020r\u0075\u006c\u0069\u006e\u0067\u003a\u0020\u0025\u0064",_gdba );};return _gfeg ;};func (_defg *wordBag )depthIndexes ()[]int {if len (_defg ._aaaf )==0{return nil ;
|
|
|
|
|
};_ggce :=make ([]int ,len (_defg ._aaaf ));_aeed :=0;for _fdea :=range _defg ._aaaf {_ggce [_aeed ]=_fdea ;_aeed ++;};_b .Ints (_ggce );return _ggce ;};func (_efeb lineRuling )yMean ()float64 {return 0.5*(_efeb ._agcee .Y +_efeb ._fgga .Y )};type paraList []*textPara ;
|