mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-14 19:29:50 +08:00
66 lines
1.8 KiB
Go
66 lines
1.8 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
// The following constants configure debugging.
|
|
const (
	// Master debug switches. All are off by default.
	verbose     = false
	verboseGeom = false
	verbosePage = false
	verbosePara = false
	// verboseParaLine can only be on when verbosePara is on; flip the literal to
	// false to silence it independently.
	verboseParaLine = verbosePara && true
	// verboseParaWord can only be on when verboseParaLine is on; it is currently
	// forced off by the literal false.
	verboseParaWord = verboseParaLine && false
	verboseTable    = false
)
|
|
|
|
// The following constants control the approaches used in the code.
|
|
const (
	// NOTE(review): the meanings below are inferred from the names only; confirm
	// against the code that reads these flags.
	useTables = true  // presumably enables table handling
	doHyphens = true  // presumably enables hyphen handling
	useEBBox  = false // presumably selects an alternative bounding box; off
)
|
|
|
|
// The following constants are the tuning parameters for text extraction.
|
|
const (
	// All constants that end in R are relative to font size.

	// Size of depth bins in points.
	depthBinPoints = 6

	// Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for
	// superscripts.
	lineDepthR = 0.5

	// Max difference in font sizes allowed within a word.
	maxIntraWordFontTolR = 0.05

	// Maximum gap between a word and a para in the depth direction for which we pull the word
	// into the para, as a fraction of the font size.
	maxIntraDepthGapR = 1.0
	// Max difference in font size for word and para for the above case.
	maxIntraDepthFontTolR = 0.05

	// Maximum gap between a word and a para in the reading direction for which we pull the word
	// into the para.
	maxIntraReadingGapR = 0.4
	// Max difference in font size for word and para for the above case.
	maxIntraReadingFontTol = 0.6

	// Minimum spacing between paras in the reading direction.
	minInterReadingGapR = 1.0
	// Max difference in font size for word and para for the above case.
	minInterReadingFontTol = 0.1

	// Maximum inter-word spacing.
	// NOTE(review): the name says "IntraWord" while this comment says "inter-word";
	// confirm which is intended against the code that uses this constant.
	maxIntraWordGapR = 1.4

	// Maximum overlap between characters allowed within a line.
	maxIntraLineOverlapR = 0.46

	// Maximum spacing between characters within a line.
	maxIntraLineGapR = 0.03
)
|