All files / src ParseParameters.ts

0% Statements 0/0
0% Branches 1/1
0% Functions 1/1
0% Lines 0/0
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  export interface ParseParameters {
	/**
	 * - Array of 1-based page numbers to parse. When provided, only these pages
	 *   will be parsed and returned in the same order as specified.
	 *   Example: [1, 3, 5].
	 *   Parse only one page: [7].
	 */
	partial?: Array<number>;
 
	/**
	 * - If set to a positive integer N, parse the first N pages (pages 1..N).
	 *   Ignored when `partial` is provided.
	 * - If both `first` and `last` are set, they define an explicit inclusive
	 *   page range and only pages from `first` to `last` will be parsed. In that
	 *   case `first` is treated as the starting page number and the "first N"
	 *   semantics is ignored.
	 */
	first?: number;
 
	/**
	 * - If set to a positive integer N, parse the last N pages (pages total-N+1..total).
	 *   Ignored when `partial` is provided.
	 * - If both `first` and `last` are set, they define an explicit inclusive
	 *   page range and only pages from `first` to `last` will be parsed. In that
	 *   case `last` is treated as the ending page number and the "last N"
	 *   semantics is ignored.
	 */
	last?: number;
 
	/**
	 * - When true, attempt to detect and include hyperlink annotations (e.g. URLs)
	 *   associated with text. Detected links are formatted as Markdown inline links
	 *   (for example: [link text](https://example.com)).
	 *   Defaults to `false`.
	 */
	parseHyperlinks?: boolean;
 
	/**
	 * - When true, the extractor will try to enforce logical line breaks by
	 *   inserting a newline between text items when the vertical distance
	 *   between them exceeds `lineThreshold`.
	 * - Useful to preserve paragraph/line structure when text items are
	 *   emitted as separate segments by the PDF renderer.
	 * - Default: `true`.
	 */
	lineEnforce?: boolean;
 
	/**
	 * - Threshold used to decide whether two nearby text
	 *   items belong to different lines. A larger value makes the parser more
	 *   likely to start a new line between items.
	 * - Default: `4.6`.
	 */
	lineThreshold?: number;
 
	/**
	 * - String inserted between text items on the same line when a sufficiently
	 *   large horizontal gap is detected (see `cellThreshold`). This is typically
	 *   used to emulate a cell/column separator (for example, a tab).
	 * - Example: `"\t"` to produce tab-separated cells.
	 * - Default: `'\t'`.
	 */
	cellSeparator?: string;
 
	/**
	 * - Horizontal distance threshold used to decide when
	 *   two text items on the same baseline should be considered separate cells
	 *   (and thus separated by `cellSeparator`).
	 * - A larger value produces fewer (wider) cells; smaller value creates more
	 *   cell breaks.
	 * - Default: `7`.
	 */
	cellThreshold?: number;
 
	/**
	 * - Optional string appended at the end of each page's extracted text to
	 *   mark page boundaries. The string supports the placeholders
	 *   `page_number` and `total_number`, which are substituted with the
	 *   current page number and total page count respectively.
	 * - If omitted or empty, no page boundary marker is added.
	 * - Default: `'\n-- page_number of total_number --'`
	 */
	pageJoiner?: string;
 
	/**
	 * - Optional string used to join text items when returning a page's text.
	 *   If provided, the extractor will use this value to join the sequence of
	 *   text items instead of the default empty-string joining behavior.
	 * - Use this to insert a custom separator between every text item.
	 * - Default: `undefined`
	 */
	itemJoiner?: string;
 
	/**
	 * - Minimum image width (in pixels). When set, images with a width smaller
	 *   than this value will be ignored by getImage().
	 * - Use to filter out very small embedded images (thumbnails, icons).
	 * - Default: undefined (no minimum).
	 */
	minImageWidth?: number;
 
	/**
	 * - Minimum image height (in pixels). When set, images with a height smaller
	 *   than this value will be ignored by getImage().
	 * - Use together with minImageWidth to require both dimensions to meet the threshold.
	 * - Default: undefined (no minimum).
	 */
	minImageHeight?: number;
 
	/**
	 * - When true, include marked content items in the items array of TextContent.
	 * - Enables capturing the PDF's "marked content"
	 * - Tags (MCID, role/props) and structural/accessibility information — e.g.
	 * - Semantic tagging, sectioning, spans, alternate/alternative text, etc.
	 * - How to use:
	 * - Turn it on when you need structure/tag information or to map text ↔ structure using MCIDs (for example with page.getStructTree()).
	 * - For plain text extraction it's usually left false (trade-off: larger output/increased detail).
	 *   Defaults to `false`.
	 */
	includeMarkedContent?: boolean;
 
	/**
	 * - When true, the text is *not* normalized in the worker thread.
	 * - Normalize in worker (false recommended for plain text)
	 *   Defaults to `false`.
	 */
	disableNormalization?: boolean;
}