All files / src ParseParameters.ts

0% Statements 0/0
0% Branches 1/1
0% Functions 1/1
0% Lines 0/0

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129                                                                                                                                                                                                                                                                 
/**
 * Options controlling which pages are parsed and how text and images are
 * extracted. Every option is optional; the defaults quoted on each member
 * are the ones documented for that member.
 */
export interface ParseParameters {
	/**
	 * Explicit list of 1-based page numbers to parse.
	 * - Only the listed pages are parsed, and they are returned in the order
	 *   given here.
	 * - Several pages: `[1, 3, 5]`. A single page: `[7]`.
	 * - Takes precedence over `first` and `last`.
	 */
	partial?: number[];

	/**
	 * Page selection from the start of the document.
	 * - Used alone with a positive integer N: parse pages 1..N.
	 * - Used together with `last`: the two form an inclusive page range
	 *   `first`..`last`; here `first` is the starting page number and the
	 *   "first N pages" meaning does not apply.
	 * - Ignored when `partial` is provided.
	 */
	first?: number;

	/**
	 * Page selection from the end of the document.
	 * - Used alone with a positive integer N: parse the final N pages
	 *   (pages total-N+1..total).
	 * - Used together with `first`: the two form an inclusive page range
	 *   `first`..`last`; here `last` is the ending page number and the
	 *   "last N pages" meaning does not apply.
	 * - Ignored when `partial` is provided.
	 */
	last?: number;

	/**
	 * Whether to detect hyperlink annotations (e.g. URLs) attached to text.
	 * - Detected links are emitted as Markdown inline links, for example
	 *   `[link text](https://example.com)`.
	 * - Default: `false`.
	 */
	parseHyperlinks?: boolean;

	/**
	 * Whether to enforce logical line breaks.
	 * - When true, a newline is inserted between two text items whose
	 *   vertical distance exceeds `lineThreshold`.
	 * - Helps keep paragraph/line structure when the PDF renderer emits text
	 *   as separate segments.
	 * - Default: `true`.
	 */
	lineEnforce?: boolean;

	/**
	 * Vertical distance threshold used to decide whether two nearby text
	 * items belong to different lines (see `lineEnforce`).
	 * - NOTE(review): earlier documentation stated that a larger value makes
	 *   a new line *more* likely. That appears to conflict with `lineEnforce`,
	 *   which breaks the line only when the distance *exceeds* this value.
	 *   Confirm the direction against the implementation.
	 * - Default: `4.6`.
	 */
	lineThreshold?: number;

	/**
	 * Separator inserted between text items on the same line when the
	 * horizontal gap between them is large enough (see `cellThreshold`).
	 * - Typically used to emulate a cell/column delimiter, e.g. `"\t"` for
	 *   tab-separated cells.
	 * - Default: `'\t'`.
	 */
	cellSeparator?: string;

	/**
	 * Horizontal distance threshold deciding when two text items on the same
	 * baseline count as separate cells (and are split by `cellSeparator`).
	 * - Larger values give fewer, wider cells; smaller values give more cell
	 *   breaks.
	 * - Default: `7`.
	 */
	cellThreshold?: number;

	/**
	 * Marker appended to the end of each page's extracted text to show page
	 * boundaries.
	 * - The placeholders `page_number` and `total_number` are replaced with
	 *   the current page number and the total page count.
	 * - An empty string means no page boundary marker is added.
	 * - NOTE(review): earlier documentation said that *omitting* the option
	 *   also disables the marker, which conflicts with the documented default
	 *   below. Confirm which behavior applies when the option is omitted.
	 * - Default: `'\n-- page_number of total_number --'`
	 */
	pageJoiner?: string;

	/**
	 * Separator placed between every text item when building a page's text.
	 * - When provided, it replaces the default empty-string joining of the
	 *   text item sequence.
	 * - Default: `undefined`
	 */
	itemJoiner?: string;

	/**
	 * Minimum image width in pixels.
	 * - When set, getImage() ignores images narrower than this value.
	 * - Handy for dropping very small embedded images (thumbnails, icons).
	 * - Default: undefined (no minimum).
	 */
	minImageWidth?: number;

	/**
	 * Minimum image height in pixels.
	 * - When set, getImage() ignores images shorter than this value.
	 * - Combine with `minImageWidth` to require both dimensions to meet a
	 *   threshold.
	 * - Default: undefined (no minimum).
	 */
	minImageHeight?: number;

	/**
	 * Whether marked content items are included in the items array of
	 * TextContent.
	 * - Marked content carries the PDF's tags (MCID, role/props) and its
	 *   structural/accessibility information: semantic tagging, sectioning,
	 *   spans, alternative text, etc.
	 * - Enable it when structure/tag information is needed, or to map text to
	 *   structure through MCIDs (for example with page.getStructTree()).
	 * - Usually left off for plain text extraction; enabling it trades a
	 *   larger output for the extra detail.
	 * - Default: `false`.
	 */
	includeMarkedContent?: boolean;

	/**
	 * Whether to skip text normalization in the worker thread.
	 * - When true, the text is *not* normalized in the worker.
	 * - Keeping this `false` (normalization stays in the worker) is the
	 *   recommended setting for plain text extraction.
	 * - Default: `false`.
	 */
	disableNormalization?: boolean;
}