Press n or j to go to the next uncovered block, b, p or k for the previous block.
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
import type { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';

/**
 * Metadata keys that are looked up for timestamps, in the order in which the
 * corresponding DateNode fields are filled.
 */
const XMP_DATE_PROPERTIES = [
	'xmp:createdate',
	'xmp:modifydate',
	'xmp:metadatadate',
	'xap:createdate',
	'xap:modifydate',
	'xap:metadatadate',
] as const;

type XmpDateProperty = (typeof XMP_DATE_PROPERTIES)[number];

export type { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';

/**
 * Node representing a single item in the PDF outline (bookmarks).
 * This mirrors the structure returned by PDF.js' getOutline() API.
 */
export interface OutlineNode {
	// The visible title of the bookmark / outline entry.
	title: string;

	// If true, the title should be rendered in bold.
	bold: boolean;

	// If true, the title should be rendered in italic.
	italic: boolean;

	// Color for the title as a clamped byte array.
	color: Uint8ClampedArray;

	// Destination for the outline item. PDF.js may return a named
	// destination (string) or an array representing an explicit destination.
	// Can be null when no explicit destination is available.
	// biome-ignore lint/suspicious/noExplicitAny: <unsupported underlying type>
	dest: string | Array<any> | null;

	// If the outline entry points to an external URL, it will be here.
	url: string | null;

	// When PDF.js flags a URL as unsafe the raw value is available here.
	unsafeUrl?: string;

	// Whether the link should open in a new window/tab if rendered.
	newWindow?: boolean;

	// Number of child entries (if provided by the PDF). May be undefined.
	count?: number;

	// Child outline items. Type is kept loose to match PDF.js returns.
	// biome-ignore lint/suspicious/noExplicitAny: <unsupported underlying type>
	items: Array<any>;
}

/**
 * Consolidated date information gathered from different PDF sources.
 * The PDF 'Info' dictionary contains CreationDate / ModDate and
 * the XMP/XAP metadata can contain several timestamps as well.
 *
 * - CreationDate / ModDate hold whatever `PDFDateString.toDateObject` returns
 *   for the Info entry (the type allows null) and are only set when the Info
 *   entry is truthy.
 * - The Xmp* / Xap* fields hold a Date, or undefined when the metadata entry
 *   is missing or cannot be parsed.
 */
export type DateNode = {
	CreationDate?: Date | null;
	ModDate?: Date | null;
	XmpCreateDate?: Date | null;
	XmpModifyDate?: Date | null;
	XmpMetadataDate?: Date | null;
	XapCreateDate?: Date | null;
	XapModifyDate?: Date | null;
	XapMetadataDate?: Date | null;
};

/** DateNode fields that are filled from XMP/XAP metadata. */
type XmpDateField = Exclude<keyof DateNode, 'CreationDate' | 'ModDate'>;

/**
 * Maps every metadata key in XMP_DATE_PROPERTIES to the DateNode field it fills.
 * Typing the record by XmpDateProperty makes the compiler reject a key that is
 * listed above but has no target field (and vice versa).
 */
const XMP_DATE_FIELDS: Readonly<Record<XmpDateProperty, XmpDateField>> = {
	'xmp:createdate': 'XmpCreateDate',
	'xmp:modifydate': 'XmpModifyDate',
	'xmp:metadatadate': 'XmpMetadataDate',
	'xap:createdate': 'XapCreateDate',
	'xap:modifydate': 'XapModifyDate',
	'xap:metadatadate': 'XapMetadataDate',
};

/**
 * Per-page link extraction result.
 * - pageNumber: the physical page index (1-based) within the PDF document.
 * - pageLabel: optional printed page label shown by PDF viewers (e.g. "iii", "1", "A-1");
 *              this can differ from the physical page number and may be undefined
 *              when the document does not provide labels.
 * - links: array of text->URL mappings that were found/overlaid on the page.
 * - width/height: page dimensions in PDF units for the viewport used.
 */
export type PageLinkResult = {
	// Physical page number (1-based index inside the PDF document).
	pageNumber: number;

	// Optional printed page label as displayed by PDF viewers. May be null/undefined
	// if the document does not provide explicit labels for pages.
	pageLabel?: string | null;

	// Hyperlinks that were overlaid or embedded on the page surface. Each entry
	// contains the visible text (if any) and the resolved URL.
	links: Array<{ text: string; url: string }>;

	// Page width and height for the page viewport that was used when extracting links.
	width: number;
	height: number;
};

/**
 * Aggregated information about a PDF document returned by getInfo().
 * The object contains high-level metadata, outline/bookmark structure,
 * per-page extracted hyperlinks and utility helpers for parsing dates.
 */
export class InfoResult {
	// Total number of pages in the PDF document (count of physical pages).
	total: number;

	/**
	 * The PDF 'Info' dictionary. Typical fields include title, author, subject,
	 * Creator, Producer and Creation/Modification dates. The exact structure is
	 * determined by the PDF and as returned by PDF.js.
	 */
	// biome-ignore lint/suspicious/noExplicitAny: <unsupported underlying type>
	info?: any;

	// Low-level document metadata object (XMP). Use this to access extended
	// properties that are not present in the Info dictionary.
	metadata?: Metadata;

	/**
	 * An array of document fingerprint strings provided by PDF.js. Useful
	 * for caching, de-duplication or identifying a document across runs.
	 */
	fingerprints?: Array<string | null>;

	/**
	 * Permission flags for the document as returned by PDF.js (or null).
	 * These flags indicate capabilities such as printing, copying and
	 * other restrictions imposed by the PDF security settings.
	 */
	permission?: number[] | null;

	/**
	 * Optional document outline (bookmarks). When present this is the
	 * hierarchical navigation structure which viewers use for quick access.
	 */
	outline?: Array<OutlineNode> | null;

	// Results with per-page hyperlink extraction. Empty array by default.
	pages: Array<PageLinkResult> = [];

	/**
	 * @param total Number of pages in the document; stored as `total`.
	 */
	constructor(total: number) {
		this.total = total;
	}

	/**
	 * Collects dates from different sources (Info dictionary and XMP/XAP metadata)
	 * and returns them as a DateNode. This helps callers compare and choose the
	 * most relevant timestamp (for example a creation date vs XMP date).
	 *
	 * @returns A new DateNode. CreationDate / ModDate are only set when the Info
	 *          dictionary has a truthy entry. When `metadata` is set, every
	 *          Xmp* / Xap* field is assigned: a Date if the entry parses,
	 *          otherwise undefined.
	 */
	public getDateNode(): DateNode {
		const result: DateNode = {};

		// The Info dictionary may contain CreationDate/ModDate in PDF date string format.
		// `info` is already untyped, so no cast is needed to read from it.
		const creationDate = this.info?.CreationDate;
		if (creationDate) {
			result.CreationDate = pdfjs.PDFDateString.toDateObject(creationDate);
		}

		const modDate = this.info?.ModDate;
		if (modDate) {
			result.ModDate = pdfjs.PDFDateString.toDateObject(modDate);
		}

		// If no XMP metadata is present, return the Info-based dates only.
		const metadata = this.metadata;
		if (!metadata) {
			return result;
		}

		// Look up each known XMP/XAP date key and store the parsed value in the
		// field named by XMP_DATE_FIELDS. The assignment is unconditional, so a
		// missing or unparsable entry leaves the field explicitly undefined.
		for (const property of XMP_DATE_PROPERTIES) {
			result[XMP_DATE_FIELDS[property]] = this.parseISODateString(metadata.get(property));
		}

		return result;
	}

	/**
	 * Try to parse a date string from XMP/XAP metadata using `Date.parse`.
	 *
	 * @param isoDateString Raw metadata value; anything that is not a non-empty
	 *                      string is treated as absent.
	 * @returns The parsed Date, or undefined when the value is absent or
	 *          `Date.parse` cannot interpret it.
	 */
	private parseISODateString(isoDateString: unknown): Date | undefined {
		if (typeof isoDateString !== 'string' || isoDateString === '') {
			return undefined;
		}

		const timestamp = Date.parse(isoDateString);
		return Number.isNaN(timestamp) ? undefined : new Date(timestamp);
	}
}