From c247d4c1e33baec9772d4497a7b452dc16f2e91f Mon Sep 17 00:00:00 2001 From: Philip Okugbe <16838612+Philipinho@users.noreply.github.com> Date: Fri, 1 May 2026 14:56:39 +0100 Subject: [PATCH] feat(ee): PDF import (#2142) * feat: replace pdfjs-dist with firecrawl-pdf-inspector * use modified firecrawl-pdf-inspector * feat: pdf import * increase single file upload size limit * use npm package * sync * update package --- apps/client/src/ee/features.ts | 1 + .../page/components/page-import-modal.tsx | 30 +++- apps/server/package.json | 2 +- apps/server/src/common/features.ts | 1 + .../integrations/import/import.controller.ts | 5 +- .../import/services/import.service.ts | 51 +++++- pnpm-lock.yaml | 145 ++---------------- 7 files changed, 93 insertions(+), 142 deletions(-) diff --git a/apps/client/src/ee/features.ts b/apps/client/src/ee/features.ts index a9ab8b0d..cacf851f 100644 --- a/apps/client/src/ee/features.ts +++ b/apps/client/src/ee/features.ts @@ -8,6 +8,7 @@ export const Feature = { AI: 'ai', CONFLUENCE_IMPORT: 'import:confluence', DOCX_IMPORT: 'import:docx', + PDF_IMPORT: 'import:pdf', ATTACHMENT_INDEXING: 'attachment:indexing', SECURITY_SETTINGS: 'security:settings', MCP: 'mcp', diff --git a/apps/client/src/features/page/components/page-import-modal.tsx b/apps/client/src/features/page/components/page-import-modal.tsx index df6691d5..c1c12dc4 100644 --- a/apps/client/src/features/page/components/page-import-modal.tsx +++ b/apps/client/src/features/page/components/page-import-modal.tsx @@ -12,6 +12,7 @@ import { IconCheck, IconFileCode, IconFileTypeDocx, + IconFileTypePdf, IconFileTypeZip, IconMarkdown, IconX, @@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) { const markdownFileRef = useRef<() => void>(null); const htmlFileRef = useRef<() => void>(null); const docxFileRef = useRef<() => void>(null); + const pdfFileRef = useRef<() => void>(null); const notionFileRef = useRef<() => void>(null); const confluenceFileRef = useRef<() => void>(null); const zipFileRef = useRef<() => void>(null); const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT); const canUseDocx = useHasFeature(Feature.DOCX_IMPORT); + const canUsePdf = useHasFeature(Feature.PDF_IMPORT); const upgradeLabel = useUpgradeLabel(); const handleZipUpload = async (selectedFile: File, source: string) => { @@ -244,7 +247,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) { }, 3000); }, [fileTaskId]); - const maxSingleFileSize = bytes("20mb"); + const maxSingleFileSize = bytes("30mb"); const handleFileUpload = async (selectedFiles: File[]) => { if (!selectedFiles) { @@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) { if (markdownFileRef.current) markdownFileRef.current(); if (htmlFileRef.current) htmlFileRef.current(); if (docxFileRef.current) docxFileRef.current(); + if (pdfFileRef.current) pdfFileRef.current(); const pageCountText = pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`; @@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) { )} + + {(props) => ( + + + + )} + + handleZipUpload(file, "notion")} accept="application/zip" diff --git a/apps/server/package.json b/apps/server/package.json index 79492487..c0eeddf3 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -37,6 +37,7 @@ "@aws-sdk/lib-storage": "3.1037.0", "@aws-sdk/s3-request-presigner": "3.1037.0", "@clickhouse/client": "^1.18.2", + "@docmost/pdf-inspector": "1.9.4", "@fastify/cookie": "^11.0.2", "@fastify/multipart": "^10.0.0", "@fastify/static": "^9.1.3", @@ -100,7 +101,6 @@ "p-limit": "^7.3.0", "passport-google-oauth20": "^2.0.0", "passport-jwt": "^4.0.1", - "pdfjs-dist": "^5.5.207", "pg-tsquery": "^8.4.2", "pgvector": "^0.2.1", "pino-http": "^11.0.0", diff --git a/apps/server/src/common/features.ts b/apps/server/src/common/features.ts index 38f226a8..c5fd9a20 100644 --- a/apps/server/src/common/features.ts +++ b/apps/server/src/common/features.ts @@ -8,6 +8,7 @@ export const Feature = { AI: 'ai', CONFLUENCE_IMPORT: 'import:confluence', DOCX_IMPORT: 'import:docx', + PDF_IMPORT: 'import:pdf', ATTACHMENT_INDEXING: 'attachment:indexing', SECURITY_SETTINGS: 'security:settings', MCP: 'mcp', diff --git a/apps/server/src/integrations/import/import.controller.ts b/apps/server/src/integrations/import/import.controller.ts index 7ee325e5..cd2341ea 100644 --- a/apps/server/src/integrations/import/import.controller.ts +++ b/apps/server/src/integrations/import/import.controller.ts @@ -51,9 +51,9 @@ export class ImportController { @AuthUser() user: User, @AuthWorkspace() workspace: Workspace, ) { - const validFileExtensions = ['.md', '.html', '.docx']; + const validFileExtensions = ['.md', '.html', '.docx', '.pdf']; - const maxFileSize = bytes('20mb'); + const maxFileSize = bytes('30mb'); let file = null; try { @@ -102,6 +102,7 @@ export class ImportController { '.md': 'markdown', '.html': 'html', '.docx': 'docx', + '.pdf': 'pdf', }; if (createdPage) { diff --git a/apps/server/src/integrations/import/services/import.service.ts b/apps/server/src/integrations/import/services/import.service.ts index 0eb3ae40..1eb10ca8 100644 --- a/apps/server/src/integrations/import/services/import.service.ts +++ b/apps/server/src/integrations/import/services/import.service.ts @@ -63,7 +63,10 @@ export class ImportService { let createdPage = null; // For DOCX, we need the page ID upfront so images can reference it - const pageId = fileExtension === '.docx' ? uuid7() : undefined; + const pageId = + fileExtension === '.docx' || fileExtension === '.pdf' + ? uuid7() + : undefined; try { if (fileExtension.endsWith('.md')) { @@ -78,6 +81,14 @@ export class ImportService { pageId, userId, ); + } else if (fileExtension.endsWith('.pdf')) { + prosemirrorState = await this.processPdf( + fileBuffer, + workspaceId, + spaceId, + pageId, + userId, + ); } } catch (err) { const message = 'Error processing file content'; @@ -156,7 +167,7 @@ export class ImportService { let DocxImportModule: any; try { // eslint-disable-next-line @typescript-eslint/no-require-imports - DocxImportModule = require('./../../../ee/docx-import/docx-import.service'); + DocxImportModule = require('./../../../ee/document-import/docx-import.service'); } catch (err) { this.logger.error( 'DOCX import requested but EE module not bundled in this build', @@ -182,6 +193,42 @@ export class ImportService { return this.processHTML(html); } + async processPdf( + fileBuffer: Buffer, + workspaceId: string, + spaceId: string, + pageId: string, + userId: string, + ): Promise { + let PdfImportModule: any; + try { + // eslint-disable-next-line @typescript-eslint/no-require-imports + PdfImportModule = require('./../../../ee/document-import/pdf-import.service'); + } catch (err) { + this.logger.error( + 'PDF import requested but EE module not bundled in this build', + ); + throw new BadRequestException( + 'This feature requires a valid enterprise license.', + ); + } + + const pdfImportService = this.moduleRef.get( + PdfImportModule.PdfImportService, + { strict: false }, + ); + + const html = await pdfImportService.convertPdfToHtml( + fileBuffer, + workspaceId, + spaceId, + pageId, + userId, + ); + + return this.processHTML(html); + } + async createYdoc(prosemirrorJson: any): Promise { if (prosemirrorJson) { // this.logger.debug(`Converting prosemirror json state to ydoc`); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3985bc75..f6e1a757 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -482,6 +482,9 @@ importers: '@clickhouse/client': specifier: ^1.18.2 version: 1.18.2 + '@docmost/pdf-inspector': + specifier: 1.9.4 + version: 1.9.4 '@fastify/cookie': specifier: ^11.0.2 version: 11.0.2 @@ -671,9 +674,6 @@ importers: passport-jwt: specifier: ^4.0.1 version: 4.0.1 - pdfjs-dist: - specifier: ^5.5.207 - version: 5.5.207 pg-tsquery: specifier: ^8.4.2 version: 8.4.2 @@ -1826,6 +1826,9 @@ packages: resolution: {integrity: sha512-UJnjoFsmxfKUdNYdWgOB0mWUypuLvAfQPH1+pyvRJs6euowbFkFC6P13w1l8mJyi3vxYMxc9kld5jZEGRQs6bw==} engines: {node: '>=18'} + '@docmost/pdf-inspector@1.9.4': + resolution: {integrity: sha512-G5DNyDtLNxybTXWakqi7PuOEuSb/A2ZjDlv2WCkOkiHszPeILdrC+G0a4e4UP10yxvzuLfb23pJ5jy8fUSYZPw==} + '@emnapi/core@1.8.1': resolution: {integrity: sha512-AvT9QFpxK0Zd8J0jopedNm+w/2fIzvtPKPjqyw9jwvBaReTTqPBk9Hixaz7KbjimP+QNz605/XnjFcDAL2pqBg==} @@ -2762,76 +2765,6 @@ packages: cpu: [x64] os: [win32] - '@napi-rs/canvas-android-arm64@0.1.97': - resolution: {integrity: sha512-V1c/WVw+NzH8vk7ZK/O8/nyBSCQimU8sfMsB/9qeSvdkGKNU7+mxy/bIF0gTgeBFmHpj30S4E9WHMSrxXGQuVQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [android] - - '@napi-rs/canvas-darwin-arm64@0.1.97': - resolution: {integrity: sha512-ok+SCEF4YejcxuJ9Rm+WWunHHpf2HmiPxfz6z1a/NFQECGXtsY7A4B8XocK1LmT1D7P174MzwPF9Wy3AUAwEPw==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [darwin] - - '@napi-rs/canvas-darwin-x64@0.1.97': - resolution: {integrity: sha512-PUP6e6/UGlclUvAQNnuXCcnkpdUou6VYZfQOQxExLp86epOylmiwLkqXIvpFmjoTEDmPmXrI+coL/9EFU1gKPA==} - engines: {node: '>= 10'} - cpu: [x64] - os: [darwin] - - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97': - resolution: {integrity: sha512-XyXH2L/cic8eTNtbrXCcvqHtMX/nEOxN18+7rMrAM2XtLYC/EB5s0wnO1FsLMWmK+04ZSLN9FBGipo7kpIkcOw==} - engines: {node: '>= 10'} - cpu: [arm] - os: [linux] - - '@napi-rs/canvas-linux-arm64-gnu@0.1.97': - resolution: {integrity: sha512-Kuq/M3djq0K8ktgz6nPlK7Ne5d4uWeDxPpyKWOjWDK2RIOhHVtLtyLiJw2fuldw7Vn4mhw05EZXCEr4Q76rs9w==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [linux] - - '@napi-rs/canvas-linux-arm64-musl@0.1.97': - resolution: {integrity: sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [linux] - - '@napi-rs/canvas-linux-riscv64-gnu@0.1.97': - resolution: {integrity: sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==} - engines: {node: '>= 10'} - cpu: [riscv64] - os: [linux] - - '@napi-rs/canvas-linux-x64-gnu@0.1.97': - resolution: {integrity: sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==} - engines: {node: '>= 10'} - cpu: [x64] - os: [linux] - - '@napi-rs/canvas-linux-x64-musl@0.1.97': - resolution: {integrity: sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==} - engines: {node: '>= 10'} - cpu: [x64] - os: [linux] - - '@napi-rs/canvas-win32-arm64-msvc@0.1.97': - resolution: {integrity: sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [win32] - - '@napi-rs/canvas-win32-x64-msvc@0.1.97': - resolution: {integrity: sha512-sWtD2EE3fV0IzN+iiQUqr/Q1SwqWhs2O1FKItFlxtdDkikpEj5g7DKQpY3x55H/MAOnL8iomnlk3mcEeGiUMoQ==} - engines: {node: '>= 10'} - cpu: [x64] - os: [win32] - - '@napi-rs/canvas@0.1.97': - resolution: {integrity: sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==} - engines: {node: '>= 10'} - '@napi-rs/wasm-runtime@0.2.12': resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==} @@ -8551,9 +8484,6 @@ packages: node-int64@0.4.0: resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==} - node-readable-to-web-readable-stream@0.4.2: - resolution: {integrity: sha512-/cMZNI34v//jUTrI+UIo4ieHAB5EZRY/+7OmXZgBxaWBMcW2tGdceIw06RFxWxrKZ5Jp3sI2i5TsRo+CBhtVLQ==} - node-releases@2.0.27: resolution: {integrity: sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==} @@ -8845,10 +8775,6 @@ packages: pause@0.0.1: resolution: {integrity: sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg==} - pdfjs-dist@5.5.207: - resolution: {integrity: sha512-WMqqw06w1vUt9ZfT0gOFhMf3wHsWhaCrxGrckGs5Cci6ybDW87IvPaOd2pnBwT6BJuP/CzXDZxjFgmSULLdsdw==} - engines: {node: '>=20.19.0 || >=22.13.0 || >=24'} - peberminta@0.9.0: resolution: {integrity: sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ==} @@ -10328,6 +10254,7 @@ packages: uuid@10.0.0: resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true uuid@11.1.0: @@ -12193,6 +12120,8 @@ snapshots: '@csstools/css-tokenizer@3.0.3': {} + '@docmost/pdf-inspector@1.9.4': {} + '@emnapi/core@1.8.1': dependencies: '@emnapi/wasi-threads': 1.1.0 @@ -13193,54 +13122,6 @@ snapshots: '@msgpackr-extract/msgpackr-extract-win32-x64@3.0.2': optional: true - '@napi-rs/canvas-android-arm64@0.1.97': - optional: true - - '@napi-rs/canvas-darwin-arm64@0.1.97': - optional: true - - '@napi-rs/canvas-darwin-x64@0.1.97': - optional: true - - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97': - optional: true - - '@napi-rs/canvas-linux-arm64-gnu@0.1.97': - optional: true - - '@napi-rs/canvas-linux-arm64-musl@0.1.97': - optional: true - - '@napi-rs/canvas-linux-riscv64-gnu@0.1.97': - optional: true - - '@napi-rs/canvas-linux-x64-gnu@0.1.97': - optional: true - - '@napi-rs/canvas-linux-x64-musl@0.1.97': - optional: true - - '@napi-rs/canvas-win32-arm64-msvc@0.1.97': - optional: true - - '@napi-rs/canvas-win32-x64-msvc@0.1.97': - optional: true - - '@napi-rs/canvas@0.1.97': - optionalDependencies: - '@napi-rs/canvas-android-arm64': 0.1.97 - '@napi-rs/canvas-darwin-arm64': 0.1.97 - '@napi-rs/canvas-darwin-x64': 0.1.97 - '@napi-rs/canvas-linux-arm-gnueabihf': 0.1.97 - '@napi-rs/canvas-linux-arm64-gnu': 0.1.97 - '@napi-rs/canvas-linux-arm64-musl': 0.1.97 - '@napi-rs/canvas-linux-riscv64-gnu': 0.1.97 - '@napi-rs/canvas-linux-x64-gnu': 0.1.97 - '@napi-rs/canvas-linux-x64-musl': 0.1.97 - '@napi-rs/canvas-win32-arm64-msvc': 0.1.97 - '@napi-rs/canvas-win32-x64-msvc': 0.1.97 - optional: true - '@napi-rs/wasm-runtime@0.2.12': dependencies: '@emnapi/core': 1.8.1 @@ -19627,9 +19508,6 @@ snapshots: node-int64@0.4.0: {} - node-readable-to-web-readable-stream@0.4.2: - optional: true - node-releases@2.0.27: {} nodemailer@8.0.5: {} @@ -19981,11 +19859,6 @@ snapshots: pause@0.0.1: {} - pdfjs-dist@5.5.207: - optionalDependencies: - '@napi-rs/canvas': 0.1.97 - node-readable-to-web-readable-stream: 0.4.2 - peberminta@0.9.0: {} pend@1.2.0: {}