From c247d4c1e33baec9772d4497a7b452dc16f2e91f Mon Sep 17 00:00:00 2001
From: Philip Okugbe <16838612+Philipinho@users.noreply.github.com>
Date: Fri, 1 May 2026 14:56:39 +0100
Subject: [PATCH] feat(ee): PDF import (#2142)
* feat: replace pdfjs-dist with firecrawl-pdf-inspector
* use modified firecrawl-pdf-inspector
* feat: pdf import
* increase single file upload size limit
* use the published npm package (@docmost/pdf-inspector)
* sync
* update package
---
apps/client/src/ee/features.ts | 1 +
.../page/components/page-import-modal.tsx | 30 +++-
apps/server/package.json | 2 +-
apps/server/src/common/features.ts | 1 +
.../integrations/import/import.controller.ts | 5 +-
.../import/services/import.service.ts | 51 +++++-
pnpm-lock.yaml | 145 ++----------------
7 files changed, 93 insertions(+), 142 deletions(-)
diff --git a/apps/client/src/ee/features.ts b/apps/client/src/ee/features.ts
index a9ab8b0d..cacf851f 100644
--- a/apps/client/src/ee/features.ts
+++ b/apps/client/src/ee/features.ts
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx',
+ PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp',
diff --git a/apps/client/src/features/page/components/page-import-modal.tsx b/apps/client/src/features/page/components/page-import-modal.tsx
index df6691d5..c1c12dc4 100644
--- a/apps/client/src/features/page/components/page-import-modal.tsx
+++ b/apps/client/src/features/page/components/page-import-modal.tsx
@@ -12,6 +12,7 @@ import {
IconCheck,
IconFileCode,
IconFileTypeDocx,
+ IconFileTypePdf,
IconFileTypeZip,
IconMarkdown,
IconX,
@@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
const markdownFileRef = useRef<() => void>(null);
const htmlFileRef = useRef<() => void>(null);
const docxFileRef = useRef<() => void>(null);
+ const pdfFileRef = useRef<() => void>(null);
const notionFileRef = useRef<() => void>(null);
const confluenceFileRef = useRef<() => void>(null);
const zipFileRef = useRef<() => void>(null);
const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT);
const canUseDocx = useHasFeature(Feature.DOCX_IMPORT);
+ const canUsePdf = useHasFeature(Feature.PDF_IMPORT);
const upgradeLabel = useUpgradeLabel();
const handleZipUpload = async (selectedFile: File, source: string) => {
@@ -244,7 +247,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
}, 3000);
}, [fileTaskId]);
- const maxSingleFileSize = bytes("20mb");
+ const maxSingleFileSize = bytes("30mb");
const handleFileUpload = async (selectedFiles: File[]) => {
if (!selectedFiles) {
@@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
if (markdownFileRef.current) markdownFileRef.current();
if (htmlFileRef.current) htmlFileRef.current();
if (docxFileRef.current) docxFileRef.current();
+ if (pdfFileRef.current) pdfFileRef.current();
const pageCountText =
pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`;
@@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
)}
+
+ {(props) => (
+
+ }
+ {...props}
+ >
+ PDF
+
+
+ )}
+
+
handleZipUpload(file, "notion")}
accept="application/zip"
diff --git a/apps/server/package.json b/apps/server/package.json
index 79492487..c0eeddf3 100644
--- a/apps/server/package.json
+++ b/apps/server/package.json
@@ -37,6 +37,7 @@
"@aws-sdk/lib-storage": "3.1037.0",
"@aws-sdk/s3-request-presigner": "3.1037.0",
"@clickhouse/client": "^1.18.2",
+ "@docmost/pdf-inspector": "1.9.4",
"@fastify/cookie": "^11.0.2",
"@fastify/multipart": "^10.0.0",
"@fastify/static": "^9.1.3",
@@ -100,7 +101,6 @@
"p-limit": "^7.3.0",
"passport-google-oauth20": "^2.0.0",
"passport-jwt": "^4.0.1",
- "pdfjs-dist": "^5.5.207",
"pg-tsquery": "^8.4.2",
"pgvector": "^0.2.1",
"pino-http": "^11.0.0",
diff --git a/apps/server/src/common/features.ts b/apps/server/src/common/features.ts
index 38f226a8..c5fd9a20 100644
--- a/apps/server/src/common/features.ts
+++ b/apps/server/src/common/features.ts
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx',
+ PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp',
diff --git a/apps/server/src/integrations/import/import.controller.ts b/apps/server/src/integrations/import/import.controller.ts
index 7ee325e5..cd2341ea 100644
--- a/apps/server/src/integrations/import/import.controller.ts
+++ b/apps/server/src/integrations/import/import.controller.ts
@@ -51,9 +51,9 @@ export class ImportController {
@AuthUser() user: User,
@AuthWorkspace() workspace: Workspace,
) {
- const validFileExtensions = ['.md', '.html', '.docx'];
+ const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];
- const maxFileSize = bytes('20mb');
+ const maxFileSize = bytes('30mb');
let file = null;
try {
@@ -102,6 +102,7 @@ export class ImportController {
'.md': 'markdown',
'.html': 'html',
'.docx': 'docx',
+ '.pdf': 'pdf',
};
if (createdPage) {
diff --git a/apps/server/src/integrations/import/services/import.service.ts b/apps/server/src/integrations/import/services/import.service.ts
index 0eb3ae40..1eb10ca8 100644
--- a/apps/server/src/integrations/import/services/import.service.ts
+++ b/apps/server/src/integrations/import/services/import.service.ts
@@ -63,7 +63,10 @@ export class ImportService {
let createdPage = null;
// For DOCX, we need the page ID upfront so images can reference it
- const pageId = fileExtension === '.docx' ? uuid7() : undefined;
+ const pageId =
+ fileExtension === '.docx' || fileExtension === '.pdf'
+ ? uuid7()
+ : undefined;
try {
if (fileExtension.endsWith('.md')) {
@@ -78,6 +81,14 @@ export class ImportService {
pageId,
userId,
);
+ } else if (fileExtension.endsWith('.pdf')) {
+ prosemirrorState = await this.processPdf(
+ fileBuffer,
+ workspaceId,
+ spaceId,
+ pageId,
+ userId,
+ );
}
} catch (err) {
const message = 'Error processing file content';
@@ -156,7 +167,7 @@ export class ImportService {
let DocxImportModule: any;
try {
// eslint-disable-next-line @typescript-eslint/no-require-imports
- DocxImportModule = require('./../../../ee/docx-import/docx-import.service');
+ DocxImportModule = require('./../../../ee/document-import/docx-import.service');
} catch (err) {
this.logger.error(
'DOCX import requested but EE module not bundled in this build',
@@ -182,6 +193,42 @@ export class ImportService {
return this.processHTML(html);
}
+ async processPdf(
+ fileBuffer: Buffer,
+ workspaceId: string,
+ spaceId: string,
+ pageId: string,
+ userId: string,
+ ): Promise {
+ let PdfImportModule: any;
+ try {
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
+ PdfImportModule = require('./../../../ee/document-import/pdf-import.service');
+ } catch (err) {
+ this.logger.error(
+ 'PDF import requested but EE module not bundled in this build',
+ );
+ throw new BadRequestException(
+ 'This feature requires a valid enterprise license.',
+ );
+ }
+
+ const pdfImportService = this.moduleRef.get(
+ PdfImportModule.PdfImportService,
+ { strict: false },
+ );
+
+ const html = await pdfImportService.convertPdfToHtml(
+ fileBuffer,
+ workspaceId,
+ spaceId,
+ pageId,
+ userId,
+ );
+
+ return this.processHTML(html);
+ }
+
async createYdoc(prosemirrorJson: any): Promise {
if (prosemirrorJson) {
// this.logger.debug(`Converting prosemirror json state to ydoc`);
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 3985bc75..f6e1a757 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -482,6 +482,9 @@ importers:
'@clickhouse/client':
specifier: ^1.18.2
version: 1.18.2
+ '@docmost/pdf-inspector':
+ specifier: 1.9.4
+ version: 1.9.4
'@fastify/cookie':
specifier: ^11.0.2
version: 11.0.2
@@ -671,9 +674,6 @@ importers:
passport-jwt:
specifier: ^4.0.1
version: 4.0.1
- pdfjs-dist:
- specifier: ^5.5.207
- version: 5.5.207
pg-tsquery:
specifier: ^8.4.2
version: 8.4.2
@@ -1826,6 +1826,9 @@ packages:
resolution: {integrity: sha512-UJnjoFsmxfKUdNYdWgOB0mWUypuLvAfQPH1+pyvRJs6euowbFkFC6P13w1l8mJyi3vxYMxc9kld5jZEGRQs6bw==}
engines: {node: '>=18'}
+ '@docmost/pdf-inspector@1.9.4':
+ resolution: {integrity: sha512-G5DNyDtLNxybTXWakqi7PuOEuSb/A2ZjDlv2WCkOkiHszPeILdrC+G0a4e4UP10yxvzuLfb23pJ5jy8fUSYZPw==}
+
'@emnapi/core@1.8.1':
resolution: {integrity: sha512-AvT9QFpxK0Zd8J0jopedNm+w/2fIzvtPKPjqyw9jwvBaReTTqPBk9Hixaz7KbjimP+QNz605/XnjFcDAL2pqBg==}
@@ -2762,76 +2765,6 @@ packages:
cpu: [x64]
os: [win32]
- '@napi-rs/canvas-android-arm64@0.1.97':
- resolution: {integrity: sha512-V1c/WVw+NzH8vk7ZK/O8/nyBSCQimU8sfMsB/9qeSvdkGKNU7+mxy/bIF0gTgeBFmHpj30S4E9WHMSrxXGQuVQ==}
- engines: {node: '>= 10'}
- cpu: [arm64]
- os: [android]
-
- '@napi-rs/canvas-darwin-arm64@0.1.97':
- resolution: {integrity: sha512-ok+SCEF4YejcxuJ9Rm+WWunHHpf2HmiPxfz6z1a/NFQECGXtsY7A4B8XocK1LmT1D7P174MzwPF9Wy3AUAwEPw==}
- engines: {node: '>= 10'}
- cpu: [arm64]
- os: [darwin]
-
- '@napi-rs/canvas-darwin-x64@0.1.97':
- resolution: {integrity: sha512-PUP6e6/UGlclUvAQNnuXCcnkpdUou6VYZfQOQxExLp86epOylmiwLkqXIvpFmjoTEDmPmXrI+coL/9EFU1gKPA==}
- engines: {node: '>= 10'}
- cpu: [x64]
- os: [darwin]
-
- '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97':
- resolution: {integrity: sha512-XyXH2L/cic8eTNtbrXCcvqHtMX/nEOxN18+7rMrAM2XtLYC/EB5s0wnO1FsLMWmK+04ZSLN9FBGipo7kpIkcOw==}
- engines: {node: '>= 10'}
- cpu: [arm]
- os: [linux]
-
- '@napi-rs/canvas-linux-arm64-gnu@0.1.97':
- resolution: {integrity: sha512-Kuq/M3djq0K8ktgz6nPlK7Ne5d4uWeDxPpyKWOjWDK2RIOhHVtLtyLiJw2fuldw7Vn4mhw05EZXCEr4Q76rs9w==}
- engines: {node: '>= 10'}
- cpu: [arm64]
- os: [linux]
-
- '@napi-rs/canvas-linux-arm64-musl@0.1.97':
- resolution: {integrity: sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==}
- engines: {node: '>= 10'}
- cpu: [arm64]
- os: [linux]
-
- '@napi-rs/canvas-linux-riscv64-gnu@0.1.97':
- resolution: {integrity: sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==}
- engines: {node: '>= 10'}
- cpu: [riscv64]
- os: [linux]
-
- '@napi-rs/canvas-linux-x64-gnu@0.1.97':
- resolution: {integrity: sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==}
- engines: {node: '>= 10'}
- cpu: [x64]
- os: [linux]
-
- '@napi-rs/canvas-linux-x64-musl@0.1.97':
- resolution: {integrity: sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==}
- engines: {node: '>= 10'}
- cpu: [x64]
- os: [linux]
-
- '@napi-rs/canvas-win32-arm64-msvc@0.1.97':
- resolution: {integrity: sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==}
- engines: {node: '>= 10'}
- cpu: [arm64]
- os: [win32]
-
- '@napi-rs/canvas-win32-x64-msvc@0.1.97':
- resolution: {integrity: sha512-sWtD2EE3fV0IzN+iiQUqr/Q1SwqWhs2O1FKItFlxtdDkikpEj5g7DKQpY3x55H/MAOnL8iomnlk3mcEeGiUMoQ==}
- engines: {node: '>= 10'}
- cpu: [x64]
- os: [win32]
-
- '@napi-rs/canvas@0.1.97':
- resolution: {integrity: sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==}
- engines: {node: '>= 10'}
-
'@napi-rs/wasm-runtime@0.2.12':
resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==}
@@ -8551,9 +8484,6 @@ packages:
node-int64@0.4.0:
resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==}
- node-readable-to-web-readable-stream@0.4.2:
- resolution: {integrity: sha512-/cMZNI34v//jUTrI+UIo4ieHAB5EZRY/+7OmXZgBxaWBMcW2tGdceIw06RFxWxrKZ5Jp3sI2i5TsRo+CBhtVLQ==}
-
node-releases@2.0.27:
resolution: {integrity: sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==}
@@ -8845,10 +8775,6 @@ packages:
pause@0.0.1:
resolution: {integrity: sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg==}
- pdfjs-dist@5.5.207:
- resolution: {integrity: sha512-WMqqw06w1vUt9ZfT0gOFhMf3wHsWhaCrxGrckGs5Cci6ybDW87IvPaOd2pnBwT6BJuP/CzXDZxjFgmSULLdsdw==}
- engines: {node: '>=20.19.0 || >=22.13.0 || >=24'}
-
peberminta@0.9.0:
resolution: {integrity: sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ==}
@@ -10328,6 +10254,7 @@ packages:
uuid@10.0.0:
resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==}
+ deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028).
hasBin: true
uuid@11.1.0:
@@ -12193,6 +12120,8 @@ snapshots:
'@csstools/css-tokenizer@3.0.3': {}
+ '@docmost/pdf-inspector@1.9.4': {}
+
'@emnapi/core@1.8.1':
dependencies:
'@emnapi/wasi-threads': 1.1.0
@@ -13193,54 +13122,6 @@ snapshots:
'@msgpackr-extract/msgpackr-extract-win32-x64@3.0.2':
optional: true
- '@napi-rs/canvas-android-arm64@0.1.97':
- optional: true
-
- '@napi-rs/canvas-darwin-arm64@0.1.97':
- optional: true
-
- '@napi-rs/canvas-darwin-x64@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-arm64-gnu@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-arm64-musl@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-riscv64-gnu@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-x64-gnu@0.1.97':
- optional: true
-
- '@napi-rs/canvas-linux-x64-musl@0.1.97':
- optional: true
-
- '@napi-rs/canvas-win32-arm64-msvc@0.1.97':
- optional: true
-
- '@napi-rs/canvas-win32-x64-msvc@0.1.97':
- optional: true
-
- '@napi-rs/canvas@0.1.97':
- optionalDependencies:
- '@napi-rs/canvas-android-arm64': 0.1.97
- '@napi-rs/canvas-darwin-arm64': 0.1.97
- '@napi-rs/canvas-darwin-x64': 0.1.97
- '@napi-rs/canvas-linux-arm-gnueabihf': 0.1.97
- '@napi-rs/canvas-linux-arm64-gnu': 0.1.97
- '@napi-rs/canvas-linux-arm64-musl': 0.1.97
- '@napi-rs/canvas-linux-riscv64-gnu': 0.1.97
- '@napi-rs/canvas-linux-x64-gnu': 0.1.97
- '@napi-rs/canvas-linux-x64-musl': 0.1.97
- '@napi-rs/canvas-win32-arm64-msvc': 0.1.97
- '@napi-rs/canvas-win32-x64-msvc': 0.1.97
- optional: true
-
'@napi-rs/wasm-runtime@0.2.12':
dependencies:
'@emnapi/core': 1.8.1
@@ -19627,9 +19508,6 @@ snapshots:
node-int64@0.4.0: {}
- node-readable-to-web-readable-stream@0.4.2:
- optional: true
-
node-releases@2.0.27: {}
nodemailer@8.0.5: {}
@@ -19981,11 +19859,6 @@ snapshots:
pause@0.0.1: {}
- pdfjs-dist@5.5.207:
- optionalDependencies:
- '@napi-rs/canvas': 0.1.97
- node-readable-to-web-readable-stream: 0.4.2
-
peberminta@0.9.0: {}
pend@1.2.0: {}