feat(ee): PDF import (#2142)

* feat: replace pdfjs-dist with firecrawl-pdf-inspector
* use modified firecrawl-pdf-inspector
* feat: pdf import
* increase single file upload size limit
* use npm package
* sync
* update package
2026-05-07 06:23:06 +08:00 · 2026-05-01 14:56:39 +01:00
parent 641ce142df
commit c247d4c1e3
7 changed files with 93 additions and 142 deletions
@@ -8,6 +8,7 @@ export const Feature = {
  AI: 'ai',
  CONFLUENCE_IMPORT: 'import:confluence',
  DOCX_IMPORT: 'import:docx',
+  PDF_IMPORT: 'import:pdf',
  ATTACHMENT_INDEXING: 'attachment:indexing',
  SECURITY_SETTINGS: 'security:settings',
  MCP: 'mcp',
@@ -12,6 +12,7 @@ import {
  IconCheck,
  IconFileCode,
  IconFileTypeDocx,
+  IconFileTypePdf,
  IconFileTypeZip,
  IconMarkdown,
  IconX,
@@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
  const markdownFileRef = useRef<() => void>(null);
  const htmlFileRef = useRef<() => void>(null);
  const docxFileRef = useRef<() => void>(null);
+  const pdfFileRef = useRef<() => void>(null);
  const notionFileRef = useRef<() => void>(null);
  const confluenceFileRef = useRef<() => void>(null);
  const zipFileRef = useRef<() => void>(null);

  const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT);
  const canUseDocx = useHasFeature(Feature.DOCX_IMPORT);
+  const canUsePdf = useHasFeature(Feature.PDF_IMPORT);
  const upgradeLabel = useUpgradeLabel();

  const handleZipUpload = async (selectedFile: File, source: string) => {
@@ -244,7 +247,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
    }, 3000);
  }, [fileTaskId]);

-  const maxSingleFileSize = bytes("20mb");
+  const maxSingleFileSize = bytes("30mb");

  const handleFileUpload = async (selectedFiles: File[]) => {
    if (!selectedFiles) {
@@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
      if (markdownFileRef.current) markdownFileRef.current();
      if (htmlFileRef.current) htmlFileRef.current();
      if (docxFileRef.current) docxFileRef.current();
+      if (pdfFileRef.current) pdfFileRef.current();

      const pageCountText =
        pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`;
@@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
          )}
        </FileButton>

+        <FileButton
+          onChange={handleFileUpload}
+          accept=".pdf"
+          multiple
+          resetRef={pdfFileRef}
+        >
+          {(props) => (
+            <Tooltip
+              label={upgradeLabel}
+              disabled={canUsePdf}
+            >
+              <Button
+                disabled={!canUsePdf}
+                justify="start"
+                variant="default"
+                leftSection={<IconFileTypePdf size={18} />}
+                {...props}
+              >
+                PDF
+              </Button>
+            </Tooltip>
+          )}
+        </FileButton>
+
        <FileButton
          onChange={(file) => handleZipUpload(file, "notion")}
          accept="application/zip"
@@ -37,6 +37,7 @@
    "@aws-sdk/lib-storage": "3.1037.0",
    "@aws-sdk/s3-request-presigner": "3.1037.0",
    "@clickhouse/client": "^1.18.2",
+    "@docmost/pdf-inspector": "1.9.4",
    "@fastify/cookie": "^11.0.2",
    "@fastify/multipart": "^10.0.0",
    "@fastify/static": "^9.1.3",
@@ -100,7 +101,6 @@
    "p-limit": "^7.3.0",
    "passport-google-oauth20": "^2.0.0",
    "passport-jwt": "^4.0.1",
-    "pdfjs-dist": "^5.5.207",
    "pg-tsquery": "^8.4.2",
    "pgvector": "^0.2.1",
    "pino-http": "^11.0.0",
@@ -8,6 +8,7 @@ export const Feature = {
  AI: 'ai',
  CONFLUENCE_IMPORT: 'import:confluence',
  DOCX_IMPORT: 'import:docx',
+  PDF_IMPORT: 'import:pdf',
  ATTACHMENT_INDEXING: 'attachment:indexing',
  SECURITY_SETTINGS: 'security:settings',
  MCP: 'mcp',
@@ -51,9 +51,9 @@ export class ImportController {
    @AuthUser() user: User,
    @AuthWorkspace() workspace: Workspace,
  ) {
-    const validFileExtensions = ['.md', '.html', '.docx'];
+    const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];

-    const maxFileSize = bytes('20mb');
+    const maxFileSize = bytes('30mb');

    let file = null;
    try {
@@ -102,6 +102,7 @@ export class ImportController {
      '.md': 'markdown',
      '.html': 'html',
      '.docx': 'docx',
+      '.pdf': 'pdf',
    };

    if (createdPage) {
@@ -63,7 +63,10 @@ export class ImportService {
    let createdPage = null;

    // For DOCX and PDF, we need the page ID upfront so it can be passed to the converter (for DOCX, so images can reference it)
-    const pageId = fileExtension === '.docx' ? uuid7() : undefined;
+    const pageId =
+      fileExtension === '.docx' || fileExtension === '.pdf'
+        ? uuid7()
+        : undefined;

    try {
      if (fileExtension.endsWith('.md')) {
@@ -78,6 +81,14 @@ export class ImportService {
          pageId,
          userId,
        );
+      } else if (fileExtension.endsWith('.pdf')) {
+        prosemirrorState = await this.processPdf(
+          fileBuffer,
+          workspaceId,
+          spaceId,
+          pageId,
+          userId,
+        );
      }
    } catch (err) {
      const message = 'Error processing file content';
@@ -156,7 +167,7 @@ export class ImportService {
    let DocxImportModule: any;
    try {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
-      DocxImportModule = require('./../../../ee/docx-import/docx-import.service');
+      DocxImportModule = require('./../../../ee/document-import/docx-import.service');
    } catch (err) {
      this.logger.error(
        'DOCX import requested but EE module not bundled in this build',
@@ -182,6 +193,42 @@ export class ImportService {
    return this.processHTML(html);
  }

+  async processPdf(
+    fileBuffer: Buffer,
+    workspaceId: string,
+    spaceId: string,
+    pageId: string,
+    userId: string,
+  ): Promise<any> {
+    let PdfImportModule: any;
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      PdfImportModule = require('./../../../ee/document-import/pdf-import.service');
+    } catch (err) {
+      this.logger.error(
+        'PDF import requested but EE module not bundled in this build',
+      );
+      throw new BadRequestException(
+        'This feature requires a valid enterprise license.',
+      );
+    }
+
+    const pdfImportService = this.moduleRef.get(
+      PdfImportModule.PdfImportService,
+      { strict: false },
+    );
+
+    const html = await pdfImportService.convertPdfToHtml(
+      fileBuffer,
+      workspaceId,
+      spaceId,
+      pageId,
+      userId,
+    );
+
+    return this.processHTML(html);
+  }
+
  async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
    if (prosemirrorJson) {
      // this.logger.debug(`Converting prosemirror json state to ydoc`);