feat(ee): PDF import (#2142)

* feat: replace pdfjs-dist with firecrawl-pdf-inspector

* use modified firecrawl-pdf-inspector

* feat: pdf import

* increase single file upload size limit

* use npm package

* sync

* update package
This commit is contained in:
Philip Okugbe
2026-05-01 14:56:39 +01:00
committed by GitHub
parent 641ce142df
commit c247d4c1e3
7 changed files with 93 additions and 142 deletions
+1
View File
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx',
PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp',
@@ -12,6 +12,7 @@ import {
IconCheck,
IconFileCode,
IconFileTypeDocx,
IconFileTypePdf,
IconFileTypeZip,
IconMarkdown,
IconX,
@@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
const markdownFileRef = useRef<() => void>(null);
const htmlFileRef = useRef<() => void>(null);
const docxFileRef = useRef<() => void>(null);
const pdfFileRef = useRef<() => void>(null);
const notionFileRef = useRef<() => void>(null);
const confluenceFileRef = useRef<() => void>(null);
const zipFileRef = useRef<() => void>(null);
const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT);
const canUseDocx = useHasFeature(Feature.DOCX_IMPORT);
const canUsePdf = useHasFeature(Feature.PDF_IMPORT);
const upgradeLabel = useUpgradeLabel();
const handleZipUpload = async (selectedFile: File, source: string) => {
@@ -244,7 +247,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
}, 3000);
}, [fileTaskId]);
const maxSingleFileSize = bytes("20mb");
const maxSingleFileSize = bytes("30mb");
const handleFileUpload = async (selectedFiles: File[]) => {
if (!selectedFiles) {
@@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
if (markdownFileRef.current) markdownFileRef.current();
if (htmlFileRef.current) htmlFileRef.current();
if (docxFileRef.current) docxFileRef.current();
if (pdfFileRef.current) pdfFileRef.current();
const pageCountText =
pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`;
@@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
)}
</FileButton>
<FileButton
onChange={handleFileUpload}
accept=".pdf"
multiple
resetRef={pdfFileRef}
>
{(props) => (
<Tooltip
label={upgradeLabel}
disabled={canUsePdf}
>
<Button
disabled={!canUsePdf}
justify="start"
variant="default"
leftSection={<IconFileTypePdf size={18} />}
{...props}
>
PDF
</Button>
</Tooltip>
)}
</FileButton>
<FileButton
onChange={(file) => handleZipUpload(file, "notion")}
accept="application/zip"
+1 -1
View File
@@ -37,6 +37,7 @@
"@aws-sdk/lib-storage": "3.1037.0",
"@aws-sdk/s3-request-presigner": "3.1037.0",
"@clickhouse/client": "^1.18.2",
"@docmost/pdf-inspector": "1.9.4",
"@fastify/cookie": "^11.0.2",
"@fastify/multipart": "^10.0.0",
"@fastify/static": "^9.1.3",
@@ -100,7 +101,6 @@
"p-limit": "^7.3.0",
"passport-google-oauth20": "^2.0.0",
"passport-jwt": "^4.0.1",
"pdfjs-dist": "^5.5.207",
"pg-tsquery": "^8.4.2",
"pgvector": "^0.2.1",
"pino-http": "^11.0.0",
+1
View File
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx',
PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp',
@@ -51,9 +51,9 @@ export class ImportController {
@AuthUser() user: User,
@AuthWorkspace() workspace: Workspace,
) {
const validFileExtensions = ['.md', '.html', '.docx'];
const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];
const maxFileSize = bytes('20mb');
const maxFileSize = bytes('30mb');
let file = null;
try {
@@ -102,6 +102,7 @@ export class ImportController {
'.md': 'markdown',
'.html': 'html',
'.docx': 'docx',
'.pdf': 'pdf',
};
if (createdPage) {
@@ -63,7 +63,10 @@ export class ImportService {
let createdPage = null;
// For DOCX, we need the page ID upfront so images can reference it
const pageId = fileExtension === '.docx' ? uuid7() : undefined;
const pageId =
fileExtension === '.docx' || fileExtension === '.pdf'
? uuid7()
: undefined;
try {
if (fileExtension.endsWith('.md')) {
@@ -78,6 +81,14 @@ export class ImportService {
pageId,
userId,
);
} else if (fileExtension.endsWith('.pdf')) {
prosemirrorState = await this.processPdf(
fileBuffer,
workspaceId,
spaceId,
pageId,
userId,
);
}
} catch (err) {
const message = 'Error processing file content';
@@ -156,7 +167,7 @@ export class ImportService {
let DocxImportModule: any;
try {
// eslint-disable-next-line @typescript-eslint/no-require-imports
DocxImportModule = require('./../../../ee/docx-import/docx-import.service');
DocxImportModule = require('./../../../ee/document-import/docx-import.service');
} catch (err) {
this.logger.error(
'DOCX import requested but EE module not bundled in this build',
@@ -182,6 +193,42 @@ export class ImportService {
return this.processHTML(html);
}
async processPdf(
fileBuffer: Buffer,
workspaceId: string,
spaceId: string,
pageId: string,
userId: string,
): Promise<any> {
let PdfImportModule: any;
try {
// eslint-disable-next-line @typescript-eslint/no-require-imports
PdfImportModule = require('./../../../ee/document-import/pdf-import.service');
} catch (err) {
this.logger.error(
'PDF import requested but EE module not bundled in this build',
);
throw new BadRequestException(
'This feature requires a valid enterprise license.',
);
}
const pdfImportService = this.moduleRef.get(
PdfImportModule.PdfImportService,
{ strict: false },
);
const html = await pdfImportService.convertPdfToHtml(
fileBuffer,
workspaceId,
spaceId,
pageId,
userId,
);
return this.processHTML(html);
}
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
if (prosemirrorJson) {
// this.logger.debug(`Converting prosemirror json state to ydoc`);