feat: pdf import

This commit is contained in:
Philipinho
2026-04-16 14:17:27 +01:00
parent ba9e4de036
commit 28b46dc0cb
6 changed files with 82 additions and 4 deletions
+1
View File
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai', AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence', CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx', DOCX_IMPORT: 'import:docx',
PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing', ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings', SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp', MCP: 'mcp',
@@ -12,6 +12,7 @@ import {
IconCheck, IconCheck,
IconFileCode, IconFileCode,
IconFileTypeDocx, IconFileTypeDocx,
IconFileTypePdf,
IconFileTypeZip, IconFileTypeZip,
IconMarkdown, IconMarkdown,
IconX, IconX,
@@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
const markdownFileRef = useRef<() => void>(null); const markdownFileRef = useRef<() => void>(null);
const htmlFileRef = useRef<() => void>(null); const htmlFileRef = useRef<() => void>(null);
const docxFileRef = useRef<() => void>(null); const docxFileRef = useRef<() => void>(null);
const pdfFileRef = useRef<() => void>(null);
const notionFileRef = useRef<() => void>(null); const notionFileRef = useRef<() => void>(null);
const confluenceFileRef = useRef<() => void>(null); const confluenceFileRef = useRef<() => void>(null);
const zipFileRef = useRef<() => void>(null); const zipFileRef = useRef<() => void>(null);
const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT); const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT);
const canUseDocx = useHasFeature(Feature.DOCX_IMPORT); const canUseDocx = useHasFeature(Feature.DOCX_IMPORT);
const canUsePdf = useHasFeature(Feature.PDF_IMPORT);
const upgradeLabel = useUpgradeLabel(); const upgradeLabel = useUpgradeLabel();
const handleZipUpload = async (selectedFile: File, source: string) => { const handleZipUpload = async (selectedFile: File, source: string) => {
@@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
if (markdownFileRef.current) markdownFileRef.current(); if (markdownFileRef.current) markdownFileRef.current();
if (htmlFileRef.current) htmlFileRef.current(); if (htmlFileRef.current) htmlFileRef.current();
if (docxFileRef.current) docxFileRef.current(); if (docxFileRef.current) docxFileRef.current();
if (pdfFileRef.current) pdfFileRef.current();
const pageCountText = const pageCountText =
pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`; pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`;
@@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
)} )}
</FileButton> </FileButton>
<FileButton
onChange={handleFileUpload}
accept=".pdf"
multiple
resetRef={pdfFileRef}
>
{(props) => (
<Tooltip
label={upgradeLabel}
disabled={canUsePdf}
>
<Button
disabled={!canUsePdf}
justify="start"
variant="default"
leftSection={<IconFileTypePdf size={18} />}
{...props}
>
PDF
</Button>
</Tooltip>
)}
</FileButton>
<FileButton <FileButton
onChange={(file) => handleZipUpload(file, "notion")} onChange={(file) => handleZipUpload(file, "notion")}
accept="application/zip" accept="application/zip"
+1
View File
@@ -8,6 +8,7 @@ export const Feature = {
AI: 'ai', AI: 'ai',
CONFLUENCE_IMPORT: 'import:confluence', CONFLUENCE_IMPORT: 'import:confluence',
DOCX_IMPORT: 'import:docx', DOCX_IMPORT: 'import:docx',
PDF_IMPORT: 'import:pdf',
ATTACHMENT_INDEXING: 'attachment:indexing', ATTACHMENT_INDEXING: 'attachment:indexing',
SECURITY_SETTINGS: 'security:settings', SECURITY_SETTINGS: 'security:settings',
MCP: 'mcp', MCP: 'mcp',
@@ -51,7 +51,7 @@ export class ImportController {
@AuthUser() user: User, @AuthUser() user: User,
@AuthWorkspace() workspace: Workspace, @AuthWorkspace() workspace: Workspace,
) { ) {
const validFileExtensions = ['.md', '.html', '.docx']; const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];
const maxFileSize = bytes('20mb'); const maxFileSize = bytes('20mb');
@@ -102,6 +102,7 @@ export class ImportController {
'.md': 'markdown', '.md': 'markdown',
'.html': 'html', '.html': 'html',
'.docx': 'docx', '.docx': 'docx',
'.pdf': 'pdf',
}; };
if (createdPage) { if (createdPage) {
@@ -62,7 +62,10 @@ export class ImportService {
let createdPage = null; let createdPage = null;
// For DOCX, we need the page ID upfront so images can reference it // For DOCX, we need the page ID upfront so images can reference it
const pageId = fileExtension === '.docx' ? uuid7() : undefined; const pageId =
fileExtension === '.docx' || fileExtension === '.pdf'
? uuid7()
: undefined;
try { try {
if (fileExtension.endsWith('.md')) { if (fileExtension.endsWith('.md')) {
@@ -77,6 +80,14 @@ export class ImportService {
pageId, pageId,
userId, userId,
); );
} else if (fileExtension.endsWith('.pdf')) {
prosemirrorState = await this.processPdf(
fileBuffer,
workspaceId,
spaceId,
pageId,
userId,
);
} }
} catch (err) { } catch (err) {
const message = 'Error processing file content'; const message = 'Error processing file content';
@@ -153,7 +164,7 @@ export class ImportService {
let DocxImportModule: any; let DocxImportModule: any;
try { try {
// eslint-disable-next-line @typescript-eslint/no-require-imports // eslint-disable-next-line @typescript-eslint/no-require-imports
DocxImportModule = require('./../../../ee/docx-import/docx-import.service'); DocxImportModule = require('./../../../ee/document-import/docx-import.service');
} catch (err) { } catch (err) {
this.logger.error( this.logger.error(
'DOCX import requested but EE module not bundled in this build', 'DOCX import requested but EE module not bundled in this build',
@@ -179,6 +190,42 @@ export class ImportService {
return this.processHTML(html); return this.processHTML(html);
} }
/**
 * Converts an uploaded PDF into the same document state produced for HTML
 * imports.
 *
 * The PDF converter lives in the enterprise (EE) tree and is loaded lazily
 * with `require`, so builds that do not bundle it can still start. The
 * converter's HTML output is handed to `processHTML`, and whatever that
 * returns is returned here.
 *
 * @param fileBuffer  Raw bytes of the uploaded PDF.
 * @param workspaceId Forwarded unchanged to the EE converter.
 * @param spaceId     Forwarded unchanged to the EE converter.
 * @param pageId      Forwarded unchanged to the EE converter.
 * @param userId      Forwarded unchanged to the EE converter.
 * @returns The result of `processHTML` for the converted HTML.
 * @throws BadRequestException when the EE PDF import module cannot be loaded.
 */
async processPdf(
  fileBuffer: Buffer,
  workspaceId: string,
  spaceId: string,
  pageId: string,
  userId: string,
): Promise<any> {
  let eeModule: any;

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    eeModule = require('./../../../ee/document-import/pdf-import.service');
  } catch (err) {
    // Any failure to load the module is reported to the caller as a
    // licensing problem; the underlying error is not surfaced.
    this.logger.error(
      'PDF import requested but EE module not bundled in this build',
    );
    throw new BadRequestException(
      'This feature requires a valid enterprise license.',
    );
  }

  // strict: false lets the lookup resolve the provider outside the current
  // module's own scope.
  const converter = this.moduleRef.get(eeModule.PdfImportService, {
    strict: false,
  });

  const convertedHtml = await converter.convertPdfToHtml(
    fileBuffer,
    workspaceId,
    spaceId,
    pageId,
    userId,
  );

  return this.processHTML(convertedHtml);
}
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> { async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
if (prosemirrorJson) { if (prosemirrorJson) {
// this.logger.debug(`Converting prosemirror json state to ydoc`); // this.logger.debug(`Converting prosemirror json state to ydoc`);