mirror of
https://github.com/docmost/docmost.git
synced 2026-05-07 06:23:06 +08:00
feat(ee): PDF import (#2142)
* feat: replace pdfjs-dist with firecrawl-pdf-inspector * use modified firecrawl-pdf-inspector * feat: pdf import * increase single file upload size limit * use npm package * sync * update package
This commit is contained in:
@@ -8,6 +8,7 @@ export const Feature = {
|
||||
AI: 'ai',
|
||||
CONFLUENCE_IMPORT: 'import:confluence',
|
||||
DOCX_IMPORT: 'import:docx',
|
||||
PDF_IMPORT: 'import:pdf',
|
||||
ATTACHMENT_INDEXING: 'attachment:indexing',
|
||||
SECURITY_SETTINGS: 'security:settings',
|
||||
MCP: 'mcp',
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
IconCheck,
|
||||
IconFileCode,
|
||||
IconFileTypeDocx,
|
||||
IconFileTypePdf,
|
||||
IconFileTypeZip,
|
||||
IconMarkdown,
|
||||
IconX,
|
||||
@@ -90,12 +91,14 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
|
||||
const markdownFileRef = useRef<() => void>(null);
|
||||
const htmlFileRef = useRef<() => void>(null);
|
||||
const docxFileRef = useRef<() => void>(null);
|
||||
const pdfFileRef = useRef<() => void>(null);
|
||||
const notionFileRef = useRef<() => void>(null);
|
||||
const confluenceFileRef = useRef<() => void>(null);
|
||||
const zipFileRef = useRef<() => void>(null);
|
||||
|
||||
const canUseConfluence = useHasFeature(Feature.CONFLUENCE_IMPORT);
|
||||
const canUseDocx = useHasFeature(Feature.DOCX_IMPORT);
|
||||
const canUsePdf = useHasFeature(Feature.PDF_IMPORT);
|
||||
const upgradeLabel = useUpgradeLabel();
|
||||
|
||||
const handleZipUpload = async (selectedFile: File, source: string) => {
|
||||
@@ -244,7 +247,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
|
||||
}, 3000);
|
||||
}, [fileTaskId]);
|
||||
|
||||
const maxSingleFileSize = bytes("20mb");
|
||||
const maxSingleFileSize = bytes("30mb");
|
||||
|
||||
const handleFileUpload = async (selectedFiles: File[]) => {
|
||||
if (!selectedFiles) {
|
||||
@@ -298,6 +301,7 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
|
||||
if (markdownFileRef.current) markdownFileRef.current();
|
||||
if (htmlFileRef.current) htmlFileRef.current();
|
||||
if (docxFileRef.current) docxFileRef.current();
|
||||
if (pdfFileRef.current) pdfFileRef.current();
|
||||
|
||||
const pageCountText =
|
||||
pageCount === 1 ? `1 ${t("page")}` : `${pageCount} ${t("pages")}`;
|
||||
@@ -378,6 +382,30 @@ function ImportFormatSelection({ spaceId, onClose }: ImportFormatSelection) {
|
||||
)}
|
||||
</FileButton>
|
||||
|
||||
<FileButton
|
||||
onChange={handleFileUpload}
|
||||
accept=".pdf"
|
||||
multiple
|
||||
resetRef={pdfFileRef}
|
||||
>
|
||||
{(props) => (
|
||||
<Tooltip
|
||||
label={upgradeLabel}
|
||||
disabled={canUsePdf}
|
||||
>
|
||||
<Button
|
||||
disabled={!canUsePdf}
|
||||
justify="start"
|
||||
variant="default"
|
||||
leftSection={<IconFileTypePdf size={18} />}
|
||||
{...props}
|
||||
>
|
||||
PDF
|
||||
</Button>
|
||||
</Tooltip>
|
||||
)}
|
||||
</FileButton>
|
||||
|
||||
<FileButton
|
||||
onChange={(file) => handleZipUpload(file, "notion")}
|
||||
accept="application/zip"
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
"@aws-sdk/lib-storage": "3.1037.0",
|
||||
"@aws-sdk/s3-request-presigner": "3.1037.0",
|
||||
"@clickhouse/client": "^1.18.2",
|
||||
"@docmost/pdf-inspector": "1.9.4",
|
||||
"@fastify/cookie": "^11.0.2",
|
||||
"@fastify/multipart": "^10.0.0",
|
||||
"@fastify/static": "^9.1.3",
|
||||
@@ -100,7 +101,6 @@
|
||||
"p-limit": "^7.3.0",
|
||||
"passport-google-oauth20": "^2.0.0",
|
||||
"passport-jwt": "^4.0.1",
|
||||
"pdfjs-dist": "^5.5.207",
|
||||
"pg-tsquery": "^8.4.2",
|
||||
"pgvector": "^0.2.1",
|
||||
"pino-http": "^11.0.0",
|
||||
|
||||
@@ -8,6 +8,7 @@ export const Feature = {
|
||||
AI: 'ai',
|
||||
CONFLUENCE_IMPORT: 'import:confluence',
|
||||
DOCX_IMPORT: 'import:docx',
|
||||
PDF_IMPORT: 'import:pdf',
|
||||
ATTACHMENT_INDEXING: 'attachment:indexing',
|
||||
SECURITY_SETTINGS: 'security:settings',
|
||||
MCP: 'mcp',
|
||||
|
||||
@@ -51,9 +51,9 @@ export class ImportController {
|
||||
@AuthUser() user: User,
|
||||
@AuthWorkspace() workspace: Workspace,
|
||||
) {
|
||||
const validFileExtensions = ['.md', '.html', '.docx'];
|
||||
const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];
|
||||
|
||||
const maxFileSize = bytes('20mb');
|
||||
const maxFileSize = bytes('30mb');
|
||||
|
||||
let file = null;
|
||||
try {
|
||||
@@ -102,6 +102,7 @@ export class ImportController {
|
||||
'.md': 'markdown',
|
||||
'.html': 'html',
|
||||
'.docx': 'docx',
|
||||
'.pdf': 'pdf',
|
||||
};
|
||||
|
||||
if (createdPage) {
|
||||
|
||||
@@ -63,7 +63,10 @@ export class ImportService {
|
||||
let createdPage = null;
|
||||
|
||||
// For DOCX, we need the page ID upfront so images can reference it
|
||||
const pageId = fileExtension === '.docx' ? uuid7() : undefined;
|
||||
const pageId =
|
||||
fileExtension === '.docx' || fileExtension === '.pdf'
|
||||
? uuid7()
|
||||
: undefined;
|
||||
|
||||
try {
|
||||
if (fileExtension.endsWith('.md')) {
|
||||
@@ -78,6 +81,14 @@ export class ImportService {
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
} else if (fileExtension.endsWith('.pdf')) {
|
||||
prosemirrorState = await this.processPdf(
|
||||
fileBuffer,
|
||||
workspaceId,
|
||||
spaceId,
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
const message = 'Error processing file content';
|
||||
@@ -156,7 +167,7 @@ export class ImportService {
|
||||
let DocxImportModule: any;
|
||||
try {
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
DocxImportModule = require('./../../../ee/docx-import/docx-import.service');
|
||||
DocxImportModule = require('./../../../ee/document-import/docx-import.service');
|
||||
} catch (err) {
|
||||
this.logger.error(
|
||||
'DOCX import requested but EE module not bundled in this build',
|
||||
@@ -182,6 +193,42 @@ export class ImportService {
|
||||
return this.processHTML(html);
|
||||
}
|
||||
|
||||
async processPdf(
|
||||
fileBuffer: Buffer,
|
||||
workspaceId: string,
|
||||
spaceId: string,
|
||||
pageId: string,
|
||||
userId: string,
|
||||
): Promise<any> {
|
||||
let PdfImportModule: any;
|
||||
try {
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
PdfImportModule = require('./../../../ee/document-import/pdf-import.service');
|
||||
} catch (err) {
|
||||
this.logger.error(
|
||||
'PDF import requested but EE module not bundled in this build',
|
||||
);
|
||||
throw new BadRequestException(
|
||||
'This feature requires a valid enterprise license.',
|
||||
);
|
||||
}
|
||||
|
||||
const pdfImportService = this.moduleRef.get(
|
||||
PdfImportModule.PdfImportService,
|
||||
{ strict: false },
|
||||
);
|
||||
|
||||
const html = await pdfImportService.convertPdfToHtml(
|
||||
fileBuffer,
|
||||
workspaceId,
|
||||
spaceId,
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
|
||||
return this.processHTML(html);
|
||||
}
|
||||
|
||||
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
|
||||
if (prosemirrorJson) {
|
||||
// this.logger.debug(`Converting prosemirror json state to ydoc`);
|
||||
|
||||
Reference in New Issue
Block a user