mirror of
https://github.com/docmost/docmost.git
synced 2026-05-09 07:43:06 +08:00
feat(ee): PDF import (#2142)
* feat: replace pdfjs-dist with firecrawl-pdf-inspector
* use modified firecrawl-pdf-inspector
* feat: pdf import
* increase single file upload size limit
* use npm package
* sync
* update package
This commit is contained in:
@@ -8,6 +8,7 @@ export const Feature = {
|
||||
AI: 'ai',
|
||||
CONFLUENCE_IMPORT: 'import:confluence',
|
||||
DOCX_IMPORT: 'import:docx',
|
||||
PDF_IMPORT: 'import:pdf',
|
||||
ATTACHMENT_INDEXING: 'attachment:indexing',
|
||||
SECURITY_SETTINGS: 'security:settings',
|
||||
MCP: 'mcp',
|
||||
|
||||
@@ -51,9 +51,9 @@ export class ImportController {
|
||||
@AuthUser() user: User,
|
||||
@AuthWorkspace() workspace: Workspace,
|
||||
) {
|
||||
const validFileExtensions = ['.md', '.html', '.docx'];
|
||||
const validFileExtensions = ['.md', '.html', '.docx', '.pdf'];
|
||||
|
||||
const maxFileSize = bytes('20mb');
|
||||
const maxFileSize = bytes('30mb');
|
||||
|
||||
let file = null;
|
||||
try {
|
||||
@@ -102,6 +102,7 @@ export class ImportController {
|
||||
'.md': 'markdown',
|
||||
'.html': 'html',
|
||||
'.docx': 'docx',
|
||||
'.pdf': 'pdf',
|
||||
};
|
||||
|
||||
if (createdPage) {
|
||||
|
||||
@@ -63,7 +63,10 @@ export class ImportService {
|
||||
let createdPage = null;
|
||||
|
||||
// For DOCX, we need the page ID upfront so images can reference it
|
||||
const pageId = fileExtension === '.docx' ? uuid7() : undefined;
|
||||
const pageId =
|
||||
fileExtension === '.docx' || fileExtension === '.pdf'
|
||||
? uuid7()
|
||||
: undefined;
|
||||
|
||||
try {
|
||||
if (fileExtension.endsWith('.md')) {
|
||||
@@ -78,6 +81,14 @@ export class ImportService {
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
} else if (fileExtension.endsWith('.pdf')) {
|
||||
prosemirrorState = await this.processPdf(
|
||||
fileBuffer,
|
||||
workspaceId,
|
||||
spaceId,
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
const message = 'Error processing file content';
|
||||
@@ -156,7 +167,7 @@ export class ImportService {
|
||||
let DocxImportModule: any;
|
||||
try {
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
DocxImportModule = require('./../../../ee/docx-import/docx-import.service');
|
||||
DocxImportModule = require('./../../../ee/document-import/docx-import.service');
|
||||
} catch (err) {
|
||||
this.logger.error(
|
||||
'DOCX import requested but EE module not bundled in this build',
|
||||
@@ -182,6 +193,42 @@ export class ImportService {
|
||||
return this.processHTML(html);
|
||||
}
|
||||
|
||||
/**
 * Converts an uploaded PDF by delegating to the optional enterprise (EE)
 * PDF import service, then feeds the resulting HTML through
 * `this.processHTML`.
 *
 * Steps, in order:
 *  1. `require` the EE module `ee/document-import/pdf-import.service`.
 *     If that throws for any reason, an error is logged and a
 *     `BadRequestException` is raised instead.
 *  2. Resolve `PdfImportService` from `this.moduleRef`.
 *  3. Await `convertPdfToHtml(...)` with all five arguments passed through
 *     unchanged.
 *  4. Return `this.processHTML(html)`.
 *
 * @param fileBuffer  Forwarded to `convertPdfToHtml` (presumably the raw
 *                    PDF bytes — confirm against the caller).
 * @param workspaceId Forwarded to `convertPdfToHtml`.
 * @param spaceId     Forwarded to `convertPdfToHtml`.
 * @param pageId      Forwarded to `convertPdfToHtml`; the caller visible in
 *                    this file generates it up front for `.pdf` uploads.
 * @param userId      Forwarded to `convertPdfToHtml`.
 * @returns The result of `this.processHTML` for the converted HTML; the
 *          caller visible in this file stores it as `prosemirrorState`.
 * @throws BadRequestException when the EE module cannot be required.
 */
async processPdf(
|
||||
fileBuffer: Buffer,
|
||||
workspaceId: string,
|
||||
spaceId: string,
|
||||
pageId: string,
|
||||
userId: string,
|
||||
): Promise<any> {
|
||||
let PdfImportModule: any;
|
||||
// The EE module is required at call time so that a failing require can be
// caught here and reported as a missing-license condition.
// NOTE(review): the caught `err` is not included in the log output, so a
// module that exists but fails to load is indistinguishable from one that
// is absent — confirm this is intended.
try {
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
PdfImportModule = require('./../../../ee/document-import/pdf-import.service');
|
||||
} catch (err) {
|
||||
this.logger.error(
|
||||
'PDF import requested but EE module not bundled in this build',
|
||||
);
|
||||
throw new BadRequestException(
|
||||
'This feature requires a valid enterprise license.',
|
||||
);
|
||||
}
|
||||
|
||||
// `strict: false` is passed to the lookup; per the NestJS ModuleRef docs
// this resolves the provider from the global application context rather
// than only the current module.
const pdfImportService = this.moduleRef.get(
|
||||
PdfImportModule.PdfImportService,
|
||||
{ strict: false },
|
||||
);
|
||||
|
||||
// The converter receives the same arguments this method was given, in the
// same order.
const html = await pdfImportService.convertPdfToHtml(
|
||||
fileBuffer,
|
||||
workspaceId,
|
||||
spaceId,
|
||||
pageId,
|
||||
userId,
|
||||
);
|
||||
|
||||
// The converted HTML goes through the same `processHTML` step that the
// preceding method in this file ends with.
return this.processHTML(html);
|
||||
}
|
||||
|
||||
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
|
||||
if (prosemirrorJson) {
|
||||
// this.logger.debug(`Converting prosemirror json state to ydoc`);
|
||||
|
||||
Reference in New Issue
Block a user