From f99d8c280829cc0c0e2675bb566195f2900f904c Mon Sep 17 00:00:00 2001 From: Philipinho <16838612+Philipinho@users.noreply.github.com> Date: Sun, 15 Mar 2026 21:49:06 +0000 Subject: [PATCH] fix notion importer --- .../services/file-import-task.service.ts | 74 +++++++++++++++---- .../import/utils/import-formatter.ts | 36 +++++++-- .../integrations/import/utils/import.utils.ts | 20 ++++- 3 files changed, 109 insertions(+), 21 deletions(-) diff --git a/apps/server/src/integrations/import/services/file-import-task.service.ts b/apps/server/src/integrations/import/services/file-import-task.service.ts index 8ff1cadca..59447b276 100644 --- a/apps/server/src/integrations/import/services/file-import-task.service.ts +++ b/apps/server/src/integrations/import/services/file-import-task.service.ts @@ -25,6 +25,7 @@ import { buildAttachmentCandidates, collectMarkdownAndHtmlFiles, encodeFilePath, + extractNotionPartialId, readDocmostMetadata, stripNotionID, } from '../utils/import.utils'; @@ -160,6 +161,7 @@ export class FileImportTaskService { fileTask: FileTask; }): Promise { const { extractDir, fileTask } = opts; + const isNotion = fileTask.source === FileImportSource.Notion; const allFiles = await collectMarkdownAndHtmlFiles(extractDir); const attachmentCandidates = await buildAttachmentCandidates(extractDir); const docmostMetadata = await readDocmostMetadata(extractDir); @@ -230,7 +232,17 @@ export class FileImportTaskService { } // For each folder with content, create a placeholder page if no corresponding .md or .html exists - foldersWithContent.forEach((folderPath) => { + // Process folders with partial UUIDs first so they claim their specific files + // before plain folders (without partial UUIDs) take whatever remains. + const sortedFolders = isNotion + ? [...foldersWithContent].sort((a, b) => { + const aHasPartial = extractNotionPartialId(path.basename(a)) ? 0 : 1; + const bHasPartial = extractNotionPartialId(path.basename(b)) ? 0 : 1; + return aHasPartial - bHasPartial; + }) + : [...foldersWithContent]; + + sortedFolders.forEach((folderPath) => { if ( skipRootFolder && folderPath?.toLowerCase() === skipRootFolder?.toLowerCase() @@ -243,18 +255,54 @@ export class FileImportTaskService { if (!pagesMap.has(mdPath) && !pagesMap.has(htmlPath)) { const folderName = path.basename(folderPath); - const encodedMdPath = encodeFilePath(mdPath); - const placeholderMetadata = docmostMetadata?.pages[encodedMdPath]; - pagesMap.set(mdPath, { - id: v7(), - slugId: generateSlugId(), - name: stripNotionID(folderName), - content: '', - parentPageId: null, - fileExtension: '.md', - filePath: mdPath, - icon: placeholderMetadata?.icon ?? null, - }); + const parentDir = path.dirname(folderPath); + + // Notion no longer adds UUIDs to folder names, but still adds them to files. + // For duplicate names, Notion adds a partial UUID "{first4}-{last4}" to the folder. + let matched = false; + if (isNotion) { + const partialId = extractNotionPartialId(folderName); + const strippedFolderName = stripNotionID(folderName); + const isSameDir = (fileDir: string) => + fileDir === parentDir || (parentDir === '.' && !fileDir.includes('/')); + + for (const [filePath, page] of pagesMap.entries()) { + if (!isSameDir(path.dirname(filePath))) continue; + if (page.name !== strippedFolderName) continue; + + if (partialId) { + // Match partial UUID against the full UUID in the filename + const fileBase = path.basename(filePath, path.extname(filePath)); + const fullIdMatch = fileBase.match(/[a-f0-9]{32}$/i); + if (!fullIdMatch) continue; + const fullId = fullIdMatch[0].toLowerCase(); + if (!fullId.startsWith(partialId.prefix) || !fullId.endsWith(partialId.suffix)) { + continue; + } + } + + pagesMap.delete(filePath); + page.filePath = mdPath; + pagesMap.set(mdPath, page); + matched = true; + break; + } + } + + if (!matched) { + const encodedMdPath = encodeFilePath(mdPath); + const placeholderMetadata = docmostMetadata?.pages[encodedMdPath]; + pagesMap.set(mdPath, { + id: v7(), + slugId: generateSlugId(), + name: stripNotionID(folderName), + content: '', + parentPageId: null, + fileExtension: '.md', + filePath: mdPath, + icon: placeholderMetadata?.icon ?? null, + }); + } } }); diff --git a/apps/server/src/integrations/import/utils/import-formatter.ts b/apps/server/src/integrations/import/utils/import-formatter.ts index 2d4bca7b4..c46b3c025 100644 --- a/apps/server/src/integrations/import/utils/import-formatter.ts +++ b/apps/server/src/integrations/import/utils/import-formatter.ts @@ -1,6 +1,7 @@ import { getEmbedUrlAndProvider } from '@docmost/editor-ext'; import { Logger } from '@nestjs/common'; import * as path from 'path'; +import { v7 } from 'uuid'; import { InsertableBacklink } from '@docmost/db/types/entity.types'; import { Cheerio, CheerioAPI, load } from 'cheerio'; // eslint-disable-next-line @typescript-eslint/no-require-imports @@ -344,14 +345,35 @@ export async function rewriteInternalLinksToMentionHtml( const meta = filePathToPageMetaMap.get(resolved); if (!meta) return; - const titleSlug = slugify(meta.title?.substring(0, 70) || 'untitled'); - const pageSlug = `${titleSlug}-${meta.slugId}`; - const internalHref = spaceSlug - ? `/s/${spaceSlug}/p/${pageSlug}` - : `/p/${pageSlug}`; + const linkText = $a.text().trim(); + const titleMatch = + linkText === meta.title || + linkText === meta.title?.trim(); - $a.attr('href', internalHref); - $a.attr('data-internal', 'true'); + if (titleMatch) { + const mentionId = v7(); + const $mention = $('') + .attr({ + 'data-type': 'mention', + 'data-id': mentionId, + 'data-entity-type': 'page', + 'data-entity-id': meta.id, + 'data-label': meta.title, + 'data-slug-id': meta.slugId, + 'data-creator-id': creatorId, + }) + .text(meta.title); + $a.replaceWith($mention); + } else { + const titleSlug = slugify(meta.title?.substring(0, 70) || 'untitled'); + const pageSlug = `${titleSlug}-${meta.slugId}`; + const internalHref = spaceSlug + ? `/s/${spaceSlug}/p/${pageSlug}` + : `/p/${pageSlug}`; + + $a.attr('href', internalHref); + $a.attr('data-internal', 'true'); + } backlinks.push({ sourcePageId, targetPageId: meta.id, workspaceId }); }); diff --git a/apps/server/src/integrations/import/utils/import.utils.ts b/apps/server/src/integrations/import/utils/import.utils.ts index cd3486526..cebe89ea6 100644 --- a/apps/server/src/integrations/import/utils/import.utils.ts +++ b/apps/server/src/integrations/import/utils/import.utils.ts @@ -81,7 +81,25 @@ export async function collectMarkdownAndHtmlFiles( export function stripNotionID(fileName: string): string { // Handle optional separator (space or dash) + 32 alphanumeric chars at end const notionIdPattern = /[ -]?[a-z0-9]{32}$/i; - return fileName.replace(notionIdPattern, '').trim(); + // Handle partial UUID format used for duplicate names: "Name abcd-ef12" + const partialIdPattern = / [a-f0-9]{4}-[a-f0-9]{4}$/i; + return fileName + .replace(notionIdPattern, '') + .replace(partialIdPattern, '') + .trim(); +} + +/** + * Extract a partial Notion UUID suffix from a folder name. + * Notion adds "{first4}-{last4}" when multiple pages share the same title. + * e.g. "Cool 324d-35ab" → { prefix: "324d", suffix: "35ab" } + */ +export function extractNotionPartialId( + folderName: string, +): { prefix: string; suffix: string } | null { + const match = folderName.match(/ ([a-f0-9]{4})-([a-f0-9]{4})$/i); + if (!match) return null; + return { prefix: match[1].toLowerCase(), suffix: match[2].toLowerCase() }; } export function encodeFilePath(filePath: string): string {