From 68309247b5362871b7b0ae7b9730810f8ac7ce69 Mon Sep 17 00:00:00 2001
From: Philipinho <16838612+Philipinho@users.noreply.github.com>
Date: Wed, 22 Apr 2026 01:36:14 +0100
Subject: [PATCH] share html normalization between zip and single-file imports

---
 .../import/services/import.service.ts | 6 +++++-
 .../import/utils/import-formatter.ts | 21 ++++++++++++++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/apps/server/src/integrations/import/services/import.service.ts b/apps/server/src/integrations/import/services/import.service.ts
index 231a6c89a..2b02c16b6 100644
--- a/apps/server/src/integrations/import/services/import.service.ts
+++ b/apps/server/src/integrations/import/services/import.service.ts
@@ -30,6 +30,8 @@ import { InjectQueue } from '@nestjs/bullmq';
 import { Queue } from 'bullmq';
 import { QueueJob, QueueName } from '../../queue/constants';
 import { ModuleRef } from '@nestjs/core';
+import { load } from 'cheerio';
+import { normalizeImportHtml } from '../utils/import-formatter';
 
 @Injectable()
 export class ImportService {
@@ -137,7 +139,9 @@ export class ImportService {
 
   async processHTML(htmlInput: string): Promise {
     try {
-      return htmlToJson(htmlInput);
+      const $ = load(htmlInput);
+      normalizeImportHtml($, $.root());
+      return htmlToJson($.html() || '');
     } catch (err) {
       throw err;
     }
diff --git a/apps/server/src/integrations/import/utils/import-formatter.ts b/apps/server/src/integrations/import/utils/import-formatter.ts
index 12e617c3c..0333b75e7 100644
--- a/apps/server/src/integrations/import/utils/import-formatter.ts
+++ b/apps/server/src/integrations/import/utils/import-formatter.ts
@@ -52,9 +52,7 @@ export async function formatImportHtml(opts: {
     }
   }
 
-  notionFormatter($, $root);
-  xwikiFormatter($, $root);
-  defaultHtmlFormatter($, $root);
+  normalizeImportHtml($, $root);
 
   const backlinks = await rewriteInternalLinksToMentionHtml(
     $,
@@ -74,6 +72,23 @@ export async function formatImportHtml(opts: {
   };
 }
 
+/**
+ * Contextless HTML cleanup shared by every import path.
+ * - notionFormatter: no-op on non-Notion HTML (class-selector-based).
+ * - xwikiFormatter: no-op on non-XWiki HTML (looks for #xwikicontent).
+ * - defaultHtmlFormatter: table column widths + provider auto-embeds.
+ *
+ * Does NOT run rewriteInternalLinksToMentionHtml — that requires zip context.
+ */
+export function normalizeImportHtml(
+  $: CheerioAPI,
+  $root: Cheerio,
+): void {
+  notionFormatter($, $root);
+  xwikiFormatter($, $root);
+  defaultHtmlFormatter($, $root);
+}
+
 export function xwikiFormatter($: CheerioAPI, $root: Cheerio) {
   const $content = $root.find('#xwikicontent');
   if ($content.length) {