From 3e66aff1e25ba64bcd6045d0c239476d37453678 Mon Sep 17 00:00:00 2001 From: Philipinho <16838612+Philipinho@users.noreply.github.com> Date: Sat, 16 May 2026 01:21:14 +0100 Subject: [PATCH] feat(import): extract Confluence indent helper with auto-detect unit and tests --- .../import/utils/confluence-indent.spec.ts | 149 ++++++++++++++++++ .../import/utils/confluence-indent.ts | 76 +++++++++ .../import/utils/import-formatter.ts | 76 +-------- 3 files changed, 227 insertions(+), 74 deletions(-) create mode 100644 apps/server/src/integrations/import/utils/confluence-indent.spec.ts create mode 100644 apps/server/src/integrations/import/utils/confluence-indent.ts diff --git a/apps/server/src/integrations/import/utils/confluence-indent.spec.ts b/apps/server/src/integrations/import/utils/confluence-indent.spec.ts new file mode 100644 index 000000000..c420dbba3 --- /dev/null +++ b/apps/server/src/integrations/import/utils/confluence-indent.spec.ts @@ -0,0 +1,149 @@ +import { load } from 'cheerio'; +import { applyConfluenceMarginLeftIndent } from './confluence-indent'; + +function run(html: string): string { + const $ = load(html); + applyConfluenceMarginLeftIndent($, $.root()); + // cheerio's html() includes ; return the body's inner HTML so + // tests can assert on the meaningful portion. + return $('body').html() ?? $.html(); +} + +describe('applyConfluenceMarginLeftIndent', () => { + describe('Confluence Cloud (30 px per level, max 6)', () => { + it('maps 30/60/90/120/150/180 px to data-indent 1..6', () => { + const html = + '

L1

' + + '

L2

' + + '

L3

' + + '

L4

' + + '

L5

' + + '

L6

'; + const out = run(html); + expect(out).toContain('

L1

'); + expect(out).toContain('

L2

'); + expect(out).toContain('

L3

'); + expect(out).toContain('

L4

'); + expect(out).toContain('

L5

'); + expect(out).toContain('

L6

'); + expect(out).not.toContain('margin-left'); + }); + }); + + describe('Confluence Data Center (40 px per level, no upper bound)', () => { + it('maps 40/80/120/160/200/240 px to data-indent 1..6', () => { + const html = + '

one

' + + '

two

' + + '

three

' + + '

four

' + + '

five

' + + '

six

'; + const out = run(html); + expect(out).toContain('

one

'); + expect(out).toContain('

two

'); + expect(out).toContain('

three

'); + expect(out).toContain('

four

'); + expect(out).toContain('

five

'); + expect(out).toContain('

six

'); + expect(out).not.toContain('margin-left'); + }); + + it('clamps DC levels above 8 down to 8', () => { + const html = + '

L8

' + + '

L9

' + + '

L15

'; + const out = run(html); + expect(out).toContain('

L8

'); + expect(out).toContain('

L9

'); + expect(out).toContain('

L15

'); + }); + }); + + describe('headings', () => { + it('handles indent on h1-h6 the same way as paragraphs', () => { + const html = + '

a

' + + '
b
'; + const out = run(html); + expect(out).toContain('

a

'); + expect(out).toContain('
b
'); + }); + }); + + describe('style attribute handling', () => { + it('strips margin-left but preserves other inline styles', () => { + const html = + '

x

'; + const out = run(html); + expect(out).toMatch(/

x<\/p>/); + expect(out).not.toContain('margin-left'); + }); + + it('removes the style attribute entirely when only margin-left was set', () => { + // Two values so GCD detection sees a real unit (60 px) instead of + // collapsing to the lone value. The point of this test is the style + // attribute being stripped, not the level number. + const html = + '

x

' + + '

y

'; + const out = run(html); + expect(out).toContain('

x

'); + expect(out).toContain('

y

'); + expect(out).not.toContain('style='); + }); + }); + + describe('scope and edge cases', () => { + it('leaves elements without margin-left untouched', () => { + const html = '

plain

heading

'; + const out = run(html); + expect(out).toBe('

plain

heading

'); + }); + + it('does not touch divs, spans, or list items', () => { + const html = + '
div
' + + '
  • li
  • ' + + 'span'; + const out = run(html); + expect(out).not.toContain('data-indent'); + expect(out).toContain('margin-left: 30px'); + }); + + it('ignores zero, negative, and unparseable margin-left values', () => { + const html = + '

    zero

    ' + + '

    neg

    ' + + '

    auto

    '; + const out = run(html); + expect(out).not.toContain('data-indent'); + }); + + it('honors an explicit pxPerLevel override', () => { + // Mixed Cloud-and-DC nominal values forced to 40 px/level interpretation. + const $ = load( + '

    a

    ' + + '

    b

    ', + ); + applyConfluenceMarginLeftIndent($, $.root(), { pxPerLevel: 40 }); + const out = $('body').html() ?? ''; + expect(out).toContain('

    a

    '); + expect(out).toContain('

    b

    '); + }); + + it('returns a no-op when no indented elements are present', () => { + const html = '

    hi

    '; + const out = run(html); + expect(out).toBe('

    hi

    '); + }); + + it('handles a single ambiguous value by clamping to level 1', () => { + // GCD of a single value is the value itself, so 120 / 120 = 1. + const html = '

    only

    '; + const out = run(html); + expect(out).toContain('

    only

    '); + }); + }); +}); diff --git a/apps/server/src/integrations/import/utils/confluence-indent.ts b/apps/server/src/integrations/import/utils/confluence-indent.ts new file mode 100644 index 000000000..3bdc3aa2d --- /dev/null +++ b/apps/server/src/integrations/import/utils/confluence-indent.ts @@ -0,0 +1,76 @@ +import { Cheerio, CheerioAPI } from 'cheerio'; + +// Maximum indent level supported by the Indent editor extension (see +// packages/editor-ext/src/lib/indent.ts). Values above this clamp down. +const MAX_INDENT_LEVEL = 8; +const MARGIN_LEFT_RE = /margin-left\s*:\s*(-?\d*\.?\d+)\s*px/i; +const MARGIN_LEFT_STRIP_RE = /margin-left\s*:\s*-?\d*\.?\d+\s*px\s*;?/i; + +/** + * Confluence encodes paragraph indent as inline `style="margin-left: Npx"`. + * The per-level pixel value differs by edition: Cloud uses 30 (max 6 levels), + * Data Center uses 40 (no upper limit). The HTML-export ZIP path has no + * edition information available, so we auto-detect the per-level unit from + * the GCD of all margin-left values in the document. The API converter can + * pass `pxPerLevel` explicitly when the edition is known. + * + * Levels are written to `data-indent` for the TipTap Indent extension to + * pick up; the margin-left style is stripped from the element so the + * normalized indent doesn't double up with the editor's own indent padding. + */ +export function applyConfluenceMarginLeftIndent( + $: CheerioAPI, + $root: Cheerio, + options?: { pxPerLevel?: number }, +): void { + const $els = $root.find('p, h1, h2, h3, h4, h5, h6'); + + const values: number[] = []; + $els.each((_, el) => { + const style = $(el).attr('style'); + if (!style) return; + const match = MARGIN_LEFT_RE.exec(style); + if (!match) return; + const px = parseFloat(match[1]); + if (Number.isFinite(px) && px > 0) values.push(px); + }); + if (values.length === 0) return; + + const unit = options?.pxPerLevel ?? detectIndentUnit(values); + if (!unit || unit <= 0) return; + + $els.each((_, el) => { + const $el = $(el); + const style = $el.attr('style'); + if (!style) return; + const match = MARGIN_LEFT_RE.exec(style); + if (!match) return; + const px = parseFloat(match[1]); + if (!Number.isFinite(px) || px <= 0) return; + const level = Math.min( + MAX_INDENT_LEVEL, + Math.max(1, Math.round(px / unit)), + ); + $el.attr('data-indent', String(level)); + const remaining = style.replace(MARGIN_LEFT_STRIP_RE, '').trim(); + if (remaining) { + $el.attr('style', remaining); + } else { + $el.removeAttr('style'); + } + }); +} + +function detectIndentUnit(values: number[]): number { + // Confluence emits floats like "30.0"; round to ints for a clean GCD. + const ints = values.map((v) => Math.round(v)).filter((v) => v > 0); + if (ints.length === 0) return 0; + return ints.reduce((a, b) => gcd(a, b)); +} + +function gcd(a: number, b: number): number { + while (b !== 0) { + [a, b] = [b, a % b]; + } + return a; +} diff --git a/apps/server/src/integrations/import/utils/import-formatter.ts b/apps/server/src/integrations/import/utils/import-formatter.ts index 87cfd4cd0..7b8f6892e 100644 --- a/apps/server/src/integrations/import/utils/import-formatter.ts +++ b/apps/server/src/integrations/import/utils/import-formatter.ts @@ -97,80 +97,8 @@ export function xwikiFormatter($: CheerioAPI, $root: Cheerio) { } } -// Maximum indent level supported by the Indent editor extension (see -// packages/editor-ext/src/lib/indent.ts). Values above this clamp down. -const MAX_INDENT_LEVEL = 8; -const MARGIN_LEFT_RE = /margin-left\s*:\s*(-?\d*\.?\d+)\s*px/i; -const MARGIN_LEFT_STRIP_RE = /margin-left\s*:\s*-?\d*\.?\d+\s*px\s*;?/i; - -/** - * Confluence encodes paragraph indent as inline `style="margin-left: Npx"`. - * The per-level pixel value differs by edition: Cloud uses 30 (max 6 levels), - * Data Center uses 40 (no upper limit). The HTML-export ZIP path has no - * edition information available, so we auto-detect the per-level unit from - * the GCD of all margin-left values in the document. The API converter can - * pass `pxPerLevel` explicitly when the edition is known. - * - * Levels are written to `data-indent` for the TipTap Indent extension to - * pick up; the margin-left style is stripped from the element so the - * normalized indent doesn't double up with the editor's own indent padding. - */ -export function applyConfluenceMarginLeftIndent( - $: CheerioAPI, - $root: Cheerio, - options?: { pxPerLevel?: number }, -): void { - const $els = $root.find('p, h1, h2, h3, h4, h5, h6'); - - const values: number[] = []; - $els.each((_, el) => { - const style = $(el).attr('style'); - if (!style) return; - const match = MARGIN_LEFT_RE.exec(style); - if (!match) return; - const px = parseFloat(match[1]); - if (Number.isFinite(px) && px > 0) values.push(px); - }); - if (values.length === 0) return; - - const unit = options?.pxPerLevel ?? detectIndentUnit(values); - if (!unit || unit <= 0) return; - - $els.each((_, el) => { - const $el = $(el); - const style = $el.attr('style'); - if (!style) return; - const match = MARGIN_LEFT_RE.exec(style); - if (!match) return; - const px = parseFloat(match[1]); - if (!Number.isFinite(px) || px <= 0) return; - const level = Math.min( - MAX_INDENT_LEVEL, - Math.max(1, Math.round(px / unit)), - ); - $el.attr('data-indent', String(level)); - const remaining = style.replace(MARGIN_LEFT_STRIP_RE, '').trim(); - if (remaining) { - $el.attr('style', remaining); - } else { - $el.removeAttr('style'); - } - }); -} - -function detectIndentUnit(values: number[]): number { - // Confluence emits floats like "30.0"; round to ints for a clean GCD. - const ints = values.map((v) => Math.round(v)).filter((v) => v > 0); - if (ints.length === 0) return 0; - return ints.reduce((a, b) => gcd(a, b)); -} - -function gcd(a: number, b: number): number { - while (b !== 0) { - [a, b] = [b, a % b]; - } - return a; -} +import { applyConfluenceMarginLeftIndent } from './confluence-indent'; +export { applyConfluenceMarginLeftIndent }; export function defaultHtmlFormatter($: CheerioAPI, $root: Cheerio) { normalizeTableColumnWidths($, $root);