feat: properly preserve table width (#2143)

This commit is contained in:
Philip Okugbe
2026-05-01 00:49:31 +01:00
committed by GitHub
parent 9943e104a5
commit 09c69d7a0f
4 changed files with 225 additions and 5 deletions
@@ -80,10 +80,12 @@ export const MarkdownClipboard = Extension.create({
const { from, to } = view.state.selection;
const parsed = markdownToHtml(text.replace(/\n+$/, ""));
const body = elementFromString(parsed);
normalizeTableColumnWidths(body);
const contentNodes = DOMParser.fromSchema(
this.editor.schema,
).parseSlice(elementFromString(parsed), {
).parseSlice(body, {
preserveWhitespace: true,
});
@@ -137,3 +139,92 @@ function elementFromString(value) {
return new window.DOMParser().parseFromString(wrappedValue, "text/html").body;
}
const DEFAULT_PASTE_COL_WIDTH_PX = 150;
function parsePixelWidth(el: Element): number | null {
const attr = el.getAttribute("width");
if (attr) {
const n = parseInt(attr, 10);
if (Number.isFinite(n) && n > 0) return n;
}
const style = el.getAttribute("style") || "";
const m = style.match(/(?:^|;)\s*width\s*:\s*([\d.]+)\s*px/i);
if (m) {
const n = parseInt(m[1], 10);
if (Number.isFinite(n) && n > 0) return n;
}
return null;
}
function getFirstRow(table: Element): Element | null {
const tbodyRow = table.querySelector(":scope > tbody > tr");
if (tbodyRow) return tbodyRow;
const theadRow = table.querySelector(":scope > thead > tr");
if (theadRow) return theadRow;
return table.querySelector(":scope > tr");
}
function deriveColumnWidths(table: Element): (number | null)[] | null {
const cols = table.querySelectorAll(":scope > colgroup > col");
if (cols.length > 0) {
const widths: (number | null)[] = [];
cols.forEach((col) => widths.push(parsePixelWidth(col)));
if (widths.some((w) => w !== null)) return widths;
}
const firstRow = getFirstRow(table);
if (!firstRow) return null;
const widths: (number | null)[] = [];
Array.from(firstRow.children)
.filter((c) => c.tagName === "TD" || c.tagName === "TH")
.forEach((cell) => {
const colspan = parseInt(cell.getAttribute("colspan") || "1", 10) || 1;
const w = parsePixelWidth(cell);
for (let i = 0; i < colspan; i++) {
widths.push(w !== null ? Math.round(w / colspan) : null);
}
});
if (widths.length === 0 || widths.every((w) => w === null)) return null;
return widths;
}
// Mirror of server normalizeTableColumnWidths (see import/utils/table-utils.ts):
// markdown source has no widths, so without this every pasted table renders
// at table-layout:fixed/100% and squashes columns to fit the editor instead of
// letting .tableWrapper's overflow-x: auto scroll.
export function normalizeTableColumnWidths(root: Element): void {
root.querySelectorAll("table").forEach((table) => {
const firstRow = getFirstRow(table);
if (!firstRow) return;
let colWidths = deriveColumnWidths(table);
if (!colWidths) {
let count = 0;
Array.from(firstRow.children)
.filter((c) => c.tagName === "TD" || c.tagName === "TH")
.forEach((cell) => {
count += parseInt(cell.getAttribute("colspan") || "1", 10) || 1;
});
if (count === 0) return;
colWidths = new Array(count).fill(DEFAULT_PASTE_COL_WIDTH_PX);
}
let col = 0;
Array.from(firstRow.children)
.filter((c) => c.tagName === "TD" || c.tagName === "TH")
.forEach((cell) => {
if (cell.getAttribute("colwidth")) {
col += parseInt(cell.getAttribute("colspan") || "1", 10) || 1;
return;
}
const colspan = parseInt(cell.getAttribute("colspan") || "1", 10) || 1;
const slice = colWidths!.slice(col, col + colspan);
col += colspan;
if (slice.length === 0 || slice.every((w) => w === null)) return;
const values = slice.map((w) => (w == null ? 100 : w));
cell.setAttribute("colwidth", values.join(","));
});
});
}
@@ -29,6 +29,8 @@ import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../../queue/constants';
import { ModuleRef } from '@nestjs/core';
import { load } from 'cheerio';
import { normalizeImportHtml } from '../utils/import-formatter';
@Injectable()
export class ImportService {
@@ -136,7 +138,9 @@ export class ImportService {
async processHTML(htmlInput: string): Promise<any> {
try {
return htmlToJson(htmlInput);
const $ = load(htmlInput);
normalizeImportHtml($, $.root());
return htmlToJson($.html() || '');
} catch (err) {
throw err;
}
@@ -5,6 +5,7 @@ import { v7 } from 'uuid';
import { InsertableBacklink } from '@docmost/db/types/entity.types';
import { Cheerio, CheerioAPI, load } from 'cheerio';
import slugify from '@sindresorhus/slugify';
import { normalizeTableColumnWidths } from './table-utils';
// Check if text contains Unicode characters (for emojis/icons)
function isUnicodeCharacter(text: string): boolean {
@@ -51,9 +52,7 @@ export async function formatImportHtml(opts: {
}
}
notionFormatter($, $root);
xwikiFormatter($, $root);
defaultHtmlFormatter($, $root);
normalizeImportHtml($, $root);
const backlinks = await rewriteInternalLinksToMentionHtml(
$,
@@ -73,6 +72,23 @@ export async function formatImportHtml(opts: {
};
}
/**
* Contextless HTML cleanup shared by every import path.
* - notionFormatter: no-op on non-Notion HTML (class-selector-based).
* - xwikiFormatter: no-op on non-XWiki HTML (looks for #xwikicontent).
* - defaultHtmlFormatter: table column widths + provider auto-embeds.
*
* Does NOT run rewriteInternalLinksToMentionHtml — that requires zip context.
*/
export function normalizeImportHtml(
$: CheerioAPI,
$root: Cheerio<any>,
): void {
notionFormatter($, $root);
xwikiFormatter($, $root);
defaultHtmlFormatter($, $root);
}
export function xwikiFormatter($: CheerioAPI, $root: Cheerio<any>) {
const $content = $root.find('#xwikicontent');
if ($content.length) {
@@ -82,6 +98,8 @@ export function xwikiFormatter($: CheerioAPI, $root: Cheerio<any>) {
}
export function defaultHtmlFormatter($: CheerioAPI, $root: Cheerio<any>) {
normalizeTableColumnWidths($, $root);
$root.find('a[href]').each((_, el) => {
const $el = $(el);
const url = $el.attr('href')!;
@@ -0,0 +1,107 @@
import { CheerioAPI, Cheerio } from 'cheerio';
const DEFAULT_IMPORT_COL_WIDTH_PX = 150;
/**
* Extracts a pixel-integer width from either the `width` attribute or
* `style="width: Npx"` on a <col>/<td>/<th>. Returns null when absent,
* non-numeric, or a non-px unit (em, %).
*/
function parsePixelWidth(el: Cheerio<any>): number | null {
const attr = el.attr('width');
if (attr) {
const n = parseInt(attr, 10);
if (Number.isFinite(n) && n > 0) return n;
}
const style = el.attr('style') || '';
const m = style.match(/(?:^|;)\s*width\s*:\s*([\d.]+)\s*px/i);
if (m) {
const n = parseInt(m[1], 10);
if (Number.isFinite(n) && n > 0) return n;
}
return null;
}
/**
* Derives per-column widths for a table, in visual column order.
* Priority: <colgroup><col> → first-row cells' own width style.
* Returns an array of length = number of columns, with null entries
* for columns whose width couldn't be determined.
*/
function deriveColumnWidths(
$: CheerioAPI,
table: Cheerio<any>,
): (number | null)[] | null {
const cols = table.find('> colgroup > col');
if (cols.length > 0) {
const widths: (number | null)[] = [];
cols.each(function () {
widths.push(parsePixelWidth($(this)));
});
if (widths.some((w) => w !== null)) return widths;
}
// Fallback: first row's cells.
const firstRow = table.find('> tbody > tr, > thead > tr, > tr').first();
if (!firstRow.length) return null;
const widths: (number | null)[] = [];
firstRow.children('td, th').each(function () {
const cell = $(this);
const colspan = parseInt(cell.attr('colspan') || '1', 10) || 1;
const w = parsePixelWidth(cell);
for (let i = 0; i < colspan; i++) {
widths.push(w !== null ? Math.round(w / colspan) : null);
}
});
if (widths.every((w) => w === null)) return null;
return widths;
}
/**
* Apply colwidth attributes to the first row of each table based on
* derived column widths. Accounts for colspan. Idempotent — re-running
* on already-normalized markup is a no-op.
*
* This lives upstream of tiptap's generateJSON: tiptap reads
* `colwidth="N[,N...]"` on <td>/<th> to build the runtime <colgroup>.
*/
export function normalizeTableColumnWidths(
$: CheerioAPI,
$root: Cheerio<any>,
): void {
$root.find('table').each(function () {
const table = $(this);
const firstRow = table.find('> tbody > tr, > thead > tr, > tr').first();
if (!firstRow.length) return;
let colWidths = deriveColumnWidths($, table);
if (!colWidths) {
// No widths anywhere (e.g. markdown-sourced tables). Apply a default
// per-column width so the table's intrinsic width can exceed the
// editor container, letting .tableWrapper's overflow-x: auto scroll
// instead of cramming columns into the available width.
let count = 0;
firstRow.children('td, th').each(function () {
count += parseInt($(this).attr('colspan') || '1', 10) || 1;
});
if (count === 0) return;
colWidths = new Array(count).fill(DEFAULT_IMPORT_COL_WIDTH_PX);
}
let col = 0;
firstRow.children('td, th').each(function () {
const cell = $(this);
if (cell.attr('colwidth')) {
col += parseInt(cell.attr('colspan') || '1', 10) || 1;
return;
}
const colspan = parseInt(cell.attr('colspan') || '1', 10) || 1;
const slice = colWidths.slice(col, col + colspan);
col += colspan;
if (slice.length === 0 || slice.every((w) => w === null)) return;
const values = slice.map((w) => (w == null ? 100 : w));
cell.attr('colwidth', values.join(','));
});
});
}