feat: bulk page imports (#1219)

* refactor imports - WIP

* Add readstream

* WIP

* fix attachmentId render

* fix attachmentId render

* turndown video tag

* feat: add stream upload support and improve file handling

- Add stream upload functionality to storage drivers
- Improve ZIP file extraction with better encoding handling
- Fix attachment ID rendering issues
- Add AWS S3 upload stream support
- Update dependencies for better compatibility

* WIP

* notion formatter

* move embed parser to editor-ext package

* import embeds

* utility files

* cleanup

* Switch from happy-dom to cheerio
* Refine code

* WIP

* bug fixes and UI

* sync

* WIP

* sync

* keep import modal mounted

* Show modal during upload

* WIP

* WIP
This commit is contained in:
Philip Okugbe
2025-06-09 04:29:27 +01:00
committed by GitHub
parent ce1503af85
commit 6d024fc3de
45 changed files with 2362 additions and 149 deletions
@@ -0,0 +1,187 @@
import * as yauzl from 'yauzl';
import * as path from 'path';
import * as fs from 'node:fs';
/**
 * Kind of file task. Each member's value is the plain string used at runtime.
 */
export enum FileTaskType {
// Task that brings content in
Import = 'import',
// Task that writes content out
Export = 'export',
}
/**
 * Identifies where an imported file comes from. Each member's value is the
 * plain string used at runtime.
 */
export enum FileImportSource {
Generic = 'generic',
Notion = 'notion',
Confluence = 'confluence',
}
/**
 * Lifecycle state of a file task. Each member's value is the plain string
 * used at runtime.
 */
export enum FileTaskStatus {
Processing = 'processing',
Success = 'success',
Failed = 'failed',
}
/**
 * Builds the workspace-scoped folder path for a file task.
 *
 * @param type Kind of task the folder is for
 * @param workspaceId Workspace identifier used as the first path segment
 * @returns `<workspaceId>/imports` for imports, `<workspaceId>/exports` for exports
 */
export function getFileTaskFolderPath(
  type: FileTaskType,
  workspaceId: string,
): string {
  switch (type) {
    case FileTaskType.Import: {
      const importsFolder = `${workspaceId}/imports`;
      return importsFolder;
    }
    case FileTaskType.Export: {
      const exportsFolder = `${workspaceId}/exports`;
      return exportsFolder;
    }
  }
}
/**
 * Extracts a ZIP archive into a directory.
 *
 * An archive whose only entry is itself a `.zip` file is unwrapped one level
 * (the inner archive is what gets extracted).
 *
 * @param source Path to the ZIP file
 * @param target Directory to extract into
 */
export async function extractZip(
  source: string,
  target: string,
): Promise<void> {
  const unwrapSingleNestedZip = true;
  await extractZipInternal(source, target, unwrapSingleNestedZip);
}
/**
 * Reports whether `candidate` resolves to `root` itself or to a location
 * beneath it (used to stop archive entries from escaping the target dir).
 */
function isPathInsideDir(root: string, candidate: string): boolean {
  const rel = path.relative(path.resolve(root), path.resolve(candidate));
  return (
    rel !== '..' && !rel.startsWith('..' + path.sep) && !path.isAbsolute(rel)
  );
}

/**
 * Internal helper to extract a ZIP, with optional single-nested-ZIP handling.
 *
 * Entries are processed one at a time (`lazyEntries`). Entries under
 * `__MACOSX/` are skipped, entries whose path is too long for the file system
 * (`ENAMETOOLONG`) are skipped with a warning, and entries whose resolved path
 * would land outside `target` (e.g. via `..` segments) are skipped with a
 * warning instead of being written.
 *
 * @param source Path to the ZIP file
 * @param target Directory to extract into
 * @param allowNested Whether to check and unwrap one level of nested ZIP
 * @returns Promise that resolves when the archive has been fully read and
 *          rejects on the first open/read/write error that is not skipped
 */
function extractZipInternal(
  source: string,
  target: string,
  allowNested: boolean,
): Promise<void> {
  return new Promise((resolve, reject) => {
    yauzl.open(
      source,
      // decodeStrings: false hands back raw file-name buffers, which are
      // decoded as UTF-8 below. Per the yauzl README this also turns off
      // yauzl's own file-name validation, hence the containment check below.
      { lazyEntries: true, decodeStrings: false, autoClose: true },
      (err, zipfile) => {
        if (err) return reject(err);

        // Handle one level of nested ZIP if allowed
        if (allowNested && zipfile.entryCount === 1) {
          zipfile.readEntry();
          zipfile.once('entry', (entry) => {
            const name = entry.fileName.toString('utf8').replace(/^\/+/, '');
            // A name ending in ".zip" cannot be a directory entry (those end
            // in "/"), so the suffix test alone is sufficient.
            const isZip = name.toLowerCase().endsWith('.zip');

            if (isZip) {
              // temporary name to avoid overwriting file
              const nestedPath = source.endsWith('.zip')
                ? source.slice(0, -4) + '.inner.zip'
                : source + '.inner.zip';

              zipfile.openReadStream(entry, (openErr, rs) => {
                if (openErr) return reject(openErr);
                const ws = fs.createWriteStream(nestedPath);
                rs.on('error', reject);
                ws.on('error', reject);
                ws.on('finish', () => {
                  zipfile.close();
                  extractZipInternal(nestedPath, target, false)
                    .then(() => {
                      fs.unlinkSync(nestedPath);
                      resolve();
                    })
                    .catch((nestedErr) => {
                      // Do not leave the temporary inner archive behind when
                      // its extraction fails; report the original error.
                      try {
                        fs.unlinkSync(nestedPath);
                      } catch {
                        // best-effort cleanup only
                      }
                      reject(nestedErr);
                    });
                });
                rs.pipe(ws);
              });
            } else {
              // Single entry that is not a ZIP: extract the archive normally
              zipfile.close();
              extractZipInternal(source, target, false).then(resolve, reject);
            }
          });
          zipfile.once('error', reject);
          return;
        }

        // Normal extraction
        zipfile.readEntry();
        zipfile.on('entry', (entry) => {
          const name = entry.fileName.toString('utf8');
          // Strip leading slashes so absolute names extract under target
          const safe = name.replace(/^\/+/, '');

          if (safe.startsWith('__MACOSX/')) {
            zipfile.readEntry();
            return;
          }

          const fullPath = path.join(target, safe);

          // Refuse to write outside the extraction directory (zip-slip)
          if (!isPathInsideDir(target, fullPath)) {
            console.warn(
              `Skipping entry (path escapes target directory): ${name}`,
            );
            zipfile.readEntry();
            return;
          }

          // Handle directories
          if (/\/$/.test(name)) {
            try {
              fs.mkdirSync(fullPath, { recursive: true });
            } catch (mkdirErr: any) {
              if (mkdirErr.code === 'ENAMETOOLONG') {
                console.warn(`Skipping directory (path too long): ${fullPath}`);
                zipfile.readEntry();
                return;
              }
              return reject(mkdirErr);
            }
            zipfile.readEntry();
            return;
          }

          // Handle files
          try {
            fs.mkdirSync(path.dirname(fullPath), { recursive: true });
          } catch (mkdirErr: any) {
            if (mkdirErr.code === 'ENAMETOOLONG') {
              console.warn(
                `Skipping file directory creation (path too long): ${fullPath}`,
              );
              zipfile.readEntry();
              return;
            }
            return reject(mkdirErr);
          }

          zipfile.openReadStream(entry, (openErr, rs) => {
            if (openErr) return reject(openErr);

            let ws: fs.WriteStream;
            try {
              ws = fs.createWriteStream(fullPath);
            } catch (openWsErr: any) {
              if (openWsErr.code === 'ENAMETOOLONG') {
                console.warn(
                  `Skipping file write (path too long): ${fullPath}`,
                );
                zipfile.readEntry();
                return;
              }
              return reject(openWsErr);
            }

            rs.on('error', (err) => reject(err));
            ws.on('error', (err) => {
              if ((err as any).code === 'ENAMETOOLONG') {
                console.warn(
                  `Skipping file write on stream (path too long): ${fullPath}`,
                );
                zipfile.readEntry();
              } else {
                reject(err);
              }
            });
            // Only ask for the next entry once this file is fully written
            ws.on('finish', () => zipfile.readEntry());
            rs.pipe(ws);
          });
        });

        zipfile.on('end', () => resolve());
        zipfile.on('error', (err) => reject(err));
      },
    );
  });
}
/**
 * Drops the query string from a URL-like string.
 *
 * @param url Input string; everything from the first `?` onward is removed
 * @returns The part before the first `?`, or `null` when `url` is empty/falsy
 */
export function cleanUrlString(url: string): string {
  if (!url) {
    return null;
  }
  const queryStart = url.indexOf('?');
  return queryStart === -1 ? url : url.slice(0, queryStart);
}
@@ -0,0 +1,254 @@
import { getEmbedUrlAndProvider } from '@docmost/editor-ext';
import * as path from 'path';
import { v7 } from 'uuid';
import { InsertableBacklink } from '@docmost/db/types/entity.types';
import { Cheerio, CheerioAPI, load } from 'cheerio';
/**
 * Normalizes imported HTML and rewrites links between imported pages.
 *
 * The HTML is parsed with cheerio, passed through the Notion formatter and
 * the default formatter (in that order), and then relative links that point
 * to other imported files are turned into page mentions.
 *
 * `pageDir` and `attachmentCandidates` are accepted in `opts` but are not read
 * by this function.
 *
 * @returns The serialized HTML (empty string if serialization yields nothing)
 *          and one backlink record per rewritten link
 */
export async function formatImportHtml(opts: {
  html: string;
  currentFilePath: string;
  filePathToPageMetaMap: Map<
    string,
    { id: string; title: string; slugId: string }
  >;
  creatorId: string;
  sourcePageId: string;
  workspaceId: string;
  pageDir?: string;
  attachmentCandidates?: string[];
}): Promise<{ html: string; backlinks: InsertableBacklink[] }> {
  const $: CheerioAPI = load(opts.html);
  const $root: Cheerio<any> = $.root();

  // Source-specific cleanup first, then generic embed conversion
  notionFormatter($, $root);
  defaultHtmlFormatter($, $root);

  const backlinks = await rewriteInternalLinksToMentionHtml(
    $,
    $root,
    opts.currentFilePath,
    opts.filePathToPageMetaMap,
    opts.creatorId,
    opts.sourcePageId,
    opts.workspaceId,
  );

  const formattedHtml = $root.html() || '';
  return { html: formattedHtml, backlinks };
}
/**
 * Converts embeddable links and iframes into embed placeholder nodes.
 *
 * - `<a href>`: replaced by an embed node unless the provider reported by
 *   `getEmbedUrlAndProvider` is `'iframe'`, in which case the link is kept.
 * - `<iframe src>`: always replaced by an embed node.
 *
 * The embed node is built with `.attr()` rather than by interpolating the URL
 * into an HTML string, so quotes or markup inside the URL cannot break out of
 * the `data-src` attribute.
 *
 * @param $ Cheerio API bound to the document being formatted
 * @param $root Root selection to search within; modified in place
 */
export function defaultHtmlFormatter($: CheerioAPI, $root: Cheerio<any>) {
  // Single place that defines the embed placeholder markup
  const buildEmbed = (url: string, provider: string) =>
    $('<div>').attr({
      'data-type': 'embed',
      'data-src': url,
      'data-provider': provider,
      'data-align': 'center',
      'data-width': '640',
      'data-height': '480',
    });

  $root.find('a[href]').each((_, el) => {
    const $el = $(el);
    const url = $el.attr('href')!;
    const { provider } = getEmbedUrlAndProvider(url);
    // 'iframe' provider: leave the anchor as a normal link
    if (provider === 'iframe') return;
    $el.replaceWith(buildEmbed(url, provider));
  });

  $root.find('iframe[src]').each((_, el) => {
    const $el = $(el);
    const url = $el.attr('src')!;
    const { provider } = getEmbedUrlAndProvider(url);
    $el.replaceWith(buildEmbed(url, provider));
  });
}
/**
 * Rewrites Notion-export markup into the node shapes used after import.
 *
 * Steps run in a fixed order: empty page descriptions, block math, inline
 * math, callouts, to-do lists, toggle blocks, bookmarks, table of contents.
 *
 * @param $ Cheerio API bound to the document being formatted
 * @param $root Root selection to search within; modified in place
 */
export function notionFormatter($: CheerioAPI, $root: Cheerio<any>) {
  removeEmptyPageDescriptions($, $root);
  convertBlockEquations($, $root);
  convertInlineEquations($, $root);
  convertCallouts($, $root);
  convertTodoLists($, $root);
  hoistToggleDetails($, $root);
  simplifyBookmarks($, $root);
  // remove toc
  $root.find('nav.table_of_contents').remove();
}

/** Removes `p.page-description` elements whose text is blank. */
function removeEmptyPageDescriptions($: CheerioAPI, $root: Cheerio<any>) {
  for (const el of $root.find('p.page-description').get()) {
    const $description = $(el);
    if ($description.text().trim() === '') $description.remove();
  }
}

/** Replaces `figure.equation` with a `mathBlock` div holding the TeX source. */
function convertBlockEquations($: CheerioAPI, $root: Cheerio<any>) {
  for (const figure of $root.find('figure.equation').get()) {
    const $figure = $(figure);
    const $annotation = $figure.find(
      'annotation[encoding="application/x-tex"]',
    );
    const texSource = $annotation.text().trim();
    const $mathBlock = $('<div>');
    $mathBlock.attr('data-type', 'mathBlock');
    $mathBlock.attr('data-katex', 'true');
    $mathBlock.text(texSource);
    $figure.replaceWith($mathBlock);
  }
}

/** Replaces inline equation tokens with a `mathInline` span holding the TeX source. */
function convertInlineEquations($: CheerioAPI, $root: Cheerio<any>) {
  for (const token of $root.find('span.notion-text-equation-token').get()) {
    const $token = $(token);
    // Drop a <style> element sitting directly before the token
    const $precedingStyle = $token.prev('style');
    if ($precedingStyle.length) $precedingStyle.remove();
    const $annotation = $token.find(
      'annotation[encoding="application/x-tex"]',
    );
    const texSource = $annotation.text().trim();
    const $mathInline = $('<span>');
    $mathInline.attr('data-type', 'mathInline');
    $mathInline.attr('data-katex', 'true');
    $mathInline.text(texSource);
    $token.replaceWith($mathInline);
  }
}

/**
 * Replaces `figure.callout` with an info callout div that takes over the
 * children of the figure's second `<div>`. Figures are handled last-to-first.
 */
function convertCallouts($: CheerioAPI, $root: Cheerio<any>) {
  const figures = $root.find('figure.callout').get().reverse();
  for (const figure of figures) {
    const $figure = $(figure);
    const $body = $figure.find('div').eq(1);
    if (!$body.length) continue;
    const $callout = $('<div>')
      .attr('data-type', 'callout')
      .attr('data-callout-type', 'info');
    for (const child of $body.children().get()) {
      $callout.append(child);
    }
    $figure.replaceWith($callout);
  }
}

/** Rebuilds `ul.to-do-list` as a `taskList` with one `taskItem` per `<li>`. */
function convertTodoLists($: CheerioAPI, $root: Cheerio<any>) {
  for (const list of $root.find('ul.to-do-list').get()) {
    const $sourceList = $(list);
    const $taskList = $('<ul>').attr('data-type', 'taskList');

    for (const item of $sourceList.find('li').get()) {
      const $item = $(item);
      const checked = $item.find('.checkbox.checkbox-on').length > 0;
      // Only the text of the first label span is carried over
      const labelText = $item
        .find('span.to-do-children-unchecked, span.to-do-children-checked')
        .first()
        .text()
        .trim();

      const $checkbox = $('<input>').attr('type', 'checkbox');
      if (checked) $checkbox.attr('checked', '');
      const $label = $('<label>');
      $label.append($checkbox, $('<span>'));

      const $body = $('<div>').append($('<p>').text(labelText));

      const $taskItem = $('<li>')
        .attr('data-type', 'taskItem')
        .attr('data-checked', String(checked));
      $taskItem.append($label, $body);
      $taskList.append($taskItem);
    }

    $sourceList.replaceWith($taskList);
  }
}

/**
 * Moves each `<details>` inside `ul.toggle` out in front of its enclosing
 * `<li>` and `ul.toggle`, removing either container once it has no element
 * children left. Elements are handled last-to-first.
 */
function hoistToggleDetails($: CheerioAPI, $root: Cheerio<any>) {
  const detailsElements = $root.find('ul.toggle details').get().reverse();
  for (const details of detailsElements) {
    const $details = $(details);

    const $item = $details.closest('li');
    if ($item.length) {
      $item.before($details);
      if (!$item.children().length) $item.remove();
    }

    const $toggleList = $details.closest('ul.toggle');
    if ($toggleList.length) {
      $toggleList.before($details);
      if (!$toggleList.children().length) $toggleList.remove();
    }
  }
}

/**
 * Replaces each `<figure>` containing an `a.bookmark.source` with a plain
 * bookmark anchor labelled by the bookmark title (or the href when the title
 * is blank). Figures are handled last-to-first.
 */
function simplifyBookmarks($: CheerioAPI, $root: Cheerio<any>) {
  const bookmarkFigures = $root
    .find('figure')
    .filter((_, figure) => $(figure).find('a.bookmark.source').length > 0)
    .get()
    .reverse();

  for (const figure of bookmarkFigures) {
    const $figure = $(figure);
    const $source = $figure.find('a.bookmark.source').first();
    if (!$source.length) continue;
    const href = $source.attr('href')!;
    const label = $source.find('.bookmark-title').text().trim() || href;
    const $info = $('<div>').addClass('bookmark-info').text(label);
    const $bookmark = $('<a>')
      .addClass('bookmark source')
      .attr('href', href)
      .append($info);
    $figure.replaceWith($bookmark);
  }
}
/**
 * Lifts a node out of every enclosing `<p>` / `<a>` element.
 *
 * For each wrapper, nearest first: if the wrapper contains exactly one child
 * node, the wrapper is replaced by `$node`; otherwise `$node` is moved to just
 * before the wrapper. Repeats until no `<p>` / `<a>` ancestor remains.
 *
 * The search starts at the node's parent. `closest()` also tests the element
 * it is called on, so starting at `$node` would select the node as its own
 * wrapper whenever it is itself a `<p>` or `<a>`, and the loop would never
 * make progress.
 *
 * @param $ Cheerio API bound to the document (unused; kept for the signature)
 * @param $node Node to lift; moved in place within its document
 */
export function unwrapFromParagraph($: CheerioAPI, $node: Cheerio<any>) {
  // find the nearest <p> or <a> strict ancestor
  let $wrapper = $node.parent().closest('p, a');
  while ($wrapper.length) {
    if ($wrapper.contents().length === 1) {
      // the wrapper has a single child node: replace it entirely
      $wrapper.replaceWith($node);
    } else {
      // otherwise just move the node to before the wrapper
      $wrapper.before($node);
    }
    // look again for any remaining wrapper around $node
    $wrapper = $node.parent().closest('p, a');
  }
}
/**
 * Replaces links to other imported files with page-mention spans.
 *
 * Each `<a href>` whose href does not start with `http` or `/api/` is resolved
 * relative to the directory of `currentFilePath` (URL-decoded, with `\`
 * normalized to `/`) and looked up in `filePathToPageMetaMap`. On a hit the
 * anchor is replaced by a `<span data-type="mention">` and a backlink record
 * is collected; anchors without a hit are left untouched.
 *
 * An href that is not valid percent-encoding is looked up undecoded instead of
 * throwing, so one malformed link cannot abort formatting of the whole page.
 *
 * @param $ Cheerio API bound to the document being formatted
 * @param $root Root selection to search within; modified in place
 * @param currentFilePath Path of the file the HTML came from
 * @param filePathToPageMetaMap Page metadata keyed by file path
 * @param creatorId Stored on each mention as `data-creator-id`
 * @param sourcePageId Page that contains the links (backlink source)
 * @param workspaceId Workspace recorded on each backlink
 * @returns One backlink per rewritten link, in document order
 */
export async function rewriteInternalLinksToMentionHtml(
  $: CheerioAPI,
  $root: Cheerio<any>,
  currentFilePath: string,
  filePathToPageMetaMap: Map<
    string,
    { id: string; title: string; slugId: string }
  >,
  creatorId: string,
  sourcePageId: string,
  workspaceId: string,
): Promise<InsertableBacklink[]> {
  const normalize = (p: string) => p.replace(/\\/g, '/');
  // decodeURIComponent throws URIError on malformed escapes such as "100%.md"
  const safeDecode = (value: string): string => {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  };
  const currentDir = path.dirname(currentFilePath);
  const backlinks: InsertableBacklink[] = [];

  $root.find('a[href]').each((_, el) => {
    const $a = $(el);
    const raw = $a.attr('href')!;
    // external links and API URLs are never internal page links
    if (raw.startsWith('http') || raw.startsWith('/api/')) return;

    const resolved = normalize(path.join(currentDir, safeDecode(raw)));
    const meta = filePathToPageMetaMap.get(resolved);
    if (!meta) return;

    const mentionId = v7();
    const $mention = $('<span>')
      .attr({
        'data-type': 'mention',
        'data-id': mentionId,
        'data-entity-type': 'page',
        'data-entity-id': meta.id,
        'data-label': meta.title,
        'data-slug-id': meta.slugId,
        'data-creator-id': creatorId,
      })
      .text(meta.title);
    $a.replaceWith($mention);
    backlinks.push({ sourcePageId, targetPageId: meta.id, workspaceId });
  });

  return backlinks;
}
@@ -0,0 +1,66 @@
import { promises as fs } from 'fs';
import * as path from 'path';
/**
 * Indexes every non-page file under an extraction directory.
 *
 * The directory tree is walked recursively. Files with a `.md` or `.html`
 * extension (case-insensitive) are skipped; every other file is recorded.
 *
 * @param extractDir Root directory to scan
 * @returns Map from the file's path relative to `extractDir` (always using
 *          `/` separators) to its path as joined from `extractDir`
 */
export async function buildAttachmentCandidates(
  extractDir: string,
): Promise<Map<string, string>> {
  const pageExtensions = ['.md', '.html'];
  const candidates = new Map<string, string>();

  const visit = async (dir: string): Promise<void> => {
    const entries = await fs.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const absolutePath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        await visit(absolutePath);
        continue;
      }

      const extension = path.extname(entry.name).toLowerCase();
      if (pageExtensions.includes(extension)) {
        continue;
      }

      const relativeKey = path
        .relative(extractDir, absolutePath)
        .split(path.sep)
        .join('/');
      candidates.set(relativeKey, absolutePath);
    }
  };

  await visit(extractDir);
  return candidates;
}
/**
 * Resolves an attachment reference found in a page to a candidate key.
 *
 * A leading `./` or `/` is stripped and the rest is URL-decoded. The result is
 * tried first as-is, then joined onto `pageDir`. Keys are compared using `/`
 * separators, the same form `buildAttachmentCandidates` produces, so the
 * joined fallback is converted from the platform separator before lookup.
 *
 * A reference that is not valid percent-encoding is used undecoded instead of
 * throwing.
 *
 * @param raw Reference as written in the page (may be percent-encoded)
 * @param pageDir Directory of the page, relative to the extraction root
 * @param attachmentCandidates Known attachments keyed by relative path
 * @returns The matching key, or `null` when neither form is a known candidate
 */
export function resolveRelativeAttachmentPath(
  raw: string,
  pageDir: string,
  attachmentCandidates: Map<string, string>,
): string | null {
  const stripped = raw.replace(/^\.?\/+/, '');

  let mainRel: string;
  try {
    mainRel = decodeURIComponent(stripped);
  } catch {
    // malformed escape sequence (e.g. a literal "%"): keep the text as written
    mainRel = stripped;
  }

  if (attachmentCandidates.has(mainRel)) {
    return mainRel;
  }

  // path.join already normalizes "." / ".." segments
  const fallback = path.join(pageDir, mainRel).split(path.sep).join('/');
  if (attachmentCandidates.has(fallback)) {
    return fallback;
  }

  return null;
}
/**
 * Recursively lists the page files under a directory.
 *
 * @param dir Root directory to scan
 * @returns Paths (joined from `dir`) of every file whose extension is `.md`
 *          or `.html`, compared case-insensitively, in traversal order
 */
export async function collectMarkdownAndHtmlFiles(
  dir: string,
): Promise<string[]> {
  const pageExtensions = ['.md', '.html'];
  const pageFiles: string[] = [];

  const visit = async (folder: string): Promise<void> => {
    const children = await fs.readdir(folder, { withFileTypes: true });
    for (const child of children) {
      const childPath = path.join(folder, child.name);

      if (child.isDirectory()) {
        await visit(childPath);
        continue;
      }

      const extension = path.extname(child.name).toLowerCase();
      if (pageExtensions.includes(extension)) {
        pageFiles.push(childPath);
      }
    }
  };

  await visit(dir);
  return pageFiles;
}