mirror of
https://github.com/docmost/docmost.git
synced 2026-05-14 20:54:07 +08:00
feat: bulk page imports (#1219)
* refactor imports - WIP * Add readstream * WIP * fix attachmentId render * fix attachmentId render * turndown video tag * feat: add stream upload support and improve file handling - Add stream upload functionality to storage drivers; improve ZIP file extraction with better encoding handling; fix attachment ID rendering issues; add AWS S3 upload stream support; update dependencies for better compatibility * WIP * notion formatter * move embed parser to editor-ext package * import embeds * utility files * cleanup * Switch from happy-dom to cheerio * Refine code * WIP * bug fixes and UI * sync * WIP * sync * keep import modal mounted * Show modal during upload * WIP * WIP
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
import * as yauzl from 'yauzl';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'node:fs';
|
||||
|
||||
// Kind of background job that operates on user files.
export enum FileTaskType {
  // Importing pages from an uploaded archive.
  Import = 'import',
  // Exporting pages into a downloadable archive.
  Export = 'export',
}
|
||||
|
||||
// Origin tool of an imported archive (e.g. a Notion or Confluence export).
export enum FileImportSource {
  // Plain Markdown/HTML files without tool-specific markup.
  Generic = 'generic',
  // Archive produced by a Notion export.
  Notion = 'notion',
  // Archive produced by a Confluence export.
  Confluence = 'confluence',
}
|
||||
|
||||
// Lifecycle states of a file task.
export enum FileTaskStatus {
  // Task is currently being worked on.
  Processing = 'processing',
  // Task finished successfully.
  Success = 'success',
  // Task terminated with an error.
  Failed = 'failed',
}
|
||||
|
||||
export function getFileTaskFolderPath(
|
||||
type: FileTaskType,
|
||||
workspaceId: string,
|
||||
): string {
|
||||
switch (type) {
|
||||
case FileTaskType.Import:
|
||||
return `${workspaceId}/imports`;
|
||||
case FileTaskType.Export:
|
||||
return `${workspaceId}/exports`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts a ZIP archive.
|
||||
*/
|
||||
export async function extractZip(
|
||||
source: string,
|
||||
target: string,
|
||||
): Promise<void> {
|
||||
return extractZipInternal(source, target, true);
|
||||
}
|
||||
|
||||
/**
 * Internal helper to extract a ZIP, with optional single-nested-ZIP handling.
 *
 * Uses yauzl in lazy-entry mode: entries are pumped one at a time via
 * `readEntry()`, and the promise resolves on the zipfile's 'end' event.
 * Entries whose paths are too long for the filesystem (ENAMETOOLONG) are
 * skipped with a warning rather than failing the whole extraction.
 *
 * @param source Path to the ZIP file
 * @param target Directory to extract into
 * @param allowNested Whether to check and unwrap one level of nested ZIP
 */
function extractZipInternal(
  source: string,
  target: string,
  allowNested: boolean,
): Promise<void> {
  return new Promise((resolve, reject) => {
    yauzl.open(
      source,
      // decodeStrings: false keeps entry names as raw Buffers so encoding
      // is handled explicitly below; lazyEntries lets us pump entries.
      { lazyEntries: true, decodeStrings: false, autoClose: true },
      (err, zipfile) => {
        if (err) return reject(err);

        // Handle one level of nested ZIP if allowed: an archive whose sole
        // entry is itself a .zip (common for wrapped exports) is unpacked
        // to a temp file and extracted instead.
        if (allowNested && zipfile.entryCount === 1) {
          zipfile.readEntry();
          zipfile.once('entry', (entry) => {
            // Entry name is a Buffer (decodeStrings: false); strip any
            // leading slashes before inspecting it.
            const name = entry.fileName.toString('utf8').replace(/^\/+/, '');
            // A nested zip must be a file (no trailing '/') ending in .zip.
            const isZip =
              !/\/$/.test(entry.fileName) &&
              name.toLowerCase().endsWith('.zip');
            if (isZip) {
              // temporary name to avoid overwriting file
              const nestedPath = source.endsWith('.zip')
                ? source.slice(0, -4) + '.inner.zip'
                : source + '.inner.zip';

              // Copy the inner zip out to disk, then recurse on it with
              // nested handling disabled (only one level is unwrapped).
              zipfile.openReadStream(entry, (openErr, rs) => {
                if (openErr) return reject(openErr);
                const ws = fs.createWriteStream(nestedPath);
                rs.on('error', reject);
                ws.on('error', reject);
                ws.on('finish', () => {
                  zipfile.close();
                  extractZipInternal(nestedPath, target, false)
                    .then(() => {
                      // Remove the temp inner zip once fully extracted.
                      fs.unlinkSync(nestedPath);
                      resolve();
                    })
                    .catch(reject);
                });
                rs.pipe(ws);
              });
            } else {
              // Single entry but not a zip: re-open and extract normally.
              zipfile.close();
              extractZipInternal(source, target, false).then(resolve, reject);
            }
          });
          zipfile.once('error', reject);
          return;
        }

        // Normal extraction
        zipfile.readEntry();
        zipfile.on('entry', (entry) => {
          const name = entry.fileName.toString('utf8');
          // Guard against absolute paths inside the archive.
          const safe = name.replace(/^\/+/, '');
          // Skip macOS resource-fork metadata folders.
          if (safe.startsWith('__MACOSX/')) {
            zipfile.readEntry();
            return;
          }

          const fullPath = path.join(target, safe);

          // Handle directories (entries whose raw name ends with '/')
          if (/\/$/.test(name)) {
            try {
              fs.mkdirSync(fullPath, { recursive: true });
            } catch (mkdirErr: any) {
              // Skip paths the filesystem cannot represent; fail otherwise.
              if (mkdirErr.code === 'ENAMETOOLONG') {
                console.warn(`Skipping directory (path too long): ${fullPath}`);
                zipfile.readEntry();
                return;
              }
              return reject(mkdirErr);
            }
            zipfile.readEntry();
            return;
          }

          // Handle files: ensure the parent directory exists first.
          try {
            fs.mkdirSync(path.dirname(fullPath), { recursive: true });
          } catch (mkdirErr: any) {
            if (mkdirErr.code === 'ENAMETOOLONG') {
              console.warn(
                `Skipping file directory creation (path too long): ${fullPath}`,
              );
              zipfile.readEntry();
              return;
            }
            return reject(mkdirErr);
          }

          zipfile.openReadStream(entry, (openErr, rs) => {
            if (openErr) return reject(openErr);

            let ws: fs.WriteStream;
            try {
              ws = fs.createWriteStream(fullPath);
            } catch (openWsErr: any) {
              if (openWsErr.code === 'ENAMETOOLONG') {
                console.warn(
                  `Skipping file write (path too long): ${fullPath}`,
                );
                zipfile.readEntry();
                return;
              }
              return reject(openWsErr);
            }

            rs.on('error', (err) => reject(err));
            ws.on('error', (err) => {
              // ENAMETOOLONG may only surface once the stream opens the
              // file; treat it as a skip, like the sync cases above.
              if ((err as any).code === 'ENAMETOOLONG') {
                console.warn(
                  `Skipping file write on stream (path too long): ${fullPath}`,
                );
                zipfile.readEntry();
              } else {
                reject(err);
              }
            });
            // Pump the next entry only after this file is fully written.
            ws.on('finish', () => zipfile.readEntry());
            rs.pipe(ws);
          });
        });

        zipfile.on('end', () => resolve());
        zipfile.on('error', (err) => reject(err));
      },
    );
  });
}
|
||||
|
||||
export function cleanUrlString(url: string): string {
|
||||
if (!url) return null;
|
||||
const [mainUrl] = url.split('?', 1);
|
||||
return mainUrl;
|
||||
}
|
||||
@@ -0,0 +1,254 @@
|
||||
import { getEmbedUrlAndProvider } from '@docmost/editor-ext';
|
||||
import * as path from 'path';
|
||||
import { v7 } from 'uuid';
|
||||
import { InsertableBacklink } from '@docmost/db/types/entity.types';
|
||||
import { Cheerio, CheerioAPI, load } from 'cheerio';
|
||||
|
||||
export async function formatImportHtml(opts: {
|
||||
html: string;
|
||||
currentFilePath: string;
|
||||
filePathToPageMetaMap: Map<
|
||||
string,
|
||||
{ id: string; title: string; slugId: string }
|
||||
>;
|
||||
creatorId: string;
|
||||
sourcePageId: string;
|
||||
workspaceId: string;
|
||||
pageDir?: string;
|
||||
attachmentCandidates?: string[];
|
||||
}): Promise<{ html: string; backlinks: InsertableBacklink[] }> {
|
||||
const {
|
||||
html,
|
||||
currentFilePath,
|
||||
filePathToPageMetaMap,
|
||||
creatorId,
|
||||
sourcePageId,
|
||||
workspaceId,
|
||||
} = opts;
|
||||
const $: CheerioAPI = load(html);
|
||||
const $root: Cheerio<any> = $.root();
|
||||
|
||||
notionFormatter($, $root);
|
||||
defaultHtmlFormatter($, $root);
|
||||
|
||||
const backlinks = await rewriteInternalLinksToMentionHtml(
|
||||
$,
|
||||
$root,
|
||||
currentFilePath,
|
||||
filePathToPageMetaMap,
|
||||
creatorId,
|
||||
sourcePageId,
|
||||
workspaceId,
|
||||
);
|
||||
|
||||
return {
|
||||
html: $root.html() || '',
|
||||
backlinks,
|
||||
};
|
||||
}
|
||||
|
||||
export function defaultHtmlFormatter($: CheerioAPI, $root: Cheerio<any>) {
|
||||
$root.find('a[href]').each((_, el) => {
|
||||
const $el = $(el);
|
||||
const url = $el.attr('href')!;
|
||||
const { provider } = getEmbedUrlAndProvider(url);
|
||||
if (provider === 'iframe') return;
|
||||
|
||||
const embed = `<div data-type=\"embed\" data-src=\"${url}\" data-provider=\"${provider}\" data-align=\"center\" data-width=\"640\" data-height=\"480\"></div>`;
|
||||
$el.replaceWith(embed);
|
||||
});
|
||||
|
||||
$root.find('iframe[src]').each((_, el) => {
|
||||
const $el = $(el);
|
||||
const url = $el.attr('src')!;
|
||||
const { provider } = getEmbedUrlAndProvider(url);
|
||||
|
||||
const embed = `<div data-type=\"embed\" data-src=\"${url}\" data-provider=\"${provider}\" data-align=\"center\" data-width=\"640\" data-height=\"480\"></div>`;
|
||||
$el.replaceWith(embed);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Notion-specific formatter: rewrites markup produced by Notion HTML exports
 * into Docmost's node structure (math, callouts, task lists, toggles,
 * bookmarks) and strips export-only chrome (empty descriptions, TOC).
 * Mutates the cheerio document in place.
 */
export function notionFormatter($: CheerioAPI, $root: Cheerio<any>) {
  // remove empty description paragraphs
  $root.find('p.page-description').each((_, el) => {
    if (!$(el).text().trim()) $(el).remove();
  });

  // block math → mathBlock: pull the raw TeX out of the MathML annotation
  // and emit a Docmost mathBlock div containing just the TeX source.
  $root.find('figure.equation').each((_: any, fig: any) => {
    const $fig = $(fig);
    const tex = $fig
      .find('annotation[encoding="application/x-tex"]')
      .text()
      .trim();
    const $math = $('<div>')
      .attr('data-type', 'mathBlock')
      .attr('data-katex', 'true')
      .text(tex);
    $fig.replaceWith($math);
  });

  // inline math → mathInline (same TeX extraction as block math)
  $root.find('span.notion-text-equation-token').each((_, tok) => {
    const $tok = $(tok);
    // Drop the adjacent <style> element Notion emits for the token.
    const $prev = $tok.prev('style');
    if ($prev.length) $prev.remove();
    const tex = $tok
      .find('annotation[encoding="application/x-tex"]')
      .text()
      .trim();
    const $inline = $('<span>')
      .attr('data-type', 'mathInline')
      .attr('data-katex', 'true')
      .text(tex);
    $tok.replaceWith($inline);
  });

  // callouts: processed in reverse document order so replacing an outer
  // figure does not invalidate a not-yet-processed nested one.
  $root
    .find('figure.callout')
    .get()
    .reverse()
    .forEach((fig) => {
      const $fig = $(fig);
      // Second <div> holds the callout body (the first is the icon).
      const $content = $fig.find('div').eq(1);
      if (!$content.length) return;
      const $wrapper = $('<div>')
        .attr('data-type', 'callout')
        .attr('data-callout-type', 'info');
      // @ts-ignore
      $content.children().each((_, child) => $wrapper.append(child));
      $fig.replaceWith($wrapper);
    });

  // to-do lists → taskList/taskItem structure
  $root.find('ul.to-do-list').each((_, list) => {
    const $old = $(list);
    const $new = $('<ul>').attr('data-type', 'taskList');
    $old.find('li').each((_, li) => {
      const $li = $(li);
      // Notion marks completed items with a .checkbox-on element.
      const isChecked = $li.find('.checkbox.checkbox-on').length > 0;
      const text =
        $li
          .find('span.to-do-children-unchecked, span.to-do-children-checked')
          .first()
          .text()
          .trim() || '';
      const $taskItem = $('<li>')
        .attr('data-type', 'taskItem')
        .attr('data-checked', String(isChecked));
      const $label = $('<label>');
      const $input = $('<input>').attr('type', 'checkbox');
      if (isChecked) $input.attr('checked', '');
      $label.append($input, $('<span>'));
      const $container = $('<div>').append($('<p>').text(text));
      $taskItem.append($label, $container);
      $new.append($taskItem);
    });
    $old.replaceWith($new);
  });

  // toggle blocks: hoist each <details> out of its list wrapper, removing
  // wrappers that end up empty (reverse order for nested toggles).
  $root
    .find('ul.toggle details')
    .get()
    .reverse()
    .forEach((det) => {
      const $det = $(det);
      const $li = $det.closest('li');
      if ($li.length) {
        $li.before($det);
        if (!$li.children().length) $li.remove();
      }
      const $ul = $det.closest('ul.toggle');
      if ($ul.length) {
        $ul.before($det);
        if (!$ul.children().length) $ul.remove();
      }
    });

  // bookmarks: collapse Notion's bookmark figure into a simple anchor
  // carrying the title (falls back to the href when no title exists).
  $root
    .find('figure')
    .filter((_, fig) => $(fig).find('a.bookmark.source').length > 0)
    .get()
    .reverse()
    .forEach((fig) => {
      const $fig = $(fig);
      const $link = $fig.find('a.bookmark.source').first();
      if (!$link.length) return;

      const href = $link.attr('href')!;
      const title = $link.find('.bookmark-title').text().trim() || href;

      const $newAnchor = $('<a>')
        .addClass('bookmark source')
        .attr('href', href)
        .append($('<div>').addClass('bookmark-info').text(title));

      $fig.replaceWith($newAnchor);
    });

  // remove toc (Docmost renders its own table of contents)
  $root.find('nav.table_of_contents').remove();
}
|
||||
|
||||
export function unwrapFromParagraph($: CheerioAPI, $node: Cheerio<any>) {
|
||||
// find the nearest <p> or <a> ancestor
|
||||
let $wrapper = $node.closest('p, a');
|
||||
|
||||
while ($wrapper.length) {
|
||||
// if the wrapper has only our node inside, replace it entirely
|
||||
if ($wrapper.contents().length === 1) {
|
||||
$wrapper.replaceWith($node);
|
||||
} else {
|
||||
// otherwise just move the node to before the wrapper
|
||||
$wrapper.before($node);
|
||||
}
|
||||
// look again for any new wrapper around $node
|
||||
$wrapper = $node.closest('p, a');
|
||||
}
|
||||
}
|
||||
|
||||
export async function rewriteInternalLinksToMentionHtml(
|
||||
$: CheerioAPI,
|
||||
$root: Cheerio<any>,
|
||||
currentFilePath: string,
|
||||
filePathToPageMetaMap: Map<
|
||||
string,
|
||||
{ id: string; title: string; slugId: string }
|
||||
>,
|
||||
creatorId: string,
|
||||
sourcePageId: string,
|
||||
workspaceId: string,
|
||||
): Promise<InsertableBacklink[]> {
|
||||
const normalize = (p: string) => p.replace(/\\/g, '/');
|
||||
const backlinks: InsertableBacklink[] = [];
|
||||
|
||||
$root.find('a[href]').each((_, el) => {
|
||||
const $a = $(el);
|
||||
const raw = $a.attr('href')!;
|
||||
if (raw.startsWith('http') || raw.startsWith('/api/')) return;
|
||||
const resolved = normalize(
|
||||
path.join(path.dirname(currentFilePath), decodeURIComponent(raw)),
|
||||
);
|
||||
const meta = filePathToPageMetaMap.get(resolved);
|
||||
if (!meta) return;
|
||||
const mentionId = v7();
|
||||
const $mention = $('<span>')
|
||||
.attr({
|
||||
'data-type': 'mention',
|
||||
'data-id': mentionId,
|
||||
'data-entity-type': 'page',
|
||||
'data-entity-id': meta.id,
|
||||
'data-label': meta.title,
|
||||
'data-slug-id': meta.slugId,
|
||||
'data-creator-id': creatorId,
|
||||
})
|
||||
.text(meta.title);
|
||||
$a.replaceWith($mention);
|
||||
backlinks.push({ sourcePageId, targetPageId: meta.id, workspaceId });
|
||||
});
|
||||
|
||||
return backlinks;
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
import { promises as fs } from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
export async function buildAttachmentCandidates(
|
||||
extractDir: string,
|
||||
): Promise<Map<string, string>> {
|
||||
const map = new Map<string, string>();
|
||||
async function walk(dir: string) {
|
||||
for (const ent of await fs.readdir(dir, { withFileTypes: true })) {
|
||||
const abs = path.join(dir, ent.name);
|
||||
if (ent.isDirectory()) {
|
||||
await walk(abs);
|
||||
} else {
|
||||
if (['.md', '.html'].includes(path.extname(ent.name).toLowerCase())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const rel = path.relative(extractDir, abs).split(path.sep).join('/');
|
||||
map.set(rel, abs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await walk(extractDir);
|
||||
return map;
|
||||
}
|
||||
|
||||
export function resolveRelativeAttachmentPath(
|
||||
raw: string,
|
||||
pageDir: string,
|
||||
attachmentCandidates: Map<string, string>,
|
||||
): string | null {
|
||||
const mainRel = decodeURIComponent(raw.replace(/^\.?\/+/, ''));
|
||||
const fallback = path.normalize(path.join(pageDir, mainRel));
|
||||
|
||||
if (attachmentCandidates.has(mainRel)) {
|
||||
return mainRel;
|
||||
}
|
||||
if (attachmentCandidates.has(fallback)) {
|
||||
return fallback;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
export async function collectMarkdownAndHtmlFiles(
|
||||
dir: string,
|
||||
): Promise<string[]> {
|
||||
const results: string[] = [];
|
||||
|
||||
async function walk(current: string) {
|
||||
const entries = await fs.readdir(current, { withFileTypes: true });
|
||||
for (const ent of entries) {
|
||||
const fullPath = path.join(current, ent.name);
|
||||
if (ent.isDirectory()) {
|
||||
await walk(fullPath);
|
||||
} else if (
|
||||
['.md', '.html'].includes(path.extname(ent.name).toLowerCase())
|
||||
) {
|
||||
results.push(fullPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await walk(dir);
|
||||
return results;
|
||||
}
|
||||
Reference in New Issue
Block a user