fix(base): enable duckdb disk spill + raise memory default to avoid oom on large bases

This commit is contained in:
Philipinho
2026-04-23 13:56:31 +01:00
parent 5ebab5cd9e
commit 17db634029
4 changed files with 48 additions and 4 deletions
@@ -65,6 +65,9 @@ class FakeEnvService {
getBaseQueryCacheThreads() { getBaseQueryCacheThreads() {
return 2; return 2;
} }
getBaseQueryCacheTempDirectory() {
  // Test stub: a fixed, test-only subfolder of the OS temp directory, so
  // the suite never touches the production spill location.
  const { tmpdir } = require('node:os');
  return `${tmpdir()}/docmost-duckdb-test`;
}
getRedisUrl() { getRedisUrl() {
return REDIS_URL; return REDIS_URL;
} }
@@ -453,6 +456,9 @@ describeIntegration('BaseQueryCacheService LRU eviction', () => {
getBaseQueryCacheThreads() { getBaseQueryCacheThreads() {
return 2; return 2;
} }
getBaseQueryCacheTempDirectory() {
  // Test stub: resolves the OS temp directory at call time and appends the
  // fixed test-only folder name.
  const { tmpdir } = require('node:os');
  return `${tmpdir()}/docmost-duckdb-test`;
}
getRedisUrl() { getRedisUrl() {
return REDIS_URL; return REDIS_URL;
} }
@@ -1101,6 +1107,9 @@ describeIntegration('BaseQueryCacheService warm-up on boot', () => {
getBaseQueryCacheThreads() { getBaseQueryCacheThreads() {
return 2; return 2;
} }
getBaseQueryCacheTempDirectory() {
  // Test stub: OS temp directory plus a fixed test-only folder name.
  const { tmpdir } = require('node:os');
  return `${tmpdir()}/docmost-duckdb-test`;
}
getRedisUrl() { getRedisUrl() {
return REDIS_URL; return REDIS_URL;
} }
@@ -36,16 +36,34 @@ export class CollectionLoader {
const properties = await this.basePropertyRepo.findByBaseId(baseId); const properties = await this.basePropertyRepo.findByBaseId(baseId);
const specs = buildColumnSpecs(properties); const specs = buildColumnSpecs(properties);
const { memoryLimit, threads } = this.config.config; const { memoryLimit, threads, tempDirectory } = this.config.config;
// Ensure the temp directory exists so DuckDB can spill to it.
// Swallow errors — if creation fails, DuckDB will fail its own sanity
// check and we'll log that instead of crashing here.
try {
const fs = require('node:fs');
fs.mkdirSync(tempDirectory, { recursive: true });
} catch {
/* swallow */
}
const instance = await DuckDBInstance.create(':memory:', { const instance = await DuckDBInstance.create(':memory:', {
memory_limit: memoryLimit, memory_limit: memoryLimit,
threads: String(threads), threads: String(threads),
temp_directory: tempDirectory,
}); });
const connection = await instance.connect(); const connection = await instance.connect();
try { try {
await this.pgExtension.configureOnConnection(connection); await this.pgExtension.configureOnConnection(connection);
// Disable insertion-order preservation during bulk load — DuckDB's docs
// explicitly recommend this to reduce memory pressure on large inserts.
// Our loader doesn't depend on insertion order (we sort via indexes or
// keyset cursors later), so the memory savings come for free.
await connection.run('SET preserve_insertion_order = false');
// Bulk load via CREATE TABLE AS SELECT. JSONB extraction happens // Bulk load via CREATE TABLE AS SELECT. JSONB extraction happens
// server-side via the base_cell_* helpers; DuckDB streams typed // server-side via the base_cell_* helpers; DuckDB streams typed
// columns over COPY BINARY into its vectorized insert path. // columns over COPY BINARY into its vectorized insert path.
@@ -9,6 +9,7 @@ export type QueryCacheConfig = {
memoryLimit: string; memoryLimit: string;
threads: number; threads: number;
trace: boolean; trace: boolean;
tempDirectory: string;
}; };
@Injectable() @Injectable()
@@ -23,6 +24,7 @@ export class QueryCacheConfigProvider {
memoryLimit: env.getBaseQueryCacheMemoryLimit(), memoryLimit: env.getBaseQueryCacheMemoryLimit(),
threads: env.getBaseQueryCacheThreads(), threads: env.getBaseQueryCacheThreads(),
trace: env.getBaseQueryCacheTrace(), trace: env.getBaseQueryCacheTrace(),
tempDirectory: env.getBaseQueryCacheTempDirectory(),
}; };
} }
} }
@@ -354,11 +354,26 @@ export class EnvironmentService {
getBaseQueryCacheMemoryLimit(): string { getBaseQueryCacheMemoryLimit(): string {
// Per-DuckDB-instance memory ceiling. DuckDB accepts human-readable sizes: // Per-DuckDB-instance memory ceiling. DuckDB accepts human-readable sizes:
// '32MB', '128MB', '1GB'. Default keeps a single instance from // '256MB', '1GB', etc. Default 512MB is sized for bases up to ~300K rows
// monopolising the heap if a runaway query needs to spill. // with moderate schemas without spilling. DuckDB automatically spills
// to `temp_directory` when this is exceeded, so over-allocating is
// cheap — the risk is under-sizing.
return this.configService.get<string>( return this.configService.get<string>(
'BASE_QUERY_CACHE_MEMORY_LIMIT', 'BASE_QUERY_CACHE_MEMORY_LIMIT',
'64MB', '512MB',
);
}
getBaseQueryCacheTempDirectory(): string {
  // Directory passed to DuckDB as `temp_directory`, i.e. where an instance
  // spills pages once it exceeds its memory_limit. Override it with the
  // BASE_QUERY_CACHE_TEMP_DIR setting.
  //
  // The default is a fixed, app-specific subfolder of the OS temp dir. That
  // keeps the spill files apart from other software's temp files, but every
  // process that relies on the default shares the same folder — set
  // BASE_QUERY_CACHE_TEMP_DIR per process if they must be kept separate.
  //
  // Original rationale: passing this explicitly is what lets `:memory:`
  // instances page to disk instead of failing at memory_limit.
  // NOTE(review): DuckDB's configuration docs list `.tmp` as the in-memory
  // default for temp_directory — confirm that spilling really is off
  // without this setting.
  const os = require('node:os');
  const path = require('node:path');
  // path.join picks the platform separator and collapses duplicate slashes
  // (e.g. when the temp dir is the filesystem root).
  const defaultPath = path.join(os.tmpdir(), 'docmost-duckdb-cache');
  return this.configService.get<string>(
    'BASE_QUERY_CACHE_TEMP_DIR',
    defaultPath,
  );
}