From 17db634029f86aa054d54ac4420dfecc094adebd Mon Sep 17 00:00:00 2001 From: Philipinho <16838612+Philipinho@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:56:31 +0100 Subject: [PATCH] fix(base): enable duckdb disk spill + raise memory default to avoid oom on large bases --- .../base-query-cache.integration.spec.ts | 9 ++++++++ .../base/query-cache/collection-loader.ts | 20 +++++++++++++++++- .../base/query-cache/query-cache.config.ts | 2 ++ .../environment/environment.service.ts | 21 ++++++++++++++++--- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/apps/server/src/core/base/query-cache/base-query-cache.integration.spec.ts b/apps/server/src/core/base/query-cache/base-query-cache.integration.spec.ts index 071dbbb5..7898939e 100644 --- a/apps/server/src/core/base/query-cache/base-query-cache.integration.spec.ts +++ b/apps/server/src/core/base/query-cache/base-query-cache.integration.spec.ts @@ -65,6 +65,9 @@ class FakeEnvService { getBaseQueryCacheThreads() { return 2; } + getBaseQueryCacheTempDirectory() { + return `${require('node:os').tmpdir()}/docmost-duckdb-test`; + } getRedisUrl() { return REDIS_URL; } @@ -453,6 +456,9 @@ describeIntegration('BaseQueryCacheService LRU eviction', () => { getBaseQueryCacheThreads() { return 2; } + getBaseQueryCacheTempDirectory() { + return `${require('node:os').tmpdir()}/docmost-duckdb-test`; + } getRedisUrl() { return REDIS_URL; } @@ -1101,6 +1107,9 @@ describeIntegration('BaseQueryCacheService warm-up on boot', () => { getBaseQueryCacheThreads() { return 2; } + getBaseQueryCacheTempDirectory() { + return `${require('node:os').tmpdir()}/docmost-duckdb-test`; + } getRedisUrl() { return REDIS_URL; } diff --git a/apps/server/src/core/base/query-cache/collection-loader.ts b/apps/server/src/core/base/query-cache/collection-loader.ts index 5ac18c88..9be69867 100644 --- a/apps/server/src/core/base/query-cache/collection-loader.ts +++ b/apps/server/src/core/base/query-cache/collection-loader.ts @@ -36,16 +36,34 @@ 
export class CollectionLoader { const properties = await this.basePropertyRepo.findByBaseId(baseId); const specs = buildColumnSpecs(properties); - const { memoryLimit, threads } = this.config.config; + const { memoryLimit, threads, tempDirectory } = this.config.config; + + // Ensure the temp directory exists so DuckDB can spill to it. + // Creation is best-effort: a failure is logged as a warning and the + // load continues, leaving any hard error to DuckDB itself. + try { + const fs = require('node:fs'); + fs.mkdirSync(tempDirectory, { recursive: true }); + } catch (err) { + console.warn(`duckdb temp dir ${tempDirectory} not created`, err); + } + const instance = await DuckDBInstance.create(':memory:', { memory_limit: memoryLimit, threads: String(threads), + temp_directory: tempDirectory, }); const connection = await instance.connect(); try { await this.pgExtension.configureOnConnection(connection); + // Disable insertion-order preservation during bulk load — DuckDB's docs + // explicitly recommend this for memory-pressure on large inserts. Our + // loader doesn't depend on the insertion order (we sort via indexes + // or keyset cursors later), so this is free memory savings. + await connection.run('SET preserve_insertion_order = false'); + // Bulk load via CREATE TABLE AS SELECT. JSONB extraction happens // server-side via the base_cell_* helpers; DuckDB streams typed // columns over COPY BINARY into its vectorized insert path. 
diff --git a/apps/server/src/core/base/query-cache/query-cache.config.ts b/apps/server/src/core/base/query-cache/query-cache.config.ts index 3dafbc89..44b7d3f5 100644 --- a/apps/server/src/core/base/query-cache/query-cache.config.ts +++ b/apps/server/src/core/base/query-cache/query-cache.config.ts @@ -9,6 +9,7 @@ export type QueryCacheConfig = { memoryLimit: string; threads: number; trace: boolean; + tempDirectory: string; }; @Injectable() @@ -23,6 +24,7 @@ export class QueryCacheConfigProvider { memoryLimit: env.getBaseQueryCacheMemoryLimit(), threads: env.getBaseQueryCacheThreads(), trace: env.getBaseQueryCacheTrace(), + tempDirectory: env.getBaseQueryCacheTempDirectory(), }; } } diff --git a/apps/server/src/integrations/environment/environment.service.ts b/apps/server/src/integrations/environment/environment.service.ts index ffe96a15..3ef0b95b 100644 --- a/apps/server/src/integrations/environment/environment.service.ts +++ b/apps/server/src/integrations/environment/environment.service.ts @@ -354,11 +354,26 @@ export class EnvironmentService { getBaseQueryCacheMemoryLimit(): string { // Per-DuckDB-instance memory ceiling. DuckDB accepts human-readable sizes: - // '32MB', '128MB', '1GB'. Default keeps a single instance from - // monopolising the heap if a runaway query needs to spill. + // '256MB', '1GB', etc. Default 512MB is sized for bases up to ~300K rows + // with moderate schemas without spilling. DuckDB automatically spills + // to `temp_directory` when this is exceeded, so over-allocating is + // cheap — the risk is under-sizing. return this.configService.get( 'BASE_QUERY_CACHE_MEMORY_LIMIT', - '64MB', + '512MB', + ); + } + + getBaseQueryCacheTempDirectory(): string { + // Directory DuckDB uses to spill pages when an instance exceeds its + // memory_limit. Defaults to a fixed `docmost-duckdb-cache` folder under + // os.tmpdir(); every process left on this default shares it. 
Setting this + // explicitly is what enables spill-to-disk on `:memory:` instances — + // without it, DuckDB OOMs at memory_limit instead of paging. + const defaultPath = `${require('node:os').tmpdir()}/docmost-duckdb-cache`; + return this.configService.get( + 'BASE_QUERY_CACHE_TEMP_DIR', + defaultPath, + ); }