fix(base): enable duckdb disk spill + raise memory default to avoid oom on large bases

This commit is contained in:
Philipinho
2026-04-23 13:56:31 +01:00
parent 5ebab5cd9e
commit 17db634029
4 changed files with 48 additions and 4 deletions
@@ -65,6 +65,9 @@ class FakeEnvService {
// Fixed DuckDB worker-thread count for the test fixture; a small constant
// keeps test instances lightweight and deterministic regardless of host cores.
getBaseQueryCacheThreads() {
  const duckdbThreads = 2;
  return duckdbThreads;
}
// Spill directory for DuckDB, rooted in the OS temp dir and namespaced so
// test runs don't collide with other processes writing to tmp.
getBaseQueryCacheTempDirectory() {
  const os = require('node:os');
  return `${os.tmpdir()}/docmost-duckdb-test`;
}
// Redis connection string for the test harness.
// NOTE(review): REDIS_URL is a module-level constant defined outside this
// fragment — presumably a local test Redis instance; confirm at its definition.
getRedisUrl() {
return REDIS_URL;
}
@@ -453,6 +456,9 @@ describeIntegration('BaseQueryCacheService LRU eviction', () => {
// Fixed DuckDB worker-thread count for the test fixture; a small constant
// keeps test instances lightweight and deterministic regardless of host cores.
getBaseQueryCacheThreads() {
  const duckdbThreads = 2;
  return duckdbThreads;
}
// Spill directory for DuckDB, rooted in the OS temp dir and namespaced so
// test runs don't collide with other processes writing to tmp.
getBaseQueryCacheTempDirectory() {
  const os = require('node:os');
  return `${os.tmpdir()}/docmost-duckdb-test`;
}
// Redis connection string for the test harness.
// NOTE(review): REDIS_URL is a module-level constant defined outside this
// fragment — presumably a local test Redis instance; confirm at its definition.
getRedisUrl() {
return REDIS_URL;
}
@@ -1101,6 +1107,9 @@ describeIntegration('BaseQueryCacheService warm-up on boot', () => {
// Fixed DuckDB worker-thread count for the test fixture; a small constant
// keeps test instances lightweight and deterministic regardless of host cores.
getBaseQueryCacheThreads() {
  const duckdbThreads = 2;
  return duckdbThreads;
}
// Spill directory for DuckDB, rooted in the OS temp dir and namespaced so
// test runs don't collide with other processes writing to tmp.
getBaseQueryCacheTempDirectory() {
  const os = require('node:os');
  return `${os.tmpdir()}/docmost-duckdb-test`;
}
// Redis connection string for the test harness.
// NOTE(review): REDIS_URL is a module-level constant defined outside this
// fragment — presumably a local test Redis instance; confirm at its definition.
getRedisUrl() {
return REDIS_URL;
}
@@ -36,16 +36,34 @@ export class CollectionLoader {
// Load the base's column definitions and spin up an in-memory DuckDB
// instance configured with the memory ceiling, thread count, and spill
// directory from the query-cache config.
const properties = await this.basePropertyRepo.findByBaseId(baseId);
const specs = buildColumnSpecs(properties);
// FIXME(review): the next two lines are a stripped-diff artifact — both the
// pre-change and post-change destructuring survive, which redeclares
// memoryLimit/threads in the same scope. Delete the first (two-field) line;
// only the version that also pulls `tempDirectory` should remain.
const { memoryLimit, threads } = this.config.config;
const { memoryLimit, threads, tempDirectory } = this.config.config;
// Ensure the temp directory exists so DuckDB can spill to it.
// Swallow errors — if creation fails, DuckDB will fail its own sanity
// check and we'll log that instead of crashing here.
try {
const fs = require('node:fs');
fs.mkdirSync(tempDirectory, { recursive: true });
} catch {
/* swallow */
}
// ':memory:' instance: data lives in RAM, with temp_directory enabling
// spill-to-disk once memory_limit is exceeded.
const instance = await DuckDBInstance.create(':memory:', {
memory_limit: memoryLimit,
threads: String(threads),
temp_directory: tempDirectory,
});
const connection = await instance.connect();
try {
// NOTE(review): configureOnConnection is an opaque helper — presumably
// installs/loads the Postgres extension on this connection; confirm.
await this.pgExtension.configureOnConnection(connection);
// Disable insertion-order preservation during bulk load — DuckDB's docs
// explicitly recommend this for memory-pressure on large inserts. Our
// loader doesn't depend on the insertion order (we sort via indexes
// or keyset cursors later), so this is free memory savings.
await connection.run('SET preserve_insertion_order = false');
// Bulk load via CREATE TABLE AS SELECT. JSONB extraction happens
// server-side via the base_cell_* helpers; DuckDB streams typed
// columns over COPY BINARY into its vectorized insert path.
@@ -9,6 +9,7 @@ export type QueryCacheConfig = {
// Per-DuckDB-instance memory ceiling, human-readable (e.g. '512MB', '1GB').
memoryLimit: string;
// Worker-thread count handed to DuckDB (stringified at instance creation).
threads: number;
// NOTE(review): presumably toggles query tracing/diagnostics — confirm usage.
trace: boolean;
// Directory DuckDB spills pages to once memoryLimit is exceeded.
tempDirectory: string;
};
@Injectable()
@@ -23,6 +24,7 @@ export class QueryCacheConfigProvider {
memoryLimit: env.getBaseQueryCacheMemoryLimit(),
threads: env.getBaseQueryCacheThreads(),
trace: env.getBaseQueryCacheTrace(),
tempDirectory: env.getBaseQueryCacheTempDirectory(),
};
}
}
@@ -354,11 +354,26 @@ export class EnvironmentService {
/**
 * Per-DuckDB-instance memory ceiling, read from BASE_QUERY_CACHE_MEMORY_LIMIT.
 *
 * DuckDB accepts human-readable sizes: '256MB', '1GB', etc. The 512MB default
 * is sized for bases up to ~300K rows with moderate schemas without spilling.
 * DuckDB automatically spills to `temp_directory` when this is exceeded, so
 * over-allocating is cheap — the risk is under-sizing.
 *
 * Fix: the stripped diff left BOTH the superseded comment/default ('64MB')
 * and the new ones in place, making this a three-argument call to
 * ConfigService.get. Only the post-commit '512MB' default is kept.
 */
getBaseQueryCacheMemoryLimit(): string {
  return this.configService.get<string>(
    'BASE_QUERY_CACHE_MEMORY_LIMIT',
    '512MB',
  );
}
/**
 * Directory DuckDB uses to spill pages when an instance exceeds its
 * memory_limit, read from BASE_QUERY_CACHE_TEMP_DIR.
 *
 * Falls back to the system temp dir plus a namespace so different processes
 * don't collide. Setting this explicitly is what enables spill-to-disk on
 * `:memory:` instances — without it, DuckDB OOMs at memory_limit instead
 * of paging.
 */
getBaseQueryCacheTempDirectory(): string {
  const os = require('node:os');
  const fallbackDir = `${os.tmpdir()}/docmost-duckdb-cache`;
  return this.configService.get<string>(
    'BASE_QUERY_CACHE_TEMP_DIR',
    fallbackDir,
  );
}