From 1a848e88fc64406437c075d654a373b4c45e63e1 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 4 Mar 2026 23:12:14 +0000 Subject: [PATCH 1/3] refactor: introduce @workglow/knowledge-base package as we finish to switch from the name datasets - Replaced the deprecated `@workglow/dataset` with the new `@workglow/knowledge-base` package across various modules, including updates to package.json, README, and task schemas. - Consolidated document and chunk management under the new KnowledgeBase architecture, enhancing the handling of documents and chunks in RAG workflows. - Updated import paths and dependencies in multiple files to reflect the transition to the new package structure. - Removed the old dataset package and its related files, streamlining the codebase. --- .claude/CLAUDE.md | 8 +- TODO.md | 10 +- bun.lock | 40 ++--- docs/developers/03_extending.md | 2 +- examples/web/package.json | 2 +- examples/web/vite.config.js | 2 +- packages/ai/README.md | 2 +- packages/ai/package.json | 6 +- packages/ai/src/task/ChunkRetrievalTask.ts | 4 +- packages/ai/src/task/ChunkToVectorTask.ts | 2 +- .../src/task/ChunkVectorHybridSearchTask.ts | 2 +- packages/ai/src/task/ChunkVectorSearchTask.ts | 4 +- packages/ai/src/task/ChunkVectorUpsertTask.ts | 2 +- packages/ai/src/task/ContextBuilderTask.ts | 2 +- packages/ai/src/task/DocumentEnricherTask.ts | 2 +- .../ai/src/task/HierarchicalChunkerTask.ts | 2 +- packages/ai/src/task/HierarchyJoinTask.ts | 2 +- packages/ai/src/task/StructuralParserTask.ts | 2 +- packages/ai/tsconfig.json | 1 - .../knowledge-base/KnowledgeBaseRegistry.ts | 74 --------- .../{dataset => knowledge-base}/CHANGELOG.md | 2 +- packages/{dataset => knowledge-base}/LICENSE | 0 .../{dataset => knowledge-base}/README.md | 16 +- .../{dataset => knowledge-base}/package.json | 2 +- .../src/browser.ts | 0 .../{dataset => knowledge-base}/src/bun.ts | 0 .../src/chunk/ChunkSchema.ts | 0 .../src/chunk/ChunkVectorStorageSchema.ts | 0 .../src/common-server.ts | 0 .../{dataset => knowledge-base}/src/common.ts | 3 + .../src/document/Document.ts | 0 .../src/document/DocumentNode.ts | 0 .../src/document/DocumentSchema.ts | 0 .../src/document/DocumentStorageSchema.ts | 0 .../src/document/StructuralParser.ts | 0 .../InMemoryKnowledgeBaseRepository.ts | 18 +++ .../src/knowledge-base/KnowledgeBase.ts | 13 +- .../knowledge-base/KnowledgeBaseRegistry.ts | 122 ++++++++++++++ .../knowledge-base/KnowledgeBaseRepository.ts | 151 ++++++++++++++++++ .../src/knowledge-base/KnowledgeBaseSchema.ts | 52 ++++++ .../src/knowledge-base/createKnowledgeBase.ts | 8 +- .../{dataset => knowledge-base}/src/node.ts | 0 .../{dataset => knowledge-base}/src/types.ts | 0 .../src/util/DatasetSchema.ts | 8 +- .../{dataset => knowledge-base}/tsconfig.json | 0 packages/storage/src/vector/README.md | 16 +- packages/test/package.json | 6 +- .../test/src/test/rag/ChunkToVector.test.ts | 2 +- packages/test/src/test/rag/Document.test.ts | 4 +- .../rag/DocumentChunkRetrievalTask.test.ts | 2 +- .../test/rag/DocumentChunkSearchTask.test.ts | 2 +- .../test/rag/DocumentChunkUpsertTask.test.ts | 2 +- .../src/test/rag/DocumentRepository.test.ts | 2 +- .../src/test/rag/EndToEnd.integration.test.ts | 2 +- packages/test/src/test/rag/FullChain.test.ts | 2 +- .../src/test/rag/HierarchicalChunker.test.ts | 2 +- .../src/test/rag/HybridSearchTask.test.ts | 2 +- .../test/rag/RagWorkflow.integration.test.ts | 2 +- .../src/test/rag/StructuralParser.test.ts | 2 +- .../src/test/task-graph/InputResolver.test.ts | 2 +- packages/test/src/test/util/Document.test.ts | 4 +- packages/workglow/README.md | 2 +- packages/workglow/package.json | 2 +- packages/workglow/src/common.ts | 2 +- 64 files changed, 447 insertions(+), 179 deletions(-) delete mode 100644 packages/dataset/src/knowledge-base/KnowledgeBaseRegistry.ts rename packages/{dataset => knowledge-base}/CHANGELOG.md (99%) rename packages/{dataset => knowledge-base}/LICENSE (100%) rename packages/{dataset => knowledge-base}/README.md (98%) rename packages/{dataset => knowledge-base}/package.json (98%) rename packages/{dataset => knowledge-base}/src/browser.ts (100%) rename packages/{dataset => knowledge-base}/src/bun.ts (100%) rename packages/{dataset => knowledge-base}/src/chunk/ChunkSchema.ts (100%) rename packages/{dataset => knowledge-base}/src/chunk/ChunkVectorStorageSchema.ts (100%) rename packages/{dataset => knowledge-base}/src/common-server.ts (100%) rename packages/{dataset => knowledge-base}/src/common.ts (79%) rename packages/{dataset => knowledge-base}/src/document/Document.ts (100%) rename packages/{dataset => knowledge-base}/src/document/DocumentNode.ts (100%) rename packages/{dataset => knowledge-base}/src/document/DocumentSchema.ts (100%) rename packages/{dataset => knowledge-base}/src/document/DocumentStorageSchema.ts (100%) rename packages/{dataset => knowledge-base}/src/document/StructuralParser.ts (100%) create mode 100644 packages/knowledge-base/src/knowledge-base/InMemoryKnowledgeBaseRepository.ts rename packages/{dataset => knowledge-base}/src/knowledge-base/KnowledgeBase.ts (97%) create mode 100644 packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts create mode 100644 packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts create mode 100644 packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts rename packages/{dataset => knowledge-base}/src/knowledge-base/createKnowledgeBase.ts (91%) rename packages/{dataset => knowledge-base}/src/node.ts (100%) rename packages/{dataset => knowledge-base}/src/types.ts (100%) rename packages/{dataset => knowledge-base}/src/util/DatasetSchema.ts (83%) rename packages/{dataset => knowledge-base}/tsconfig.json (100%) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 23654eda3..8975556fa 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -118,7 +118,7 @@ Required static properties: `type`, `category`, `title`, `description`, `cacheab - `runReactive()` → `executeReactive()` — lightweight, UI previews only, keeps PENDING, must be <1ms - Lifecycle: `PENDING → PROCESSING → COMPLETED | FAILED | ABORTED` -**Schema conventions**: JSON Schema objects. Properties can have `format` annotations for runtime type resolution: `format: "model"`, `format: "model:EmbeddingTask"`, `format: "storage:tabular"`, `format: "dataset:knowledge-base"`. Properties with `x-ui-manual: true` are user-added ports. +**Schema conventions**: JSON Schema objects. Properties can have `format` annotations for runtime type resolution: `format: "model"`, `format: "model:EmbeddingTask"`, `format: "storage:tabular"`, `format: "knowledge-base"`. Properties with `x-ui-manual: true` are user-added ports. **TaskRegistry** — global class registry: `TaskRegistry.registerTask(MyTask)`. @@ -132,13 +132,13 @@ Event-driven: storages emit `put`, `get`, `delete`, `deleteAll`. Auto-generated PKs: `x-auto-generated: true` in schema — integers auto-increment, strings get UUID. -### `@workglow/dataset` — knowledge base & documents +### `@workglow/knowledge-base` — knowledge base & documents -`KnowledgeBase` — unified class owning both document storage (tabular) and chunk storage (vector). Replaces the old `DocumentDataset` + `DocumentChunkDataset` split. +`KnowledgeBase` — unified class owning both document storage (tabular) and chunk storage (vector). - `createKnowledgeBase({ name, vectorDimensions })` — factory (in-memory, auto-registers) - `registerKnowledgeBase(id, kb)` / `getKnowledgeBase(id)` / `getGlobalKnowledgeBases()` — global registry -- `TypeKnowledgeBase()` — JSON Schema helper for task inputs (format `"dataset:knowledge-base"`) +- `TypeKnowledgeBase()` — JSON Schema helper for task inputs (format `"knowledge-base"`) - `Document` — wraps a `DocumentRootNode` tree + metadata - `ChunkRecord` — flat chunk with tree linkage (`nodePath`, `depth`) - `ChunkVectorStorageSchema` / `ChunkVectorPrimaryKey` — vector storage schema for chunks diff --git a/TODO.md b/TODO.md index ca37b51c1..7a01a0e5c 100644 --- a/TODO.md +++ b/TODO.md @@ -6,13 +6,9 @@ TODO.md - [x] No fixed column names, use the schema to define the columns. - [ ] Option for which column to use if there are multiple, default to the first one. - [ ] Use @mceachen/sqlite-vec for sqlite storage. -- [ ] Datasets Package - - [x] Documents dataset (mabye rename to DocumentDataset) - - [ ] Chunks Package (or part of DocumentDataset?) - - [x] Move Model repository to datasets package. - [x] Chunks and nodes are not always the same. - [x] And we may need to save the chunk's node path. Or paths? or document range? Standard metadata? -- [ ] Instead of passing doc_id around, pass a document key that is of type unknown (string or object) + - [ ] Instead of passing doc_id around, pass a document key that is of type unknown (string or object) - [ ] Get a better model for question answering. - [ ] Get a better model for named entity recognition, the current one recognized everything as a token, not helpful. @@ -27,10 +23,6 @@ TODO.md - [ ] Consider different ways to connect tasks to queues. What is a task? What is a job? -- [ ] Input and outputs are all scalar, arrays, or unions. But what about streams? Stream of items in an array, stream of content for a scalar like a string, etc. - onnx-community/ModernBERT-finetuned-squad-ONNX - summarization -Rework the Document Dataset. Currently there is a Document storage of tabular storage type, and that should be registered as a "dataset:document:source" meaning the source material in node format. And there is already a "dataset:document-chunk" for the chunk/vector storage which should be registered as a "dataset:document:chunk" with a well defined metadata schema. The two combined should be registered as a "dataset:document" which is the complete document with its source and all its chunks and metadata. This is for convenience but not used by tasks or ai tasks. - The sqlitevectorstorage currently does not use a built in vector search. Use @mceachen/sqlite-vec for sqlite storage vector indexing. diff --git a/bun.lock b/bun.lock index 72462cdc4..eff1fd7c4 100644 --- a/bun.lock +++ b/bun.lock @@ -76,9 +76,9 @@ "@uiw/react-codemirror": "^4.25.7", "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", "@workglow/debug": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/sqlite": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", @@ -108,15 +108,15 @@ "name": "@workglow/ai", "version": "0.0.114", "devDependencies": { - "@workglow/dataset": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", "@workglow/util": "workspace:*", }, "peerDependencies": { - "@workglow/dataset": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", "@workglow/util": "workspace:*", @@ -178,18 +178,6 @@ "tiktoken", ], }, - "packages/dataset": { - "name": "@workglow/dataset", - "version": "0.0.114", - "devDependencies": { - "@workglow/storage": "workspace:*", - "@workglow/util": "workspace:*", - }, - "peerDependencies": { - "@workglow/storage": "workspace:*", - "@workglow/util": "workspace:*", - }, - }, "packages/debug": { "name": "@workglow/debug", "version": "0.0.114", @@ -210,6 +198,18 @@ "@workglow/util": "workspace:*", }, }, + "packages/knowledge-base": { + "name": "@workglow/knowledge-base", + "version": "0.0.114", + "devDependencies": { + "@workglow/storage": "workspace:*", + "@workglow/util": "workspace:*", + }, + "peerDependencies": { + "@workglow/storage": "workspace:*", + "@workglow/util": "workspace:*", + }, + }, "packages/sqlite": { "name": "@workglow/sqlite", "version": "0.0.114", @@ -286,8 +286,8 @@ "@types/pg": "^8.18.0", "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/sqlite": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", @@ -306,8 +306,8 @@ "@supabase/supabase-js": "catalog:", "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/sqlite": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", @@ -336,9 +336,9 @@ "dependencies": { "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", "@workglow/debug": "workspace:*", "@workglow/job-queue": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/sqlite": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", @@ -910,12 +910,12 @@ "@workglow/cli": ["@workglow/cli@workspace:examples/cli"], - "@workglow/dataset": ["@workglow/dataset@workspace:packages/dataset"], - "@workglow/debug": ["@workglow/debug@workspace:packages/debug"], "@workglow/job-queue": ["@workglow/job-queue@workspace:packages/job-queue"], + "@workglow/knowledge-base": ["@workglow/knowledge-base@workspace:packages/knowledge-base"], + "@workglow/sqlite": ["@workglow/sqlite@workspace:packages/sqlite"], "@workglow/storage": ["@workglow/storage@workspace:packages/storage"], diff --git a/docs/developers/03_extending.md b/docs/developers/03_extending.md index f2459d1d0..cdac6b895 100644 --- a/docs/developers/03_extending.md +++ b/docs/developers/03_extending.md @@ -290,7 +290,7 @@ Tasks chain together through compatible input/output schemas: ```typescript import { Workflow } from "@workglow/task-graph"; -import { createKnowledgeBase } from "@workglow/dataset"; +import { createKnowledgeBase } from "@workglow/knowledge-base"; // Create a KnowledgeBase (auto-registers globally as "my-kb") const kb = await createKnowledgeBase({ diff --git a/examples/web/package.json b/examples/web/package.json index be1cf6d47..d62be5752 100644 --- a/examples/web/package.json +++ b/examples/web/package.json @@ -16,7 +16,7 @@ "@codemirror/lang-json": "^6.0.2", "@workglow/ai": "workspace:*", "@workglow/debug": "workspace:*", - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/ai-provider": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/job-queue": "workspace:*", diff --git a/examples/web/vite.config.js b/examples/web/vite.config.js index 93ca160a2..2fc9fc0c4 100644 --- a/examples/web/vite.config.js +++ b/examples/web/vite.config.js @@ -26,7 +26,7 @@ export default defineConfig({ "@workglow/storage", "@workglow/task-graph", "@workglow/debug", - "@workglow/dataset", + "@workglow/knowledge-base", "@workglow/tasks", "@workglow/util", "@workglow/sqlite", diff --git a/packages/ai/README.md b/packages/ai/README.md index 267675b2c..ecef51a40 100644 --- a/packages/ai/README.md +++ b/packages/ai/README.md @@ -445,7 +445,7 @@ The AI package provides a comprehensive set of tasks for building RAG pipelines. ```typescript import { Workflow } from "@workglow/task-graph"; -import { createKnowledgeBase } from "@workglow/dataset"; +import { createKnowledgeBase } from "@workglow/knowledge-base"; // Create a KnowledgeBase (auto-registers globally as "my-kb") const kb = await createKnowledgeBase({ diff --git a/packages/ai/package.json b/packages/ai/package.json index 297b9b2f7..cb40bf282 100644 --- a/packages/ai/package.json +++ b/packages/ai/package.json @@ -37,14 +37,14 @@ "access": "public" }, "peerDependencies": { - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/job-queue": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", "@workglow/util": "workspace:*" }, "peerDependenciesMeta": { - "@workglow/dataset": { + "@workglow/knowledge-base": { "optional": false }, "@workglow/job-queue": { @@ -61,7 +61,7 @@ } }, "devDependencies": { - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/job-queue": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", diff --git a/packages/ai/src/task/ChunkRetrievalTask.ts b/packages/ai/src/task/ChunkRetrievalTask.ts index 99c73fbca..8be11fde8 100644 --- a/packages/ai/src/task/ChunkRetrievalTask.ts +++ b/packages/ai/src/task/ChunkRetrievalTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { KnowledgeBase, TypeKnowledgeBase, type ChunkRecord } from "@workglow/dataset"; +import { KnowledgeBase, TypeKnowledgeBase, type ChunkRecord } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, @@ -22,7 +22,7 @@ import { } from "@workglow/util"; import { TypeModel, TypeSingleOrArray } from "./base/AiTaskSchemas"; import { TextEmbeddingTask } from "./TextEmbeddingTask"; -import type { ChunkSearchResult } from "@workglow/dataset"; +import type { ChunkSearchResult } from "@workglow/knowledge-base"; const inputSchema = { type: "object", diff --git a/packages/ai/src/task/ChunkToVectorTask.ts b/packages/ai/src/task/ChunkToVectorTask.ts index 89256a699..2856ce278 100644 --- a/packages/ai/src/task/ChunkToVectorTask.ts +++ b/packages/ai/src/task/ChunkToVectorTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { ChunkRecordSchema, type ChunkRecord } from "@workglow/dataset"; +import { ChunkRecordSchema, type ChunkRecord } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/ChunkVectorHybridSearchTask.ts b/packages/ai/src/task/ChunkVectorHybridSearchTask.ts index 0a85032d2..f15a8deba 100644 --- a/packages/ai/src/task/ChunkVectorHybridSearchTask.ts +++ b/packages/ai/src/task/ChunkVectorHybridSearchTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { KnowledgeBase, TypeKnowledgeBase, type ChunkRecord } from "@workglow/dataset"; +import { KnowledgeBase, TypeKnowledgeBase, type ChunkRecord } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/ChunkVectorSearchTask.ts b/packages/ai/src/task/ChunkVectorSearchTask.ts index b9ab8a1af..8b3a874a4 100644 --- a/packages/ai/src/task/ChunkVectorSearchTask.ts +++ b/packages/ai/src/task/ChunkVectorSearchTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { KnowledgeBase, TypeKnowledgeBase } from "@workglow/dataset"; +import { KnowledgeBase, TypeKnowledgeBase } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, @@ -127,7 +127,7 @@ export class ChunkVectorSearchTask extends Task< async execute( input: VectorStoreSearchTaskInput, - context: IExecuteContext + _context: IExecuteContext ): Promise { const { knowledgeBase, query, topK = 10, filter, scoreThreshold = 0 } = input; diff --git a/packages/ai/src/task/ChunkVectorUpsertTask.ts b/packages/ai/src/task/ChunkVectorUpsertTask.ts index fefac5221..72c2e392f 100644 --- a/packages/ai/src/task/ChunkVectorUpsertTask.ts +++ b/packages/ai/src/task/ChunkVectorUpsertTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { KnowledgeBase, TypeKnowledgeBase } from "@workglow/dataset"; +import { KnowledgeBase, TypeKnowledgeBase } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/ContextBuilderTask.ts b/packages/ai/src/task/ContextBuilderTask.ts index 9ec12ce28..ef46bbfb2 100644 --- a/packages/ai/src/task/ContextBuilderTask.ts +++ b/packages/ai/src/task/ContextBuilderTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { estimateTokens } from "@workglow/dataset"; +import { estimateTokens } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteReactiveContext, diff --git a/packages/ai/src/task/DocumentEnricherTask.ts b/packages/ai/src/task/DocumentEnricherTask.ts index 862f12b91..7d8323be3 100644 --- a/packages/ai/src/task/DocumentEnricherTask.ts +++ b/packages/ai/src/task/DocumentEnricherTask.ts @@ -10,7 +10,7 @@ import { type DocumentNode, type Entity, type NodeEnrichment, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/HierarchicalChunkerTask.ts b/packages/ai/src/task/HierarchicalChunkerTask.ts index 0bf74d9a5..de75c1d01 100644 --- a/packages/ai/src/task/HierarchicalChunkerTask.ts +++ b/packages/ai/src/task/HierarchicalChunkerTask.ts @@ -12,7 +12,7 @@ import { type ChunkRecord, type DocumentNode, type TokenBudget, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/HierarchyJoinTask.ts b/packages/ai/src/task/HierarchyJoinTask.ts index 6a90e1489..33b00648b 100644 --- a/packages/ai/src/task/HierarchyJoinTask.ts +++ b/packages/ai/src/task/HierarchyJoinTask.ts @@ -9,7 +9,7 @@ import { TypeKnowledgeBase, type ChunkRecord, type KnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/src/task/StructuralParserTask.ts b/packages/ai/src/task/StructuralParserTask.ts index 48934885a..e50f9927f 100644 --- a/packages/ai/src/task/StructuralParserTask.ts +++ b/packages/ai/src/task/StructuralParserTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { DocumentNode, StructuralParser } from "@workglow/dataset"; +import { DocumentNode, StructuralParser } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, diff --git a/packages/ai/tsconfig.json b/packages/ai/tsconfig.json index 60a6e538c..40a4caa27 100644 --- a/packages/ai/tsconfig.json +++ b/packages/ai/tsconfig.json @@ -6,7 +6,6 @@ "compilerOptions": { "composite": true, "outDir": "./dist", - "baseUrl": "./src", "rootDir": "./src" } } diff --git a/packages/dataset/src/knowledge-base/KnowledgeBaseRegistry.ts b/packages/dataset/src/knowledge-base/KnowledgeBaseRegistry.ts deleted file mode 100644 index d6090418a..000000000 --- a/packages/dataset/src/knowledge-base/KnowledgeBaseRegistry.ts +++ /dev/null @@ -1,74 +0,0 @@ -/** - * @license - * Copyright 2025 Steven Roussey - * SPDX-License-Identifier: Apache-2.0 - */ - -import { - createServiceToken, - globalServiceRegistry, - registerInputResolver, - ServiceRegistry, -} from "@workglow/util"; -import type { KnowledgeBase } from "./KnowledgeBase"; - -/** - * Service token for the knowledge base registry - * Maps knowledge base IDs to KnowledgeBase instances - */ -export const KNOWLEDGE_BASES = - createServiceToken>("dataset.knowledge-bases"); - -// Register default factory if not already registered -if (!globalServiceRegistry.has(KNOWLEDGE_BASES)) { - globalServiceRegistry.register( - KNOWLEDGE_BASES, - (): Map => new Map(), - true - ); -} - -/** - * Gets the global knowledge base registry - */ -export function getGlobalKnowledgeBases(): Map { - return globalServiceRegistry.get(KNOWLEDGE_BASES); -} - -/** - * Registers a knowledge base globally by ID - */ -export function registerKnowledgeBase(id: string, kb: KnowledgeBase): void { - const kbs = getGlobalKnowledgeBases(); - kbs.set(id, kb); -} - -/** - * Gets a knowledge base by ID from the global registry - */ -export function getKnowledgeBase(id: string): KnowledgeBase | undefined { - return getGlobalKnowledgeBases().get(id); -} - -/** - * Resolves a knowledge base ID from the registry. - * Used by the input resolver system. - */ -async function resolveKnowledgeBaseFromRegistry( - id: string, - format: string, - registry: ServiceRegistry -): Promise { - const kbs = registry.has(KNOWLEDGE_BASES) - ? registry.get>(KNOWLEDGE_BASES) - : getGlobalKnowledgeBases(); - - const kb = kbs.get(id); - if (!kb) { - throw new Error(`Knowledge base "${id}" not found in registry`); - } - return kb; -} - -// Register the resolver for format: "dataset:knowledge-base" -registerInputResolver("dataset:knowledge-base", resolveKnowledgeBaseFromRegistry); diff --git a/packages/dataset/CHANGELOG.md b/packages/knowledge-base/CHANGELOG.md similarity index 99% rename from packages/dataset/CHANGELOG.md rename to packages/knowledge-base/CHANGELOG.md index fd44af0e1..2b7cdaf1d 100644 --- a/packages/dataset/CHANGELOG.md +++ b/packages/knowledge-base/CHANGELOG.md @@ -1,4 +1,4 @@ -# @workglow/dataset +# @workglow/knowledge-base ## 0.0.114 diff --git a/packages/dataset/LICENSE b/packages/knowledge-base/LICENSE similarity index 100% rename from packages/dataset/LICENSE rename to packages/knowledge-base/LICENSE diff --git a/packages/dataset/README.md b/packages/knowledge-base/README.md similarity index 98% rename from packages/dataset/README.md rename to packages/knowledge-base/README.md index 7a41f6ddb..3d2eeae7c 100644 --- a/packages/dataset/README.md +++ b/packages/knowledge-base/README.md @@ -1,4 +1,4 @@ -# @workglow/dataset +# @workglow/knowledge-base Document management, hierarchical chunking, and knowledge base infrastructure for RAG pipelines. @@ -64,7 +64,7 @@ This package provides the data layer for RAG (Retrieval-Augmented Generation) wo ## Installation ```bash -bun install @workglow/dataset +bun install @workglow/knowledge-base ``` Peer dependencies: `@workglow/storage`, `@workglow/util`. @@ -76,7 +76,7 @@ import { createKnowledgeBase, Document, StructuralParser, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; // 1. Create a knowledge base const kb = await createKnowledgeBase({ @@ -180,7 +180,7 @@ interface DocumentNodeBase { Use the `NodeKind` constants for comparisons: ```typescript -import { NodeKind } from "@workglow/dataset"; +import { NodeKind } from "@workglow/knowledge-base"; if (node.kind === NodeKind.SECTION) { console.log(node.title, node.level, node.children.length); @@ -192,7 +192,7 @@ if (node.kind === NodeKind.SECTION) { `StructuralParser` converts raw text into a `DocumentRootNode`: ```typescript -import { StructuralParser } from "@workglow/dataset"; +import { StructuralParser } from "@workglow/knowledge-base"; // Markdown — detects headers, creates nested sections const root = await StructuralParser.parseMarkdown(docId, markdownText, "Title"); @@ -280,7 +280,7 @@ The `metadata` field holds the complete `ChunkRecord`, so search results carry a **Factory function (recommended):** ```typescript -import { createKnowledgeBase } from "@workglow/dataset"; +import { createKnowledgeBase } from "@workglow/knowledge-base"; const kb = await createKnowledgeBase({ name: "my-kb", // Identifier @@ -294,7 +294,7 @@ const kb = await createKnowledgeBase({ **Direct construction (custom storage backends):** ```typescript -import { KnowledgeBase } from "@workglow/dataset"; +import { KnowledgeBase } from "@workglow/knowledge-base"; const kb = new KnowledgeBase( "my-kb", @@ -417,7 +417,7 @@ kb.destroy(); Knowledge bases can be registered globally by name, allowing tasks to reference them by string ID: ```typescript -import { registerKnowledgeBase, getKnowledgeBase, TypeKnowledgeBase } from "@workglow/dataset"; +import { registerKnowledgeBase, getKnowledgeBase, TypeKnowledgeBase } from "@workglow/knowledge-base"; // Register registerKnowledgeBase("my-kb", kb); diff --git a/packages/dataset/package.json b/packages/knowledge-base/package.json similarity index 98% rename from packages/dataset/package.json rename to packages/knowledge-base/package.json index 3e5c28c61..93c7b528c 100644 --- a/packages/dataset/package.json +++ b/packages/knowledge-base/package.json @@ -1,5 +1,5 @@ { - "name": "@workglow/dataset", + "name": "@workglow/knowledge-base", "type": "module", "version": "0.0.114", "description": "Dataset package for Workglow.", diff --git a/packages/dataset/src/browser.ts b/packages/knowledge-base/src/browser.ts similarity index 100% rename from packages/dataset/src/browser.ts rename to packages/knowledge-base/src/browser.ts diff --git a/packages/dataset/src/bun.ts b/packages/knowledge-base/src/bun.ts similarity index 100% rename from packages/dataset/src/bun.ts rename to packages/knowledge-base/src/bun.ts diff --git a/packages/dataset/src/chunk/ChunkSchema.ts b/packages/knowledge-base/src/chunk/ChunkSchema.ts similarity index 100% rename from packages/dataset/src/chunk/ChunkSchema.ts rename to packages/knowledge-base/src/chunk/ChunkSchema.ts diff --git a/packages/dataset/src/chunk/ChunkVectorStorageSchema.ts b/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts similarity index 100% rename from packages/dataset/src/chunk/ChunkVectorStorageSchema.ts rename to packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts diff --git a/packages/dataset/src/common-server.ts b/packages/knowledge-base/src/common-server.ts similarity index 100% rename from packages/dataset/src/common-server.ts rename to packages/knowledge-base/src/common-server.ts diff --git a/packages/dataset/src/common.ts b/packages/knowledge-base/src/common.ts similarity index 79% rename from packages/dataset/src/common.ts rename to packages/knowledge-base/src/common.ts index 977d02a89..776598be2 100644 --- a/packages/dataset/src/common.ts +++ b/packages/knowledge-base/src/common.ts @@ -8,6 +8,9 @@ export * from "./chunk/ChunkSchema"; export * from "./chunk/ChunkVectorStorageSchema"; export * from "./knowledge-base/KnowledgeBase"; +export * from "./knowledge-base/KnowledgeBaseSchema"; +export * from "./knowledge-base/KnowledgeBaseRepository"; +export * from "./knowledge-base/InMemoryKnowledgeBaseRepository"; export * from "./knowledge-base/KnowledgeBaseRegistry"; export * from "./knowledge-base/createKnowledgeBase"; diff --git a/packages/dataset/src/document/Document.ts b/packages/knowledge-base/src/document/Document.ts similarity index 100% rename from packages/dataset/src/document/Document.ts rename to packages/knowledge-base/src/document/Document.ts diff --git a/packages/dataset/src/document/DocumentNode.ts b/packages/knowledge-base/src/document/DocumentNode.ts similarity index 100% rename from packages/dataset/src/document/DocumentNode.ts rename to packages/knowledge-base/src/document/DocumentNode.ts diff --git a/packages/dataset/src/document/DocumentSchema.ts b/packages/knowledge-base/src/document/DocumentSchema.ts similarity index 100% rename from packages/dataset/src/document/DocumentSchema.ts rename to packages/knowledge-base/src/document/DocumentSchema.ts diff --git a/packages/dataset/src/document/DocumentStorageSchema.ts b/packages/knowledge-base/src/document/DocumentStorageSchema.ts similarity index 100% rename from packages/dataset/src/document/DocumentStorageSchema.ts rename to packages/knowledge-base/src/document/DocumentStorageSchema.ts diff --git a/packages/dataset/src/document/StructuralParser.ts b/packages/knowledge-base/src/document/StructuralParser.ts similarity index 100% rename from packages/dataset/src/document/StructuralParser.ts rename to packages/knowledge-base/src/document/StructuralParser.ts diff --git a/packages/knowledge-base/src/knowledge-base/InMemoryKnowledgeBaseRepository.ts b/packages/knowledge-base/src/knowledge-base/InMemoryKnowledgeBaseRepository.ts new file mode 100644 index 000000000..308cf8ba6 --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/InMemoryKnowledgeBaseRepository.ts @@ -0,0 +1,18 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { InMemoryTabularStorage } from "@workglow/storage"; +import { KnowledgeBaseRepository } from "./KnowledgeBaseRepository"; +import { KnowledgeBasePrimaryKeyNames, KnowledgeBaseRecordSchema } from "./KnowledgeBaseSchema"; + +/** + * In-memory implementation of a knowledge base repository. + */ +export class InMemoryKnowledgeBaseRepository extends KnowledgeBaseRepository { + constructor() { + super(new InMemoryTabularStorage(KnowledgeBaseRecordSchema, KnowledgeBasePrimaryKeyNames)); + } +} diff --git a/packages/dataset/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts similarity index 97% rename from packages/dataset/src/knowledge-base/KnowledgeBase.ts rename to packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index 99af18f0f..4947cfb0e 100644 --- a/packages/dataset/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -22,21 +22,26 @@ import type { } from "../document/DocumentStorageSchema"; /** - * Unified KnowledgeBase that owns both document and vector storage. - * Replaces DocumentDataset and DocumentChunkDataset with a single cohesive class - * that provides lifecycle management and cascading deletes. + * Unified KnowledgeBase that owns both document and vector storage, + * providing lifecycle management and cascading deletes. */ export class KnowledgeBase { readonly name: string; + readonly title: string; + readonly description: string; private tabularStorage: DocumentTabularStorage; private chunkStorage: ChunkVectorStorage; constructor( name: string, documentStorage: DocumentTabularStorage, - chunkStorage: ChunkVectorStorage + chunkStorage: ChunkVectorStorage, + title?: string, + description?: string ) { this.name = name; + this.title = title ?? name; + this.description = description ?? ""; this.tabularStorage = documentStorage; this.chunkStorage = chunkStorage; } diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts new file mode 100644 index 000000000..e36cf5488 --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts @@ -0,0 +1,122 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { + createServiceToken, + globalServiceRegistry, + registerInputResolver, + ServiceRegistry, +} from "@workglow/util"; +import type { KnowledgeBase } from "./KnowledgeBase"; +import { InMemoryKnowledgeBaseRepository } from "./InMemoryKnowledgeBaseRepository"; +import { KnowledgeBaseRepository } from "./KnowledgeBaseRepository"; +import { knowledgeBaseTableNames, type KnowledgeBaseRecord } from "./KnowledgeBaseSchema"; + +/** + * Service token for the knowledge base registry + * Maps knowledge base IDs to KnowledgeBase instances + */ +export const KNOWLEDGE_BASES = + createServiceToken>("knowledge-base.registry"); + +/** + * Service token for the knowledge base repository + */ +export const KNOWLEDGE_BASE_REPOSITORY = + createServiceToken("knowledge-base.repository"); + +// Register default factory for live KB map if not already registered +if (!globalServiceRegistry.has(KNOWLEDGE_BASES)) { + globalServiceRegistry.register( + KNOWLEDGE_BASES, + (): Map => new Map(), + true + ); +} + +// Register default factory for KB repository if not already registered +if (!globalServiceRegistry.has(KNOWLEDGE_BASE_REPOSITORY)) { + globalServiceRegistry.register( + KNOWLEDGE_BASE_REPOSITORY, + (): KnowledgeBaseRepository => new InMemoryKnowledgeBaseRepository(), + true + ); +} + +/** + * Gets the global knowledge base registry + */ +export function getGlobalKnowledgeBases(): Map { + return globalServiceRegistry.get(KNOWLEDGE_BASES); +} + +/** + * Gets the global knowledge base repository instance + */ +export function getGlobalKnowledgeBaseRepository(): KnowledgeBaseRepository { + return globalServiceRegistry.get(KNOWLEDGE_BASE_REPOSITORY); +} + +/** + * Sets the global knowledge base repository instance + */ +export function setGlobalKnowledgeBaseRepository(repository: KnowledgeBaseRepository): void { + globalServiceRegistry.registerInstance(KNOWLEDGE_BASE_REPOSITORY, repository); +} + +/** + * Registers a knowledge base globally by ID. + * Adds to both the live Map and the persistent repository. + */ +export function registerKnowledgeBase(id: string, kb: KnowledgeBase): void { + const kbs = getGlobalKnowledgeBases(); + kbs.set(id, kb); + + const now = new Date().toISOString(); + const tableNames = knowledgeBaseTableNames(id); + const record: KnowledgeBaseRecord = { + kb_id: id, + title: kb.title, + description: kb.description, + vector_dimensions: kb.getVectorDimensions(), + document_table: tableNames.documentTable, + chunk_table: tableNames.chunkTable, + created_at: now, + updated_at: now, + }; + const repo = getGlobalKnowledgeBaseRepository(); + repo.addKnowledgeBase(record); +} + +/** + * Gets a knowledge base by ID from the global registry + */ +export function getKnowledgeBase(id: string): KnowledgeBase | undefined { + return getGlobalKnowledgeBases().get(id); +} + +/** + * Resolves a knowledge base ID from the registry. + * Used by the input resolver system. + */ +async function resolveKnowledgeBaseFromRegistry( + id: string, + format: string, + registry: ServiceRegistry +): Promise { + const kbs = registry.has(KNOWLEDGE_BASES) + ? registry.get>(KNOWLEDGE_BASES) + : getGlobalKnowledgeBases(); + + const kb = kbs.get(id); + if (!kb) { + throw new Error(`Knowledge base "${id}" not found in registry`); + } + return kb; +} + +// Register the resolver for format: "knowledge-base" +registerInputResolver("knowledge-base", resolveKnowledgeBaseFromRegistry); diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts new file mode 100644 index 000000000..59a922f21 --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts @@ -0,0 +1,151 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { type BaseTabularStorage } from "@workglow/storage"; +import { EventEmitter, type EventParameters } from "@workglow/util"; + +import { + KnowledgeBasePrimaryKeyNames, + type KnowledgeBaseRecord, + KnowledgeBaseRecordSchema, +} from "./KnowledgeBaseSchema"; + +/** + * Events that can be emitted by the KnowledgeBaseRepository + */ + +export type KnowledgeBaseEventListeners = { + knowledge_base_added: (record: KnowledgeBaseRecord) => void; + knowledge_base_removed: (record: KnowledgeBaseRecord) => void; + knowledge_base_updated: (record: KnowledgeBaseRecord) => void; +}; + +export type KnowledgeBaseEvents = keyof KnowledgeBaseEventListeners; + +export type KnowledgeBaseEventListener = + KnowledgeBaseEventListeners[Event]; + +export type KnowledgeBaseEventParameters = EventParameters< + KnowledgeBaseEventListeners, + Event +>; + +/** + * Repository for persisting KnowledgeBase metadata to tabular storage. + * Follows the same pattern as ModelRepository. + */ +export class KnowledgeBaseRepository { + /** + * Storage for KnowledgeBase records + */ + protected readonly storage: BaseTabularStorage< + typeof KnowledgeBaseRecordSchema, + typeof KnowledgeBasePrimaryKeyNames + >; + + constructor( + storage: BaseTabularStorage< + typeof KnowledgeBaseRecordSchema, + typeof KnowledgeBasePrimaryKeyNames + > + ) { + this.storage = storage; + } + + /** Event emitter for repository events */ + protected events = new EventEmitter(); + + /** + * Sets up the database for the repository. + * Must be called before using any other methods. + */ + async setupDatabase(): Promise { + await this.storage.setupDatabase?.(); + } + + /** + * Registers an event listener for the specified event + */ + on( + name: Event, + fn: KnowledgeBaseEventListener + ) { + this.events.on(name, fn); + } + + /** + * Removes an event listener for the specified event + */ + off( + name: Event, + fn: KnowledgeBaseEventListener + ) { + this.events.off(name, fn); + } + + /** + * Adds an event listener that will only be called once + */ + once( + name: Event, + fn: KnowledgeBaseEventListener + ) { + this.events.once(name, fn); + } + + /** + * Returns when the event was emitted (promise form of once) + */ + waitOn(name: Event) { + return this.events.waitOn(name); + } + + /** + * Adds a new knowledge base record to the repository + */ + async addKnowledgeBase(record: KnowledgeBaseRecord): Promise { + await this.storage.put(record); + this.events.emit("knowledge_base_added", record); + return record; + } + + /** + * Removes a knowledge base record from the repository + */ + async removeKnowledgeBase(kb_id: string): Promise { + const record = await this.storage.get({ kb_id }); + if (!record) { + throw new Error(`KnowledgeBase with id "${kb_id}" not found`); + } + await this.storage.delete({ kb_id }); + this.events.emit("knowledge_base_removed", record); + } + + /** + * Retrieves a knowledge base record by ID + */ + async getKnowledgeBase(kb_id: string): Promise { + if (typeof kb_id !== "string") return undefined; + const record = await this.storage.get({ kb_id }); + return record ?? undefined; + } + + /** + * Enumerates all knowledge base records + */ + async enumerateAll(): Promise { + const records = await this.storage.getAll(); + if (!records || records.length === 0) return undefined; + return records; + } + + /** + * Gets the total number of knowledge base records + */ + async size(): Promise { + return await this.storage.size(); + } +} diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts new file mode 100644 index 000000000..8a5728873 --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts @@ -0,0 +1,52 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { DataPortSchemaObject, FromSchema } from "@workglow/util"; + +/** + * Schema for persisting KnowledgeBase metadata to tabular storage. + */ +export const KnowledgeBaseRecordSchema = { + type: "object", + properties: { + kb_id: { type: "string" }, + title: { type: "string" }, + description: { type: "string" }, + vector_dimensions: { type: "integer" }, + document_table: { type: "string" }, + chunk_table: { type: "string" }, + created_at: { type: "string" }, + updated_at: { type: "string" }, + }, + required: [ + "kb_id", + "title", + "description", + "vector_dimensions", + "document_table", + "chunk_table", + "created_at", + "updated_at", + ], + additionalProperties: false, +} as const satisfies DataPortSchemaObject; + +export type KnowledgeBaseRecord = FromSchema; +export const KnowledgeBasePrimaryKeyNames = ["kb_id"] as const; + +/** + * Generates SQL-safe table names for a knowledge base's document and chunk storage. + */ +export function knowledgeBaseTableNames(kbId: string): { + readonly documentTable: string; + readonly chunkTable: string; +} { + const safe = kbId.replace(/[^a-zA-Z0-9_]/g, "_"); + return { + documentTable: `kb_docs_${safe}`, + chunkTable: `kb_chunks_${safe}`, + }; +} diff --git a/packages/dataset/src/knowledge-base/createKnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts similarity index 91% rename from packages/dataset/src/knowledge-base/createKnowledgeBase.ts rename to packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts index 3730dc1db..0a87a0d36 100644 --- a/packages/dataset/src/knowledge-base/createKnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts @@ -18,6 +18,8 @@ export interface CreateKnowledgeBaseOptions { readonly vectorDimensions: number; readonly vectorType?: { new (array: number[]): TypedArray }; readonly register?: boolean; + readonly title?: string; + readonly description?: string; } /** @@ -39,6 +41,8 @@ export async function createKnowledgeBase( vectorDimensions, vectorType = Float32Array, register: shouldRegister = true, + title, + description, } = options; const tabularStorage = new InMemoryTabularStorage( @@ -59,7 +63,9 @@ export async function createKnowledgeBase( const kb = new KnowledgeBase( name, tabularStorage as unknown as DocumentTabularStorage, - vectorStorage as unknown as ChunkVectorStorage + vectorStorage as unknown as ChunkVectorStorage, + title, + description ); if (shouldRegister) { diff --git a/packages/dataset/src/node.ts b/packages/knowledge-base/src/node.ts similarity index 100% rename from packages/dataset/src/node.ts rename to packages/knowledge-base/src/node.ts diff --git a/packages/dataset/src/types.ts b/packages/knowledge-base/src/types.ts similarity index 100% rename from packages/dataset/src/types.ts rename to packages/knowledge-base/src/types.ts diff --git a/packages/dataset/src/util/DatasetSchema.ts b/packages/knowledge-base/src/util/DatasetSchema.ts similarity index 83% rename from packages/dataset/src/util/DatasetSchema.ts rename to packages/knowledge-base/src/util/DatasetSchema.ts index 8edd9be76..0372a3fcb 100644 --- a/packages/dataset/src/util/DatasetSchema.ts +++ b/packages/knowledge-base/src/util/DatasetSchema.ts @@ -6,12 +6,6 @@ import type { JsonSchema } from "@workglow/util"; -/** - * Semantic format types for dataset schema annotations. - * These are used by the InputResolver to determine how to resolve string IDs. - */ -export type DatasetSemantic = "dataset:tabular" | "dataset:knowledge-base"; - /** * Creates a JSON schema for a tabular dataset input. * The schema accepts either a string ID (resolved from registry) or a direct dataset instance. @@ -38,7 +32,7 @@ export function TypeKnowledgeBase = {}>(option title: "Knowledge Base", description: "Knowledge base ID or instance", ...options, - format: "dataset:knowledge-base" as const, + format: "knowledge-base" as const, anyOf: [ { type: "string" as const, title: "Knowledge Base ID" }, { title: "Knowledge Base Instance", additionalProperties: true }, diff --git a/packages/dataset/tsconfig.json b/packages/knowledge-base/tsconfig.json similarity index 100% rename from packages/dataset/tsconfig.json rename to packages/knowledge-base/tsconfig.json diff --git a/packages/storage/src/vector/README.md b/packages/storage/src/vector/README.md index 0a6a129d7..978fdc8fa 100644 --- a/packages/storage/src/vector/README.md +++ b/packages/storage/src/vector/README.md @@ -293,7 +293,7 @@ Register and retrieve chunk vector repositories globally: ```typescript import { getChunkVectorRepository, getGlobalChunkVectorRepositories } from "@workglow/storage"; -import { registerChunkVectorRepository, getGlobalChunkVectorRepositories } from "@workglow/dataset"; +import { registerChunkVectorRepository, getGlobalChunkVectorRepositories } from "@workglow/knowledge-base"; // Register a repository registerChunkVectorRepository("my-chunks", repo); @@ -344,13 +344,13 @@ Quantized vectors reduce storage and can improve performance: - **Cons:** Requires PostgreSQL server and pgvector extension - **Setup:** `CREATE EXTENSION vector;` -## Integration with DocumentDataset +## Integration with KnowledgeBase -The chunk vector repository works alongside `DocumentDataset` for hierarchical document storage: +The chunk vector repository works alongside `KnowledgeBase` for hierarchical document storage: ```typescript +import { KnowledgeBase } from "@workglow/knowledge-base"; import { - DocumentDataset, InMemoryChunkVectorStorage, InMemoryTabularStorage, } from "@workglow/storage"; @@ -363,14 +363,14 @@ await tabularStorage.setupDatabase(); const vectorStorage = new InMemoryChunkVectorStorage(384); await vectorStorage.setupDatabase(); -// Create document dataset with both storages -const docDataset = new DocumentDataset(tabularStorage, vectorStorage); +// Create knowledge base with both storages +const kb = new KnowledgeBase("my-kb", tabularStorage, vectorStorage, "My KB", "Description"); // Store document structure in tabular, chunks in vector -await docDataset.upsert(document); +await kb.upsertDocument(document); // Search chunks by vector similarity -const results = await docDataset.search(queryVector, { topK: 5 }); +const results = await kb.similaritySearch(queryVector, { topK: 5 }); ``` ### Chunk Metadata for Hierarchical Documents diff --git a/packages/test/package.json b/packages/test/package.json index eb6b574a1..75768fd03 100644 --- a/packages/test/package.json +++ b/packages/test/package.json @@ -37,7 +37,7 @@ "peerDependencies": { "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/job-queue": "workspace:*", "@workglow/storage": "workspace:*", "@workglow/task-graph": "workspace:*", @@ -56,7 +56,7 @@ "@workglow/ai-provider": { "optional": false }, - "@workglow/dataset": { + "@workglow/knowledge-base": { "optional": false }, "@workglow/job-queue": { @@ -90,7 +90,7 @@ "@types/pg": "^8.18.0", "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/job-queue": "workspace:*", "@workglow/sqlite": "workspace:*", "@workglow/storage": "workspace:*", diff --git a/packages/test/src/test/rag/ChunkToVector.test.ts b/packages/test/src/test/rag/ChunkToVector.test.ts index f0abfb444..ed182bd3c 100644 --- a/packages/test/src/test/rag/ChunkToVector.test.ts +++ b/packages/test/src/test/rag/ChunkToVector.test.ts @@ -6,7 +6,7 @@ import "@workglow/ai"; // Trigger Workflow prototype extensions import type { ChunkToVectorTaskOutput, HierarchicalChunkerTaskOutput } from "@workglow/ai"; -import { type ChunkRecord, StructuralParser } from "@workglow/dataset"; +import { type ChunkRecord, StructuralParser } from "@workglow/knowledge-base"; import { Workflow } from "@workglow/task-graph"; import { uuid4, setLogger } from "@workglow/util"; import { describe, expect, it } from "vitest"; diff --git a/packages/test/src/test/rag/Document.test.ts b/packages/test/src/test/rag/Document.test.ts index 7b44c0f91..2121c0cf5 100644 --- a/packages/test/src/test/rag/Document.test.ts +++ b/packages/test/src/test/rag/Document.test.ts @@ -4,8 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { ChunkRecord, DocumentNode } from "@workglow/dataset"; -import { Document, NodeKind } from "@workglow/dataset"; +import type { ChunkRecord, DocumentNode } from "@workglow/knowledge-base"; +import { Document, NodeKind } from "@workglow/knowledge-base"; import { describe, expect, test } from "vitest"; import { setLogger } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts b/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts index f15bc61c9..745c9fdab 100644 --- a/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts @@ -9,7 +9,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { afterEach, beforeEach, describe, expect, test } from "vitest"; import { setLogger, uuid4 } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts b/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts index 608699968..07517bcc0 100644 --- a/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts @@ -9,7 +9,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { afterEach, beforeEach, describe, expect, test } from "vitest"; import { setLogger, uuid4 } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts b/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts index 1ea045701..0e92f485d 100644 --- a/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts @@ -9,7 +9,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { afterEach, beforeEach, describe, expect, test } from "vitest"; import { setLogger, uuid4 } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/DocumentRepository.test.ts b/packages/test/src/test/rag/DocumentRepository.test.ts index ef6c50202..a127ea6fb 100644 --- a/packages/test/src/test/rag/DocumentRepository.test.ts +++ b/packages/test/src/test/rag/DocumentRepository.test.ts @@ -11,7 +11,7 @@ import { StructuralParser, createKnowledgeBase, type SectionNode, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { setLogger, uuid4 } from "@workglow/util"; import { beforeEach, describe, expect, it } from "vitest"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/EndToEnd.integration.test.ts b/packages/test/src/test/rag/EndToEnd.integration.test.ts index 7498778fb..f1f04e69f 100644 --- a/packages/test/src/test/rag/EndToEnd.integration.test.ts +++ b/packages/test/src/test/rag/EndToEnd.integration.test.ts @@ -61,7 +61,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { getTaskQueueRegistry, setTaskQueueRegistry, Workflow } from "@workglow/task-graph"; import { setLogger } from "@workglow/util"; import { join } from "path"; diff --git a/packages/test/src/test/rag/FullChain.test.ts b/packages/test/src/test/rag/FullChain.test.ts index b75ae9bb8..91c9a499d 100644 --- a/packages/test/src/test/rag/FullChain.test.ts +++ b/packages/test/src/test/rag/FullChain.test.ts @@ -5,7 +5,7 @@ */ import { HierarchicalChunkerTaskOutput } from "@workglow/ai"; -import { ChunkRecord } from "@workglow/dataset"; +import { ChunkRecord } from "@workglow/knowledge-base"; import { uuid4, setLogger } from "@workglow/util"; import { Workflow } from "@workglow/task-graph"; import { beforeAll, describe, expect, it } from "vitest"; diff --git a/packages/test/src/test/rag/HierarchicalChunker.test.ts b/packages/test/src/test/rag/HierarchicalChunker.test.ts index b6ef9368a..c5430afcd 100644 --- a/packages/test/src/test/rag/HierarchicalChunker.test.ts +++ b/packages/test/src/test/rag/HierarchicalChunker.test.ts @@ -5,7 +5,7 @@ */ import { hierarchicalChunker } from "@workglow/ai"; -import { estimateTokens, StructuralParser } from "@workglow/dataset"; +import { estimateTokens, StructuralParser } from "@workglow/knowledge-base"; import { Workflow } from "@workglow/task-graph"; import { uuid4, setLogger } from "@workglow/util"; import { describe, expect, it } from "vitest"; diff --git a/packages/test/src/test/rag/HybridSearchTask.test.ts b/packages/test/src/test/rag/HybridSearchTask.test.ts index 1a48a71d3..ff78a307d 100644 --- a/packages/test/src/test/rag/HybridSearchTask.test.ts +++ b/packages/test/src/test/rag/HybridSearchTask.test.ts @@ -9,7 +9,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { afterEach, beforeEach, describe, expect, test } from "vitest"; import { setLogger, uuid4 } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/rag/RagWorkflow.integration.test.ts b/packages/test/src/test/rag/RagWorkflow.integration.test.ts index b44af272e..c386b09e8 100644 --- a/packages/test/src/test/rag/RagWorkflow.integration.test.ts +++ b/packages/test/src/test/rag/RagWorkflow.integration.test.ts @@ -36,7 +36,7 @@ import { createKnowledgeBase, KnowledgeBase, registerKnowledgeBase, -} from "@workglow/dataset"; +} from "@workglow/knowledge-base"; import { getTaskQueueRegistry, setTaskQueueRegistry, Workflow } from "@workglow/task-graph"; import { setLogger } from "@workglow/util"; import { readdirSync } from "fs"; diff --git a/packages/test/src/test/rag/StructuralParser.test.ts b/packages/test/src/test/rag/StructuralParser.test.ts index 50aca93b2..15674f344 100644 --- a/packages/test/src/test/rag/StructuralParser.test.ts +++ b/packages/test/src/test/rag/StructuralParser.test.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { NodeKind, StructuralParser, type SectionNode } from "@workglow/dataset"; +import { NodeKind, StructuralParser, type SectionNode } from "@workglow/knowledge-base"; import { describe, expect, it } from "vitest"; import { setLogger } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/test/src/test/task-graph/InputResolver.test.ts b/packages/test/src/test/task-graph/InputResolver.test.ts index 793d09965..a3694eb5e 100644 --- a/packages/test/src/test/task-graph/InputResolver.test.ts +++ b/packages/test/src/test/task-graph/InputResolver.test.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { TypeTabularStorage } from "@workglow/dataset"; +import { TypeTabularStorage } from "@workglow/knowledge-base"; import { AnyTabularStorage, getGlobalTabularRepositories, diff --git a/packages/test/src/test/util/Document.test.ts b/packages/test/src/test/util/Document.test.ts index 485510c34..ce52d3af9 100644 --- a/packages/test/src/test/util/Document.test.ts +++ b/packages/test/src/test/util/Document.test.ts @@ -4,8 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { ChunkRecord, DocumentNode } from "@workglow/dataset"; -import { Document, NodeKind } from "@workglow/dataset"; +import type { ChunkRecord, DocumentNode } from "@workglow/knowledge-base"; +import { Document, NodeKind } from "@workglow/knowledge-base"; import { describe, expect, test } from "vitest"; import { setLogger } from "@workglow/util"; import { getTestingLogger } from "../../binding/TestingLogger"; diff --git a/packages/workglow/README.md b/packages/workglow/README.md index 7a1325b54..4392c9b10 100644 --- a/packages/workglow/README.md +++ b/packages/workglow/README.md @@ -46,7 +46,7 @@ const result = await workflow.run(); | `@workglow/storage` | Storage abstraction (IndexedDB, PostgreSQL, Supabase) | | `@workglow/job-queue` | Job queue management and task scheduling | | `@workglow/task-graph` | DAG task graph construction and execution | -| `@workglow/dataset` | Dataset and document management | +| `@workglow/knowledge-base` | Dataset and document management | | `@workglow/ai` | Core AI functionality, tasks, and model management | | `@workglow/ai-provider` | AI provider integrations (constants, schemas, provider classes) | | `@workglow/tasks` | Pre-built utility tasks (arrays, scalars, vectors, etc.) | diff --git a/packages/workglow/package.json b/packages/workglow/package.json index 85f870d52..b6fbd459f 100644 --- a/packages/workglow/package.json +++ b/packages/workglow/package.json @@ -73,7 +73,7 @@ "dependencies": { "@workglow/ai": "workspace:*", "@workglow/ai-provider": "workspace:*", - "@workglow/dataset": "workspace:*", + "@workglow/knowledge-base": "workspace:*", "@workglow/debug": "workspace:*", "@workglow/job-queue": "workspace:*", "@workglow/sqlite": "workspace:*", diff --git a/packages/workglow/src/common.ts b/packages/workglow/src/common.ts index 4d542e8f3..07fc8f6a6 100644 --- a/packages/workglow/src/common.ts +++ b/packages/workglow/src/common.ts @@ -8,7 +8,7 @@ import "./logging"; export * from "@workglow/ai"; export * from "@workglow/ai-provider"; -export * from "@workglow/dataset"; +export * from "@workglow/knowledge-base"; export * from "@workglow/job-queue"; export * from "@workglow/sqlite"; export * from "@workglow/storage"; From 7c6cb29a771acadd1b2e005a8de79fdb5014c38a Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:36:56 +0000 Subject: [PATCH 2/3] fix: async registerKnowledgeBase, consistent enumerateAll, correct README imports and description (#280) --- .../src/knowledge-base/KnowledgeBaseRegistry.ts | 4 ++-- .../src/knowledge-base/KnowledgeBaseRepository.ts | 4 ++-- .../knowledge-base/src/knowledge-base/createKnowledgeBase.ts | 2 +- packages/storage/src/vector/README.md | 3 +-- packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts | 2 +- packages/test/src/test/rag/DocumentChunkSearchTask.test.ts | 2 +- packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts | 2 +- packages/test/src/test/rag/HybridSearchTask.test.ts | 2 +- packages/workglow/README.md | 2 +- 9 files changed, 11 insertions(+), 12 deletions(-) diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts index e36cf5488..037e73a0f 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts @@ -71,7 +71,7 @@ export function setGlobalKnowledgeBaseRepository(repository: KnowledgeBaseReposi * Registers a knowledge base globally by ID. * Adds to both the live Map and the persistent repository. */ -export function registerKnowledgeBase(id: string, kb: KnowledgeBase): void { +export async function registerKnowledgeBase(id: string, kb: KnowledgeBase): Promise { const kbs = getGlobalKnowledgeBases(); kbs.set(id, kb); @@ -88,7 +88,7 @@ export function registerKnowledgeBase(id: string, kb: KnowledgeBase): void { updated_at: now, }; const repo = getGlobalKnowledgeBaseRepository(); - repo.addKnowledgeBase(record); + await repo.addKnowledgeBase(record); } /** diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts index 59a922f21..82bf36b09 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRepository.ts @@ -136,9 +136,9 @@ export class KnowledgeBaseRepository { /** * Enumerates all knowledge base records */ - async enumerateAll(): Promise { + async enumerateAll(): Promise { const records = await this.storage.getAll(); - if (!records || records.length === 0) return undefined; + if (!records || records.length === 0) return []; return records; } diff --git a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts index 0a87a0d36..1a9e69ed9 100644 --- a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts @@ -69,7 +69,7 @@ export async function createKnowledgeBase( ); if (shouldRegister) { - registerKnowledgeBase(name, kb); + await registerKnowledgeBase(name, kb); } return kb; diff --git a/packages/storage/src/vector/README.md b/packages/storage/src/vector/README.md index 978fdc8fa..d5e28f6ba 100644 --- a/packages/storage/src/vector/README.md +++ b/packages/storage/src/vector/README.md @@ -349,12 +349,11 @@ Quantized vectors reduce storage and can improve performance: The chunk vector repository works alongside `KnowledgeBase` for hierarchical document storage: ```typescript -import { KnowledgeBase } from "@workglow/knowledge-base"; +import { KnowledgeBase, DocumentStorageSchema } from "@workglow/knowledge-base"; import { InMemoryChunkVectorStorage, InMemoryTabularStorage, } from "@workglow/storage"; -import { DocumentStorageSchema } from "@workglow/storage"; // Initialize storage backends const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]); diff --git a/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts b/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts index 745c9fdab..9b09d6904 100644 --- a/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkRetrievalTask.test.ts @@ -181,7 +181,7 @@ describe("ChunkRetrievalTask", () => { }); test("should resolve knowledge base from string ID", async () => { - registerKnowledgeBase("test-retrieval-kb", kb); + await registerKnowledgeBase("test-retrieval-kb", kb); const queryVector = new Float32Array([1.0, 0.0, 0.0]); diff --git a/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts b/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts index 07517bcc0..b544142b0 100644 --- a/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkSearchTask.test.ts @@ -204,7 +204,7 @@ describe("ChunkVectorSearchTask", () => { }); test("should resolve knowledge base from string ID", async () => { - registerKnowledgeBase("test-vector-kb", kb); + await registerKnowledgeBase("test-vector-kb", kb); const queryVector = new Float32Array([1.0, 0.0, 0.0]); diff --git a/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts b/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts index 0e92f485d..82bf80c39 100644 --- a/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts +++ b/packages/test/src/test/rag/DocumentChunkUpsertTask.test.ts @@ -142,7 +142,7 @@ describe("ChunkVectorUpsertTask", () => { test("should resolve knowledge base from string ID", async () => { // Register kb by ID - registerKnowledgeBase("test-upsert-kb", kb); + await registerKnowledgeBase("test-upsert-kb", kb); const vector = new Float32Array([0.1, 0.2, 0.3]); const metadata = { chunkId: "c1", doc_id: "doc1", text: "Test document", nodePath: [], depth: 0, source: "test.txt" }; diff --git a/packages/test/src/test/rag/HybridSearchTask.test.ts b/packages/test/src/test/rag/HybridSearchTask.test.ts index ff78a307d..292c3cad6 100644 --- a/packages/test/src/test/rag/HybridSearchTask.test.ts +++ b/packages/test/src/test/rag/HybridSearchTask.test.ts @@ -223,7 +223,7 @@ describe("ChunkVectorHybridSearchTask", () => { }); test("should resolve knowledge base from string ID", async () => { - registerKnowledgeBase("test-hybrid-kb", kb); + await registerKnowledgeBase("test-hybrid-kb", kb); const queryVector = new Float32Array([1.0, 0.0, 0.0]); const queryText = "machine learning"; diff --git a/packages/workglow/README.md b/packages/workglow/README.md index 4392c9b10..05a5cd7e0 100644 --- a/packages/workglow/README.md +++ b/packages/workglow/README.md @@ -46,7 +46,7 @@ const result = await workflow.run(); | `@workglow/storage` | Storage abstraction (IndexedDB, PostgreSQL, Supabase) | | `@workglow/job-queue` | Job queue management and task scheduling | | `@workglow/task-graph` | DAG task graph construction and execution | -| `@workglow/knowledge-base` | Dataset and document management | +| `@workglow/knowledge-base` | Knowledge base, document management, and RAG infrastructure | | `@workglow/ai` | Core AI functionality, tasks, and model management | | `@workglow/ai-provider` | AI provider integrations (constants, schemas, provider classes) | | `@workglow/tasks` | Pre-built utility tasks (arrays, scalars, vectors, etc.) | From 16630d0754f6125f4c6106bbc808e8b1b325bacd Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 5 Mar 2026 00:03:57 +0000 Subject: [PATCH 3/3] feat: introduce shared-table mode for KnowledgeBase - Added support for shared-table mode in the KnowledgeBase, allowing multiple knowledge bases to share the same underlying storage tables, reducing table proliferation. - Implemented `ScopedTabularStorage` and `ScopedVectorStorage` wrappers to manage data partitioning by `kb_id`. - Updated documentation to include new shared-table features and examples for setting up shared storage. - Enhanced format annotations and task schemas to accommodate the new storage structure. --- docs/developers/03_extending.md | 26 ++- packages/ai/README.md | 20 +- packages/knowledge-base/README.md | 166 ++++++++++++++ packages/knowledge-base/src/common.ts | 3 + .../knowledge-base/KnowledgeBaseRegistry.ts | 25 ++- .../src/knowledge-base/KnowledgeBaseSchema.ts | 8 + .../knowledge-base/ScopedTabularStorage.ts | 206 ++++++++++++++++++ .../src/knowledge-base/ScopedVectorStorage.ts | 94 ++++++++ .../src/knowledge-base/SharedTableSchemas.ts | 74 +++++++ packages/storage/README.md | 11 - packages/task-graph/src/task/InputResolver.ts | 1 - 11 files changed, 603 insertions(+), 31 deletions(-) create mode 100644 packages/knowledge-base/src/knowledge-base/ScopedTabularStorage.ts create mode 100644 packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts create mode 100644 packages/knowledge-base/src/knowledge-base/SharedTableSchemas.ts diff --git a/docs/developers/03_extending.md b/docs/developers/03_extending.md index cdac6b895..4fab20a64 100644 --- a/docs/developers/03_extending.md +++ b/docs/developers/03_extending.md @@ -7,9 +7,20 @@ This document covers how to write your own tasks. For a more practical guide to - [Define Inputs and Outputs](#define-inputs-and-outputs) - [Register the Task](#register-the-task) - [Schema Format Annotations](#schema-format-annotations) + - [Built-in Format Annotations](#built-in-format-annotations) + - [Example: Using Format Annotations](#example-using-format-annotations) + - [Creating Custom Format Resolvers](#creating-custom-format-resolvers) - [Job Queues and LLM tasks](#job-queues-and-llm-tasks) - [Write a new Compound Task](#write-a-new-compound-task) - [Reactive Task UIs](#reactive-task-uis) +- [AI and RAG Tasks](#ai-and-rag-tasks) + - [Document Processing Tasks](#document-processing-tasks) + - [Vector and Embedding Tasks](#vector-and-embedding-tasks) + - [Retrieval and Generation Tasks](#retrieval-and-generation-tasks) + - [Chainable RAG Pipeline Example](#chainable-rag-pipeline-example) + - [Retrieval Pipeline Example](#retrieval-pipeline-example) + - [Hierarchical Document Structure](#hierarchical-document-structure) + - [Task Data Flow](#task-data-flow) ## Write a new Task @@ -138,13 +149,14 @@ When defining task input schemas, you can use `format` annotations to enable aut The system supports several format annotations out of the box: -| Format | Description | Helper Function | -| ------------------------------ | ----------------------------------- | ----------------------------- | -| `model` | Any AI model configuration | `TypeModel()` | -| `model:TaskName` | Model compatible with specific task | — | -| `storage:tabular` | Tabular data dataset | `TypeTabularStorage()` | -| `dataset:document-node-vector` | Vector storage dataset | `TypeChunkVectorRepository()` | -| `dataset:document` | Document dataset | `TypeDocumentRepository()` | +| Format | Description | Helper Function | +| --------------------------------- | ----------------------------------- | ----------------------------- | +| `model` | Any AI model configuration | `TypeModel()` | +| `model:TaskName` | Model compatible with specific task | — | +| `storage:tabular` | Tabular data storage | `TypeTabularStorage()` | +| `knowledge-base` | Knowledge base instance | `TypeKnowledgeBase()` | +| `credential` | Credential from credential store | — | +| `tasks` | Task class from task registry | — | ### Example: Using Format Annotations diff --git a/packages/ai/README.md b/packages/ai/README.md index ecef51a40..0afff559d 100644 --- a/packages/ai/README.md +++ b/packages/ai/README.md @@ -453,6 +453,11 @@ const kb = await createKnowledgeBase({ vectorDimensions: 384, // must match your embedding model }); +// Or use shared-table mode for multi-tenant scenarios — see @workglow/knowledge-base docs +// const scopedDocs = new ScopedTabularStorage(sharedDocStorage, "my-kb"); +// const scopedChunks = new ScopedVectorStorage(sharedChunkStorage, "my-kb"); +// const kb = new KnowledgeBase("my-kb", scopedDocs, scopedChunks); + // Document ingestion - fully chainable, no loops required await new Workflow() .structuralParser({ @@ -596,13 +601,14 @@ This resolution is handled by the input resolver system, which inspects schema ` ### Supported Format Annotations -| Format | Description | Resolver | -| --------------------------------- | ---------------------------------------- | -------------------------- | -| `model` | Any AI model configuration | ModelRepository | -| `model:TaskName` | Model compatible with specific task type | ModelRepository | -| `repository:tabular` | Tabular data repository | TabularStorageRegistry | -| `repository:document-node-vector` | Vector storage repository | VectorRepositoryRegistry | -| `repository:document` | Document repository | DocumentRepositoryRegistry | +| Format | Description | Resolver | +| ------------------ | ---------------------------------------- | -------------------------- | +| `model` | Any AI model configuration | ModelRepository | +| `model:TaskName` | Model compatible with specific task type | ModelRepository | +| `storage:tabular` | Tabular data storage | TabularStorageRegistry | +| `knowledge-base` | Knowledge base instance | KnowledgeBaseRegistry | +| `credential` | Credential from credential store | CredentialStoreRegistry | +| `tasks` | Task class from task registry | TaskRegistry | ### Custom Model Validation diff --git a/packages/knowledge-base/README.md b/packages/knowledge-base/README.md index 3d2eeae7c..cc5543226 100644 --- a/packages/knowledge-base/README.md +++ b/packages/knowledge-base/README.md @@ -21,6 +21,13 @@ Document management, hierarchical chunking, and knowledge base infrastructure fo - [Tree Traversal](#tree-traversal) - [Lifecycle Management](#lifecycle-management) - [Registry](#registry) +- [Shared-Table Mode](#shared-table-mode) + - [Overview](#overview-1) + - [Setting Up Shared Storage](#setting-up-shared-storage) + - [Scoped Wrappers](#scoped-wrappers) + - [Registering with Shared Tables](#registering-with-shared-tables) + - [Schemas and Indexes](#schemas-and-indexes) + - [When to Use Shared Tables](#when-to-use-shared-tables) - [Data Flow](#data-flow) - [Ingestion Pipeline](#ingestion-pipeline) - [Retrieval Pipeline](#retrieval-pipeline) @@ -28,6 +35,8 @@ Document management, hierarchical chunking, and knowledge base infrastructure fo - [Document](#document) - [KnowledgeBase](#knowledgebase-1) - [createKnowledgeBase](#createknowledgebase) + - [ScopedTabularStorage](#scopedtabularstorage) + - [ScopedVectorStorage](#scopedvectorstorage) - [StructuralParser](#structuralparser) - [Type Helpers](#type-helpers) - [License](#license) @@ -442,6 +451,134 @@ await task.run({ knowledgeBase: kb }); // Direct instance await task.run({ knowledgeBase: "my-kb" }); // Resolved from registry ``` +## Shared-Table Mode + +### Overview + +By default, each `KnowledgeBase` gets its own document table and chunk table. **Shared-table mode** lets multiple knowledge bases share the same underlying storage tables, partitioned by a `kb_id` column. This is useful when you have many knowledge bases and want to reduce table proliferation in your database. + +``` +Default mode (per-KB tables): Shared-table mode: +┌──────────────────────┐ ┌──────────────────────────┐ +│ kb_docs_my_kb │ │ shared_documents │ +│ (doc_id, data) │ │ (doc_id, kb_id, data) │ +├──────────────────────┤ │ ├─ kb_id = "kb-1" rows │ +│ kb_chunks_my_kb │ │ └─ kb_id = "kb-2" rows │ +│ (chunk_id, vector..) │ ├──────────────────────────┤ +├──────────────────────┤ │ shared_chunks │ +│ kb_docs_other_kb │ │ (chunk_id, kb_id, vec..) │ +│ (doc_id, data) │ │ ├─ kb_id = "kb-1" rows │ +├──────────────────────┤ │ └─ kb_id = "kb-2" rows │ +│ kb_chunks_other_kb │ └──────────────────────────┘ +│ (chunk_id, vector..) │ +└──────────────────────┘ +``` + +The `KnowledgeBase` class itself is unchanged — shared-table mode is implemented via thin wrapper classes (`ScopedTabularStorage`, `ScopedVectorStorage`) that inject `kb_id` on writes and filter by `kb_id` on reads. + +### Setting Up Shared Storage + +Create the shared storage instances once, globally: + +```typescript +import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage"; +import { + SharedDocumentStorageSchema, + SharedChunkVectorStorageSchema, + SharedDocumentIndexes, + SharedChunkIndexes, + SHARED_DOCUMENT_TABLE, + SHARED_CHUNK_TABLE, + DocumentStorageKey, + ChunkVectorPrimaryKey, +} from "@workglow/knowledge-base"; + +const sharedDocStorage = new InMemoryTabularStorage( + SharedDocumentStorageSchema, + DocumentStorageKey, + SharedDocumentIndexes +); + +const sharedChunkStorage = new InMemoryVectorStorage( + SharedChunkVectorStorageSchema, + ChunkVectorPrimaryKey, + SharedChunkIndexes, + 1024 // vector dimensions +); +``` + +For SQL backends (SQLite, PostgreSQL), replace `InMemoryTabularStorage` / `InMemoryVectorStorage` with the appropriate implementations. The shared schemas include indexes on `kb_id` and `[kb_id, doc_id]` for efficient scoped queries. + +### Scoped Wrappers + +For each knowledge base, create scoped wrappers that filter to that KB's data: + +```typescript +import { + ScopedTabularStorage, + ScopedVectorStorage, + KnowledgeBase, +} from "@workglow/knowledge-base"; + +// KB 1 +const scopedDocs1 = new ScopedTabularStorage(sharedDocStorage, "kb-1"); +const scopedChunks1 = new ScopedVectorStorage(sharedChunkStorage, "kb-1"); +const kb1 = new KnowledgeBase("kb-1", scopedDocs1, scopedChunks1); + +// KB 2 +const scopedDocs2 = new ScopedTabularStorage(sharedDocStorage, "kb-2"); +const scopedChunks2 = new ScopedVectorStorage(sharedChunkStorage, "kb-2"); +const kb2 = new KnowledgeBase("kb-2", scopedDocs2, scopedChunks2); +``` + +Each `KnowledgeBase` instance works exactly the same as in default mode — all CRUD, search, and lifecycle operations are transparently scoped to the KB's data. + +### Registering with Shared Tables + +Pass `{ sharedTables: true }` when registering so that the metadata record uses the shared table names: + +```typescript +import { registerKnowledgeBase } from "@workglow/knowledge-base"; + +registerKnowledgeBase("kb-1", kb1, { sharedTables: true }); +registerKnowledgeBase("kb-2", kb2, { sharedTables: true }); +``` + +You can check whether a persisted record uses shared tables with the `isSharedTableMode` helper: + +```typescript +import { isSharedTableMode } from "@workglow/knowledge-base"; + +const record = await repo.getKnowledgeBase("kb-1"); +if (isSharedTableMode(record)) { + // reconstruct using scoped wrappers +} +``` + +### Schemas and Indexes + +The shared schemas augment the standard schemas with a `kb_id` column: + +| Schema | Base Schema | Added Column | +| ------------------------------- | -------------------------- | ------------ | +| `SharedDocumentStorageSchema` | `DocumentStorageSchema` | `kb_id: string` | +| `SharedChunkVectorStorageSchema`| `ChunkVectorStorageSchema` | `kb_id: string` | + +Default shared table names: `SHARED_DOCUMENT_TABLE = "shared_documents"`, `SHARED_CHUNK_TABLE = "shared_chunks"`. + +Pre-defined index arrays for efficient queries: +- `SharedDocumentIndexes` — `[["kb_id"]]` +- `SharedChunkIndexes` — `[["kb_id"], ["kb_id", "doc_id"]]` + +### When to Use Shared Tables + +| Scenario | Recommendation | +| --- | --- | +| Few knowledge bases, each large | Default (per-KB tables) — simpler, no `kb_id` overhead | +| Many knowledge bases (e.g., per-user, per-tenant) | Shared tables — avoids table proliferation | +| Need cross-KB queries | Shared tables — query the shared storage directly | +| Using managed databases with table limits | Shared tables | + ## Data Flow ### Ingestion Pipeline @@ -640,6 +777,35 @@ interface CreateKnowledgeBaseOptions { } ``` +### ScopedTabularStorage + +```typescript +class ScopedTabularStorage + implements ITabularStorage +{ + constructor(inner: AnyTabularStorage, kbId: string); + + // All ITabularStorage methods are implemented. + // Writes inject kb_id, reads filter by kb_id, results strip kb_id. + // setupDatabase() and destroy() are no-ops (shared storage lifecycle is external). +} +``` + +### ScopedVectorStorage + +```typescript +class ScopedVectorStorage + extends ScopedTabularStorage + implements IVectorStorage +{ + constructor(inner: AnyVectorStorage, kbId: string); + + getVectorDimensions(): number; // Delegates to inner + similaritySearch(query, options?): Promise<(Entity & { score })[]>; // Post-filters by kb_id + hybridSearch?(query, options): Promise<(Entity & { score })[]>; // Post-filters by kb_id +} +``` + ### StructuralParser ```typescript diff --git a/packages/knowledge-base/src/common.ts b/packages/knowledge-base/src/common.ts index 776598be2..e8d05d0d7 100644 --- a/packages/knowledge-base/src/common.ts +++ b/packages/knowledge-base/src/common.ts @@ -13,6 +13,9 @@ export * from "./knowledge-base/KnowledgeBaseRepository"; export * from "./knowledge-base/InMemoryKnowledgeBaseRepository"; export * from "./knowledge-base/KnowledgeBaseRegistry"; export * from "./knowledge-base/createKnowledgeBase"; +export * from "./knowledge-base/ScopedTabularStorage"; +export * from "./knowledge-base/ScopedVectorStorage"; +export * from "./knowledge-base/SharedTableSchemas"; // Core document types (unchanged) export * from "./util/DatasetSchema"; diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts index 037e73a0f..69c9c9e59 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts @@ -10,10 +10,11 @@ import { registerInputResolver, ServiceRegistry, } from "@workglow/util"; -import type { KnowledgeBase } from "./KnowledgeBase"; import { InMemoryKnowledgeBaseRepository } from "./InMemoryKnowledgeBaseRepository"; +import type { KnowledgeBase } from "./KnowledgeBase"; import { KnowledgeBaseRepository } from "./KnowledgeBaseRepository"; import { knowledgeBaseTableNames, type KnowledgeBaseRecord } from "./KnowledgeBaseSchema"; +import { SHARED_CHUNK_TABLE, SHARED_DOCUMENT_TABLE } from "./SharedTableSchemas"; /** * Service token for the knowledge base registry @@ -25,8 +26,9 @@ export const KNOWLEDGE_BASES = /** * Service token for the knowledge base repository */ -export const KNOWLEDGE_BASE_REPOSITORY = - createServiceToken("knowledge-base.repository"); +export const KNOWLEDGE_BASE_REPOSITORY = createServiceToken( + "knowledge-base.repository" +); // Register default factory for live KB map if not already registered if (!globalServiceRegistry.has(KNOWLEDGE_BASES)) { @@ -67,16 +69,29 @@ export function setGlobalKnowledgeBaseRepository(repository: KnowledgeBaseReposi globalServiceRegistry.registerInstance(KNOWLEDGE_BASE_REPOSITORY, repository); } +export interface RegisterKnowledgeBaseOptions { + /** When true, record uses shared table names instead of per-KB table names. */ + readonly sharedTables?: boolean; +} + /** * Registers a knowledge base globally by ID. * Adds to both the live Map and the persistent repository. */ -export async function registerKnowledgeBase(id: string, kb: KnowledgeBase): Promise { + +export async function registerKnowledgeBase( + id: string, + kb: KnowledgeBase, + options?: RegisterKnowledgeBaseOptions +): Promise { const kbs = getGlobalKnowledgeBases(); kbs.set(id, kb); const now = new Date().toISOString(); - const tableNames = knowledgeBaseTableNames(id); + const useShared = options?.sharedTables === true; + const tableNames = useShared + ? { documentTable: SHARED_DOCUMENT_TABLE, chunkTable: SHARED_CHUNK_TABLE } + : knowledgeBaseTableNames(id); const record: KnowledgeBaseRecord = { kb_id: id, title: kb.title, diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts index 8a5728873..311a890e9 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts @@ -40,6 +40,14 @@ export const KnowledgeBasePrimaryKeyNames = ["kb_id"] as const; /** * Generates SQL-safe table names for a knowledge base's document and chunk storage. */ +/** + * Checks whether a KnowledgeBaseRecord uses shared-table mode. + */ +export function isSharedTableMode(record: KnowledgeBaseRecord): boolean { + // Avoid circular import — inline the constants + return record.document_table === "shared_documents" && record.chunk_table === "shared_chunks"; +} + export function knowledgeBaseTableNames(kbId: string): { readonly documentTable: string; readonly chunkTable: string; diff --git a/packages/knowledge-base/src/knowledge-base/ScopedTabularStorage.ts b/packages/knowledge-base/src/knowledge-base/ScopedTabularStorage.ts new file mode 100644 index 000000000..3ca8fa6e3 --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/ScopedTabularStorage.ts @@ -0,0 +1,206 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + AnyTabularStorage, + DeleteSearchCriteria, + ITabularStorage, + QueryOptions, + SearchCriteria, + TabularChangePayload, + TabularEventListener, + TabularEventName, + TabularEventParameters, + TabularSubscribeOptions, +} from "@workglow/storage"; +import type { DataPortSchemaObject } from "@workglow/util"; + +/** + * Wrapper implementing `ITabularStorage` that delegates to an inner shared + * storage instance, injecting `kb_id` on writes and filtering by `kb_id` on + * reads. The outer interface does not include `kb_id` — it is transparent to + * the `KnowledgeBase` class. + */ +export class ScopedTabularStorage< + Schema extends DataPortSchemaObject, + PrimaryKeyNames extends ReadonlyArray, + Entity = any, + PrimaryKey = any, + InsertType = any, +> implements ITabularStorage +{ + protected readonly inner: AnyTabularStorage; + protected readonly kbId: string; + + constructor(inner: AnyTabularStorage, kbId: string) { + this.inner = inner; + this.kbId = kbId; + } + + private inject(value: any): any { + return { ...value, kb_id: this.kbId }; + } + + private strip(entity: any): Entity { + if (!entity) return entity; + const { kb_id: _, ...rest } = entity; + return rest as Entity; + } + + private stripArray(entities: any[] | undefined): Entity[] | undefined { + if (!entities) return undefined; + return entities.map((e) => this.strip(e)); + } + + async put(value: InsertType): Promise { + const result = await this.inner.put(this.inject(value)); + return this.strip(result); + } + + async putBulk(values: InsertType[]): Promise { + const injected = values.map((v) => this.inject(v)); + const results = await this.inner.putBulk(injected); + return results.map((r: any) => this.strip(r)); + } + + async get(key: PrimaryKey): Promise { + const result = await this.inner.get(key as any); + if (!result) return undefined; + if ((result as any).kb_id !== this.kbId) return undefined; + return this.strip(result); + } + + async delete(key: PrimaryKey | Entity): Promise { + return this.inner.delete(key as any); + } + + async getAll(options?: QueryOptions): Promise { + const results = await this.inner.query({ kb_id: this.kbId } as any, options as any); + return this.stripArray(results); + } + + async deleteAll(): Promise { + await this.inner.deleteSearch({ kb_id: this.kbId } as any); + } + + async size(): Promise { + const results = await this.inner.query({ kb_id: this.kbId } as any); + return results ? results.length : 0; + } + + async query( + criteria: SearchCriteria, + options?: QueryOptions + ): Promise { + const results = await this.inner.query( + { ...(criteria as any), kb_id: this.kbId }, + options as any + ); + return this.stripArray(results); + } + + async deleteSearch(criteria: DeleteSearchCriteria): Promise { + await this.inner.deleteSearch({ ...(criteria as any), kb_id: this.kbId }); + } + + async getBulk(offset: number, limit: number): Promise { + const results = await this.inner.query({ kb_id: this.kbId } as any, { offset, limit }); + return this.stripArray(results); + } + + async *records(pageSize: number = 100): AsyncGenerator { + if (pageSize <= 0) { + throw new RangeError(`pageSize must be greater than 0, got ${pageSize}`); + } + let offset = 0; + while (true) { + const page = await this.getBulk(offset, pageSize); + if (!page || page.length === 0) { + break; + } + for (const entity of page) { + yield entity; + } + if (page.length < pageSize) break; + offset += pageSize; + } + } + + async *pages(pageSize: number = 100): AsyncGenerator { + if (pageSize <= 0) { + throw new RangeError(`pageSize must be greater than 0, got ${pageSize}`); + } + let offset = 0; + while (true) { + const page = await this.getBulk(offset, pageSize); + if (!page || page.length === 0) { + break; + } + yield page; + if (page.length < pageSize) break; + offset += pageSize; + } + } + + // Event delegation + on( + name: Event, + fn: TabularEventListener + ): void { + this.inner.on(name, fn as any); + } + + off( + name: Event, + fn: TabularEventListener + ): void { + this.inner.off(name, fn as any); + } + + emit( + name: Event, + ...args: TabularEventParameters + ): void { + this.inner.emit(name, ...(args as any)); + } + + once( + name: Event, + fn: TabularEventListener + ): void { + this.inner.once(name, fn as any); + } + + waitOn( + name: Event + ): Promise> { + return this.inner.waitOn(name) as any; + } + + subscribeToChanges( + callback: (change: TabularChangePayload) => void, + options?: TabularSubscribeOptions + ): () => void { + return this.inner.subscribeToChanges(callback as any, options); + } + + // Lifecycle — no-op for shared storage + async setupDatabase(): Promise { + // No-op: shared storage lifecycle is managed externally + } + + destroy(): void { + // No-op: shared storage lifecycle is managed externally + } + + [Symbol.dispose](): void { + // No-op + } + + async [Symbol.asyncDispose](): Promise { + // No-op + } +} diff --git a/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts b/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts new file mode 100644 index 000000000..9334c43fa --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts @@ -0,0 +1,94 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + AnyVectorStorage, + HybridSearchOptions, + IVectorStorage, + VectorSearchOptions, +} from "@workglow/storage"; +import type { DataPortSchemaObject, TypedArray } from "@workglow/util"; +import { ScopedTabularStorage } from "./ScopedTabularStorage"; + +/** + * Wrapper extending `ScopedTabularStorage` that also implements `IVectorStorage`. + * Delegates vector search methods to the inner shared vector storage, + * post-filtering results by `kb_id`. + */ +export class ScopedVectorStorage< + Metadata extends Record | undefined, + Schema extends DataPortSchemaObject, + Entity = any, + PrimaryKeyNames extends ReadonlyArray = ReadonlyArray< + keyof Schema["properties"] + >, + > + extends ScopedTabularStorage + implements IVectorStorage +{ + protected override readonly inner: AnyVectorStorage; + + constructor(inner: AnyVectorStorage, kbId: string) { + super(inner, kbId); + this.inner = inner; + } + + getVectorDimensions(): number { + return this.inner.getVectorDimensions(); + } + + async similaritySearch( + query: TypedArray, + options?: VectorSearchOptions + ): Promise<(Entity & { score: number })[]> { + const results = await this.inner.similaritySearch(query, { + ...options, + // Request extra results to account for post-filtering + topK: options?.topK ? options.topK * 3 : undefined, + } as any); + + const filtered = results + .filter((r: any) => r.kb_id === this.kbId) + .slice(0, options?.topK); + + return filtered.map((r: any) => { + const { kb_id: _, ...rest } = r; + return rest as Entity & { score: number }; + }); + } + + hybridSearch?( + query: TypedArray, + options: HybridSearchOptions + ): Promise<(Entity & { score: number })[]>; +} + +// Implement hybridSearch on the prototype so it matches the optional interface +ScopedVectorStorage.prototype.hybridSearch = async function ( + this: ScopedVectorStorage, + query: TypedArray, + options: HybridSearchOptions +): Promise { + if (typeof this.inner.hybridSearch !== "function") { + throw new Error( + "Hybrid search is not supported by the configured chunk storage backend. " + + "Please use a vector storage implementation that provides `hybridSearch`." + ); + } + const results = await this.inner.hybridSearch(query, { + ...options, + topK: options?.topK ? options.topK * 3 : undefined, + } as any); + + const filtered = results + .filter((r: any) => r.kb_id === this.kbId) + .slice(0, options?.topK); + + return filtered.map((r: any) => { + const { kb_id: _, ...rest } = r; + return rest; + }); +}; diff --git a/packages/knowledge-base/src/knowledge-base/SharedTableSchemas.ts b/packages/knowledge-base/src/knowledge-base/SharedTableSchemas.ts new file mode 100644 index 000000000..7b4b0acba --- /dev/null +++ b/packages/knowledge-base/src/knowledge-base/SharedTableSchemas.ts @@ -0,0 +1,74 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { TypedArraySchema, type DataPortSchemaObject } from "@workglow/util"; + +/** + * Default table names for shared-table mode. + */ +export const SHARED_DOCUMENT_TABLE = "shared_documents"; +export const SHARED_CHUNK_TABLE = "shared_chunks"; + +/** + * Augmented document storage schema with kb_id column for shared-table mode. + */ +export const SharedDocumentStorageSchema = { + type: "object", + properties: { + doc_id: { + type: "string", + "x-auto-generated": true, + title: "Document ID", + description: "Unique identifier for the document", + }, + kb_id: { + type: "string", + title: "Knowledge Base ID", + description: "Owning knowledge base identifier", + }, + data: { + type: "string", + title: "Document Data", + description: "JSON-serialized document", + }, + metadata: { + type: "object", + title: "Metadata", + description: "Metadata of the document", + }, + }, + required: ["doc_id", "kb_id", "data"], + additionalProperties: true, +} as const satisfies DataPortSchemaObject; + +/** + * Augmented chunk vector storage schema with kb_id column for shared-table mode. + */ +export const SharedChunkVectorStorageSchema = { + type: "object", + properties: { + chunk_id: { type: "string", "x-auto-generated": true }, + kb_id: { type: "string" }, + doc_id: { type: "string" }, + vector: TypedArraySchema(), + metadata: { type: "object", format: "metadata", additionalProperties: true }, + }, + required: ["chunk_id", "kb_id", "doc_id", "vector", "metadata"], + additionalProperties: false, +} as const satisfies DataPortSchemaObject; + +/** + * Index definitions for efficient KB-scoped queries on shared document table. + */ +export const SharedDocumentIndexes: ReadonlyArray> = [["kb_id"]]; + +/** + * Index definitions for efficient KB-scoped queries on shared chunk table. + */ +export const SharedChunkIndexes: ReadonlyArray> = [ + ["kb_id"], + ["kb_id", "doc_id"], +]; diff --git a/packages/storage/README.md b/packages/storage/README.md index b0278a420..f4649c6b9 100644 --- a/packages/storage/README.md +++ b/packages/storage/README.md @@ -644,17 +644,6 @@ const tabularSchema = TypeTabularStorage({ description: "Tabular data repository", }); -// Vector repository (format: "repository:document-node-vector") -const vectorSchema = TypeVectorRepository({ - title: "Embeddings Store", - description: "Vector embeddings repository", -}); - -// Document repository (format: "repository:document") -const docSchema = TypeDocumentRepository({ - title: "Document Store", - description: "Document storage repository", -}); ``` ### Event-Driven Architecture diff --git a/packages/task-graph/src/task/InputResolver.ts b/packages/task-graph/src/task/InputResolver.ts index 13f4f5a65..5aee4cf05 100644 --- a/packages/task-graph/src/task/InputResolver.ts +++ b/packages/task-graph/src/task/InputResolver.ts @@ -121,7 +121,6 @@ export async function resolveSchemaInputs>( // Phase 1: Resolve format-annotated string values const format = getSchemaFormat(propSchema); if (format) { - // Try full format first (e.g., "dataset:document-chunk"), then fall back to prefix (e.g., "dataset") let resolver = resolvers.get(format); if (!resolver) { const prefix = getFormatPrefix(format);