Forked from mindstudio/big-rag
Project Files
src / vectorstore / vectorStore.lancedb.ts.bak
import * as lancedb from "@lancedb/lancedb";
import * as path from "path";
import * as fs from "fs";
// Name of the single LanceDB table this module opens or creates for chunk rows.
const TABLE_NAME = "chunks";
/**
 * One chunk of a source document, as handed to `VectorStore.addChunks`.
 */
export interface DocumentChunk {
// Identifier stored in the row's `id` column (presumably unique per chunk — confirm with the producer).
id: string;
// The chunk's text content.
text: string;
// Vector stored in the row's `vector` column and compared against the query vector during search.
vector: number[];
// Path of the file the chunk came from.
filePath: string;
// Name of the file the chunk came from.
fileName: string;
// Hash identifying the file version; used as the key for delete/exists lookups.
fileHash: string;
// Index of this chunk (presumably its position within the file — confirm with the chunker).
chunkIndex: number;
// Extra fields; each key is flattened into the stored row as its own column.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
metadata: Record<string, any>;
}
/**
 * One hit returned by `VectorStore.search`.
 */
export interface SearchResult {
// Text of the matching chunk.
text: string;
// Similarity score, computed as `1 - _distance` of a cosine-distance search (higher is closer).
score: number;
// Path of the file the chunk came from.
filePath: string;
// Name of the file the chunk came from.
fileName: string;
// Stored chunk index; 0 when the row has none.
chunkIndex: number;
// The complete stored row as returned by the search, not only the caller-supplied metadata.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
metadata: Record<string, any>;
}
// Handle types are derived from the library's own signatures rather than imported by name:
// `Connection` is whatever `lancedb.connect` resolves to, `Table` whatever `openTable` resolves to.
type Connection = Awaited<ReturnType<typeof lancedb.connect>>;
type Table = Awaited<ReturnType<Connection["openTable"]>>;
/**
 * Thin wrapper around a LanceDB database directory that stores document chunks
 * in a single table and offers similarity search plus per-file bookkeeping.
 *
 * Writes (`addChunks`, `deleteByFileHash`) are serialized through an internal
 * promise queue so they never run concurrently against the table.
 */
export class VectorStore {
  private db: Connection | null = null;
  private table: Table | null = null;
  private dbPath: string;
  /**
   * Tail of the write queue. Invariant: this promise never rejects, so a
   * failed write cannot prevent later writes from running.
   */
  private updateMutex: Promise<void> = Promise.resolve();

  /**
   * @param dbPath Directory of the LanceDB database; resolved to an absolute path.
   */
  constructor(dbPath: string) {
    this.dbPath = path.resolve(dbPath);
  }

  /**
   * Escape a string for use in a LanceDB SQL predicate (single-quoted literal).
   */
  private escapePredicateString(value: string): string {
    return value.replace(/'/g, "''");
  }

  /**
   * Run `operation` once every previously queued write has settled.
   *
   * The returned promise carries the operation's own outcome (including a
   * rejection), while the stored queue tail swallows it. Without that split a
   * single rejected write would make every later `.then` on the chain skip its
   * callback, permanently blocking the store.
   */
  private enqueueWrite(operation: () => Promise<void>): Promise<void> {
    const outcome = this.updateMutex.then(operation);
    this.updateMutex = outcome.catch(() => undefined);
    return outcome;
  }

  /**
   * Initialize the vector store: create the database directory if needed,
   * connect, and open the chunk table when it already exists.
   *
   * @throws Re-throws any error from directory creation or LanceDB after logging it.
   */
  async initialize(): Promise<void> {
    try {
      await fs.promises.mkdir(this.dbPath, { recursive: true });
      const db = await lancedb.connect(this.dbPath);
      this.db = db;
      const names = await db.tableNames();
      // The table is created lazily by the first addChunks call, so it may be absent here.
      this.table = names.includes(TABLE_NAME)
        ? await db.openTable(TABLE_NAME)
        : null;
      console.log("Vector store initialized successfully");
    } catch (error) {
      console.error("Error initializing vector store:", error);
      throw error;
    }
  }

  /**
   * Add document chunks to the vector store.
   * Writes are queued so they never overlap with another add or delete.
   *
   * Each chunk's `metadata` keys are flattened into the row as extra columns;
   * the core columns (`id`, `vector`, `text`, `filePath`, `fileName`,
   * `fileHash`, `chunkIndex`) always keep the chunk's own values.
   *
   * @throws Error if the store has not been initialized; re-throws LanceDB write errors.
   */
  async addChunks(chunks: DocumentChunk[]): Promise<void> {
    // Capture the connection now so the queued work does not depend on a later null check.
    const db = this.db;
    if (!db) {
      throw new Error("Vector store not initialized");
    }
    if (chunks.length === 0) {
      return;
    }
    const records = chunks.map((chunk) => {
      const core = {
        id: chunk.id,
        vector: chunk.vector,
        text: chunk.text,
        filePath: chunk.filePath,
        fileName: chunk.fileName,
        fileHash: chunk.fileHash,
        chunkIndex: chunk.chunkIndex,
      };
      // `core` first fixes the column order, `core` last makes its values win
      // over any same-named metadata key.
      return Object.assign({}, core, chunk.metadata, core);
    });
    return this.enqueueWrite(async () => {
      try {
        if (!this.table) {
          this.table = await db.createTable(TABLE_NAME, records, {
            mode: "create",
          });
        } else {
          await this.table.add(records, { mode: "append" });
        }
        console.log(`Added ${chunks.length} chunks to vector store`);
      } catch (error) {
        console.error("Error adding chunks to vector store:", error);
        throw error;
      }
    });
  }

  /**
   * Search for similar chunks using cosine distance.
   *
   * @param queryVector Vector to compare against the stored vectors.
   * @param limit Maximum number of rows requested from the table (applied before thresholding).
   * @param threshold Minimum score (`1 - distance`) a row needs to be returned.
   * @returns Matching rows; an empty array when no table exists or the search fails
   *          (failures are logged, not thrown).
   */
  async search(
    queryVector: number[],
    limit: number = 5,
    threshold: number = 0.5,
  ): Promise<SearchResult[]> {
    if (!this.table) {
      console.log("No index available for search");
      return [];
    }
    try {
      const raw = await this.table
        .vectorSearch(queryVector)
        .distanceType("cosine")
        .limit(limit)
        .toArray();
      return raw
        .map((row: Record<string, unknown>) => {
          // A row without a distance is treated as maximally distant (score 0).
          const dist = (row._distance as number) ?? 1;
          const score = 1 - dist;
          return { row, score };
        })
        .filter(({ score }) => score >= threshold)
        .map(({ row, score }) => ({
          text: row.text as string,
          score,
          filePath: row.filePath as string,
          fileName: row.fileName as string,
          chunkIndex: (row.chunkIndex as number) ?? 0,
          // NOTE(review): this is the whole row (core columns, `vector`,
          // `_distance` included), kept as-is because callers may rely on it.
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          metadata: row as Record<string, any>,
        }));
    } catch (error) {
      console.error("Error searching vector store:", error);
      return [];
    }
  }

  /**
   * Delete chunks for a specific file (by hash).
   * Queued behind pending writes; failures are logged and not propagated.
   */
  async deleteByFileHash(fileHash: string): Promise<void> {
    const predicate = `fileHash = '${this.escapePredicateString(fileHash)}'`;
    return this.enqueueWrite(async () => {
      // Checked inside the queue: an earlier queued addChunks may have just
      // created the table, and its rows must be visible to this delete.
      const table = this.table;
      if (!table) {
        return;
      }
      try {
        await table.delete(predicate);
        console.log(`Deleted chunks for file hash: ${fileHash}`);
      } catch (error) {
        console.error(`Error deleting chunks for file hash ${fileHash}:`, error);
      }
    });
  }

  /**
   * Check if a file (by hash) exists in the store.
   *
   * @returns `false` when there is no table or the lookup fails (the failure is logged).
   */
  async hasFile(fileHash: string): Promise<boolean> {
    if (!this.table) {
      return false;
    }
    try {
      const escaped = this.escapePredicateString(fileHash);
      const n = await this.table.countRows(`fileHash = '${escaped}'`);
      return n > 0;
    } catch (error) {
      console.error(`Error checking file hash ${fileHash}:`, error);
      return false;
    }
  }

  /**
   * Get a map of file paths to the set of hashes currently stored.
   *
   * Rows missing either column are skipped. On failure the error is logged and
   * whatever was collected so far is returned.
   */
  async getFileHashInventory(): Promise<Map<string, Set<string>>> {
    const inventory = new Map<string, Set<string>>();
    if (!this.table) {
      return inventory;
    }
    try {
      const rows = await this.table
        .query()
        .select(["filePath", "fileHash"])
        .toArray();
      for (const row of rows) {
        const filePath = row.filePath as string | undefined;
        const fileHash = row.fileHash as string | undefined;
        if (!filePath || !fileHash) continue;
        let hashes = inventory.get(filePath);
        if (!hashes) {
          hashes = new Set<string>();
          inventory.set(filePath, hashes);
        }
        hashes.add(fileHash);
      }
      return inventory;
    } catch (error) {
      console.error("Error building file hash inventory:", error);
      return inventory;
    }
  }

  /**
   * Get statistics about the vector store.
   *
   * @returns Total row count and the number of distinct non-empty file hashes;
   *          both are 0 when there is no table or the query fails (logged).
   */
  async getStats(): Promise<{ totalChunks: number; uniqueFiles: number }> {
    if (!this.table) {
      return { totalChunks: 0, uniqueFiles: 0 };
    }
    try {
      const totalChunks = await this.table.countRows();
      const rows = await this.table.query().select(["fileHash"]).toArray();
      const uniqueFiles = new Set(
        rows.map((r: Record<string, string>) => r.fileHash).filter(Boolean),
      ).size;
      return { totalChunks, uniqueFiles };
    } catch (error) {
      console.error("Error getting stats:", error);
      return { totalChunks: 0, uniqueFiles: 0 };
    }
  }

  /**
   * Close the vector store connection.
   *
   * Waits for queued writes to settle first so none of them runs against a
   * closed connection, then releases the table handle and the connection.
   */
  async close(): Promise<void> {
    // Safe to await: the queue tail never rejects.
    await this.updateMutex;
    try {
      this.table?.close();
      this.db?.close();
    } finally {
      this.db = null;
      this.table = null;
    }
  }
}