Implement LanceDB-based search and document retrieval
- Add LanceDBSearch class for vector-based documentation search - Implement search() method with category filtering and relevance scoring - Add getDocumentByPath() with URL lookup and local file fetching - Fix getDocument() to use .query() instead of .search() for non-vector queries - Update handlers.ts to integrate LanceDBSearch with MCP tools - Parse stringified array fields (breadcrumbs, headings, keywords, playgroundIds) in get_babylon_doc - Fetch fresh content from local repositories (Documentation, Babylon.js, havok) - Add DocumentParser, LanceDBIndexer and related types for document processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a3e027ef02
commit
f56b92e76e
@ -1,5 +1,16 @@
|
|||||||
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
import { LanceDBSearch } from '../search/lancedb-search.js';
|
||||||
|
|
||||||
|
let searchInstance: LanceDBSearch | null = null;
|
||||||
|
|
||||||
|
async function getSearchInstance(): Promise<LanceDBSearch> {
|
||||||
|
if (!searchInstance) {
|
||||||
|
searchInstance = new LanceDBSearch();
|
||||||
|
await searchInstance.initialize();
|
||||||
|
}
|
||||||
|
return searchInstance;
|
||||||
|
}
|
||||||
|
|
||||||
export function setupHandlers(server: McpServer): void {
|
export function setupHandlers(server: McpServer): void {
|
||||||
registerSearchDocsTool(server);
|
registerSearchDocsTool(server);
|
||||||
@ -25,18 +36,60 @@ function registerSearchDocsTool(server: McpServer): void {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
async ({ query, category, limit = 5 }) => {
|
async ({ query, category, limit = 5 }) => {
|
||||||
// TODO: Implement actual search logic
|
try {
|
||||||
const result = {
|
const search = await getSearchInstance();
|
||||||
message: 'Search functionality not yet implemented',
|
const options = category ? { category, limit } : { limit };
|
||||||
query,
|
const results = await search.search(query, options);
|
||||||
category,
|
|
||||||
limit,
|
|
||||||
results: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
if (results.length === 0) {
|
||||||
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
|
return {
|
||||||
};
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `No results found for "${query}". Try different search terms or check if the documentation has been indexed.`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format results for better readability
|
||||||
|
const formattedResults = results.map((result, index) => ({
|
||||||
|
rank: index + 1,
|
||||||
|
title: result.title,
|
||||||
|
description: result.description,
|
||||||
|
url: result.url,
|
||||||
|
category: result.category,
|
||||||
|
relevance: (result.score * 100).toFixed(1) + '%',
|
||||||
|
snippet: result.content,
|
||||||
|
keywords: result.keywords,
|
||||||
|
}));
|
||||||
|
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: JSON.stringify(
|
||||||
|
{
|
||||||
|
query,
|
||||||
|
totalResults: results.length,
|
||||||
|
results: formattedResults,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Error searching documentation: ${error instanceof Error ? error.message : String(error)}`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -51,15 +104,68 @@ function registerGetDocTool(server: McpServer): void {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
async ({ path }) => {
|
async ({ path }) => {
|
||||||
// TODO: Implement actual document retrieval
|
try {
|
||||||
const result = {
|
const search = await getSearchInstance();
|
||||||
message: 'Document retrieval not yet implemented',
|
const document = await search.getDocumentByPath(path);
|
||||||
path,
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
if (!document) {
|
||||||
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
|
return {
|
||||||
};
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Document not found: ${path}. The path may be incorrect or the documentation has not been indexed.`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse stringified fields back to arrays
|
||||||
|
const breadcrumbs = document.breadcrumbs
|
||||||
|
? document.breadcrumbs.split(' > ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const headings = document.headings
|
||||||
|
? document.headings.split(' | ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const keywords = document.keywords
|
||||||
|
? document.keywords.split(', ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const playgroundIds = document.playgroundIds
|
||||||
|
? document.playgroundIds.split(', ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: JSON.stringify(
|
||||||
|
{
|
||||||
|
title: document.title,
|
||||||
|
description: document.description,
|
||||||
|
url: document.url,
|
||||||
|
category: document.category,
|
||||||
|
breadcrumbs,
|
||||||
|
content: document.content,
|
||||||
|
headings,
|
||||||
|
keywords,
|
||||||
|
playgroundIds,
|
||||||
|
lastModified: document.lastModified,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Error retrieving document: ${error instanceof Error ? error.message : String(error)}`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
49
src/search/document-parser.test.ts
Normal file
49
src/search/document-parser.test.ts
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
import { DocumentParser } from './document-parser.js';
import path from 'path';

// Integration-style tests for DocumentParser: they parse a real markdown file
// from the locally checked-out Documentation repository, so they require
// `data/repositories/Documentation` to be cloned under the working directory
// and will fail if that fixture is missing.
describe('DocumentParser', () => {
  const parser = new DocumentParser();
  // Sample document known to carry YAML front matter (title, description,
  // keywords) and at least one level-1 heading.
  const sampleFile = path.join(
    process.cwd(),
    'data/repositories/Documentation/content/features.md'
  );

  it('should parse YAML front matter', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.title).toBe('Babylon.js Features');
    expect(doc.description).toContain('breadth and depth');
    // keywords is a comma-separated front-matter field parsed into an array.
    expect(doc.keywords).toContain('features');
    expect(doc.keywords).toContain('capabilities');
  });

  it('should extract category from file path', async () => {
    const doc = await parser.parseFile(sampleFile);

    // Category and breadcrumbs are derived from the path under `content/`,
    // not from front matter.
    expect(doc.category).toBe('features');
    expect(doc.breadcrumbs).toEqual(['features']);
  });

  it('should extract headings', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.headings.length).toBeGreaterThan(0);
    expect(doc.headings[0]?.text).toBe('Babylon.js Features');
    expect(doc.headings[0]?.level).toBe(1);
  });

  it('should have markdown content', async () => {
    const doc = await parser.parseFile(sampleFile);

    // `content` is the markdown body with front matter stripped.
    expect(doc.content).toContain('Babylon.js Features');
    expect(doc.content.length).toBeGreaterThan(0);
  });

  it('should extract file path and modified date', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.filePath).toBe(sampleFile);
    // lastModified comes from fs.stat mtime, hence a Date instance.
    expect(doc.lastModified).toBeInstanceOf(Date);
  });
});
|
||||||
99
src/search/document-parser.ts
Normal file
99
src/search/document-parser.ts
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
import matter from 'gray-matter';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
|
||||||
|
|
||||||
|
export class DocumentParser {
|
||||||
|
async parseFile(filePath: string): Promise<DocumentMetadata> {
|
||||||
|
const content = await fs.readFile(filePath, 'utf-8');
|
||||||
|
const { data, content: markdown } = matter(content);
|
||||||
|
|
||||||
|
return {
|
||||||
|
filePath,
|
||||||
|
title: data.title || '',
|
||||||
|
description: data.description || '',
|
||||||
|
keywords: this.parseKeywords(data.keywords),
|
||||||
|
category: this.extractCategory(filePath),
|
||||||
|
breadcrumbs: this.extractBreadcrumbs(filePath),
|
||||||
|
content: markdown,
|
||||||
|
headings: this.extractHeadings(markdown),
|
||||||
|
codeBlocks: this.extractCodeBlocks(markdown),
|
||||||
|
furtherReading: data['further-reading'] || [],
|
||||||
|
playgroundIds: this.extractPlaygroundIds(markdown),
|
||||||
|
lastModified: await this.getFileModifiedDate(filePath),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private parseKeywords(keywords: string | undefined): string[] {
|
||||||
|
if (!keywords) return [];
|
||||||
|
return keywords.split(',').map((k) => k.trim()).filter(Boolean);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractCategory(filePath: string): string {
|
||||||
|
const match = filePath.match(/content\/([^/]+(?:\/[^/]+)*)/);
|
||||||
|
if (!match || !match[1]) return 'uncategorized';
|
||||||
|
|
||||||
|
// Remove .md extension if present
|
||||||
|
return match[1].replace(/\.md$/, '');
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractBreadcrumbs(filePath: string): string[] {
|
||||||
|
const category = this.extractCategory(filePath);
|
||||||
|
return category.split('/').filter(Boolean);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractHeadings(markdown: string): Heading[] {
|
||||||
|
const headings: Heading[] = [];
|
||||||
|
const lines = markdown.split('\n');
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
const trimmed = line.trim();
|
||||||
|
const match = trimmed.match(/^(#{1,6})\s+(.+)$/);
|
||||||
|
if (match && match[1] && match[2]) {
|
||||||
|
const level = match[1].length;
|
||||||
|
const text = match[2].trim();
|
||||||
|
const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
|
||||||
|
headings.push({ level, text, id });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return headings;
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractCodeBlocks(markdown: string): CodeBlock[] {
|
||||||
|
const codeBlocks: CodeBlock[] = [];
|
||||||
|
const regex = /```(\w+)?\n([\s\S]*?)```/g;
|
||||||
|
let match;
|
||||||
|
|
||||||
|
while ((match = regex.exec(markdown)) !== null) {
|
||||||
|
if (match.index !== undefined && match[2] !== undefined) {
|
||||||
|
const lineStart = markdown.substring(0, match.index).split('\n').length;
|
||||||
|
codeBlocks.push({
|
||||||
|
language: match[1] || 'plaintext',
|
||||||
|
code: match[2].trim(),
|
||||||
|
lineStart,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return codeBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractPlaygroundIds(markdown: string): string[] {
|
||||||
|
const ids: string[] = [];
|
||||||
|
const regex = /<Playground\s+id=["']#([^"']+)["']/g;
|
||||||
|
let match;
|
||||||
|
|
||||||
|
while ((match = regex.exec(markdown)) !== null) {
|
||||||
|
if (match[1]) {
|
||||||
|
ids.push(match[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async getFileModifiedDate(filePath: string): Promise<Date> {
|
||||||
|
const stats = await fs.stat(filePath);
|
||||||
|
return stats.mtime;
|
||||||
|
}
|
||||||
|
}
|
||||||
218
src/search/lancedb-indexer.ts
Normal file
218
src/search/lancedb-indexer.ts
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
import { connect } from '@lancedb/lancedb';
|
||||||
|
import { pipeline } from '@xenova/transformers';
|
||||||
|
import { DocumentParser } from './document-parser.js';
|
||||||
|
import type { DocumentMetadata } from './types.js';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
export interface EmbeddedDocument {
|
||||||
|
id: string;
|
||||||
|
title: string;
|
||||||
|
description: string;
|
||||||
|
content: string;
|
||||||
|
keywords: string;
|
||||||
|
category: string;
|
||||||
|
breadcrumbs: string;
|
||||||
|
filePath: string;
|
||||||
|
url: string;
|
||||||
|
source: string;
|
||||||
|
headings: string;
|
||||||
|
codeSnippets: string;
|
||||||
|
playgroundIds: string;
|
||||||
|
lastModified: string;
|
||||||
|
vector: number[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface DocumentSource {
|
||||||
|
name: string;
|
||||||
|
path: string;
|
||||||
|
urlPrefix: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export class LanceDBIndexer {
|
||||||
|
private db: any;
|
||||||
|
private embedder: any;
|
||||||
|
private parser: DocumentParser;
|
||||||
|
private readonly dbPath: string;
|
||||||
|
private readonly tableName: string;
|
||||||
|
private readonly sources: DocumentSource[];
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
dbPath: string = './data/lancedb',
|
||||||
|
sources: DocumentSource[] = [
|
||||||
|
{
|
||||||
|
name: 'documentation',
|
||||||
|
path: './data/repositories/Documentation/content',
|
||||||
|
urlPrefix: 'https://doc.babylonjs.com',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
tableName: string = 'babylon_docs'
|
||||||
|
) {
|
||||||
|
this.dbPath = dbPath;
|
||||||
|
this.sources = sources;
|
||||||
|
this.tableName = tableName;
|
||||||
|
this.parser = new DocumentParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
console.log('Initializing LanceDB connection...');
|
||||||
|
this.db = await connect(this.dbPath);
|
||||||
|
|
||||||
|
console.log('Loading embedding model (this may take a moment)...');
|
||||||
|
this.embedder = await pipeline(
|
||||||
|
'feature-extraction',
|
||||||
|
'Xenova/all-MiniLM-L6-v2'
|
||||||
|
);
|
||||||
|
console.log('Embedding model loaded successfully');
|
||||||
|
}
|
||||||
|
|
||||||
|
async indexDocuments(): Promise<void> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Indexer not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const allDocuments: EmbeddedDocument[] = [];
|
||||||
|
|
||||||
|
// Process each documentation source
|
||||||
|
for (const source of this.sources) {
|
||||||
|
console.log(`\nProcessing source: ${source.name}`);
|
||||||
|
console.log(`Path: ${source.path}`);
|
||||||
|
console.log('Finding markdown files...');
|
||||||
|
|
||||||
|
const markdownFiles = await this.findMarkdownFiles(source.path);
|
||||||
|
console.log(`Found ${markdownFiles.length} markdown files in ${source.name}`);
|
||||||
|
|
||||||
|
console.log('Parsing and embedding documents...');
|
||||||
|
|
||||||
|
for (let i = 0; i < markdownFiles.length; i++) {
|
||||||
|
const filePath = markdownFiles[i];
|
||||||
|
if (!filePath) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const doc = await this.processDocument(filePath, source);
|
||||||
|
allDocuments.push(doc);
|
||||||
|
|
||||||
|
if ((i + 1) % 50 === 0) {
|
||||||
|
console.log(`Processed ${i + 1}/${markdownFiles.length} documents from ${source.name}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error processing ${filePath}:`, error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`✓ Completed ${source.name}: ${markdownFiles.length} files processed`);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\nTotal documents processed: ${allDocuments.length}`);
|
||||||
|
console.log('Creating LanceDB table...');
|
||||||
|
|
||||||
|
// Drop existing table if it exists
|
||||||
|
const tableNames = await this.db.tableNames();
|
||||||
|
if (tableNames.includes(this.tableName)) {
|
||||||
|
await this.db.dropTable(this.tableName);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create new table with embedded documents
|
||||||
|
await this.db.createTable(this.tableName, allDocuments);
|
||||||
|
console.log('Indexing complete!');
|
||||||
|
}
|
||||||
|
|
||||||
|
private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
|
||||||
|
const metadata = await this.parser.parseFile(filePath);
|
||||||
|
const embeddingText = this.createEmbeddingText(metadata);
|
||||||
|
const vector = await this.generateEmbedding(embeddingText);
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: this.generateDocId(filePath, source),
|
||||||
|
title: metadata.title,
|
||||||
|
description: metadata.description,
|
||||||
|
content: this.truncateContent(metadata.content, 20000),
|
||||||
|
keywords: metadata.keywords.join(', '),
|
||||||
|
category: metadata.category,
|
||||||
|
breadcrumbs: metadata.breadcrumbs.join(' > '),
|
||||||
|
filePath: metadata.filePath,
|
||||||
|
url: this.generateDocUrl(metadata, source),
|
||||||
|
source: source.name,
|
||||||
|
headings: metadata.headings.map(h => h.text).join(' | '),
|
||||||
|
codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
|
||||||
|
playgroundIds: metadata.playgroundIds.join(', '),
|
||||||
|
lastModified: metadata.lastModified.toISOString(),
|
||||||
|
vector,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private createEmbeddingText(metadata: DocumentMetadata): string {
|
||||||
|
// Combine key fields for embedding - prioritize title, description, keywords
|
||||||
|
const parts = [
|
||||||
|
metadata.title,
|
||||||
|
metadata.description,
|
||||||
|
metadata.keywords.join(' '),
|
||||||
|
metadata.headings.slice(0, 5).map(h => h.text).join(' '),
|
||||||
|
this.truncateContent(metadata.content, 500),
|
||||||
|
];
|
||||||
|
return parts.filter(Boolean).join(' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbedding(text: string): Promise<number[]> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Embedder not initialized');
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.embedder(text, {
|
||||||
|
pooling: 'mean',
|
||||||
|
normalize: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
return Array.from(result.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async findMarkdownFiles(dir: string): Promise<string[]> {
|
||||||
|
const files: string[] = [];
|
||||||
|
const entries = await fs.readdir(dir, { withFileTypes: true });
|
||||||
|
|
||||||
|
for (const entry of entries) {
|
||||||
|
const fullPath = path.join(dir, entry.name);
|
||||||
|
if (entry.isDirectory()) {
|
||||||
|
const subFiles = await this.findMarkdownFiles(fullPath);
|
||||||
|
files.push(...subFiles);
|
||||||
|
} else if (entry.isFile() && entry.name.endsWith('.md')) {
|
||||||
|
files.push(fullPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return files;
|
||||||
|
}
|
||||||
|
|
||||||
|
private generateDocId(filePath: string, source: DocumentSource): string {
|
||||||
|
const basePath = source.path;
|
||||||
|
const relativePath = filePath
|
||||||
|
.replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
|
||||||
|
.replace(/\.md$/i, '')
|
||||||
|
.replace(/\//g, '_');
|
||||||
|
return `${source.name}_${relativePath}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
|
||||||
|
const basePath = source.path;
|
||||||
|
const relativePath = metadata.filePath
|
||||||
|
.replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
|
||||||
|
.replace(/\.md$/i, '');
|
||||||
|
|
||||||
|
// For source-repo, use GitHub URL; for documentation, use doc site
|
||||||
|
if (source.name === 'source-repo') {
|
||||||
|
return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return `${source.urlPrefix}/${relativePath}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private truncateContent(content: string, maxLength: number): string {
|
||||||
|
if (content.length <= maxLength) return content;
|
||||||
|
return content.substring(0, maxLength) + '...';
|
||||||
|
}
|
||||||
|
|
||||||
|
async close(): Promise<void> {
|
||||||
|
// LanceDB doesn't require explicit closing
|
||||||
|
console.log('Indexer closed');
|
||||||
|
}
|
||||||
|
}
|
||||||
185
src/search/lancedb-search.ts
Normal file
185
src/search/lancedb-search.ts
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
import { connect } from '@lancedb/lancedb';
|
||||||
|
import { pipeline } from '@xenova/transformers';
|
||||||
|
import type { SearchOptions, SearchResult } from './types.js';
|
||||||
|
import type { EmbeddedDocument } from './lancedb-indexer.js';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
export class LanceDBSearch {
|
||||||
|
private db: any;
|
||||||
|
private table: any;
|
||||||
|
private embedder: any;
|
||||||
|
private readonly dbPath: string;
|
||||||
|
private readonly tableName: string;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
dbPath: string = './data/lancedb',
|
||||||
|
tableName: string = 'babylon_docs'
|
||||||
|
) {
|
||||||
|
this.dbPath = dbPath;
|
||||||
|
this.tableName = tableName;
|
||||||
|
}
|
||||||
|
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
this.db = await connect(this.dbPath);
|
||||||
|
this.table = await this.db.openTable(this.tableName);
|
||||||
|
|
||||||
|
this.embedder = await pipeline(
|
||||||
|
'feature-extraction',
|
||||||
|
'Xenova/all-MiniLM-L6-v2'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
|
||||||
|
if (!this.table || !this.embedder) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const limit = options.limit || 5;
|
||||||
|
const queryVector = await this.generateEmbedding(query);
|
||||||
|
|
||||||
|
// Build the search query
|
||||||
|
let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
|
||||||
|
|
||||||
|
// Apply category filter if provided
|
||||||
|
if (options.category) {
|
||||||
|
searchQuery = searchQuery.where(`category = '${options.category}'`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const results = await searchQuery.toArray();
|
||||||
|
|
||||||
|
return results.map((doc: any) => ({
|
||||||
|
title: doc.title,
|
||||||
|
description: doc.description,
|
||||||
|
content: this.extractRelevantSnippet(doc.content, query),
|
||||||
|
url: doc.url,
|
||||||
|
category: doc.category,
|
||||||
|
score: doc._distance ? 1 - doc._distance : 0, // Convert distance to similarity score
|
||||||
|
keywords: doc.keywords.split(', ').filter(Boolean),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
async getDocument(docId: string): Promise<EmbeddedDocument | null> {
|
||||||
|
if (!this.table) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const results = await this.table
|
||||||
|
.query()
|
||||||
|
.where(`id = '${docId}'`)
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
return results.length > 0 ? results[0] : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
|
||||||
|
if (!this.table) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to find document by URL first
|
||||||
|
let results = await this.table
|
||||||
|
.query()
|
||||||
|
.where(`url = '${filePath}'`)
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
if (results.length > 0) {
|
||||||
|
const doc = results[0];
|
||||||
|
// Fetch fresh content from local file if available
|
||||||
|
const freshContent = await this.fetchLocalContent(doc.filePath);
|
||||||
|
if (freshContent) {
|
||||||
|
return { ...doc, content: freshContent };
|
||||||
|
}
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If not found by URL, try by docId conversion
|
||||||
|
const docId = this.pathToDocId(filePath);
|
||||||
|
return this.getDocument(docId);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async fetchLocalContent(filePath: string): Promise<string | null> {
|
||||||
|
try {
|
||||||
|
// Check if the file exists in our local repositories
|
||||||
|
const possiblePaths = [
|
||||||
|
filePath,
|
||||||
|
path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
|
||||||
|
path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
|
||||||
|
path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const possiblePath of possiblePaths) {
|
||||||
|
try {
|
||||||
|
const content = await fs.readFile(possiblePath, 'utf-8');
|
||||||
|
return content;
|
||||||
|
} catch {
|
||||||
|
// Continue to next path
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
} catch (error) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbedding(text: string): Promise<number[]> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Embedder not initialized');
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.embedder(text, {
|
||||||
|
pooling: 'mean',
|
||||||
|
normalize: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
return Array.from(result.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
|
||||||
|
// Simple snippet extraction - find first occurrence of query terms
|
||||||
|
const queryTerms = query.toLowerCase().split(/\s+/);
|
||||||
|
|
||||||
|
let bestIndex = 0;
|
||||||
|
let maxMatches = 0;
|
||||||
|
|
||||||
|
// Find the position with most query term matches
|
||||||
|
const words = content.split(/\s+/);
|
||||||
|
for (let i = 0; i < words.length; i++) {
|
||||||
|
const windowText = words.slice(i, i + 50).join(' ').toLowerCase();
|
||||||
|
const matches = queryTerms.filter(term => windowText.includes(term)).length;
|
||||||
|
if (matches > maxMatches) {
|
||||||
|
maxMatches = matches;
|
||||||
|
bestIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract snippet around best match
|
||||||
|
const start = Math.max(0, bestIndex - 10);
|
||||||
|
const snippetWords = words.slice(start, start + 60);
|
||||||
|
let snippet = snippetWords.join(' ');
|
||||||
|
|
||||||
|
if (snippet.length > snippetLength) {
|
||||||
|
snippet = snippet.substring(0, snippetLength) + '...';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start > 0) {
|
||||||
|
snippet = '...' + snippet;
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippet;
|
||||||
|
}
|
||||||
|
|
||||||
|
private pathToDocId(filePath: string): string {
|
||||||
|
return filePath
|
||||||
|
.replace(/^.*\/content\//, '')
|
||||||
|
.replace(/\.md$/, '')
|
||||||
|
.replace(/\//g, '_');
|
||||||
|
}
|
||||||
|
|
||||||
|
async close(): Promise<void> {
|
||||||
|
// LanceDB doesn't require explicit closing
|
||||||
|
}
|
||||||
|
}
|
||||||
47
src/search/types.ts
Normal file
47
src/search/types.ts
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/**
 * Fully structured metadata for one parsed markdown documentation file,
 * as produced by DocumentParser.parseFile.
 */
export interface DocumentMetadata {
  // Path the file was read from (as passed to parseFile).
  filePath: string;
  // Front matter `title` ('' when absent).
  title: string;
  // Front matter `description` ('' when absent).
  description: string;
  // Front matter `keywords` parsed into individual terms.
  keywords: string[];
  // Derived from the path under `content/`, e.g. 'features' or 'divingDeeper/physics'.
  category: string;
  // The category path split on '/'.
  breadcrumbs: string[];
  // Markdown body with front matter stripped.
  content: string;
  // ATX headings found in the body, in document order.
  headings: Heading[];
  // Fenced code blocks found in the body, in document order.
  codeBlocks: CodeBlock[];
  // Front matter `further-reading` entries ([] when absent).
  furtherReading: RelatedLink[];
  // Ids captured from <Playground id="#..."> embeds (without the '#').
  playgroundIds: string[];
  // Filesystem mtime of the source file.
  lastModified: Date;
}

/** One markdown heading. */
export interface Heading {
  // ATX level 1-6 (number of '#' characters).
  level: number;
  // Heading text with the '#' markers and surrounding whitespace removed.
  text: string;
  // Anchor-style slug derived from `text` (lowercased, hyphenated).
  id: string;
}

/** One fenced code block from a markdown body. */
export interface CodeBlock {
  // Info-string language tag, or 'plaintext' when the fence had none.
  language: string;
  // Code between the fences, trimmed.
  code: string;
  // 1-based line number where the opening fence appears.
  lineStart: number;
}

/** A `further-reading` front matter entry. */
export interface RelatedLink {
  title: string;
  url: string;
}

/** Options accepted by LanceDBSearch.search. */
export interface SearchOptions {
  // Maximum number of results (default 5).
  limit?: number;
  // Exact-match filter on the document category.
  category?: string;
  // Reserved for future query modes; currently unused by LanceDBSearch.
  queryType?: 'keyword' | 'semantic' | 'hybrid';
}

/** One search hit returned by LanceDBSearch.search. */
export interface SearchResult {
  title: string;
  description: string;
  // Relevant snippet extracted from the document body (not the full text).
  content: string;
  url: string;
  category: string;
  // Similarity score in 0..1 (1 - vector distance).
  score: number;
  keywords: string[];
}
|
||||||
Loading…
Reference in New Issue
Block a user