Implement LanceDB-based search and document retrieval

- Add LanceDBSearch class for vector-based documentation search
- Implement search() method with category filtering and relevance scoring
- Add getDocumentByPath() with URL lookup and local file fetching
- Fix getDocument() to use .query() instead of .search() for non-vector queries
- Update handlers.ts to integrate LanceDBSearch with MCP tools
- Parse stringified array fields (breadcrumbs, headings, keywords, playgroundIds) in get_babylon_doc
- Fetch fresh content from local repositories (Documentation, Babylon.js, havok)
- Add DocumentParser, LanceDBIndexer and related types for document processing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Michael Mainguy 2025-11-23 04:57:29 -06:00
parent a3e027ef02
commit f56b92e76e
6 changed files with 723 additions and 19 deletions

View File

@ -1,5 +1,16 @@
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { z } from 'zod';
import { LanceDBSearch } from '../search/lancedb-search.js';
// Module-level cache for the shared search instance (one per process).
let searchInstance: LanceDBSearch | null = null;

/**
 * Lazily creates and initializes the shared LanceDBSearch singleton.
 *
 * The instance is cached only after `initialize()` resolves, so a failed
 * initialization is NOT cached and the next call retries, instead of
 * handing every subsequent caller a half-initialized searcher (the
 * original assigned `searchInstance` before awaiting `initialize()`).
 *
 * NOTE(review): concurrent first calls can still race and each build their
 * own instance (last one wins). Harmless for a single-process MCP server —
 * confirm handlers cannot run concurrently during startup.
 *
 * @returns the initialized, process-wide LanceDBSearch instance
 */
async function getSearchInstance(): Promise<LanceDBSearch> {
  if (!searchInstance) {
    const instance = new LanceDBSearch();
    await instance.initialize();
    searchInstance = instance;
  }
  return searchInstance;
}
export function setupHandlers(server: McpServer): void {
registerSearchDocsTool(server);
@ -25,18 +36,60 @@ function registerSearchDocsTool(server: McpServer): void {
},
},
async ({ query, category, limit = 5 }) => {
// TODO: Implement actual search logic
const result = {
message: 'Search functionality not yet implemented',
query,
category,
limit,
results: [],
};
try {
const search = await getSearchInstance();
const options = category ? { category, limit } : { limit };
const results = await search.search(query, options);
return {
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
};
if (results.length === 0) {
return {
content: [
{
type: 'text',
text: `No results found for "${query}". Try different search terms or check if the documentation has been indexed.`,
},
],
};
}
// Format results for better readability
const formattedResults = results.map((result, index) => ({
rank: index + 1,
title: result.title,
description: result.description,
url: result.url,
category: result.category,
relevance: (result.score * 100).toFixed(1) + '%',
snippet: result.content,
keywords: result.keywords,
}));
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
query,
totalResults: results.length,
results: formattedResults,
},
null,
2
),
},
],
};
} catch (error) {
return {
content: [
{
type: 'text',
text: `Error searching documentation: ${error instanceof Error ? error.message : String(error)}`,
},
],
};
}
}
);
}
@ -51,15 +104,68 @@ function registerGetDocTool(server: McpServer): void {
},
},
async ({ path }) => {
// TODO: Implement actual document retrieval
const result = {
message: 'Document retrieval not yet implemented',
path,
};
try {
const search = await getSearchInstance();
const document = await search.getDocumentByPath(path);
return {
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
};
if (!document) {
return {
content: [
{
type: 'text',
text: `Document not found: ${path}. The path may be incorrect or the documentation has not been indexed.`,
},
],
};
}
// Parse stringified fields back to arrays
const breadcrumbs = document.breadcrumbs
? document.breadcrumbs.split(' > ').filter(Boolean)
: [];
const headings = document.headings
? document.headings.split(' | ').filter(Boolean)
: [];
const keywords = document.keywords
? document.keywords.split(', ').filter(Boolean)
: [];
const playgroundIds = document.playgroundIds
? document.playgroundIds.split(', ').filter(Boolean)
: [];
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
title: document.title,
description: document.description,
url: document.url,
category: document.category,
breadcrumbs,
content: document.content,
headings,
keywords,
playgroundIds,
lastModified: document.lastModified,
},
null,
2
),
},
],
};
} catch (error) {
return {
content: [
{
type: 'text',
text: `Error retrieving document: ${error instanceof Error ? error.message : String(error)}`,
},
],
};
}
}
);
}

View File

@ -0,0 +1,49 @@
import { describe, it, expect } from 'vitest';
import { DocumentParser } from './document-parser.js';
import path from 'path';
describe('DocumentParser', () => {
  const parser = new DocumentParser();
  const sampleFile = path.join(
    process.cwd(),
    'data/repositories/Documentation/content/features.md'
  );

  // Every case parses the same fixture; a small helper keeps each test terse.
  const parseSample = () => parser.parseFile(sampleFile);

  it('should parse YAML front matter', async () => {
    const { title, description, keywords } = await parseSample();
    expect(title).toBe('Babylon.js Features');
    expect(description).toContain('breadth and depth');
    expect(keywords).toContain('features');
    expect(keywords).toContain('capabilities');
  });

  it('should extract category from file path', async () => {
    const { category, breadcrumbs } = await parseSample();
    expect(category).toBe('features');
    expect(breadcrumbs).toEqual(['features']);
  });

  it('should extract headings', async () => {
    const { headings } = await parseSample();
    expect(headings.length).toBeGreaterThan(0);
    expect(headings[0]?.text).toBe('Babylon.js Features');
    expect(headings[0]?.level).toBe(1);
  });

  it('should have markdown content', async () => {
    const { content } = await parseSample();
    expect(content).toContain('Babylon.js Features');
    expect(content.length).toBeGreaterThan(0);
  });

  it('should extract file path and modified date', async () => {
    const doc = await parseSample();
    expect(doc.filePath).toBe(sampleFile);
    expect(doc.lastModified).toBeInstanceOf(Date);
  });
});

View File

@ -0,0 +1,99 @@
import matter from 'gray-matter';
import fs from 'fs/promises';
import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
export class DocumentParser {
  /**
   * Parse a markdown documentation file into structured metadata.
   *
   * Splits YAML front matter from the body with gray-matter, then derives
   * category/breadcrumbs from the path and headings/code blocks/playground
   * IDs from the markdown body.
   *
   * @param filePath absolute or repo-relative path to a `.md` file
   * @returns the parsed {@link DocumentMetadata}
   * @throws if the file cannot be read or stat'ed
   */
  async parseFile(filePath: string): Promise<DocumentMetadata> {
    const content = await fs.readFile(filePath, 'utf-8');
    const { data, content: markdown } = matter(content);

    return {
      filePath,
      title: data.title || '',
      description: data.description || '',
      keywords: this.parseKeywords(data.keywords),
      category: this.extractCategory(filePath),
      breadcrumbs: this.extractBreadcrumbs(filePath),
      content: markdown,
      headings: this.extractHeadings(markdown),
      codeBlocks: this.extractCodeBlocks(markdown),
      furtherReading: data['further-reading'] || [],
      playgroundIds: this.extractPlaygroundIds(markdown),
      lastModified: await this.getFileModifiedDate(filePath),
    };
  }

  /**
   * Normalize front-matter keywords to a string array.
   *
   * YAML front matter may deliver keywords either as a comma-separated
   * string or as a YAML list; the original only handled the string form and
   * crashed with "keywords.split is not a function" on lists.
   */
  private parseKeywords(keywords: unknown): string[] {
    if (!keywords) return [];
    if (Array.isArray(keywords)) {
      return keywords.map((k) => String(k).trim()).filter(Boolean);
    }
    return String(keywords)
      .split(',')
      .map((k) => k.trim())
      .filter(Boolean);
  }

  // Category is the path below the 'content/' directory without the '.md'
  // extension, e.g. '.../content/diving-deeper/mesh.md' -> 'diving-deeper/mesh'.
  // Backslashes are normalized first so Windows paths from path.join match too.
  private extractCategory(filePath: string): string {
    const normalized = filePath.replace(/\\/g, '/');
    const match = normalized.match(/content\/([^/]+(?:\/[^/]+)*)/);
    if (!match || !match[1]) return 'uncategorized';
    // Remove .md extension if present
    return match[1].replace(/\.md$/, '');
  }

  // Breadcrumbs are simply the category split on '/'.
  private extractBreadcrumbs(filePath: string): string[] {
    return this.extractCategory(filePath).split('/').filter(Boolean);
  }

  // Collects ATX headings ('#' .. '######'); setext headings are not detected.
  private extractHeadings(markdown: string): Heading[] {
    const headings: Heading[] = [];
    for (const line of markdown.split('\n')) {
      const match = line.trim().match(/^(#{1,6})\s+(.+)$/);
      if (match && match[1] && match[2]) {
        const text = match[2].trim();
        headings.push({
          level: match[1].length,
          text,
          // GitHub-style slug: drop punctuation, collapse spaces to hyphens.
          id: text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-'),
        });
      }
    }
    return headings;
  }

  // Extracts fenced code blocks (``` ... ```). lineStart is the 1-based
  // line number of the opening fence within the markdown body.
  private extractCodeBlocks(markdown: string): CodeBlock[] {
    const codeBlocks: CodeBlock[] = [];
    const regex = /```(\w+)?\n([\s\S]*?)```/g;
    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match[2] !== undefined) {
        codeBlocks.push({
          language: match[1] || 'plaintext',
          code: match[2].trim(),
          lineStart: markdown.substring(0, match.index).split('\n').length,
        });
      }
    }
    return codeBlocks;
  }

  // Snippet IDs from <Playground id="#SNIPPET"> tags; the leading '#' is
  // excluded from the captured ID.
  private extractPlaygroundIds(markdown: string): string[] {
    const ids: string[] = [];
    const regex = /<Playground\s+id=["']#([^"']+)["']/g;
    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match[1]) {
        ids.push(match[1]);
      }
    }
    return ids;
  }

  // File-system mtime is used as the document's lastModified timestamp.
  private async getFileModifiedDate(filePath: string): Promise<Date> {
    return (await fs.stat(filePath)).mtime;
  }
}

View File

@ -0,0 +1,218 @@
import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import { DocumentParser } from './document-parser.js';
import type { DocumentMetadata } from './types.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * Flat row shape stored in the LanceDB table.
 *
 * Array-valued metadata is serialized into delimited strings before storage
 * (see LanceDBIndexer.processDocument): keywords and playgroundIds are
 * ', '-joined, breadcrumbs are ' > '-joined, headings are ' | '-joined and
 * codeSnippets are '\n---\n'-joined. Consumers must split them back apart.
 */
export interface EmbeddedDocument {
id: string;            // unique id: `${source}_${relative_path_with_underscores}`
title: string;
description: string;
content: string;       // markdown body, truncated to 20k chars at index time
keywords: string;      // ', '-joined keyword list
category: string;      // path below content/, e.g. 'diving-deeper/mesh'
breadcrumbs: string;   // ' > '-joined breadcrumb trail
filePath: string;      // original on-disk path the document was parsed from
url: string;           // public URL (doc site or GitHub, per source)
source: string;        // DocumentSource.name this row came from
headings: string;      // ' | '-joined heading texts
codeSnippets: string;  // up to 3 code blocks, '\n---\n'-joined
playgroundIds: string; // ', '-joined playground snippet ids
lastModified: string;  // ISO-8601 timestamp of the file's mtime
vector: number[];      // embedding produced by Xenova/all-MiniLM-L6-v2
}
/** A documentation corpus to index: a local directory plus its public URL prefix. */
export interface DocumentSource {
name: string;      // short id, also used as the id/url-routing key (e.g. 'source-repo')
path: string;      // local root directory containing markdown files
urlPrefix: string; // base URL the relative path is appended to
}
/**
 * Walks one or more markdown documentation sources, embeds each file with a
 * local sentence-transformer, and (re)creates a LanceDB table of
 * {@link EmbeddedDocument} rows.
 */
export class LanceDBIndexer {
  // Untyped handles: @lancedb/lancedb and @xenova/transformers do not expose
  // stable TS types for these objects in this setup.
  private db: any;
  private embedder: any;
  private parser: DocumentParser;
  private readonly dbPath: string;
  private readonly tableName: string;
  private readonly sources: DocumentSource[];

  constructor(
    dbPath: string = './data/lancedb',
    sources: DocumentSource[] = [
      {
        name: 'documentation',
        path: './data/repositories/Documentation/content',
        urlPrefix: 'https://doc.babylonjs.com',
      },
    ],
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.sources = sources;
    this.tableName = tableName;
    this.parser = new DocumentParser();
  }

  /** Connects to LanceDB and loads the embedding model (slow on first run). */
  async initialize(): Promise<void> {
    console.log('Initializing LanceDB connection...');
    this.db = await connect(this.dbPath);
    console.log('Loading embedding model (this may take a moment)...');
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
    console.log('Embedding model loaded successfully');
  }

  /**
   * Indexes every markdown file of every configured source and rebuilds the
   * table. Individual file failures are logged and skipped so one bad
   * document cannot abort a full run.
   *
   * @throws if initialize() was not called first, or if no documents at all
   *         could be processed.
   */
  async indexDocuments(): Promise<void> {
    if (!this.db || !this.embedder) {
      throw new Error('Indexer not initialized. Call initialize() first.');
    }
    const allDocuments: EmbeddedDocument[] = [];
    // Process each documentation source
    for (const source of this.sources) {
      console.log(`\nProcessing source: ${source.name}`);
      console.log(`Path: ${source.path}`);
      console.log('Finding markdown files...');
      const markdownFiles = await this.findMarkdownFiles(source.path);
      console.log(`Found ${markdownFiles.length} markdown files in ${source.name}`);
      console.log('Parsing and embedding documents...');
      for (let i = 0; i < markdownFiles.length; i++) {
        const filePath = markdownFiles[i];
        if (!filePath) continue;
        try {
          allDocuments.push(await this.processDocument(filePath, source));
          // Progress heartbeat every 50 files.
          if ((i + 1) % 50 === 0) {
            console.log(`Processed ${i + 1}/${markdownFiles.length} documents from ${source.name}`);
          }
        } catch (error) {
          console.error(`Error processing ${filePath}:`, error);
        }
      }
      console.log(`✓ Completed ${source.name}: ${markdownFiles.length} files processed`);
    }
    console.log(`\nTotal documents processed: ${allDocuments.length}`);
    // createTable on an empty array fails deep inside LanceDB with an opaque
    // schema error; fail fast with actionable context instead.
    if (allDocuments.length === 0) {
      throw new Error('No documents were indexed; check the configured source paths.');
    }
    console.log('Creating LanceDB table...');
    // Drop existing table if it exists
    const tableNames = await this.db.tableNames();
    if (tableNames.includes(this.tableName)) {
      await this.db.dropTable(this.tableName);
    }
    // Create new table with embedded documents
    await this.db.createTable(this.tableName, allDocuments);
    console.log('Indexing complete!');
  }

  // Parses a file, embeds it, and flattens the metadata into a table row
  // (array fields are joined into delimited strings — see EmbeddedDocument).
  private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
    const metadata = await this.parser.parseFile(filePath);
    const embeddingText = this.createEmbeddingText(metadata);
    const vector = await this.generateEmbedding(embeddingText);
    return {
      id: this.generateDocId(filePath, source),
      title: metadata.title,
      description: metadata.description,
      content: this.truncateContent(metadata.content, 20000),
      keywords: metadata.keywords.join(', '),
      category: metadata.category,
      breadcrumbs: metadata.breadcrumbs.join(' > '),
      filePath: metadata.filePath,
      url: this.generateDocUrl(metadata, source),
      source: source.name,
      headings: metadata.headings.map(h => h.text).join(' | '),
      codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
      playgroundIds: metadata.playgroundIds.join(', '),
      lastModified: metadata.lastModified.toISOString(),
      vector,
    };
  }

  // Combine key fields for embedding - prioritize title, description, keywords.
  private createEmbeddingText(metadata: DocumentMetadata): string {
    const parts = [
      metadata.title,
      metadata.description,
      metadata.keywords.join(' '),
      metadata.headings.slice(0, 5).map(h => h.text).join(' '),
      this.truncateContent(metadata.content, 500),
    ];
    return parts.filter(Boolean).join(' ');
  }

  // Mean-pooled, normalized embedding vector for the given text.
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  // Recursively collects every '.md' file under dir.
  private async findMarkdownFiles(dir: string): Promise<string[]> {
    const files: string[] = [];
    const entries = await fs.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        files.push(...(await this.findMarkdownFiles(fullPath)));
      } else if (entry.isFile() && entry.name.endsWith('.md')) {
        files.push(fullPath);
      }
    }
    return files;
  }

  /**
   * Path of filePath relative to the source root, with '/' separators and no
   * '.md' extension. Replaces the original RegExp-from-path approach, which
   * escaped only '/' and therefore broke on regex metacharacters in paths
   * (e.g. the '.' in 'Babylon.js' or the leading './').
   */
  private relativeSourcePath(filePath: string, source: DocumentSource): string {
    return path
      .relative(source.path, filePath)
      .split(path.sep)
      .join('/')
      .replace(/\.md$/i, '');
  }

  // Stable document id: source name + relative path with '/' -> '_'.
  private generateDocId(filePath: string, source: DocumentSource): string {
    const relativePath = this.relativeSourcePath(filePath, source);
    return `${source.name}_${relativePath.replace(/\//g, '_')}`;
  }

  // For source-repo, use GitHub URL; for documentation, use doc site.
  private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
    const relativePath = this.relativeSourcePath(metadata.filePath, source);
    if (source.name === 'source-repo') {
      return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
    }
    return `${source.urlPrefix}/${relativePath}`;
  }

  // Hard cap on stored content so rows stay a bounded size.
  private truncateContent(content: string, maxLength: number): string {
    if (content.length <= maxLength) return content;
    return content.substring(0, maxLength) + '...';
  }

  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
    console.log('Indexer closed');
  }
}

View File

@ -0,0 +1,185 @@
import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import type { SearchOptions, SearchResult } from './types.js';
import type { EmbeddedDocument } from './lancedb-indexer.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * Read-side companion to LanceDBIndexer: semantic search and document
 * retrieval over the previously indexed table.
 */
export class LanceDBSearch {
  // Untyped handles: the lancedb/transformers packages do not expose stable
  // TS types for these objects in this setup.
  private db: any;
  private table: any;
  private embedder: any;
  private readonly dbPath: string;
  private readonly tableName: string;

  constructor(
    dbPath: string = './data/lancedb',
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.tableName = tableName;
  }

  /** Opens the table and loads the embedding model; must run before queries. */
  async initialize(): Promise<void> {
    this.db = await connect(this.dbPath);
    this.table = await this.db.openTable(this.tableName);
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
  }

  /**
   * Vector search for `query`, optionally filtered by category.
   *
   * @param query natural-language query text
   * @param options limit (default 5) and optional exact-match category filter
   * @returns ranked results with a 0..1 similarity score and a content snippet
   * @throws if initialize() has not been called
   */
  async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
    if (!this.table || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    const limit = options.limit || 5;
    const queryVector = await this.generateEmbedding(query);
    // Build the search query
    let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
    // Apply category filter if provided; the value is escaped so a quote in
    // user input cannot break out of the SQL-like string literal.
    if (options.category) {
      searchQuery = searchQuery.where(`category = '${this.escapeSql(options.category)}'`);
    }
    const results = await searchQuery.toArray();
    return results.map((doc: any) => ({
      title: doc.title,
      description: doc.description,
      content: this.extractRelevantSnippet(doc.content, query),
      url: doc.url,
      category: doc.category,
      // Convert distance to similarity. The original used a truthiness check
      // (`doc._distance ? ...`), which mapped a perfect match (distance 0)
      // to score 0 instead of 1.
      score: doc._distance != null ? 1 - doc._distance : 0,
      // Stored as a ', '-joined string; guard against a null/absent column.
      keywords: (doc.keywords ?? '').split(', ').filter(Boolean),
    }));
  }

  /**
   * Exact lookup by document id (non-vector query).
   * @returns the stored row, or null if no such id exists
   */
  async getDocument(docId: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    const results = await this.table
      .query()
      .where(`id = '${this.escapeSql(docId)}'`)
      .limit(1)
      .toArray();
    return results.length > 0 ? results[0] : null;
  }

  /**
   * Looks a document up by its public URL, falling back to a derived doc id.
   * When the backing markdown file is still present locally, its current
   * contents replace the (possibly stale, truncated) indexed content.
   */
  async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    // Try to find document by URL first
    const results = await this.table
      .query()
      .where(`url = '${this.escapeSql(filePath)}'`)
      .limit(1)
      .toArray();
    if (results.length > 0) {
      const doc = results[0];
      // Fetch fresh content from local file if available
      const freshContent = await this.fetchLocalContent(doc.filePath);
      return freshContent ? { ...doc, content: freshContent } : doc;
    }
    // If not found by URL, try by docId conversion
    return this.getDocument(this.pathToDocId(filePath));
  }

  // Doubles single quotes so a value can be safely embedded in a LanceDB
  // SQL-like where-clause string literal.
  private escapeSql(value: string): string {
    return value.replace(/'/g, "''");
  }

  // Tries the stored path plus each known local repository checkout;
  // returns null when the file is nowhere on disk.
  private async fetchLocalContent(filePath: string): Promise<string | null> {
    const possiblePaths = [
      filePath,
      path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
      path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
      path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
    ];
    for (const possiblePath of possiblePaths) {
      try {
        return await fs.readFile(possiblePath, 'utf-8');
      } catch {
        // Not at this location — continue to next candidate.
      }
    }
    return null;
  }

  // Mean-pooled, normalized embedding vector for the given text.
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  /**
   * Heuristic snippet extraction: slides a 50-word window over the content,
   * keeps the window with the most query-term hits, and returns ~60 words
   * around it (capped at `snippetLength` characters, ellipsized at cut edges).
   */
  private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
    const queryTerms = query.toLowerCase().split(/\s+/);
    let bestIndex = 0;
    let maxMatches = 0;
    // Find the position with most query term matches
    const words = content.split(/\s+/);
    for (let i = 0; i < words.length; i++) {
      const windowText = words.slice(i, i + 50).join(' ').toLowerCase();
      const matches = queryTerms.filter(term => windowText.includes(term)).length;
      if (matches > maxMatches) {
        maxMatches = matches;
        bestIndex = i;
      }
    }
    // Extract snippet around best match
    const start = Math.max(0, bestIndex - 10);
    const snippetWords = words.slice(start, start + 60);
    let snippet = snippetWords.join(' ');
    if (snippet.length > snippetLength) {
      snippet = snippet.substring(0, snippetLength) + '...';
    }
    if (start > 0) {
      snippet = '...' + snippet;
    }
    return snippet;
  }

  // Mirrors the indexer's id scheme: path below content/, '.md' stripped,
  // '/' replaced by '_'.
  private pathToDocId(filePath: string): string {
    return filePath
      .replace(/^.*\/content\//, '')
      .replace(/\.md$/, '')
      .replace(/\//g, '_');
  }

  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
  }
}

47
src/search/types.ts Normal file
View File

@ -0,0 +1,47 @@
/**
 * Fully structured result of parsing one markdown documentation file
 * (produced by DocumentParser.parseFile). Unlike the flat table row, array
 * fields here are real arrays.
 */
export interface DocumentMetadata {
filePath: string;          // on-disk path the document was parsed from
title: string;             // front-matter `title` ('' when absent)
description: string;       // front-matter `description` ('' when absent)
keywords: string[];        // front-matter keywords, split and trimmed
category: string;          // path below content/, e.g. 'diving-deeper/mesh'
breadcrumbs: string[];     // category split on '/'
content: string;           // markdown body without front matter
headings: Heading[];       // ATX headings found in the body
codeBlocks: CodeBlock[];   // fenced code blocks found in the body
furtherReading: RelatedLink[]; // front-matter `further-reading` entries
playgroundIds: string[];   // ids from <Playground id="#..."> tags
lastModified: Date;        // file mtime
}
/** One markdown heading (ATX style). */
export interface Heading {
level: number; // 1..6 — number of leading '#'
text: string;  // heading text, trimmed
id: string;    // slug: lowercased, punctuation stripped, spaces -> '-'
}
/** One fenced code block from the markdown body. */
export interface CodeBlock {
language: string;  // fence info string, or 'plaintext' when absent
code: string;      // block contents, trimmed
lineStart: number; // 1-based line of the opening fence
}
/** A related-documentation link from front matter. */
export interface RelatedLink {
title: string;
url: string;
}
/** Options accepted by LanceDBSearch.search. */
export interface SearchOptions {
limit?: number;     // max results (default 5)
category?: string;  // exact-match category filter
// NOTE(review): queryType is declared but not consulted by LanceDBSearch in
// this file — confirm whether it is wired up elsewhere or still planned.
queryType?: 'keyword' | 'semantic' | 'hybrid';
}
/** One ranked hit returned by LanceDBSearch.search. */
export interface SearchResult {
title: string;
description: string;
content: string;    // query-relevant snippet, not the full document
url: string;
category: string;
score: number;      // similarity in 0..1 (1 - vector distance)
keywords: string[];
}