Implement LanceDB-based search and document retrieval
- Add LanceDBSearch class for vector-based documentation search - Implement search() method with category filtering and relevance scoring - Add getDocumentByPath() with URL lookup and local file fetching - Fix getDocument() to use .query() instead of .search() for non-vector queries - Update handlers.ts to integrate LanceDBSearch with MCP tools - Parse stringified array fields (breadcrumbs, headings, keywords, playgroundIds) in get_babylon_doc - Fetch fresh content from local repositories (Documentation, Babylon.js, havok) - Add DocumentParser, LanceDBIndexer and related types for document processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a3e027ef02
commit
f56b92e76e
@ -1,5 +1,16 @@
|
|||||||
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
import { LanceDBSearch } from '../search/lancedb-search.js';
|
||||||
|
|
||||||
|
let searchInstance: LanceDBSearch | null = null;
|
||||||
|
|
||||||
|
async function getSearchInstance(): Promise<LanceDBSearch> {
|
||||||
|
if (!searchInstance) {
|
||||||
|
searchInstance = new LanceDBSearch();
|
||||||
|
await searchInstance.initialize();
|
||||||
|
}
|
||||||
|
return searchInstance;
|
||||||
|
}
|
||||||
|
|
||||||
export function setupHandlers(server: McpServer): void {
|
export function setupHandlers(server: McpServer): void {
|
||||||
registerSearchDocsTool(server);
|
registerSearchDocsTool(server);
|
||||||
@ -25,18 +36,60 @@ function registerSearchDocsTool(server: McpServer): void {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
async ({ query, category, limit = 5 }) => {
|
async ({ query, category, limit = 5 }) => {
|
||||||
// TODO: Implement actual search logic
|
try {
|
||||||
const result = {
|
const search = await getSearchInstance();
|
||||||
message: 'Search functionality not yet implemented',
|
const options = category ? { category, limit } : { limit };
|
||||||
query,
|
const results = await search.search(query, options);
|
||||||
category,
|
|
||||||
limit,
|
|
||||||
results: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
if (results.length === 0) {
|
||||||
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
|
return {
|
||||||
};
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `No results found for "${query}". Try different search terms or check if the documentation has been indexed.`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format results for better readability
|
||||||
|
const formattedResults = results.map((result, index) => ({
|
||||||
|
rank: index + 1,
|
||||||
|
title: result.title,
|
||||||
|
description: result.description,
|
||||||
|
url: result.url,
|
||||||
|
category: result.category,
|
||||||
|
relevance: (result.score * 100).toFixed(1) + '%',
|
||||||
|
snippet: result.content,
|
||||||
|
keywords: result.keywords,
|
||||||
|
}));
|
||||||
|
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: JSON.stringify(
|
||||||
|
{
|
||||||
|
query,
|
||||||
|
totalResults: results.length,
|
||||||
|
results: formattedResults,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Error searching documentation: ${error instanceof Error ? error.message : String(error)}`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -51,15 +104,68 @@ function registerGetDocTool(server: McpServer): void {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
async ({ path }) => {
|
async ({ path }) => {
|
||||||
// TODO: Implement actual document retrieval
|
try {
|
||||||
const result = {
|
const search = await getSearchInstance();
|
||||||
message: 'Document retrieval not yet implemented',
|
const document = await search.getDocumentByPath(path);
|
||||||
path,
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
if (!document) {
|
||||||
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
|
return {
|
||||||
};
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Document not found: ${path}. The path may be incorrect or the documentation has not been indexed.`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse stringified fields back to arrays
|
||||||
|
const breadcrumbs = document.breadcrumbs
|
||||||
|
? document.breadcrumbs.split(' > ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const headings = document.headings
|
||||||
|
? document.headings.split(' | ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const keywords = document.keywords
|
||||||
|
? document.keywords.split(', ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
const playgroundIds = document.playgroundIds
|
||||||
|
? document.playgroundIds.split(', ').filter(Boolean)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: JSON.stringify(
|
||||||
|
{
|
||||||
|
title: document.title,
|
||||||
|
description: document.description,
|
||||||
|
url: document.url,
|
||||||
|
category: document.category,
|
||||||
|
breadcrumbs,
|
||||||
|
content: document.content,
|
||||||
|
headings,
|
||||||
|
keywords,
|
||||||
|
playgroundIds,
|
||||||
|
lastModified: document.lastModified,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
return {
|
||||||
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Error retrieving document: ${error instanceof Error ? error.message : String(error)}`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
49
src/search/document-parser.test.ts
Normal file
49
src/search/document-parser.test.ts
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
import { DocumentParser } from './document-parser.js';
import path from 'path';

// Integration-style tests for DocumentParser: they parse a real markdown file
// from the locally checked-out Documentation repository, so they require
// `data/repositories/Documentation` to be cloned under the working directory
// and will fail if that fixture is missing.
describe('DocumentParser', () => {
  const parser = new DocumentParser();
  // Sample document known to carry YAML front matter (title, description,
  // keywords) and at least one level-1 heading.
  const sampleFile = path.join(
    process.cwd(),
    'data/repositories/Documentation/content/features.md'
  );

  it('should parse YAML front matter', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.title).toBe('Babylon.js Features');
    expect(doc.description).toContain('breadth and depth');
    // keywords is a comma-separated front-matter field parsed into an array.
    expect(doc.keywords).toContain('features');
    expect(doc.keywords).toContain('capabilities');
  });

  it('should extract category from file path', async () => {
    const doc = await parser.parseFile(sampleFile);

    // Category and breadcrumbs are derived from the path under `content/`,
    // not from front matter.
    expect(doc.category).toBe('features');
    expect(doc.breadcrumbs).toEqual(['features']);
  });

  it('should extract headings', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.headings.length).toBeGreaterThan(0);
    expect(doc.headings[0]?.text).toBe('Babylon.js Features');
    expect(doc.headings[0]?.level).toBe(1);
  });

  it('should have markdown content', async () => {
    const doc = await parser.parseFile(sampleFile);

    // `content` is the markdown body with front matter stripped.
    expect(doc.content).toContain('Babylon.js Features');
    expect(doc.content.length).toBeGreaterThan(0);
  });

  it('should extract file path and modified date', async () => {
    const doc = await parser.parseFile(sampleFile);

    expect(doc.filePath).toBe(sampleFile);
    // lastModified comes from fs.stat mtime, hence a Date instance.
    expect(doc.lastModified).toBeInstanceOf(Date);
  });
});
|
||||||
99
src/search/document-parser.ts
Normal file
99
src/search/document-parser.ts
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
import matter from 'gray-matter';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
|
||||||
|
|
||||||
|
export class DocumentParser {
|
||||||
|
async parseFile(filePath: string): Promise<DocumentMetadata> {
|
||||||
|
const content = await fs.readFile(filePath, 'utf-8');
|
||||||
|
const { data, content: markdown } = matter(content);
|
||||||
|
|
||||||
|
return {
|
||||||
|
filePath,
|
||||||
|
title: data.title || '',
|
||||||
|
description: data.description || '',
|
||||||
|
keywords: this.parseKeywords(data.keywords),
|
||||||
|
category: this.extractCategory(filePath),
|
||||||
|
breadcrumbs: this.extractBreadcrumbs(filePath),
|
||||||
|
content: markdown,
|
||||||
|
headings: this.extractHeadings(markdown),
|
||||||
|
codeBlocks: this.extractCodeBlocks(markdown),
|
||||||
|
furtherReading: data['further-reading'] || [],
|
||||||
|
playgroundIds: this.extractPlaygroundIds(markdown),
|
||||||
|
lastModified: await this.getFileModifiedDate(filePath),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private parseKeywords(keywords: string | undefined): string[] {
|
||||||
|
if (!keywords) return [];
|
||||||
|
return keywords.split(',').map((k) => k.trim()).filter(Boolean);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractCategory(filePath: string): string {
|
||||||
|
const match = filePath.match(/content\/([^/]+(?:\/[^/]+)*)/);
|
||||||
|
if (!match || !match[1]) return 'uncategorized';
|
||||||
|
|
||||||
|
// Remove .md extension if present
|
||||||
|
return match[1].replace(/\.md$/, '');
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractBreadcrumbs(filePath: string): string[] {
|
||||||
|
const category = this.extractCategory(filePath);
|
||||||
|
return category.split('/').filter(Boolean);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractHeadings(markdown: string): Heading[] {
|
||||||
|
const headings: Heading[] = [];
|
||||||
|
const lines = markdown.split('\n');
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
const trimmed = line.trim();
|
||||||
|
const match = trimmed.match(/^(#{1,6})\s+(.+)$/);
|
||||||
|
if (match && match[1] && match[2]) {
|
||||||
|
const level = match[1].length;
|
||||||
|
const text = match[2].trim();
|
||||||
|
const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
|
||||||
|
headings.push({ level, text, id });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return headings;
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractCodeBlocks(markdown: string): CodeBlock[] {
|
||||||
|
const codeBlocks: CodeBlock[] = [];
|
||||||
|
const regex = /```(\w+)?\n([\s\S]*?)```/g;
|
||||||
|
let match;
|
||||||
|
|
||||||
|
while ((match = regex.exec(markdown)) !== null) {
|
||||||
|
if (match.index !== undefined && match[2] !== undefined) {
|
||||||
|
const lineStart = markdown.substring(0, match.index).split('\n').length;
|
||||||
|
codeBlocks.push({
|
||||||
|
language: match[1] || 'plaintext',
|
||||||
|
code: match[2].trim(),
|
||||||
|
lineStart,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return codeBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractPlaygroundIds(markdown: string): string[] {
|
||||||
|
const ids: string[] = [];
|
||||||
|
const regex = /<Playground\s+id=["']#([^"']+)["']/g;
|
||||||
|
let match;
|
||||||
|
|
||||||
|
while ((match = regex.exec(markdown)) !== null) {
|
||||||
|
if (match[1]) {
|
||||||
|
ids.push(match[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async getFileModifiedDate(filePath: string): Promise<Date> {
|
||||||
|
const stats = await fs.stat(filePath);
|
||||||
|
return stats.mtime;
|
||||||
|
}
|
||||||
|
}
|
||||||
218
src/search/lancedb-indexer.ts
Normal file
218
src/search/lancedb-indexer.ts
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
import { connect } from '@lancedb/lancedb';
|
||||||
|
import { pipeline } from '@xenova/transformers';
|
||||||
|
import { DocumentParser } from './document-parser.js';
|
||||||
|
import type { DocumentMetadata } from './types.js';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
export interface EmbeddedDocument {
|
||||||
|
id: string;
|
||||||
|
title: string;
|
||||||
|
description: string;
|
||||||
|
content: string;
|
||||||
|
keywords: string;
|
||||||
|
category: string;
|
||||||
|
breadcrumbs: string;
|
||||||
|
filePath: string;
|
||||||
|
url: string;
|
||||||
|
source: string;
|
||||||
|
headings: string;
|
||||||
|
codeSnippets: string;
|
||||||
|
playgroundIds: string;
|
||||||
|
lastModified: string;
|
||||||
|
vector: number[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface DocumentSource {
|
||||||
|
name: string;
|
||||||
|
path: string;
|
||||||
|
urlPrefix: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export class LanceDBIndexer {
|
||||||
|
private db: any;
|
||||||
|
private embedder: any;
|
||||||
|
private parser: DocumentParser;
|
||||||
|
private readonly dbPath: string;
|
||||||
|
private readonly tableName: string;
|
||||||
|
private readonly sources: DocumentSource[];
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
dbPath: string = './data/lancedb',
|
||||||
|
sources: DocumentSource[] = [
|
||||||
|
{
|
||||||
|
name: 'documentation',
|
||||||
|
path: './data/repositories/Documentation/content',
|
||||||
|
urlPrefix: 'https://doc.babylonjs.com',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
tableName: string = 'babylon_docs'
|
||||||
|
) {
|
||||||
|
this.dbPath = dbPath;
|
||||||
|
this.sources = sources;
|
||||||
|
this.tableName = tableName;
|
||||||
|
this.parser = new DocumentParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
console.log('Initializing LanceDB connection...');
|
||||||
|
this.db = await connect(this.dbPath);
|
||||||
|
|
||||||
|
console.log('Loading embedding model (this may take a moment)...');
|
||||||
|
this.embedder = await pipeline(
|
||||||
|
'feature-extraction',
|
||||||
|
'Xenova/all-MiniLM-L6-v2'
|
||||||
|
);
|
||||||
|
console.log('Embedding model loaded successfully');
|
||||||
|
}
|
||||||
|
|
||||||
|
async indexDocuments(): Promise<void> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Indexer not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const allDocuments: EmbeddedDocument[] = [];
|
||||||
|
|
||||||
|
// Process each documentation source
|
||||||
|
for (const source of this.sources) {
|
||||||
|
console.log(`\nProcessing source: ${source.name}`);
|
||||||
|
console.log(`Path: ${source.path}`);
|
||||||
|
console.log('Finding markdown files...');
|
||||||
|
|
||||||
|
const markdownFiles = await this.findMarkdownFiles(source.path);
|
||||||
|
console.log(`Found ${markdownFiles.length} markdown files in ${source.name}`);
|
||||||
|
|
||||||
|
console.log('Parsing and embedding documents...');
|
||||||
|
|
||||||
|
for (let i = 0; i < markdownFiles.length; i++) {
|
||||||
|
const filePath = markdownFiles[i];
|
||||||
|
if (!filePath) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const doc = await this.processDocument(filePath, source);
|
||||||
|
allDocuments.push(doc);
|
||||||
|
|
||||||
|
if ((i + 1) % 50 === 0) {
|
||||||
|
console.log(`Processed ${i + 1}/${markdownFiles.length} documents from ${source.name}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error processing ${filePath}:`, error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`✓ Completed ${source.name}: ${markdownFiles.length} files processed`);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\nTotal documents processed: ${allDocuments.length}`);
|
||||||
|
console.log('Creating LanceDB table...');
|
||||||
|
|
||||||
|
// Drop existing table if it exists
|
||||||
|
const tableNames = await this.db.tableNames();
|
||||||
|
if (tableNames.includes(this.tableName)) {
|
||||||
|
await this.db.dropTable(this.tableName);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create new table with embedded documents
|
||||||
|
await this.db.createTable(this.tableName, allDocuments);
|
||||||
|
console.log('Indexing complete!');
|
||||||
|
}
|
||||||
|
|
||||||
|
private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
|
||||||
|
const metadata = await this.parser.parseFile(filePath);
|
||||||
|
const embeddingText = this.createEmbeddingText(metadata);
|
||||||
|
const vector = await this.generateEmbedding(embeddingText);
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: this.generateDocId(filePath, source),
|
||||||
|
title: metadata.title,
|
||||||
|
description: metadata.description,
|
||||||
|
content: this.truncateContent(metadata.content, 20000),
|
||||||
|
keywords: metadata.keywords.join(', '),
|
||||||
|
category: metadata.category,
|
||||||
|
breadcrumbs: metadata.breadcrumbs.join(' > '),
|
||||||
|
filePath: metadata.filePath,
|
||||||
|
url: this.generateDocUrl(metadata, source),
|
||||||
|
source: source.name,
|
||||||
|
headings: metadata.headings.map(h => h.text).join(' | '),
|
||||||
|
codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
|
||||||
|
playgroundIds: metadata.playgroundIds.join(', '),
|
||||||
|
lastModified: metadata.lastModified.toISOString(),
|
||||||
|
vector,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private createEmbeddingText(metadata: DocumentMetadata): string {
|
||||||
|
// Combine key fields for embedding - prioritize title, description, keywords
|
||||||
|
const parts = [
|
||||||
|
metadata.title,
|
||||||
|
metadata.description,
|
||||||
|
metadata.keywords.join(' '),
|
||||||
|
metadata.headings.slice(0, 5).map(h => h.text).join(' '),
|
||||||
|
this.truncateContent(metadata.content, 500),
|
||||||
|
];
|
||||||
|
return parts.filter(Boolean).join(' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbedding(text: string): Promise<number[]> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Embedder not initialized');
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.embedder(text, {
|
||||||
|
pooling: 'mean',
|
||||||
|
normalize: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
return Array.from(result.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async findMarkdownFiles(dir: string): Promise<string[]> {
|
||||||
|
const files: string[] = [];
|
||||||
|
const entries = await fs.readdir(dir, { withFileTypes: true });
|
||||||
|
|
||||||
|
for (const entry of entries) {
|
||||||
|
const fullPath = path.join(dir, entry.name);
|
||||||
|
if (entry.isDirectory()) {
|
||||||
|
const subFiles = await this.findMarkdownFiles(fullPath);
|
||||||
|
files.push(...subFiles);
|
||||||
|
} else if (entry.isFile() && entry.name.endsWith('.md')) {
|
||||||
|
files.push(fullPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return files;
|
||||||
|
}
|
||||||
|
|
||||||
|
private generateDocId(filePath: string, source: DocumentSource): string {
|
||||||
|
const basePath = source.path;
|
||||||
|
const relativePath = filePath
|
||||||
|
.replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
|
||||||
|
.replace(/\.md$/i, '')
|
||||||
|
.replace(/\//g, '_');
|
||||||
|
return `${source.name}_${relativePath}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
|
||||||
|
const basePath = source.path;
|
||||||
|
const relativePath = metadata.filePath
|
||||||
|
.replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
|
||||||
|
.replace(/\.md$/i, '');
|
||||||
|
|
||||||
|
// For source-repo, use GitHub URL; for documentation, use doc site
|
||||||
|
if (source.name === 'source-repo') {
|
||||||
|
return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return `${source.urlPrefix}/${relativePath}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private truncateContent(content: string, maxLength: number): string {
|
||||||
|
if (content.length <= maxLength) return content;
|
||||||
|
return content.substring(0, maxLength) + '...';
|
||||||
|
}
|
||||||
|
|
||||||
|
async close(): Promise<void> {
|
||||||
|
// LanceDB doesn't require explicit closing
|
||||||
|
console.log('Indexer closed');
|
||||||
|
}
|
||||||
|
}
|
||||||
185
src/search/lancedb-search.ts
Normal file
185
src/search/lancedb-search.ts
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
import { connect } from '@lancedb/lancedb';
|
||||||
|
import { pipeline } from '@xenova/transformers';
|
||||||
|
import type { SearchOptions, SearchResult } from './types.js';
|
||||||
|
import type { EmbeddedDocument } from './lancedb-indexer.js';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
export class LanceDBSearch {
|
||||||
|
private db: any;
|
||||||
|
private table: any;
|
||||||
|
private embedder: any;
|
||||||
|
private readonly dbPath: string;
|
||||||
|
private readonly tableName: string;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
dbPath: string = './data/lancedb',
|
||||||
|
tableName: string = 'babylon_docs'
|
||||||
|
) {
|
||||||
|
this.dbPath = dbPath;
|
||||||
|
this.tableName = tableName;
|
||||||
|
}
|
||||||
|
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
this.db = await connect(this.dbPath);
|
||||||
|
this.table = await this.db.openTable(this.tableName);
|
||||||
|
|
||||||
|
this.embedder = await pipeline(
|
||||||
|
'feature-extraction',
|
||||||
|
'Xenova/all-MiniLM-L6-v2'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
|
||||||
|
if (!this.table || !this.embedder) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const limit = options.limit || 5;
|
||||||
|
const queryVector = await this.generateEmbedding(query);
|
||||||
|
|
||||||
|
// Build the search query
|
||||||
|
let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
|
||||||
|
|
||||||
|
// Apply category filter if provided
|
||||||
|
if (options.category) {
|
||||||
|
searchQuery = searchQuery.where(`category = '${options.category}'`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const results = await searchQuery.toArray();
|
||||||
|
|
||||||
|
return results.map((doc: any) => ({
|
||||||
|
title: doc.title,
|
||||||
|
description: doc.description,
|
||||||
|
content: this.extractRelevantSnippet(doc.content, query),
|
||||||
|
url: doc.url,
|
||||||
|
category: doc.category,
|
||||||
|
score: doc._distance ? 1 - doc._distance : 0, // Convert distance to similarity score
|
||||||
|
keywords: doc.keywords.split(', ').filter(Boolean),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
async getDocument(docId: string): Promise<EmbeddedDocument | null> {
|
||||||
|
if (!this.table) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const results = await this.table
|
||||||
|
.query()
|
||||||
|
.where(`id = '${docId}'`)
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
return results.length > 0 ? results[0] : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
|
||||||
|
if (!this.table) {
|
||||||
|
throw new Error('Search not initialized. Call initialize() first.');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to find document by URL first
|
||||||
|
let results = await this.table
|
||||||
|
.query()
|
||||||
|
.where(`url = '${filePath}'`)
|
||||||
|
.limit(1)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
if (results.length > 0) {
|
||||||
|
const doc = results[0];
|
||||||
|
// Fetch fresh content from local file if available
|
||||||
|
const freshContent = await this.fetchLocalContent(doc.filePath);
|
||||||
|
if (freshContent) {
|
||||||
|
return { ...doc, content: freshContent };
|
||||||
|
}
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If not found by URL, try by docId conversion
|
||||||
|
const docId = this.pathToDocId(filePath);
|
||||||
|
return this.getDocument(docId);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async fetchLocalContent(filePath: string): Promise<string | null> {
|
||||||
|
try {
|
||||||
|
// Check if the file exists in our local repositories
|
||||||
|
const possiblePaths = [
|
||||||
|
filePath,
|
||||||
|
path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
|
||||||
|
path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
|
||||||
|
path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const possiblePath of possiblePaths) {
|
||||||
|
try {
|
||||||
|
const content = await fs.readFile(possiblePath, 'utf-8');
|
||||||
|
return content;
|
||||||
|
} catch {
|
||||||
|
// Continue to next path
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
} catch (error) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbedding(text: string): Promise<number[]> {
|
||||||
|
if (!this.embedder) {
|
||||||
|
throw new Error('Embedder not initialized');
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.embedder(text, {
|
||||||
|
pooling: 'mean',
|
||||||
|
normalize: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
return Array.from(result.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
|
||||||
|
// Simple snippet extraction - find first occurrence of query terms
|
||||||
|
const queryTerms = query.toLowerCase().split(/\s+/);
|
||||||
|
|
||||||
|
let bestIndex = 0;
|
||||||
|
let maxMatches = 0;
|
||||||
|
|
||||||
|
// Find the position with most query term matches
|
||||||
|
const words = content.split(/\s+/);
|
||||||
|
for (let i = 0; i < words.length; i++) {
|
||||||
|
const windowText = words.slice(i, i + 50).join(' ').toLowerCase();
|
||||||
|
const matches = queryTerms.filter(term => windowText.includes(term)).length;
|
||||||
|
if (matches > maxMatches) {
|
||||||
|
maxMatches = matches;
|
||||||
|
bestIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract snippet around best match
|
||||||
|
const start = Math.max(0, bestIndex - 10);
|
||||||
|
const snippetWords = words.slice(start, start + 60);
|
||||||
|
let snippet = snippetWords.join(' ');
|
||||||
|
|
||||||
|
if (snippet.length > snippetLength) {
|
||||||
|
snippet = snippet.substring(0, snippetLength) + '...';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start > 0) {
|
||||||
|
snippet = '...' + snippet;
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippet;
|
||||||
|
}
|
||||||
|
|
||||||
|
private pathToDocId(filePath: string): string {
|
||||||
|
return filePath
|
||||||
|
.replace(/^.*\/content\//, '')
|
||||||
|
.replace(/\.md$/, '')
|
||||||
|
.replace(/\//g, '_');
|
||||||
|
}
|
||||||
|
|
||||||
|
async close(): Promise<void> {
|
||||||
|
// LanceDB doesn't require explicit closing
|
||||||
|
}
|
||||||
|
}
|
||||||
47
src/search/types.ts
Normal file
47
src/search/types.ts
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/**
 * Fully structured metadata for one parsed markdown documentation file,
 * as produced by DocumentParser.parseFile.
 */
export interface DocumentMetadata {
  // Path the file was read from (as passed to parseFile).
  filePath: string;
  // Front matter `title` ('' when absent).
  title: string;
  // Front matter `description` ('' when absent).
  description: string;
  // Front matter `keywords` parsed into individual terms.
  keywords: string[];
  // Derived from the path under `content/`, e.g. 'features' or 'divingDeeper/physics'.
  category: string;
  // The category path split on '/'.
  breadcrumbs: string[];
  // Markdown body with front matter stripped.
  content: string;
  // ATX headings found in the body, in document order.
  headings: Heading[];
  // Fenced code blocks found in the body, in document order.
  codeBlocks: CodeBlock[];
  // Front matter `further-reading` entries ([] when absent).
  furtherReading: RelatedLink[];
  // Ids captured from <Playground id="#..."> embeds (without the '#').
  playgroundIds: string[];
  // Filesystem mtime of the source file.
  lastModified: Date;
}

/** One markdown heading. */
export interface Heading {
  // ATX level 1-6 (number of '#' characters).
  level: number;
  // Heading text with the '#' markers and surrounding whitespace removed.
  text: string;
  // Anchor-style slug derived from `text` (lowercased, hyphenated).
  id: string;
}

/** One fenced code block from a markdown body. */
export interface CodeBlock {
  // Info-string language tag, or 'plaintext' when the fence had none.
  language: string;
  // Code between the fences, trimmed.
  code: string;
  // 1-based line number where the opening fence appears.
  lineStart: number;
}

/** A `further-reading` front matter entry. */
export interface RelatedLink {
  title: string;
  url: string;
}

/** Options accepted by LanceDBSearch.search. */
export interface SearchOptions {
  // Maximum number of results (default 5).
  limit?: number;
  // Exact-match filter on the document category.
  category?: string;
  // Reserved for future query modes; currently unused by LanceDBSearch.
  queryType?: 'keyword' | 'semantic' | 'hybrid';
}

/** One search hit returned by LanceDBSearch.search. */
export interface SearchResult {
  title: string;
  description: string;
  // Relevant snippet extracted from the document body (not the full text).
  content: string;
  url: string;
  category: string;
  // Similarity score in 0..1 (1 - vector distance).
  score: number;
  keywords: string[];
}
|
||||||
Loading…
Reference in New Issue
Block a user