babylon-mcp/src/search/lancedb-search.ts
Michael Mainguy 779fa53363 Add source code indexing and search with comprehensive documentation
Features:
- Implemented SourceCodeIndexer class for indexing TypeScript/JavaScript source files
  - Chunks large files into 200-line segments with 20-line overlap
  - Extracts imports, exports, and metadata
  - Generates semantic embeddings using Xenova/all-MiniLM-L6-v2
  - Creates GitHub URLs with line numbers for easy navigation

- Enhanced LanceDBSearch with source code search capabilities
  - Added searchSourceCode() method for semantic source code search
  - Added getSourceFile() method for retrieving specific files or line ranges
  - Supports package filtering and configurable table names
  - Fixed score calculation to ensure values between 0-100%

- Added two new MCP tools
  - search_babylon_source: Search Babylon.js source code with semantic search
  - get_babylon_source: Retrieve full source files or specific line ranges
  - Both tools include comprehensive error handling and JSON responses

- Created indexing and testing scripts
  - scripts/index-source.ts: Production script for indexing all packages
  - scripts/test-source-indexing.ts: Test script for core package only
  - scripts/test-source-search.ts: Test script for search functionality

- Updated package.json with comprehensive indexing commands
  - npm run index:docs - Index documentation only
  - npm run index:api - Index API documentation only
  - npm run index:source - Index source code only
  - npm run index:all - Master script to index everything

- Created comprehensive README.md
  - Complete setup and installation instructions
  - Claude Desktop integration guide with configuration examples
  - Documentation of all 5 MCP tools with parameters and examples
  - Project structure, development commands, and troubleshooting guide
  - Architecture overview and disk space requirements

Testing:
- All 118 tests passing
- TypeScript compilation successful
- Source code search verified with real queries
- Successfully indexed 1,561 files into 5,650 searchable chunks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 06:34:00 -06:00

256 lines
7.5 KiB
TypeScript

import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import type { SearchOptions, SearchResult } from './types.js';
import type { EmbeddedDocument } from './lancedb-indexer.js';
import type { EmbeddedApiDoc } from './api-indexer.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * Semantic search over LanceDB tables holding Babylon.js documentation,
 * API documentation and source-code chunks.
 *
 * Call `initialize()` once before using any search method: it connects to the
 * database, opens the default documentation table and loads the embedding
 * model used to turn query text into vectors.
 */
export class LanceDBSearch {
  /** Result count used when the caller supplies no usable `limit`. */
  private static readonly DEFAULT_LIMIT = 5;
  /** Table queried by `searchApi()` unless overridden. */
  private static readonly DEFAULT_API_TABLE = 'babylon_api';
  /** Table queried by `searchSourceCode()` unless overridden. */
  private static readonly DEFAULT_SOURCE_TABLE = 'babylon_source_code';
  /** Directory that `getSourceFile()` paths are resolved against. */
  private static readonly SOURCE_REPO_ROOT = './data/repositories/Babylon.js';
  /** Number of consecutive words scored together when locating a snippet. */
  private static readonly SNIPPET_WINDOW_WORDS = 50;
  /** Words of leading context kept before the best-matching window. */
  private static readonly SNIPPET_LEAD_WORDS = 10;
  /** Maximum number of words taken for a snippet before character truncation. */
  private static readonly SNIPPET_MAX_WORDS = 60;

  // The database, table and embedding pipeline handles are left untyped;
  // they are only populated by initialize().
  private db: any;
  private table: any;
  private embedder: any;
  private readonly dbPath: string;
  private readonly tableName: string;

  /**
   * @param dbPath - Location of the LanceDB database.
   * @param tableName - Table opened by `initialize()` and used by `search()`,
   *   `getDocument()` and `getDocumentByPath()`.
   */
  constructor(
    dbPath: string = './data/lancedb',
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.tableName = tableName;
  }

  /**
   * Connects to the database, opens the default table and loads the
   * `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline.
   */
  async initialize(): Promise<void> {
    this.db = await connect(this.dbPath);
    this.table = await this.db.openTable(this.tableName);
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
  }

  /**
   * Vector search over the default documentation table.
   *
   * @param query - Free-text query; it is embedded and compared to stored vectors.
   * @param options - `limit` (defaults to 5 when missing or not a positive
   *   number) and an optional exact-match `category` filter.
   * @returns Matching documents with a query-focused content snippet and a
   *   similarity score clamped to the range 0..1.
   * @throws Error if `initialize()` has not completed.
   */
  async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
    if (!this.table || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const limit = LanceDBSearch.resolveLimit(options.limit);
    const queryVector = await this.generateEmbedding(query);

    let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
    if (options.category) {
      // The value is quoted/escaped so a quote in the category cannot alter the filter.
      searchQuery = searchQuery.where(
        `category = ${LanceDBSearch.sqlString(options.category)}`
      );
    }

    const results = await searchQuery.toArray();
    return results.map((doc: any) => ({
      title: doc.title,
      description: doc.description,
      content: this.extractRelevantSnippet(doc.content, query),
      url: doc.url,
      category: doc.category,
      score: LanceDBSearch.distanceToScore(doc._distance),
      keywords: LanceDBSearch.parseKeywords(doc.keywords),
    }));
  }

  /**
   * Vector search over the API documentation table.
   *
   * @param query - Free-text query.
   * @param options - `limit` (defaults to 5) and an optional `tableName`
   *   (defaults to `babylon_api`; e.g. `babylon_api_test` for testing).
   * @returns The stored rows, each extended with a similarity `score` in 0..1.
   * @throws Error if `initialize()` has not completed.
   */
  async searchApi(
    query: string,
    options: { limit?: number; tableName?: string } = {}
  ): Promise<Array<EmbeddedApiDoc & { score: number }>> {
    if (!this.db || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const limit = LanceDBSearch.resolveLimit(options.limit);
    const tableName = options.tableName || LanceDBSearch.DEFAULT_API_TABLE;
    const queryVector = await this.generateEmbedding(query);

    const apiTable = await this.db.openTable(tableName);
    const results = await apiTable
      .vectorSearch(queryVector)
      .limit(limit)
      .toArray();

    return results.map((doc: any) => ({
      ...doc,
      score: LanceDBSearch.distanceToScore(doc._distance),
    }));
  }

  /**
   * Looks up a single document in the default table by its `id` column.
   *
   * @returns The first matching row, or `null` when none exists.
   * @throws Error if `initialize()` has not completed.
   */
  async getDocument(docId: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const results = await this.table
      .query()
      .where(`id = ${LanceDBSearch.sqlString(docId)}`)
      .limit(1)
      .toArray();

    return results.length > 0 ? results[0] : null;
  }

  /**
   * Looks up a document by its `url` column; when that finds nothing, the
   * path is converted to a document id and looked up via `getDocument()`.
   *
   * For a URL match, the stored content is replaced by the current content of
   * the local file when that file can be read.
   *
   * @throws Error if `initialize()` has not completed.
   */
  async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const results = await this.table
      .query()
      .where(`url = ${LanceDBSearch.sqlString(filePath)}`)
      .limit(1)
      .toArray();

    if (results.length > 0) {
      const doc = results[0];
      const freshContent = await this.fetchLocalContent(doc.filePath);
      if (freshContent) {
        return { ...doc, content: freshContent };
      }
      return doc;
    }

    return this.getDocument(this.pathToDocId(filePath));
  }

  /**
   * Tries to read a document from disk: first at `filePath` as given, then
   * relative to each local repository checkout.
   *
   * @returns The content of the first readable candidate, or `null` when the
   *   path is missing/empty or no candidate can be read.
   */
  private async fetchLocalContent(filePath: string): Promise<string | null> {
    if (typeof filePath !== 'string' || filePath === '') {
      return null;
    }

    const possiblePaths = [
      filePath,
      path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
      path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
      path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
    ];

    for (const possiblePath of possiblePaths) {
      try {
        return await fs.readFile(possiblePath, 'utf-8');
      } catch {
        // Unreadable candidate: fall through to the next one.
      }
    }
    return null;
  }

  /**
   * Embeds `text` with the loaded pipeline using mean pooling and normalization.
   *
   * @throws Error if the embedder has not been loaded.
   */
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  /**
   * Picks the part of `content` that best matches the query.
   *
   * Every 50-word window is scored by how many query terms occur in it
   * (case-insensitive substring match); the earliest window with the highest
   * score wins. The snippet starts 10 words before that window, spans at most
   * 60 words, is cut to `snippetLength` characters (with a trailing `...`) and
   * gets a leading `...` when it does not start at the beginning.
   *
   * @returns The snippet, or an empty string when `content` is missing or empty.
   */
  private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
    if (typeof content !== 'string' || content === '') {
      return '';
    }

    const words = content.split(/\s+/);
    // Empty terms would match every window equally, so they carry no signal.
    const queryTerms = query.toLowerCase().split(/\s+/).filter(Boolean);

    let bestIndex = 0;
    if (queryTerms.length > 0) {
      // Lower-case once instead of once per window.
      const lowerWords = words.map((word) => word.toLowerCase());
      let maxMatches = 0;
      for (let i = 0; i < lowerWords.length; i++) {
        const windowText = lowerWords
          .slice(i, i + LanceDBSearch.SNIPPET_WINDOW_WORDS)
          .join(' ');
        const matches = queryTerms.filter((term) => windowText.includes(term)).length;
        if (matches > maxMatches) {
          maxMatches = matches;
          bestIndex = i;
          // No later window can score strictly higher than "all terms matched".
          if (matches === queryTerms.length) {
            break;
          }
        }
      }
    }

    const start = Math.max(0, bestIndex - LanceDBSearch.SNIPPET_LEAD_WORDS);
    let snippet = words
      .slice(start, start + LanceDBSearch.SNIPPET_MAX_WORDS)
      .join(' ');

    if (snippet.length > snippetLength) {
      snippet = snippet.substring(0, snippetLength) + '...';
    }
    if (start > 0) {
      snippet = '...' + snippet;
    }
    return snippet;
  }

  /**
   * Converts a documentation path to a document id: everything up to and
   * including `/content/` and a trailing `.md` are removed, and the remaining
   * slashes become underscores.
   */
  private pathToDocId(filePath: string): string {
    return filePath
      .replace(/^.*\/content\//, '')
      .replace(/\.md$/, '')
      .replace(/\//g, '_');
  }

  /**
   * Vector search over the source-code table.
   *
   * @param query - Free-text query.
   * @param options - `limit` (defaults to 5), optional exact-match `package`
   *   filter and optional `tableName` (defaults to `babylon_source_code`).
   * @returns The stored rows, each extended with a similarity `score` in 0..1.
   * @throws Error if `initialize()` has not completed.
   */
  async searchSourceCode(
    query: string,
    options: { package?: string; limit?: number; tableName?: string } = {}
  ): Promise<Array<any & { score: number }>> {
    if (!this.db || !this.embedder) {
      throw new Error('Search not initialized');
    }

    const limit = LanceDBSearch.resolveLimit(options.limit);
    const tableName = options.tableName || LanceDBSearch.DEFAULT_SOURCE_TABLE;
    const queryVector = await this.generateEmbedding(query);

    const sourceTable = await this.db.openTable(tableName);
    let searchQuery = sourceTable.vectorSearch(queryVector).limit(limit);
    if (options.package) {
      searchQuery = searchQuery.where(
        `package = ${LanceDBSearch.sqlString(options.package)}`
      );
    }

    const results = await searchQuery.toArray();
    return results.map((doc: any) => ({
      ...doc,
      score: LanceDBSearch.distanceToScore(doc._distance),
    }));
  }

  /**
   * Reads a file from the local Babylon.js checkout, optionally restricted to
   * a 1-based, inclusive line range.
   *
   * @param filePath - Path relative to `./data/repositories/Babylon.js`. Paths
   *   that resolve outside that directory are rejected.
   * @param startLine - First line to return; defaults to 1 and is clamped to 1.
   * @param endLine - Last line to return; defaults to the end of the file.
   * @returns The requested text, or `null` when the path is rejected or the
   *   file cannot be read (the failure is logged with `console.error`).
   */
  async getSourceFile(
    filePath: string,
    startLine?: number,
    endLine?: number
  ): Promise<string | null> {
    try {
      const fullPath = path.join(LanceDBSearch.SOURCE_REPO_ROOT, filePath);

      // Reject "../" traversal: the joined path must stay inside the checkout.
      const relative = path.relative(LanceDBSearch.SOURCE_REPO_ROOT, fullPath);
      if (
        relative === '..' ||
        relative.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relative)
      ) {
        throw new Error('Path resolves outside the source repository');
      }

      const content = await fs.readFile(fullPath, 'utf-8');
      if (startLine == null && endLine == null) {
        return content;
      }

      const lines = content.split('\n');
      const first =
        typeof startLine === 'number' && Number.isFinite(startLine)
          ? Math.max(1, Math.floor(startLine))
          : 1;
      const last =
        typeof endLine === 'number' && Number.isFinite(endLine)
          ? Math.floor(endLine)
          : lines.length;

      // An end before the start yields an empty range rather than a negative slice.
      return lines.slice(first - 1, Math.max(first - 1, last)).join('\n');
    } catch (error) {
      console.error(`Error reading source file ${filePath}:`, error);
      return null;
    }
  }

  /** No-op: nothing is released here. */
  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
  }

  /**
   * Returns `limit` floored to an integer when it is a finite number >= 1,
   * otherwise the default of 5.
   */
  private static resolveLimit(limit: unknown): number {
    if (typeof limit === 'number' && Number.isFinite(limit) && limit >= 1) {
      return Math.floor(limit);
    }
    return LanceDBSearch.DEFAULT_LIMIT;
  }

  /**
   * Converts a vector distance to a similarity score as `1 - distance`,
   * clamped to 0..1. A distance of exactly 0 scores 1; a missing or
   * non-numeric distance scores 0.
   */
  private static distanceToScore(distance: unknown): number {
    if (typeof distance !== 'number' || Number.isNaN(distance)) {
      return 0;
    }
    return Math.min(1, Math.max(0, 1 - distance));
  }

  /**
   * Wraps a value in single quotes for use in a filter expression, doubling
   * any embedded single quote so the value cannot terminate the literal.
   */
  private static sqlString(value: unknown): string {
    return `'${String(value).replace(/'/g, "''")}'`;
  }

  /**
   * Splits a `', '`-separated keyword string into its non-empty parts.
   * Anything that is not a string yields an empty list.
   */
  private static parseKeywords(raw: unknown): string[] {
    if (typeof raw !== 'string') {
      return [];
    }
    return raw.split(', ').filter(Boolean);
  }
}