Features: - Implemented SourceCodeIndexer class for indexing TypeScript/JavaScript source files - Chunks large files into 200-line segments with 20-line overlap - Extracts imports, exports, and metadata - Generates semantic embeddings using Xenova/all-MiniLM-L6-v2 - Creates GitHub URLs with line numbers for easy navigation - Enhanced LanceDBSearch with source code search capabilities - Added searchSourceCode() method for semantic source code search - Added getSourceFile() method for retrieving specific files or line ranges - Supports package filtering and configurable table names - Fixed score calculation to ensure values between 0-100% - Added two new MCP tools - search_babylon_source: Search Babylon.js source code with semantic search - get_babylon_source: Retrieve full source files or specific line ranges - Both tools include comprehensive error handling and JSON responses - Created indexing and testing scripts - scripts/index-source.ts: Production script for indexing all packages - scripts/test-source-indexing.ts: Test script for core package only - scripts/test-source-search.ts: Test script for search functionality - Updated package.json with comprehensive indexing commands - npm run index:docs - Index documentation only - npm run index:api - Index API documentation only - npm run index:source - Index source code only - npm run index:all - Master script to index everything - Created comprehensive README.md - Complete setup and installation instructions - Claude Desktop integration guide with configuration examples - Documentation of all 5 MCP tools with parameters and examples - Project structure, development commands, and troubleshooting guide - Architecture overview and disk space requirements Testing: - All 118 tests passing - TypeScript compilation successful - Source code search verified with real queries - Successfully indexed 1,561 files into 5,650 searchable chunks 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude 
<noreply@anthropic.com>
256 lines
7.5 KiB
TypeScript
256 lines
7.5 KiB
TypeScript
import { connect } from '@lancedb/lancedb';
|
|
import { pipeline } from '@xenova/transformers';
|
|
import type { SearchOptions, SearchResult } from './types.js';
|
|
import type { EmbeddedDocument } from './lancedb-indexer.js';
|
|
import type { EmbeddedApiDoc } from './api-indexer.js';
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
|
|
/**
 * Semantic search over LanceDB tables.
 *
 * Queries are embedded with the `Xenova/all-MiniLM-L6-v2` feature-extraction
 * pipeline and matched against the stored vectors with a LanceDB vector
 * search. Besides the default documentation table, the class can query an API
 * table and a source-code table, and can read source files from the local
 * repository checkout under `./data/repositories`.
 *
 * `initialize()` must be awaited before any search or lookup method is used.
 */
export class LanceDBSearch {
  /** Result count used when the caller does not supply a usable `limit`. */
  private static readonly DEFAULT_LIMIT = 5;

  /** Table opened by `searchApi` when no `tableName` option is given. */
  private static readonly DEFAULT_API_TABLE = 'babylon_api';

  /** Table opened by `searchSourceCode` when no `tableName` option is given. */
  private static readonly DEFAULT_SOURCE_TABLE = 'babylon_source_code';

  /** Directory that `getSourceFile` paths are resolved against. */
  private static readonly SOURCE_ROOT = './data/repositories/Babylon.js';

  private db: any;
  private table: any;
  private embedder: any;
  private readonly dbPath: string;
  private readonly tableName: string;

  /**
   * @param dbPath - Location passed to LanceDB `connect`.
   * @param tableName - Table opened by `initialize()` and used by `search`,
   *   `getDocument` and `getDocumentByPath`.
   */
  constructor(
    dbPath: string = './data/lancedb',
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.tableName = tableName;
  }

  /**
   * Connects to the database, opens the default table and loads the
   * embedding pipeline.
   */
  async initialize(): Promise<void> {
    this.db = await connect(this.dbPath);
    this.table = await this.db.openTable(this.tableName);

    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
  }

  /**
   * Runs a vector search against the default table.
   *
   * @param query - Free-text query; it is embedded before searching.
   * @param options - `limit` (defaults to 5) and an optional `category`
   *   equality filter.
   * @returns One result per matching row, with a snippet of the row content
   *   and a score in the range 0..1.
   * @throws Error when `initialize()` has not completed.
   */
  async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
    if (!this.table || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    // `||` is deliberate: a limit of 0 falls back to the default.
    const limit = options.limit || LanceDBSearch.DEFAULT_LIMIT;
    const queryVector = await this.generateEmbedding(query);

    let searchQuery = this.table.vectorSearch(queryVector).limit(limit);

    if (options.category) {
      // The value is quoted through sqlString so an apostrophe in the
      // category cannot terminate the literal and alter the filter.
      searchQuery = searchQuery.where(
        `category = ${LanceDBSearch.sqlString(options.category)}`
      );
    }

    const results = await searchQuery.toArray();

    return results.map((doc: any) => ({
      title: doc.title,
      description: doc.description,
      content: this.extractRelevantSnippet(doc.content, query),
      url: doc.url,
      category: doc.category,
      score: LanceDBSearch.distanceToScore(doc._distance),
      keywords: LanceDBSearch.parseKeywords(doc.keywords),
    }));
  }

  /**
   * Runs a vector search against the API documentation table.
   *
   * @param query - Free-text query; it is embedded before searching.
   * @param options - `limit` (defaults to 5) and `tableName` (defaults to
   *   `babylon_api`; pass another name such as a test table to override).
   * @returns The matching rows, each extended with a score in the range 0..1.
   * @throws Error when `initialize()` has not completed.
   */
  async searchApi(
    query: string,
    options: { limit?: number; tableName?: string } = {}
  ): Promise<Array<EmbeddedApiDoc & { score: number }>> {
    if (!this.db || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const limit = options.limit || LanceDBSearch.DEFAULT_LIMIT;
    const tableName = options.tableName || LanceDBSearch.DEFAULT_API_TABLE;
    const queryVector = await this.generateEmbedding(query);

    const apiTable = await this.db.openTable(tableName);
    const results = await apiTable
      .vectorSearch(queryVector)
      .limit(limit)
      .toArray();

    return results.map((doc: any) => ({
      ...doc,
      score: LanceDBSearch.distanceToScore(doc._distance),
    }));
  }

  /**
   * Looks up a single row of the default table by its `id` column.
   *
   * @returns The first matching row, or `null` when there is none.
   * @throws Error when `initialize()` has not completed.
   */
  async getDocument(docId: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const results = await this.table
      .query()
      .where(`id = ${LanceDBSearch.sqlString(docId)}`)
      .limit(1)
      .toArray();

    return results.length > 0 ? results[0] : null;
  }

  /**
   * Looks up a row of the default table by URL, falling back to an id derived
   * from the path.
   *
   * When the row is found by URL and the file named by its `filePath` field
   * can be read locally, the returned row carries that file's current text as
   * `content` instead of the indexed copy.
   *
   * @returns The matching row, or `null` when neither lookup finds one.
   * @throws Error when `initialize()` has not completed.
   */
  async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }

    const results = await this.table
      .query()
      .where(`url = ${LanceDBSearch.sqlString(filePath)}`)
      .limit(1)
      .toArray();

    if (results.length > 0) {
      const doc = results[0];
      const freshContent = await this.fetchLocalContent(doc.filePath);
      if (freshContent) {
        return { ...doc, content: freshContent };
      }
      return doc;
    }

    // Not found by URL: convert the path to a document id and try again.
    return this.getDocument(this.pathToDocId(filePath));
  }

  /**
   * Tries to read a file from disk, first at the given path and then at the
   * equivalent location inside each local repository directory.
   *
   * @returns The text of the first candidate that can be read, or `null`
   *   when the path is missing or no candidate is readable.
   */
  private async fetchLocalContent(filePath: string): Promise<string | null> {
    // Rows without a usable file path have nothing to read from disk.
    if (typeof filePath !== 'string' || filePath === '') {
      return null;
    }

    const possiblePaths = [
      filePath,
      path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
      path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
      path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
    ];

    for (const possiblePath of possiblePaths) {
      try {
        return await fs.readFile(possiblePath, 'utf-8');
      } catch {
        // Unreadable candidate (missing file, directory, ...): try the next.
      }
    }

    return null;
  }

  /**
   * Embeds text with the loaded pipeline using mean pooling and
   * normalization.
   *
   * @throws Error when the embedding pipeline has not been loaded.
   */
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }

    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });

    return Array.from(result.data);
  }

  /**
   * Picks the part of `content` that best matches the query.
   *
   * Slides a 50-word window over the content, counts how many query terms
   * occur (as substrings, case-insensitively) in each window and keeps the
   * first window with the highest count. The snippet starts up to 10 words
   * before that window, spans at most 60 words, and is cut to
   * `snippetLength` characters. `...` marks truncation at either end.
   *
   * @param content - Text to extract from; a non-string value is treated as
   *   empty text.
   * @param query - Whitespace-separated search terms.
   * @param snippetLength - Maximum snippet length in characters, excluding
   *   the `...` markers.
   */
  private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
    const text = typeof content === 'string' ? content : '';
    // Empty terms (from leading/trailing whitespace) match every window and
    // therefore carry no information; drop them.
    const queryTerms = query.toLowerCase().split(/\s+/).filter(Boolean);

    const words = text.split(/\s+/);
    // Lower-case once up front instead of once per window.
    const lowerWords = words.map((word) => word.toLowerCase());

    let bestIndex = 0;
    let maxMatches = 0;

    for (let i = 0; i < lowerWords.length; i++) {
      const windowText = lowerWords.slice(i, i + 50).join(' ');
      const matches = queryTerms.filter((term) => windowText.includes(term)).length;
      // Strict comparison keeps the earliest window among equally good ones.
      if (matches > maxMatches) {
        maxMatches = matches;
        bestIndex = i;
      }
    }

    const start = Math.max(0, bestIndex - 10);
    let snippet = words.slice(start, start + 60).join(' ');

    if (snippet.length > snippetLength) {
      snippet = snippet.substring(0, snippetLength) + '...';
    }

    if (start > 0) {
      snippet = '...' + snippet;
    }

    return snippet;
  }

  /**
   * Converts a documentation file path to a document id: everything up to
   * and including `/content/` is dropped, a trailing `.md` is removed and
   * the remaining slashes become underscores.
   */
  private pathToDocId(filePath: string): string {
    return filePath
      .replace(/^.*\/content\//, '')
      .replace(/\.md$/, '')
      .replace(/\//g, '_');
  }

  /**
   * Runs a vector search against the source-code table.
   *
   * @param query - Free-text query; it is embedded before searching.
   * @param options - `limit` (defaults to 5), `tableName` (defaults to
   *   `babylon_source_code`) and an optional `package` equality filter.
   * @returns The matching rows, each extended with a score in the range 0..1.
   * @throws Error when `initialize()` has not completed.
   */
  async searchSourceCode(
    query: string,
    options: { package?: string; limit?: number; tableName?: string } = {}
  ): Promise<Array<any & { score: number }>> {
    if (!this.db || !this.embedder) {
      throw new Error('Search not initialized');
    }

    const limit = options.limit || LanceDBSearch.DEFAULT_LIMIT;
    const tableName = options.tableName || LanceDBSearch.DEFAULT_SOURCE_TABLE;
    const queryVector = await this.generateEmbedding(query);

    const sourceTable = await this.db.openTable(tableName);
    let searchQuery = sourceTable.vectorSearch(queryVector).limit(limit);

    if (options.package) {
      searchQuery = searchQuery.where(
        `package = ${LanceDBSearch.sqlString(options.package)}`
      );
    }

    const results = await searchQuery.toArray();
    return results.map((doc: any) => ({
      ...doc,
      score: LanceDBSearch.distanceToScore(doc._distance),
    }));
  }

  /**
   * Reads a file from the local Babylon.js checkout.
   *
   * @param filePath - Path relative to `./data/repositories/Babylon.js`.
   *   Paths that resolve outside that directory are rejected.
   * @param startLine - First line to return (1-based, inclusive). Values
   *   below 1 are treated as 1. Defaults to the first line.
   * @param endLine - Last line to return (1-based, inclusive). Defaults to
   *   the last line.
   * @returns The whole file when no line bound is given, otherwise the
   *   requested lines joined with `\n`. Returns `null` (after logging) when
   *   the file cannot be read or the path is rejected.
   */
  async getSourceFile(
    filePath: string,
    startLine?: number,
    endLine?: number
  ): Promise<string | null> {
    try {
      const root = path.resolve(LanceDBSearch.SOURCE_ROOT);
      // path.join normalizes `..` segments, so the containment check below
      // sees the location that would actually be read.
      const fullPath = path.join(root, filePath);
      const relative = path.relative(root, fullPath);
      if (
        relative === '..' ||
        relative.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relative)
      ) {
        throw new Error('Path escapes the source repository root');
      }

      const content = await fs.readFile(fullPath, 'utf-8');

      if (startLine == null && endLine == null) {
        return content;
      }

      const lines = content.split('\n');
      // Clamp both bounds so out-of-range values cannot turn into negative
      // slice indices, which would count from the end of the file.
      const first = Math.max(1, startLine ?? 1);
      const last = Math.max(0, endLine ?? lines.length);
      return lines.slice(first - 1, last).join('\n');
    } catch (error) {
      console.error(`Error reading source file ${filePath}:`, error);
      return null;
    }
  }

  /** No-op kept for API symmetry: nothing is released here. */
  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
  }

  /**
   * Converts a `_distance` value into a score between 0 and 1.
   *
   * A distance of exactly 0 yields 1. A missing or non-numeric distance
   * yields 0.
   */
  private static distanceToScore(distance: unknown): number {
    if (typeof distance !== 'number' || Number.isNaN(distance)) {
      return 0;
    }
    return Math.min(1, Math.max(0, 1 - distance));
  }

  /** Quotes a value as a SQL string literal, doubling embedded apostrophes. */
  private static sqlString(value: string): string {
    return `'${String(value).replace(/'/g, "''")}'`;
  }

  /**
   * Splits a `', '`-separated keyword string into its non-empty entries.
   * Any non-string value yields an empty list.
   */
  private static parseKeywords(keywords: unknown): string[] {
    return typeof keywords === 'string'
      ? keywords.split(', ').filter(Boolean)
      : [];
  }
}
|