Implement LanceDB-based search and document retrieval

- Add LanceDBSearch class for vector-based documentation search
- Implement search() method with category filtering and relevance scoring
- Add getDocumentByPath() with URL lookup and local file fetching
- Fix getDocument() to use .query() instead of .search() for non-vector queries
- Update handlers.ts to integrate LanceDBSearch with MCP tools
- Parse stringified array fields (breadcrumbs, headings, keywords, playgroundIds) in get_babylon_doc
- Fetch fresh content from local repositories (Documentation, Babylon.js, havok)
- Add DocumentParser, LanceDBIndexer and related types for document processing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Michael Mainguy 2025-11-23 04:57:29 -06:00
parent a3e027ef02
commit f56b92e76e
6 changed files with 723 additions and 19 deletions

View File

@ -1,5 +1,16 @@
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { z } from 'zod';
import { LanceDBSearch } from '../search/lancedb-search.js';
// Module-level cache for the shared search instance (one per process).
let searchInstance: LanceDBSearch | null = null;

/**
 * Lazily creates and initializes the shared LanceDBSearch singleton.
 *
 * The instance is cached only after `initialize()` resolves, so a failed
 * initialization is NOT cached and the next call retries, instead of
 * handing every subsequent caller a half-initialized searcher (the
 * original assigned `searchInstance` before awaiting `initialize()`).
 *
 * NOTE(review): concurrent first calls can still race and each build their
 * own instance (last one wins). Harmless for a single-process MCP server —
 * confirm handlers cannot run concurrently during startup.
 *
 * @returns the initialized, process-wide LanceDBSearch instance
 */
async function getSearchInstance(): Promise<LanceDBSearch> {
  if (!searchInstance) {
    const instance = new LanceDBSearch();
    await instance.initialize();
    searchInstance = instance;
  }
  return searchInstance;
}
export function setupHandlers(server: McpServer): void {
registerSearchDocsTool(server);
@ -25,18 +36,60 @@ function registerSearchDocsTool(server: McpServer): void {
},
},
async ({ query, category, limit = 5 }) => {
// TODO: Implement actual search logic
const result = {
message: 'Search functionality not yet implemented',
query,
category,
limit,
results: [],
};
try {
const search = await getSearchInstance();
const options = category ? { category, limit } : { limit };
const results = await search.search(query, options);
return {
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
};
if (results.length === 0) {
return {
content: [
{
type: 'text',
text: `No results found for "${query}". Try different search terms or check if the documentation has been indexed.`,
},
],
};
}
// Format results for better readability
const formattedResults = results.map((result, index) => ({
rank: index + 1,
title: result.title,
description: result.description,
url: result.url,
category: result.category,
relevance: (result.score * 100).toFixed(1) + '%',
snippet: result.content,
keywords: result.keywords,
}));
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
query,
totalResults: results.length,
results: formattedResults,
},
null,
2
),
},
],
};
} catch (error) {
return {
content: [
{
type: 'text',
text: `Error searching documentation: ${error instanceof Error ? error.message : String(error)}`,
},
],
};
}
}
);
}
@ -51,15 +104,68 @@ function registerGetDocTool(server: McpServer): void {
},
},
async ({ path }) => {
// TODO: Implement actual document retrieval
const result = {
message: 'Document retrieval not yet implemented',
path,
};
try {
const search = await getSearchInstance();
const document = await search.getDocumentByPath(path);
return {
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
};
if (!document) {
return {
content: [
{
type: 'text',
text: `Document not found: ${path}. The path may be incorrect or the documentation has not been indexed.`,
},
],
};
}
// Parse stringified fields back to arrays
const breadcrumbs = document.breadcrumbs
? document.breadcrumbs.split(' > ').filter(Boolean)
: [];
const headings = document.headings
? document.headings.split(' | ').filter(Boolean)
: [];
const keywords = document.keywords
? document.keywords.split(', ').filter(Boolean)
: [];
const playgroundIds = document.playgroundIds
? document.playgroundIds.split(', ').filter(Boolean)
: [];
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
title: document.title,
description: document.description,
url: document.url,
category: document.category,
breadcrumbs,
content: document.content,
headings,
keywords,
playgroundIds,
lastModified: document.lastModified,
},
null,
2
),
},
],
};
} catch (error) {
return {
content: [
{
type: 'text',
text: `Error retrieving document: ${error instanceof Error ? error.message : String(error)}`,
},
],
};
}
}
);
}

View File

@ -0,0 +1,49 @@
import { describe, it, expect } from 'vitest';
import { DocumentParser } from './document-parser.js';
import path from 'path';
describe('DocumentParser', () => {
  const parser = new DocumentParser();
  const sampleFile = path.join(
    process.cwd(),
    'data/repositories/Documentation/content/features.md'
  );

  // Every case parses the same fixture; a small helper keeps each test terse.
  const parseSample = () => parser.parseFile(sampleFile);

  it('should parse YAML front matter', async () => {
    const { title, description, keywords } = await parseSample();
    expect(title).toBe('Babylon.js Features');
    expect(description).toContain('breadth and depth');
    expect(keywords).toContain('features');
    expect(keywords).toContain('capabilities');
  });

  it('should extract category from file path', async () => {
    const { category, breadcrumbs } = await parseSample();
    expect(category).toBe('features');
    expect(breadcrumbs).toEqual(['features']);
  });

  it('should extract headings', async () => {
    const { headings } = await parseSample();
    expect(headings.length).toBeGreaterThan(0);
    expect(headings[0]?.text).toBe('Babylon.js Features');
    expect(headings[0]?.level).toBe(1);
  });

  it('should have markdown content', async () => {
    const { content } = await parseSample();
    expect(content).toContain('Babylon.js Features');
    expect(content.length).toBeGreaterThan(0);
  });

  it('should extract file path and modified date', async () => {
    const doc = await parseSample();
    expect(doc.filePath).toBe(sampleFile);
    expect(doc.lastModified).toBeInstanceOf(Date);
  });
});

View File

@ -0,0 +1,99 @@
import matter from 'gray-matter';
import fs from 'fs/promises';
import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
export class DocumentParser {
  /**
   * Parse a markdown documentation file into structured metadata.
   *
   * Splits YAML front matter from the body with gray-matter, then derives
   * category/breadcrumbs from the path and headings/code blocks/playground
   * IDs from the markdown body.
   *
   * @param filePath absolute or repo-relative path to a `.md` file
   * @returns the parsed {@link DocumentMetadata}
   * @throws if the file cannot be read or stat'ed
   */
  async parseFile(filePath: string): Promise<DocumentMetadata> {
    const content = await fs.readFile(filePath, 'utf-8');
    const { data, content: markdown } = matter(content);

    return {
      filePath,
      title: data.title || '',
      description: data.description || '',
      keywords: this.parseKeywords(data.keywords),
      category: this.extractCategory(filePath),
      breadcrumbs: this.extractBreadcrumbs(filePath),
      content: markdown,
      headings: this.extractHeadings(markdown),
      codeBlocks: this.extractCodeBlocks(markdown),
      furtherReading: data['further-reading'] || [],
      playgroundIds: this.extractPlaygroundIds(markdown),
      lastModified: await this.getFileModifiedDate(filePath),
    };
  }

  /**
   * Normalize front-matter keywords to a string array.
   *
   * YAML front matter may deliver keywords either as a comma-separated
   * string or as a YAML list; the original only handled the string form and
   * crashed with "keywords.split is not a function" on lists.
   */
  private parseKeywords(keywords: unknown): string[] {
    if (!keywords) return [];
    if (Array.isArray(keywords)) {
      return keywords.map((k) => String(k).trim()).filter(Boolean);
    }
    return String(keywords)
      .split(',')
      .map((k) => k.trim())
      .filter(Boolean);
  }

  // Category is the path below the 'content/' directory without the '.md'
  // extension, e.g. '.../content/diving-deeper/mesh.md' -> 'diving-deeper/mesh'.
  // Backslashes are normalized first so Windows paths from path.join match too.
  private extractCategory(filePath: string): string {
    const normalized = filePath.replace(/\\/g, '/');
    const match = normalized.match(/content\/([^/]+(?:\/[^/]+)*)/);
    if (!match || !match[1]) return 'uncategorized';
    // Remove .md extension if present
    return match[1].replace(/\.md$/, '');
  }

  // Breadcrumbs are simply the category split on '/'.
  private extractBreadcrumbs(filePath: string): string[] {
    return this.extractCategory(filePath).split('/').filter(Boolean);
  }

  // Collects ATX headings ('#' .. '######'); setext headings are not detected.
  private extractHeadings(markdown: string): Heading[] {
    const headings: Heading[] = [];
    for (const line of markdown.split('\n')) {
      const match = line.trim().match(/^(#{1,6})\s+(.+)$/);
      if (match && match[1] && match[2]) {
        const text = match[2].trim();
        headings.push({
          level: match[1].length,
          text,
          // GitHub-style slug: drop punctuation, collapse spaces to hyphens.
          id: text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-'),
        });
      }
    }
    return headings;
  }

  // Extracts fenced code blocks (``` ... ```). lineStart is the 1-based
  // line number of the opening fence within the markdown body.
  private extractCodeBlocks(markdown: string): CodeBlock[] {
    const codeBlocks: CodeBlock[] = [];
    const regex = /```(\w+)?\n([\s\S]*?)```/g;
    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match[2] !== undefined) {
        codeBlocks.push({
          language: match[1] || 'plaintext',
          code: match[2].trim(),
          lineStart: markdown.substring(0, match.index).split('\n').length,
        });
      }
    }
    return codeBlocks;
  }

  // Snippet IDs from <Playground id="#SNIPPET"> tags; the leading '#' is
  // excluded from the captured ID.
  private extractPlaygroundIds(markdown: string): string[] {
    const ids: string[] = [];
    const regex = /<Playground\s+id=["']#([^"']+)["']/g;
    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match[1]) {
        ids.push(match[1]);
      }
    }
    return ids;
  }

  // File-system mtime is used as the document's lastModified timestamp.
  private async getFileModifiedDate(filePath: string): Promise<Date> {
    return (await fs.stat(filePath)).mtime;
  }
}

View File

@ -0,0 +1,218 @@
import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import { DocumentParser } from './document-parser.js';
import type { DocumentMetadata } from './types.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * Flat row shape stored in the LanceDB table.
 *
 * Array-valued metadata is serialized into delimited strings before storage
 * (see LanceDBIndexer.processDocument): keywords and playgroundIds are
 * ', '-joined, breadcrumbs are ' > '-joined, headings are ' | '-joined and
 * codeSnippets are '\n---\n'-joined. Consumers must split them back apart.
 */
export interface EmbeddedDocument {
id: string;            // unique id: `${source}_${relative_path_with_underscores}`
title: string;
description: string;
content: string;       // markdown body, truncated to 20k chars at index time
keywords: string;      // ', '-joined keyword list
category: string;      // path below content/, e.g. 'diving-deeper/mesh'
breadcrumbs: string;   // ' > '-joined breadcrumb trail
filePath: string;      // original on-disk path the document was parsed from
url: string;           // public URL (doc site or GitHub, per source)
source: string;        // DocumentSource.name this row came from
headings: string;      // ' | '-joined heading texts
codeSnippets: string;  // up to 3 code blocks, '\n---\n'-joined
playgroundIds: string; // ', '-joined playground snippet ids
lastModified: string;  // ISO-8601 timestamp of the file's mtime
vector: number[];      // embedding produced by Xenova/all-MiniLM-L6-v2
}
/** A documentation corpus to index: a local directory plus its public URL prefix. */
export interface DocumentSource {
name: string;      // short id, also used as the id/url-routing key (e.g. 'source-repo')
path: string;      // local root directory containing markdown files
urlPrefix: string; // base URL the relative path is appended to
}
/**
 * Walks one or more markdown documentation sources, embeds each file with a
 * local sentence-transformer, and (re)creates a LanceDB table of
 * {@link EmbeddedDocument} rows.
 */
export class LanceDBIndexer {
  // Untyped handles: @lancedb/lancedb and @xenova/transformers do not expose
  // stable TS types for these objects in this setup.
  private db: any;
  private embedder: any;
  private parser: DocumentParser;
  private readonly dbPath: string;
  private readonly tableName: string;
  private readonly sources: DocumentSource[];

  constructor(
    dbPath: string = './data/lancedb',
    sources: DocumentSource[] = [
      {
        name: 'documentation',
        path: './data/repositories/Documentation/content',
        urlPrefix: 'https://doc.babylonjs.com',
      },
    ],
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.sources = sources;
    this.tableName = tableName;
    this.parser = new DocumentParser();
  }

  /** Connects to LanceDB and loads the embedding model (slow on first run). */
  async initialize(): Promise<void> {
    console.log('Initializing LanceDB connection...');
    this.db = await connect(this.dbPath);
    console.log('Loading embedding model (this may take a moment)...');
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
    console.log('Embedding model loaded successfully');
  }

  /**
   * Indexes every markdown file of every configured source and rebuilds the
   * table. Individual file failures are logged and skipped so one bad
   * document cannot abort a full run.
   *
   * @throws if initialize() was not called first, or if no documents at all
   *         could be processed.
   */
  async indexDocuments(): Promise<void> {
    if (!this.db || !this.embedder) {
      throw new Error('Indexer not initialized. Call initialize() first.');
    }
    const allDocuments: EmbeddedDocument[] = [];
    // Process each documentation source
    for (const source of this.sources) {
      console.log(`\nProcessing source: ${source.name}`);
      console.log(`Path: ${source.path}`);
      console.log('Finding markdown files...');
      const markdownFiles = await this.findMarkdownFiles(source.path);
      console.log(`Found ${markdownFiles.length} markdown files in ${source.name}`);
      console.log('Parsing and embedding documents...');
      for (let i = 0; i < markdownFiles.length; i++) {
        const filePath = markdownFiles[i];
        if (!filePath) continue;
        try {
          allDocuments.push(await this.processDocument(filePath, source));
          // Progress heartbeat every 50 files.
          if ((i + 1) % 50 === 0) {
            console.log(`Processed ${i + 1}/${markdownFiles.length} documents from ${source.name}`);
          }
        } catch (error) {
          console.error(`Error processing ${filePath}:`, error);
        }
      }
      console.log(`✓ Completed ${source.name}: ${markdownFiles.length} files processed`);
    }
    console.log(`\nTotal documents processed: ${allDocuments.length}`);
    // createTable on an empty array fails deep inside LanceDB with an opaque
    // schema error; fail fast with actionable context instead.
    if (allDocuments.length === 0) {
      throw new Error('No documents were indexed; check the configured source paths.');
    }
    console.log('Creating LanceDB table...');
    // Drop existing table if it exists
    const tableNames = await this.db.tableNames();
    if (tableNames.includes(this.tableName)) {
      await this.db.dropTable(this.tableName);
    }
    // Create new table with embedded documents
    await this.db.createTable(this.tableName, allDocuments);
    console.log('Indexing complete!');
  }

  // Parses a file, embeds it, and flattens the metadata into a table row
  // (array fields are joined into delimited strings — see EmbeddedDocument).
  private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
    const metadata = await this.parser.parseFile(filePath);
    const embeddingText = this.createEmbeddingText(metadata);
    const vector = await this.generateEmbedding(embeddingText);
    return {
      id: this.generateDocId(filePath, source),
      title: metadata.title,
      description: metadata.description,
      content: this.truncateContent(metadata.content, 20000),
      keywords: metadata.keywords.join(', '),
      category: metadata.category,
      breadcrumbs: metadata.breadcrumbs.join(' > '),
      filePath: metadata.filePath,
      url: this.generateDocUrl(metadata, source),
      source: source.name,
      headings: metadata.headings.map(h => h.text).join(' | '),
      codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
      playgroundIds: metadata.playgroundIds.join(', '),
      lastModified: metadata.lastModified.toISOString(),
      vector,
    };
  }

  // Combine key fields for embedding - prioritize title, description, keywords.
  private createEmbeddingText(metadata: DocumentMetadata): string {
    const parts = [
      metadata.title,
      metadata.description,
      metadata.keywords.join(' '),
      metadata.headings.slice(0, 5).map(h => h.text).join(' '),
      this.truncateContent(metadata.content, 500),
    ];
    return parts.filter(Boolean).join(' ');
  }

  // Mean-pooled, normalized embedding vector for the given text.
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  // Recursively collects every '.md' file under dir.
  private async findMarkdownFiles(dir: string): Promise<string[]> {
    const files: string[] = [];
    const entries = await fs.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        files.push(...(await this.findMarkdownFiles(fullPath)));
      } else if (entry.isFile() && entry.name.endsWith('.md')) {
        files.push(fullPath);
      }
    }
    return files;
  }

  /**
   * Path of filePath relative to the source root, with '/' separators and no
   * '.md' extension. Replaces the original RegExp-from-path approach, which
   * escaped only '/' and therefore broke on regex metacharacters in paths
   * (e.g. the '.' in 'Babylon.js' or the leading './').
   */
  private relativeSourcePath(filePath: string, source: DocumentSource): string {
    return path
      .relative(source.path, filePath)
      .split(path.sep)
      .join('/')
      .replace(/\.md$/i, '');
  }

  // Stable document id: source name + relative path with '/' -> '_'.
  private generateDocId(filePath: string, source: DocumentSource): string {
    const relativePath = this.relativeSourcePath(filePath, source);
    return `${source.name}_${relativePath.replace(/\//g, '_')}`;
  }

  // For source-repo, use GitHub URL; for documentation, use doc site.
  private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
    const relativePath = this.relativeSourcePath(metadata.filePath, source);
    if (source.name === 'source-repo') {
      return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
    }
    return `${source.urlPrefix}/${relativePath}`;
  }

  // Hard cap on stored content so rows stay a bounded size.
  private truncateContent(content: string, maxLength: number): string {
    if (content.length <= maxLength) return content;
    return content.substring(0, maxLength) + '...';
  }

  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
    console.log('Indexer closed');
  }
}

View File

@ -0,0 +1,185 @@
import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import type { SearchOptions, SearchResult } from './types.js';
import type { EmbeddedDocument } from './lancedb-indexer.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * Read-side companion to LanceDBIndexer: semantic search and document
 * retrieval over the previously indexed table.
 */
export class LanceDBSearch {
  // Untyped handles: the lancedb/transformers packages do not expose stable
  // TS types for these objects in this setup.
  private db: any;
  private table: any;
  private embedder: any;
  private readonly dbPath: string;
  private readonly tableName: string;

  constructor(
    dbPath: string = './data/lancedb',
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.tableName = tableName;
  }

  /** Opens the table and loads the embedding model; must run before queries. */
  async initialize(): Promise<void> {
    this.db = await connect(this.dbPath);
    this.table = await this.db.openTable(this.tableName);
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
  }

  /**
   * Vector search for `query`, optionally filtered by category.
   *
   * @param query natural-language query text
   * @param options limit (default 5) and optional exact-match category filter
   * @returns ranked results with a 0..1 similarity score and a content snippet
   * @throws if initialize() has not been called
   */
  async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
    if (!this.table || !this.embedder) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    const limit = options.limit || 5;
    const queryVector = await this.generateEmbedding(query);
    // Build the search query
    let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
    // Apply category filter if provided; the value is escaped so a quote in
    // user input cannot break out of the SQL-like string literal.
    if (options.category) {
      searchQuery = searchQuery.where(`category = '${this.escapeSql(options.category)}'`);
    }
    const results = await searchQuery.toArray();
    return results.map((doc: any) => ({
      title: doc.title,
      description: doc.description,
      content: this.extractRelevantSnippet(doc.content, query),
      url: doc.url,
      category: doc.category,
      // Convert distance to similarity. The original used a truthiness check
      // (`doc._distance ? ...`), which mapped a perfect match (distance 0)
      // to score 0 instead of 1.
      score: doc._distance != null ? 1 - doc._distance : 0,
      // Stored as a ', '-joined string; guard against a null/absent column.
      keywords: (doc.keywords ?? '').split(', ').filter(Boolean),
    }));
  }

  /**
   * Exact lookup by document id (non-vector query).
   * @returns the stored row, or null if no such id exists
   */
  async getDocument(docId: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    const results = await this.table
      .query()
      .where(`id = '${this.escapeSql(docId)}'`)
      .limit(1)
      .toArray();
    return results.length > 0 ? results[0] : null;
  }

  /**
   * Looks a document up by its public URL, falling back to a derived doc id.
   * When the backing markdown file is still present locally, its current
   * contents replace the (possibly stale, truncated) indexed content.
   */
  async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
    if (!this.table) {
      throw new Error('Search not initialized. Call initialize() first.');
    }
    // Try to find document by URL first
    const results = await this.table
      .query()
      .where(`url = '${this.escapeSql(filePath)}'`)
      .limit(1)
      .toArray();
    if (results.length > 0) {
      const doc = results[0];
      // Fetch fresh content from local file if available
      const freshContent = await this.fetchLocalContent(doc.filePath);
      return freshContent ? { ...doc, content: freshContent } : doc;
    }
    // If not found by URL, try by docId conversion
    return this.getDocument(this.pathToDocId(filePath));
  }

  // Doubles single quotes so a value can be safely embedded in a LanceDB
  // SQL-like where-clause string literal.
  private escapeSql(value: string): string {
    return value.replace(/'/g, "''");
  }

  // Tries the stored path plus each known local repository checkout;
  // returns null when the file is nowhere on disk.
  private async fetchLocalContent(filePath: string): Promise<string | null> {
    const possiblePaths = [
      filePath,
      path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
      path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
      path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
    ];
    for (const possiblePath of possiblePaths) {
      try {
        return await fs.readFile(possiblePath, 'utf-8');
      } catch {
        // Not at this location — continue to next candidate.
      }
    }
    return null;
  }

  // Mean-pooled, normalized embedding vector for the given text.
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  /**
   * Heuristic snippet extraction: slides a 50-word window over the content,
   * keeps the window with the most query-term hits, and returns ~60 words
   * around it (capped at `snippetLength` characters, ellipsized at cut edges).
   */
  private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
    const queryTerms = query.toLowerCase().split(/\s+/);
    let bestIndex = 0;
    let maxMatches = 0;
    // Find the position with most query term matches
    const words = content.split(/\s+/);
    for (let i = 0; i < words.length; i++) {
      const windowText = words.slice(i, i + 50).join(' ').toLowerCase();
      const matches = queryTerms.filter(term => windowText.includes(term)).length;
      if (matches > maxMatches) {
        maxMatches = matches;
        bestIndex = i;
      }
    }
    // Extract snippet around best match
    const start = Math.max(0, bestIndex - 10);
    const snippetWords = words.slice(start, start + 60);
    let snippet = snippetWords.join(' ');
    if (snippet.length > snippetLength) {
      snippet = snippet.substring(0, snippetLength) + '...';
    }
    if (start > 0) {
      snippet = '...' + snippet;
    }
    return snippet;
  }

  // Mirrors the indexer's id scheme: path below content/, '.md' stripped,
  // '/' replaced by '_'.
  private pathToDocId(filePath: string): string {
    return filePath
      .replace(/^.*\/content\//, '')
      .replace(/\.md$/, '')
      .replace(/\//g, '_');
  }

  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
  }
}

47
src/search/types.ts Normal file
View File

@ -0,0 +1,47 @@
/**
 * Fully structured result of parsing one markdown documentation file
 * (produced by DocumentParser.parseFile). Unlike the flat table row, array
 * fields here are real arrays.
 */
export interface DocumentMetadata {
filePath: string;          // on-disk path the document was parsed from
title: string;             // front-matter `title` ('' when absent)
description: string;       // front-matter `description` ('' when absent)
keywords: string[];        // front-matter keywords, split and trimmed
category: string;          // path below content/, e.g. 'diving-deeper/mesh'
breadcrumbs: string[];     // category split on '/'
content: string;           // markdown body without front matter
headings: Heading[];       // ATX headings found in the body
codeBlocks: CodeBlock[];   // fenced code blocks found in the body
furtherReading: RelatedLink[]; // front-matter `further-reading` entries
playgroundIds: string[];   // ids from <Playground id="#..."> tags
lastModified: Date;        // file mtime
}
/** One markdown heading (ATX style). */
export interface Heading {
level: number; // 1..6 — number of leading '#'
text: string;  // heading text, trimmed
id: string;    // slug: lowercased, punctuation stripped, spaces -> '-'
}
/** One fenced code block from the markdown body. */
export interface CodeBlock {
language: string;  // fence info string, or 'plaintext' when absent
code: string;      // block contents, trimmed
lineStart: number; // 1-based line of the opening fence
}
/** A related-documentation link from front matter. */
export interface RelatedLink {
title: string;
url: string;
}
/** Options accepted by LanceDBSearch.search. */
export interface SearchOptions {
limit?: number;     // max results (default 5)
category?: string;  // exact-match category filter
// NOTE(review): queryType is declared but not consulted by LanceDBSearch in
// this file — confirm whether it is wired up elsewhere or still planned.
queryType?: 'keyword' | 'semantic' | 'hybrid';
}
/** One ranked hit returned by LanceDBSearch.search. */
export interface SearchResult {
title: string;
description: string;
content: string;    // query-relevant snippet, not the full document
url: string;
category: string;
score: number;      // similarity in 0..1 (1 - vector distance)
keywords: string[];
}