diff --git a/src/mcp/handlers.ts b/src/mcp/handlers.ts
index 80bdd42..364b0fd 100644
--- a/src/mcp/handlers.ts
+++ b/src/mcp/handlers.ts
@@ -1,5 +1,16 @@
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { z } from 'zod';
+import { LanceDBSearch } from '../search/lancedb-search.js';
+
+let searchInstance: LanceDBSearch | null = null;
+
+async function getSearchInstance(): Promise<LanceDBSearch> {
+  if (!searchInstance) {
+    searchInstance = new LanceDBSearch();
+    await searchInstance.initialize();
+  }
+  return searchInstance;
+}
 
 export function setupHandlers(server: McpServer): void {
   registerSearchDocsTool(server);
@@ -25,18 +36,60 @@ function registerSearchDocsTool(server: McpServer): void {
       },
     },
     async ({ query, category, limit = 5 }) => {
-      // TODO: Implement actual search logic
-      const result = {
-        message: 'Search functionality not yet implemented',
-        query,
-        category,
-        limit,
-        results: [],
-      };
+      try {
+        const search = await getSearchInstance();
+        const options = category ? { category, limit } : { limit };
+        const results = await search.search(query, options);
 
-      return {
-        content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
-      };
+        if (results.length === 0) {
+          return {
+            content: [
+              {
+                type: 'text',
+                text: `No results found for "${query}". Try different search terms or check if the documentation has been indexed.`,
+              },
+            ],
+          };
+        }
+
+        // Format results for better readability
+        const formattedResults = results.map((result, index) => ({
+          rank: index + 1,
+          title: result.title,
+          description: result.description,
+          url: result.url,
+          category: result.category,
+          relevance: (result.score * 100).toFixed(1) + '%',
+          snippet: result.content,
+          keywords: result.keywords,
+        }));
+
+        return {
+          content: [
+            {
+              type: 'text',
+              text: JSON.stringify(
+                {
+                  query,
+                  totalResults: results.length,
+                  results: formattedResults,
+                },
+                null,
+                2
+              ),
+            },
+          ],
+        };
+      } catch (error) {
+        return {
+          content: [
+            {
+              type: 'text',
+              text: `Error searching documentation: ${error instanceof Error ? error.message : String(error)}`,
+            },
+          ],
+        };
+      }
     }
   );
 }
@@ -51,15 +104,68 @@ function registerGetDocTool(server: McpServer): void {
       },
     },
     async ({ path }) => {
-      // TODO: Implement actual document retrieval
-      const result = {
-        message: 'Document retrieval not yet implemented',
-        path,
-      };
+      try {
+        const search = await getSearchInstance();
+        const document = await search.getDocumentByPath(path);
 
-      return {
-        content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
-      };
+        if (!document) {
+          return {
+            content: [
+              {
+                type: 'text',
+                text: `Document not found: ${path}. The path may be incorrect or the documentation has not been indexed.`,
+              },
+            ],
+          };
+        }
+
+        // Parse stringified fields back to arrays
+        const breadcrumbs = document.breadcrumbs
+          ? document.breadcrumbs.split(' > ').filter(Boolean)
+          : [];
+        const headings = document.headings
+          ? document.headings.split(' | ').filter(Boolean)
+          : [];
+        const keywords = document.keywords
+          ? document.keywords.split(', ').filter(Boolean)
+          : [];
+        const playgroundIds = document.playgroundIds
+          ? document.playgroundIds.split(', ').filter(Boolean)
+          : [];
+
+        return {
+          content: [
+            {
+              type: 'text',
+              text: JSON.stringify(
+                {
+                  title: document.title,
+                  description: document.description,
+                  url: document.url,
+                  category: document.category,
+                  breadcrumbs,
+                  content: document.content,
+                  headings,
+                  keywords,
+                  playgroundIds,
+                  lastModified: document.lastModified,
+                },
+                null,
+                2
+              ),
+            },
+          ],
+        };
+      } catch (error) {
+        return {
+          content: [
+            {
+              type: 'text',
+              text: `Error retrieving document: ${error instanceof Error ? error.message : String(error)}`,
+            },
+          ],
+        };
+      }
     }
   );
 }
diff --git a/src/search/document-parser.test.ts b/src/search/document-parser.test.ts
new file mode 100644
index 0000000..8b02b47
--- /dev/null
+++ b/src/search/document-parser.test.ts
@@ -0,0 +1,49 @@
+import { describe, it, expect } from 'vitest';
+import { DocumentParser } from './document-parser.js';
+import path from 'path';
+
+describe('DocumentParser', () => {
+  const parser = new DocumentParser();
+  const sampleFile = path.join(
+    process.cwd(),
+    'data/repositories/Documentation/content/features.md'
+  );
+
+  it('should parse YAML front matter', async () => {
+    const doc = await parser.parseFile(sampleFile);
+
+    expect(doc.title).toBe('Babylon.js Features');
+    expect(doc.description).toContain('breadth and depth');
+    expect(doc.keywords).toContain('features');
+    expect(doc.keywords).toContain('capabilities');
+  });
+
+  it('should extract category from file path', async () => {
+    const doc = await parser.parseFile(sampleFile);
+
+    expect(doc.category).toBe('features');
+    expect(doc.breadcrumbs).toEqual(['features']);
+  });
+
+  it('should extract headings', async () => {
+    const doc = await parser.parseFile(sampleFile);
+
+    expect(doc.headings.length).toBeGreaterThan(0);
+    expect(doc.headings[0]?.text).toBe('Babylon.js Features');
+    expect(doc.headings[0]?.level).toBe(1);
+  });
+
+  it('should have markdown content', async () => {
+    const doc = await parser.parseFile(sampleFile);
+
+    expect(doc.content).toContain('Babylon.js Features');
+    expect(doc.content.length).toBeGreaterThan(0);
+  });
+
+  it('should extract file path and modified date', async () => {
+    const doc = await parser.parseFile(sampleFile);
+
+    expect(doc.filePath).toBe(sampleFile);
+    expect(doc.lastModified).toBeInstanceOf(Date);
+  });
+});
diff --git a/src/search/document-parser.ts b/src/search/document-parser.ts
new file mode 100644
index 0000000..90f503f
--- /dev/null
+++ b/src/search/document-parser.ts
@@ -0,0 +1,99 @@
+import matter from 'gray-matter';
+import fs from 'fs/promises';
+import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
+
+export class DocumentParser {
+  async parseFile(filePath: string): Promise<DocumentMetadata> {
+    const content = await fs.readFile(filePath, 'utf-8');
+    const { data, content: markdown } = matter(content);
+
+    return {
+      filePath,
+      title: data.title || '',
+      description: data.description || '',
+      keywords: this.parseKeywords(data.keywords),
+      category: this.extractCategory(filePath),
+      breadcrumbs: this.extractBreadcrumbs(filePath),
+      content: markdown,
+      headings: this.extractHeadings(markdown),
+      codeBlocks: this.extractCodeBlocks(markdown),
+      furtherReading: data['further-reading'] || [],
+      playgroundIds: this.extractPlaygroundIds(markdown),
+      lastModified: await this.getFileModifiedDate(filePath),
+    };
+  }
+
+  private parseKeywords(keywords: string | undefined): string[] {
+    if (!keywords) return [];
+    return keywords.split(',').map((k) => k.trim()).filter(Boolean);
+  }
+
+  private extractCategory(filePath: string): string {
+    const match = filePath.match(/content\/([^/]+(?:\/[^/]+)*)/);
+    if (!match || !match[1]) return 'uncategorized';
+
+    // Remove .md extension if present
+    return match[1].replace(/\.md$/, '');
+  }
+
+  private extractBreadcrumbs(filePath: string): string[] {
+    const category = this.extractCategory(filePath);
+    return category.split('/').filter(Boolean);
+  }
+
+  private extractHeadings(markdown: string): Heading[] {
+    const headings: Heading[] = [];
+    const lines = markdown.split('\n');
+
+    for (const line of lines) {
+      const trimmed = line.trim();
+      const match = trimmed.match(/^(#{1,6})\s+(.+)$/);
+      if (match && match[1] && match[2]) {
+        const level = match[1].length;
+        const text = match[2].trim();
+        const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
+        headings.push({ level, text, id });
+      }
+    }
+
+    return headings;
+  }
+
+  private extractCodeBlocks(markdown: string): CodeBlock[] {
+    const codeBlocks: CodeBlock[] = [];
+    const regex = /```(\w+)?\n([\s\S]*?)```/g;
+    let match;
+
+    while ((match = regex.exec(markdown)) !== null) {
+      if (match.index !== undefined && match[2] !== undefined) {
+        const lineStart = markdown.substring(0, match.index).split('\n').length;
+        codeBlocks.push({
+          language: match[1] || 'plaintext',
+          code: match[2].trim(),
+          lineStart,
+        });
+      }
+    }
+
+    return codeBlocks;
+  }
+
+  private extractPlaygroundIds(markdown: string): string[] {
+    const ids: string[] = [];
+    const regex = /<Playground id="([^"]+)"/g;
+    let match;
+
+    while ((match = regex.exec(markdown)) !== null) {
+      if (match[1]) {
+        ids.push(match[1]);
+      }
+    }
+
+    return ids;
+  }
+
+  private async getFileModifiedDate(filePath: string): Promise<Date> {
+    const stats = await fs.stat(filePath);
+    return stats.mtime;
+  }
+}
diff --git a/src/search/lancedb-indexer.ts b/src/search/lancedb-indexer.ts
new file mode 100644
index 0000000..89b24b1
--- /dev/null
+++ b/src/search/lancedb-indexer.ts
@@ -0,0 +1,218 @@
+import { connect } from '@lancedb/lancedb';
+import { pipeline } from '@xenova/transformers';
+import { DocumentParser } from './document-parser.js';
+import type { DocumentMetadata } from './types.js';
+import fs from 'fs/promises';
+import path from 'path';
+
+export interface EmbeddedDocument {
+  id: string;
+  title: string;
+  description: string;
+  content: string;
+  keywords: string;
+  category: string;
+  breadcrumbs: string;
+  filePath: string;
+  url: string;
+  source: string;
+  headings: string;
+  codeSnippets: string;
+  playgroundIds: string;
+  lastModified: string;
+  vector: number[];
+}
+
+export interface DocumentSource {
+  name: string;
+  path: string;
+  urlPrefix: string;
+}
+
+export class LanceDBIndexer {
+  private db: any;
+  private embedder: any;
+  private parser: DocumentParser;
+  private readonly dbPath: string;
+  private readonly tableName: string;
+  private readonly sources: DocumentSource[];
+
+  constructor(
+    dbPath: string = './data/lancedb',
+    sources: DocumentSource[] = [
+      {
+        name: 'documentation',
+        path: './data/repositories/Documentation/content',
+        urlPrefix: 'https://doc.babylonjs.com',
+      },
+    ],
+    tableName: string = 'babylon_docs'
+  ) {
+    this.dbPath = dbPath;
+    this.sources = sources;
+    this.tableName = tableName;
+    this.parser = new DocumentParser();
+  }
+
+  async initialize(): Promise<void> {
+    console.log('Initializing LanceDB connection...');
+    this.db = await connect(this.dbPath);
+
+    console.log('Loading embedding model (this may take a moment)...');
+    this.embedder = await pipeline(
+      'feature-extraction',
+      'Xenova/all-MiniLM-L6-v2'
+    );
+    console.log('Embedding model loaded successfully');
+  }
+
+  async indexDocuments(): Promise<void> {
+    if (!this.embedder) {
+      throw new Error('Indexer not initialized. Call initialize() first.');
+    }
+
+    const allDocuments: EmbeddedDocument[] = [];
+
+    // Process each documentation source
+    for (const source of this.sources) {
+      console.log(`\nProcessing source: ${source.name}`);
+      console.log(`Path: ${source.path}`);
+      console.log('Finding markdown files...');
+
+      const markdownFiles = await this.findMarkdownFiles(source.path);
+      console.log(`Found ${markdownFiles.length} markdown files in ${source.name}`);
+
+      console.log('Parsing and embedding documents...');
+
+      for (let i = 0; i < markdownFiles.length; i++) {
+        const filePath = markdownFiles[i];
+        if (!filePath) continue;
+
+        try {
+          const doc = await this.processDocument(filePath, source);
+          allDocuments.push(doc);
+
+          if ((i + 1) % 50 === 0) {
+            console.log(`Processed ${i + 1}/${markdownFiles.length} documents from ${source.name}`);
+          }
+        } catch (error) {
+          console.error(`Error processing ${filePath}:`, error);
+        }
+      }
+
+      console.log(`✓ Completed ${source.name}: ${markdownFiles.length} files processed`);
+    }
+
+    console.log(`\nTotal documents processed: ${allDocuments.length}`);
+    console.log('Creating LanceDB table...');
+
+    // Drop existing table if it exists
+    const tableNames = await this.db.tableNames();
+    if (tableNames.includes(this.tableName)) {
+      await this.db.dropTable(this.tableName);
+    }
+
+    // Create new table with embedded documents
+    await this.db.createTable(this.tableName, allDocuments);
+    console.log('Indexing complete!');
+  }
+
+  private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
+    const metadata = await this.parser.parseFile(filePath);
+    const embeddingText = this.createEmbeddingText(metadata);
+    const vector = await this.generateEmbedding(embeddingText);
+
+    return {
+      id: this.generateDocId(filePath, source),
+      title: metadata.title,
+      description: metadata.description,
+      content: this.truncateContent(metadata.content, 20000),
+      keywords: metadata.keywords.join(', '),
+      category: metadata.category,
+      breadcrumbs: metadata.breadcrumbs.join(' > '),
+      filePath: metadata.filePath,
+      url: this.generateDocUrl(metadata, source),
+      source: source.name,
+      headings: metadata.headings.map(h => h.text).join(' | '),
+      codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
+      playgroundIds: metadata.playgroundIds.join(', '),
+      lastModified: metadata.lastModified.toISOString(),
+      vector,
+    };
+  }
+
+  private createEmbeddingText(metadata: DocumentMetadata): string {
+    // Combine key fields for embedding - prioritize title, description, keywords
+    const parts = [
+      metadata.title,
+      metadata.description,
+      metadata.keywords.join(' '),
+      metadata.headings.slice(0, 5).map(h => h.text).join(' '),
+      this.truncateContent(metadata.content, 500),
+    ];
+    return parts.filter(Boolean).join(' ');
+  }
+
+  private async generateEmbedding(text: string): Promise<number[]> {
+    if (!this.embedder) {
+      throw new Error('Embedder not initialized');
+    }
+
+    const result = await this.embedder(text, {
+      pooling: 'mean',
+      normalize: true,
+    });
+
+    return Array.from(result.data);
+  }
+
+  private async findMarkdownFiles(dir: string): Promise<string[]> {
+    const files: string[] = [];
+    const entries = await fs.readdir(dir, { withFileTypes: true });
+
+    for (const entry of entries) {
+      const fullPath = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        const subFiles = await this.findMarkdownFiles(fullPath);
+        files.push(...subFiles);
+      } else if (entry.isFile() && entry.name.endsWith('.md')) {
+        files.push(fullPath);
+      }
+    }
+
+    return files;
+  }
+
+  private generateDocId(filePath: string, source: DocumentSource): string {
+    const basePath = source.path;
+    const relativePath = filePath
+      .replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
+      .replace(/\.md$/i, '')
+      .replace(/\//g, '_');
+    return `${source.name}_${relativePath}`;
+  }
+
+  private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
+    const basePath = source.path;
+    const relativePath = metadata.filePath
+      .replace(new RegExp(`^.*${basePath.replace(/\//g, '\\/')}\\/`), '')
+      .replace(/\.md$/i, '');
+
+    // For source-repo, use GitHub URL; for documentation, use doc site
+    if (source.name === 'source-repo') {
+      return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
+    }
+
+    return `${source.urlPrefix}/${relativePath}`;
+  }
+
+  private truncateContent(content: string, maxLength: number): string {
+    if (content.length <= maxLength) return content;
+    return content.substring(0, maxLength) + '...';
+  }
+
+  async close(): Promise<void> {
+    // LanceDB doesn't require explicit closing
+    console.log('Indexer closed');
+  }
+}
diff --git a/src/search/lancedb-search.ts b/src/search/lancedb-search.ts
new file mode 100644
index 0000000..59bfee7
--- /dev/null
+++ b/src/search/lancedb-search.ts
@@ -0,0 +1,185 @@
+import { connect } from '@lancedb/lancedb';
+import { pipeline } from '@xenova/transformers';
+import type { SearchOptions, SearchResult } from './types.js';
+import type { EmbeddedDocument } from './lancedb-indexer.js';
+import fs from 'fs/promises';
+import path from 'path';
+
+export class LanceDBSearch {
+  private db: any;
+  private table: any;
+  private embedder: any;
+  private readonly dbPath: string;
+  private readonly tableName: string;
+
+  constructor(
+    dbPath: string = './data/lancedb',
+    tableName: string = 'babylon_docs'
+  ) {
+    this.dbPath = dbPath;
+    this.tableName = tableName;
+  }
+
+  async initialize(): Promise<void> {
+    this.db = await connect(this.dbPath);
+    this.table = await this.db.openTable(this.tableName);
+
+    this.embedder = await pipeline(
+      'feature-extraction',
+      'Xenova/all-MiniLM-L6-v2'
+    );
+  }
+
+  async search(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
+    if (!this.table || !this.embedder) {
+      throw new Error('Search not initialized. Call initialize() first.');
+    }
+
+    const limit = options.limit || 5;
+    const queryVector = await this.generateEmbedding(query);
+
+    // Build the search query
+    let searchQuery = this.table.vectorSearch(queryVector).limit(limit);
+
+    // Apply category filter if provided
+    if (options.category) {
+      searchQuery = searchQuery.where(`category = '${options.category}'`);
+    }
+
+    const results = await searchQuery.toArray();
+
+    return results.map((doc: any) => ({
+      title: doc.title,
+      description: doc.description,
+      content: this.extractRelevantSnippet(doc.content, query),
+      url: doc.url,
+      category: doc.category,
+      score: doc._distance ? 1 - doc._distance : 0, // Convert distance to similarity score
+      keywords: doc.keywords.split(', ').filter(Boolean),
+    }));
+  }
+
+  async getDocument(docId: string): Promise<EmbeddedDocument | null> {
+    if (!this.table) {
+      throw new Error('Search not initialized. Call initialize() first.');
+    }
+
+    const results = await this.table
+      .query()
+      .where(`id = '${docId}'`)
+      .limit(1)
+      .toArray();
+
+    return results.length > 0 ? results[0] : null;
+  }
+
+  async getDocumentByPath(filePath: string): Promise<EmbeddedDocument | null> {
+    if (!this.table) {
+      throw new Error('Search not initialized. Call initialize() first.');
+    }
+
+    // Try to find document by URL first
+    let results = await this.table
+      .query()
+      .where(`url = '${filePath}'`)
+      .limit(1)
+      .toArray();
+
+    if (results.length > 0) {
+      const doc = results[0];
+      // Fetch fresh content from local file if available
+      const freshContent = await this.fetchLocalContent(doc.filePath);
+      if (freshContent) {
+        return { ...doc, content: freshContent };
+      }
+      return doc;
+    }
+
+    // If not found by URL, try by docId conversion
+    const docId = this.pathToDocId(filePath);
+    return this.getDocument(docId);
+  }
+
+  private async fetchLocalContent(filePath: string): Promise<string | null> {
+    try {
+      // Check if the file exists in our local repositories
+      const possiblePaths = [
+        filePath,
+        path.join('./data/repositories/Documentation', filePath.replace(/^.*\/content\//, '')),
+        path.join('./data/repositories/Babylon.js', filePath.replace(/^.*\/Babylon\.js\//, '')),
+        path.join('./data/repositories/havok', filePath.replace(/^.*\/havok\//, '')),
+      ];
+
+      for (const possiblePath of possiblePaths) {
+        try {
+          const content = await fs.readFile(possiblePath, 'utf-8');
+          return content;
+        } catch {
+          // Continue to next path
+        }
+      }
+
+      return null;
+    } catch (error) {
+      return null;
+    }
+  }
+
+  private async generateEmbedding(text: string): Promise<number[]> {
+    if (!this.embedder) {
+      throw new Error('Embedder not initialized');
+    }
+
+    const result = await this.embedder(text, {
+      pooling: 'mean',
+      normalize: true,
+    });
+
+    return Array.from(result.data);
+  }
+
+  private extractRelevantSnippet(content: string, query: string, snippetLength: number = 300): string {
+    // Simple snippet extraction - find first occurrence of query terms
+    const queryTerms = query.toLowerCase().split(/\s+/);
+
+    let bestIndex = 0;
+    let maxMatches = 0;
+
+    // Find the position with most query term matches
+    const words = content.split(/\s+/);
+    for (let i = 0; i < words.length; i++) {
+      const windowText = words.slice(i, i + 50).join(' ').toLowerCase();
+      const matches = queryTerms.filter(term => windowText.includes(term)).length;
+      if (matches > maxMatches) {
+        maxMatches = matches;
+        bestIndex = i;
+      }
+    }
+
+    // Extract snippet around best match
+    const start = Math.max(0, bestIndex - 10);
+    const snippetWords = words.slice(start, start + 60);
+    let snippet = snippetWords.join(' ');
+
+    if (snippet.length > snippetLength) {
+      snippet = snippet.substring(0, snippetLength) + '...';
+    }
+
+    if (start > 0) {
+      snippet = '...' + snippet;
+    }
+
+    return snippet;
+  }
+
+  private pathToDocId(filePath: string): string {
+    return filePath
+      .replace(/^.*\/content\//, '')
+      .replace(/\.md$/, '')
+      .replace(/\//g, '_');
+  }
+
+  async close(): Promise<void> {
+    // LanceDB doesn't require explicit closing
+  }
+}
diff --git a/src/search/types.ts b/src/search/types.ts
new file mode 100644
index 0000000..c0b48dd
--- /dev/null
+++ b/src/search/types.ts
@@ -0,0 +1,47 @@
+export interface DocumentMetadata {
+  filePath: string;
+  title: string;
+  description: string;
+  keywords: string[];
+  category: string;
+  breadcrumbs: string[];
+  content: string;
+  headings: Heading[];
+  codeBlocks: CodeBlock[];
+  furtherReading: RelatedLink[];
+  playgroundIds: string[];
+  lastModified: Date;
+}
+
+export interface Heading {
+  level: number;
+  text: string;
+  id: string;
+}
+
+export interface CodeBlock {
+  language: string;
+  code: string;
+  lineStart: number;
+}
+
+export interface RelatedLink {
+  title: string;
+  url: string;
+}
+
+export interface SearchOptions {
+  limit?: number;
+  category?: string;
+  queryType?: 'keyword' | 'semantic' | 'hybrid';
+}
+
+export interface SearchResult {
+  title: string;
+  description: string;
+  content: string;
+  url: string;
+  category: string;
+  score: number;
+  keywords: string[];
+}