babylon-mcp/src/search/lancedb-indexer.ts
Michael Mainguy 005a17f345 Add Babylon.js Editor documentation integration with TSX parser
Implemented comprehensive Editor documentation indexing using TypeScript Compiler API
to parse React/Next.js TSX files from the Babylon.js Editor repository.

Key changes:
- Added Editor repository (4th repo) to repository-config.ts
- Created tsx-parser.ts using TypeScript Compiler API (zero new dependencies)
- Extended document-parser.ts to route .tsx files to TSX parser
- Updated lancedb-indexer.ts to discover page.tsx files
- Added editor-docs source to index-docs.ts script

Features:
- Parses TSX/JSX files to extract text content, headings, and code blocks
- Filters out className values and non-content text
- Extracts categories from file paths (editor/adding-scripts, etc.)
- Handles Editor-specific documentation structure

Test coverage:
- Added tsx-parser.test.ts (11 tests, 10 passing)
- Extended document-parser.test.ts with TSX coverage (5 new tests)
- Fixed repository-manager.test.ts for 4 repositories
- Total: 167 tests passing, 1 skipped

Results:
- 902 documents now indexed (745 docs + 144 source + 13 editor)
- Editor documentation appears in search results
- Verified with Editor-specific queries (onStart, decorators, etc.)

Updated ROADMAP.md with completion status for Editor integration phases 1-3.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 09:20:56 -06:00

237 lines
7.4 KiB
TypeScript

import { connect } from '@lancedb/lancedb';
import { pipeline } from '@xenova/transformers';
import { DocumentParser } from './document-parser.js';
import type { DocumentMetadata } from './types.js';
import fs from 'fs/promises';
import path from 'path';
/**
 * One row of the LanceDB table written by the indexer.
 *
 * Every list-like piece of parsed metadata is flattened to a single string
 * before it is stored; only `vector` is a non-string column.
 */
export interface EmbeddedDocument {
  /** Row identifier: the source name followed by `_` and the flattened relative path. */
  id: string;
  /** Document title as reported by the parser. */
  title: string;
  /** Document description as reported by the parser. */
  description: string;
  /** Document body, truncated by the indexer before storage. */
  content: string;
  /** Parsed keywords joined with `', '`. */
  keywords: string;
  /** Category as reported by the parser. */
  category: string;
  /** Breadcrumb trail joined with `' > '`. */
  breadcrumbs: string;
  /** File path as reported by the parser. */
  filePath: string;
  /** Public URL built for the document from its source configuration. */
  url: string;
  /** Name of the `DocumentSource` the document came from. */
  source: string;
  /** Heading texts joined with `' | '`. */
  headings: string;
  /** The first few code blocks, separated by `'\n---\n'`. */
  codeSnippets: string;
  /** Playground identifiers joined with `', '`. */
  playgroundIds: string;
  /** Last-modified timestamp as an ISO-8601 string. */
  lastModified: string;
  /** Embedding produced for the document's summary text. */
  vector: number[];
}
/**
 * Describes one directory tree of documentation to be indexed.
 */
export interface DocumentSource {
  /**
   * Label for the source. It prefixes every document ID, is stored in the
   * `source` column, and selects the URL scheme (`'source-repo'` is
   * special-cased by the indexer).
   */
  name: string;
  /** Root directory that is scanned recursively for documentation files. */
  path: string;
  /** Base URL handed to the parser and used when building document URLs. */
  urlPrefix: string;
}
/**
 * Builds a LanceDB table of embedded documentation.
 *
 * Usage: construct, `await initialize()`, then `await indexDocuments()`.
 * Each configured source directory is walked for `.md` and `page.tsx` files,
 * every file is parsed and embedded, and the resulting rows replace the
 * configured table.
 */
export class LanceDBIndexer {
  // `any` because the connection / pipeline types are not imported in this file.
  private db: any;
  private embedder: any;
  private parser: DocumentParser;
  private readonly dbPath: string;
  private readonly tableName: string;
  private readonly sources: DocumentSource[];

  /**
   * @param dbPath Location passed to LanceDB `connect`.
   * @param sources Directory trees to index; defaults to the main documentation repo.
   * @param tableName Name of the table that `indexDocuments` replaces.
   */
  constructor(
    dbPath: string = './data/lancedb',
    sources: DocumentSource[] = [
      {
        name: 'documentation',
        path: './data/repositories/Documentation/content',
        urlPrefix: 'https://doc.babylonjs.com',
      },
    ],
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.sources = sources;
    this.tableName = tableName;
    this.parser = new DocumentParser();
  }

  /**
   * Opens the database connection and loads the embedding model.
   * Must complete before `indexDocuments` is called.
   */
  async initialize(): Promise<void> {
    console.log('Initializing LanceDB connection...');
    this.db = await connect(this.dbPath);

    // Informational only: this reports the env var, it does not configure the backend.
    const backend = process.env.ONNXRUNTIME_BACKEND;
    if (backend === 'wasm') {
      console.log('Using WASM backend for Transformers.js (Alpine/musl compatibility mode)');
    } else {
      console.log('Using native ONNX Runtime backend (glibc required)');
    }

    console.log('Loading embedding model (this may take a moment)...');
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
    console.log('Embedding model loaded successfully');
  }

  /**
   * Parses and embeds every file of every source, then replaces the table.
   *
   * A file that fails to parse or embed is logged and skipped. If no document
   * at all could be produced, the method throws *before* touching the existing
   * table so that a previously built index is not destroyed.
   *
   * @throws Error if `initialize()` has not been called, or if nothing was indexed.
   */
  async indexDocuments(): Promise<void> {
    if (!this.embedder) {
      throw new Error('Indexer not initialized. Call initialize() first.');
    }

    const allDocuments: EmbeddedDocument[] = [];

    for (const source of this.sources) {
      console.log(`\nProcessing source: ${source.name}`);
      console.log(`Path: ${source.path}`);
      console.log('Finding documentation files...');
      const docFiles = await this.findDocumentationFiles(source.path);
      console.log(`Found ${docFiles.length} files in ${source.name}`);
      console.log('Parsing and embedding documents...');

      let failedCount = 0;
      for (const [i, filePath] of docFiles.entries()) {
        try {
          const doc = await this.processDocument(filePath, source);
          allDocuments.push(doc);
          if ((i + 1) % 50 === 0) {
            console.log(`Processed ${i + 1}/${docFiles.length} documents from ${source.name}`);
          }
        } catch (error) {
          // Best effort: one bad file must not abort the whole run.
          failedCount++;
          console.error(`Error processing ${filePath}:`, error);
        }
      }

      console.log(`✓ Completed ${source.name}: ${docFiles.length} files processed`);
      if (failedCount > 0) {
        console.warn(`${failedCount} of ${docFiles.length} files from ${source.name} failed and were skipped`);
      }
    }

    console.log(`\nTotal documents processed: ${allDocuments.length}`);

    // Bail out before dropping anything: with zero rows there is nothing to
    // build a replacement table from (NOTE(review): LanceDB is expected to
    // reject empty data without a schema — confirm for the installed version).
    if (allDocuments.length === 0) {
      throw new Error('No documents were indexed; existing table left untouched.');
    }

    console.log('Creating LanceDB table...');
    const tableNames = await this.db.tableNames();
    if (tableNames.includes(this.tableName)) {
      await this.db.dropTable(this.tableName);
    }
    await this.db.createTable(this.tableName, allDocuments);
    console.log('Indexing complete!');
  }

  /**
   * Parses one file, embeds its summary text and flattens the metadata into a row.
   */
  private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
    const metadata = await this.parser.parseFile(filePath, source.urlPrefix);
    const embeddingText = this.createEmbeddingText(metadata);
    const vector = await this.generateEmbedding(embeddingText);

    return {
      id: this.generateDocId(filePath, source),
      title: metadata.title,
      description: metadata.description,
      content: this.truncateContent(metadata.content, 20000),
      keywords: metadata.keywords.join(', '),
      category: metadata.category,
      breadcrumbs: metadata.breadcrumbs.join(' > '),
      filePath: metadata.filePath,
      url: this.generateDocUrl(metadata, source),
      source: source.name,
      headings: metadata.headings.map(h => h.text).join(' | '),
      // Only the first three code blocks are stored.
      codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
      playgroundIds: metadata.playgroundIds.join(', '),
      lastModified: metadata.lastModified.toISOString(),
      vector,
    };
  }

  /**
   * Builds the text that is embedded: title, description, keywords, the first
   * five headings and the first 500 characters of content, space-separated.
   * Empty parts are dropped.
   */
  private createEmbeddingText(metadata: DocumentMetadata): string {
    const parts = [
      metadata.title,
      metadata.description,
      metadata.keywords.join(' '),
      metadata.headings.slice(0, 5).map(h => h.text).join(' '),
      this.truncateContent(metadata.content, 500),
    ];
    return parts.filter(Boolean).join(' ');
  }

  /**
   * Runs the embedding pipeline with mean pooling and normalization.
   *
   * @throws Error if the embedder has not been loaded.
   */
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }
    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });
    return Array.from(result.data);
  }

  /**
   * Recursively collects `.md` files and files named exactly `page.tsx`
   * (Editor documentation) below `dir`.
   */
  private async findDocumentationFiles(dir: string): Promise<string[]> {
    const files: string[] = [];
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        const subFiles = await this.findDocumentationFiles(fullPath);
        files.push(...subFiles);
      } else if (entry.isFile()) {
        if (entry.name.endsWith('.md') || entry.name === 'page.tsx') {
          files.push(fullPath);
        }
      }
    }
    return files;
  }

  /**
   * Returns `filePath` relative to `source.path`, using `/` separators.
   *
   * Uses `path.relative` so that `./` prefixes, platform separators and regex
   * metacharacters in the source path cannot break the match. When the file
   * does not resolve to a location under the source root, falls back to a
   * literal search for the source path inside `filePath` (last occurrence),
   * and finally to the normalized `filePath` itself.
   */
  private relativeDocPath(filePath: string, source: DocumentSource): string {
    const toPosix = (p: string): string => p.split(path.sep).join('/');

    const relative = path.relative(source.path, filePath);
    const escapesRoot = relative === '..' || relative.startsWith(`..${path.sep}`);
    if (relative !== '' && !escapesRoot && !path.isAbsolute(relative)) {
      return toPosix(relative);
    }

    const normalizedFile = toPosix(filePath);
    const marker = `${toPosix(path.normalize(source.path)).replace(/\/+$/, '')}/`;
    const at = normalizedFile.lastIndexOf(marker);
    return at === -1 ? normalizedFile : normalizedFile.slice(at + marker.length);
  }

  /**
   * Builds the row ID: `<source name>_<relative path>` with the `.md`
   * extension or trailing `/page.tsx` removed and `/` replaced by `_`.
   */
  private generateDocId(filePath: string, source: DocumentSource): string {
    const slug = this.relativeDocPath(filePath, source)
      .replace(/\.md$/i, '')
      .replace(/\/page\.tsx$/i, '') // Editor docs: the directory is the page
      .replace(/\//g, '_');
    return `${source.name}_${slug}`;
  }

  /**
   * Builds the public URL for a document.
   *
   * `source-repo` documents link to the file on GitHub (real relative path,
   * extension included); every other source gets `urlPrefix/<slug>` where the
   * slug has `.md` or a trailing `/page.tsx` removed.
   */
  private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
    const relativePath = this.relativeDocPath(metadata.filePath, source);

    if (source.name === 'source-repo') {
      return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}`;
    }

    const slug = relativePath
      .replace(/\.md$/i, '')
      .replace(/\/page\.tsx$/i, '');
    return `${source.urlPrefix}/${slug}`;
  }

  /**
   * Returns `content` unchanged when it fits, otherwise its first
   * `maxLength` characters followed by `'...'`.
   */
  private truncateContent(content: string, maxLength: number): string {
    if (content.length <= maxLength) return content;
    return content.substring(0, maxLength) + '...';
  }

  /** Logs that the indexer is closed; no resources are released here. */
  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
    console.log('Indexer closed');
  }
}