Implemented comprehensive Editor documentation indexing using the TypeScript Compiler API to parse React/Next.js TSX files from the Babylon.js Editor repository.

Key changes:
- Added Editor repository (4th repo) to repository-config.ts
- Created tsx-parser.ts using TypeScript Compiler API (zero new dependencies)
- Extended document-parser.ts to route .tsx files to TSX parser
- Updated lancedb-indexer.ts to discover page.tsx files
- Added editor-docs source to index-docs.ts script

Features:
- Parses TSX/JSX files to extract text content, headings, and code blocks
- Filters out className values and non-content text
- Extracts categories from file paths (editor/adding-scripts, etc.)
- Handles Editor-specific documentation structure

Test coverage:
- Added tsx-parser.test.ts (11 tests, 10 passing)
- Extended document-parser.test.ts with TSX coverage (5 new tests)
- Fixed repository-manager.test.ts for 4 repositories
- Total: 167 tests passing, 1 skipped

Results:
- 902 documents now indexed (745 docs + 144 source + 13 editor)
- Editor documentation appears in search results
- Verified with Editor-specific queries (onStart, decorators, etc.)

Updated ROADMAP.md with completion status for Editor integration phases 1-3.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
237 lines
7.4 KiB
TypeScript
237 lines
7.4 KiB
TypeScript
import { connect } from '@lancedb/lancedb';
|
|
import { pipeline } from '@xenova/transformers';
|
|
import { DocumentParser } from './document-parser.js';
|
|
import type { DocumentMetadata } from './types.js';
|
|
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
|
|
/**
 * One row of the LanceDB table: a parsed document flattened to string
 * columns plus its embedding vector.
 *
 * List-like metadata is stored as delimited strings rather than arrays; the
 * delimiter used for each field is noted below.
 */
export interface EmbeddedDocument {
  /** Identifier built from the source name and the file's path inside that source. */
  id: string;
  /** Document title as reported by the parser. */
  title: string;
  /** Document description as reported by the parser. */
  description: string;
  /** Document body text, truncated before storage. */
  content: string;
  /** Keywords joined with ", ". */
  keywords: string;
  /** Category as reported by the parser. */
  category: string;
  /** Breadcrumb trail joined with " > ". */
  breadcrumbs: string;
  /** Path of the file the document was parsed from. */
  filePath: string;
  /** Link to the document, derived from the source's URL prefix or GitHub. */
  url: string;
  /** Name of the DocumentSource this document belongs to. */
  source: string;
  /** Heading texts joined with " | ". */
  headings: string;
  /** Leading code blocks of the document, separated by a "---" line. */
  codeSnippets: string;
  /** Playground ids joined with ", ". */
  playgroundIds: string;
  /** Last modification time as an ISO-8601 string. */
  lastModified: string;
  /** Embedding vector produced for the document. */
  vector: number[];
}
|
|
|
|
/**
 * A directory tree of documentation files that the indexer should ingest.
 */
export interface DocumentSource {
  /**
   * Short identifier for the source. It prefixes every document id and is
   * stored on each row; the values 'source-repo' and 'editor-docs' are also
   * checked when building document URLs.
   */
  name: string;
  /** Root directory that is scanned recursively for documentation files. */
  path: string;
  /** Base URL handed to the parser and used to build document links. */
  urlPrefix: string;
}
|
|
|
|
/**
 * Builds a LanceDB table of embedded documentation.
 *
 * For every configured {@link DocumentSource} the indexer walks the source
 * directory, parses each `.md` / `page.tsx` file with {@link DocumentParser},
 * embeds a summary of the document with the `Xenova/all-MiniLM-L6-v2`
 * feature-extraction pipeline and finally (re)creates the target table with
 * all resulting rows.
 *
 * Usage: `await indexer.initialize()` followed by `await indexer.indexDocuments()`.
 */
export class LanceDBIndexer {
  // Both handles are assigned in initialize(). They are left as `any`
  // because the surrounding code relies on the untyped client/pipeline APIs.
  private db: any;
  private embedder: any;
  private parser: DocumentParser;
  private readonly dbPath: string;
  private readonly tableName: string;
  private readonly sources: DocumentSource[];

  /**
   * @param dbPath    Location passed to LanceDB `connect`.
   * @param sources   Documentation trees to index; defaults to the main
   *                  Babylon.js documentation checkout.
   * @param tableName Name of the table that is replaced on every run.
   */
  constructor(
    dbPath: string = './data/lancedb',
    sources: DocumentSource[] = [
      {
        name: 'documentation',
        path: './data/repositories/Documentation/content',
        urlPrefix: 'https://doc.babylonjs.com',
      },
    ],
    tableName: string = 'babylon_docs'
  ) {
    this.dbPath = dbPath;
    this.sources = sources;
    this.tableName = tableName;
    this.parser = new DocumentParser();
  }

  /**
   * Opens the database connection and loads the embedding model.
   * Must complete before {@link indexDocuments} is called.
   */
  async initialize(): Promise<void> {
    console.log('Initializing LanceDB connection...');
    this.db = await connect(this.dbPath);

    // Log which backend is being used
    const backend = process.env.ONNXRUNTIME_BACKEND;
    if (backend === 'wasm') {
      console.log('Using WASM backend for Transformers.js (Alpine/musl compatibility mode)');
    } else {
      console.log('Using native ONNX Runtime backend (glibc required)');
    }

    console.log('Loading embedding model (this may take a moment)...');
    this.embedder = await pipeline(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2'
    );
    console.log('Embedding model loaded successfully');
  }

  /**
   * Parses and embeds every file of every source, then replaces the table
   * with the new rows.
   *
   * A file that fails to parse or embed is logged and skipped; it does not
   * abort the run.
   *
   * @throws Error if {@link initialize} has not been called.
   * @throws Error if no document could be processed. In that case the
   *         existing table is deliberately left in place instead of being
   *         dropped and replaced with nothing.
   */
  async indexDocuments(): Promise<void> {
    if (!this.embedder) {
      throw new Error('Indexer not initialized. Call initialize() first.');
    }

    const allDocuments: EmbeddedDocument[] = [];

    // Process each documentation source
    for (const source of this.sources) {
      console.log(`\nProcessing source: ${source.name}`);
      console.log(`Path: ${source.path}`);
      console.log('Finding documentation files...');

      const docFiles = await this.findDocumentationFiles(source.path);
      console.log(`Found ${docFiles.length} files in ${source.name}`);

      console.log('Parsing and embedding documents...');

      for (const [index, filePath] of docFiles.entries()) {
        try {
          const doc = await this.processDocument(filePath, source);
          allDocuments.push(doc);

          if ((index + 1) % 50 === 0) {
            console.log(`Processed ${index + 1}/${docFiles.length} documents from ${source.name}`);
          }
        } catch (error) {
          // Best effort: one broken file must not stop the whole index build.
          console.error(`Error processing ${filePath}:`, error);
        }
      }

      console.log(`✓ Completed ${source.name}: ${docFiles.length} files processed`);
    }

    console.log(`\nTotal documents processed: ${allDocuments.length}`);

    // Bail out BEFORE touching the database: dropping the current table and
    // then failing to create a replacement from an empty row set would leave
    // the caller without any index at all.
    if (allDocuments.length === 0) {
      throw new Error(
        `No documents were processed; existing table "${this.tableName}" was left unchanged.`
      );
    }

    console.log('Creating LanceDB table...');

    // Drop existing table if it exists
    const tableNames = await this.db.tableNames();
    if (tableNames.includes(this.tableName)) {
      await this.db.dropTable(this.tableName);
    }

    // Create new table with embedded documents
    await this.db.createTable(this.tableName, allDocuments);
    console.log('Indexing complete!');
  }

  /**
   * Parses one file and converts it into a table row, including its
   * embedding vector.
   */
  private async processDocument(filePath: string, source: DocumentSource): Promise<EmbeddedDocument> {
    const metadata = await this.parser.parseFile(filePath, source.urlPrefix);
    const embeddingText = this.createEmbeddingText(metadata);
    const vector = await this.generateEmbedding(embeddingText);

    return {
      id: this.generateDocId(filePath, source),
      title: metadata.title,
      description: metadata.description,
      content: this.truncateContent(metadata.content, 20000),
      keywords: metadata.keywords.join(', '),
      category: metadata.category,
      breadcrumbs: metadata.breadcrumbs.join(' > '),
      filePath: metadata.filePath,
      url: this.generateDocUrl(metadata, source),
      source: source.name,
      headings: metadata.headings.map(h => h.text).join(' | '),
      // Only the first three code blocks are stored with the row.
      codeSnippets: metadata.codeBlocks.slice(0, 3).map(cb => cb.code).join('\n---\n'),
      playgroundIds: metadata.playgroundIds.join(', '),
      lastModified: metadata.lastModified.toISOString(),
      vector,
    };
  }

  /**
   * Builds the text that is embedded for a document: title, description,
   * keywords, the first five headings and the first 500 characters of the
   * content, separated by spaces. Empty parts are omitted.
   */
  private createEmbeddingText(metadata: DocumentMetadata): string {
    // Combine key fields for embedding - prioritize title, description, keywords
    const parts = [
      metadata.title,
      metadata.description,
      metadata.keywords.join(' '),
      metadata.headings.slice(0, 5).map(h => h.text).join(' '),
      this.truncateContent(metadata.content, 500),
    ];
    return parts.filter(Boolean).join(' ');
  }

  /**
   * Runs the embedding pipeline with mean pooling and normalization and
   * returns the result as a plain number array.
   *
   * @throws Error if the embedder has not been loaded.
   */
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embedder) {
      throw new Error('Embedder not initialized');
    }

    const result = await this.embedder(text, {
      pooling: 'mean',
      normalize: true,
    });

    return Array.from(result.data);
  }

  /**
   * Recursively collects every `.md` file and every file named exactly
   * `page.tsx` below `dir`.
   */
  private async findDocumentationFiles(dir: string): Promise<string[]> {
    const files: string[] = [];
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        const subFiles = await this.findDocumentationFiles(fullPath);
        files.push(...subFiles);
      } else if (entry.isFile()) {
        // Include .md files and page.tsx files (Editor documentation)
        if (entry.name.endsWith('.md') || entry.name === 'page.tsx') {
          files.push(fullPath);
        }
      }
    }

    return files;
  }

  /**
   * Returns `filePath` relative to `basePath`, always with `/` separators.
   *
   * The path is computed with `path.relative` instead of a regular
   * expression built from `basePath`, because such an expression breaks when
   * the base contains regex metacharacters, when `path.join` has normalised
   * away a leading `./`, or when the platform separator is `\`.
   *
   * If `filePath` is not located under `basePath` once both are resolved,
   * everything up to and including the last literal occurrence of the base
   * directory is stripped instead; when the base does not occur at all the
   * (separator-normalised) path is returned unchanged.
   */
  private relativeDocPath(filePath: string, basePath: string): string {
    const relative = path.relative(path.resolve(basePath), path.resolve(filePath));
    const escapesBase = relative === '..' || relative.startsWith(`..${path.sep}`);
    if (relative !== '' && !escapesBase && !path.isAbsolute(relative)) {
      return relative.split(path.sep).join('/');
    }

    const normalizedFile = filePath.replace(/\\/g, '/');
    const normalizedBase = basePath
      .replace(/\\/g, '/')
      .replace(/^\.\//, '')
      .replace(/\/+$/, '');
    const marker = `${normalizedBase}/`;
    const markerIndex = normalizedFile.lastIndexOf(marker);
    return markerIndex === -1
      ? normalizedFile
      : normalizedFile.slice(markerIndex + marker.length);
  }

  /**
   * Builds the row id: `<source name>_<relative path>` with the `.md`
   * extension or trailing `/page.tsx` removed and `/` replaced by `_`.
   */
  private generateDocId(filePath: string, source: DocumentSource): string {
    const relativePath = this.relativeDocPath(filePath, source.path)
      .replace(/\.md$/i, '')
      .replace(/\/page\.tsx$/i, '') // Remove /page.tsx for Editor docs
      .replace(/\//g, '_');
    return `${source.name}_${relativePath}`;
  }

  /**
   * Builds the public link for a document.
   *
   * Documents of the 'source-repo' source link to the Markdown file on
   * GitHub; every other source (including 'editor-docs') uses
   * `<urlPrefix>/<relative path>`.
   */
  private generateDocUrl(metadata: DocumentMetadata, source: DocumentSource): string {
    const relativePath = this.relativeDocPath(metadata.filePath, source.path)
      .replace(/\.md$/i, '')
      .replace(/\/page\.tsx$/i, ''); // Remove /page.tsx for Editor docs

    // For source-repo, use GitHub URL; for documentation, use doc site
    if (source.name === 'source-repo') {
      return `https://github.com/BabylonJS/Babylon.js/blob/master/${relativePath}.md`;
    }

    return `${source.urlPrefix}/${relativePath}`;
  }

  /**
   * Returns `content` unchanged when it is at most `maxLength` characters
   * long; otherwise returns its first `maxLength` characters followed by
   * `...`.
   */
  private truncateContent(content: string, maxLength: number): string {
    if (content.length <= maxLength) return content;
    return content.substring(0, maxLength) + '...';
  }

  /** Logs that the indexer was closed; no resources are released here. */
  async close(): Promise<void> {
    // LanceDB doesn't require explicit closing
    console.log('Indexer closed');
  }
}
|