/**
 * Indexes the Babylon.js Editor source tree into a LanceDB table.
 *
 * Every TypeScript/JavaScript file of the configured packages is split into
 * overlapping line-based chunks, each chunk is embedded with the
 * `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline, and the resulting
 * rows replace the `babylon_editor_source` table.
 */
import { connect } from '@lancedb/lancedb';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// The backend override has to be in place before @xenova/transformers is
// evaluated. Static `import` declarations are hoisted and evaluated before any
// statement of this module runs, so the library is loaded lazily with
// `import()` inside main() instead of statically up here.
// NOTE(review): how ONNXRUNTIME_BACKEND is consumed is not visible in this
// file; the intent (per the original comment) is to keep onnxruntime-node from
// being loaded on Alpine Linux (musl libc) — confirm against the library.
if (
  process.env.TRANSFORMERS_BACKEND === 'wasm' ||
  process.env.TRANSFORMERS_BACKEND === 'onnxruntime-web'
) {
  process.env.ONNXRUNTIME_BACKEND = 'wasm';
}

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** One row of the LanceDB table: a slice of a source file plus its embedding. */
interface SourceCodeChunk {
  id: string;
  filePath: string;
  package: string;
  content: string;
  startLine: number;
  endLine: number;
  language: string;
  imports: string;
  exports: string;
  url: string;
  vector: number[];
}

/** Maximum number of lines per chunk. */
const CHUNK_SIZE = 200;
/** Number of lines shared between two consecutive chunks. */
const CHUNK_OVERLAP = 20;
/** Distance, in lines, between the starts of two consecutive chunks. */
const CHUNK_STEP = CHUNK_SIZE - CHUNK_OVERLAP;

/** Directory names that are never descended into while collecting files. */
const IGNORED_DIRECTORIES = new Set(['node_modules', 'dist', 'build', 'lib', '.git', 'declaration']);
/** File names that count as indexable source code. */
const SOURCE_FILE_REGEX = /\.(ts|tsx|js|jsx)$/;

// Lazy, quote-free import clause: a greedy clause would run across several
// statements in semicolon-free code and only report the last module.
const IMPORT_REGEX = /import\s+[^'";]+?\s+from\s+['"]([^'"]+)['"]/g;

// First alternative: declarations, with optional `default` and modifiers, so
// `export default function foo` yields `foo` rather than `function`.
// Second alternative: `export default someIdentifier;`.
const EXPORT_REGEX =
  /export\s+(?:default\s+)?(?:(?:declare|abstract|async)\s+)*(?:class|function\*?|interface|type|const\s+enum|const|let|var|enum)\s+([A-Za-z_$][A-Za-z0-9_$]*)|export\s+default\s+(?!(?:class|function|async|abstract|new|await)\b)([A-Za-z_$][A-Za-z0-9_$]*)\s*?(?:;|$)/gm;

const SINGLE_LINE_COMMENT_REGEX = /\/\/\s*(.+)$/gm;
const MULTI_LINE_COMMENT_REGEX = /\/\*\*?([\s\S]*?)\*\//g;

/**
 * Recursively collects the `.ts`, `.tsx`, `.js` and `.jsx` files below `dir`.
 *
 * @param dir Directory to scan.
 * @returns Full paths of every matching file. A directory that cannot be read
 *          is reported with a warning and contributes no files.
 */
async function getAllSourceFiles(dir: string): Promise<string[]> {
  const files: string[] = [];

  try {
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        if (!IGNORED_DIRECTORIES.has(entry.name)) {
          files.push(...(await getAllSourceFiles(fullPath)));
        }
      } else if (entry.isFile() && SOURCE_FILE_REGEX.test(entry.name)) {
        files.push(fullPath);
      }
    }
  } catch (error) {
    // Best effort: an unreadable or missing directory must not abort the run,
    // but it should not go unnoticed either.
    console.warn(`Skipping unreadable directory ${dir}:`, error);
    return [];
  }

  return files;
}

/**
 * Lists the module specifiers of the `import ... from '...'` statements.
 *
 * @param content Full text of a source file.
 * @returns At most 20 specifiers, joined with `', '`.
 */
function extractImports(content: string): string {
  const imports: string[] = [];

  for (const match of content.matchAll(IMPORT_REGEX)) {
    if (match[1]) {
      imports.push(match[1]);
    }
  }

  return imports.slice(0, 20).join(', ');
}

/**
 * Lists the names of exported declarations found in a source file.
 *
 * @param content Full text of a source file.
 * @returns At most 20 exported identifiers, joined with `', '`.
 */
function extractExports(content: string): string {
  const exports: string[] = [];

  for (const match of content.matchAll(EXPORT_REGEX)) {
    const name = match[1] ?? match[2];
    if (name) {
      exports.push(name);
    }
  }

  return exports.slice(0, 20).join(', ');
}

/**
 * Gathers comment text from a piece of code: line comments first, then block
 * comments.
 *
 * @param code Source text to scan.
 * @returns The first 5 collected comments, trimmed and joined with a space.
 */
function extractComments(code: string): string {
  const comments: string[] = [];

  for (const regex of [SINGLE_LINE_COMMENT_REGEX, MULTI_LINE_COMMENT_REGEX]) {
    for (const match of code.matchAll(regex)) {
      if (match[1]) {
        comments.push(match[1].trim());
      }
    }
  }

  return comments.slice(0, 5).join(' ');
}

/**
 * Computes the zero-based, end-exclusive line ranges of the chunks of a file.
 *
 * Iteration stops as soon as a chunk reaches the last line, so no trailing
 * chunk is produced that is entirely contained in the previous one.
 *
 * @param lineCount Number of lines in the file.
 */
function computeChunkRanges(lineCount: number): Array<{ start: number; end: number }> {
  const ranges: Array<{ start: number; end: number }> = [];

  for (let start = 0; start < lineCount; start += CHUNK_STEP) {
    const end = Math.min(start + CHUNK_SIZE, lineCount);
    ranges.push({ start, end });
    if (end === lineCount) {
      break;
    }
  }

  return ranges;
}

/**
 * Entry point: embeds every chunk of every source file of the configured
 * packages and rewrites the LanceDB table with the result.
 *
 * @throws Error when no chunk could be produced; the existing table is left
 *         untouched in that case.
 */
async function main(): Promise<void> {
  const projectRoot = path.join(__dirname, '..');
  const dbPath = path.join(projectRoot, 'data', 'lancedb');
  const repositoryPath = path.join(projectRoot, 'data', 'repositories', 'Editor');
  const tableName = 'babylon_editor_source';

  // Editor packages with their source paths (relative to repo root)
  const packages = [
    { name: 'editor', srcPath: 'editor/src' },
    { name: 'tools', srcPath: 'tools/src' },
    { name: 'website', srcPath: 'website/src' },
  ];

  console.log('Starting Editor source code indexing...');
  console.log(`Database path: ${dbPath}`);
  console.log(`Repository path: ${repositoryPath}`);
  console.log(`Packages: ${packages.map((p) => p.name).join(', ')}`);
  console.log();

  // Initialize LanceDB
  console.log('Initializing LanceDB connection...');
  const db = await connect(dbPath);

  // Load embedding model. The library is imported here, at runtime, so that
  // the environment override at the top of the file is already applied.
  console.log('Loading embedding model...');
  const { pipeline } = await import('@xenova/transformers');
  const embedder = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
  console.log('Embedding model loaded');

  const chunks: SourceCodeChunk[] = [];
  let totalFiles = 0;

  for (const pkg of packages) {
    console.log(`\nIndexing package: ${pkg.name}...`);
    const packagePath = path.join(repositoryPath, pkg.srcPath);

    try {
      const files = await getAllSourceFiles(packagePath);
      console.log(`Found ${files.length} source files in ${pkg.name}`);

      for (const file of files) {
        try {
          const content = await fs.readFile(file, 'utf-8');
          const lines = content.split('\n');
          const imports = extractImports(content);
          const exports = extractExports(content);
          const language =
            file.endsWith('.ts') || file.endsWith('.tsx') ? 'typescript' : 'javascript';

          // Always use forward slashes so ids, stored paths and GitHub URLs
          // are identical whatever platform the indexer runs on.
          const relativePath = path.relative(repositoryPath, file).split(path.sep).join('/');
          const fileName = path.basename(file);
          const dirName = path.posix.basename(path.posix.dirname(relativePath));

          // Chunk the file
          for (const { start, end } of computeChunkRanges(lines.length)) {
            const startLine = start + 1; // 1-based, inclusive
            const endLine = end; // 1-based, inclusive
            const chunkContent = lines.slice(start, end).join('\n');

            if (chunkContent.trim().length === 0) {
              continue;
            }

            // Create embedding text: file context, comments, then a bounded
            // prefix of the code itself.
            const comments = extractComments(chunkContent);
            const embeddingText = `${fileName} ${dirName} ${comments} ${chunkContent.substring(0, 1000)}`;

            // Generate embedding
            const result = await embedder(embeddingText, {
              pooling: 'mean',
              normalize: true,
            });
            const vector = Array.from(result.data) as number[];

            // Generate GitHub URL for Editor repo
            const url = `https://github.com/BabylonJS/Editor/blob/master/${relativePath}#L${startLine}-L${endLine}`;

            chunks.push({
              id: `${relativePath}:${startLine}-${endLine}`,
              filePath: relativePath,
              package: pkg.name,
              content: chunkContent,
              startLine,
              endLine,
              language,
              imports,
              exports,
              url,
              vector,
            });
          }

          totalFiles++;
          if (totalFiles % 50 === 0) {
            console.log(`Processed ${totalFiles} files, ${chunks.length} chunks...`);
          }
        } catch (error) {
          // A single broken file must not stop the whole indexing run.
          console.error(`Error processing ${file}:`, error);
        }
      }
    } catch (error) {
      console.error(`Error indexing package ${pkg.name}:`, error);
    }
  }

  console.log(`\nTotal files processed: ${totalFiles}`);
  console.log(`Total source code chunks: ${chunks.length}`);

  // Bail out before touching the database: dropping the current table and then
  // failing to create a table from zero rows would destroy a working index.
  if (chunks.length === 0) {
    throw new Error(
      `No source code chunks were produced from ${repositoryPath}; leaving any existing "${tableName}" table untouched.`,
    );
  }

  console.log('Creating LanceDB table...');

  // Drop existing table if it exists
  const tableNames = await db.tableNames();
  if (tableNames.includes(tableName)) {
    await db.dropTable(tableName);
  }

  // Create new table
  await db.createTable(tableName, chunks);

  console.log('\n✓ Editor source code indexing completed successfully!');
}

main().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the calling shell / CI job.
  process.exitCode = 1;
});