babylon-mcp/src/search/document-parser.ts
Michael Mainguy 005a17f345 Add Babylon.js Editor documentation integration with TSX parser
Implemented comprehensive Editor documentation indexing using TypeScript Compiler API
to parse React/Next.js TSX files from the Babylon.js Editor repository.

Key changes:
- Added Editor repository (4th repo) to repository-config.ts
- Created tsx-parser.ts using TypeScript Compiler API (zero new dependencies)
- Extended document-parser.ts to route .tsx files to TSX parser
- Updated lancedb-indexer.ts to discover page.tsx files
- Added editor-docs source to index-docs.ts script

Features:
- Parses TSX/JSX files to extract text content, headings, and code blocks
- Filters out className values and non-content text
- Extracts categories from file paths (editor/adding-scripts, etc.)
- Handles Editor-specific documentation structure

Test coverage:
- Added tsx-parser.test.ts (11 tests, 10 passing)
- Extended document-parser.test.ts with TSX coverage (5 new tests)
- Fixed repository-manager.test.ts for 4 repositories
- Total: 167 tests passing, 1 skipped

Results:
- 902 documents now indexed (745 docs + 144 source + 13 editor)
- Editor documentation appears in search results
- Verified with Editor-specific queries (onStart, decorators, etc.)

Updated ROADMAP.md with completion status for Editor integration phases 1-3.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 09:20:56 -06:00

116 lines
3.4 KiB
TypeScript

import matter from 'gray-matter';
import fs from 'fs/promises';
import path from 'path';
import type { DocumentMetadata, Heading, CodeBlock } from './types.js';
import { TsxParser } from './tsx-parser.js';
/**
 * Parses documentation files into `DocumentMetadata` records.
 *
 * `.tsx` files are delegated to `TsxParser`; every other file is read as
 * Markdown with optional front matter.
 */
export class DocumentParser {
  private readonly tsxParser: TsxParser;

  constructor() {
    this.tsxParser = new TsxParser();
  }

  /**
   * Parses a single documentation file.
   *
   * @param filePath - Path of the file to parse; its extension selects the parser.
   * @param urlPrefix - Forwarded to the TSX parser (defaults to `''`); unused for Markdown.
   * @returns Metadata extracted from the front matter and the document body.
   * @throws Whatever `fs.readFile` / `fs.stat` reject with when the file cannot be accessed.
   */
  async parseFile(filePath: string, urlPrefix?: string): Promise<DocumentMetadata> {
    const ext = path.extname(filePath).toLowerCase();

    // Route to TSX parser for .tsx files
    if (ext === '.tsx') {
      return this.tsxParser.parseFile(filePath, urlPrefix || '');
    }

    // Default markdown parsing for every other extension
    const content = await fs.readFile(filePath, 'utf-8');
    const { data, content: markdown } = matter(content);

    return {
      filePath,
      title: data.title || '',
      description: data.description || '',
      keywords: this.parseKeywords(data.keywords),
      category: this.extractCategory(filePath),
      breadcrumbs: this.extractBreadcrumbs(filePath),
      content: markdown,
      headings: this.extractHeadings(markdown),
      codeBlocks: this.extractCodeBlocks(markdown),
      furtherReading: data['further-reading'] || [],
      playgroundIds: this.extractPlaygroundIds(markdown),
      lastModified: await this.getFileModifiedDate(filePath),
    };
  }

  /**
   * Normalises the front-matter `keywords` value to a list of trimmed,
   * non-empty strings.
   *
   * Accepts either a comma-separated string or a list of strings (front
   * matter is parsed data, so the value is not guaranteed to be a string).
   * Any other value yields an empty list instead of throwing.
   */
  private parseKeywords(keywords: unknown): string[] {
    let parts: string[];
    if (typeof keywords === 'string') {
      parts = keywords.split(',');
    } else if (Array.isArray(keywords)) {
      parts = keywords.filter((k): k is string => typeof k === 'string');
    } else {
      return [];
    }
    return parts.map((k) => k.trim()).filter(Boolean);
  }

  /**
   * Derives the category from the portion of the path after `content/`,
   * without a trailing `.md`. Returns `'uncategorized'` when the path has no
   * `content/` segment.
   */
  private extractCategory(filePath: string): string {
    // Normalise Windows separators so the `content/` lookup works there too.
    const normalized = filePath.replace(/\\/g, '/');
    const match = normalized.match(/content\/([^/]+(?:\/[^/]+)*)/);
    if (!match || !match[1]) return 'uncategorized';

    // Remove .md extension if present
    return match[1].replace(/\.md$/, '');
  }

  /** Splits the category into its non-empty path segments. */
  private extractBreadcrumbs(filePath: string): string[] {
    const category = this.extractCategory(filePath);
    return category.split('/').filter(Boolean);
  }

  /**
   * Collects ATX headings (`#` through `######`) with a slug-style id.
   *
   * Lines inside fenced code blocks (``` or ~~~) are skipped so that
   * `# comments` in code samples are not reported as headings.
   */
  private extractHeadings(markdown: string): Heading[] {
    const headings: Heading[] = [];
    // Marker of the fence we are currently inside; '' when outside any fence.
    let openFence = '';

    for (const rawLine of markdown.split('\n')) {
      const line = rawLine.trim();

      const fence = /^(`{3,}|~{3,})(.*)$/.exec(line);
      if (fence && fence[1] !== undefined && fence[2] !== undefined) {
        const marker = fence[1];
        const rest = fence[2];
        if (openFence === '') {
          // A backtick fence's info string cannot itself contain backticks;
          // this keeps inline ```code``` spans from opening a fence.
          if (marker.startsWith('~') || !rest.includes('`')) {
            openFence = marker;
            continue;
          }
        } else if (
          marker.charAt(0) === openFence.charAt(0) &&
          marker.length >= openFence.length &&
          rest.trim() === ''
        ) {
          // Closing fence: same character, at least as long, nothing after it.
          openFence = '';
          continue;
        }
      }
      if (openFence !== '') continue;

      const match = line.match(/^(#{1,6})\s+(.+)$/);
      if (match && match[1] && match[2]) {
        const level = match[1].length;
        const text = match[2].trim();
        // Slug: lowercase, drop punctuation, collapse whitespace runs to '-'.
        const id = text.toLowerCase().replace(/[^\w\s-]/g, '').replace(/\s+/g, '-');
        headings.push({ level, text, id });
      }
    }

    return headings;
  }

  /**
   * Extracts ```-fenced code blocks.
   *
   * The language is the first token of the fence's info string (defaulting to
   * `'plaintext'`); the rest of the info string is ignored. `lineStart` is the
   * 1-based line number of the opening fence. Both LF and CRLF line endings
   * are accepted.
   */
  private extractCodeBlocks(markdown: string): CodeBlock[] {
    const codeBlocks: CodeBlock[] = [];
    // Consuming the whole info line (and an optional \r) keeps an opener such
    // as ```jsx title="x" from being skipped, which would otherwise pair the
    // closing fence with the next opening fence.
    const regex = /```[ \t]*([\w+#.-]+)?[^\n`]*\r?\n([\s\S]*?)```/g;

    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match.index !== undefined && match[2] !== undefined) {
        const lineStart = markdown.substring(0, match.index).split('\n').length;
        codeBlocks.push({
          language: match[1] || 'plaintext',
          code: match[2].trim(),
          lineStart,
        });
      }
    }

    return codeBlocks;
  }

  /**
   * Collects the ids of `<Playground id="#...">` tags, without the leading `#`,
   * in document order.
   */
  private extractPlaygroundIds(markdown: string): string[] {
    const ids: string[] = [];
    const regex = /<Playground\s+id=["']#([^"']+)["']/g;

    let match;
    while ((match = regex.exec(markdown)) !== null) {
      if (match[1]) {
        ids.push(match[1]);
      }
    }

    return ids;
  }

  /** Returns the file's modification time as reported by `fs.stat`. */
  private async getFileModifiedDate(filePath: string): Promise<Date> {
    const stats = await fs.stat(filePath);
    return stats.mtime;
  }
}