# File: project-standalo-sonic-cloud/skills/documentation-generator/scripts/analyze_project.py
# (490 lines, 18 KiB, Python — viewer metadata, not part of the script)
#!/usr/bin/env python3
"""
Project Analyzer for Documentation Generation
Analyzes project structure and outputs YAML for documentation generation.
"""
import os
import sys
import json
import re
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime
# Try to import yaml, but provide fallback
try:
import yaml
except ImportError:
yaml = None
def detect_project_type(root_path: Path) -> Dict[str, Any]:
    """Detect the project's primary language/ecosystem from config files.

    Args:
        root_path: Project root directory to inspect.

    Returns:
        Dict with 'type' (e.g. 'node', 'python', 'dotnet', 'other') and
        'config_file' (the matched file name, or None when nothing matched).
    """
    # Checked in order; first match wins. Entries containing a glob
    # wildcard (dotnet) are resolved lazily inside the loop.
    # NOTE: the original evaluated the dotnet globs eagerly inside the dict
    # and crashed with IndexError (files[0] on an empty list) for any
    # project that was not node/python/rust/go/java and had no .csproj/.sln.
    indicators = [
        ('node', ['package.json']),
        ('python', ['requirements.txt', 'pyproject.toml', 'setup.py', 'Pipfile']),
        ('rust', ['Cargo.toml']),
        ('go', ['go.mod']),
        ('java', ['pom.xml', 'build.gradle', 'build.gradle.kts']),
        ('dotnet', ['*.csproj', '*.sln']),
        ('ruby', ['Gemfile']),
        ('php', ['composer.json']),
    ]
    for lang, patterns in indicators:
        for pattern in patterns:
            if '*' in pattern:
                # Sorted for deterministic choice when several files match.
                matches = sorted(root_path.glob(pattern))
                if matches:
                    return {'type': lang, 'config_file': matches[0].name}
            elif (root_path / pattern).exists():
                return {'type': lang, 'config_file': pattern}
    return {'type': 'other', 'config_file': None}
def parse_package_json(root_path: Path) -> Dict[str, Any]:
    """Parse package.json and summarize a Node.js project's tech stack.

    Args:
        root_path: Project root expected to contain package.json.

    Returns:
        Dict with name/version/description, detected framework, database,
        UI framework, categorized key dependencies (max 15), and scripts.
        Empty dict when package.json is missing, unreadable, or malformed.
    """
    pkg_path = root_path / 'package.json'
    if not pkg_path.exists():
        return {}
    try:
        with open(pkg_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # A broken manifest should degrade to "no info", not abort analysis.
        return {}
    deps = data.get('dependencies', {})
    dev_deps = data.get('devDependencies', {})
    # Framework detection: most specific first (Next.js bundles React,
    # so 'next' must be checked before 'react').
    framework = None
    if 'next' in deps:
        framework = 'Next.js'
    elif 'react' in deps:
        framework = 'React'
    elif 'vue' in deps:
        framework = 'Vue.js'
    elif '@angular/core' in deps:
        framework = 'Angular'
    elif 'express' in deps:
        framework = 'Express'
    elif 'fastify' in deps:
        framework = 'Fastify'
    # Database / ORM detection.
    database = None
    if '@prisma/client' in deps:
        database = 'Prisma (PostgreSQL/MySQL/SQLite)'
    elif 'mongoose' in deps:
        database = 'MongoDB (Mongoose)'
    elif 'typeorm' in deps:
        database = 'TypeORM'
    elif 'sequelize' in deps:
        database = 'Sequelize'
    # UI framework detection. NOTE: a component library (MUI/Chakra)
    # deliberately takes precedence over Tailwind when both are present.
    ui_framework = None
    if 'tailwindcss' in dev_deps or 'tailwindcss' in deps:
        ui_framework = 'Tailwind CSS'
    if '@mui/material' in deps:
        ui_framework = 'Material UI'
    elif '@chakra-ui/react' in deps:
        ui_framework = 'Chakra UI'
    # Categorize dependencies; uncategorized ones are kept only if they are
    # on the well-known utility shortlist below.
    dep_categories = {
        'core': ['react', 'next', 'vue', 'angular', 'express', 'fastify'],
        'database': ['@prisma/client', 'mongoose', 'typeorm', 'sequelize', 'pg', 'mysql2'],
        'auth': ['next-auth', 'passport', 'jsonwebtoken', '@auth0/nextjs-auth0'],
        'ui': ['@mui/material', '@chakra-ui/react', 'antd', '@radix-ui'],
        'state': ['zustand', 'redux', '@reduxjs/toolkit', 'recoil', 'jotai'],
        'testing': ['jest', 'vitest', '@testing-library/react', 'cypress'],
    }
    key_deps = []
    for dep, version in {**deps, **dev_deps}.items():
        category = 'utility'
        for cat, patterns in dep_categories.items():
            if any(p in dep for p in patterns):
                category = cat
                break
        if category != 'utility' or dep in ['axios', 'zod', 'date-fns', 'lodash']:
            key_deps.append({
                'name': dep,
                # Strip semver range prefixes; str() guards against the rare
                # non-string value in a hand-edited manifest.
                'version': str(version).replace('^', '').replace('~', ''),
                'category': category,
                'purpose': get_dep_purpose(dep)
            })
    return {
        'name': data.get('name', 'Unknown'),
        'version': data.get('version', '0.0.0'),
        'description': data.get('description', ''),
        'framework': framework,
        'database': database,
        'ui_framework': ui_framework,
        'key_dependencies': key_deps[:15],  # limit to 15 most important
        'scripts': data.get('scripts', {})
    }


def get_dep_purpose(dep_name: str) -> str:
    """Return a plain-English purpose string for a known dependency name.

    Falls back to 'Utility library' for anything not in the table.
    """
    purposes = {
        'react': 'UI component library',
        'next': 'Full-stack React framework',
        'vue': 'Progressive UI framework',
        'express': 'Web server framework',
        'fastify': 'High-performance web framework',
        '@prisma/client': 'Database ORM and query builder',
        'mongoose': 'MongoDB object modeling',
        'typeorm': 'TypeScript ORM',
        'sequelize': 'SQL ORM',
        'next-auth': 'Authentication for Next.js',
        'passport': 'Authentication middleware',
        'jsonwebtoken': 'JWT token handling',
        '@mui/material': 'Material Design components',
        '@chakra-ui/react': 'Accessible component library',
        'tailwindcss': 'Utility-first CSS framework',
        'zustand': 'State management',
        'redux': 'Predictable state container',
        '@reduxjs/toolkit': 'Redux development toolkit',
        'axios': 'HTTP client',
        'zod': 'Schema validation',
        'date-fns': 'Date utility functions',
        'lodash': 'Utility functions',
        'jest': 'Testing framework',
        'vitest': 'Fast unit testing',
        '@testing-library/react': 'React component testing',
        'cypress': 'End-to-end testing',
    }
    return purposes.get(dep_name, 'Utility library')
def scan_directory_structure(root_path: Path) -> Dict[str, Any]:
    """Map the project's top-level directories to their likely purpose.

    Skips dependency/build/VCS folders and hidden directories, guesses the
    main source directory, and reports per-directory file counts plus a
    sample of source files found directly inside each directory.
    """
    ignore_dirs = {
        'node_modules', '.git', '.next', '__pycache__', 'venv',
        '.venv', 'dist', 'build', '.cache', 'coverage', '.turbo'
    }
    common_purposes = {
        'src': 'Main source code directory',
        'app': 'Application code (Next.js App Router)',
        'pages': 'Page components (Next.js Pages Router)',
        'components': 'Reusable UI components',
        'lib': 'Shared utilities and libraries',
        'utils': 'Utility functions',
        'hooks': 'Custom React hooks',
        'context': 'React context providers',
        'store': 'State management',
        'styles': 'CSS and styling',
        'types': 'TypeScript type definitions',
        'api': 'API route handlers',
        'services': 'Business logic services',
        'models': 'Data models/entities',
        'prisma': 'Database schema and migrations',
        'public': 'Static assets',
        'tests': 'Test files',
        '__tests__': 'Jest test files',
        'test': 'Test files',
        'spec': 'Test specifications',
        'docs': 'Documentation',
        'scripts': 'Build and utility scripts',
        'config': 'Configuration files',
    }
    code_suffixes = {'.ts', '.tsx', '.js', '.jsx', '.py', '.rs', '.go'}
    # Best-guess main source directory: first conventional name that exists.
    source_dir = next(
        (name for name in ('src', 'app', 'lib', 'source') if (root_path / name).is_dir()),
        None,
    )
    directories = []
    for entry in sorted(root_path.iterdir()):
        if not entry.is_dir() or entry.name in ignore_dirs or entry.name.startswith('.'):
            continue
        total_files = sum(1 for child in entry.rglob('*') if child.is_file())
        # Sample up to five source files sitting directly in this directory.
        samples = [
            child.name for child in entry.iterdir()
            if child.is_file() and child.suffix in code_suffixes
        ][:5]
        directories.append({
            'path': entry.name,
            'purpose': common_purposes.get(entry.name, 'Project directory'),
            'file_count': total_files,
            'key_files': samples,
        })
    return {
        'source_dir': source_dir or '.',
        'directories': directories,
    }
def detect_features(root_path: Path) -> List[Dict[str, Any]]:
    """Detect main features by matching keywords against source file paths.

    Args:
        root_path: Project root to scan for .ts/.tsx/.js/.jsx files.

    Returns:
        List of feature dicts (name, description, technical_notes, and up
        to 5 matching files) for every feature with at least one match.
    """
    feature_patterns = {
        'authentication': {
            'keywords': ['auth', 'login', 'logout', 'session', 'jwt', 'oauth'],
            'description': 'User authentication and session management',
            'technical_notes': 'Handles user login, logout, and session tokens'
        },
        'user_management': {
            'keywords': ['user', 'profile', 'account', 'register', 'signup'],
            'description': 'User account creation and profile management',
            'technical_notes': 'CRUD operations for user data'
        },
        'api': {
            'keywords': ['api', 'endpoint', 'route'],
            'description': 'REST API endpoints for data operations',
            'technical_notes': 'HTTP handlers for client-server communication'
        },
        'database': {
            'keywords': ['prisma', 'model', 'entity', 'schema', 'migration'],
            'description': 'Database storage and data persistence',
            'technical_notes': 'ORM-based data layer with migrations'
        },
        'file_upload': {
            'keywords': ['upload', 'file', 'storage', 's3', 'blob'],
            'description': 'File upload and storage functionality',
            'technical_notes': 'Handles file uploads and cloud storage'
        },
        'search': {
            'keywords': ['search', 'filter', 'query'],
            'description': 'Search and filtering capabilities',
            'technical_notes': 'Full-text search or database queries'
        },
    }
    # Gather candidate source files once. Match keywords against the path
    # relative to the project root: the original matched against the absolute
    # path, so keywords appearing in parent directories (e.g. a user's home
    # folder named "api-projects") produced false positives. It also built
    # file_names/file_paths lists that were never used — removed.
    all_files = []
    for pattern in ('*.ts', '*.tsx', '*.js', '*.jsx'):
        all_files.extend(root_path.rglob(pattern))
    rel_paths = [(f, str(f.relative_to(root_path)).lower()) for f in all_files]
    features = []
    for feature_name, config in feature_patterns.items():
        hits = [
            str(f.relative_to(root_path))
            for keyword in config['keywords']
            for f, lowered in rel_paths
            if keyword in lowered
        ]
        if hits:
            features.append({
                'name': feature_name.replace('_', ' ').title(),
                'description': config['description'],
                'technical_notes': config['technical_notes'],
                # Deterministic, order-preserving dedupe (was unordered set()).
                'files': list(dict.fromkeys(hits))[:5]
            })
    return features
def find_components(root_path: Path) -> List[Dict[str, Any]]:
    """Locate PascalCase .tsx UI components under conventional directories.

    Scans components/, src/components/ and app/components/ recursively,
    skipping barrel files (index.tsx), private modules (leading underscore)
    and non-PascalCase filenames.
    """
    found: List[Dict[str, Any]] = []
    for rel_dir in ('components', 'src/components', 'app/components'):
        base = root_path / rel_dir
        if not base.exists():
            continue
        for tsx in base.rglob('*.tsx'):
            stem = tsx.stem
            if tsx.name == 'index.tsx' or tsx.name.startswith('_'):
                continue
            if not stem[0].isupper():  # component names are PascalCase
                continue
            found.append({
                'id': f'component_{stem.lower()}',
                'name': stem,
                'path': str(tsx.relative_to(root_path)),
                'description': f'{stem} component',
                'props': 'See source file'
            })
    # Cap output so huge component libraries don't flood the report.
    return found[:20]
def find_api_endpoints(root_path: Path) -> List[Dict[str, Any]]:
    """Discover Next.js API endpoints (App Router and Pages Router).

    Returns:
        List of endpoint dicts: method, URL path (dynamic [seg] rendered
        as :seg), handler file, description, and technical notes.
    """
    endpoints: List[Dict[str, Any]] = []
    # App Router: app/api/**/route.{ts,tsx,js} — one exported handler per verb.
    app_api = root_path / 'app' / 'api'
    if app_api.exists():
        # sorted() makes the output order deterministic across filesystems.
        for route_file in sorted(app_api.rglob('route.*')):
            if route_file.suffix not in ('.ts', '.tsx', '.js'):
                continue
            # Explicit UTF-8; errors='ignore' so one oddly-encoded file
            # cannot abort the whole scan.
            content = route_file.read_text(encoding='utf-8', errors='ignore')
            raw_path = '/api/' + '/'.join(route_file.parent.relative_to(app_api).parts)
            for method in ('GET', 'POST', 'PUT', 'PATCH', 'DELETE'):
                # Recognize both function declarations and const arrow exports
                # (the original missed `export const GET = ...`).
                if (f'export async function {method}' in content
                        or f'export function {method}' in content
                        or f'export const {method}' in content):
                    endpoints.append({
                        'method': method,
                        'path': raw_path.replace('[', ':').replace(']', ''),
                        'handler_file': str(route_file.relative_to(root_path)),
                        'description': f'{method} {raw_path}',
                        'technical_notes': 'Next.js App Router endpoint'
                    })
    # Pages Router: pages/api/**/*.{ts,js} — a single default-export handler
    # that branches on req.method, hence method 'MULTIPLE'.
    pages_api = root_path / 'pages' / 'api'
    if pages_api.exists():
        for api_file in sorted(pages_api.rglob('*')):
            if not api_file.is_file() or api_file.suffix not in ('.ts', '.js'):
                continue
            raw_path = '/api/' + '/'.join(api_file.relative_to(pages_api).with_suffix('').parts)
            endpoints.append({
                'method': 'MULTIPLE',
                'path': raw_path.replace('[', ':').replace(']', ''),
                'handler_file': str(api_file.relative_to(root_path)),
                'description': f'API endpoint at {raw_path}',
                'technical_notes': 'Next.js Pages Router endpoint'
            })
    return endpoints
def find_data_models(root_path: Path) -> List[Dict[str, Any]]:
    """Extract data models (name + fields) from a Prisma schema, if present.

    Args:
        root_path: Project root; looks for prisma/schema.prisma.

    Returns:
        List of model dicts with name, description, and up to 10 fields;
        empty list when no Prisma schema exists.
    """
    models: List[Dict[str, Any]] = []
    prisma_schema = root_path / 'prisma' / 'schema.prisma'
    if not prisma_schema.exists():
        return models
    # Explicit UTF-8: schema files are UTF-8 regardless of platform locale
    # (the original used the platform default encoding).
    content = prisma_schema.read_text(encoding='utf-8')
    # Prisma model bodies contain no nested braces, so [^}]+ is sufficient.
    model_pattern = re.compile(r'model\s+(\w+)\s*\{([^}]+)\}', re.MULTILINE)
    for match in model_pattern.finditer(content):
        model_name, model_body = match.group(1), match.group(2)
        fields = []
        for line in model_body.strip().split('\n'):
            line = line.strip()
            # Skip blanks, block attributes (@@index, @@unique, ...) and comments.
            if not line or line.startswith('@@') or line.startswith('//'):
                continue
            parts = line.split()
            if len(parts) >= 2:  # "<name> <type> [attributes...]"
                fields.append({
                    'name': parts[0],
                    'type': parts[1],
                    'description': f'{parts[0]} field'
                })
        models.append({
            'name': model_name,
            'description': f'{model_name} data model',
            'fields': fields[:10]  # keep the report concise
        })
    return models
def collect_glossary_terms(features: List, components: List, endpoints: List) -> List[Dict[str, str]]:
    """Return glossary entries for common technical terms.

    The features/components/endpoints arguments are accepted for future
    context-aware term selection but are not currently consulted — the
    same fixed glossary is returned for every project.
    """
    definitions = [
        ('API', 'Application Programming Interface - a way for different software to communicate'),
        ('REST', 'Representational State Transfer - a standard way to design web APIs'),
        ('Component', 'A reusable piece of the user interface'),
        ('Endpoint', 'A specific URL that the application responds to'),
        ('ORM', 'Object-Relational Mapping - connects code to database tables'),
        ('JWT', 'JSON Web Token - a secure way to transmit user identity'),
        ('CRUD', 'Create, Read, Update, Delete - basic data operations'),
        ('Props', 'Properties passed to a component to customize it'),
        ('State', 'Data that can change and affects what users see'),
        ('Hook', 'A way to add features to React components'),
        ('Migration', 'A controlled change to database structure'),
        ('Schema', 'The structure/shape of data'),
        ('Route', 'A URL path that maps to specific functionality'),
        ('Handler', 'Code that responds to a specific request'),
    ]
    return [{'term': term, 'definition': meaning} for term, meaning in definitions]
def generate_analysis(root_path: Path) -> Dict[str, Any]:
    """Run every analyzer over the project and assemble the full report.

    Args:
        root_path: Project root directory to analyze.

    Returns:
        Dict with project metadata, tech stack, directory structure,
        detected features, components, API endpoints, data models, and
        glossary terms, plus an ISO-format analysis timestamp.
    """
    project_info = detect_project_type(root_path)
    is_node = project_info['type'] == 'node'
    # package.json details only apply to Node.js projects.
    pkg_info = parse_package_json(root_path) if is_node else {}
    features = detect_features(root_path)
    components = find_components(root_path)
    endpoints = find_api_endpoints(root_path)
    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'project': {
            'name': pkg_info.get('name', root_path.name),
            'version': pkg_info.get('version', '0.0.0'),
            'description': pkg_info.get('description', ''),
            'type': project_info['type'],
        },
        'tech_stack': {
            'language': 'TypeScript' if is_node else project_info['type'],
            'framework': pkg_info.get('framework'),
            'database': pkg_info.get('database'),
            'ui_framework': pkg_info.get('ui_framework'),
            'key_dependencies': pkg_info.get('key_dependencies', []),
        },
        'structure': scan_directory_structure(root_path),
        'features': features,
        'components': components,
        'api_endpoints': endpoints,
        'data_models': find_data_models(root_path),
        'glossary_terms': collect_glossary_terms(features, components, endpoints),
    }
def output_yaml(data: Dict[str, Any], output_path: Optional[Path] = None):
    """Serialize the analysis to YAML (or JSON when PyYAML is unavailable).

    Args:
        data: Analysis dict produced by generate_analysis().
        output_path: File to write to; prints to stdout when None.
    """
    if yaml:
        output = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    else:
        # Fallback to JSON if yaml not available — JSON is a YAML subset,
        # so downstream YAML consumers can still parse it.
        output = json.dumps(data, indent=2)
    if output_path:
        # Explicit UTF-8: allow_unicode=True output would fail to encode
        # under a non-UTF-8 platform default (e.g. cp1252 on Windows).
        output_path.write_text(output, encoding='utf-8')
        print(f"Analysis written to: {output_path}")
    else:
        print(output)
def main():
    """CLI entry point.

    Usage: analyze_project.py [ROOT_DIR] [OUTPUT_FILE]

    Analyzes ROOT_DIR (default: current working directory) and writes the
    result to OUTPUT_FILE, or to stdout when no output file is given.
    Exits with status 1 when ROOT_DIR does not exist.
    """
    args = sys.argv[1:]
    root_path = Path(args[0]) if args else Path.cwd()
    if not root_path.exists():
        print(f"Error: Path does not exist: {root_path}", file=sys.stderr)
        sys.exit(1)
    output_path = Path(args[1]) if len(args) > 1 else None
    output_yaml(generate_analysis(root_path), output_path)


if __name__ == '__main__':
    main()