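"""Modal app that evaluates pre-generated answer files with cognee's eval framework.

Each answer JSON file is shipped to its own Modal container, scored by the configured
evaluation engine, and the resulting metrics, aggregate metrics, evaluated answers and
optional dashboard are written to the shared "comparison-eval-answers" volume.

ENV, LLM_API_KEY and OPENAI_API_KEY must be set in the local environment, since they
are forwarded into the container image below. Typically launched with
`modal run <path to this script>` after pointing `json_files_dir` in `main()` at the
directory of answer JSON files.
"""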
import modal
import os
import asyncio
import datetime
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

app = modal.App("comparison-eval-answers")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    # Forward LLM credentials from the local environment into the image.
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluates answers from JSON content and returns metrics results."""
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Create a temporary file path for the JSON content
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"

    # Write the JSON content to the temporary file
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Set up output paths with simplified naming: prefix_original_file_name
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    eval_params["dashboard_path"] = f"/data/dashboard_{base_name}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (skip corpus building and question answering)
        evaluated_answers = await run_evaluation(eval_params)

        # Save evaluated answers
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Generate dashboard if requested
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Return metrics results
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise


@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel."""
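    # Set this to the local directory containing the answer JSON files to evaluate.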
    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Manually specify your evaluation configuration here
    eval_config = EvalConfig(
        # Only evaluation-related settings
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare tasks
    modal_tasks = []
    task_files = []  # keeps results aligned with the files that were actually submitted
    for json_path in json_file_paths:
        try:
            # Read JSON content locally
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)

            filename = os.path.basename(json_path)

            # Create remote evaluation task with JSON content
            task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
            modal_tasks.append(task)
            task_files.append(json_path)

        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {task_files[i]}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
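
# Outputs (metrics_*, aggregate_metrics_*, dashboard_*.html, evaluated_*) end up in the
# "comparison-eval-answers" volume and can be pulled locally with the Modal CLI,
# e.g. `modal volume get comparison-eval-answers <remote filename> .`
# (syntax assumed from the Modal CLI; see `modal volume get --help`).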