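"""Modal app that evaluates pre-generated answer files with cognee's eval framework.

Each answer JSON file is shipped to its own Modal container, scored by the configured
evaluation engine, and the resulting metrics, aggregate metrics, evaluated answers and
optional dashboard are written to the shared "comparison-eval-answers" volume.

ENV, LLM_API_KEY and OPENAI_API_KEY must be set in the local environment, since they
are forwarded into the container image below. Typically launched with
`modal run <path to this script>` after pointing `json_files_dir` in `main()` at the
directory of answer JSON files.
"""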
import modal
import os
import asyncio
import datetime
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

app = modal.App("comparison-eval-answers")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    # Forward LLM credentials from the local environment into the image.
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluates answers from JSON content and returns metrics results."""
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Create a temporary file path for the JSON content
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"

    # Write the JSON content to the temporary file
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Set up output paths with simplified naming: prefix_original_file_name
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    eval_params["dashboard_path"] = f"/data/dashboard_{base_name}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (skip corpus building and question answering)
        evaluated_answers = await run_evaluation(eval_params)

        # Save evaluated answers
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Generate dashboard if requested
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Return metrics results
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise


@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel."""
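    # Set this to the local directory containing the answer JSON files to evaluate.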
    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Manually specify your evaluation configuration here
    eval_config = EvalConfig(
        # Only evaluation-related settings
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare tasks
    modal_tasks = []
    task_files = []  # keeps results aligned with the files that were actually submitted
    for json_path in json_file_paths:
        try:
            # Read JSON content locally
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)

            filename = os.path.basename(json_path)

            # Create remote evaluation task with JSON content
            task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
            modal_tasks.append(task)
            task_files.append(json_path)

        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {task_files[i]}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
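
# Outputs (metrics_*, aggregate_metrics_*, dashboard_*.html, evaluated_*) end up in the
# "comparison-eval-answers" volume and can be pulled locally with the Modal CLI,
# e.g. `modal volume get comparison-eval-answers <remote filename> .`
# (syntax assumed from the Modal CLI; see `modal volume get --help`).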