Commit e1fa478

feat: convenience helper scripts
1 parent 828ea7c commit e1fa478

3 files changed: +306 -0 lines changed

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""Simple script to calculate aggregate metrics for multiple JSON files."""

import os
from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from cognee.shared.logging_utils import get_logger

logger = get_logger()


def calculate_aggregates_for_files(json_paths: list[str]) -> None:
    """Calculate aggregate metrics for a list of JSON files."""
    for json_path in json_paths:
        if not os.path.exists(json_path):
            logger.error(f"File not found: {json_path}")
            continue

        # Generate output path for aggregate metrics in the same folder as input
        input_dir = os.path.dirname(json_path)
        base_name = os.path.splitext(os.path.basename(json_path))[0]
        output_path = os.path.join(input_dir, f"aggregate_metrics_{base_name}.json")

        try:
            logger.info(f"Calculating aggregate metrics for {json_path}")
            calculate_metrics_statistics(json_path, output_path)
            logger.info(f"Saved aggregate metrics to {output_path}")
        except Exception as e:
            logger.error(f"Failed to calculate metrics for {json_path}: {e}")


if __name__ == "__main__":
    dir_path = ""
    json_file_paths = [
        os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".json")
    ]

    calculate_aggregates_for_files(json_file_paths)
    print("Done calculating aggregate metrics!")
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
import json
import os
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd


def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
    """Convert a single metrics JSON file to the desired format."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    with open(json_path, "r") as f:
        data = json.load(f)

    # Extract filename without extension for system name
    filename = Path(json_path).stem

    # Convert to desired format
    result = {
        "system": filename,
        "Human-LLM Correctness": None,
        "Human-LLM Correctness Error": None,
    }

    # Add metrics dynamically based on the metrics list
    for metric in metrics:
        if metric in data:
            result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
            result[f"DeepEval {metric.title()} Error"] = [
                data[metric]["ci_lower"],
                data[metric]["ci_upper"],
            ]
        else:
            print(f"Warning: Metric '{metric}' not found in {json_path}")

    return result


def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Convert results list to DataFrame with expanded error columns."""
    df_data = []

    for result in results:
        row = {}
        for key, value in result.items():
            if key.endswith("Error") and isinstance(value, list) and len(value) == 2:
                # Split error columns into lower and upper
                row[f"{key} Lower"] = value[0]
                row[f"{key} Upper"] = value[1]
            else:
                row[key] = value
        df_data.append(row)

    return pd.DataFrame(df_data)


def process_multiple_files(
    json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
    """Process multiple JSON files and save concatenated results."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    results = []

    for json_path in json_paths:
        try:
            converted = convert_metrics_file(json_path, metrics)
            results.append(converted)
            print(f"Processed: {json_path}")
        except Exception as e:
            print(f"Error processing {json_path}: {e}")

    # Save JSON results
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

    print(f"Saved {len(results)} results to {output_path}")

    # Convert to DataFrame and save CSV
    df = convert_to_dataframe(results)
    csv_path = output_path.replace(".json", ".csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved DataFrame to {csv_path}")


if __name__ == "__main__":
    # Default metrics (can be customized here)
    # default_metrics = ['correctness', 'f1', 'EM']
    default_metrics = ["correctness"]

    # List JSON files in the current directory
    current_dir = ""
    json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]

    if json_files:
        print(f"Found {len(json_files)} JSON files:")
        for f in json_files:
            print(f" - {f}")

        # Create full paths for JSON files and output file in current working directory
        json_full_paths = [os.path.join(current_dir, f) for f in json_files]
        output_file = os.path.join(current_dir, "converted_metrics.json")
        process_multiple_files(json_full_paths, output_file, default_metrics)
    else:
        print("No JSON files found in current directory")
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
import modal
import os
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

app = modal.App("comparison-eval-answers")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluates answers from JSON content and returns metrics results."""
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

    # Create temporary file path for the JSON content
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"

    # Write JSON content to temporary file
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Set up output paths with simplified naming: prefix_original_file_name
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    eval_params["dashboard_path"] = f"/data/dashboard_{os.path.splitext(answers_filename)[0]}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (skip corpus building and question answering)
        evaluated_answers = await run_evaluation(eval_params)

        # Save evaluated answers
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Generate dashboard if requested
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Return metrics results
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise


@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel."""

    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Manually specify your evaluation configuration here
    eval_config = EvalConfig(
        # Only evaluation-related settings
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare tasks
    modal_tasks = []
    task_paths = []  # Track the path for each submitted task so failures can be attributed correctly
    for json_path in json_file_paths:
        try:
            # Read JSON content locally
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)

            filename = os.path.basename(json_path)

            # Create remote evaluation task with JSON content
            task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
            modal_tasks.append(task)
            task_paths.append(json_path)

        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results (zipping with task_paths keeps attribution correct even if some files were skipped)
    for task_path, result in zip(task_paths, results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {task_path}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
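A usage note: the @app.local_entrypoint() is what Modal's CLI drives, so the script is presumably launched with modal run. Before running, json_files_dir must point at a folder of answer JSON files, and ENV, LLM_API_KEY, and OPENAI_API_KEY need to be set locally, since the image only forwards them via .env(...). Metrics, aggregate metrics, dashboards, and evaluated answers are all written to the comparison-eval-answers volume mounted at /data.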
