
Commit 13400f1

fix: retry deepeval evaluation
1 parent ebebbb8 commit 13400f1

2 files changed (+46, -17 lines)

cognee/eval_framework/analysis/metrics_calculator.py

Lines changed: 13 additions & 11 deletions
@@ -38,17 +38,19 @@ def extract_metrics_and_details(
     for entry in data:
         for metric, values in entry["metrics"].items():
             score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
+            # Skip None scores from failed evaluations
+            if score is not None:
+                metrics_data[metric].append(score)
+                if "reason" in values:
+                    metric_details[metric].append(
+                        {
+                            "question": entry["question"],
+                            "answer": entry["answer"],
+                            "golden_answer": entry["golden_answer"],
+                            "reason": values["reason"],
+                            "score": score,
+                        }
+                    )
 
     return metrics_data, metric_details
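
For reference, a minimal standalone sketch of the patched aggregation loop; the sample entries, the defaultdict setup, and the trimmed detail dict are illustrative assumptions, not part of this commit:

    from collections import defaultdict

    # One entry succeeded; the other carries score=None from a failed evaluation.
    data = [
        {"question": "Q1", "answer": "A1", "golden_answer": "G1",
         "metrics": {"correctness": {"score": 0.9, "reason": "matches golden answer"}}},
        {"question": "Q2", "answer": "A2", "golden_answer": "G2",
         "metrics": {"correctness": {"score": None, "reason": None}}},
    ]

    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)

    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            # Skip None scores from failed evaluations
            if score is not None:
                metrics_data[metric].append(score)
                if "reason" in values:
                    metric_details[metric].append({"question": entry["question"], "score": score})

    print(dict(metrics_data))  # {'correctness': [0.9]} -- the failed entry is excluded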

cognee/eval_framework/evaluation/deep_eval_adapter.py

Lines changed: 33 additions & 6 deletions
@@ -7,10 +7,15 @@
 from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
+import time
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 class DeepEvalAdapter(BaseEvalAdapter):
     def __init__(self):
+        self.n_retries = 5
         self.g_eval_metrics = {
             "correctness": self.g_eval_correctness(),
             "EM": ExactMatchMetric(),
@@ -19,6 +24,33 @@ def __init__(self):
             "context_coverage": ContextCoverageMetric(),
         }
 
+    def _calculate_metric(self, metric: str, test_case: LLMTestCase) -> Dict[str, Any]:
+        """Calculate a single metric for a test case with retry logic."""
+        metric_to_calculate = self.g_eval_metrics[metric]
+
+        for attempt in range(self.n_retries):
+            try:
+                metric_to_calculate.measure(test_case)
+                return {
+                    "score": metric_to_calculate.score,
+                    "reason": metric_to_calculate.reason,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Attempt {attempt + 1}/{self.n_retries} failed for metric '{metric}': {e}"
+                )
+                if attempt < self.n_retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logger.error(
+                        f"All {self.n_retries} attempts failed for metric '{metric}'. Returning None values."
+                    )
+
+        return {
+            "score": None,
+            "reason": None,
+        }
+
     async def evaluate_answers(
         self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
     ) -> List[Dict[str, Any]]:
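
The retry schedule above sleeps 2**attempt seconds between attempts, i.e. 1 s, 2 s, 4 s, and 8 s across the five tries before giving up with None values. Below is a minimal generic sketch of the same pattern, detached from deepeval; the with_retries helper and the flaky_measure stub (a stand-in for metric.measure) are illustrative assumptions:

    import time


    def with_retries(fn, n_retries=5):
        """Call fn(); on failure wait 1, 2, 4, ... seconds and retry."""
        for attempt in range(n_retries):
            try:
                return fn()
            except Exception as exc:
                print(f"Attempt {attempt + 1}/{n_retries} failed: {exc}")
                if attempt < n_retries - 1:
                    time.sleep(2**attempt)  # exponential backoff between attempts
        return None  # all attempts exhausted, mirroring the adapter's None result


    def flaky_measure():
        raise RuntimeError("rate limited")


    print(with_retries(flaky_measure))  # None, after ~15 s of cumulative backoff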
@@ -40,12 +72,7 @@ async def evaluate_answers
             )
             metric_results = {}
             for metric in evaluator_metrics:
-                metric_to_calculate = self.g_eval_metrics[metric]
-                metric_to_calculate.measure(test_case)
-                metric_results[metric] = {
-                    "score": metric_to_calculate.score,
-                    "reason": metric_to_calculate.reason,
-                }
+                metric_results[metric] = self._calculate_metric(metric, test_case)
             results.append({**answer, "metrics": metric_results})
 
         return results
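
Taken together, the two changes mean a metric whose evaluation keeps failing yields {"score": None, "reason": None} in the per-answer results, and the metrics calculator then drops it from the aggregates instead of raising. An illustrative sketch of the resulting shape (field values are made up):

    results = [
        {
            "question": "Q1",
            "answer": "A1",
            "golden_answer": "G1",
            "metrics": {
                "correctness": {"score": 0.9, "reason": "matches golden answer"},
                "EM": {"score": None, "reason": None},  # all retries exhausted
            },
        },
    ]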
