# or implied. See the License for the specific language governing
# permissions and limitations under the License.

- from typing import Annotated, Dict, List, Tuple
+ from typing import Annotated, Dict, List

import plotly.graph_objects as go
- from plotly.subplots import make_subplots
- from zenml import ArtifactConfig, get_step_context, log_metadata, step
+ from zenml import get_step_context, log_metadata, step
from zenml.types import HTMLString

+
def create_plotly_bar_chart(
    labels: List[str],
    scores: List[float],
@@ -43,54 +43,66 @@ def create_plotly_bar_chart(
    """
    # Generate colors for bars
    if alternate_colors:
-         colors = ["rgba(66, 133, 244, 0.8)" if i % 2 == 0 else "rgba(219, 68, 55, 0.8)" for i in range(len(labels))]
+         colors = [
+             "rgba(66, 133, 244, 0.8)"
+             if i % 2 == 0
+             else "rgba(219, 68, 55, 0.8)"
+             for i in range(len(labels))
+         ]
    else:
        colors = ["rgba(66, 133, 244, 0.8)" for _ in range(len(labels))]

    # Prepare hover text
    if descriptions:
-         hover_text = [f"<b>{label}</b><br>Value: {score:.2f}<br>{descriptions.get(label, '')}"
-                       for label, score in zip(labels, scores)]
+         hover_text = [
+             f"<b>{label}</b><br>Value: {score:.2f}<br>{descriptions.get(label, '')}"
+             for label, score in zip(labels, scores)
+         ]
    else:
-         hover_text = [f"<b>{label}</b><br>Value: {score:.2f}" for label, score in zip(labels, scores)]
+         hover_text = [
+             f"<b>{label}</b><br>Value: {score:.2f}"
+             for label, score in zip(labels, scores)
+         ]

    # Create figure
    fig = go.Figure()
-
+
    fig.add_trace(
        go.Bar(
            y=labels,
            x=scores,
-             orientation='h',
+             orientation="h",
            marker_color=colors,
            text=[f"{score:.2f}" for score in scores],
-             textposition='auto',
+             textposition="auto",
            hovertext=hover_text,
-             hoverinfo='text',
+             hoverinfo="text",
        )
    )

    # Set layout
    max_value = max(scores) if scores else 5
-     xaxis_range = [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)]
+     xaxis_range = (
+         [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)]
+     )
    xaxis_title = "Percentage (%)" if percentage_scale else "Score"
-
+
    fig.update_layout(
        title=title,
        xaxis=dict(
            title=xaxis_title,
            range=xaxis_range,
            showgrid=True,
-             gridcolor='rgba(230, 230, 230, 0.8)',
+             gridcolor="rgba(230, 230, 230, 0.8)",
        ),
        yaxis=dict(
            autorange="reversed",  # Make labels read top-to-bottom
        ),
        margin=dict(l=20, r=20, t=60, b=20),
        height=max(300, 70 * len(labels)),
-         plot_bgcolor='rgba(255, 255, 255, 1)',
+         plot_bgcolor="rgba(255, 255, 255, 1)",
    )
-
+
    return fig

@@ -122,58 +134,49 @@ def generate_evaluation_html(
    """
    # Metric descriptions for hovering
    metric_descriptions = {
-         "Small Retrieval Eval Failure Rate":
-             "Percentage of small test cases where retrieval failed to find relevant documents.",
-         "Small Retrieval Eval Failure Rate Reranking":
-             "Percentage of small test cases where retrieval with reranking failed to find relevant documents.",
-         "Full Retrieval Eval Failure Rate":
-             "Percentage of all test cases where retrieval failed to find relevant documents.",
-         "Full Retrieval Eval Failure Rate Reranking":
-             "Percentage of all test cases where retrieval with reranking failed to find relevant documents.",
-         "Failure Rate Bad Answers":
-             "Percentage of responses that were factually incorrect or misleading.",
-         "Failure Rate Bad Immediate Responses":
-             "Percentage of immediate responses that did not adequately address the query.",
-         "Failure Rate Good Responses":
-             "Percentage of responses rated as good by evaluators.",
-         "Average Toxicity Score":
-             "Average score measuring harmful, offensive, or inappropriate content (lower is better).",
-         "Average Faithfulness Score":
-             "Average score measuring how accurately the response represents the source material (higher is better).",
-         "Average Helpfulness Score":
-             "Average score measuring the practical utility of responses to users (higher is better).",
-         "Average Relevance Score":
-             "Average score measuring how well responses address the specific query intent (higher is better).",
+         "Small Retrieval Eval Failure Rate": "Percentage of small test cases where retrieval failed to find relevant documents.",
+         "Small Retrieval Eval Failure Rate Reranking": "Percentage of small test cases where retrieval with reranking failed to find relevant documents.",
+         "Full Retrieval Eval Failure Rate": "Percentage of all test cases where retrieval failed to find relevant documents.",
+         "Full Retrieval Eval Failure Rate Reranking": "Percentage of all test cases where retrieval with reranking failed to find relevant documents.",
+         "Failure Rate Bad Answers": "Percentage of responses that were factually incorrect or misleading.",
+         "Failure Rate Bad Immediate Responses": "Percentage of immediate responses that did not adequately address the query.",
+         "Failure Rate Good Responses": "Percentage of responses rated as good by evaluators.",
+         "Average Toxicity Score": "Average score measuring harmful, offensive, or inappropriate content (lower is better).",
+         "Average Faithfulness Score": "Average score measuring how accurately the response represents the source material (higher is better).",
+         "Average Helpfulness Score": "Average score measuring the practical utility of responses to users (higher is better).",
+         "Average Relevance Score": "Average score measuring how well responses address the specific query intent (higher is better).",
    }

    # Create individual charts
    retrieval_fig = create_plotly_bar_chart(
-         retrieval_labels,
-         retrieval_scores,
-         f"Retrieval Evaluation Metrics",
+         retrieval_labels,
+         retrieval_scores,
+         f"Retrieval Evaluation Metrics",
        alternate_colors=True,
-         descriptions=metric_descriptions
+         descriptions=metric_descriptions,
    )
-
+
    generation_basic_fig = create_plotly_bar_chart(
-         generation_basic_labels,
-         generation_basic_scores,
-         f"Basic Generation Metrics",
+         generation_basic_labels,
+         generation_basic_scores,
+         f"Basic Generation Metrics",
        percentage_scale=True,
-         descriptions=metric_descriptions
+         descriptions=metric_descriptions,
    )
-
+
    generation_quality_fig = create_plotly_bar_chart(
-         generation_quality_labels,
-         generation_quality_scores,
+         generation_quality_labels,
+         generation_quality_scores,
        f"Generation Quality Metrics",
-         descriptions=metric_descriptions
+         descriptions=metric_descriptions,
    )

    # Create summary metrics cards
    composite_quality = metrics_metadata.get("composite.overall_quality", 0)
-     retrieval_effectiveness = metrics_metadata.get("composite.retrieval_effectiveness", 0)
-
+     retrieval_effectiveness = metrics_metadata.get(
+         "composite.retrieval_effectiveness", 0
+     )
+
    # Combine into complete HTML report
    html = f"""
    <!DOCTYPE html>
@@ -388,7 +391,7 @@ def generate_evaluation_html(
    </body>
    </html>
    """
-
+
    return HTMLString(html)

@@ -434,10 +437,10 @@ def visualize_evaluation_results(
        + average_helpfulness_score
        + average_relevance_score
    ) / 3
-
+
    composite_retrieval_effectiveness = (
-         (1 - small_retrieval_eval_failure_rate / 100)
-         + (1 - full_retrieval_eval_failure_rate / 100)
+         (1 - small_retrieval_eval_failure_rate / 100)
+         + (1 - full_retrieval_eval_failure_rate / 100)
    ) / 2

    # Collect all metrics for dashboard and logging
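
For reference, a minimal usage sketch of the refactored chart helper, assuming only the signature visible in this diff (labels, scores, a title, and the alternate_colors / percentage_scale / descriptions keyword arguments). The sample labels, scores, and the to_html export below are illustrative assumptions, not part of the commit.

# Illustrative sketch: build one bar chart the same way generate_evaluation_html does,
# assuming create_plotly_bar_chart is importable from this module.
labels = ["Failure Rate Bad Answers", "Failure Rate Good Responses"]
scores = [12.5, 87.5]  # hypothetical percentages for demonstration
descriptions = {
    "Failure Rate Bad Answers": "Percentage of responses that were factually incorrect or misleading.",
    "Failure Rate Good Responses": "Percentage of responses rated as good by evaluators.",
}

fig = create_plotly_bar_chart(
    labels,
    scores,
    "Basic Generation Metrics",
    percentage_scale=True,
    descriptions=descriptions,
)

# One possible way to embed the figure in an HTML fragment, using plotly's standard export.
chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")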