-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Categorization according to assessment
Closes #32
- Loading branch information
1 parent
2d1d198
commit 9cc7bc4
Showing
3 changed files
with
191 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package metrics | ||
|
||
// AssessmentCategory represents a categorical ranking of a model based on Assessments.
type AssessmentCategory string

// The categories below are listed in the order in which Assessments.Category checks them: a model is ranked into the first category whose criterion it did not meet consistently.
var (
	// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
	AssessmentCategoryResponseError = AssessmentCategory("response-error")
	// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
	AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
	// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
	AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
	// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
	AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
	// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
	AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
	// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
	AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
	// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
	AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
)
|
||
// Category infers a categorical ranking of a model based on assessment values. | ||
// A models overall category corresponds to the criterion where the model was consistently able to receive "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but only in one case the coverage goal was reached, then the category is only "CodeExecuted" because the coverage goal was not reached consistently. | ||
func (a Assessments) Category(total uint) AssessmentCategory { | ||
switch { | ||
case a[AssessmentKeyResponseNoError] != total: | ||
return AssessmentCategoryResponseError | ||
case a[AssessmentKeyResponseNotEmpty] != total: | ||
return AssessmentCategoryResponseEmpty | ||
case a[AssessmentKeyResponseWithCode] != total && a[AssessmentKeyFilesExecuted] != total: // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43 | ||
return AssessmentCategoryResponseNoCode | ||
case a[AssessmentKeyFilesExecuted] != total: | ||
return AssessmentCategoryCodeInvalid | ||
case a[AssessmentKeyCoverageStatement] != total: | ||
return AssessmentCategoryCodeExecuted | ||
case a[AssessmentKeyResponseNoExcess] != total: | ||
return AssessmentCategoryCodeCoverageStatementReached | ||
default: | ||
return AssessmentCategoryCodeNoExcess | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
package metrics | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestAssessmentsCategory(t *testing.T) { | ||
type testCase struct { | ||
Name string | ||
|
||
Assessments Assessments | ||
Total uint | ||
|
||
ExpectedAssessmentCategory AssessmentCategory | ||
} | ||
|
||
validate := func(t *testing.T, tc *testCase) { | ||
t.Run(tc.Name, func(t *testing.T) { | ||
actualAssessmentCategory := tc.Assessments.Category(tc.Total) | ||
|
||
assert.Equal(t, tc.ExpectedAssessmentCategory, actualAssessmentCategory) | ||
}) | ||
} | ||
|
||
validate(t, &testCase{ | ||
Name: "No Points", | ||
|
||
Assessments: NewAssessments(), | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryResponseError, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "No Response Error", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryResponseEmpty, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "No Empty Response", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryResponseNoCode, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "Contains Code", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
AssessmentKeyResponseWithCode: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeInvalid, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "Code not Detected but Executes", // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully. https://github.com/symflower/eval-dev-quality/issues/43 | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
AssessmentKeyFilesExecuted: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "Executes", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
AssessmentKeyResponseWithCode: 1, | ||
AssessmentKeyFilesExecuted: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "Full Statement Coverage", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
AssessmentKeyResponseWithCode: 1, | ||
AssessmentKeyFilesExecuted: 1, | ||
AssessmentKeyCoverageStatement: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeCoverageStatementReached, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "No Excess", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 1, | ||
AssessmentKeyResponseNotEmpty: 1, | ||
AssessmentKeyResponseWithCode: 1, | ||
AssessmentKeyFilesExecuted: 1, | ||
AssessmentKeyCoverageStatement: 1, | ||
AssessmentKeyResponseNoExcess: 1, | ||
}, | ||
Total: 1, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeNoExcess, | ||
}) | ||
|
||
validate(t, &testCase{ | ||
Name: "Inconsistent", | ||
|
||
Assessments: Assessments{ | ||
AssessmentKeyResponseNoError: 2, | ||
AssessmentKeyResponseNotEmpty: 2, | ||
AssessmentKeyResponseWithCode: 2, | ||
AssessmentKeyFilesExecuted: 2, | ||
AssessmentKeyCoverageStatement: 1, | ||
AssessmentKeyResponseNoExcess: 0, | ||
}, | ||
Total: 2, | ||
|
||
ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted, | ||
}) | ||
} |