Skip to content

Commit

Permalink
Categorization according to assessment
Browse files Browse the repository at this point in the history
Closes #32
  • Loading branch information
bauersimon committed Apr 18, 2024
1 parent 2d1d198 commit 9cc7bc4
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,12 @@ func (command *Evaluate) Execute(args []string) (err error) {
}
}

isPlain := len(commandRepositories) == 0 || (len(commandRepositories) == 1 && commandRepositories["plain"])
_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q: %s", model, assessment)
if isPlain {
log.Printf("Categorization for %q: %q", model, assessment.Category(uint(len(command.Languages))))
}

return nil
})
Expand Down
42 changes: 42 additions & 0 deletions evaluate/metrics/category.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package metrics

// AssessmentCategory is a categorical ranking of a model that is derived from its Assessments.
type AssessmentCategory string

var (
	// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
	AssessmentCategoryResponseError AssessmentCategory = "response-error"

	// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
	AssessmentCategoryResponseEmpty AssessmentCategory = "response-empty"

	// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
	AssessmentCategoryResponseNoCode AssessmentCategory = "response-no-code"

	// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
	AssessmentCategoryCodeInvalid AssessmentCategory = "code-invalid"

	// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
	AssessmentCategoryCodeExecuted AssessmentCategory = "code-executed"

	// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
	AssessmentCategoryCodeCoverageStatementReached AssessmentCategory = "code-coverage-statement"

	// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
	AssessmentCategoryCodeNoExcess AssessmentCategory = "code-no-excess"
)

// Category derives the categorical ranking of a model from its assessment values.
// The criteria are checked in a fixed order and the first criterion whose value does not equal "total" determines the category. A model's overall category therefore corresponds to the last criterion for which it consistently received the "total" amount of points. E.g. with 3 tasks in total, a model that produced executing code for all tasks but reached the coverage goal for only one of them is categorized as "CodeExecuted", because the coverage goal was not reached consistently.
func (a Assessments) Category(total uint) AssessmentCategory {
	if a[AssessmentKeyResponseNoError] != total {
		return AssessmentCategoryResponseError
	}
	if a[AssessmentKeyResponseNotEmpty] != total {
		return AssessmentCategoryResponseEmpty
	}
	if a[AssessmentKeyFilesExecuted] != total {
		// TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43
		// That is why the "with code" criterion is only consulted once it is known that not all files executed.
		if a[AssessmentKeyResponseWithCode] != total {
			return AssessmentCategoryResponseNoCode
		}

		return AssessmentCategoryCodeInvalid
	}
	if a[AssessmentKeyCoverageStatement] != total {
		return AssessmentCategoryCodeExecuted
	}
	if a[AssessmentKeyResponseNoExcess] != total {
		return AssessmentCategoryCodeCoverageStatementReached
	}

	return AssessmentCategoryCodeNoExcess
}
145 changes: 145 additions & 0 deletions evaluate/metrics/category_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package metrics

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestAssessmentsCategory verifies that "Assessments.Category" returns the expected category for a series of assessment values.
// Every case runs as its own sub-test named after the case.
func TestAssessmentsCategory(t *testing.T) {
	type testCase struct {
		Name string

		Assessments Assessments
		Total       uint

		ExpectedAssessmentCategory AssessmentCategory
	}

	// The cases are ordered from the lowest to the highest category, followed by special cases.
	testCases := []testCase{
		{
			Name: "No Points",

			Assessments: NewAssessments(),
			Total:       1,

			ExpectedAssessmentCategory: AssessmentCategoryResponseError,
		},
		{
			Name: "No Response Error",

			Assessments: Assessments{
				AssessmentKeyResponseNoError: 1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryResponseEmpty,
		},
		{
			Name: "No Empty Response",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:  1,
				AssessmentKeyResponseNotEmpty: 1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryResponseNoCode,
		},
		{
			Name: "Contains Code",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:  1,
				AssessmentKeyResponseNotEmpty: 1,
				AssessmentKeyResponseWithCode: 1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryCodeInvalid,
		},
		{
			// TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully. https://github.com/symflower/eval-dev-quality/issues/43
			Name: "Code not Detected but Executes",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:  1,
				AssessmentKeyResponseNotEmpty: 1,
				AssessmentKeyFilesExecuted:    1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted,
		},
		{
			Name: "Executes",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:  1,
				AssessmentKeyResponseNotEmpty: 1,
				AssessmentKeyResponseWithCode: 1,
				AssessmentKeyFilesExecuted:    1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted,
		},
		{
			Name: "Full Statement Coverage",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:   1,
				AssessmentKeyResponseNotEmpty:  1,
				AssessmentKeyResponseWithCode:  1,
				AssessmentKeyFilesExecuted:     1,
				AssessmentKeyCoverageStatement: 1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryCodeCoverageStatementReached,
		},
		{
			Name: "No Excess",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:   1,
				AssessmentKeyResponseNotEmpty:  1,
				AssessmentKeyResponseWithCode:  1,
				AssessmentKeyFilesExecuted:     1,
				AssessmentKeyCoverageStatement: 1,
				AssessmentKeyResponseNoExcess:  1,
			},
			Total: 1,

			ExpectedAssessmentCategory: AssessmentCategoryCodeNoExcess,
		},
		{
			// Coverage was reached for only one of two tasks, so the category must stay at "code executed".
			Name: "Inconsistent",

			Assessments: Assessments{
				AssessmentKeyResponseNoError:   2,
				AssessmentKeyResponseNotEmpty:  2,
				AssessmentKeyResponseWithCode:  2,
				AssessmentKeyFilesExecuted:     2,
				AssessmentKeyCoverageStatement: 1,
				AssessmentKeyResponseNoExcess:  0,
			},
			Total: 2,

			ExpectedAssessmentCategory: AssessmentCategoryCodeExecuted,
		},
	}

	for i := range testCases {
		tc := &testCases[i]

		t.Run(tc.Name, func(t *testing.T) {
			actualAssessmentCategory := tc.Assessments.Category(tc.Total)

			assert.Equal(t, tc.ExpectedAssessmentCategory, actualAssessmentCategory)
		})
	}
}

0 comments on commit 9cc7bc4

Please sign in to comment.