Skip to content

Commit

Permalink
Merge pull request #39 from symflower/automatic-assessment
Browse files Browse the repository at this point in the history
More assessments based on a model response
  • Loading branch information
zimmski authored Apr 18, 2024
2 parents e194515 + ea7b57a commit 2d9ef6f
Show file tree
Hide file tree
Showing 15 changed files with 320 additions and 59 deletions.
36 changes: 29 additions & 7 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (

pkgerrors "github.com/pkg/errors"
"golang.org/x/exp/maps"

"github.com/symflower/eval-dev-quality/util"
)

// AssessmentKey defines a key for a numerical key-value assessment pair.
Expand All @@ -23,23 +25,28 @@ var (
// RegisterAssessmentKey registers a new assessment key and returns it as an "AssessmentKey".
// The key is recorded in "allAssessmentKeys" in its typed form and in "allAssessmentKeysStrings" as the plain string.
func RegisterAssessmentKey(key string) AssessmentKey {
assessment := AssessmentKey(key)
// NOTE(review): as rendered here the key is first appended and then also inserted through "util.InsertToSortedSlice", which would register it twice. The two "append" lines look like the removed side of this diff hunk - confirm against the repository.
allAssessmentKeys = append(allAssessmentKeys, assessment)
allAssessmentKeysStrings = append(allAssessmentKeysStrings, key)

// Presumably keeps both slices in sorted order so that iterating over the registered keys is deterministic - verify against "util.InsertToSortedSlice".
allAssessmentKeys = util.InsertToSortedSlice(allAssessmentKeys, assessment)
allAssessmentKeysStrings = util.InsertToSortedSlice(allAssessmentKeysStrings, key)

return assessment
}

var (
// AssessmentKeyFilesExecuted holds the successfully executed files.
AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed")
// AssessmentKeyFilesProblems holds the files with problems.
AssessmentKeyFilesProblems = RegisterAssessmentKey("files-problems")

// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement")

// AssessmentKeyNoExcessResponse indicates that a model did not produce more content than requested.
AssessmentKeyNoExcessResponse = RegisterAssessmentKey("no-excess-response")
// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
// AssessmentKeyResponseNotEmpty indicates that a model response was not empty.
AssessmentKeyResponseNotEmpty = RegisterAssessmentKey("response-not-empty")
// AssessmentKeyResponseWithCode indicates that a model responded with code.
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
)

// Assessments holds a collection of numerical assessment metrics.
Expand All @@ -57,6 +64,21 @@ func (a Assessments) Add(x Assessments) {
}
}

// Equal reports whether both assessment collections hold the same value for every registered assessment key.
// A key that is missing from a collection reads as zero, so a missing key and an explicit zero compare as equal.
// A nil collection is only equal to another nil collection.
func (a Assessments) Equal(x Assessments) bool {
	switch {
	case a == nil && x == nil:
		return true
	case a == nil, x == nil:
		// Exactly one side is nil.
		return false
	}

	for _, k := range allAssessmentKeys {
		if left, right := a[k], x[k]; left != right {
			return false
		}
	}

	return true
}

// Merge combines two assessment collections into a new assessment collection and returns the new assessment collection.
func Merge(a Assessments, b Assessments) (c Assessments) {
c = NewAssessments()
Expand Down Expand Up @@ -108,7 +130,7 @@ func FormatStringCSV(metricsPerModel map[string]Assessments) (string, error) {
csv := csv.NewWriter(&out)

if err := csv.Write(csvHeader()); err != nil {
return "", err
return "", pkgerrors.WithStack(err)
}
models := maps.Keys(metricsPerModel)
sort.Strings(models)
Expand Down
136 changes: 109 additions & 27 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,31 @@ func TestAssessmentsAdd(t *testing.T) {

Assessments: NewAssessments(),
X: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},

ExpectedAssessments: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},
})

validate(t, &testCase{
Name: "Existing key",

Assessments: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},
X: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},

ExpectedAssessments: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 2,
AssessmentKeyResponseNoExcess: 2,
},
})
}

func TestMerge(t *testing.T) {
func TestAssessmentsMerge(t *testing.T) {
type testCase struct {
Name string

Expand Down Expand Up @@ -92,26 +92,26 @@ func TestMerge(t *testing.T) {

A: NewAssessments(),
B: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},

ExpectedC: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},
})

validate(t, &testCase{
Name: "Existing key",

A: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},
B: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 1,
AssessmentKeyResponseNoExcess: 1,
},

ExpectedC: map[AssessmentKey]uint{
AssessmentKeyNoExcessResponse: 2,
AssessmentKeyResponseNoExcess: 2,
},
})
}
Expand All @@ -134,24 +134,26 @@ func TestAssessmentString(t *testing.T) {
}

validate(t, &testCase{
Name: "Initial Metrics",
Name: "Empty Metrics",

Assessment: NewAssessments(),

ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, no-excess-response=0",
ExpectedString: "coverage-statement=0, files-executed=0, response-no-error=0, response-no-excess=0, response-not-empty=0, response-with-code=0",
})

validate(t, &testCase{
Name: "Empty Metrics",
Name: "Non-empty Metrics",

Assessment: Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyNoExcessResponse: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},

ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, no-excess-response=4",
ExpectedString: "coverage-statement=1, files-executed=2, response-no-error=4, response-no-excess=5, response-not-empty=6, response-with-code=7",
})
}

Expand Down Expand Up @@ -181,8 +183,8 @@ func TestFormatStringCSV(t *testing.T) {
},

ExpectedString: `
model,files-executed,files-problems,coverage-statement,no-excess-response
Model,0,0,0,0
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
Model,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -192,21 +194,101 @@ func TestFormatStringCSV(t *testing.T) {
"ModelA": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyNoExcessResponse: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
"ModelB": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyNoExcessResponse: 4,
AssessmentKeyFilesExecuted: 2, AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
},

ExpectedString: `
model,files-executed,files-problems,coverage-statement,no-excess-response
ModelA,2,3,1,4
ModelB,2,3,1,4
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
ModelA,1,2,4,5,6,7
ModelB,1,2,4,5,6,7
`,
})
}

// TestAssessmentsEqual checks "Assessments.Equal" for empty, nil, matching, default-valued and differing collections.
func TestAssessmentsEqual(t *testing.T) {
	type testCase struct {
		Name string

		Assessments Assessments
		X           Assessments

		ExpectedBool bool
	}

	testCases := []*testCase{
		{
			Name: "Empty",

			Assessments: NewAssessments(),
			X:           NewAssessments(),

			ExpectedBool: true,
		},
		{
			Name: "Nil",

			Assessments: nil,
			X:           nil,

			ExpectedBool: true,
		},
		{
			Name: "Equal Values",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: true,
		},
		{
			// An explicit zero entry must compare equal to a missing entry.
			Name: "Default Value",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 2,
				AssessmentKeyResponseNoError:  0,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: true,
		},
		{
			Name: "Different Values",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 3,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			assert.Equal(t, tc.ExpectedBool, tc.Assessments.Equal(tc.X))
		})
	}
}
13 changes: 13 additions & 0 deletions evaluate/metrics/testing/assessments.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package metricstesting

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
)

// AssertAssessmentsEqual asserts that the given assessments are equal according to "metrics.Assessments.Equal", i.e. ignoring default values.
// On a mismatch the test is marked as failed and both collections are printed.
func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual metrics.Assessments) {
	// Mark this function as a test helper so that failures are attributed to the calling test instead of this file.
	t.Helper()

	assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual)
}
3 changes: 1 addition & 2 deletions evaluate/repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,15 @@ func EvaluateRepository(resultPath string, model model.Model, language language.
assessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))
repositoryAssessment[metrics.AssessmentKeyFilesProblems]++

continue
}
repositoryAssessment.Add(assessments)
repositoryAssessment[metrics.AssessmentKeyResponseNoError]++

coverage, err := language.Execute(temporaryRepositoryPath)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))
repositoryAssessment[metrics.AssessmentKeyFilesProblems]++

continue
}
Expand Down
8 changes: 6 additions & 2 deletions evaluate/repository_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/zimmski/osutil"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing"
"github.com/symflower/eval-dev-quality/language"
"github.com/symflower/eval-dev-quality/model"
"github.com/symflower/eval-dev-quality/model/symflower"
Expand All @@ -35,7 +36,7 @@ func TestEvaluateRepository(t *testing.T) {

actualRepositoryAssessment, actualProblems, actualErr := EvaluateRepository(temporaryPath, tc.Model, tc.Language, tc.TestDataPath, tc.RepositoryPath)

assert.Equal(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment)
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment)
assert.Equal(t, tc.ExpectedProblems, actualProblems)
assert.Equal(t, tc.ExpectedError, actualErr)

Expand All @@ -60,7 +61,10 @@ func TestEvaluateRepository(t *testing.T) {
ExpectedRepositoryAssessment: metrics.Assessments{
metrics.AssessmentKeyCoverageStatement: 1,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyNoExcessResponse: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedResultFiles: []string{
"symflower_symbolic-execution/golang/golang/plain.log",
Expand Down
2 changes: 1 addition & 1 deletion model/llm/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (m *llm) ID() (id string) {
func (m *llm) GenerateTestsForFile(language language.Language, repositoryPath string, filePath string) (assessment metrics.Assessments, err error) {
data, err := os.ReadFile(filepath.Join(repositoryPath, filePath))
if err != nil {
return nil, err
return nil, pkgerrors.WithStack(err)
}
fileContent := strings.TrimSpace(string(data))

Expand Down
8 changes: 6 additions & 2 deletions model/llm/llm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (

"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/language"

metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing"
providertesting "github.com/symflower/eval-dev-quality/provider/testing"
)

Expand Down Expand Up @@ -46,7 +48,7 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {

actualAssessment, actualError := llm.GenerateTestsForFile(tc.Language, temporaryPath, tc.SourceFilePath)
assert.NoError(t, actualError)
assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)

actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
assert.NoError(t, err)
Expand Down Expand Up @@ -88,7 +90,9 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {
SourceFilePath: sourceFilePath,

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyNoExcessResponse: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedTestFileContent: `
package native
Expand Down
Loading

0 comments on commit 2d9ef6f

Please sign in to comment.