Skip to content

Commit

Permalink
Merge pull request #42 from symflower/scoring
Browse files Browse the repository at this point in the history
Scoring and Ranking
  • Loading branch information
zimmski authored Apr 18, 2024
2 parents 2d9ef6f + e65aa8d commit f0b2e5e
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 40 deletions.
22 changes: 12 additions & 10 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,21 +109,21 @@ func (command *Evaluate) Execute(args []string) (err error) {

// Check that models and languages can be evaluated by executing the "plain" repositories.
log.Printf("Checking that models and languages can be used for evaluation")
metricsPerModel := map[string]metrics.Assessments{}
assessmentsPerModel := map[string]metrics.Assessments{}
problemsPerModel := map[string][]error{}
{
// Ensure we report metrics for every model even if they are excluded.
for _, modelID := range command.Models {
metricsPerModel[modelID] = metrics.NewAssessments()
assessmentsPerModel[modelID] = metrics.NewAssessments()
}

for _, languageID := range command.Languages {
for _, modelID := range command.Models {
model := models[modelID]
language := language.Languages[languageID]

metrics, ps, err := evaluate.EvaluateRepository(command.ResultPath, model, language, command.TestdataPath, filepath.Join(language.ID(), "plain"))
metricsPerModel[modelID].Add(metrics)
assessment, ps, err := evaluate.EvaluateRepository(command.ResultPath, model, language, command.TestdataPath, filepath.Join(language.ID(), "plain"))
assessmentsPerModel[modelID].Add(assessment)
if err != nil {
ps = append(ps, err)
}
Expand Down Expand Up @@ -162,8 +162,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
model := models[modelID]
language := language.Languages[languageID]

metrics, ps, err := evaluate.EvaluateRepository(command.ResultPath, model, language, command.TestdataPath, filepath.Join(languageID, repository.Name()))
metricsPerModel[model.ID()].Add(metrics)
assessment, ps, err := evaluate.EvaluateRepository(command.ResultPath, model, language, command.TestdataPath, filepath.Join(languageID, repository.Name()))
assessmentsPerModel[model.ID()].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repository.Name(), err)
Expand All @@ -172,11 +172,13 @@ func (command *Evaluate) Execute(args []string) (err error) {
}
}

for _, modelID := range command.Models {
log.Printf("Evaluation score for %q: %s", modelID, metricsPerModel[modelID])
}
_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q: %s", model, assessment)

return nil
})

csv, err := metrics.FormatStringCSV(metricsPerModel)
csv, err := metrics.FormatStringCSV(assessmentsPerModel)
if err != nil {
log.Fatalf("ERROR: could not create result summary: %s", err)
}
Expand Down
63 changes: 51 additions & 12 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/csv"
"fmt"
"sort"
"strconv"
"strings"

pkgerrors "github.com/pkg/errors"
Expand Down Expand Up @@ -92,18 +93,32 @@ func Merge(a Assessments, b Assessments) (c Assessments) {
return c
}

// Score computes the score over all assessments in the collection by summing up the values of all its entries.
// A nil or empty collection has a score of 0.
func (a Assessments) Score() (score uint) {
	// Ranging over a nil or empty map performs no iterations, so no special case is required. Iterating the map directly also avoids allocating an intermediate slice that holds all values.
	for _, value := range a {
		score += value
	}

	return score
}

// String returns a string representation of the metrics.
func (a Assessments) String() string {
if a == nil {
a = NewAssessments()
}
metrics := make([]string, len(allAssessmentKeys))
entries := make([]string, len(allAssessmentKeys))

for i, key := range allAssessmentKeys {
metrics[i] = fmt.Sprintf("%s=%d", key, a[key])
entries[i] = fmt.Sprintf("%s=%d", key, a[key])
}
entries = append([]string{fmt.Sprintf("score=%d", a.Score())}, entries...)

return strings.Join(metrics, ", ")
return strings.Join(entries, ", ")
}

// StringCSV returns a CSV row string representation of the metrics.
Expand All @@ -121,27 +136,51 @@ func (a Assessments) StringCSV() (row []string) {
}

func csvHeader() []string {
return append([]string{"model"}, allAssessmentKeysStrings...)
return append([]string{"model", "score"}, allAssessmentKeysStrings...)
}

// FormatStringCSV formats the given metrics as CSV.
func FormatStringCSV(metricsPerModel map[string]Assessments) (string, error) {
// FormatStringCSV formats the given assessment metrics as CSV.
func FormatStringCSV(assessmentsPerModel map[string]Assessments) (string, error) {
var out strings.Builder
csv := csv.NewWriter(&out)

if err := csv.Write(csvHeader()); err != nil {
return "", pkgerrors.WithStack(err)
}
models := maps.Keys(metricsPerModel)
sort.Strings(models)
for _, model := range models {
row := metricsPerModel[model].StringCSV()

if err := csv.Write(append([]string{model}, row...)); err != nil {
return "", pkgerrors.WithStack(err)
if err := WalkByScore(assessmentsPerModel, func(model string, assessment Assessments, score uint) error {
row := assessment.StringCSV()

if err := csv.Write(append([]string{model, strconv.FormatUint(uint64(score), 10)}, row...)); err != nil {
return pkgerrors.WithStack(err)
}

return nil
}); err != nil {
return "", err
}
csv.Flush()

return out.String(), nil
}

// WalkByScore calls the given function once for every model, ordered by ascending score.
// Models that share the same score are visited in ascending order of their name.
// The walk stops at the first error returned by the function, and that error is returned.
func WalkByScore(assessmentsPerModel map[string]Assessments, function func(model string, assessment Assessments, score uint) error) error {
	// Compute every score exactly once up front so that the comparison below only does lookups.
	scorePerModel := make(map[string]uint, len(assessmentsPerModel))
	for name, assessment := range assessmentsPerModel {
		scorePerModel[name] = assessment.Score()
	}

	// Model names are unique map keys, hence comparing by score first and by name second defines a total order and the result is deterministic.
	ordered := maps.Keys(assessmentsPerModel)
	sort.Slice(ordered, func(i, j int) bool {
		left, right := ordered[i], ordered[j]
		if scorePerModel[left] != scorePerModel[right] {
			return scorePerModel[left] < scorePerModel[right]
		}

		return left < right
	})

	for _, name := range ordered {
		err := function(name, assessmentsPerModel[name], scorePerModel[name])
		if err != nil {
			return err
		}
	}

	return nil
}
162 changes: 144 additions & 18 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/zimmski/osutil/bytesutil"
)

Expand Down Expand Up @@ -138,7 +139,7 @@ func TestAssessmentString(t *testing.T) {

Assessment: NewAssessments(),

ExpectedString: "coverage-statement=0, files-executed=0, response-no-error=0, response-no-excess=0, response-not-empty=0, response-with-code=0",
ExpectedString: "score=0, coverage-statement=0, files-executed=0, response-no-error=0, response-no-excess=0, response-not-empty=0, response-with-code=0",
})

validate(t, &testCase{
Expand All @@ -147,13 +148,13 @@ func TestAssessmentString(t *testing.T) {
Assessment: Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNotEmpty: 5,
AssessmentKeyResponseWithCode: 6,
},

ExpectedString: "coverage-statement=1, files-executed=2, response-no-error=4, response-no-excess=5, response-not-empty=6, response-with-code=7",
ExpectedString: "score=21, coverage-statement=1, files-executed=2, response-no-error=3, response-no-excess=4, response-not-empty=5, response-with-code=6",
})
}

Expand Down Expand Up @@ -183,8 +184,8 @@ func TestFormatStringCSV(t *testing.T) {
},

ExpectedString: `
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
Model,0,0,0,0,0,0
model,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
Model,0,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -194,24 +195,25 @@ func TestFormatStringCSV(t *testing.T) {
"ModelA": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNotEmpty: 5,
AssessmentKeyResponseWithCode: 6,
},
"ModelB": Assessments{
AssessmentKeyCoverageStatement: 2,
AssessmentKeyFilesExecuted: 3,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
"ModelB": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2, AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
},

ExpectedString: `
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
ModelA,1,2,4,5,6,7
ModelB,1,2,4,5,6,7
model,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
ModelA,21,1,2,3,4,5,6
ModelB,27,2,3,4,5,6,7
`,
})
}
Expand Down Expand Up @@ -292,3 +294,127 @@ func TestAssessmentsEqual(t *testing.T) {
ExpectedBool: false,
})
}

// TestAssessmentsScore checks that "Assessments.Score" returns the expected score for a collection of assessments.
func TestAssessmentsScore(t *testing.T) {
	scenarios := []struct {
		name     string
		input    Assessments
		expected uint
	}{
		{
			// A freshly created collection is expected to have no score.
			name:     "Empty Assessment",
			input:    NewAssessments(),
			expected: 0,
		},
		{
			// The score is expected to be the sum of the individual values.
			name: "Values Assessment",
			input: Assessments{
				AssessmentKeyFilesExecuted:     5,
				AssessmentKeyCoverageStatement: 4,
			},
			expected: 9,
		},
	}

	for index := range scenarios {
		scenario := scenarios[index]

		t.Run(scenario.name, func(t *testing.T) {
			assert.Equal(t, scenario.expected, scenario.input.Score())
		})
	}
}

// TestWalkByScore checks that "WalkByScore" visits every model exactly once in ascending order of score, and that the callback receives the assessments and score belonging to the visited model.
func TestWalkByScore(t *testing.T) {
	type testCase struct {
		Name string

		AssessmentPerModel map[string]Assessments

		ExpectedModelOrder []string
		ExpectedScoreOrder []uint
	}

	validate := func(t *testing.T, tc *testCase) {
		t.Run(tc.Name, func(t *testing.T) {
			require.Equal(t, len(tc.ExpectedModelOrder), len(tc.ExpectedScoreOrder), "expected order needs equal lengths")

			actualModelOrder := make([]string, 0, len(tc.ExpectedModelOrder))
			actualAssessmentOrder := make([]Assessments, 0, len(tc.ExpectedModelOrder))
			actualScoreOrder := make([]uint, 0, len(tc.ExpectedScoreOrder))
			assert.NoError(t, WalkByScore(tc.AssessmentPerModel, func(model string, assessment Assessments, score uint) error {
				actualModelOrder = append(actualModelOrder, model)
				actualAssessmentOrder = append(actualAssessmentOrder, assessment)
				actualScoreOrder = append(actualScoreOrder, score)

				return nil
			}))

			assert.Equal(t, tc.ExpectedModelOrder, actualModelOrder)
			assert.Equal(t, tc.ExpectedScoreOrder, actualScoreOrder)
			// Stop here if the callback was not invoked as often as expected, otherwise the index access below panics instead of reporting a readable failure.
			require.Len(t, actualAssessmentOrder, len(tc.ExpectedModelOrder))
			for i, model := range tc.ExpectedModelOrder {
				assert.Equal(t, tc.AssessmentPerModel[model], actualAssessmentOrder[i])
			}
		})
	}

	validate(t, &testCase{
		Name: "No Assessment",

		AssessmentPerModel: map[string]Assessments{},

		ExpectedModelOrder: []string{},
		ExpectedScoreOrder: []uint{},
	})

	validate(t, &testCase{
		Name: "Single Assessment",

		AssessmentPerModel: map[string]Assessments{
			"Model": Assessments{
				AssessmentKeyFilesExecuted: 1,
			},
		},

		ExpectedModelOrder: []string{
			"Model",
		},
		ExpectedScoreOrder: []uint{
			1,
		},
	})

	validate(t, &testCase{
		Name: "Multiple Assessments",

		AssessmentPerModel: map[string]Assessments{
			"ModelA": Assessments{
				AssessmentKeyFilesExecuted: 1,
			},
			"ModelB": Assessments{
				AssessmentKeyFilesExecuted: 2,
			},
			"ModelC": Assessments{
				AssessmentKeyFilesExecuted: 3,
			},
		},

		ExpectedModelOrder: []string{
			"ModelA",
			"ModelB",
			"ModelC",
		},
		ExpectedScoreOrder: []uint{
			1,
			2,
			3,
		},
	})

	// The name order deliberately differs from the score order, so this case fails if models are only ordered by name.
	validate(t, &testCase{
		Name: "Score Order Differs From Name Order",

		AssessmentPerModel: map[string]Assessments{
			"ModelA": Assessments{
				AssessmentKeyFilesExecuted: 3,
			},
			"ModelB": Assessments{
				AssessmentKeyFilesExecuted: 1,
			},
			"ModelC": Assessments{
				AssessmentKeyFilesExecuted: 2,
			},
		},

		ExpectedModelOrder: []string{
			"ModelB",
			"ModelC",
			"ModelA",
		},
		ExpectedScoreOrder: []uint{
			1,
			2,
			3,
		},
	})

	// Models with the same score must come out in a deterministic order, which is ascending by name.
	validate(t, &testCase{
		Name: "Equal Scores Are Ordered By Name",

		AssessmentPerModel: map[string]Assessments{
			"ModelB": Assessments{
				AssessmentKeyFilesExecuted: 1,
			},
			"ModelA": Assessments{
				AssessmentKeyFilesExecuted: 1,
			},
			"ModelC": Assessments{
				AssessmentKeyFilesExecuted: 0,
			},
		},

		ExpectedModelOrder: []string{
			"ModelC",
			"ModelA",
			"ModelB",
		},
		ExpectedScoreOrder: []uint{
			0,
			1,
			1,
		},
	})
}

0 comments on commit f0b2e5e

Please sign in to comment.