Skip to content

Commit

Permalink
Merge pull request #140 from m-lab/sandbox-disable-task-count
Browse files Browse the repository at this point in the history
update dedup query and table detail query
  • Loading branch information
gfr10598 authored Feb 21, 2019
2 parents 4c7d0fd + 6cf9dc8 commit 8e1b000
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 19 deletions.
8 changes: 6 additions & 2 deletions cloud/bq/dedup.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,13 @@ var dedupTemplateNDT = `
# Delete all duplicate rows based on test_id, preferring gz over non-gz, later parse_time
SELECT * except (row_number, gz, stripped_id)
from (
select *, ROW_NUMBER() OVER (PARTITION BY stripped_id order by gz DESC, parse_time DESC) row_number
select *,
# Prefer more snapshots, metadata, earlier task names, gzipped, later parse time
ROW_NUMBER() OVER (PARTITION BY stripped_id ORDER BY anomalies.num_snaps DESC, anomalies.no_meta, task_filename, gz DESC, parse_time DESC) row_number
FROM (
SELECT *, regexp_replace(test_id, ".gz$", "") as stripped_id, regexp_extract(test_id, ".*(.gz)$") as gz
SELECT *,
REGEXP_REPLACE(test_id, ".gz$", "") AS stripped_id,
REGEXP_EXTRACT(test_id, ".*(.gz)$") AS gz
FROM ` + "`%s`" + `
)
)
Expand Down
40 changes: 26 additions & 14 deletions cloud/bq/sanity.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,22 +168,22 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T
detail := Detail{}
queryString := fmt.Sprintf(`
#standardSQL
SELECT SUM(tests) AS TestCount, COUNT(task)-1 AS TaskFileCount
FROM (
-- This avoids null counts when the partition doesn't exist or is empty.
SELECT 0 AS tests, "fake-task" AS task
UNION ALL
SELECT COUNT(test_id) AS tests, task_filename AS task
FROM `+"`%s.%s`"+`
%s -- where clause
GROUP BY task
)`, dataset, tableName, where)
SELECT SUM(tests) AS TestCount, COUNT(DISTINCT task)-1 AS TaskFileCount
FROM (
-- This avoids null counts when the partition doesn't exist or is empty.
SELECT 0 AS tests, "fake-task" AS task
UNION ALL
SELECT COUNT(DISTINCT test_id) AS tests, task_filename AS task
FROM `+"`%s.%s`"+`
%s -- where clause
GROUP BY task
)`, dataset, tableName, where)

// TODO - this should take a context?
err := dsExt.QueryAndParse(ctx, queryString, &detail)
if err != nil {
log.Println(err)
log.Println("Query error:", queryString)
log.Println("Query:", queryString)
}
return &detail, err
}
Expand Down Expand Up @@ -278,6 +278,12 @@ func (at *AnnotatedTable) GetPartitionInfo(ctx context.Context) (*dataset.Partit
return &pInfo, nil
}

// IncludeTaskFileCountCheck temporarily disables the task file count check, to address the problem
// with 2012.
const IncludeTaskFileCountCheck = false
const testCountRequirement = 0.99 // Query updated to count DISTINCT test_ids, so this can now be much tighter.
const taskCountRequirement = 0.99

// checkAlmostAsBig compares the current and given AnnotatedTable test counts and
// task file counts. When the current AnnotatedTable has more than 1% fewer task files or 5%
// fewer rows compare to the given AnnotatedTable, then a descriptive error is returned.
Expand All @@ -294,11 +300,16 @@ func (at *AnnotatedTable) checkAlmostAsBig(ctx context.Context, other *Annotated
// Check that receiver table contains at least 99% as many tasks as
// other table.
if thisDetail.TaskFileCount < otherDetail.TaskFileCount {
log.Printf("Warning - fewer task files: %s(%d) < %s(%d)\n",
log.Printf("Warning - fewer task files: %s(%d) < %s(%d) possibly due to redundant task files.\n",
at.Table.FullyQualifiedName(), thisDetail.TaskFileCount,
other.Table.FullyQualifiedName(), otherDetail.TaskFileCount)
}
if float32(thisDetail.TaskFileCount) < 0.99*float32(otherDetail.TaskFileCount) {

// NOTE: We have discovered that in 2012, some archives contain tests that are entirely
// redundant with tests in other archives. This means that some archives are completely removed
// in the dedup process. Since these archives appear in the original "base_tables", this check
// has been causing the sanity check to fail.
if IncludeTaskFileCountCheck && float32(thisDetail.TaskFileCount) < taskCountRequirement*float32(otherDetail.TaskFileCount) {
return ErrTooFewTasks
}

Expand All @@ -309,7 +320,8 @@ func (at *AnnotatedTable) checkAlmostAsBig(ctx context.Context, other *Annotated
at.Table.FullyQualifiedName(), thisDetail.TestCount,
other.Table.FullyQualifiedName(), otherDetail.TestCount)
}
if float32(thisDetail.TestCount) < 0.95*float32(otherDetail.TestCount) {
// We are now using DISTINCT test counts, so we can use a tighter bound.
if float32(thisDetail.TestCount) < testCountRequirement*float32(otherDetail.TestCount) {
return ErrTooFewTests
}
return nil
Expand Down
6 changes: 3 additions & 3 deletions cloud/bq/sanity_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func TestGetTableDetail(t *testing.T) {
if err != nil {
t.Error(err)
} else if detail.TaskFileCount != 2 || detail.TestCount != 4 {
t.Error("Wrong number of tasks or tests")
t.Errorf("Wrong number of tasks or tests %v\n", detail)
}

srcDS, err := dataset.NewDataset(ctx, "mlab-testing", "src")
Expand All @@ -50,8 +50,8 @@ func TestGetTableDetail(t *testing.T) {
detail, err = bq.GetTableDetail(ctx, &srcDS, srcDS.Table("DedupTest_19990101"))
if err != nil {
t.Error(err)
} else if detail.TaskFileCount != 2 || detail.TestCount != 6 {
t.Error("Wrong number of tasks or tests")
} else if detail.TaskFileCount != 2 || detail.TestCount != 4 {
t.Errorf("Wrong number of tasks or tests %v\n", detail)
}
}

Expand Down

0 comments on commit 8e1b000

Please sign in to comment.