Skip to content

Commit

Permalink
update dedup query to prefer metadata and earlier task filename
Browse files Browse the repository at this point in the history
  • Loading branch information
gfr10598 committed Feb 21, 2019
1 parent 5510de2 commit 703c159
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 6 deletions.
8 changes: 6 additions & 2 deletions cloud/bq/dedup.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,13 @@ var dedupTemplateNDT = `
# Delete all duplicate rows based on test_id, preferring gz over non-gz, later parse_time
SELECT * except (row_number, gz, stripped_id)
from (
select *, ROW_NUMBER() OVER (PARTITION BY stripped_id order by gz DESC, parse_time DESC) row_number
select *,
# Prefer more snapshots, metadata, earlier task names, gzipped, later parse time
ROW_NUMBER() OVER (PARTITION BY stripped_id ORDER BY anomalies.num_snaps DESC, anomalies.no_meta, task_filename, gz DESC, parse_time DESC) row_number
FROM (
SELECT *, regexp_replace(test_id, ".gz$", "") as stripped_id, regexp_extract(test_id, ".*(.gz)$") as gz
SELECT *,
REGEXP_REPLACE(test_id, ".gz$", "") AS stripped_id,
REGEXP_EXTRACT(test_id, ".*(.gz)$") AS gz
FROM ` + "`%s`" + `
)
)
Expand Down
11 changes: 7 additions & 4 deletions cloud/bq/sanity.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,12 @@ func GetTableDetail(ctx context.Context, dsExt *dataset.Dataset, table bqiface.T
detail := Detail{}
queryString := fmt.Sprintf(`
#standardSQL
SELECT SUM(tests) AS TestCount, COUNT(task)-1 AS TaskFileCount
SELECT SUM(tests) AS TestCount, COUNT(DISTINCT task)-1 AS TaskFileCount
FROM (
-- This avoids null counts when the partition doesn't exist or is empty.
SELECT 0 AS tests, "fake-task" AS task
UNION ALL
SELECT COUNT(test_id) AS tests, task_filename AS task
SELECT COUNT(DISTINCT test_id) AS tests, task_filename AS task
FROM `+"`%s.%s`"+`
%s -- where clause
GROUP BY task
Expand Down Expand Up @@ -281,6 +281,8 @@ func (at *AnnotatedTable) GetPartitionInfo(ctx context.Context) (*dataset.Partit
// IncludeTaskFileCountCheck controls whether checkAlmostAsBig enforces the task file count
// requirement. It is currently false, which temporarily disables that check, to address the
// problem with 2012.
const IncludeTaskFileCountCheck = false
// testCountRequirement is the minimum acceptable ratio of this table's TestCount to the other
// table's TestCount; checkAlmostAsBig returns ErrTooFewTests when the ratio falls below it.
const testCountRequirement = 0.99 // Query updated to count DISTINCT test_ids, so this can now be much tighter.
// taskCountRequirement is the minimum acceptable ratio of this table's TaskFileCount to the
// other table's TaskFileCount; it is only consulted when IncludeTaskFileCountCheck is true,
// in which case checkAlmostAsBig returns ErrTooFewTasks when the ratio falls below it.
const taskCountRequirement = 0.99

// checkAlmostAsBig compares the current and given AnnotatedTable test counts and
// task file counts. When the current AnnotatedTable has more than 1% fewer task files or 1%
Expand All @@ -307,7 +309,7 @@ func (at *AnnotatedTable) checkAlmostAsBig(ctx context.Context, other *Annotated
// redundant with tests in other archives. This means that some archives are completely removed
// in the dedup process. Since these archives appear in the original "base_tables", this check
// has been causing the sanity check to fail.
if IncludeTaskFileCountCheck && float32(thisDetail.TaskFileCount) < 0.99*float32(otherDetail.TaskFileCount) {
if IncludeTaskFileCountCheck && float32(thisDetail.TaskFileCount) < taskCountRequirement*float32(otherDetail.TaskFileCount) {
return ErrTooFewTasks
}

Expand All @@ -318,7 +320,8 @@ func (at *AnnotatedTable) checkAlmostAsBig(ctx context.Context, other *Annotated
at.Table.FullyQualifiedName(), thisDetail.TestCount,
other.Table.FullyQualifiedName(), otherDetail.TestCount)
}
if float32(thisDetail.TestCount) < 0.95*float32(otherDetail.TestCount) {
// We are now using DISTINCT test counts, so we can use a tighter bound.
if float32(thisDetail.TestCount) < testCountRequirement*float32(otherDetail.TestCount) {
return ErrTooFewTests
}
return nil
Expand Down

0 comments on commit 703c159

Please sign in to comment.