Skip to content

Commit

Permalink
Add docs, unit tests, and refactor repro test infra
Browse files Browse the repository at this point in the history
Update reproducibility_tests/reproducibility_utils.jl

Co-authored-by: Gabriele Bozzola <[email protected]>

Update reproducibility_tests/reproducibility_utils.jl

Co-authored-by: Gabriele Bozzola <[email protected]>

Update reproducibility_tests/reproducibility_utils.jl

Co-authored-by: Gabriele Bozzola <[email protected]>

Update compute_bins
  • Loading branch information
charleskawczynski committed Nov 15, 2024
1 parent 1d372ce commit 577fb49
Show file tree
Hide file tree
Showing 3 changed files with 452 additions and 57 deletions.
5 changes: 4 additions & 1 deletion reproducibility_tests/move_output.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,8 @@ else
end

# Prune stored reference folders; runs only when both `buildkite_ci` and
# `in_merge_queue` are true.
if buildkite_ci && in_merge_queue
# NOTE(review): `cleanup_central` and the loop below both delete folders under
# `cluster_data_prefix`; confirm whether this call is meant to remain.
cleanup_central(cluster_data_prefix)
# Collect the folders selected for deletion under `cluster_data_prefix`.
folders = get_reference_paths_to_delete(; root_path = cluster_data_prefix)
for f in folders
# `recursive = true` removes the folder with its contents; `force = true`
# means a path that no longer exists is not treated as an error.
rm(f; recursive = true, force = true)
end
end
225 changes: 181 additions & 44 deletions reproducibility_tests/reproducibility_utils.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,44 @@
#=
################################################################################
Reproducibility Terminology.
Consider the following set of reproducibility
folders, prefixed by "reference counters", which
allow users to compare against other reproducible
states in that column.
Note that reference counter changes can "rewind"
(which may happen in the case of reverted commits).
In such cases, we do consider the rewound state as
an entirely new state, in order to fully preserve
the history (to some depth).
An important consequence is that we need precise
terminology to avoid ambiguous descriptions.
For example, "comparable references per reference counter"
is not well defined, because the reference counter can
be reverted. So, let's introduce the concept of a "bin",
which can be defined as a collection of folders
created in a period with the same reference counter.
Folders created before and after that bin have a different
reference counter. Also, `n_bins == n_reference_changes + 1`
(modulo the edge case of when there are no bins)
because, if the reference counter doesn't change, new results
are put into the same bin.
```
comparable states
|                      ref counter changes ---->                       | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
################################################################################
=#

import Dates

read_ref_counter(filename) = parse(Int, first(readlines(filename)))
Expand Down Expand Up @@ -118,57 +159,153 @@ function latest_comparable_paths(;
return comparable_paths
end

function reason(path)
f = joinpath(path, "ref_counter.jl")
if !isfile(f)
return "ref_counter.jl does not exist"
else
ref_counter = parse(Int, first(readlines(f)))
return "ref_counter: $ref_counter"
"""
invalid_reference_folders(; root_path)
Return all subfolders in `root_path`
that meet the following criteria:
- A `ref_counter.jl` file is missing
"""
function invalid_reference_folders(; root_path)
    # A folder is valid only if it contains a `ref_counter.jl` file.
    has_ref_counter(folder) = isfile(joinpath(folder, "ref_counter.jl"))
    candidates = sorted_dataset_folder(; dir = root_path)
    # Keep the folders that fail the check, in the order they were listed.
    return filter(!has_ref_counter, candidates)
end

function cleanup_central(cluster_data_prefix)
@warn "Cleaning up old files on central"
# Get (sorted) array of paths, `pop!(sorted_paths)`
# is the most recent merged folder.
sorted_paths = sorted_dataset_folder(; dir = cluster_data_prefix)
keep_latest_n = 0
keep_latest_ref_counters = 5
if !isempty(sorted_paths)
N = length(sorted_paths) - keep_latest_n
paths_to_delete = []
ref_counters_main = ref_counters_per_path(sorted_paths)
i_largest_reference = argmax(ref_counters_main)
path = sorted_paths[i_largest_reference]
ref_counter_file_main = joinpath(path, "ref_counter.jl")
@assert isfile(ref_counter_file_main)
ref_counter_main = parse(Int, first(readlines(ref_counter_file_main)))

for i in 1:N
path = sorted_paths[i]
ref_counter_file = joinpath(path, "ref_counter.jl")
if !isfile(ref_counter_file)
push!(paths_to_delete, path)
"""
compute_bins(root_path::String)
compute_bins(sorted_paths::Vector{String})
Return a vector of reproducibility bins.
Bins are sorted from newest to oldest:
- `bins[1], bins[end]` are the newest and oldest bins
- `bins[i][1], bins[i][end]` are the newest and oldest comparable states in bin `i`.
```
comparable states
|                      ref counter changes ---->                       | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
"""
function compute_bins(root_path::String)
    # Reverse the folder listing before binning, so the order handed to the
    # vector method is the opposite of what `sorted_dataset_folder` returns.
    listed_paths = sorted_dataset_folder(; dir = root_path)
    return compute_bins(reverse(listed_paths))
end
# Group `sorted_paths` into bins: maximal runs of consecutive paths whose
# `ref_counter.jl` files hold the same reference counter.
#
# - `sorted_paths`: folder paths, already in the order the bins should follow.
# - Returns a `Vector{Vector{String}}`; the order of the input is preserved both
#   across bins and within each bin. An empty input yields no bins.
#
# Counters are compared only between neighbours in the input, so a counter
# value that reappears later (after a different one) starts a new bin.
function compute_bins(sorted_paths::Vector{String})
    bins = Vector{String}[]
    path_index = 1
    while path_index <= length(sorted_paths)
        # Each pass of the outer loop builds exactly one bin.
        paths_per_bin = String[]
        while path_index <= length(sorted_paths)
            path = sorted_paths[path_index]
            if isempty(paths_per_bin)
                # The first path always opens the bin; no counter is read
                # until a second candidate has to be compared against it.
                push!(paths_per_bin, path)
                path_index += 1
            else
                ref_counter_bin = read_ref_counter(
                    joinpath(first(paths_per_bin), "ref_counter.jl"),
                )
                ref_counter_path =
                    read_ref_counter(joinpath(path, "ref_counter.jl"))
                if ref_counter_bin == ref_counter_path
                    push!(paths_per_bin, path)
                    path_index += 1
                else
                    # Counter changed: close this bin without consuming
                    # `path`, so that it opens the next bin.
                    break
                end
            end
        end
        push!(bins, paths_per_bin)
    end
    return bins
end

"""
get_reference_paths_to_delete(;
root_path,
keep_n_comparable_states = 5,
keep_n_bins_back = 7,
)
Return a list of folders to delete.
Our reference folders are saved, and can
therefore build up significantly and take
a lot of storage.
Consider a collection of folders whose
names are prepended by the reference
counter:
```
keep_n_comparable_states
|                        <---- keep_n_bins_back                        | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
With these folders, and given a reference
counter of 10, we'll see the following
behavior:
```
get_reference_paths_to_delete(;
keep_n_comparable_states = 4,
keep_n_bins_back = 3
) -> [02_49f92, 04_36ebe, 04_d6e48, 04_4c042]
get_reference_paths_to_delete(;
keep_n_comparable_states = 1,
keep_n_bins_back = 5
) -> [02_49f92, 04_d6e48, 04_4c042, 06_d6d73, 08_1cc58]
```
Note:
The bins counted by `keep_n_bins_back` are ordered _chronologically_,
in order to correctly operate in the case of
reverted pull requests. In other words, the above
references may look like this:
```
keep_n_comparable_states
|                        <---- keep_n_bins_back                        | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
"""
function get_reference_paths_to_delete(;
    root_path,
    keep_n_comparable_states = 5,
    keep_n_bins_back = 7,
)
    # Binning reads `ref_counter.jl` from every folder, so refuse to proceed
    # if any folder under `root_path` is missing that file.
    @assert isempty(invalid_reference_folders(; root_path))
    paths_to_delete = String[]
    # After reversing, sorted_paths[1] is newest and sorted_paths[end] is oldest.
    sorted_paths = reverse(sorted_dataset_folder(; dir = root_path))
    isempty(sorted_paths) && return paths_to_delete
    bins = compute_bins(sorted_paths)
    for (i, bin) in enumerate(bins), (j, path) in enumerate(bin)
        # A path survives only if it sits in one of the first
        # `keep_n_bins_back` bins AND is among the first
        # `keep_n_comparable_states` entries of that bin.
        keep = i <= keep_n_bins_back && j <= keep_n_comparable_states
        keep || push!(paths_to_delete, path)
    end
    return paths_to_delete
end
Loading

0 comments on commit 577fb49

Please sign in to comment.