Merge pull request #55 from mgyoo86/improve/h5merge
Improve the flexibility of `h5merge` with new keywords
orso82 authored Feb 22, 2025
2 parents cfdcb9a + fc5d83a commit 1691990
Showing 2 changed files with 138 additions and 26 deletions.
160 changes: 134 additions & 26 deletions src/io.jl
@@ -1342,27 +1342,53 @@ end
# h5merge #
#= ======= =#
"""
h5merge(output_file::AbstractString, keys_files::AbstractVector{<:Pair{<:AbstractString, <:AbstractString}};
mode::AbstractString="a", skip_existing_entries::Bool=false)
h5merge(output_file::AbstractString, keys_files::Union{AbstractDict{<:AbstractString,<:AbstractString},AbstractVector{<:Pair{<:AbstractString,<:AbstractString}}};
mode::AbstractString="a", skip_existing_entries::Bool=false,
h5_group_search_depth::Integer=0, h5_strip_group_prefix::Bool=false,
verbose::Bool=false)
Merges multiple files into a single HDF5 `output_file`, using `keys_files` as a vector of pairs where each key is a group name and each value is the corresponding input filename.
Merges multiple files into a single HDF5 output file.
`.h5` files will be added to the `output_file`, while other file formats will be added either as strings (for text file formats) or bytes of raw data.
Note that keys that contain `/` will result in nested groups in the `output_file`.
The `mode` argument specifies whether to create a new file (`"w"`) or append to an existing one (`"a"`).
If `skip_existing_entries = false`, files with the same key in the output file will be updated.
# Arguments
- `output_file`: Path to the HDF5 file where data will be merged.
- `keys_files`: A vector or dictionary mapping target group names to input filenames.
- `mode`: `"w"` to create a new file or `"a"` to append to an existing one.
- `skip_existing_entries`: If `true`, groups already present in the output file are not overwritten.
- `h5_group_search_depth`: For input HDF5 files, the depth at which to collect group paths.
- `0` means use the root (`"/"`).
- `1` means collect immediate children of the root.
- Higher values collect groups deeper in the hierarchy.
- `h5_strip_group_prefix`: If `true`, the target group name (the key from `keys_files`) is omitted from the output HDF5 path.
- For example, if an input file contains a group path `"/level1/level2"` and the key is `"parent"`, then
- With `h5_strip_group_prefix = false`, the output path becomes `"/parent/level1/level2"`.
- With `h5_strip_group_prefix = true`, the output path becomes `"/level1/level2"`.
- `verbose`: If `true`, additional logging information is printed.
# Behavior
- For input files with the `.h5` extension, the function opens the file and collects group paths up to `h5_group_search_depth`.
Each collected path is then prefixed with the parent key from `keys_files` (unless `h5_strip_group_prefix` is `true`, in which case the parent key is omitted), and the corresponding objects are copied into the output file.
- For other file types (e.g., JSON, YAML, text, markdown), the file is read and stored as text or raw binary data.
- The function records attributes for each copied group that indicate the original file paths.
Returns a vector of group names (as strings) that were processed.
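# Example
A minimal usage sketch (the output file name, group keys, and input file names below are hypothetical):
```julia
check_group_list = h5merge("combined.h5",
    ["run1" => "run1.h5", "notes" => "notes.txt"];
    mode="w", h5_group_search_depth=1, h5_strip_group_prefix=false, verbose=true)
```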
"""
function h5merge(
output_file::AbstractString,
keys_files::Union{AbstractDict{<:AbstractString,<:AbstractString},AbstractVector{<:Pair{<:AbstractString,<:AbstractString}}};
mode::AbstractString="a",
skip_existing_entries::Bool=false,
h5_group_search_depth::Integer=0,
h5_strip_group_prefix::Bool=false,
verbose::Bool=false
)
@assert mode in ("w", "a")
@assert h5_group_search_depth >= 0 "h5_group_search_depth cannot be negative"

if h5_strip_group_prefix
# Ensure that at least level 1 is searched if the parent prefix is to be omitted.
h5_group_search_depth = max(h5_group_search_depth, 1)
end

for file in values(keys_files)
@assert !isdir(file) "h5merge: File `$file` is a directory, not a file."
@@ -1376,51 +1402,81 @@ function h5merge(
mode = "r+"
end

check_group_list = String[];

HDF5.h5open(output_file, mode) do output_h5

update_file_attributes(output_h5)

for (group_name, input_file) in keys_files
if haskey(output_h5, group_name)
for (gparent_name, input_file) in keys_files
if haskey(output_h5, gparent_name)
if skip_existing_entries
continue
end
HDF5.delete_object(output_h5, group_name)
HDF5.delete_object(output_h5, gparent_name)
end

if isempty(input_file) || filesize(input_file) == 0
file_type = "EMPTY"
HDF5.create_dataset(output_h5, group_name, UInt8, 0)
HDF5.create_dataset(output_h5, gparent_name, UInt8, 0)

elseif endswith(input_file, ".h5")
file_type = "HDF5"

HDF5.h5open(input_file, "r") do input_h5
return HDF5.copy_object(input_h5, "/", output_h5, group_name)
end

target_h5_paths = h5_collect_group_paths(input_h5, h5_group_search_depth)
# Use gparent_name only if h5_strip_group_prefix is false.
prefix = h5_strip_group_prefix ? "" : gparent_name

for ori_h5_path in target_h5_paths
new_h5_path = norm_hdf5_path("/" * prefix * "/" * ori_h5_path)

if haskey(output_h5, new_h5_path)
if skip_existing_entries
verbose && @info "Skipping [$(relpath(input_h5.filename))$ori_h5_path]"
continue
else
@warn "Overwriting: [$(relpath(input_h5.filename))$ori_h5_path] --> [$(output_h5.filename)$new_h5_path]"
HDF5.delete_object(output_h5, new_h5_path)
end
end

HDF5.copy_object(input_h5, ori_h5_path, output_h5, new_h5_path)

attr = HDF5.attrs(output_h5[new_h5_path])
attr["original_file_abs_path"] = abspath(input_file)
attr["original_file_rel_path"] = relpath(input_file)
push!(check_group_list, new_h5_path)
end
end
elseif split(input_file, ".")[end] in ("json", "yaml", "txt", "md") || is_text_file(input_file)
file_type = "TEXT"
open(input_file, "r") do io
text = read(io, String)
return HDF5.write(output_h5, group_name, text)
HDF5.write(output_h5, gparent_name, text)
end
else
file_type = "BINARY"
open(input_file, "r") do io
data = read(io) # Read the file as bytes
dset = HDF5.create_dataset(output_h5, group_name, UInt8, length(data))
return dset[:] = data
dset = HDF5.create_dataset(output_h5, gparent_name, UInt8, length(data))
dset[:] = data
end
end
if verbose
@info "$(group_name) --> [$(file_type)] @ $(input_file)"
@info "$(gparent_name) --> [$(file_type)] @ $(input_file)"
end

attr = HDF5.attrs(output_h5[group_name])
attr["original_file_abs_path"] = abspath(input_file)
attr["original_file_rel_path"] = relpath(input_file)
if file_type != "HDF5"
attr = HDF5.attrs(output_h5[gparent_name])
attr["original_file_abs_path"] = abspath(input_file)
attr["original_file_rel_path"] = relpath(input_file)
push!(check_group_list, gparent_name)
end
end
end
return check_group_list
end

"""
@@ -1504,15 +1560,59 @@ function h5merge(
end


h5merge(output_file, keys_files; mode, skip_existing_entries, verbose, kwargs...)
check_group_list = h5merge(output_file, keys_files; mode, skip_existing_entries, verbose, kwargs...)

if cleanup
cleanup_files_and_directory(output_file, directory, keys_files; verbose, pattern)
cleanup_files_and_directory(output_file, directory, keys_files; check_group_list, verbose, pattern)
end

return
end

"""
h5_collect_group_paths(H5_file::HDF5.File, search_depth::Integer)
Collects group paths in an HDF5 file up to the specified target depth using an iterative, stack-based traversal.
- If `search_depth == 0`, returns ["/"].
- If `search_depth == 1`, returns paths of all objects directly under the root (e.g., "/group1", "/group2").
- For `search_depth > 1`, returns paths at the specified depth (e.g., for `search_depth == 2`, returns "/group1/subgroup1", etc.).
Returns an array of strings containing the collected group paths.
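For example, for a hypothetical input file whose root contains the groups `/group1` and `/group2`, a minimal sketch (ordering of the returned paths is not guaranteed):
```julia
paths = HDF5.h5open("input.h5", "r") do f
    h5_collect_group_paths(f, 1)    # e.g. ["/group1", "/group2"]
end
```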
"""
function h5_collect_group_paths(H5_file::HDF5.File, search_depth::Integer)
paths = String[]
# Initialize the stack with a tuple: (current group, current path, current depth)
root_group = H5_file["/"]
stack = [(root_group, "/", 0)]

while !isempty(stack)
grp, current_path, depth = pop!(stack)

if depth == search_depth
# If we've reached the target depth, add the current path to the results.
push!(paths, current_path)
else
# Otherwise, iterate over the children of the current group.
for child in keys(grp)
new_path = joinpath(current_path, child)
child_obj = grp[child]
if child_obj isa HDF5.Group
# Push the child group onto the stack with incremented depth.
push!(stack, (child_obj, new_path, depth + 1))
else
# If the child is not a group but would reach the target depth,
# add its path to the results.
if depth + 1 == search_depth
push!(paths, new_path)
end
end
end
end
end

return paths
end

"""
read_combined_h5(filename::AbstractString; error_on_missing_coordinates::Bool=true, pattern::Regex=r"", kw...)
@@ -1612,12 +1712,20 @@ function cleanup_files_and_directory(
combined_file_name::AbstractString,
base_directory::AbstractString,
keys_files::Union{AbstractDict{<:AbstractString,<:AbstractString},AbstractVector{<:Pair{<:AbstractString,<:AbstractString}}};
check_group_list::AbstractArray{<:AbstractString}=String[],
verbose::Bool=false,
pattern::Union{Regex,Nothing}=nothing
)

if isempty(check_group_list)
for (group_name, input_file) in keys_files
push!(check_group_list, group_name)
end
end

HDF5.h5open(combined_file_name, "r") do merged_h5
all_exist = true
for (group_name, input_file) in keys_files
for group_name in check_group_list
if !haskey(merged_h5, group_name)
all_exist = false
@warn "Group '$group_name' is missing in $combined_file_name; cleanup aborted."
4 changes: 4 additions & 0 deletions test/runtests_io_extended.jl
@@ -96,6 +96,10 @@ include(joinpath(@__DIR__,"test_expressions_dicts.jl"))
@test keys(fid) == ["11.h5", "22.h5", "aa.h5", "bb.h5"]
close(fid)

# h5_strip_group_prefix test
h5merge(combined_file_name, [tmp_dir1, tmp_dir2]; mode="w", pattern=r".*h5", verbose=true, h5_strip_group_prefix=true, skip_existing_entries=true)
h5merge(combined_file_name, [tmp_dir1, tmp_dir2]; mode="w", pattern=r".*h5", verbose=true, h5_strip_group_prefix=true, skip_existing_entries=false)

# Merge including non-hdf5 files (txt and binaryfile)
touch(joinpath(tmp_dir1,"empty.txt"))
open(joinpath(tmp_dir1, "test.txt"), "w") do f
