Skip to content

Commit

Permalink
fix(dogs v cats example): changed md5 hashing to sha256
Browse files Browse the repository at this point in the history
Signed-off-by: Pant, Akshay <[email protected]>
  • Loading branch information
theakshaypant committed Nov 4, 2024
1 parent 97c538b commit e87c912
Showing 1 changed file with 14 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import json
import os
import shutil
from hashlib import md5
from hashlib import sha256
from logging import getLogger
from pathlib import Path
from random import shuffle
Expand Down Expand Up @@ -120,7 +120,7 @@ def download_dataset(self):

os.remove(self.data_folder / 'train.zip')

self.save_all_md5()
self.save_all_sha256()

def get_dataset(self, dataset_type='train'):
"""Return a shard dataset by type."""
Expand All @@ -132,39 +132,39 @@ def get_dataset(self, dataset_type='train'):
enforce_image_hw=self.enforce_image_hw
)

def calc_all_sha256(self):
    """Return the SHA-256 digest of every file under ``self.data_folder``.

    The folder is walked recursively and each file is hashed in 4 KiB
    chunks, so a file is never read into memory whole. Any file named
    ``dataset.json`` is skipped, because that is the file the digests
    themselves are saved to.

    This method was called ``calc_all_md5`` before the hash was switched
    from MD5 to SHA-256.

    Returns:
        dict: Maps each file's path relative to ``self.data_folder`` (as
        built by ``os.path.join(rel_dir, name)``, so files directly in the
        folder get a leading ``.`` component) to its hex digest.
    """
    sha256_dict = {}
    for root, _, files in os.walk(self.data_folder):
        # The relative directory depends only on `root`; compute it once
        # per directory instead of once per file.
        rel_dir = os.path.relpath(root, self.data_folder)
        for file in files:
            if file == 'dataset.json':
                continue
            # The digest is an integrity checksum, not a security control.
            sha256_calc = sha256(usedforsecurity=False)
            rel_file = os.path.join(rel_dir, file)

            # Open via `root` so this works whether data_folder is a str
            # or a Path.
            with open(os.path.join(root, file), 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b''):
                    sha256_calc.update(chunk)
            sha256_dict[rel_file] = sha256_calc.hexdigest()
    return sha256_dict

def save_all_sha256(self):
    """Write the digests of all dataset files to ``dataset.json``.

    Calls ``self.calc_all_sha256()`` and dumps the resulting mapping as
    JSON to ``dataset.json`` inside ``self.data_folder``, overwriting any
    existing file. ``is_dataset_complete`` later reads this file back.

    This method was called ``save_all_md5`` before the hash was switched
    from MD5 to SHA-256.
    """
    all_sha256 = self.calc_all_sha256()
    with open(os.path.join(self.data_folder, 'dataset.json'), 'w', encoding='utf-8') as f:
        json.dump(all_sha256, f)

def is_dataset_complete(self):
"""Check dataset integrity."""
new_md5 = self.calc_all_md5()
new_sha256 = self.calc_all_sha256()
try:
with open(os.path.join(self.data_folder, 'dataset.json'), 'r', encoding='utf-8') as f:
old_md5 = json.load(f)
old_sha256 = json.load(f)
except FileNotFoundError:
return False

return new_md5 == old_md5
return new_sha256 == old_sha256

@property
def sample_shape(self):
Expand Down

0 comments on commit e87c912

Please sign in to comment.