fix for resuming
johndpope committed Oct 10, 2024
1 parent 8197b79 commit 0c2abb1
Showing 2 changed files with 13 additions and 5 deletions.
config.yaml (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@ profiling:
   profile_step: 10
 training:
 
-  load_checkpoint: False # Set this to true when you want to load from a checkpoint
+  load_checkpoint: True # Set this to true when you want to load from a checkpoint
   checkpoint_path: './checkpoints/checkpoint.pth'
   use_eye_loss: False
   use_subsampling: False # saves ram? https://github.com/johndpope/MegaPortrait-hack/issues/41
train.py (16 changes: 12 additions & 4 deletions)
@@ -332,17 +332,22 @@ def save_checkpoint(self, epoch, is_final=False):
 
     def load_checkpoint(self, checkpoint_path):
         try:
-            checkpoint = self.accelerator.load(checkpoint_path)
+            checkpoint = torch.load(checkpoint_path, map_location=self.accelerator.device)
 
-            self.model.load_state_dict(checkpoint['model_state_dict'])
-            self.discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
+            # Unwrap the models before loading state dict
+            unwrapped_model = self.accelerator.unwrap_model(self.model)
+            unwrapped_discriminator = self.accelerator.unwrap_model(self.discriminator)
+
+            unwrapped_model.load_state_dict(checkpoint['model_state_dict'])
+            unwrapped_discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
             self.optimizer_g.load_state_dict(checkpoint['optimizer_g_state_dict'])
             self.optimizer_d.load_state_dict(checkpoint['optimizer_d_state_dict'])
             self.scheduler_g.load_state_dict(checkpoint['scheduler_g_state_dict'])
             self.scheduler_d.load_state_dict(checkpoint['scheduler_d_state_dict'])
 
             if self.ema and 'ema_state_dict' in checkpoint:
-                self.ema.load_state_dict(checkpoint['ema_state_dict'])
+                unwrapped_ema = self.accelerator.unwrap_model(self.ema)
+                unwrapped_ema.load_state_dict(checkpoint['ema_state_dict'])
 
             start_epoch = checkpoint['epoch'] + 1
             print(f"Loaded checkpoint from epoch {start_epoch - 1}")
@@ -398,6 +403,9 @@ def main():
         collate_fn=gpu_padded_collate
     )
 
+    print("using float32 for onnx training....")
+    torch.set_default_dtype(torch.float32)
+
 
     trainer = IMFTrainer(config, model, discriminator, dataloader, accelerator)
     # Check if a checkpoint path is provided in the config
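
For completeness, a hypothetical sketch of how the resume flag from config.yaml might be consumed right after this point; the attribute-style config access, the return value of load_checkpoint, and the train(start_epoch=...) signature are assumptions rather than code from the repository.

# Hypothetical wiring; only the config keys and trainer.load_checkpoint come from the diff.
start_epoch = 0
if config.training.load_checkpoint:
    # load_checkpoint computes start_epoch = checkpoint['epoch'] + 1 above;
    # assume it returns that value so training resumes from the right epoch.
    start_epoch = trainer.load_checkpoint(config.training.checkpoint_path)

trainer.train(start_epoch=start_epoch)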
