train_common.py
import itertools
import os

import torch

from utils import config


def count_parameters(model):
    """Count the number of learnable parameters in `model`."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def save_checkpoint(model, epoch, checkpoint_dir, stats):
    """Save the model state and training statistics to a checkpoint file."""
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'stats': stats,
    }
    filename = os.path.join(checkpoint_dir,
                            'epoch={}.checkpoint.pth.tar'.format(epoch))
    torch.save(state, filename)


def restore_checkpoint(model, checkpoint_dir, cuda=False, force=False,
                       pretrain=False):
    """
    If a checkpoint exists, restore the PyTorch model from it.

    Returns the model, the epoch to resume from, and the saved training
    statistics (an empty list when training from scratch).
    """
    cp_files = [file_ for file_ in os.listdir(checkpoint_dir)
                if file_.startswith('epoch=')
                and file_.endswith('.checkpoint.pth.tar')]

    if not cp_files:
        print('No saved model parameters found')
        if force:
            raise Exception("Checkpoint not found")
        else:
            return model, 0, []

    # Find the latest epoch with a saved checkpoint
    for i in itertools.count(1):
        if 'epoch={}.checkpoint.pth.tar'.format(i) in cp_files:
            epoch = i
        else:
            break

    if not force:
        print("Which epoch to load from? Choose in range [0, {}]."
              .format(epoch), "Enter 0 to train from scratch.")
        print(">> ", end='')
        inp_epoch = int(input())
        if inp_epoch not in range(epoch + 1):
            raise Exception("Invalid epoch number")
        if inp_epoch == 0:
            print("Checkpoint not loaded")
            clear_checkpoint(checkpoint_dir)
            return model, 0, []
    else:
        print("Which epoch to load from? Choose in range [1, {}]."
              .format(epoch))
        inp_epoch = int(input())
        if inp_epoch not in range(1, epoch + 1):
            raise Exception("Invalid epoch number")

    filename = os.path.join(checkpoint_dir,
                            'epoch={}.checkpoint.pth.tar'.format(inp_epoch))
    print("Loading from checkpoint {}".format(filename))

    if cuda:
        checkpoint = torch.load(filename)
    else:
        # Load a GPU-trained model on the CPU
        checkpoint = torch.load(filename,
                                map_location=lambda storage, loc: storage)

    try:
        start_epoch = checkpoint['epoch']
        stats = checkpoint['stats']
        if pretrain:
            # Allow missing/extra keys when loading pretrained weights
            model.load_state_dict(checkpoint['state_dict'], strict=False)
        else:
            model.load_state_dict(checkpoint['state_dict'])
        print("=> Successfully restored checkpoint (trained for {} epochs)"
              .format(checkpoint['epoch']))
    except Exception:
        print("=> Checkpoint not successfully restored")
        raise

    return model, inp_epoch, stats


def clear_checkpoint(checkpoint_dir):
    """Remove all saved checkpoint files from `checkpoint_dir`."""
    filelist = [f for f in os.listdir(checkpoint_dir)
                if f.endswith(".pth.tar")]
    for f in filelist:
        os.remove(os.path.join(checkpoint_dir, f))
    print("Checkpoint successfully removed")


def predictions(logits):
    """
    Given the network output, determine the predicted class index, i.e. the
    index with the highest value along the class dimension.

    Returns:
        the predicted class indices as a PyTorch Tensor
    """
    # argmax over the class dimension (dim=1) picks the most likely class
    return torch.argmax(logits, dim=1)
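

# Example usage (a minimal sketch, not part of this module): the model class,
# data loader, loss, and checkpoint directory below are hypothetical
# placeholders illustrating how these helpers are typically wired into a
# training loop.
#
#     import torch.nn as nn
#     import torch.optim as optim
#
#     model = MyModel()                        # hypothetical model class
#     print("Trainable parameters:", count_parameters(model))
#
#     model, start_epoch, stats = restore_checkpoint(model, 'checkpoints/')
#     optimizer = optim.Adam(model.parameters(), lr=1e-3)
#     criterion = nn.CrossEntropyLoss()
#
#     for epoch in range(start_epoch, 10):
#         for X, y in train_loader:            # hypothetical DataLoader
#             optimizer.zero_grad()
#             logits = model(X)
#             loss = criterion(logits, y)
#             loss.backward()
#             optimizer.step()
#         y_pred = predictions(logits)          # class indices for the last batch
#         save_checkpoint(model, epoch + 1, 'checkpoints/', stats)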