# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Train and eval functions used in main.py
"""
import math
import os
import sys
from typing import Iterable

import torch
import torch.distributed as dist
from tqdm import tqdm
from transformers import BertTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import utils.misc as utils
import utils.loss_utils as loss_utils
import utils.eval_utils as eval_utils
import utils.drawing_utils as drawing_utils
from utils.misc import NestedTensor
from models.clip import clip

def train_one_epoch(args, model: torch.nn.Module, data_loader: Iterable,
                    optimizer: torch.optim.Optimizer, device: torch.device,
                    epoch: int, max_norm: float = 0):
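    """Run one training epoch over `data_loader` and return the averaged training stats."""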
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10
    weight_contrast = args.weight_contrast
    # Global iteration counter (renamed from `iter` to avoid shadowing the built-in).
    iteration = epoch * len(data_loader)

    for batch in metric_logger.log_every(data_loader, print_freq, header):
        (img_data, text_data, target, tem_imgs, tem_txts, _, category, tem_cat) = batch

        # Copy all tensors to the GPU.
        img_data = img_data.to(device)
        target = target.to(device)
        if args.model_type == "ResNet":
            text_data = text_data.to(device)
        else:
            text_data = clip.tokenize(text_data).to(device)
        # tem_imgs and tem_txts are lists, so move each NestedTensor to the GPU individually.
        tem_imgs = [tmpl.to(device) for tmpl in tem_imgs]
        tem_txts = [tmpl.to(device) for tmpl in tem_txts]

        # Model forward pass.
        output = model(img_data, text_data, tem_imgs, tem_txts, category, tem_cat)
        if output is None:
            # Fail fast instead of letting the loss computation crash on None.
            raise RuntimeError("Model returned None for the current batch.")

        if args.contrastive_loss == 1:
            loss_dict = loss_utils.trans_vg_contrast(output, target, weight_contrast)
        else:
            output = output[0]
            loss_dict = loss_utils.trans_vg_loss(output, target)
        losses = sum(loss_dict.values())

        # Reduce losses over all GPUs for logging purposes.
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_unscaled = {k: v for k, v in loss_dict_reduced.items()}
        losses_reduced_unscaled = sum(loss_dict_reduced_unscaled.values())
        loss_value = losses_reduced_unscaled.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        metric_logger.update(loss=loss_value, **loss_dict_reduced_unscaled)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        iteration += 1

    # Gather the stats from all processes.
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


@torch.no_grad()
def validate(args, model: torch.nn.Module, data_loader: Iterable, device: torch.device):
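    """Evaluate on a validation loader and return averaged loss, mIoU, and accuracy stats."""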
    model.eval()
    weight_contrast = args.weight_contrast
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = 'Eval:'

    for batch in metric_logger.log_every(data_loader, 10, header):
        (img_data, text_data, target, tem_imgs, tem_txts, tem_bboxes, category, tem_cat) = batch
        if args.model_type == "ResNet":
            batch_size = img_data.tensors.size(0)
        else:
            batch_size = img_data.size(0)

        # Copy to the GPU.
        img_data = img_data.to(device)
        # tem_imgs and tem_txts are lists, so move each NestedTensor to the GPU individually.
        tem_imgs = [tmpl.to(device) for tmpl in tem_imgs]
        tem_txts = [tmpl.to(device) for tmpl in tem_txts]
        tem_bboxes = [tmpl.to(device) for tmpl in tem_bboxes]
        if args.model_type == "ResNet":
            text_data = text_data.to(device)
        else:
            text_data = clip.tokenize(text_data).to(device)
        target = target.to(device)

        pred_boxes = model(img_data, text_data, tem_imgs, tem_txts, category, tem_cat)
        if args.contrastive_loss == 1:
            loss_dict = loss_utils.trans_vg_contrast(pred_boxes, target, weight_contrast)
            pred_boxes = pred_boxes[0]
        else:
            pred_boxes = pred_boxes[0]
            loss_dict = loss_utils.trans_vg_loss(pred_boxes, target)
        losses = sum(loss_dict.values())

        # Reduce losses over all GPUs for logging purposes.
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_unscaled = {k: v for k, v in loss_dict_reduced.items()}
        losses_reduced_unscaled = sum(loss_dict_reduced_unscaled.values())
        loss_value = losses_reduced_unscaled.item()

        miou, accu = eval_utils.trans_vg_eval_val(pred_boxes, target)
        metric_logger.update(loss=loss_value, **loss_dict_reduced_unscaled)
        metric_logger.update_v2('miou', torch.mean(miou), batch_size)
        metric_logger.update_v2('accu', accu, batch_size)

    # Gather the stats from all processes.
    metric_logger.synchronize_between_processes()
    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    print("Averaged stats:", metric_logger)
    return stats


# Load the BERT tokenizer once at module level; it is used below to decode tokenized text for visualization.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


@torch.no_grad()
def evaluate(args, model: torch.nn.Module, data_loader: Iterable, device: torch.device):
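    """Run test-set evaluation, saving per-batch visualizations, and return (accuracy, AP)."""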
    model.eval()
    weight_contrast = args.weight_contrast
    pred_box_list = []
    gt_box_list = []

    for batch_idx, batch in enumerate(tqdm(data_loader)):
        (img_data, text_data, target, tem_imgs, tem_txts, tem_bboxes, category, tem_cat) = batch
        batch_size = img_data.tensors.size(0)
        img_data = img_data.to(device)
        text_data = text_data.to(device)
        target = target.to(device)
        tem_imgs = [tmpl.to(device) for tmpl in tem_imgs]
        tem_txts = [tmpl.to(device) for tmpl in tem_txts]
        tem_bboxes = [tmpl.to(device) for tmpl in tem_bboxes]

        # Model prediction.
        output = model(img_data, text_data, tem_imgs, tem_txts, category, tem_cat)
        if args.contrastive_loss == 1:
            loss_dict = loss_utils.trans_vg_contrast(output, target, weight_contrast)
            output = output[0]
        else:
            output = output[0]
            loss_dict = loss_utils.trans_vg_loss(output, target)
        losses = sum(loss_dict.values())

        # Save predictions and ground truth.
        pred_box_list.append(output.cpu())
        gt_box_list.append(target.cpu())

        # Visualization: decode the tokenized text first.
        if isinstance(text_data, NestedTensor):
            text_data = text_data.tensors
        current_text_tokens = text_data[0].cpu().numpy()
        current_text = tokenizer.decode(current_text_tokens, skip_special_tokens=True)

        if batch_idx % 1 == 0:  # save a visualization for every batch
            save_path = os.path.join(args.output_dir, f"batch_{batch_idx}_eval_image.png")
            drawing_utils.draw_bounding_boxes(img_data.tensors[0], tem_imgs, output[0], current_text, target[0], save_path=save_path)

    pred_boxes = torch.cat(pred_box_list, dim=0)
    gt_boxes = torch.cat(gt_box_list, dim=0)
    total_num = gt_boxes.shape[0]
    accu_num, AP = eval_utils.trans_vg_eval_test(pred_boxes, gt_boxes)

    # Accumulate accuracy counts across all processes.
    result_tensor = torch.tensor([accu_num, total_num]).to(device)
    torch.cuda.synchronize()
    dist.all_reduce(result_tensor)
    accuracy = float(result_tensor[0]) / float(result_tensor[1])
    return accuracy, AP


def draw_bounding_boxes_inference(image, pred_boxes, text, gt_boxes=None, figsize=(10, 10), save_path="output_image.png"):
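    """Draw one predicted box (normalized cx, cy, w, h) on a denormalized image and save the figure."""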
    fig, ax = plt.subplots(1, figsize=figsize)

    # ImageNet mean and std used for normalization by default.
    mean = torch.tensor([0.485, 0.456, 0.406])
    std = torch.tensor([0.229, 0.224, 0.225])
    # Denormalize the image for display.
    img = drawing_utils.denormalize(image, mean, std)
    img = torch.clamp(img, 0, 1)
    height, width = image.shape[1], image.shape[2]

    # Show the image.
    ax.imshow(img.permute(1, 2, 0).cpu().numpy())

    # Draw the predicted bounding box. Predictions come in normalized
    # (center_x, center_y, width, height) format, so convert to pixel xmin/ymin/xmax/ymax.
    if pred_boxes.numel() == 4:
        # Flatten so both shape [4] and shape [1, 4] inputs unpack correctly.
        center_x, center_y, box_width, box_height = pred_boxes.view(-1)
        xmin = (center_x - (box_width / 2)) * width
        xmax = (center_x + (box_width / 2)) * width
        ymin = (center_y - (box_height / 2)) * height
        ymax = (center_y + (box_height / 2)) * height
        # Compute the rectangle's width and height from the corner coordinates.
        width_rect = xmax - xmin
        height_rect = ymax - ymin
        # Predicted bounding box (red).
        rect = patches.Rectangle((xmin.item(), ymin.item()), width_rect.item(), height_rect.item(),
                                 linewidth=3, edgecolor='r', facecolor='none', label="Prediction")
        ax.add_patch(rect)

    # Overlay the query text.
    ax.text(20, 20, f"Text: {text}", color='white', fontsize=16, bbox=dict(facecolor='black', alpha=0.7))
    # Add a legend.
    ax.legend(loc="upper right")
    # Save the figure.
    plt.savefig(save_path)
    plt.close(fig)

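
# A minimal usage sketch for draw_bounding_boxes_inference (hypothetical tensors,
# not part of the pipeline): a normalized 3x640x640 image and one box in
# normalized (cx, cy, w, h) format.
#
#     img = torch.randn(3, 640, 640)
#     box = torch.tensor([0.5, 0.5, 0.3, 0.4])
#     draw_bounding_boxes_inference(img, box, "a red cup", save_path="demo.png")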

# Inference code.
@torch.no_grad()
def inference(args, model: torch.nn.Module, data_loader: Iterable, device: torch.device):
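    """Run the model over a loader and periodically save prediction visualizations."""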
    model.eval()
    for batch_idx, batch in enumerate(tqdm(data_loader)):
        (img_data, text_data, target, tem_imgs, tem_txts, tem_bboxes, category, tem_cat) = batch
        batch_size = img_data.tensors.size(0)
        img_data = img_data.to(device)
        text_data = text_data.to(device)
        tem_imgs = [tmpl.to(device) for tmpl in tem_imgs]
        tem_txts = [tmpl.to(device) for tmpl in tem_txts]
        tem_bboxes = [tmpl.to(device) for tmpl in tem_bboxes]

        # Model prediction.
        output = model(img_data, text_data, tem_imgs, tem_txts, category, tem_cat)

        if isinstance(text_data, NestedTensor):
            text_data = text_data.tensors
        current_text_tokens = text_data[0].cpu().numpy()
        current_text = tokenizer.decode(current_text_tokens, skip_special_tokens=True)

        if batch_idx % 20 == 0:
            save_path = f"batch_{batch_idx}_inference_image.png"
            # output[0] is the batch of predicted boxes; draw the first sample's box.
            draw_bounding_boxes_inference(img_data.tensors[0], output[0][0], current_text, save_path=save_path)