You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Is there a way to plot validation loss curves along with the training loss during the Training process?
Here is my train method:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Run the train/val loop with hard-mined positives and negatives.

    For every epoch from ``start_epoch`` up to ``num_epochs - 1`` the
    function iterates over ``dataloaders['train']`` and ``dataloaders['val']``.
    Each batch yields ``(inputs, labels, pos, pos_labels)``; ``pos`` is viewed
    as four candidate images per anchor. For each anchor the lowest-scoring
    candidate of its own four is taken as the hard positive, and the
    highest-scoring candidate with a different label from a random pool of
    ``opt.poolsize`` is taken as the hard negative. Scores are feature dot
    products (the original comments call them cosine, which presumes the
    model emits normalised features -- not visible here).

    Besides its arguments the function reads these names from the enclosing
    module: ``start_epoch``, ``dataloaders``, ``dataset_sizes``, ``opt``,
    ``use_gpu``, ``fp16``, ``amp``, ``version``, ``y_loss``, ``y_err``,
    ``save_network`` and ``draw_curve``.

    NOTE(review): the nesting below was reconstructed from a paste that had
    lost its indentation; in particular confirm that ``save_network`` /
    ``draw_curve`` are meant to run only after the 'val' phase.

    Args:
        model: Callable returning ``(outputs, features)`` for an image batch.
        criterion: Classification loss; only used when ``opt.PCB`` is set.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once at the start of each train phase.
        num_epochs: Exclusive upper bound on the epoch index.

    Returns:
        ``model`` after ``load_state_dict(last_model_wts)`` and a final
        ``save_network(model, 'last')``.
    """
    since = time.time()

    # Bound before the loop so the final load_state_dict() still works when
    # start_epoch >= num_epochs and no epoch is executed.
    last_model_wts = model.state_dict()
    last_margin = 0.0

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                # NOTE(review): newer PyTorch releases expect scheduler.step()
                # after optimizer.step(); left as-is to keep the LR schedule.
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0
            running_margin = 0.0
            running_reg = 0.0

            # Iterate over data.
            for data in dataloaders[phase]:
                inputs, labels, pos, pos_labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:
                    # Incomplete last batch: the fixed-size views below would
                    # not fit, so it is skipped.
                    continue

                pos = pos.view(4 * opt.batchsize, c, h, w)
                # Repeat every label four times so that entries 4*k .. 4*k+3
                # all carry anchor k's positive label.
                pos_labels = pos_labels.repeat(4).reshape(4, opt.batchsize)
                pos_labels = pos_labels.transpose(0, 1).reshape(4 * opt.batchsize)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    pos = Variable(pos.cuda())
                    labels = Variable(labels.cuda())
                else:
                    # NOTE(review): `pos` is not wrapped here and the mining
                    # code below calls .cuda() unconditionally, so the CPU
                    # path looks unsupported -- confirm.
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward (no autograd bookkeeping during validation)
                if phase == 'val':
                    with torch.no_grad():
                        outputs, f = model(inputs)
                        _, pf = model(pos)
                else:
                    outputs, f = model(inputs)
                    _, pf = model(pos)

                neg_labels = pos_labels

                # hard-neg
                # ----------------------------------
                # Sample a pool of opt.poolsize candidates out of the
                # 4*batchsize positive features to keep the search small.
                rand = np.random.permutation(4 * opt.batchsize)[0:opt.poolsize]
                nf_data = pf[rand, :]
                neg_labels = neg_labels[rand]
                nf_t = nf_data.transpose(0, 1)
                score = torch.mm(f.data, nf_t)
                # High score == hard, so descending order puts the hardest first.
                _, rank = score.sort(dim=1, descending=True)
                labels_cpu = labels.cpu()
                nf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    anchor_label = labels_cpu[k]
                    # First pooled candidate with a different label wins; the
                    # row stays zero if the pool has none.
                    for kk in rank[k, :]:
                        if neg_labels[kk] != anchor_label:
                            nf_hard[k, :] = nf_data[kk, :]
                            break

                # hard-pos
                # ----------------------------------
                pf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    pf_data = pf[4 * k:4 * k + 4, :]
                    pf_t = pf_data.transpose(0, 1)
                    ff = f.data[k, :].reshape(1, -1)
                    score = torch.mm(ff, pf_t)
                    # Low score == hard, so ascending order puts the hardest first.
                    _, rank = score.sort(dim=1, descending=False)
                    pf_hard[k, :] = pf_data[rank[0][0], :]

                # loss
                # ---------------------------------
                pscore = torch.sum(f * pf_hard, dim=1)
                nscore = torch.sum(f * nf_hard, dim=1)

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    # Pull negative scores towards -1 and positive scores towards +1.
                    reg = torch.sum((1 + nscore) ** 2) + torch.sum((-1 + pscore) ** 2)
                    # Hinge on the score gap, summed (not averaged) over the batch.
                    loss = torch.sum(torch.nn.functional.relu(nscore + opt.margin - pscore))
                    loss_triplet = loss + opt.alpha * reg
                else:
                    # NOTE(review): this branch never defines `loss_triplet`
                    # or `reg`, both of which are used below, so it fails
                    # with a NameError as written. Intended loss is unclear.
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]

                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)

                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # amp scales the loss before backward
                        with amp.scale_loss(loss_triplet, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss_triplet.backward()
                    optimizer.step()

                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # for the new version like 0.4.0 and 0.5.0
                    running_loss += loss_triplet.item()
                else:  # for the old version like 0.3.0 and 0.3.1
                    running_loss += loss_triplet.data[0]
                running_corrects += float(torch.sum(pscore > nscore + opt.margin))
                running_margin += float(torch.sum(pscore - nscore))
                # Accumulate a plain float: adding the tensor itself would
                # keep every batch's autograd graph alive for the whole epoch.
                running_reg += float(reg)

            # Only full batches were processed, so normalise by that count.
            datasize = dataset_sizes[phase] // opt.batchsize * opt.batchsize
            epoch_loss = running_loss / datasize
            epoch_reg = opt.alpha * running_reg / datasize
            epoch_acc = running_corrects / datasize
            epoch_margin = running_margin / datasize

            print('now_margin: %.4f' % opt.margin)
            print('{} Loss: {:.4f} Reg: {:.4f} Acc: {:.4f} MeanMargin: {:.4f}'.format(
                phase, epoch_loss, epoch_reg, epoch_acc, epoch_margin))

            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)

            # NOTE: state_dict() hands back references to the live tensors,
            # not copies, so this "snapshot" follows later weight updates.
            if epoch_margin > last_margin:
                last_margin = epoch_margin
                last_model_wts = model.state_dict()
            if phase == 'val':
                last_model_wts = model.state_dict()
                if epoch % 10 == 9:
                    save_network(model, epoch)
                draw_curve(epoch)

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load the retained weights and write the final checkpoint
    model.load_state_dict(last_model_wts)
    save_network(model, 'last')
    return model
I tried to add 'val' phase in your train method: "for phase in ['train', 'val']". But I am getting the following error while looping through the Val dataloader
(at this line "for data in dataloaders[phase]"):
I suspect the error is caused by dataloaders['val']:
Traceback (most recent call last):
File "train_siamese.py", line 593, in <module>
num_epochs=150)
File "train_siamese.py", line 323, in train_model
for data in dataloaders[phase]:
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
return self._process_next_batch(batch)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
ZeroDivisionError: Traceback (most recent call last):
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 47, in __getitem__
pos_path = self._get_pos_sample(target, index)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 32, in _get_pos_sample
t = i%len(rand)
ZeroDivisionError: integer division or modulo by zero
The text was updated successfully, but these errors were encountered:
Is there a way to plot validation loss curves along with the training loss during the Training process?
Here is my train method:
I tried to add 'val' phase in your train method: "for phase in ['train', 'val']". But I am getting the following error while looping through the Val dataloader
(at this line "for data in dataloaders[phase]"):
I suspect the error is caused by dataloaders['val']:
The text was updated successfully, but these errors were encountered: