Commit 8d2c567

add turn source codes
jiyanggao committed Sep 14, 2017
1 parent d8660af commit 8d2c567
Showing 17 changed files with 704,370 additions and 0 deletions.
279,584 changes: 279,584 additions & 0 deletions turn_codes/background_samples.txt


190 changes: 190 additions & 0 deletions turn_codes/dataset.py
@@ -0,0 +1,190 @@

import numpy as np
from math import sqrt
import os
import random
import pickle

def calculate_IoU(i0,i1):
    # temporal IoU between two 1-D intervals given as (start, end)
    union=(min(i0[0],i1[0]), max(i0[1],i1[1]))
    inter=(max(i0[0],i1[0]), min(i0[1],i1[1]))
    iou=1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou
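
# Worked example (illustrative): calculate_IoU((10,26),(18,34)) gives
# inter=(18,26) and union=(10,34), so IoU=(26-18)/(34-10)=8/24~=0.333.
# When the two intervals are disjoint the value goes negative, which still
# ranks candidates by how far apart they are.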

'''
A class that handles the training set
'''
class TrainingDataSet(object):
    def __init__(self,feat_dir,clip_gt_path,background_path,batch_size,movie_length_info,ctx_num,unit_feature_size,unit_size):
        self.ctx_num=ctx_num
        self.unit_feature_size=unit_feature_size
        self.unit_size=unit_size
        self.batch_size=batch_size
        self.movie_length_info=movie_length_info
        # each sample concatenates left context, central and right context features
        self.visual_feature_dim=self.unit_feature_size*3
        self.feat_dir=feat_dir
        self.training_samples=[]

print "Reading training data list from "+clip_gt_path+" and "+background_path
with open(clip_gt_path) as f:
for l in f:
movie_name=l.rstrip().split(" ")[0]
clip_start=float(l.rstrip().split(" ")[1])
clip_end=float(l.rstrip().split(" ")[2])
gt_start=float(l.rstrip().split(" ")[3])
gt_end=float(l.rstrip().split(" ")[4])
round_gt_start=np.round(gt_start/unit_size)*self.unit_size+1
round_gt_end=np.round(gt_end/unit_size)*self.unit_size+1
self.training_samples.append((movie_name,clip_start,clip_end,gt_start,gt_end,round_gt_start,round_gt_end,1))
print str(len(self.training_samples))+" training samples are read"
positive_num=len(self.training_samples)*1.0
with open(background_path) as f:
for l in f:
# control the ratio between background samples and positive samples to be 10:1
if random.random()>10.0*positive_num/270000: continue
movie_name=l.rstrip().split(" ")[0]
clip_start=float(l.rstrip().split(" ")[1])
clip_end=float(l.rstrip().split(" ")[2])
self.training_samples.append((movie_name,clip_start,clip_end,0,0,0,0,0))
self.num_samples=len(self.training_samples)
print str(len(self.training_samples))+" training samples are read"

    def calculate_regoffset(self,clip_start,clip_end,round_gt_start,round_gt_end):
        # regression targets, in units, from the clip boundaries to the rounded GT boundaries
        start_offset=(round_gt_start-clip_start)/self.unit_size
        end_offset=(round_gt_end-clip_end)/self.unit_size
        return start_offset, end_offset
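
    # Worked example (illustrative numbers): with unit_size=16, a clip (129,257)
    # and rounded ground truth (161,273) give start_offset=(161-129)/16=2.0 and
    # end_offset=(273-257)/16=1.0, i.e. shift the start right by two units and
    # the end right by one unit.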

    '''
    Get the central features
    '''
    def get_pooling_feature(self,feat_dir,movie_name,start,end):
        # mean-pool the per-unit features of every unit inside [start,end)
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        current_pos=start
        while current_pos<end:
            swin_start=current_pos
            swin_end=swin_start+swin_step
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            current_pos+=swin_step
        pool_feat=np.mean(all_feat,axis=0)
        return pool_feat
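
    # Note on the expected feature layout (inferred from the np.load call above):
    # one .npy file per unit, holding one unit_feature_size-dim vector, named
    # <feat_dir><movie_name>.mp4_<swin_start>_<swin_end>.npy, where the window
    # boundaries are frame indices spaced unit_size apart.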


    '''
    Get the past (on the left of the central unit) context features
    '''
    def get_left_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=start
        context_ext=False
        # walk backwards from the clip start, collecting up to ctx_num unit features
        while count<self.ctx_num:
            swin_start=current_pos-swin_step
            swin_end=current_pos
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos-=swin_step
            count+=1
        if context_ext:
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit exists before the clip start: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat


    '''
    Get the future (on the right of the central unit) context features
    '''
    def get_right_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=end
        context_ext=False
        # walk forwards from the clip end, collecting up to ctx_num unit features
        while count<self.ctx_num:
            swin_start=current_pos
            swin_end=current_pos+swin_step
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos+=swin_step
            count+=1
        if context_ext:
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit exists after the clip end: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat

    def next_batch(self):
        # uniformly sample a batch of positive and background training clips
        random_batch_index=random.sample(range(self.num_samples),self.batch_size)
        image_batch=np.zeros([self.batch_size,self.visual_feature_dim])
        label_batch=np.zeros([self.batch_size],dtype=np.int32)
        offset_batch=np.zeros([self.batch_size,2],dtype=np.float32)
        index=0
        while index < self.batch_size:
            k=random_batch_index[index]
            movie_name=self.training_samples[k][0]
            if self.training_samples[k][7]==1:
                # positive sample: label 1, offsets regress towards the rounded GT
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                round_gt_start=self.training_samples[k][5]
                round_gt_end=self.training_samples[k][6]
                start_offset,end_offset=self.calculate_regoffset(clip_start,clip_end,round_gt_start,round_gt_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=1
                offset_batch[index,0]=start_offset
                offset_batch[index,1]=end_offset
                index+=1
            else:
                # background sample: label 0, zero regression targets
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=0
                offset_batch[index,0]=0
                offset_batch[index,1]=0
                index+=1

        return image_batch, label_batch, offset_batch
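
    # Shape sketch: with the defaults used in main.py (batch_size=128,
    # unit_feature_size=2048, so visual_feature_dim=6144), next_batch returns
    # image_batch [128,6144], label_batch [128] and offset_batch [128,2].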


'''
A class that handles the test set
'''
class TestingDataSet(object):
    def __init__(self,feat_dir,test_clip_path,batch_size,ctx_num):
        self.ctx_num=ctx_num
        self.batch_size=batch_size
        self.feat_dir=feat_dir
        print "Reading testing data list from "+test_clip_path
        self.test_samples=[]
        with open(test_clip_path) as f:
            for l in f:
                fields=l.rstrip().split(" ")
                movie_name=fields[0]
                clip_start=float(fields[1])
                clip_end=float(fields[2])
                self.test_samples.append((movie_name,clip_start,clip_end))
        self.num_samples=len(self.test_samples)
        print "test clips number: "+str(len(self.test_samples))




Binary file added turn_codes/dataset.pyc
Binary file not shown.
165 changes: 165 additions & 0 deletions turn_codes/main.py
@@ -0,0 +1,165 @@
import tensorflow as tf
import numpy as np
import turn_model
from six.moves import xrange
import time
from sklearn.metrics import average_precision_score
import pickle
import vs_multilayer
import operator
import os

# hyper-parameters
ctx_num=4               # context units pooled on each side of a clip
unit_size=16.0          # frames per unit
unit_feature_size=2048  # dimension of one unit-level feature
lr=0.005                # learning rate
lambda_reg=2.0          # weight of the regression loss
batch_size=128
test_steps=4000         # run evaluation every test_steps iterations

def get_pooling_feature(feat_dir,movie_name,start,end):
    # module-level twin of TrainingDataSet.get_pooling_feature, used at test time
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    current_pos=start
    while current_pos<end:
        swin_start=current_pos
        swin_end=swin_start+swin_step
        feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
        all_feat=np.vstack((all_feat,feat))
        current_pos+=swin_step
    pool_feat=np.mean(all_feat,axis=0)
    return pool_feat


def get_left_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=start
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos-swin_step
        swin_end=current_pos
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos-=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])


def get_right_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=end
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos
        swin_end=current_pos+swin_step
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos+=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])


def softmax(x):
    return np.exp(x)/np.sum(np.exp(x),axis=0)
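
# Note: np.exp overflows for large logits; a numerically stable variant (an
# optional hardening, not part of this commit) would subtract the max first:
#
#   def softmax(x):
#       e=np.exp(x-np.max(x))
#       return e/np.sum(e,axis=0)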

# test over all sliding-window clips and dump the raw results to a pickle
def do_eval_slidingclips(sess,vs_eval_op,model,movie_length_info,iter_step):
    results_lst=[]
    for k,test_sample in enumerate(model.test_set.test_samples):
        if k%1000==0:
            print str(k)+"/"+str(len(model.test_set.test_samples))
        movie_name=test_sample[0]
        movie_length=movie_length_info[movie_name]
        clip_start=test_sample[1]
        clip_end=test_sample[2]
        featmap=get_pooling_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        left_feat=get_left_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        right_feat=get_right_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        feat=np.hstack((left_feat,featmap,right_feat))
        feat=np.reshape(feat,[1,unit_feature_size*3])

        feed_dict = {
            model.visual_featmap_ph_test: feat
        }

        outputs=sess.run(vs_eval_op,feed_dict=feed_dict)
        # outputs[0:2] are the background/action logits, outputs[2:4] the
        # predicted start/end boundary offsets measured in units
        reg_end=clip_end+outputs[3]*unit_size
        reg_start=clip_start+outputs[2]*unit_size
        round_reg_end=clip_end+np.round(outputs[3])*unit_size
        round_reg_start=clip_start+np.round(outputs[2])*unit_size
        softmax_score=softmax(outputs[0:2])
        action_score=softmax_score[1]
        results_lst.append((movie_name,round_reg_start,round_reg_end,reg_start,reg_end,action_score,outputs[0],outputs[1]))
    pickle.dump(results_lst,open("./test_results/results_TURN_flow_iter"+str(iter_step)+".pkl","w"))
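
# Worked example (illustrative numbers): for a sliding clip (128,256) with
# unit_size=16 and predicted offsets outputs[2]=1.3, outputs[3]=-0.6, the
# regressed boundaries are reg_start=128+1.3*16=148.8 and
# reg_end=256-0.6*16=246.4; the round_* variants snap the offsets to whole
# units first, giving (144.0,240.0).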


def run_training():
    initial_steps=0
    max_steps=30000
    # paths to the sample lists and the pre-extracted unit-level features
    train_clip_path="./val_training_samples.txt"
    background_path="./background_samples.txt"
    train_featmap_dir="./path_to_features_val/"
    test_featmap_dir="./path_to_features_test/"
    test_clip_path="./test_swin.txt"
    test_video_length_info={}
    with open("./thumos14_video_length_test.txt") as f:
        for l in f:
            fields=l.rstrip().split(" ")
            test_video_length_info[fields[0]]=int(fields[2])
    train_video_length_info={}
    with open("./thumos14_video_length_val.txt") as f:
        for l in f:
            fields=l.rstrip().split(" ")
            train_video_length_info[fields[0]]=int(fields[2])

    model=turn_model.TURN_Model(batch_size,train_video_length_info,ctx_num,unit_feature_size,unit_size,
                                lambda_reg,lr,train_clip_path,background_path,test_clip_path,train_featmap_dir,test_featmap_dir)
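
    # Interface assumed of turn_model.TURN_Model (the class itself is not part
    # of this file; the signature is inferred from the calls in this script):
    # construct_model() -> (loss_cls_reg, vs_train_op, vs_eval_op, loss_reg),
    # fill_feed_dict_train_reg(), plus the attributes visual_featmap_ph_test
    # and test_set.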

    with tf.Graph().as_default():

        loss_cls_reg,vs_train_op,vs_eval_op,loss_reg=model.construct_model()
        # Create a session for running Ops on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)
        for step in xrange(max_steps):
            start_time = time.time()
            feed_dict = model.fill_feed_dict_train_reg()

            _, loss_v, loss_reg_v = sess.run([vs_train_op,loss_cls_reg,loss_reg], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 5 == 0:
                # Print status to stdout.
                print('Step %d: total loss = %.2f, regression loss = %.2f (%.3f sec)' % (step, loss_v, loss_reg_v, duration))

            if (step+1) % test_steps == 0:
                print "Start to test:-----------------\n"
                do_eval_slidingclips(sess,vs_eval_op,model,test_video_length_info,step+1)

def main(_):
    run_training()


if __name__ == '__main__':
tf.app.run()
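
Usage sketch: with the sample lists and per-unit .npy features in place at the
paths hard-coded in run_training, training is started from the turn_codes
directory with "python main.py" under a Python 2 / TF 1.x-era environment;
evaluation results are pickled to ./test_results/ every test_steps (4,000)
iterations, so that directory must exist beforehand.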



