Showing 17 changed files with 704,370 additions and 0 deletions.
@@ -0,0 +1,190 @@

import numpy as np
from math import sqrt
import os
import random
import pickle

def calculate_IoU(i0,i1):
    # temporal IoU between two 1-D intervals i0=(start,end) and i1=(start,end);
    # note: the value is negative when the intervals are disjoint
    union=(min(i0[0],i1[0]), max(i0[1],i1[1]))
    inter=(max(i0[0],i1[0]), min(i0[1],i1[1]))
    iou=1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou

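As a quick sanity check of the IoU helper (values chosen arbitrarily, not from this commit): for clips (0, 10) and (5, 15) the intersection has length 5 and the union has length 15, so the IoU is 1/3.

# hypothetical check, not part of the original file
assert abs(calculate_IoU((0, 10), (5, 15)) - 5.0/15) < 1e-9
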
'''
A class that handles the training set
'''
class TrainingDataSet(object):
    def __init__(self,feat_dir,clip_gt_path,background_path,batch_size,movie_length_info,ctx_num,unit_feature_size,unit_size):
        # feat_dir: directory of unit-level feature files
        # clip_gt_path: list file of foreground clips with ground-truth boundaries
        # background_path: list file of background clips
        self.ctx_num=ctx_num
        self.unit_feature_size=unit_feature_size
        self.unit_size=unit_size
        self.batch_size=batch_size
        self.movie_length_info=movie_length_info
        # each sample is [left context | central clip | right context]
        self.visual_feature_dim=self.unit_feature_size*3
        self.feat_dir=feat_dir
        self.training_samples=[]

print "Reading training data list from "+clip_gt_path+" and "+background_path | ||
with open(clip_gt_path) as f: | ||
for l in f: | ||
movie_name=l.rstrip().split(" ")[0] | ||
clip_start=float(l.rstrip().split(" ")[1]) | ||
clip_end=float(l.rstrip().split(" ")[2]) | ||
gt_start=float(l.rstrip().split(" ")[3]) | ||
gt_end=float(l.rstrip().split(" ")[4]) | ||
round_gt_start=np.round(gt_start/unit_size)*self.unit_size+1 | ||
round_gt_end=np.round(gt_end/unit_size)*self.unit_size+1 | ||
self.training_samples.append((movie_name,clip_start,clip_end,gt_start,gt_end,round_gt_start,round_gt_end,1)) | ||
print str(len(self.training_samples))+" training samples are read" | ||
positive_num=len(self.training_samples)*1.0 | ||
with open(background_path) as f: | ||
for l in f: | ||
# control the ratio between background samples and positive samples to be 10:1 | ||
if random.random()>10.0*positive_num/270000: continue | ||
movie_name=l.rstrip().split(" ")[0] | ||
clip_start=float(l.rstrip().split(" ")[1]) | ||
clip_end=float(l.rstrip().split(" ")[2]) | ||
self.training_samples.append((movie_name,clip_start,clip_end,0,0,0,0,0)) | ||
self.num_samples=len(self.training_samples) | ||
print str(len(self.training_samples))+" training samples are read" | ||
|
||
    def calculate_regoffset(self,clip_start,clip_end,round_gt_start,round_gt_end):
        # regression targets: distance (in units) from the clip boundaries
        # to the rounded ground-truth boundaries
        start_offset=(round_gt_start-clip_start)/self.unit_size
        end_offset=(round_gt_end-clip_end)/self.unit_size
        return start_offset, end_offset

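For concreteness, a worked example of the offset targets (the numbers are hypothetical; unit_size=16 matches the training script in the next file):

# hypothetical check of the offset computation, assuming unit_size=16
# round_gt_start = np.round(500/16.0)*16 + 1  -> 31*16 + 1 = 497.0
# start_offset   = (497.0 - 513)/16           -> -1.0 units
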
    '''
    Get the central features
    '''
    def get_pooling_feature(self,feat_dir,movie_name,start,end):
        # mean-pool the unit-level features covering [start, end)
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        current_pos=start
        while current_pos<end:
            swin_start=current_pos
            swin_end=swin_start+swin_step
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            current_pos+=swin_step
        pool_feat=np.mean(all_feat,axis=0)
        return pool_feat

    '''
    Get the past (on the left of the central unit) context features
    '''
    def get_left_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=start
        context_ext=False
        while count<self.ctx_num:
            swin_start=current_pos-swin_step
            swin_end=current_pos
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos-=swin_step
            count+=1
        if context_ext:
            # average whatever context units were found
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit on the left: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat

    '''
    Get the future (on the right of the central unit) context features
    '''
    def get_right_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=end
        context_ext=False
        while count<self.ctx_num:
            swin_start=current_pos
            swin_end=current_pos+swin_step
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos+=swin_step
            count+=1
        if context_ext:
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit on the right: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat

    def next_batch(self):
        random_batch_index=random.sample(range(self.num_samples),self.batch_size)
        image_batch=np.zeros([self.batch_size,self.visual_feature_dim])
        label_batch=np.zeros([self.batch_size],dtype=np.int32)
        offset_batch=np.zeros([self.batch_size,2],dtype=np.float32)
        index=0
        while index < self.batch_size:
            k=random_batch_index[index]
            movie_name=self.training_samples[k][0]
            if self.training_samples[k][7]==1:
                # foreground sample: label 1 with boundary regression targets
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                round_gt_start=self.training_samples[k][5]
                round_gt_end=self.training_samples[k][6]
                start_offset,end_offset=self.calculate_regoffset(clip_start,clip_end,round_gt_start,round_gt_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=1
                offset_batch[index,0]=start_offset
                offset_batch[index,1]=end_offset
                index+=1
            else:
                # background sample: label 0 with zero offsets
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=0
                offset_batch[index,0]=0
                offset_batch[index,1]=0
                index+=1

        return image_batch, label_batch, offset_batch

'''
A class that handles the test set
'''
class TestingDataSet(object):
    def __init__(self,feat_dir,test_clip_path,batch_size,ctx_num):
        # test_clip_path: list file of sliding-window test clips
        self.ctx_num=ctx_num
        self.batch_size=batch_size
        self.feat_dir=feat_dir
        print "Reading testing data list from "+test_clip_path
        self.test_samples=[]
        with open(test_clip_path) as f:
            for l in f:
                movie_name=l.rstrip().split(" ")[0]
                clip_start=float(l.rstrip().split(" ")[1])
                clip_end=float(l.rstrip().split(" ")[2])
                self.test_samples.append((movie_name,clip_start,clip_end))
        self.num_samples=len(self.test_samples)
        print "test clips number: "+str(len(self.test_samples))

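A minimal usage sketch for the two dataset classes above (paths are placeholders; the hyper-parameter values mirror the training script in the next file):

# hypothetical wiring, not part of the original file
movie_length_info = {}  # movie name -> length, loaded elsewhere
train_set = TrainingDataSet("./path_to_features_val/", "./val_training_samples.txt",
                            "./background_samples.txt", batch_size=128,
                            movie_length_info=movie_length_info,
                            ctx_num=4, unit_feature_size=2048, unit_size=16.0)
image_batch, label_batch, offset_batch = train_set.next_batch()
# image_batch:  [128, 2048*3] pooled [left | central | right] features
# label_batch:  1 for foreground clips, 0 for background
# offset_batch: start/end regression targets in unit coordinates
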
Binary file not shown.
@@ -0,0 +1,165 @@

import tensorflow as tf
import numpy as np
import turn_model
from six.moves import xrange
import time
from sklearn.metrics import average_precision_score
import pickle
import vs_multilayer
import operator
import os

# hyper-parameters
ctx_num=4                # number of context units on each side of a clip
unit_size=16.0           # length of one unit (in frames)
unit_feature_size=2048   # dimension of one unit-level feature
lr=0.005                 # learning rate
lambda_reg=2.0           # weight of the regression loss term
batch_size=128
test_steps=4000          # evaluate every test_steps training steps

def get_pooling_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    current_pos=start
    while current_pos<end:
        swin_start=current_pos
        swin_end=swin_start+swin_step
        feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
        all_feat=np.vstack((all_feat,feat))
        current_pos+=swin_step
    pool_feat=np.mean(all_feat,axis=0)
    return pool_feat

def get_left_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=start
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos-swin_step
        swin_end=current_pos
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos-=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])

def get_right_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=end
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos
        swin_end=current_pos+swin_step
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos+=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])

def softmax(x):
    # softmax over the two classification logits
    return np.exp(x)/np.sum(np.exp(x),axis=0)

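As written, softmax can overflow for large logits since np.exp is applied directly; a numerically stable variant (an optional tweak, not part of this commit) shifts by the maximum first:

def softmax_stable(x):
    # subtracting the max leaves the result unchanged but keeps np.exp bounded
    e = np.exp(x - np.max(x, axis=0))
    return e / np.sum(e, axis=0)
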
# test
def do_eval_slidingclips(sess,vs_eval_op,model,movie_length_info,iter_step):
    results_lst=[]
    for k,test_sample in enumerate(model.test_set.test_samples):
        if k%1000==0:
            print str(k)+"/"+str(len(model.test_set.test_samples))
        movie_name=test_sample[0]
        movie_length=movie_length_info[movie_name]
        clip_start=test_sample[1]
        clip_end=test_sample[2]
        featmap=get_pooling_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        left_feat=get_left_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        right_feat=get_right_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        feat=np.hstack((left_feat,featmap,right_feat))
        feat=np.reshape(feat,[1,unit_feature_size*3])

        feed_dict = {
            model.visual_featmap_ph_test: feat
        }

        outputs=sess.run(vs_eval_op,feed_dict=feed_dict)
        # outputs[0:2] are the classification logits;
        # outputs[2] and outputs[3] are the start/end offsets in units
        reg_end=clip_end+outputs[3]*unit_size
        reg_start=clip_start+outputs[2]*unit_size
        round_reg_end=clip_end+np.round(outputs[3])*unit_size
        round_reg_start=clip_start+np.round(outputs[2])*unit_size
        softmax_score=softmax(outputs[0:2])
        action_score=softmax_score[1]
        results_lst.append((movie_name,round_reg_start,round_reg_end,reg_start,reg_end,action_score,outputs[0],outputs[1]))
    # assumes the ./test_results/ directory already exists
    pickle.dump(results_lst,open("./test_results/results_TURN_flow_iter"+str(iter_step)+".pkl","w"))

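The dumped pickle can be inspected afterwards; a minimal sketch (the iteration number 4000 assumes the first evaluation at test_steps=4000):

# hypothetical post-hoc inspection of the saved results
results = pickle.load(open("./test_results/results_TURN_flow_iter4000.pkl"))
movie_name, round_start, round_end, reg_start, reg_end, score, logit0, logit1 = results[0]
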
def run_training():
    initial_steps=0
    max_steps=30000
    train_clip_path="./val_training_samples.txt"
    background_path="./background_samples.txt"
    train_featmap_dir="./path_to_features_val/"
    test_featmap_dir="./path_to_features_test/"
    test_clip_path="./test_swin.txt"
    test_video_length_info={}
    with open("./thumos14_video_length_test.txt") as f:
        for l in f:
            # per-video length is the third column of the list file
            test_video_length_info[l.rstrip().split(" ")[0]]=int(l.rstrip().split(" ")[2])
    train_video_length_info={}
    with open("./thumos14_video_length_val.txt") as f:
        for l in f:
            train_video_length_info[l.rstrip().split(" ")[0]]=int(l.rstrip().split(" ")[2])

    model=turn_model.TURN_Model(batch_size,train_video_length_info,ctx_num,unit_feature_size,unit_size,
        lambda_reg,lr,train_clip_path,background_path,test_clip_path,train_featmap_dir,test_featmap_dir)

    with tf.Graph().as_default():

        loss_cls_reg,vs_train_op,vs_eval_op,loss_reg=model.construct_model()
        # Create a session for running Ops on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2)
        sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
        # Run the Op to initialize the variables.
        # (tf.initialize_all_variables is the pre-TF-1.0 spelling of
        # tf.global_variables_initializer, matching the TF version this code targets)
        init = tf.initialize_all_variables()
        sess.run(init)
        for step in xrange(max_steps):
            start_time = time.time()
            feed_dict = model.fill_feed_dict_train_reg()

            _, loss_v, loss_reg_v = sess.run([vs_train_op,loss_cls_reg, loss_reg], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 5 == 0:
                # Print status to stdout.
                print('Step %d: total loss = %.2f, regression loss = %.2f (%.3f sec)' % (step, loss_v, loss_reg_v, duration))

            if (step+1) % test_steps == 0:
                print "Start to test:-----------------\n"
                do_eval_slidingclips(sess,vs_eval_op,model,test_video_length_info,step+1)

def main(_):
    run_training()


if __name__ == '__main__':
    tf.app.run()