Showing 17 changed files with 704,370 additions and 0 deletions.
@@ -0,0 +1,190 @@

import numpy as np
from math import sqrt
import os
import random
import pickle

def calculate_IoU(i0,i1):
    # temporal IoU between two 1-D intervals i0=(start,end) and i1=(start,end);
    # note: the value is negative when the intervals are disjoint
    union=(min(i0[0],i1[0]), max(i0[1],i1[1]))
    inter=(max(i0[0],i1[0]), min(i0[1],i1[1]))
    iou=1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou

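As a quick sanity check of the IoU helper (values chosen arbitrarily, not from this commit): for clips (0, 10) and (5, 15) the intersection has length 5 and the union has length 15, so the IoU is 1/3.

# hypothetical check, not part of the original file
assert abs(calculate_IoU((0, 10), (5, 15)) - 5.0/15) < 1e-9
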
'''
A class that handles the training set
'''
class TrainingDataSet(object):
    def __init__(self,feat_dir,clip_gt_path,background_path,batch_size,movie_length_info,ctx_num,unit_feature_size,unit_size):
        # feat_dir: directory of unit-level feature files
        # clip_gt_path: list file of foreground clips with ground-truth boundaries
        # background_path: list file of background clips
        self.ctx_num=ctx_num
        self.unit_feature_size=unit_feature_size
        self.unit_size=unit_size
        self.batch_size=batch_size
        self.movie_length_info=movie_length_info
        # each sample is [left context | central clip | right context]
        self.visual_feature_dim=self.unit_feature_size*3
        self.feat_dir=feat_dir
        self.training_samples=[]

print "Reading training data list from "+clip_gt_path+" and "+background_path | ||
with open(clip_gt_path) as f: | ||
for l in f: | ||
movie_name=l.rstrip().split(" ")[0] | ||
clip_start=float(l.rstrip().split(" ")[1]) | ||
clip_end=float(l.rstrip().split(" ")[2]) | ||
gt_start=float(l.rstrip().split(" ")[3]) | ||
gt_end=float(l.rstrip().split(" ")[4]) | ||
round_gt_start=np.round(gt_start/unit_size)*self.unit_size+1 | ||
round_gt_end=np.round(gt_end/unit_size)*self.unit_size+1 | ||
self.training_samples.append((movie_name,clip_start,clip_end,gt_start,gt_end,round_gt_start,round_gt_end,1)) | ||
print str(len(self.training_samples))+" training samples are read" | ||
positive_num=len(self.training_samples)*1.0 | ||
with open(background_path) as f: | ||
for l in f: | ||
# control the ratio between background samples and positive samples to be 10:1 | ||
if random.random()>10.0*positive_num/270000: continue | ||
movie_name=l.rstrip().split(" ")[0] | ||
clip_start=float(l.rstrip().split(" ")[1]) | ||
clip_end=float(l.rstrip().split(" ")[2]) | ||
self.training_samples.append((movie_name,clip_start,clip_end,0,0,0,0,0)) | ||
self.num_samples=len(self.training_samples) | ||
print str(len(self.training_samples))+" training samples are read" | ||
|
||
    def calculate_regoffset(self,clip_start,clip_end,round_gt_start,round_gt_end):
        # regression targets: distance (in units) from the clip boundaries
        # to the rounded ground-truth boundaries
        start_offset=(round_gt_start-clip_start)/self.unit_size
        end_offset=(round_gt_end-clip_end)/self.unit_size
        return start_offset, end_offset

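For concreteness, a worked example of the offset targets (the numbers are hypothetical; unit_size=16 matches the training script in the next file):

# hypothetical check of the offset computation, assuming unit_size=16
# round_gt_start = np.round(500/16.0)*16 + 1  -> 31*16 + 1 = 497.0
# start_offset   = (497.0 - 513)/16           -> -1.0 units
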
    '''
    Get the central features
    '''
    def get_pooling_feature(self,feat_dir,movie_name,start,end):
        # mean-pool the unit-level features covering [start, end)
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        current_pos=start
        while current_pos<end:
            swin_start=current_pos
            swin_end=swin_start+swin_step
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            current_pos+=swin_step
        pool_feat=np.mean(all_feat,axis=0)
        return pool_feat

    '''
    Get the past (on the left of the central unit) context features
    '''
    def get_left_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=start
        context_ext=False
        while count<self.ctx_num:
            swin_start=current_pos-swin_step
            swin_end=current_pos
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos-=swin_step
            count+=1
        if context_ext:
            # average whatever context units were found
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit on the left: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat

    '''
    Get the future (on the right of the central unit) context features
    '''
    def get_right_context_feature(self,feat_dir,movie_name,start,end):
        swin_step=self.unit_size
        all_feat=np.zeros([0,self.unit_feature_size],dtype=np.float32)
        count=0
        current_pos=end
        context_ext=False
        while count<self.ctx_num:
            swin_start=current_pos
            swin_end=current_pos+swin_step
            if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
                feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
                all_feat=np.vstack((all_feat,feat))
                context_ext=True
            current_pos+=swin_step
            count+=1
        if context_ext:
            pool_feat=np.mean(all_feat,axis=0)
        else:
            # no unit on the right: fall back to a zero vector
            pool_feat=np.zeros([self.unit_feature_size],dtype=np.float32)
        return pool_feat

    def next_batch(self):
        random_batch_index=random.sample(range(self.num_samples),self.batch_size)
        image_batch=np.zeros([self.batch_size,self.visual_feature_dim])
        label_batch=np.zeros([self.batch_size],dtype=np.int32)
        offset_batch=np.zeros([self.batch_size,2],dtype=np.float32)
        index=0
        while index < self.batch_size:
            k=random_batch_index[index]
            movie_name=self.training_samples[k][0]
            if self.training_samples[k][7]==1:
                # foreground sample: label 1 with boundary regression targets
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                round_gt_start=self.training_samples[k][5]
                round_gt_end=self.training_samples[k][6]
                start_offset,end_offset=self.calculate_regoffset(clip_start,clip_end,round_gt_start,round_gt_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=1
                offset_batch[index,0]=start_offset
                offset_batch[index,1]=end_offset
                index+=1
            else:
                # background sample: label 0 with zero offsets
                clip_start=self.training_samples[k][1]
                clip_end=self.training_samples[k][2]
                left_feat=self.get_left_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                right_feat=self.get_right_context_feature(self.feat_dir,movie_name,clip_start,clip_end)
                featmap=self.get_pooling_feature(self.feat_dir,movie_name,clip_start,clip_end)
                image_batch[index,:]=np.hstack((left_feat,featmap,right_feat))
                label_batch[index]=0
                offset_batch[index,0]=0
                offset_batch[index,1]=0
                index+=1

        return image_batch, label_batch, offset_batch

'''
A class that handles the test set
'''
class TestingDataSet(object):
    def __init__(self,feat_dir,test_clip_path,batch_size,ctx_num):
        # test_clip_path: list file of sliding-window test clips
        self.ctx_num=ctx_num
        self.batch_size=batch_size
        self.feat_dir=feat_dir
        print "Reading testing data list from "+test_clip_path
        self.test_samples=[]
        with open(test_clip_path) as f:
            for l in f:
                movie_name=l.rstrip().split(" ")[0]
                clip_start=float(l.rstrip().split(" ")[1])
                clip_end=float(l.rstrip().split(" ")[2])
                self.test_samples.append((movie_name,clip_start,clip_end))
        self.num_samples=len(self.test_samples)
        print "test clips number: "+str(len(self.test_samples))

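A minimal usage sketch for the two dataset classes above (paths are placeholders; the hyper-parameter values mirror the training script in the next file):

# hypothetical wiring, not part of the original file
movie_length_info = {}  # movie name -> length, loaded elsewhere
train_set = TrainingDataSet("./path_to_features_val/", "./val_training_samples.txt",
                            "./background_samples.txt", batch_size=128,
                            movie_length_info=movie_length_info,
                            ctx_num=4, unit_feature_size=2048, unit_size=16.0)
image_batch, label_batch, offset_batch = train_set.next_batch()
# image_batch:  [128, 2048*3] pooled [left | central | right] features
# label_batch:  1 for foreground clips, 0 for background
# offset_batch: start/end regression targets in unit coordinates
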
Binary file not shown.
@@ -0,0 +1,165 @@

import tensorflow as tf
import numpy as np
import turn_model
from six.moves import xrange
import time
from sklearn.metrics import average_precision_score
import pickle
import vs_multilayer
import operator
import os

# hyper-parameters
ctx_num=4                # number of context units on each side of a clip
unit_size=16.0           # length of one unit (in frames)
unit_feature_size=2048   # dimension of one unit-level feature
lr=0.005                 # learning rate
lambda_reg=2.0           # weight of the regression loss term
batch_size=128
test_steps=4000          # evaluate every test_steps training steps

def get_pooling_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    current_pos=start
    while current_pos<end:
        swin_start=current_pos
        swin_end=swin_start+swin_step
        feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
        all_feat=np.vstack((all_feat,feat))
        current_pos+=swin_step
    pool_feat=np.mean(all_feat,axis=0)
    return pool_feat

def get_left_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=start
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos-swin_step
        swin_end=current_pos
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos-=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])

def get_right_context_feature(feat_dir,movie_name,start,end):
    swin_step=unit_size
    all_feat=np.zeros([0,unit_feature_size],dtype=np.float32)
    count=0
    current_pos=end
    context_ext=False
    while count<ctx_num:
        swin_start=current_pos
        swin_end=current_pos+swin_step
        if os.path.exists(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy"):
            feat=np.load(feat_dir+movie_name+".mp4"+"_"+str(swin_start)+"_"+str(swin_end)+".npy")
            all_feat=np.vstack((all_feat,feat))
            context_ext=True
        current_pos+=swin_step
        count+=1
    if context_ext:
        pool_feat=np.mean(all_feat,axis=0)
    else:
        pool_feat=np.zeros([unit_feature_size],dtype=np.float32)
    return np.reshape(pool_feat,[unit_feature_size])

def softmax(x):
    # softmax over the two classification logits
    return np.exp(x)/np.sum(np.exp(x),axis=0)

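As written, softmax can overflow for large logits since np.exp is applied directly; a numerically stable variant (an optional tweak, not part of this commit) shifts by the maximum first:

def softmax_stable(x):
    # subtracting the max leaves the result unchanged but keeps np.exp bounded
    e = np.exp(x - np.max(x, axis=0))
    return e / np.sum(e, axis=0)
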
# test
def do_eval_slidingclips(sess,vs_eval_op,model,movie_length_info,iter_step):
    results_lst=[]
    for k,test_sample in enumerate(model.test_set.test_samples):
        if k%1000==0:
            print str(k)+"/"+str(len(model.test_set.test_samples))
        movie_name=test_sample[0]
        movie_length=movie_length_info[movie_name]
        clip_start=test_sample[1]
        clip_end=test_sample[2]
        featmap=get_pooling_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        left_feat=get_left_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        right_feat=get_right_context_feature(model.test_set.feat_dir,movie_name,clip_start,clip_end)
        feat=np.hstack((left_feat,featmap,right_feat))
        feat=np.reshape(feat,[1,unit_feature_size*3])

        feed_dict = {
            model.visual_featmap_ph_test: feat
        }

        outputs=sess.run(vs_eval_op,feed_dict=feed_dict)
        # outputs[0:2] are the classification logits;
        # outputs[2] and outputs[3] are the start/end offsets in units
        reg_end=clip_end+outputs[3]*unit_size
        reg_start=clip_start+outputs[2]*unit_size
        round_reg_end=clip_end+np.round(outputs[3])*unit_size
        round_reg_start=clip_start+np.round(outputs[2])*unit_size
        softmax_score=softmax(outputs[0:2])
        action_score=softmax_score[1]
        results_lst.append((movie_name,round_reg_start,round_reg_end,reg_start,reg_end,action_score,outputs[0],outputs[1]))
    # assumes the ./test_results/ directory already exists
    pickle.dump(results_lst,open("./test_results/results_TURN_flow_iter"+str(iter_step)+".pkl","w"))

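The dumped pickle can be inspected afterwards; a minimal sketch (the iteration number 4000 assumes the first evaluation at test_steps=4000):

# hypothetical post-hoc inspection of the saved results
results = pickle.load(open("./test_results/results_TURN_flow_iter4000.pkl"))
movie_name, round_start, round_end, reg_start, reg_end, score, logit0, logit1 = results[0]
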
def run_training():
    initial_steps=0
    max_steps=30000
    train_clip_path="./val_training_samples.txt"
    background_path="./background_samples.txt"
    train_featmap_dir="./path_to_features_val/"
    test_featmap_dir="./path_to_features_test/"
    test_clip_path="./test_swin.txt"
    test_video_length_info={}
    with open("./thumos14_video_length_test.txt") as f:
        for l in f:
            # per-video length is the third column of the list file
            test_video_length_info[l.rstrip().split(" ")[0]]=int(l.rstrip().split(" ")[2])
    train_video_length_info={}
    with open("./thumos14_video_length_val.txt") as f:
        for l in f:
            train_video_length_info[l.rstrip().split(" ")[0]]=int(l.rstrip().split(" ")[2])

    model=turn_model.TURN_Model(batch_size,train_video_length_info,ctx_num,unit_feature_size,unit_size,
        lambda_reg,lr,train_clip_path,background_path,test_clip_path,train_featmap_dir,test_featmap_dir)

    with tf.Graph().as_default():

        loss_cls_reg,vs_train_op,vs_eval_op,loss_reg=model.construct_model()
        # Create a session for running Ops on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2)
        sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
        # Run the Op to initialize the variables.
        # (tf.initialize_all_variables is the pre-TF-1.0 spelling of
        # tf.global_variables_initializer, matching the TF version this code targets)
        init = tf.initialize_all_variables()
        sess.run(init)
        for step in xrange(max_steps):
            start_time = time.time()
            feed_dict = model.fill_feed_dict_train_reg()

            _, loss_v, loss_reg_v = sess.run([vs_train_op,loss_cls_reg, loss_reg], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 5 == 0:
                # Print status to stdout.
                print('Step %d: total loss = %.2f, regression loss = %.2f (%.3f sec)' % (step, loss_v, loss_reg_v, duration))

            if (step+1) % test_steps == 0:
                print "Start to test:-----------------\n"
                do_eval_slidingclips(sess,vs_eval_op,model,test_video_length_info,step+1)

def main(_):
    run_training()


if __name__ == '__main__':
    tf.app.run()