Skip to content

Commit

Permalink
Merge pull request BVLC#203 from sergeyk/hdf5_data
Browse files Browse the repository at this point in the history
HDF5DataLayer source now takes list of filenames, loads one at a time.
  • Loading branch information
sguada committed Mar 18, 2014
2 parents 915f83f + 802715d commit 2fec8cf
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 173 deletions.
17 changes: 8 additions & 9 deletions include/caffe/util/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include "hdf5_hl.h"
#include "caffe/proto/caffe.pb.h"

#include "boost/scoped_ptr.hpp"
#include "caffe/blob.hpp"

using std::string;
Expand Down Expand Up @@ -52,14 +51,14 @@ inline bool ReadImageToDatum(const string& filename, const int label,
}

template <typename Dtype>
void hd5_load_nd_dataset(
hid_t file_id, const char* dataset_name_,
int min_dim,//inclusive
int max_dim,//inclusive
//output:
boost::scoped_ptr<Dtype>* array,
std::vector<hsize_t>& dims
);
void hdf5_load_nd_dataset_helper(
hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
Blob<Dtype>* blob);

// Loads the HDF5 dataset named dataset_name_ from the already-open file
// file_id into blob (HDF5DataLayer uses it to fill its data and label blobs).
// NOTE(review): min_dim / max_dim appear to bound the number of dimensions
// the dataset may have (the layer passes 2..4 for "data" and 1..2 for
// "label"); the implementation is not visible here, so confirm this and
// whether both bounds are inclusive.
template <typename Dtype>
void hdf5_load_nd_dataset(
hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
Blob<Dtype>* blob);

} // namespace caffe

Expand Down
12 changes: 8 additions & 4 deletions include/caffe/vision_layers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#define CAFFE_VISION_LAYERS_HPP_

#include <vector>
#include <string>

#include "leveldb/db.h"
#include "pthread.h"
Expand Down Expand Up @@ -397,12 +398,15 @@ class HDF5DataLayer : public Layer<Dtype> {
const bool propagate_down, vector<Blob<Dtype>*>* bottom);
virtual Dtype Backward_gpu(const vector<Blob<Dtype>*>& top,
const bool propagate_down, vector<Blob<Dtype>*>* bottom);
virtual void load_hdf5_file_data(const char* filename);

boost::scoped_ptr<Dtype> data_;
boost::scoped_ptr<Dtype> label_;
std::vector<hsize_t> data_dims_;
std::vector<hsize_t> label_dims_;
std::vector<std::string> hdf_filenames_;
unsigned int num_files_;
unsigned int current_file_;
hsize_t current_row_;

Blob<Dtype> data_blob_;
Blob<Dtype> label_blob_;
};


Expand Down
110 changes: 69 additions & 41 deletions src/caffe/layers/hdf5_data_layer.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
// Copyright Sergey Karayev 2014
// Copyright 2014 BVLC.
/*
Contributors:
- Sergey Karayev, 2014.
- Tobias Domhan, 2014.
TODO:
- only load parts of the file, in accordance with a prototxt param "max_mem"
- load file in a separate thread ("prefetch")
- can be smarter about the memcpy call instead of doing it row-by-row
:: use util functions caffe_copy, and Blob->offset()
:: don't forget to update hdf5_data_layer.cu accordingly
- add ability to shuffle filenames if flag is set
*/

#include <stdint.h>
#include <string>
#include <vector>
#include <fstream>

#include "hdf5.h"
#include "hdf5_hl.h"
Expand All @@ -15,53 +23,66 @@
#include "caffe/util/io.hpp"
#include "caffe/vision_layers.hpp"

using std::string;

namespace caffe {

// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
template <typename Dtype>
HDF5DataLayer<Dtype>::~HDF5DataLayer<Dtype>() { }

// Load data and label from HDF5 filename into the class property blobs.
template <typename Dtype>
void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs.";
CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output.";

// Load the HDF5 file and initialize the counter.
const char* hdf_filename = this->layer_param_.source().c_str();
LOG(INFO) << "Loading HDF5 file" << hdf_filename;
hid_t file_id = H5Fopen(hdf_filename, H5F_ACC_RDONLY, H5P_DEFAULT);
void HDF5DataLayer<Dtype>::load_hdf5_file_data(const char* filename) {
LOG(INFO) << "Loading HDF5 file" << filename;
hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
if (file_id < 0) {
LOG(ERROR) << "Failed opening HDF5 file" << hdf_filename;
LOG(ERROR) << "Failed opening HDF5 file" << filename;
return;
}

const int MIN_DATA_DIM = 2;
const int MAX_DATA_DIM = 4;
const int MAX_LABEL_DIM = 2;
const int MIN_DIM = 2;
hd5_load_nd_dataset(file_id, "data", MIN_DIM, MAX_DATA_DIM,
&data_, data_dims_);
hd5_load_nd_dataset(file_id, "label", MIN_DIM, MAX_LABEL_DIM,
&label_, label_dims_);

while(data_dims_.size() < MAX_DATA_DIM) {
data_dims_.push_back(1);
}
hdf5_load_nd_dataset(
file_id, "data", MIN_DATA_DIM, MAX_DATA_DIM, &data_blob_);

//add missing dimensions:
label_dims_.push_back(1);
label_dims_.push_back(1);
const int MIN_LABEL_DIM = 1;
const int MAX_LABEL_DIM = 2;
hdf5_load_nd_dataset(
file_id, "label", MIN_LABEL_DIM, MAX_LABEL_DIM, &label_blob_);

herr_t status = H5Fclose(file_id);
CHECK_EQ(data_dims_[0], label_dims_[0]);
LOG(INFO) << "Successully loaded " << data_dims_[0] << " rows";
CHECK_EQ(data_blob_.num(), label_blob_.num());
LOG(INFO) << "Successully loaded " << data_blob_.num() << " rows";
}

template <typename Dtype>
void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs.";
CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output.";

// Read the source to parse the filenames.
LOG(INFO) << "Loading filename from " << this->layer_param_.source();
hdf_filenames_.clear();
std::ifstream myfile(this->layer_param_.source().c_str());
if (myfile.is_open()) {
std::string line;
while (myfile >> line) {
hdf_filenames_.push_back(line);
}
}
myfile.close();
num_files_ = hdf_filenames_.size();
current_file_ = 0;
LOG(INFO) << "Number of files: " << num_files_;

// Load the first HDF5 file and initialize the line counter.
load_hdf5_file_data(hdf_filenames_[current_file_].c_str());
current_row_ = 0;

// Reshape blobs.
(*top)[0]->Reshape(this->layer_param_.batchsize(),
data_dims_[1], data_dims_[2], data_dims_[3]);
(*top)[1]->Reshape(this->layer_param_.batchsize(),
label_dims_[1], label_dims_[2], label_dims_[3]);
(*top)[0]->Reshape(this->layer_param_.batchsize(), data_blob_.channels(),
data_blob_.width(), data_blob_.height());
(*top)[1]->Reshape(this->layer_param_.batchsize(), label_blob_.channels(),
label_blob_.width(), label_blob_.height());
LOG(INFO) << "output data size: " << (*top)[0]->num() << ","
<< (*top)[0]->channels() << "," << (*top)[0]->height() << ","
<< (*top)[0]->width();
Expand All @@ -74,24 +95,31 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const int data_count = (*top)[0]->count() / (*top)[0]->num();
const int label_data_count = (*top)[1]->count() / (*top)[1]->num();

//TODO: consolidate into a single memcpy call

for (int i = 0; i < batchsize; ++i, ++current_row_) {
if (current_row_ == data_dims_[0]) {
if (current_row_ == data_blob_.num()) {
if (num_files_ > 1) {
current_file_ += 1;

if (current_file_ == num_files_) {
current_file_ = 0;
LOG(INFO) << "looping around to first file";
}

load_hdf5_file_data(hdf_filenames_[current_file_].c_str());
}
current_row_ = 0;
}

memcpy(&(*top)[0]->mutable_cpu_data()[i * data_count],
&(data_.get()[current_row_ * data_count]),
sizeof(Dtype) * data_count);
&data_blob_.cpu_data()[current_row_ * data_count],
sizeof(Dtype) * data_count);

memcpy(&(*top)[1]->mutable_cpu_data()[i * label_data_count],
&(label_.get()[current_row_ * label_data_count]),
&label_blob_.cpu_data()[current_row_ * label_data_count],
sizeof(Dtype) * label_data_count);
}
}


// The backward operations are dummy - they do not carry any computation.
template <typename Dtype>
Dtype HDF5DataLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
Expand Down
16 changes: 13 additions & 3 deletions src/caffe/layers/hdf5_data_layer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,29 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const int label_data_count = (*top)[1]->count() / (*top)[1]->num();

for (int i = 0; i < batchsize; ++i, ++current_row_) {
if (current_row_ == data_dims_[0]) {
if (current_row_ == data_blob_.num()) {
if (num_files_ > 1) {
current_file_ += 1;

if (current_file_ == num_files_) {
current_file_ = 0;
LOG(INFO) << "looping around to first file";
}

load_hdf5_file_data(hdf_filenames_[current_file_].c_str());
}
current_row_ = 0;
}

CUDA_CHECK(cudaMemcpy(
&(*top)[0]->mutable_gpu_data()[i * data_count],
&(data_.get()[current_row_ * data_count]),
&data_blob_.cpu_data()[current_row_ * data_count],
sizeof(Dtype) * data_count,
cudaMemcpyHostToDevice));

CUDA_CHECK(cudaMemcpy(
&(*top)[1]->mutable_gpu_data()[i * label_data_count],
&(label_.get()[current_row_ * label_data_count]),
&label_blob_.cpu_data()[current_row_ * label_data_count],
sizeof(Dtype) * label_data_count,
cudaMemcpyHostToDevice));
}
Expand Down
30 changes: 25 additions & 5 deletions src/caffe/test/test_data/generate_sample_data.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,39 @@
"""
Generate data used in the HDF5DataLayer test.
"""

import os
import numpy as np
import h5py

num_cols = 8
num_rows = 10
height = 5
width = 5
data = np.arange(num_cols * num_rows * height * width).reshape(num_rows, num_cols, height, width)
total_size = num_cols * num_rows * height * width

data = np.arange(total_size)
data = data.reshape(num_rows, num_cols, height, width)
data = data.astype('float32')
label = np.arange(num_rows)[:, np.newaxis]
label = label.astype('float32')

print data
print label

with h5py.File('./sample_data.h5', 'w') as f:
f['data'] = data.astype('float32')
f['label'] = label.astype('float32')
with h5py.File(os.path.dirname(__file__) + '/sample_data.h5', 'w') as f:
f['data'] = data
f['label'] = label

with h5py.File(os.path.dirname(__file__) + '/sample_data_2_gzip.h5', 'w') as f:
f.create_dataset(
'data', data=data + total_size,
compression='gzip', compression_opts=1
)
f.create_dataset(
'label', data=label,
compression='gzip', compression_opts=1
)

with open(os.path.dirname(__file__) + '/sample_data_list.txt', 'w') as f:
f.write(os.path.dirname(__file__) + '/sample_data.h5\n')
f.write(os.path.dirname(__file__) + '/sample_data_2_gzip.h5\n')
Binary file modified src/caffe/test/test_data/sample_data.h5
Binary file not shown.
Binary file added src/caffe/test/test_data/sample_data_2_gzip.h5
Binary file not shown.
2 changes: 2 additions & 0 deletions src/caffe/test/test_data/sample_data_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
src/caffe/test/test_data/sample_data.h5
src/caffe/test/test_data/sample_data_2_gzip.h5
Loading

0 comments on commit 2fec8cf

Please sign in to comment.