Skip to content

Commit

Permalink
Support "index" field of dataframe (#110)
Browse files Browse the repository at this point in the history
  • Loading branch information
acezen authored Jan 4, 2021
1 parent 3f8a69c commit d87643c
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 4 deletions.
8 changes: 8 additions & 0 deletions modules/basic/ds/dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ const std::vector<std::string>& DataFrame::Columns() const {
return this->columns_;
}

std::shared_ptr<ITensor> DataFrame::Index() const {
return values_.at(INDEX_COL_NAME);
}

std::shared_ptr<ITensor> DataFrame::Column(std::string const& column) const {
return values_.at(column);
}
Expand Down Expand Up @@ -55,6 +59,10 @@ void DataFrameBuilder::set_row_batch_index(size_t row_batch_index) {
this->set_row_batch_index_(row_batch_index);
}

void DataFrameBuilder::set_index(std::shared_ptr<ITensorBuilder> builder) {
this->values_.emplace(INDEX_COL_NAME, builder);
}

std::shared_ptr<ITensorBuilder> DataFrameBuilder::Column(
std::string const& column) const {
return values_.at(column);
Expand Down
9 changes: 9 additions & 0 deletions modules/basic/ds/dataframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ limitations under the License.
#include "client/ds/i_object.h"
#include "common/util/ptree.h"

#define INDEX_COL_NAME "index_"

namespace vineyard {

/**
Expand Down Expand Up @@ -73,6 +75,13 @@ class DataFrameBuilder : public DataFrameBaseBuilder {
*/
void set_row_batch_index(size_t row_batch_index);

/**
* @brief Set the index of dataframe by add a index column to dataframe.
*
* @param builder The index tensor builder.
*/
void set_index(std::shared_ptr<ITensorBuilder> builder);

/**
* @brief Get the column of the given column name.
*
Expand Down
7 changes: 7 additions & 0 deletions modules/basic/ds/dataframe.vineyard-mod
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ class DataFrame : public Registered<DataFrame> {
*/
const std::vector<std::string>& Columns() const;

/**
* @brief Get the index of dataframe.
*
* @return The shared pointer to the index tensor.
*/
std::shared_ptr<ITensor> Index() const;

/**
* @brief Get the column of the given column name.
*
Expand Down
2 changes: 2 additions & 0 deletions python/vineyard/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from vineyard.data.base import register_base_types
from vineyard.data.arrow import register_arrow_types
from vineyard.data.tensor import register_tensor_types
from vineyard.data.index import register_index_types
from vineyard.data.dataframe import register_dataframe_types
from vineyard.data.graph import register_graph_types

Expand All @@ -31,6 +32,7 @@ def register_builtin_types(builder_ctx, resolver_ctx):
register_base_types(builder_ctx, resolver_ctx)
register_arrow_types(builder_ctx, resolver_ctx)
register_tensor_types(builder_ctx, resolver_ctx)
register_index_types(builder_ctx, resolver_ctx)
register_dataframe_types(builder_ctx, resolver_ctx)
register_graph_types(builder_ctx, resolver_ctx)

Expand Down
8 changes: 4 additions & 4 deletions python/vineyard/data/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
def pandas_dataframe_builder(client, value, builder, **kw):
meta = ObjectMeta()
meta['typename'] = 'vineyard::DataFrame'
meta['columns_'] = json.dumps([str(x) for x in value.columns])
meta['columns_'] = json.dumps(value.columns.values.tolist())
meta.add_member('index_', builder.run(client, value.index))
for i, (name, column_value) in enumerate(value.iteritems()):
np_value = column_value.to_numpy(copy=False)
meta['__values_-key-%d' % i] = str(name)
Expand All @@ -44,17 +45,16 @@ def pandas_dataframe_builder(client, value, builder, **kw):
def dataframe_resolver(obj, resolver):
meta = obj.meta
columns = json.loads(meta['columns_'])
index = resolver.run(obj.member('index_'))
if not columns:
return pd.DataFrame()
# ensure zero-copy
blocks = []
index_size = 0
for idx, name in enumerate(columns):
np_value = resolver.run(obj.member('__values_-value-%d' % idx))
# ndim: 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
blocks.append(Block(np.expand_dims(np_value, 0), slice(idx, idx + 1, 1), ndim=2))
index_size = len(np_value)
return pd.DataFrame(BlockManager(blocks, [columns, np.arange(index_size)]))
return pd.DataFrame(BlockManager(blocks, [columns, index]))


def register_dataframe_types(builder_ctx, resolver_ctx):
Expand Down
49 changes: 49 additions & 0 deletions python/vineyard/data/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2020 Alibaba Group Holding Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import pandas as pd

from vineyard._C import ObjectMeta

from .utils import normalize_dtype


def pandas_index_builder(client, value, builder, **kw):
meta = ObjectMeta()
meta['typename'] = 'vineyard::Index'
meta['name'] = json.dumps(value.name)
meta['value_type_'] = value.dtype.name
meta.add_member('value_', builder.run(client, value.to_numpy(), **kw))
return client.create_metadata(meta)


def pandas_index_resolver(obj, resolver):
meta = obj.meta
value_type = normalize_dtype(meta['value_type_'])
name = json.loads(meta['name'])
value = resolver.run(obj.member('value_'))
return pd.Index(value, dtype=value_type, name=name)


def register_index_types(builder_ctx, resolver_ctx):
if builder_ctx is not None:
builder_ctx.register(pd.Index, pandas_index_builder)

if resolver_ctx is not None:
resolver_ctx.register('vineyard::Index', pandas_index_resolver)
15 changes: 15 additions & 0 deletions python/vineyard/data/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import pandas as pd
import pytest
import numpy as np

import vineyard
from vineyard.core import default_builder_context, default_resolver_context
Expand All @@ -30,3 +31,17 @@ def test_pandas_dataframe(vineyard_client):
df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]})
object_id = vineyard_client.put(df)
pd.testing.assert_frame_equal(df, vineyard_client.get(object_id))


def test_dataframe_reindex(vineyard_client):
df = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
expected = df.reindex(index=np.arange(10, 1, -1))
object_id = vineyard_client.put(expected)
pd.testing.assert_frame_equal(expected, vineyard_client.get(object_id))


def test_dataframe_set_index(vineyard_client):
df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=['a1', 'a2', 'a3'], columns=['x', 'y', 'z'])
expected = df1.set_index('y', drop=True)
object_id = vineyard_client.put(expected)
pd.testing.assert_frame_equal(expected, vineyard_client.get(object_id))

0 comments on commit d87643c

Please sign in to comment.