ARROW-243 Use libhdfs3 #108

Closed
wants to merge 1 commit
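For orientation, here is a rough sketch of how this change is meant to be exercised end to end. The option and script names come from the diff below; the paths, the source step, and the final cmake/make invocation are assumptions, not taken from this PR.

# Sketch only: fetch and build the bundled libhdfs3, then configure Arrow C++
# with the HDFS options touched in this PR. Paths and steps are assumptions.
cd cpp/thirdparty
./download_thirdparty.sh          # now also fetches the libhdfs3 tarball
./build_thirdparty.sh libhdfs3    # installs libhdfs3 into thirdparty/installed
source ./set_thirdparty_env.sh    # exports LIBHDFS3_HOME alongside the other *_HOME vars

cd ..
mkdir -p build && cd build
cmake -DARROW_HDFS=on -DUSE_LIBHDFS3=on ..
make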
13 changes: 8 additions & 5 deletions cpp/CMakeLists.txt
@@ -72,10 +72,14 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
"Build the Arrow IPC extensions"
ON)

option(ARROW_HDFS
"Build the Arrow IO extensions for the Hadoop file system"
OFF)

option(arrow_hdfs
"build the arrow io extensions for the hadoop file system"
off)

option(USE_LIBHDFS3
"build the arrow io extensions using libhdfs3 instead of libhdfs JNI"
off)

option(ARROW_BOOST_USE_SHARED
"Rely on boost shared libraries where relevant"
ON)
@@ -735,6 +739,5 @@ if(ARROW_IPC)
add_library(flatbuffers STATIC IMPORTED)
set_target_properties(flatbuffers PROPERTIES
IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB})

add_subdirectory(src/arrow/ipc)
endif()
40 changes: 40 additions & 0 deletions cpp/cmake_modules/FindGSasl.cmake
@@ -0,0 +1,40 @@
# Copyright 2011-2015 Quickstep Technologies LLC.
# Copyright 2015 Pivotal Software, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# - Try to find the GNU sasl library (gsasl)
#
# Once done this will define
#
# GSASL_FOUND - System has gsasl
# GSASL_INCLUDE_DIR - The gsasl include directory
# GSASL_LIBRARY - The libraries needed to use gsasl


IF (GSASL_INCLUDE_DIR AND GSASL_LIBRARY)
# in cache already
SET(GSASL_FIND_QUIETLY TRUE)
ENDIF (GSASL_INCLUDE_DIR AND GSASL_LIBRARY)

FIND_PATH(GSASL_INCLUDE_DIR gsasl.h)

FIND_LIBRARY(GSASL_LIBRARY gsasl)

INCLUDE(FindPackageHandleStandardArgs)

# handle the QUIETLY and REQUIRED arguments and set GSASL_FOUND to TRUE if
# all listed variables are TRUE
FIND_PACKAGE_HANDLE_STANDARD_ARGS(GSASL DEFAULT_MSG GSASL_LIBRARY GSASL_INCLUDE_DIR)

MARK_AS_ADVANCED(GSASL_INCLUDE_DIR GSASL_LIBRARY)
38 changes: 38 additions & 0 deletions cpp/cmake_modules/FindKerberos.cmake
@@ -0,0 +1,38 @@
# Copyright 2011-2015 Quickstep Technologies LLC.
# Copyright 2015 Pivotal Software, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# - Find kerberos
# Find the native KERBEROS includes and library
#
# KERBEROS_INCLUDE_DIR - where to find krb5.h, etc.
# KERBEROS_LIBRARY - List of libraries when using krb5.
# KERBEROS_FOUND - True if krb5 found.

IF (KERBEROS_INCLUDE_DIR)
# Already in cache, be silent
SET(KERBEROS_FIND_QUIETLY TRUE)
ENDIF (KERBEROS_INCLUDE_DIR)

FIND_PATH(KERBEROS_INCLUDE_DIR krb5.h)

SET(KERBEROS_NAMES krb5 k5crypto com_err)
FIND_LIBRARY(KERBEROS_LIBRARY NAMES ${KERBEROS_NAMES})

# handle the QUIETLY and REQUIRED arguments and set KERBEROS_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(KERBEROS DEFAULT_MSG KERBEROS_LIBRARY KERBEROS_INCLUDE_DIR)

MARK_AS_ADVANCED(KERBEROS_LIBRARY KERBEROS_INCLUDE_DIR)
37 changes: 37 additions & 0 deletions cpp/cmake_modules/FindLibhdfs3.cmake
@@ -0,0 +1,37 @@
# Copyright 2011-2015 Quickstep Technologies LLC.
# Copyright 2015 Pivotal Software, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Module to find the Pivotal libhdfs3.

find_path(LIBHDFS3_INCLUDE_DIR hdfs/hdfs.h PATHS ${LIBHDFS3_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES "include")

find_library(LIBHDFS3_LIBRARY NAMES hdfs3 libhdfs3 PATHS ${LIBHDFS3_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES "lib")

# Linking against libhdfs3 also requires linking against gsasl and kerberos.
find_package(GSasl REQUIRED)
find_package(Kerberos REQUIRED)

set(LIBHDFS3_LIBRARIES ${LIBHDFS3_LIBRARY}
${GSASL_LIBRARY}
${KERBEROS_LIBRARY})
set(LIBHDFS3_INCLUDE_DIRS ${LIBHDFS3_INCLUDE_DIR}
${GSASL_INCLUDE_DIR}
${KERBEROS_INCLUDE_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Libhdfs3 DEFAULT_MSG
LIBHDFS3_LIBRARY LIBHDFS3_INCLUDE_DIR)

mark_as_advanced(LIBHDFS3_INCLUDE_DIR LIBHDFS3_LIBRARY)
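Note that FindLibhdfs3.cmake treats GSasl and Kerberos as hard requirements, so the corresponding development packages must be present on the build machine. A rough example for Debian-based systems follows; the package names are an assumption and vary by distribution.

# Assumed package names; adjust for your distribution
sudo apt-get install libgsasl7-dev libkrb5-dev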
37 changes: 18 additions & 19 deletions cpp/src/arrow/io/CMakeLists.txt
@@ -18,8 +18,15 @@
# ----------------------------------------------------------------------
# arrow_io : Arrow IO interfaces

if( USE_LIBHDFS3)
set(LIBHDFS3_ROOT ${THIRDPARTY_DIR}/installed)
find_package(LIBHDFS3 REQUIRED)
include_directories(${LIBHDFS3_INCLUDE_DIR})
add_definitions(-DUSE_LIBHDFS3)
endif()

set(ARROW_IO_LINK_LIBS
arrow_shared
arrow_shared
)

if (ARROW_BOOST_USE_SHARED)
@@ -31,6 +38,9 @@ else()
boost_system_static
boost_filesystem_static)
endif()
if(USE_LIBHDFS3)
set(ARROW_IO_PRIVATE_LINK_LIBS ${ARROW_IO_PRIVATE_LINK_LIBS} ${LIBHDFS3_LIBRARIES})
endif()

set(ARROW_IO_TEST_LINK_LIBS
arrow_io
@@ -43,26 +53,15 @@ if(ARROW_HDFS)
if(NOT THIRDPARTY_DIR)
message(FATAL_ERROR "THIRDPARTY_DIR not set")
endif()

if (DEFINED ENV{HADOOP_HOME})
set(HADOOP_HOME $ENV{HADOOP_HOME})
set(ARROW_HDFS_SRCS
hdfs.cc)
if( NOT USE_LIBHDFS3)
set(ARROW_HDFS_SRCS
${ARROW_HDFS_SRCS} libhdfs_shim.cc)
else()
set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop")
set(ARROW_HDFS_SRCS
${ARROW_HDFS_SRCS} libhdfs3_shim.cc)
endif()

set(HDFS_H_PATH "${HADOOP_HOME}/include/hdfs.h")
if (NOT EXISTS ${HDFS_H_PATH})
message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}")
endif()
message(STATUS "Found hdfs.h at: " ${HDFS_H_PATH})
message(STATUS "Building libhdfs shim component")

include_directories(SYSTEM "${HADOOP_HOME}/include")

set(ARROW_HDFS_SRCS
hdfs.cc
libhdfs_shim.cc)

set_property(SOURCE ${ARROW_HDFS_SRCS}
APPEND_STRING PROPERTY
COMPILE_FLAGS "-DHAS_HADOOP")
11 changes: 10 additions & 1 deletion cpp/src/arrow/io/hdfs.cc
@@ -14,8 +14,11 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifdef USE_LIBHDFS3
#include <hdfs/hdfs.h>
#else
#include <hdfs.h>
#endif

#include <cstdint>
#include <sstream>
@@ -101,8 +104,14 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl {
}

Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) {
#ifdef USE_LIBHDFS3
hdfsSeek(fs_, file_, static_cast<tOffset>(position));
tSize ret = hdfsRead(fs_, file_,
reinterpret_cast<void*>(buffer), nbytes);
#else
tSize ret = hdfsPread(fs_, file_, static_cast<tOffset>(position),
reinterpret_cast<void*>(buffer), nbytes);
#endif
RETURN_NOT_OK(CheckReadResult(ret));
*bytes_read = ret;
return Status::OK();
53 changes: 53 additions & 0 deletions cpp/src/arrow/io/libhdfs3_shim.cc
@@ -0,0 +1,53 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This shim interface to libhdfs (for runtime shared library loading) has been
// adapted from the SFrame project, released under the ASF-compatible 3-clause
// BSD license
//
// Using this required having the $JAVA_HOME and $HADOOP_HOME environment
// variables set, so that libjvm and libhdfs can be located easily

// Copyright (C) 2015 Dato, Inc.
// All rights reserved.
//
// This software may be modified and distributed under the terms
// of the BSD license. See the LICENSE file for details.

#ifdef HAS_HADOOP

#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>

#include "arrow/util/status.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace io {

Status ARROW_EXPORT ConnectLibHdfs() {
return Status::OK();
}

} // namespace io
} // namespace arrow

#endif // HAS_HADOOP
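The JNI-based shim this file stands in for (libhdfs_shim.cc) loads libjvm and libhdfs at runtime, which is why the comment above mentions JAVA_HOME and HADOOP_HOME. With USE_LIBHDFS3 the library is linked directly and ConnectLibHdfs becomes a no-op, so presumably neither variable is required on that path. For the JNI path, a typical environment might look like the following; the paths are examples, not taken from this PR.

# Only relevant when building against the JNI libhdfs shim; example paths
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_HOME=/opt/hadoop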
15 changes: 15 additions & 0 deletions cpp/thirdparty/build_thirdparty.sh
@@ -18,6 +18,7 @@ else
"gtest") F_GTEST=1 ;;
"gbenchmark") F_GBENCHMARK=1 ;;
"flatbuffers") F_FLATBUFFERS=1 ;;
"libhdfs3") F_LIBHDFS3=1 ;;
*) echo "Unknown module: $arg"; exit 1 ;;
esac
done
@@ -88,5 +89,19 @@ if [ -n "$F_ALL" -o -n "$F_FLATBUFFERS" ]; then
make install || { echo "install $FLATBUFFERS_ERROR" ; exit 1; }
fi

LIBHDFS3_ERROR="failed for libhdfs3"
if [ -n "$F_ALL" -o -n "$F_LIBHDFS3" ]; then
cd $TP_DIR/$LIBHDFS3_BASEDIR
rm -rf build
mkdir build
cd build
pwd
../bootstrap --prefix=$PREFIX
make install || { echo "install $LIBHDFS3_ERROR" ; exit 1; }
fi




echo "---------------------"
echo "Thirdparty dependencies built and installed into $PREFIX successfully"
5 changes: 5 additions & 0 deletions cpp/thirdparty/download_thirdparty.sh
@@ -30,3 +30,8 @@ if [ ! -d ${FLATBUFFERS_BASEDIR} ]; then
echo "Fetching flatbuffers"
download_extract_and_cleanup $FLATBUFFERS_URL
fi

if [ ! -d ${LIBHDFS3_BASEDIR} ]; then
echo "Fetching libhdfs3"
download_extract_and_cleanup $LIBHDFS3_URL
fi
1 change: 1 addition & 0 deletions cpp/thirdparty/set_thirdparty_env.sh
@@ -10,3 +10,4 @@ fi
export GTEST_HOME=$THIRDPARTY_DIR/$GTEST_BASEDIR
export GBENCHMARK_HOME=$THIRDPARTY_DIR/installed
export FLATBUFFERS_HOME=$THIRDPARTY_DIR/installed
export LIBHDFS3_HOME=$THIRDPARTY_DIR/installed
4 changes: 4 additions & 0 deletions cpp/thirdparty/versions.sh
@@ -9,3 +9,7 @@ GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION
FLATBUFFERS_VERSION=1.3.0
FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz"
FLATBUFFERS_BASEDIR=flatbuffers-$FLATBUFFERS_VERSION

LIBHDFS3_VERSION=2.2.31
LIBHDFS3_URL="https://github.com/Pivotal-Data-Attic/pivotalrd-libhdfs3/archive/v${LIBHDFS3_VERSION}.tar.gz"
LIBHDFS3_BASEDIR=pivotalrd-libhdfs3-$LIBHDFS3_VERSION