Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[Drivers] Fix the issue when installing IB drivers. #2275

Merged
merged 14 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/drivers/build/drivers-384.111.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \


COPY build/* $STAGE_DIR/

RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
1 change: 1 addition & 0 deletions src/drivers/build/drivers-390.25.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \
done

COPY build/* $STAGE_DIR/
RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
2 changes: 1 addition & 1 deletion src/drivers/build/drivers-410.73.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \
done

COPY build/* $STAGE_DIR/

RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
8 changes: 4 additions & 4 deletions src/drivers/build/install-all-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

export MLNX_PREFIX=/var/drivers/mellanox/$MLNX_OFED_STRING/usermode
export NV_DRIVER=${DRIVER_PATH}/$NVIDIA_VERSION
export LD_LIBRARY_PATH=${MLNX_PREFIX}/lib:$LD_LIBRARY_PATH:$NV_DRIVER/lib:$NV_DRIVER/lib64
export LIBRARY_PATH=${LIBRARY_PATH:+$LIBRARY_PATH:}${MLNX_PREFIX}/lib
export LD_LIBRARY_PATH=${MLNX_PREFIX}/lib:$LD_LIBRARY_PATH:$NV_DRIVER/lib:$NV_DRIVER/lib64:/usr/local/cuda/lib64
export PATH=${MLNX_PREFIX}/bin:$PATH:$NV_DRIVER/bin
export C_INCLUDE_PATH=${C_INCLUDE_PATH:+$C_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/infiniband
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH:+$CPLUS_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH:+$CPLUS_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/infiniband

if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VGA compatible) controller: NVIDIA Corporation.*"; then
if [ -f "$PRE_INSTALLED_NV_DRIVER_PATH/bin/nvidia-smi" ]; then
Expand All @@ -46,7 +46,7 @@ fi
if lspci | grep -qE '(Network|Infiniband) controller.*Mellanox.*ConnectX'; then
echo Infiniband hardware detected
# Installing InfiniBand drivers and GPU direct RDMA drivers
/bin/bash install-ib-drivers || exit $?
./install-ib-drivers || exit $?
echo Infiniband drivers is installed successfully.
else
echo Infiniband hardware is not detected, skipping driver installation
Expand Down
18 changes: 12 additions & 6 deletions src/drivers/build/install-ib-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,21 @@ KERNEL_FULL_VERSION=`uname -r`
HOSTNAME=`hostname`
# HACK: using last octet of the host's IP
LAST_OCTET=`host $HOSTNAME | head -n1 | sed 's/^.*\.//'`
IB_SUBNET=33
IP_ADDRESS="192.168.$IB_SUBNET.$LAST_OCTET"
# Build the InfiniBand interface address as 192.168.<OCT1>.<OCT2>.
# OCT1 is the starting third octet; OCT2 is the last dot-separated field of
# POD_IP (an empty POD_IP yields an empty OCT2).
echo "POD_IP: ${POD_IP}"
OCT1=33
echo "OCT1: ${OCT1}"
OCT2=$(echo "${POD_IP}" | awk -F'.' '{ print $(NF) }')
# This line previously printed the OCT2 value under the label "OCT1".
echo "OCT2: ${OCT2}"
IP_ADDRESS="192.168.${OCT1}.${OCT2}"

echo IB_ADRESS: $IP_ADDRESS

CURRENT_DRIVER=/var/drivers/mellanox/current

if [[ ! -f /var/drivers/mellanox/$MLNX_OFED_STRING/mlnxofedinstall ]]; then
[[ -f /tmp/$MLNX_OFED_STRING-ext.tgz ]] ||
{
./mlnx_add_kernel_support.sh -y -m ./$MLNX_OFED_STRING --make-tgz || exit $?
./$MLNX_OFED_STRING/mlnx_add_kernel_support.sh -y -m ./$MLNX_OFED_STRING --make-tgz || exit $?
}
mkdir -p /var/drivers/mellanox/$MLNX_OFED_STRING || exit $?
tar -xvf /tmp/$MLNX_OFED_STRING-ext.tgz -C /var/drivers/mellanox/$MLNX_OFED_STRING --strip 1 || exit $?
Expand Down Expand Up @@ -113,7 +119,7 @@ EOF
# Installing GPU direct RDMA drivers
# NOTE: do this here because it takes some time to install GDR drivers
# and that's enough time for IB devices to come up so we can test them
/bin/bash install-gdr-drivers || exit $?
./install-gdr-drivers || exit $?

IB_DEVICES=`ibstat -l | xargs`

Expand All @@ -139,12 +145,12 @@ do
GID=$(cat "$port_path/gids/0" | sed "s/://g")
GID_ADDRESS=${GID: -12}
IB_INTERFACE=${ADDRESS_MAP[$GID_ADDRESS]}
IB_IP_ADDRESS="192.168.$IB_SUBNET.$LAST_OCTET"
IB_IP_ADDRESS="192.168.${OCT1}.${OCT2}"
echo "Assiging ip address $IB_IP_ADDRESS for $IB_INTERFACE interface"
ifconfig $IB_INTERFACE up $IB_IP_ADDRESS/24 || exit $?
grep -q "connected" /sys/class/net/$IB_INTERFACE/mode || echo "connected" > /sys/class/net/$IB_INTERFACE/mode || exit $?
grep -q "65520" /sys/class/net/$IB_INTERFACE/mtu || echo "65520" > /sys/class/net/$IB_INTERFACE/mtu || exit $?
IB_SUBNET=$((IB_SUBNET+1))
OCT1=$((OCT1+1))
fi

if grep -q Ethernet "$port_path/link_layer" && grep -qE "LinkUp|Polling" "$port_path/phys_state"; then
Expand Down
3 changes: 2 additions & 1 deletion src/drivers/build/install-nvidia-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ mkdir -p $NV_DRIVER/lib $NV_DRIVER/lib64 $NV_DRIVER/bin || exit $?
--utility-libdir=lib64 \
--x-library-path=lib64 \
--compat32-libdir=lib \
-s -N
--dkms \
-a -s -N

echo === Loading NVIDIA UVM module
modprobe nvidia-uvm || exit $?
Expand Down
4 changes: 4 additions & 0 deletions src/drivers/deploy/drivers.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ spec:
value: /var/drivers/nvidia
- name: PRE_INSTALLED_NV_DRIVER_PATH
value: /usr/local/nvidia # the path user has pre-installed nvidia driver
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
readinessProbe:
exec:
command:
Expand Down