Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
[Drivers] Fix the issue when installing IB drivers. (#2275)
Browse files Browse the repository at this point in the history
  • Loading branch information
ydye authored Mar 7, 2019
1 parent c67ab37 commit c275ca6
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 14 deletions.
2 changes: 1 addition & 1 deletion src/drivers/build/drivers-384.111.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \


COPY build/* $STAGE_DIR/

RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
1 change: 1 addition & 0 deletions src/drivers/build/drivers-390.25.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \
done

COPY build/* $STAGE_DIR/
RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
2 changes: 1 addition & 1 deletion src/drivers/build/drivers-410.73.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,6 @@ RUN cd $MLNX_OFED_STRING/DEBS && \
done

COPY build/* $STAGE_DIR/

RUN chmod a+x enable-nvidia-persistenced-mode.sh install-all-drivers install-gdr-drivers install-ib-drivers install-nvidia-drivers

CMD /bin/bash install-all-drivers
8 changes: 4 additions & 4 deletions src/drivers/build/install-all-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

export MLNX_PREFIX=/var/drivers/mellanox/$MLNX_OFED_STRING/usermode
export NV_DRIVER=${DRIVER_PATH}/$NVIDIA_VERSION
export LD_LIBRARY_PATH=${MLNX_PREFIX}/lib:$LD_LIBRARY_PATH:$NV_DRIVER/lib:$NV_DRIVER/lib64
export LIBRARY_PATH=${LIBRARY_PATH:+$LIBRARY_PATH:}${MLNX_PREFIX}/lib
# Library search order: Mellanox user-mode libs first, then whatever the
# caller already had, then the NVIDIA driver and CUDA library directories.
# The inherited value is spliced in only when it is non-empty: an empty
# component ("::") in LD_LIBRARY_PATH is interpreted by the dynamic loader as
# the current working directory, which must not be searched implicitly.
# This mirrors the ${VAR:+...} guard used for the other *_PATH exports here.
export LD_LIBRARY_PATH=${MLNX_PREFIX}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}:$NV_DRIVER/lib:$NV_DRIVER/lib64:/usr/local/cuda/lib64
export PATH=${MLNX_PREFIX}/bin:$PATH:$NV_DRIVER/bin
export C_INCLUDE_PATH=${C_INCLUDE_PATH:+$C_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/infiniband
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH:+$CPLUS_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH:+$CPLUS_INCLUDE_PATH:}${MLNX_PREFIX}/include:${MLNX_PREFIX}/include/infiniband

if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VGA compatible) controller: NVIDIA Corporation.*"; then
if [ -f "$PRE_INSTALLED_NV_DRIVER_PATH/bin/nvidia-smi" ]; then
Expand All @@ -46,7 +46,7 @@ fi
if lspci | grep -qE '(Network|Infiniband) controller.*Mellanox.*ConnectX'; then
echo Infiniband hardware detected
# Installing InfiniBand drivers and GPU direct RDMA drivers
/bin/bash install-ib-drivers || exit $?
./install-ib-drivers || exit $?
echo Infiniband drivers is installed successfully.
else
echo Infiniband hardware is not detected, skipping driver installation
Expand Down
19 changes: 12 additions & 7 deletions src/drivers/build/install-ib-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,21 @@ KERNEL_FULL_VERSION=`uname -r`
HOSTNAME=`hostname`
# HACK: using last octet of the host's IP
LAST_OCTET=`host $HOSTNAME | head -n1 | sed 's/^.*\.//'`
IB_SUBNET=33
IP_ADDRESS="192.168.$IB_SUBNET.$LAST_OCTET"
# Build the address used for the InfiniBand interface:
#   192.168.<OCT1>.<OCT2>
# OCT1 starts at 33; OCT2 is the last dot-separated field of POD_IP.

# Print the text after the last '.' of $1 (the whole string if it has no '.').
last_octet_of() {
  printf '%s\n' "${1##*.}"
}

echo "POD_IP: ${POD_IP:-}"
OCT1=33
echo "OCT1: $OCT1"
OCT2=$(last_octet_of "${POD_IP:-}")
echo "OCT2: $OCT2"
# Surface a bad POD_IP here rather than as an obscure ifconfig failure later.
# This is a warning only; the script keeps its existing control flow.
if ! [[ "$OCT2" =~ ^[0-9]+$ ]] || (( 10#$OCT2 > 255 )); then
  echo "WARNING: cannot derive a valid last octet from POD_IP='${POD_IP:-}'" >&2
fi
IP_ADDRESS="192.168.${OCT1}.${OCT2}"

echo IB_ADRESS: $IP_ADDRESS

CURRENT_DRIVER=/var/drivers/mellanox/current

if [[ ! -f /var/drivers/mellanox/$MLNX_OFED_STRING/mlnxofedinstall ]]; then
[[ -f /tmp/$MLNX_OFED_STRING-ext.tgz ]] ||
{
./mlnx_add_kernel_support.sh -y -m ./$MLNX_OFED_STRING --make-tgz || exit $?
./$MLNX_OFED_STRING/mlnx_add_kernel_support.sh -y -m ./$MLNX_OFED_STRING --make-tgz || exit $?
}
mkdir -p /var/drivers/mellanox/$MLNX_OFED_STRING || exit $?
tar -xvf /tmp/$MLNX_OFED_STRING-ext.tgz -C /var/drivers/mellanox/$MLNX_OFED_STRING --strip 1 || exit $?
Expand Down Expand Up @@ -113,7 +119,7 @@ EOF
# Installing GPU direct RDMA drivers
# NOTE: do this here because it takes some time to install GDR drivers
# and that's enough time for IB devices to come up so we can test them
/bin/bash install-gdr-drivers || exit $?
./install-gdr-drivers || exit $?

IB_DEVICES=`ibstat -l | xargs`

Expand All @@ -139,12 +145,12 @@ do
GID=$(cat "$port_path/gids/0" | sed "s/://g")
GID_ADDRESS=${GID: -12}
IB_INTERFACE=${ADDRESS_MAP[$GID_ADDRESS]}
IB_IP_ADDRESS="192.168.$IB_SUBNET.$LAST_OCTET"
IB_IP_ADDRESS="192.168.${OCT1}.${OCT2}"
echo "Assiging ip address $IB_IP_ADDRESS for $IB_INTERFACE interface"
ifconfig $IB_INTERFACE up $IB_IP_ADDRESS/24 || exit $?
grep -q "connected" /sys/class/net/$IB_INTERFACE/mode || echo "connected" > /sys/class/net/$IB_INTERFACE/mode || exit $?
grep -q "65520" /sys/class/net/$IB_INTERFACE/mtu || echo "65520" > /sys/class/net/$IB_INTERFACE/mtu || exit $?
IB_SUBNET=$((IB_SUBNET+1))
OCT1=$((OCT1+1))
fi

if grep -q Ethernet "$port_path/link_layer" && grep -qE "LinkUp|Polling" "$port_path/phys_state"; then
Expand All @@ -161,4 +167,3 @@ ibdev2netdev || exit $?
# Final check
ibPresent
echo ibPresent exit value: $?

3 changes: 2 additions & 1 deletion src/drivers/build/install-nvidia-drivers
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ mkdir -p $NV_DRIVER/lib $NV_DRIVER/lib64 $NV_DRIVER/bin || exit $?
--utility-libdir=lib64 \
--x-library-path=lib64 \
--compat32-libdir=lib \
-s -N
--dkms \
-a -s -N

echo === Loading NVIDIA UVM module
modprobe nvidia-uvm || exit $?
Expand Down
4 changes: 4 additions & 0 deletions src/drivers/deploy/drivers.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ spec:
value: /var/drivers/nvidia
- name: PRE_INSTALLED_NV_DRIVER_PATH
value: /usr/local/nvidia # the path user has pre-installed nvidia driver
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
readinessProbe:
exec:
command:
Expand Down

0 comments on commit c275ca6

Please sign in to comment.