Loading tensorflowimage-centos-cuda/Dockerfile.mpiimagebase_peak +2 −4 Original line number Diff line number Diff line # Centos 8, Cuda 11.0.3, tensorflow 2.5.0, JAX FROM code.ornl.gov:4567/76a/olcfbaseimages/mpiimage-centos-cuda:latest ARG mpi_root RUN dnf install -y dnf-plugins-core \ && dnf config-manager --set-enabled powertools Loading Loading @@ -113,9 +114,6 @@ RUN pip install /tmp/tensorflow_pkg/tensorflow-2.5.0-cp39-cp39-linux_ppc64le.whl #RUN pip install horovod #ENV LD_LIBRARY_PATH=/sw/peak/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 #RUN dnf install -y --allowerasing openmpi ENV PATH=$PATH:/usr/mpi/gcc/openmpi-4.0.3rc4/bin ENV CPATH=$CPATH:/usr/mpi/gcc/openmpi-4.0.3rc4/include ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/mpi/gcc/openmpi-4.0.3rc4/lib64 RUN HOROVOD_WITH_MPI=1 CXX=/usr/bin/g++ CC=/usr/bin/gcc pip install --no-cache-dir --no-binary horovod horovod RUN HOROVOD_WITH_MPI=1 CC=$mpi_root/bin/mpicc CXX=$mpi_root/bin/mpicxx pip install --no-cache-dir --no-binary horovod horovod # #ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" tensorflowimage-centos-cuda/build_on_peak.sh 0 → 100755 +19 −0 Original line number Diff line number Diff line #!/bin/bash module purge module load DefApps module load gcc/9.1.0 export XDG_CACHE_HOME=/gpfs/alpine/stf007/world-shared/subil/tmp/podman-cache; # this is needed because a dnf install step during the container build will fail saying that it can't create a file. # Said file might have been created in previous build attempts and so needs to be cleared. 
rm -rf /var/cache/dnf; ulimit -n 262144; podman build --build-arg mpi_root=$MPI_ROOT -v $MPI_ROOT:$MPI_ROOT,/gpfs/alpine/stf007/world-shared/subil/tmp:/tmp --ulimit nofile=262144:262144 -f Dockerfile.mpiimagebase_peak -t code.ornl.gov:4567/76a/olcfbaseimages/tensorflowimage-mpiimagebase-centos-cuda:latest . ; podman save -o tensorflow_v2.5.0_mpiimagebase.tar code.ornl.gov:4567/76a/olcfbaseimages/tensorflowimage-mpiimagebase-centos-cuda:latest; singularity build tensorflow_v2.5.0_mpiimagebase.sif docker-archive://tensorflow_v2.5.0_mpiimagebase.tar; Loading
tensorflowimage-centos-cuda/Dockerfile.mpiimagebase_peak  +2 −4
 # Centos 8, Cuda 11.0.3, tensorflow 2.5.0, JAX
 FROM code.ornl.gov:4567/76a/olcfbaseimages/mpiimage-centos-cuda:latest
+ARG mpi_root
 RUN dnf install -y dnf-plugins-core \
     && dnf config-manager --set-enabled powertools
[... unchanged lines not shown ...]
@@ -113,9 +114,6 @@ RUN pip install /tmp/tensorflow_pkg/tensorflow-2.5.0-cp39-cp39-linux_ppc64le.whl
 #RUN pip install horovod
 #ENV LD_LIBRARY_PATH=/sw/peak/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 #RUN dnf install -y --allowerasing openmpi
-ENV PATH=$PATH:/usr/mpi/gcc/openmpi-4.0.3rc4/bin
-ENV CPATH=$CPATH:/usr/mpi/gcc/openmpi-4.0.3rc4/include
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/mpi/gcc/openmpi-4.0.3rc4/lib64
-RUN HOROVOD_WITH_MPI=1 CXX=/usr/bin/g++ CC=/usr/bin/gcc pip install --no-cache-dir --no-binary horovod horovod
+RUN HOROVOD_WITH_MPI=1 CC=$mpi_root/bin/mpicc CXX=$mpi_root/bin/mpicxx pip install --no-cache-dir --no-binary horovod horovod
 #
 #ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH"
#!/bin/bash
# tensorflowimage-centos-cuda/build_on_peak.sh (new file, mode 100755)
#
# Builds the container image described by Dockerfile.mpiimagebase_peak with
# podman, exports it as a docker archive, and converts that archive into a
# Singularity image file.
#
# Usage: run from the directory containing Dockerfile.mpiimagebase_peak.
# Requires: MPI_ROOT to be set once the modules below are loaded
#   (presumably exported by an MPI module pulled in by DefApps -- confirm).
# Exit status: non-zero as soon as the build, save, or conversion step fails.

module purge
module load DefApps
module load gcc/9.1.0

# MPI_ROOT is passed to the build both as a build argument and as a bind
# mount; an empty value would silently yield a malformed `-v :,...` option,
# so stop here with a clear message instead.
: "${MPI_ROOT:?MPI_ROOT must be set (expected from the loaded modules)}"

readonly shared_tmp=/gpfs/alpine/stf007/world-shared/subil/tmp
readonly image=code.ornl.gov:4567/76a/olcfbaseimages/tensorflowimage-mpiimagebase-centos-cuda:latest
readonly archive=tensorflow_v2.5.0_mpiimagebase.tar
readonly sif=tensorflow_v2.5.0_mpiimagebase.sif

export XDG_CACHE_HOME="${shared_tmp}/podman-cache"

# This is needed because a dnf install step during the container build will
# fail saying that it can't create a file. Said file might have been created
# in previous build attempts and so needs to be cleared.
rm -rf /var/cache/dnf

ulimit -n 262144

# Each later step consumes the previous step's output, so abort on the first
# failure rather than packaging a stale image or archive.
podman build \
  --build-arg "mpi_root=${MPI_ROOT}" \
  -v "${MPI_ROOT}:${MPI_ROOT},${shared_tmp}:/tmp" \
  --ulimit nofile=262144:262144 \
  -f Dockerfile.mpiimagebase_peak \
  -t "${image}" \
  . || exit

podman save -o "${archive}" "${image}" || exit

singularity build "${sif}" "docker-archive://${archive}" || exit