Commit e1992ae2 authored by Abraham, Subil's avatar Abraham, Subil
Browse files

pytorch 1.10.0 attempts

parent 60a2388d
Loading
Loading
Loading
Loading
+207 −0
Original line number Diff line number Diff line
# CentOS 8, CUDA 11.0.3, PyTorch 1.10 built from source.
# NOTE(review): the base image is referenced by the mutable `latest` tag, so
# rebuilds are not reproducible; pin a specific tag or digest once one is known.
FROM code.ornl.gov:4567/76a/olcfbaseimages/mpiimage-centos-cuda:latest

# Enable the PowerTools repository, then install the OS-level build
# dependencies. Everything runs in one layer so that `dnf clean all` removes
# the package cache from the same layer that created it.
RUN dnf install -y dnf-plugins-core \
    && dnf config-manager --set-enabled powertools \
    && dnf -y install \
        cuda-command-line-tools-11-0 \
        glibc-devel \
        gpg \
        java-11-openjdk-devel \
        kernel-devel \
        patch \
        python3 \
        python3-devel \
        python3-pip \
        tzdata \
        unzip \
        zip \
    && dnf -y clean all

# Install Miniconda (Python 3.9, ppc64le installer) into CONDA_HOME.
ENV CONDA_HOME="/opt/conda"
# WORKDIR creates /code when it is missing, so no separate `mkdir` is needed
# (a bare `mkdir /code` fails if the base image already ships that directory).
WORKDIR /code
ENV _CONDA_INSTALLER="Miniconda3-py39_4.9.2-Linux-ppc64le.sh"
# Download the installer, run it in batch mode, and delete it in the same layer.
RUN wget -q -P /code \
        https://repo.anaconda.com/miniconda/${_CONDA_INSTALLER} \
    && bash /code/${_CONDA_INSTALLER} -b -p ${CONDA_HOME} \
    && rm /code/${_CONDA_INSTALLER}

# Put conda's python/pip/cmake ahead of the system ones for all later steps.
ENV PATH="${CONDA_HOME}/bin:${PATH}"
# Drop the downloaded package archives in the same layer to keep it small.
RUN conda install -y conda-build conda-verify \
    && conda clean --all --yes

# Root directory under which every source tree below is cloned and built.
ENV INSTALL_ROOT=/code
# PyTorch release to build; used below as the git tag v${PYTORCH_VER}.
ENV PYTORCH_VER=1.10.0

# Make sure the install root exists before anything is cloned into it.
RUN mkdir -p ${INSTALL_ROOT}

# Build NCCL v2.11.4-1 from source for compute capability 7.0, keep only the
# resulting lib/ and include/ directories under ${INSTALL_ROOT}/nccl, and
# discard the source tree.
# CUDA_HOME is not set anywhere in this file; presumably it is exported by the
# base image -- verify.
RUN cd ${INSTALL_ROOT} \
    && git clone --branch v2.11.4-1 https://github.com/NVIDIA/nccl.git nccl_build \
    && cd nccl_build \
    && make -j10 src.build CUDA_HOME=${CUDA_HOME} NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" \
    && cd ${INSTALL_ROOT} \
    && mkdir nccl \
    && mv nccl_build/build/lib nccl_build/build/include nccl \
    && rm -rf nccl_build

# Location of the NCCL build produced above (its lib/ and include/ directories
# live here). Only NCCL is configured at this point.
ENV NCCL_ROOT=${INSTALL_ROOT}/nccl


# Build prerequisites for PyTorch, followed by a source build of MAGMA 2.6.1.

# CUDA compute capability targeted by the builds below (7.0).
ENV TORCH_CUDA_ARCH_LIST="7.0"
# Python and CMake tooling installed into the conda base environment.
RUN conda install -y pip astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions future six requests dataclasses
# LAPACK 3.9.0 from conda-forge (presumably the LAPACK that MAGMA links against -- verify).
RUN conda install -y -c conda-forge lapack==3.9.0
# Download, build and install MAGMA as shared libraries for sm_70. The host
# compiler is passed -mno-float128 through nvcc. The tarball is deleted in the
# same layer right after extraction so it does not stay in the image.
# NOTE(review): the archive is fetched over plain HTTP without a checksum;
# consider verifying a known SHA-256 before extracting.
# After `make install`, four internal headers are copied into /usr/include
# (presumably because a later build includes them -- verify).
RUN cd /code \
    && wget -q -P /code http://icl.utk.edu/projectsfiles/magma/downloads/magma-2.6.1.tar.gz \
    && tar -xzf magma-2.6.1.tar.gz \
    && rm magma-2.6.1.tar.gz \
    && cd magma-2.6.1 \
    && mkdir build \
    && cd build \
    && cmake -DBUILD_SHARED_LIBS=ON -DUSE_FORTRAN=yes -DGPU_TARGET=sm_70 -DMAGMA_ENABLE_CUDA=ON -DCUDA_NVCC_FLAGS="-Xcompiler;-mno-float128" .. \
    && make install \
    && cd .. \
    && cp magmablas/atomics.cuh control/magma_threadsetting.h control/pthread_barrier.h \
        control/magma_internal.h /usr/include

# Unused alternative to the MAGMA source build, kept for reference:
#conda install -y magma-cuda110

## Fetch the PyTorch sources
# Set a placeholder git identity first; the cherry-pick below creates a commit.
RUN git config --global user.email "you@example.com" \
    && git config --global user.name "Your Name"
# Clone the v${PYTORCH_VER} tag with all submodules, apply one extra commit on
# top of it, and clear any previous build output.
RUN cd ${INSTALL_ROOT} \
    && git clone --recursive --branch v${PYTORCH_VER} https://github.com/pytorch/pytorch.git pytorch \
    && cd pytorch \
    && git cherry-pick 7452b65144340d199e99c5d69849414d970002fa \
    && python setup.py clean
# Build configuration for the PyTorch source build.
# Upper bound on parallel compile jobs.
ENV MAX_JOBS=10
# ENV performs variable substitution only, never shell command substitution, so
# the previous `$(dirname $(which conda))/../` fallback was stored as a literal
# string instead of a path. conda is installed in ${CONDA_HOME}/bin, so the
# path that expression was meant to produce is CONDA_HOME itself.
ENV CMAKE_PREFIX_PATH=${CONDA_PREFIX:-${CONDA_HOME}}
# cuDNN location.
# NOTE(review): 64-bit libraries on CentOS normally live in /usr/lib64 -- confirm
# that /usr/lib is really where the base image puts libcudnn.
ENV USE_CUDNN=1
ENV CUDNN_INCLUDE_DIR="/usr/include"
ENV CUDNN_LIB_DIR="/usr/lib"
# Use the NCCL built earlier under NCCL_ROOT instead of a bundled copy.
ENV NCCL_INCLUDE_DIR="${NCCL_ROOT}/include/"
ENV NCCL_LIB_DIR="${NCCL_ROOT}/lib/"
ENV USE_SYSTEM_NCCL=1
# Ninja and Breakpad are switched off for this build.
ENV USE_NINJA=OFF
ENV USE_BREAKPAD=OFF
# Compile and install PyTorch from the checkout prepared above.
RUN cd ${INSTALL_ROOT}/pytorch \
    && python3 setup.py -v install


# Move conda's bundled libgomp and libstdc++ into a "hide" subdirectory of
# ${CONDA_HOME}/lib. The bundled libgomp is an old one that causes
# compatibility issues when importing torch.
RUN mkdir ${CONDA_HOME}/lib/hide \
    && cd ${CONDA_HOME}/lib \
    && mv libgomp* libstdc++* hide/

# Install DGL 0.7.1 from source with CUDA (arch 70) and the PyTorch extension.
RUN pip install --no-cache-dir pybind11 pythran scipy
# Patch for the GKlib sources bundled under third_party/METIS; applied below.
COPY ./gklib_patch ${INSTALL_ROOT}/
# Clone with submodules, patch GKlib, configure, build, then install the Python
# package. The build is capped at MAX_JOBS parallel jobs: a bare `make -j`
# places no limit on the number of jobs and can exhaust memory on a large
# CUDA/C++ build.
RUN cd ${INSTALL_ROOT} \
    && git clone --recurse-submodules --branch 0.7.1 https://github.com/dmlc/dgl.git \
    && cd dgl/third_party/METIS/GKlib \
    && git apply ${INSTALL_ROOT}/gklib_patch \
    && cd ${INSTALL_ROOT}/dgl \
    && mkdir build \
    && cd build \
    && cmake -DUSE_CUDA=ON -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN="70" -DUSE_AVX=OFF \
        -DUSE_LIBXSMM=OFF -DBUILD_TORCH=ON .. \
    && make -j"${MAX_JOBS:-10}" \
    && cd ../python \
    && python setup.py install

# Build NVIDIA Apex with its C++ and CUDA extensions enabled.
# NOTE(review): the clone tracks the repository's default branch with no pinned
# tag or commit, so the result depends on when the image is built.
RUN git clone https://github.com/NVIDIA/apex.git ${INSTALL_ROOT}/apex \
    && cd ${INSTALL_ROOT}/apex \
    && pip install --no-cache-dir --disable-pip-version-check \
        --global-option="--cpp_ext" \
        --global-option="--cuda_ext" \
        .



## Install DALI_deps
# Add /usr/local/lib to the library and pkg-config search paths.
# `${VAR:+${VAR}:}` expands to "<old value>:" only when VAR is already set, so
# an unset variable no longer produces a leading empty entry (which the dynamic
# loader treats as the current directory).
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib
ENV LD_RUN_PATH=${LD_RUN_PATH:+${LD_RUN_PATH}:}/usr/local/lib
ENV PKG_CONFIG_PATH=${PKG_CONFIG_PATH:+${PKG_CONFIG_PATH}:}/usr/local/lib/pkgconfig
# Shallow-clone DALI_deps v1.6.0 with its submodules, strip the "cmake" entry
# from build_deps.sh (presumably so the cmake already on PATH is used instead
# of building another one -- verify), install the autotools toolchain, run
# libtoolize in each autotools-based dependency, then build everything.
# The dnf and pip caches are removed in this same layer.
RUN cd ${INSTALL_ROOT} \
    && git clone --depth 1 --branch v1.6.0 https://github.com/NVIDIA/DALI_deps \
    && cd DALI_deps \
    && git submodule init \
    && git submodule update --depth 1 --recursive \
    && sed -i "s/\"cmake\"//" build_scripts/build_deps.sh \
    && dnf install -y libtool clang-devel autogen automake \
    && dnf -y clean all \
    && pip install --no-cache-dir clang \
    && for dep in protobuf flac ogg vorbis opus libtar; do \
        (cd "${INSTALL_ROOT}/DALI_deps/third_party/${dep}" && libtoolize) || exit 1; \
    done \
    && cd ${INSTALL_ROOT}/DALI_deps \
    && build_scripts/build_deps.sh

# Build torchvision v0.10.0 from a shallow clone and install it into conda.
# NOTE(review): v0.10.0 is the torchvision release paired with PyTorch 1.9;
# confirm it is the intended tag for PYTORCH_VER=1.10.0.
# Disabled step kept for reference: dnf install -y libjpeg-turbo-devel
RUN git clone --depth 1 --branch v0.10.0 https://github.com/pytorch/vision ${INSTALL_ROOT}/vision \
    && cd ${INSTALL_ROOT}/vision \
    && python setup.py install

# Build OpenCV 4.5.3 only to obtain the static 3rd-party libraries libquirc.a
# and libade.a. OpenCV itself is not installed from this build (there is no
# `make install`); the source tree is deleted afterwards.
# The destination directory is created explicitly because nothing in this step
# creates it (presumably it otherwise comes from the DALI_deps OpenCV install --
# verify). A shallow clone is enough since only the 4.5.3 tag is built.
RUN cd ${INSTALL_ROOT} \
    && git clone --depth 1 --branch 4.5.3 https://github.com/opencv/opencv \
    && cd opencv \
    && mkdir build \
    && cd build \
    && cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D INSTALL_PYTHON_EXAMPLES=ON -D INSTALL_C_EXAMPLES=ON .. \
    && make -j32 \
    && mkdir -p /usr/local/lib64/opencv4/3rdparty \
    && cp ${INSTALL_ROOT}/opencv/build/3rdparty/lib/libquirc.a \
        ${INSTALL_ROOT}/opencv/build/3rdparty/lib/libade.a \
        /usr/local/lib64/opencv4/3rdparty/ \
    && cd ${INSTALL_ROOT} \
    && rm -rf opencv

# Install Horovod from PyPI; --no-cache-dir keeps pip's download cache out of
# the image layer.
# NOTE(review): no HOROVOD_* build options are set and the version is not
# pinned -- confirm the default build picks up the intended GPU/NCCL support.
RUN pip install --no-cache-dir horovod


## Install DALI (not working)
#COPY ./cmake_patch /code 
#RUN cd ${INSTALL_ROOT} \
#	&& git clone --recurse-submodules --shallow-submodules https://github.com/NVIDIA/DALI --depth 1 --branch v1.6.0 \
#	&& cd DALI \ 
#	&& git apply /code/cmake_patch \
#	&& mkdir build \
#	&& cd build \
#	&& ln -s /usr/lib64/libclang.so /usr/lib64/libclang-11.so \
#	&& cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_NVJPEG2K=OFF -DFFMPEG_ROOT_DIR=${CONDA_HOME} -DJPEG_INCLUDE_DIR=/usr/local/include -DJPEG_LIBRARY=/usr/local/lib/libturbojpeg.so -DCUDA_TARGET_ARCHS=70 -DBUILD_PROTOBUF=ON -DProtobuf_LIBRARY=/usr/local/lib/libprotobuf.a -DFFmpeg_Libavcodec=/usr/local/lib/libavcodec.so -DFFmpeg_Libavfilter=/usr/local/lib/libavfilter.so -DFFmpeg_Libavformat=/usr/local/lib/libavformat.so -DFFmpeg_Libavutil=/usr/local/lib/libavutil.so .. \
#	&& make -j1 \
#	&& make install \
#	&& pip install dali/python

### failed attempt 1
#RUN cd ${INSTALL_ROOT} \
#	&& git clone --recurse-submodules --shallow-submodules https://github.com/NVIDIA/DALI --depth 1 --branch v1.6.0 \
#	&& cd DALI \ 
#	&& mkdir build \
#	&& cd build \
#	&& dnf install -y clang-devel \
#	&& pip install clang \
#	&& ln -s /usr/lib64/libclang.so /usr/lib64/libclang-11.so \
#	&& conda install -y -c conda-forge opencv libsndfile libtar ffmpeg protobuf libjpeg-turbo \
#	&& cmake  -DCMAKE_BUILD_TYPE=Release -DBUILD_NVJPEG2K=OFF -DFFMPEG_ROOT_DIR=${CONDA_HOME} -DJPEG_INCLUDE_DIR=${CONDA_HOME}/include -DJPEG_LIBRARY=${CONDA_HOME}/lib/libturbojpeg.so -DCUDA_TARGET_ARCHS=70 -DBUILD_PROTOBUF=ON -DProtobuf_LIBRARY=/opt/conda/lib/libprotobuf.so .. \
#	&& make -j4 \
#	&& make install \
#	&& pip install dali/python
### failed attempt 2
## dnf install -y clang-devel opencv libsndfile libtar protobuf
#	&& pip install clang \
#	&& ln -s /usr/lib64/libclang.so /usr/lib64/libclang-11.so \
#	&& conda install -y -c conda-forge ffmpeg \
#	&& cmake  -DCMAKE_BUILD_TYPE=Release -DBUILD_NVJPEG2K=OFF -DFFMPEG_ROOT_DIR=${CONDA_HOME} -DCUDA_TARGET_ARCHS=70 -DBUILD_PROTOBUF=ON -DProtobuf_LIBRARY=/usr/lib64/libprotobuf.so .. \
# Install DLProf
+5253 −0

File added.

Preview size limit exceeded, changes collapsed.