[OpenMP][NVPTX] Drop dependence on CUDA to build NVPTX `deviceRTLs` (e7535f8f) · Commits · llvm-doe / llvm-project

openmp/README.rst

+1 −1

Original line number	Diff line number	Diff line
		@@ -281,7 +281,7 @@ Options for ``NVPTX device RTL``

		LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES = ``35``
		List of CUDA compute capabilities that should be supported by the NVPTX
		device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
		device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60;70"
		should be used. Compute capability 3.5 is the minimum required.

		LIBOMPTARGET_NVPTX_DEBUG = ``OFF\|ON``

openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake

+12 −10

Original line number	Diff line number	Diff line
		@@ -118,7 +118,9 @@ endif()
		find_package(CUDA QUIET)

		# Try to get the highest Nvidia GPU architecture the system supports
		if (CUDA_FOUND)
		set(LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY TRUE CACHE BOOL
		"Auto detect CUDA Compute Capability if CUDA is detected.")
		if (CUDA_FOUND AND LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY)
		cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS)
		string(REGEX MATCH "sm_([0-9]+)" CUDA_ARCH_MATCH_OUTPUT ${CUDA_ARCH_FLAGS})
		if (NOT DEFINED CUDA_ARCH_MATCH_OUTPUT OR "${CMAKE_MATCH_1}" LESS 35)

openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake

deleted100644 → 0

+0 −111

Original line number	Diff line number	Diff line
		#
		#//===----------------------------------------------------------------------===//
		#//
		#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		#// See https://llvm.org/LICENSE.txt for license information.
		#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		#//
		#//===----------------------------------------------------------------------===//
		#

		# We use the compiler and linker provided by the user, attempt to use the one
		# used to build libomptarget or just fail.
		set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE)

		if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
		set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
		elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
		set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
		else()
		return()
		endif()

		# Get compiler directory to try to locate a suitable linker.
		get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY)
		set(llvm_link "${compiler_dir}/llvm-link")

		if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
		set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
		elseif (EXISTS "${llvm_link}")
		# Use llvm-link from the compiler directory.
		set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}")
		else()
		return()
		endif()

		function(try_compile_bitcode output source)
		set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu)
		file(WRITE ${srcfile} "${source}\n")
		set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc)

		# The remaining arguments are the flags to be tested.
		# FIXME: Don't hardcode GPU version. This is currently required because
		# Clang refuses to compile its default of sm_20 with CUDA 9.
		execute_process(
		COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN}
		--cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile}
		RESULT_VARIABLE result
		OUTPUT_QUIET ERROR_QUIET)
		if (result EQUAL 0)
		set(${output} TRUE PARENT_SCOPE)
		else()
		set(${output} FALSE PARENT_SCOPE)
		endif()
		endfunction()

		# Save for which compiler we are going to do the following checks so that we
		# can discard cached values if the user specifies a different value.
		set(discard_cached FALSE)
		if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND
		NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}"))
		set(discard_cached TRUE)
		endif()
		set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE)

		function(check_bitcode_compilation output source)
		if (${discard_cached} OR NOT DEFINED ${output})
		message(STATUS "Performing Test ${output}")
		# Forward additional arguments which contain the flags.
		try_compile_bitcode(result "${source}" ${ARGN})
		set(${output} ${result} CACHE INTERNAL "" FORCE)
		if(${result})
		message(STATUS "Performing Test ${output} - Success")
		else()
		message(STATUS "Performing Test ${output} - Failed")
		endif()
		endif()
		endfunction()

		# These flags are required to emit LLVM Bitcode. We check them together because
		# if any of them are not supported, there is no point in finding out which are.
		set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
		set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
		check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})

		# It makes no sense to continue given that the compiler doesn't support
		# emitting basic LLVM Bitcode
		if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
		return()
		endif()

		set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required})

		# Declaring external shared device variables might need an additional flag
		# since Clang 7.0 and was entirely unsupported since version 4.0.
		set(extern_device_shared_src "extern __device__ __shared__ int test;")

		check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
		if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED)
		set(compiler_flag_fcuda_rdc -fcuda-rdc)
		set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc})
		check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})

		if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
		return()
		endif()

		set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}")
		endif()

		# We can compile LLVM Bitcode from CUDA source code!
		set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE)

openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt

+165 −143

Original line number	Diff line number	Diff line
		@@ -10,6 +10,37 @@
		#
		##===----------------------------------------------------------------------===##

		# Check if we can create an LLVM bitcode implementation of the runtime library
		# that could be inlined in the user application. For that we need to find
		# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
		# an LLVM linker.
		set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
		"Location of a CUDA compiler capable of emitting LLVM bitcode.")
		set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
		"Location of a linker capable of linking LLVM bitcode objects.")

		if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
		set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
		elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
		set(cuda_compiler ${CMAKE_C_COMPILER})
		else()
		libomptarget_say("Not building NVPTX deviceRTL: clang not found")
		return()
		endif()

		# Get compiler directory to try to locate a suitable linker.
		get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
		set(llvm_link "${compiler_dir}/llvm-link")

		if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
		set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
		elseif (EXISTS ${llvm_link})
		set(bc_linker ${llvm_link})
		else()
		libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
		return()
		endif()

		# TODO: This part needs to be refined when libomptarget is going to support
		# Windows!
		# TODO: This part can also be removed if we can change the clang driver to make
		@@ -33,25 +64,29 @@ set(devicertl_common_directory
		set(devicertl_nvptx_directory
		${devicertl_base_directory}/nvptx)

		if(LIBOMPTARGET_DEP_CUDA_FOUND)
		# Build library support for the highest compute capability the system supports
		# and always build support for sm_35 by default
		if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35)
		set(default_capabilities 35)
		if (DEFINED LIBOMPTARGET_DEP_CUDA_ARCH)
		set(default_capabilities ${LIBOMPTARGET_DEP_CUDA_ARCH})
		else()
		set(default_capabilities "35,${LIBOMPTARGET_DEP_CUDA_ARCH}")
		set(default_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
		endif()

		if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
		set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
		libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
		endif()
		set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
		"List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
		string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})

		set(nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})

		# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
		# build.
		if (NOT nvptx_sm_list)
		libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
		return()
		endif()

		# Check all SM values
		foreach(sm ${nvptx_sm_list})
		set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
		if (NOT ${sm} IN_LIST default_capabilities)
		message(FATAL_ERROR "LIBOMPTARGET-NVPTX: compute capability ${sm} is not supported. Supported values: ${default_capabilities}")
		endif()
		endforeach()

		# Override default MAX_SM in src/target_impl.h if requested
		@@ -63,18 +98,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
		set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
		"Activate NVPTX device RTL debug messages.")

		# Check if we can create an LLVM bitcode implementation of the runtime library
		# that could be inlined in the user application. For that we need to find
		# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
		# an LLVM linker.
		set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
		"Location of a CUDA compiler capable of emitting LLVM bitcode.")
		set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
		"Location of a linker capable of linking LLVM bitcode objects.")

		include(LibomptargetNVPTXBitcodeLibrary)

		if (LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED)
		libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")

		set(cuda_src_files
		@@ -143,7 +166,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
		set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")

		add_custom_command(OUTPUT ${outfile}
		COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
		COMMAND ${cuda_compiler} ${bc_flags}
		${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
		DEPENDS ${infile}
		IMPLICIT_DEPENDS CXX ${infile}
		@@ -159,7 +182,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)

		# Link to a bitcode library.
		add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
		COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
		COMMAND ${bc_linker}
		-o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
		DEPENDS ${bc_files}
		COMMENT "Linking LLVM bitcode ${bclib_name}"
		@@ -180,9 +203,8 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
		install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
		endforeach()
		endforeach()
		endif()

		# Test will be enabled if the building machine supports CUDA
		if (LIBOMPTARGET_DEP_CUDA_FOUND)
		add_subdirectory(test)
		else()
		libomptarget_say("Not building CUDA offloading device RTL: tools to build bc lib not found in the system.")
		endif()