Commit e7535f8f authored by Shilei Tian's avatar Shilei Tian
Browse files

[OpenMP][NVPTX] Drop dependence on CUDA to build NVPTX `deviceRTLs`

With D94745, we no longer use CUDA SDK to compile `deviceRTLs`. Therefore,
many CMake code in the project is useless. This patch cleans up unnecessary code
and also drops the requirement to build NVPTX `deviceRTLs`. CUDA detection is
still being used however to determine whether we need to involve the tests. Auto
detection of compute capability is enabled by default and can be disabled by
setting CMake variable `LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY=OFF`.
If auto detection is enabled, and CUDA is also valid, it will only build the
bitcode library for the detected version; otherwise, all variants supported will
be generated. One drawback of this patch is, we now generate 96 variants of
bitcode library, and totally 1485 files to be built with a clean build on a
non-CUDA system. `LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=""` can be used to
disable building NVPTX `deviceRTLs`.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95466
parent e1d61789
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -281,7 +281,7 @@ Options for ``NVPTX device RTL``

 **LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35``
  List of CUDA compute capabilities that should be supported by the NVPTX
  device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
  device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60;70"
  should be used. Compute capability 3.5 is the minimum required.

 **LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
+12 −10
Original line number Diff line number Diff line
@@ -118,7 +118,9 @@ endif()
find_package(CUDA QUIET)

# Try to get the highest Nvidia GPU architecture the system supports
if (CUDA_FOUND)
set(LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY TRUE CACHE BOOL
  "Auto detect CUDA Compute Capability if CUDA is detected.")
if (CUDA_FOUND AND LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY)
  cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS)
  string(REGEX MATCH "sm_([0-9]+)" CUDA_ARCH_MATCH_OUTPUT ${CUDA_ARCH_FLAGS})
  if (NOT DEFINED CUDA_ARCH_MATCH_OUTPUT OR "${CMAKE_MATCH_1}" LESS 35)
+0 −111
Original line number Diff line number Diff line
#
#//===----------------------------------------------------------------------===//
#//
#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
#// See https://llvm.org/LICENSE.txt for license information.
#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#//
#//===----------------------------------------------------------------------===//
#

# We use the compiler and linker provided by the user, attempt to use the one
# used to build libomptarget or just fail.
set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE)

if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
else()
  return()
endif()

# Get compiler directory to try to locate a suitable linker.
get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY)
set(llvm_link "${compiler_dir}/llvm-link")

if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
elseif (EXISTS "${llvm_link}")
  # Use llvm-link from the compiler directory.
  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}")
else()
  return()
endif()

function(try_compile_bitcode output source)
  set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu)
  file(WRITE ${srcfile} "${source}\n")
  set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc)

  # The remaining arguments are the flags to be tested.
  # FIXME: Don't hardcode GPU version. This is currently required because
  #        Clang refuses to compile its default of sm_20 with CUDA 9.
  execute_process(
    COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN}
      --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile}
    RESULT_VARIABLE result
    OUTPUT_QUIET ERROR_QUIET)
  if (result EQUAL 0)
    set(${output} TRUE PARENT_SCOPE)
  else()
    set(${output} FALSE PARENT_SCOPE)
  endif()
endfunction()

# Save for which compiler we are going to do the following checks so that we
# can discard cached values if the user specifies a different value.
set(discard_cached FALSE)
if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND
    NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}"))
  set(discard_cached TRUE)
endif()
set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE)

function(check_bitcode_compilation output source)
  if (${discard_cached} OR NOT DEFINED ${output})
    message(STATUS "Performing Test ${output}")
    # Forward additional arguments which contain the flags.
    try_compile_bitcode(result "${source}" ${ARGN})
    set(${output} ${result} CACHE INTERNAL "" FORCE)
    if(${result})
      message(STATUS "Performing Test ${output} - Success")
    else()
      message(STATUS "Performing Test ${output} - Failed")
    endif()
  endif()
endfunction()

# These flags are required to emit LLVM Bitcode. We check them together because
# if any of them are not supported, there is no point in finding out which are.
set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})

# It makes no sense to continue given that the compiler doesn't support
# emitting basic LLVM Bitcode
if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
  return()
endif()

set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required})

# Declaring external shared device variables might need an additional flag
# since Clang 7.0 and was entirely unsupported since version 4.0.
set(extern_device_shared_src "extern __device__ __shared__ int test;")

check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED)
  set(compiler_flag_fcuda_rdc -fcuda-rdc)
  set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc})
  check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})

  if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
    return()
  endif()

  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}")
endif()

# We can compile LLVM Bitcode from CUDA source code!
set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE)
+165 −143
Original line number Diff line number Diff line
@@ -10,6 +10,37 @@
#
##===----------------------------------------------------------------------===##

# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
# an LLVM linker.
set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
  "Location of a CUDA compiler capable of emitting LLVM bitcode.")
set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
  "Location of a linker capable of linking LLVM bitcode objects.")

if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
  set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  set(cuda_compiler ${CMAKE_C_COMPILER})
else()
  libomptarget_say("Not building NVPTX deviceRTL: clang not found")
  return()
endif()

# Get compiler directory to try to locate a suitable linker.
get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
set(llvm_link "${compiler_dir}/llvm-link")

if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
  set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
elseif (EXISTS ${llvm_link})
  set(bc_linker ${llvm_link})
else()
  libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
  return()
endif()

# TODO: This part needs to be refined when libomptarget is going to support
# Windows!
# TODO: This part can also be removed if we can change the clang driver to make
@@ -33,25 +64,29 @@ set(devicertl_common_directory
set(devicertl_nvptx_directory
  ${devicertl_base_directory}/nvptx)

if(LIBOMPTARGET_DEP_CUDA_FOUND)
  # Build library support for the highest compute capability the system supports
  # and always build support for sm_35 by default
  if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35)
    set(default_capabilities 35)
if (DEFINED LIBOMPTARGET_DEP_CUDA_ARCH)
  set(default_capabilities ${LIBOMPTARGET_DEP_CUDA_ARCH})
else()
      set(default_capabilities "35,${LIBOMPTARGET_DEP_CUDA_ARCH}")
  set(default_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
endif()

  if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
    set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
    libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
  endif()
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
  "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
  string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})

set(nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})

# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
# build.
if (NOT nvptx_sm_list)
  libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
  return()
endif()

# Check all SM values
foreach(sm ${nvptx_sm_list})
    set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
  if (NOT ${sm} IN_LIST default_capabilities)
    message(FATAL_ERROR "LIBOMPTARGET-NVPTX: compute capability ${sm} is not supported. Supported values: ${default_capabilities}")
  endif()
endforeach()

# Override default MAX_SM in src/target_impl.h if requested
@@ -63,18 +98,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
  "Activate NVPTX device RTL debug messages.")

  # Check if we can create an LLVM bitcode implementation of the runtime library
  # that could be inlined in the user application. For that we need to find
  # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
  # an LLVM linker.
  set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
    "Location of a CUDA compiler capable of emitting LLVM bitcode.")
  set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
    "Location of a linker capable of linking LLVM bitcode objects.")

  include(LibomptargetNVPTXBitcodeLibrary)

  if (LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED)
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")

set(cuda_src_files
@@ -143,7 +166,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")

      add_custom_command(OUTPUT ${outfile}
            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
        COMMAND ${cuda_compiler} ${bc_flags}
          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
        DEPENDS ${infile}
        IMPLICIT_DEPENDS CXX ${infile}
@@ -159,7 +182,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)

    # Link to a bitcode library.
    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
        COMMAND ${bc_linker}
          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
        DEPENDS ${bc_files}
        COMMENT "Linking LLVM bitcode ${bclib_name}"
@@ -180,9 +203,8 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
  endforeach()
endforeach()
  endif()

# Test will be enabled if the building machine supports CUDA
if (LIBOMPTARGET_DEP_CUDA_FOUND)
  add_subdirectory(test)
else()
  libomptarget_say("Not building CUDA offloading device RTL: tools to build bc lib not found in the system.")
endif()