Unverified commit 333f636c, authored by Sergio Afonso, committed by GitHub
Browse files

[OpenMPOpt] Make parallel regions reachable from new DeviceRTL loop functions (#150927)

This patch updates the OpenMP optimization pass to know about the new
DeviceRTL functions for loop constructs.

This change marks these functions as potentially containing parallel
regions, which fixes a current bug with the state machine rewrite
optimization. It previously failed to identify parallel regions located
inside of the callbacks passed to these new DeviceRTL functions, causing
the resulting code to skip executing these parallel regions.

As a result, Generic kernels produced by Flang that contain parallel
regions now work properly.

One known related issue not fixed by this patch is that the presence of
calls to these functions will prevent the SPMD-ization of Generic
kernels by OpenMPOpt. Previously, this was due to assuming there was no
parallel region. This is changed by this patch, but instead we now mark
it temporarily as unsupported in an SPMD context. The reason is that,
without additional changes, code intended for the main thread of the
team located outside of the parallel region would not be guarded
properly, resulting in race conditions and generally invalid behavior.
parent d463a276
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line
@@ -5021,6 +5021,29 @@ struct AAKernelInfoCallSite : AAKernelInfo {
      case OMPRTL___kmpc_free_shared:
        // Return without setting a fixpoint, to be resolved in updateImpl.
        return;
      case OMPRTL___kmpc_distribute_static_loop_4:
      case OMPRTL___kmpc_distribute_static_loop_4u:
      case OMPRTL___kmpc_distribute_static_loop_8:
      case OMPRTL___kmpc_distribute_static_loop_8u:
      case OMPRTL___kmpc_distribute_for_static_loop_4:
      case OMPRTL___kmpc_distribute_for_static_loop_4u:
      case OMPRTL___kmpc_distribute_for_static_loop_8:
      case OMPRTL___kmpc_distribute_for_static_loop_8u:
      case OMPRTL___kmpc_for_static_loop_4:
      case OMPRTL___kmpc_for_static_loop_4u:
      case OMPRTL___kmpc_for_static_loop_8:
      case OMPRTL___kmpc_for_static_loop_8u:
        // Parallel regions might be reached by these calls, as they take a
        // callback argument potentially containing arbitrary user-provided
        // code.
        ReachedUnknownParallelRegions.insert(&CB);
        // TODO: The presence of these calls on their own does not prevent a
        // kernel from being SPMD-izable. We mark it as such because we need
        // further changes in order to also consider the contents of the
        // callbacks passed to them.
        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
        SPMDCompatibilityTracker.insert(&CB);
        break;
      default:
        // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
        // generally. However, they do not hide parallel regions.
+130 −0
Original line number Diff line number Diff line
! Offloading test for generic target regions containing different kinds of
! loop constructs inside.
! REQUIRES: flang, amdgpu

! RUN: %libomptarget-compile-fortran-run-and-check-generic
program main
  ! Runs eight offloaded reduction scenarios that mix sequential statements
  ! with loop constructs inside target regions. After each scenario it prints
  ! "<case id> <counter>" so the CHECK lines can verify the accumulated value.
  implicit none

  ! Two-column case id, one blank, then the counter in minimal width.
  ! The items are comma-separated as the Fortran standard requires.
  character(len=*), parameter :: result_fmt = '(i2, " ", i0)'

  integer :: i1, i2, n1, n2, counter

  n1 = 100
  n2 = 50

  ! Case 1: a single teams-distributed loop inside a target region.
  ! Expected: n1 increments.
  counter = 0
  !$omp target map(tofrom:counter)
    !$omp teams distribute reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
  !$omp end target

  ! CHECK: 1 100
  print result_fmt, 1, counter

  ! Case 2: two consecutive parallel loops inside one target region.
  ! Expected: 2 * n1 increments.
  counter = 0
  !$omp target map(tofrom:counter)
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
  !$omp end target

  ! CHECK: 2 200
  print result_fmt, 2, counter

  ! Case 3: two parallel loops with a single increment before, between and
  ! after them, all inside one target region.
  ! Expected: 2 * n1 + 3 increments.
  counter = 0
  !$omp target map(tofrom:counter)
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
    counter = counter + 1
  !$omp end target

  ! CHECK: 3 203
  print result_fmt, 3, counter

  ! Case 4: one parallel loop with a single increment before and after it.
  ! Expected: n1 + 2 increments.
  counter = 0
  !$omp target map(tofrom: counter)
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
    counter = counter + 1
  !$omp end target

  ! CHECK: 4 102
  print result_fmt, 4, counter


  ! Case 5: a parallel loop nested in every iteration of a teams-distributed
  ! loop. Expected: n1 * n2 increments.
  counter = 0
  !$omp target teams distribute reduction(+:counter)
  do i1=1, n1
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
  end do

  ! CHECK: 5 5000
  print result_fmt, 5, counter

  ! Case 6: as case 5, plus one increment before and after the nested loop in
  ! each outer iteration. Expected: n1 * (n2 + 2) increments.
  counter = 0
  !$omp target teams distribute reduction(+:counter)
  do i1=1, n1
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
    counter = counter + 1
  end do

  ! CHECK: 6 5200
  print result_fmt, 6, counter

  ! Case 7: two consecutive nested parallel loops per outer iteration.
  ! Expected: n1 * 2 * n2 increments.
  counter = 0
  !$omp target teams distribute reduction(+:counter)
  do i1=1, n1
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
  end do

  ! CHECK: 7 10000
  print result_fmt, 7, counter

  ! Case 8: two nested parallel loops per outer iteration with a single
  ! increment before, between and after them.
  ! Expected: n1 * (2 * n2 + 3) increments.
  counter = 0
  !$omp target teams distribute reduction(+:counter)
  do i1=1, n1
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
    counter = counter + 1
    !$omp parallel do reduction(+:counter)
    do i2=1, n2
      counter = counter + 1
    end do
    counter = counter + 1
  end do

  ! CHECK: 8 10300
  print result_fmt, 8, counter
end program main
+39 −0
Original line number Diff line number Diff line
! Offloading test for generic target regions containing different kinds of
! loop constructs inside.
! REQUIRES: flang, amdgpu

! RUN: %libomptarget-compile-fortran-run-and-check-generic
program main
  ! Runs three offloaded reduction scenarios in which the target region
  ! contains exactly one parallel loop. After each scenario it prints
  ! "<case id> <counter>" so the CHECK lines can verify the accumulated value.
  implicit none

  ! Two-column case id, one blank, then the counter in minimal width.
  ! The items are comma-separated as the Fortran standard requires.
  character(len=*), parameter :: result_fmt = '(i2, " ", i0)'

  integer :: i1, n1, counter

  n1 = 100

  ! Case 1: combined "target parallel do" construct.
  ! Expected: n1 increments.
  counter = 0
  !$omp target parallel do reduction(+:counter)
  do i1=1, n1
    counter = counter + 1
  end do

  ! CHECK: 1 100
  print result_fmt, 1, counter

  ! Case 2: a "parallel do" as the only content of an explicit target region.
  ! Expected: n1 increments.
  counter = 0
  !$omp target map(tofrom:counter)
    !$omp parallel do reduction(+:counter)
    do i1=1, n1
      counter = counter + 1
    end do
  !$omp end target

  ! CHECK: 2 100
  print result_fmt, 2, counter

  ! Case 3: combined "target teams distribute parallel do" construct.
  ! Expected: n1 increments.
  counter = 0
  !$omp target teams distribute parallel do reduction(+:counter)
  do i1=1, n1
    counter = counter + 1
  end do

  ! CHECK: 3 100
  print result_fmt, 3, counter
end program main