Commit 5e407950 authored by Tobias Grosser's avatar Tobias Grosser
Browse files

isl: Detect openmp parallelism

Based on code written by Riyadh Baghdadi.

Merged from: https://llvm.org/svn/llvm-project/polly/trunk@170102

llvm-svn: 170753
parent 2c9d1010
Loading
Loading
Loading
Loading
+205 −6
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "polly/CodeGen/IslAst.h"

#include "polly/LinkAllPasses.h"
#include "polly/Dependences.h"
#include "polly/ScopInfo.h"

#define DEBUG_TYPE "polly-ast"
@@ -43,10 +44,14 @@ static cl::opt<bool>
UseContext("polly-ast-use-context", cl::desc("Use context"), cl::Hidden,
           cl::init(false), cl::ZeroOrMore);

static cl::opt<bool>
DetectParallel("polly-ast-detect-parallel", cl::desc("Detect parallelism"),
               cl::Hidden, cl::init(false), cl::ZeroOrMore);

namespace polly {
class IslAst {
public:
  IslAst(Scop *Scop);
  IslAst(Scop *Scop, Dependences &D);

  ~IslAst();

@@ -72,7 +77,182 @@ static void IslAstUserFree(void *User)
  free(UserStruct);
}

static __isl_give isl_ast_node *AtEachDomain(__isl_keep isl_ast_node *Node,
// Information about an ast node, attached to each for node via an isl_id
// annotation (see astBuildBeforeFor).
struct AstNodeUserInfo {
  // Set to 1 iff this node is the outermost for node of a loop that was
  // proven parallel, i.e. the loop a "#pragma omp parallel for" is printed
  // for (see printParallelFor).
  int IsOutermostParallel;
};

// Temporary information used when building the ast; passed as the user
// pointer to the before/after-each-for callbacks of the isl_ast_build.
struct AstBuildUserInfo {
  // The dependence information used to test loops for parallelism.
  Dependences *Deps;

  // Non-zero while we are inside a for node already marked as the outermost
  // parallel loop; suppresses marking any loop nested inside it.
  int InParallelFor;
};

// Print a loop annotated with OpenMP pragmas.
//
// If Info marks this node as the outermost parallel loop, a
// "#pragma omp parallel for" line is emitted before the for node itself is
// printed.  Info may be NULL, in which case the node is printed unannotated.
static __isl_give isl_printer *
printParallelFor(__isl_keep isl_ast_node *Node, __isl_take isl_printer *Printer,
                 __isl_take isl_ast_print_options *PrintOptions,
                 AstNodeUserInfo *Info) {
  if (Info && Info->IsOutermostParallel) {
    // The enclosing condition already guarantees IsOutermostParallel, so we
    // print the pragma unconditionally here (the original re-tested it).
    Printer = isl_printer_start_line(Printer);
    Printer = isl_printer_print_str(Printer, "#pragma omp parallel for");
    Printer = isl_printer_end_line(Printer);
  }
  return isl_ast_node_for_print(Node, Printer, PrintOptions);
}

// Print an isl_ast_for, forwarding any AstNodeUserInfo annotation attached
// to the node so that OpenMP pragmas can be emitted for parallel loops.
static __isl_give isl_printer *
printFor(__isl_take isl_printer *Printer,
         __isl_take isl_ast_print_options *PrintOptions,
         __isl_keep isl_ast_node *Node, void *User) {
  isl_id *Annotation = isl_ast_node_get_annotation(Node);

  // Unannotated for nodes are printed as plain loops.
  if (!Annotation)
    return isl_ast_node_for_print(Node, Printer, PrintOptions);

  struct AstNodeUserInfo *NodeInfo;
  NodeInfo = (struct AstNodeUserInfo *) isl_id_get_user(Annotation);
  Printer = printParallelFor(Node, Printer, PrintOptions, NodeInfo);

  isl_id_free(Annotation);
  return Printer;
}

// Allocate an AstNodeUserInfo structure and initialize it with default
// values.
//
// The structure is released by freeAstNodeUserInfo(), which isl invokes as
// the free-callback of the annotation id; it must therefore be allocated
// with malloc rather than new.
//
// Returns NULL if the allocation fails.
static struct AstNodeUserInfo *allocateAstNodeUserInfo() {
  struct AstNodeUserInfo *NodeInfo;
  NodeInfo = (struct AstNodeUserInfo *) malloc(sizeof(struct AstNodeUserInfo));

  // malloc may return NULL on out-of-memory; the original wrote through the
  // pointer unconditionally, which is undefined behavior in that case.
  if (!NodeInfo)
    return NULL;

  NodeInfo->IsOutermostParallel = 0;
  return NodeInfo;
}

// Release an AstNodeUserInfo structure.
//
// Registered via isl_id_set_free_user() as the free-callback of the
// annotation id, hence the void-pointer signature.  free() accepts the
// untyped pointer directly; no cast is required.
static void freeAstNodeUserInfo(void *Ptr) {
  free(Ptr);
}

// Check if the current scheduling dimension is parallel.
//
// We check for parallelism by verifying that the loop does not carry any
// dependences.
//
// Parallelism test: if the distance is zero in all outer dimensions, then it
// has to be zero in the current dimension as well.
//
// Implementation: first, translate dependences into time space, then force
// outer dimensions to be equal. If the distance is zero in the current
// dimension, then the loop is parallel. The distance is zero in the current
// dimension if it is a subset of a map with equal values for the current
// dimension.
static bool astScheduleDimIsParallel(__isl_keep isl_ast_build *Build,
                                     Dependences *D) {
  isl_union_map *Schedule, *Deps;
  isl_map *ScheduleDeps, *Test;
  isl_space *ScheduleSpace;
  unsigned Dimension, IsParallel;

  Schedule = isl_ast_build_get_schedule(Build);
  ScheduleSpace = isl_ast_build_get_schedule_space(Build);

  // The dimension being built right now is the innermost (last) output
  // dimension of the schedule space.
  Dimension = isl_space_dim(ScheduleSpace, isl_dim_out) - 1;

  Deps = D->getDependences(Dependences::TYPE_ALL);
  // Translate the dependences into the scheduling (time) space: apply the
  // schedule to both sides of the dependence relation.  The first apply
  // copies Schedule, the second consumes it.
  Deps = isl_union_map_apply_range(Deps, isl_union_map_copy(Schedule));
  Deps = isl_union_map_apply_domain(Deps, Schedule);

  // No dependences at all: the dimension is trivially parallel.
  if (isl_union_map_is_empty(Deps)) {
    isl_union_map_free(Deps);
    isl_space_free(ScheduleSpace);
    return 1;
  }

  ScheduleDeps = isl_map_from_union_map(Deps);

  // Force all outer dimensions to be equal ...
  for (unsigned i = 0; i < Dimension; i++)
    ScheduleDeps = isl_map_equate(ScheduleDeps, isl_dim_out, i, isl_dim_in, i);

  // ... then the loop is parallel iff the remaining dependences have zero
  // distance in the current dimension, i.e. they are a subset of the
  // universe map that equates the current input and output dimension.
  Test = isl_map_universe(isl_map_get_space(ScheduleDeps));
  Test = isl_map_equate(Test, isl_dim_out, Dimension, isl_dim_in, Dimension);
  IsParallel = isl_map_is_subset(ScheduleDeps, Test);

  isl_space_free(ScheduleSpace);
  isl_map_free(Test);
  isl_map_free(ScheduleDeps);

  return IsParallel;
}

// Mark a for node openmp parallel, if it is the outermost parallel for node.
//
// A node nested inside an already-marked parallel loop is never marked
// (BuildInfo->InParallelFor is set); otherwise the current scheduling
// dimension is tested for parallelism and, on success, both the build state
// and the node annotation are updated.
static void markOpenmpParallel(__isl_keep isl_ast_build *Build,
                               struct AstBuildUserInfo *BuildInfo,
                               struct AstNodeUserInfo *NodeInfo) {
  bool NestedInParallelFor = BuildInfo->InParallelFor;

  if (NestedInParallelFor)
    return;

  if (!astScheduleDimIsParallel(Build, BuildInfo->Deps))
    return;

  BuildInfo->InParallelFor = 1;
  NodeInfo->IsOutermostParallel = 1;
}

// This method is executed before the construction of a for node. It creates
// an isl_id that is used to annotate the subsequently generated ast for nodes.
//
// In this function we also run the following analyses:
//
// - Detection of openmp parallel loops
//
static __isl_give isl_id *astBuildBeforeFor(__isl_keep isl_ast_build *Build,
                                            void *User) {
  struct AstBuildUserInfo *BuildInfo = (struct AstBuildUserInfo *) User;
  struct AstNodeUserInfo *NodeInfo = allocateAstNodeUserInfo();

  markOpenmpParallel(Build, BuildInfo, NodeInfo);

  // The empty-named id carries NodeInfo as its user pointer; isl releases
  // it through freeAstNodeUserInfo once the id's refcount drops to zero.
  isl_id *Id = isl_id_alloc(isl_ast_build_get_ctx(Build), "", NodeInfo);
  return isl_id_set_free_user(Id, freeAstNodeUserInfo);
}

// This method is executed after the construction of a for node.
//
// It performs the following actions:
//
// - Reset the 'InParallelFor' flag, as soon as we leave a for node,
//   that is marked as openmp parallel.
//
static __isl_give isl_ast_node *
astBuildAfterFor(__isl_take isl_ast_node *Node,
                 __isl_keep isl_ast_build *Build, void *User) {
  isl_id *Annotation = isl_ast_node_get_annotation(Node);

  if (!Annotation)
    return Node;

  struct AstNodeUserInfo *NodeInfo =
      (struct AstNodeUserInfo *) isl_id_get_user(Annotation);

  // Leaving the outermost parallel loop: sibling loops built afterwards may
  // again be marked as outermost parallel.
  if (NodeInfo && NodeInfo->IsOutermostParallel) {
    struct AstBuildUserInfo *BuildInfo = (struct AstBuildUserInfo *) User;
    BuildInfo->InParallelFor = 0;
  }

  isl_id_free(Annotation);
  return Node;
}

static __isl_give isl_ast_node *
AtEachDomain(__isl_keep isl_ast_node *Node,
             __isl_keep isl_ast_build *Context, void *User)
{
  isl_map *Map;
@@ -90,10 +270,11 @@ static __isl_give isl_ast_node *AtEachDomain(__isl_keep isl_ast_node *Node,
  return isl_ast_node_set_annotation(Node, Annotation);
}

IslAst::IslAst(Scop *Scop) : S(Scop) {
IslAst::IslAst(Scop *Scop, Dependences &D) : S(Scop) {
  isl_ctx *Ctx = S->getIslCtx();
  isl_options_set_ast_build_atomic_upper_bound(Ctx, true);
  isl_ast_build *Context;
  struct AstBuildUserInfo BuildInfo;

  if (UseContext)
    Context = isl_ast_build_from_context(S->getContext());
@@ -112,6 +293,16 @@ IslAst::IslAst(Scop *Scop) : S(Scop) {
    isl_union_map_dump(Schedule);
  );

  if (DetectParallel) {
    BuildInfo.Deps = &D;
    BuildInfo.InParallelFor = 0;

    Context = isl_ast_build_set_before_each_for(Context, &astBuildBeforeFor,
                                                &BuildInfo);
    Context = isl_ast_build_set_after_each_for(Context, &astBuildAfterFor,
                                               &BuildInfo);
  }

  Root = isl_ast_build_ast_from_schedule(Context, Schedule);

  isl_ast_build_free(Context);
@@ -141,7 +332,11 @@ IslAst::~IslAst() {
/// Print a C like representation of the program.
void IslAst::pprint(llvm::raw_ostream &OS) {
  isl_ast_node *Root;
  isl_ast_print_options *Options = isl_ast_print_options_alloc(S->getIslCtx());
  isl_ast_print_options *Options;

  Options = isl_ast_print_options_alloc(S->getIslCtx());
  Options = isl_ast_print_options_set_print_for(Options, &printFor, NULL);

  isl_printer *P = isl_printer_to_str(S->getIslCtx());
  P = isl_printer_set_output_format(P, ISL_FORMAT_C);
  Root = getAst();
@@ -174,7 +369,9 @@ bool IslAstInfo::runOnScop(Scop &Scop) {

  S = &Scop;

  Ast = new IslAst(&Scop);
  Dependences &D = getAnalysis<Dependences>();

  Ast = new IslAst(&Scop, D);

  return false;
}
@@ -195,12 +392,14 @@ void IslAstInfo::getAnalysisUsage(AnalysisUsage &AU) const {
  // Get the Common analysis usage of ScopPasses.
  ScopPass::getAnalysisUsage(AU);
  AU.addRequired<ScopInfo>();
  AU.addRequired<Dependences>();
}
char IslAstInfo::ID = 0;

// Register the pass and its analysis dependencies.  The BEGIN and END
// descriptions previously disagreed ("of the SCoP" vs. "from the SCoP");
// they must be the same string, as they describe the same pass.
INITIALIZE_PASS_BEGIN(IslAstInfo, "polly-ast",
                      "Generate an AST of the SCoP (isl)", false, false)
INITIALIZE_PASS_DEPENDENCY(ScopInfo)
INITIALIZE_PASS_DEPENDENCY(Dependences)
INITIALIZE_PASS_END(IslAstInfo, "polly-ast",
                    "Generate an AST of the SCoP (isl)", false, false)

+46 −0
Original line number Diff line number Diff line
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

; for (i = 0; i < 1024; i++)
;   for (j = 0; j < 1024; j++)
;     A[i][j] = 1;

@A = common global [1024 x [1024 x i32]] zeroinitializer
; Two nested loops with fixed trip count 1024, writing each element of @A
; exactly once.  NOTE(review): the store indexes @A with (%j, %i) — transposed
; relative to the A[i][j] sketch above — but every element is still written
; once, so the parallelism result is unaffected.
define void @bar() {
start:
  fence seq_cst
  br label %loop.i

; Outer loop: i = 0 .. 1023.
loop.i:
  %i = phi i64 [ 0, %start ], [ %i.next, %loop.i.backedge ]
  %exitcond.i = icmp ne i64 %i, 1024
  br i1 %exitcond.i, label %loop.j, label %ret

; Inner loop: j = 0 .. 1023.
loop.j:
  %j = phi i64 [ 0, %loop.i], [ %j.next, %loop.j.backedge ]
  %exitcond.j = icmp ne i64 %j, 1024
  br i1 %exitcond.j, label %loop.body, label %loop.i.backedge

loop.body:
  %scevgep = getelementptr [1024 x [1024 x i32] ]* @A, i64 0, i64 %j, i64 %i
  store i32 1, i32* %scevgep
  br label %loop.j.backedge

loop.j.backedge:
  %j.next = add nsw i64 %j, 1
  br label %loop.j

loop.i.backedge:
  %i.next = add nsw i64 %i, 1
  br label %loop.i

ret:
  fence seq_cst
  ret void
}

; CHECK: #pragma omp parallel for
; CHECK: for (int c1 = 0; c1 <= 1023; c1 += 1)
; CHECK:   for (int c3 = 0; c3 <= 1023; c3 += 1)
; CHECK:     Stmt_loop_body(c1, c3);
+55 −0
Original line number Diff line number Diff line
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

; for (i = 0; i < n; i++)
;   for (j = 0; j < n; j++)
;     A[i][j] = 1;

@A = common global [1024 x [1024 x i32]] zeroinitializer
; Same loop nest as the previous test but with a parametric bound %n.
; NOTE(review): as above, the store indexes @A with (%j, %i), transposed
; relative to the C sketch; see the dependence discussion below the function.
define void @bar(i64 %n) {
start:
  fence seq_cst
  br label %loop.i

; Outer loop: i = 0 .. n-1.
loop.i:
  %i = phi i64 [ 0, %start ], [ %i.next, %loop.i.backedge ]
  %exitcond.i = icmp ne i64 %i, %n
  br i1 %exitcond.i, label %loop.j, label %ret

; Inner loop: j = 0 .. n-1.
loop.j:
  %j = phi i64 [ 0, %loop.i], [ %j.next, %loop.j.backedge ]
  %exitcond.j = icmp ne i64 %j, %n
  br i1 %exitcond.j, label %loop.body, label %loop.i.backedge

loop.body:
  %scevgep = getelementptr [1024 x [1024 x i32] ]* @A, i64 0, i64 %j, i64 %i
  store i32 1, i32* %scevgep
  br label %loop.j.backedge

loop.j.backedge:
  %j.next = add nsw i64 %j, 1
  br label %loop.j

loop.i.backedge:
  %i.next = add nsw i64 %i, 1
  br label %loop.i

ret:
  fence seq_cst
  ret void
}

; At the first look both loops seem parallel, however due to the delinearization
; we get the following dependences:
;    [n] -> { loop_body[i0, i1] -> loop_body[1024 + i0, -1 + i1]:
;                                           0 <= i0 < n - 1024  and 1 <= i1 < n}
; They cause the outer loop to be non-parallel.  We can only prove their
; absence, if we know that n < 1024. This information is currently not available
; to polly. However, we should be able to obtain it due to the out of bounds
; memory accesses, that would happen if n >= 1024.
;
; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
; CHECK:   #pragma omp parallel for
; CHECK:   for (int c3 = 0; c3 < n; c3 += 1)
; CHECK:     Stmt_loop_body(c1, c3);
+46 −0
Original line number Diff line number Diff line
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

; for (i = 0; i < n; i++)
;   for (j = 0; j < n; j++)
;     A[j] = 1;

@A = common global [1024 x i32] zeroinitializer
; A[j] is rewritten on every outer iteration, so the outer loop carries
; output dependences and only the inner loop is parallel (matches the
; CHECK lines below: the pragma is printed on the inner loop).
define void @bar(i64 %n) {
start:
  fence seq_cst
  br label %loop.i

; Outer loop: i = 0 .. n-1 (non-parallel: repeats all writes to @A).
loop.i:
  %i = phi i64 [ 0, %start ], [ %i.next, %loop.i.backedge ]
  %exitcond.i = icmp ne i64 %i, %n
  br i1 %exitcond.i, label %loop.j, label %ret

; Inner loop: j = 0 .. n-1 (parallel: each j writes a distinct A[j]).
loop.j:
  %j = phi i64 [ 0, %loop.i], [ %j.next, %loop.j.backedge ]
  %exitcond.j = icmp ne i64 %j, %n
  br i1 %exitcond.j, label %loop.body, label %loop.i.backedge

loop.body:
  %scevgep = getelementptr [1024 x i32]* @A, i64 0, i64 %j
  store i32 1, i32* %scevgep
  br label %loop.j.backedge

loop.j.backedge:
  %j.next = add nsw i64 %j, 1
  br label %loop.j

loop.i.backedge:
  %i.next = add nsw i64 %i, 1
  br label %loop.i

ret:
  fence seq_cst
  ret void
}

; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
; CHECK:   #pragma omp parallel for
; CHECK:   for (int c3 = 0; c3 < n; c3 += 1)
; CHECK:     Stmt_loop_body(c1, c3);
+46 −0
Original line number Diff line number Diff line
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

; for (i = 0; i < n; i++)
;   for (j = 0; j < n; j++)
;     A[i] = 1;

@A = common global [1024 x i32] zeroinitializer
; A[i] depends only on the outer index, so every inner iteration of a fixed i
; writes the same cell: the inner loop carries output dependences.  Only the
; outer loop is parallel — but note the CHECK lines below place the pragma on
; the outer loop, before the for.
define void @bar(i64 %n) {
start:
  fence seq_cst
  br label %loop.i

; Outer loop: i = 0 .. n-1 (parallel: distinct A[i] per iteration).
loop.i:
  %i = phi i64 [ 0, %start ], [ %i.next, %loop.i.backedge ]
  %exitcond.i = icmp ne i64 %i, %n
  br i1 %exitcond.i, label %loop.j, label %ret

; Inner loop: j = 0 .. n-1 (non-parallel: rewrites A[i] every iteration).
loop.j:
  %j = phi i64 [ 0, %loop.i], [ %j.next, %loop.j.backedge ]
  %exitcond.j = icmp ne i64 %j, %n
  br i1 %exitcond.j, label %loop.body, label %loop.i.backedge

loop.body:
  %scevgep = getelementptr [1024 x i32]* @A, i64 0, i64 %i
  store i32 1, i32* %scevgep
  br label %loop.j.backedge

loop.j.backedge:
  %j.next = add nsw i64 %j, 1
  br label %loop.j

loop.i.backedge:
  %i.next = add nsw i64 %i, 1
  br label %loop.i

ret:
  fence seq_cst
  ret void
}

; CHECK: #pragma omp parallel for
; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
; CHECK:   for (int c3 = 0; c3 < n; c3 += 1)
; CHECK:     Stmt_loop_body(c1, c3);
Loading