Commit e702bdb8 authored by Ilya Biryukov's avatar Ilya Biryukov
Browse files

[Syntax] Build SimpleDeclaration node that groups multiple declarators

Summary:
Also remove the temporary TopLevelDeclaration node and add
UnknownDeclaration to represent other unknown nodes.

See the follow-up change for building more top-level declarations.
Adding declarators is also pretty involved and will be done in another
follow-up patch.

Reviewers: gribozavr2

Reviewed By: gribozavr2

Subscribers: merge_guards_bot, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D70787
parent a48b5e24
Loading
Loading
Loading
Loading
+37 −16
Original line number Diff line number Diff line
@@ -37,7 +37,6 @@ namespace syntax {
enum class NodeKind : uint16_t {
  Leaf,
  TranslationUnit,
  TopLevelDeclaration,

  // Expressions
  UnknownExpression,
@@ -57,7 +56,11 @@ enum class NodeKind : uint16_t {
  ReturnStatement,
  RangeBasedForStatement,
  ExpressionStatement,
  CompoundStatement
  CompoundStatement,

  // Declarations
  UnknownDeclaration,
  SimpleDeclaration,
};
/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, NodeKind K);
@@ -102,20 +105,6 @@ public:
  }
};

/// FIXME: this node is temporary and will be replaced with nodes for various
///        'declarations' and 'declarators' from the C/C++ grammar
///
/// Represents any top-level declaration. Only there to give the syntax tree a
/// bit of structure until we implement syntax nodes for declarations and
/// declarators.
class TopLevelDeclaration final : public Tree {
public:
  TopLevelDeclaration() : Tree(NodeKind::TopLevelDeclaration) {}
  static bool classof(const Node *N) {
    return N->kind() == NodeKind::TopLevelDeclaration;
  }
};

/// A base class for all expressions. Note that expressions are not statements,
/// even though they are in clang.
class Expression : public Tree {
@@ -313,6 +302,38 @@ public:
  syntax::Leaf *rbrace();
};

/// A declaration that can appear at the top-level. Note that this does *not*
/// correspond 1-to-1 to clang::Decl. Syntax trees distinguish between top-level
/// declarations (e.g. namespace definitions) and declarators (e.g. variables,
/// typedefs, etc.). Declarators are stored inside SimpleDeclaration.
class Declaration : public Tree {
public:
  Declaration(NodeKind K) : Tree(K) {}
  static bool classof(const Node *N) {
    return NodeKind::UnknownDeclaration <= N->kind() &&
           N->kind() <= NodeKind::SimpleDeclaration;
  }
};

/// Declaration of an unknown kind, e.g. not yet supported in syntax trees.
class UnknownDeclaration final : public Declaration {
public:
  UnknownDeclaration() : Declaration(NodeKind::UnknownDeclaration) {}
  static bool classof(const Node *N) {
    return N->kind() == NodeKind::UnknownDeclaration;
  }
};

/// Groups multiple declarators (e.g. variables, typedefs, etc.) together. All
/// grouped declarators share the same declaration specifiers (e.g. 'int' or
/// 'typedef').
class SimpleDeclaration final : public Declaration {
public:
  SimpleDeclaration() : Declaration(NodeKind::SimpleDeclaration) {}
  static bool classof(const Node *N) {
    return N->kind() == NodeKind::SimpleDeclaration;
  }
};
} // namespace syntax
} // namespace clang
#endif
+136 −35
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/BuildTree.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclBase.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/Stmt.h"
#include "clang/Basic/LLVM.h"
@@ -56,6 +58,14 @@ public:
  /// Range.
  void foldNode(llvm::ArrayRef<syntax::Token> Range, syntax::Tree *New);

  /// Must be called with the range of each `DeclaratorDecl`. Ensures the
  /// corresponding declarator nodes are covered by `SimpleDeclaration`.
  void noticeDeclaratorRange(llvm::ArrayRef<syntax::Token> Range);

  /// Notifies that we should not consume trailing semicolon when computing
  /// token range of \p D.
  void noticeDeclaratorWithoutSemicolon(Decl *D);

  /// Mark the \p Child node with a corresponding \p Role. All marked children
  /// should be consumed by foldNode.
  /// (!) when called on expressions (clang::Expr is derived from clang::Stmt),
@@ -94,7 +104,14 @@ public:
    return llvm::makeArrayRef(findToken(First), std::next(findToken(Last)));
  }
  llvm::ArrayRef<syntax::Token> getRange(const Decl *D) const {
    return getRange(D->getBeginLoc(), D->getEndLoc());
    auto Tokens = getRange(D->getBeginLoc(), D->getEndLoc());
    if (llvm::isa<NamespaceDecl>(D))
      return Tokens;
    if (DeclsWithoutSemicolons.count(D))
      return Tokens;
    // FIXME: do not consume trailing semicolon on function definitions.
    // Most declarations own a semicolon in syntax trees, but not in clang AST.
    return withTrailingSemicolon(Tokens);
  }
  llvm::ArrayRef<syntax::Token> getExprRange(const Expr *E) const {
    return getRange(E->getBeginLoc(), E->getEndLoc());
@@ -108,14 +125,22 @@ public:

    // Some statements miss a trailing semicolon, e.g. 'return', 'continue' and
    // all statements that end with those. Consume this semicolon here.
    //
    // (!) statements never consume 'eof', so looking at the next token is ok.
    if (Tokens.back().kind() == tok::semi)
      return Tokens;
    return withTrailingSemicolon(Tokens);
  }

private:
  llvm::ArrayRef<syntax::Token>
  withTrailingSemicolon(llvm::ArrayRef<syntax::Token> Tokens) const {
    assert(!Tokens.empty());
    assert(Tokens.back().kind() != tok::eof);
    // (!) we never consume 'eof', so looking at the next token is ok.
    if (Tokens.back().kind() != tok::semi && Tokens.end()->kind() == tok::semi)
      return llvm::makeArrayRef(Tokens.begin(), Tokens.end() + 1);
    return Tokens;
  }

private:
  /// Finds a token starting at \p L. The token must exist.
  const syntax::Token *findToken(SourceLocation L) const;

@@ -136,6 +161,8 @@ private:
                     {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}});
    }

    ~Forest() { assert(DelayedFolds.empty()); }

    void assignRole(llvm::ArrayRef<syntax::Token> Range,
                    syntax::NodeRole Role) {
      assert(!Range.empty());
@@ -148,30 +175,46 @@ private:
      It->second.Role = Role;
    }

    /// Add \p Node to the forest and fill its children nodes based on the \p
    /// NodeRange.
    void foldChildren(llvm::ArrayRef<syntax::Token> NodeTokens,
    /// Add \p Node to the forest and attach child nodes based on \p Tokens.
    void foldChildren(llvm::ArrayRef<syntax::Token> Tokens,
                      syntax::Tree *Node) {
      assert(!NodeTokens.empty());
      assert(Node->firstChild() == nullptr && "node already has children");

      auto *FirstToken = NodeTokens.begin();
      auto BeginChildren = Trees.lower_bound(FirstToken);
      assert(BeginChildren != Trees.end() &&
             BeginChildren->first == FirstToken &&
             "fold crosses boundaries of existing subtrees");
      auto EndChildren = Trees.lower_bound(NodeTokens.end());
      assert((EndChildren == Trees.end() ||
              EndChildren->first == NodeTokens.end()) &&
             "fold crosses boundaries of existing subtrees");

      // (!) we need to go in reverse order, because we can only prepend.
      for (auto It = EndChildren; It != BeginChildren; --It)
        Node->prependChildLowLevel(std::prev(It)->second.Node,
                                   std::prev(It)->second.Role);

      Trees.erase(BeginChildren, EndChildren);
      Trees.insert({FirstToken, NodeAndRole(Node)});
      // Execute delayed folds inside `Tokens`.
      auto BeginExecuted = DelayedFolds.lower_bound(Tokens.begin());
      auto It = BeginExecuted;
      for (; It != DelayedFolds.end() && It->second.End <= Tokens.end(); ++It)
        foldChildrenEager(llvm::makeArrayRef(It->first, It->second.End),
                          It->second.Node);
      DelayedFolds.erase(BeginExecuted, It);

      // Attach children to `Node`.
      foldChildrenEager(Tokens, Node);
    }

    /// Schedule a call to `foldChildren` that will only be executed when
    /// containing node is folded. The range of delayed nodes can be extended by
    /// calling `extendDelayedFold`. Only one delayed node for each starting
    /// token is allowed.
    void foldChildrenDelayed(llvm::ArrayRef<syntax::Token> Tokens,
                             syntax::Tree *Node) {
      assert(!Tokens.empty());
      bool Inserted =
          DelayedFolds.insert({Tokens.begin(), DelayedFold{Tokens.end(), Node}})
              .second;
      (void)Inserted;
      assert(Inserted && "Multiple delayed folds start at the same token");
    }

    /// If there a delayed fold, starting at `ExtendedRange.begin()`, extends
    /// its endpoint to `ExtendedRange.end()` and returns true.
    /// Otherwise, returns false.
    bool extendDelayedFold(llvm::ArrayRef<syntax::Token> ExtendedRange) {
      assert(!ExtendedRange.empty());
      auto It = DelayedFolds.find(ExtendedRange.data());
      if (It == DelayedFolds.end())
        return false;
      assert(It->second.End <= ExtendedRange.end());
      It->second.End = ExtendedRange.end();
      return true;
    }

    // EXPECTS: all tokens were consumed and are owned by a single root node.
@@ -199,6 +242,30 @@ private:
    }

  private:
    /// Implementation detail of `foldChildren`, does acutal folding ignoring
    /// delayed folds.
    void foldChildrenEager(llvm::ArrayRef<syntax::Token> Tokens,
                           syntax::Tree *Node) {
      assert(Node->firstChild() == nullptr && "node already has children");

      auto *FirstToken = Tokens.begin();
      auto BeginChildren = Trees.lower_bound(FirstToken);
      assert((BeginChildren == Trees.end() ||
              BeginChildren->first == FirstToken) &&
             "fold crosses boundaries of existing subtrees");
      auto EndChildren = Trees.lower_bound(Tokens.end());
      assert(
          (EndChildren == Trees.end() || EndChildren->first == Tokens.end()) &&
          "fold crosses boundaries of existing subtrees");

      // (!) we need to go in reverse order, because we can only prepend.
      for (auto It = EndChildren; It != BeginChildren; --It)
        Node->prependChildLowLevel(std::prev(It)->second.Node,
                                   std::prev(It)->second.Role);

      Trees.erase(BeginChildren, EndChildren);
      Trees.insert({FirstToken, NodeAndRole(Node)});
    }
    /// A with a role that should be assigned to it when adding to a parent.
    struct NodeAndRole {
      explicit NodeAndRole(syntax::Node *Node)
@@ -214,6 +281,13 @@ private:
    /// FIXME: storing the end tokens is redundant.
    /// FIXME: the key of a map is redundant, it is also stored in NodeForRange.
    std::map<const syntax::Token *, NodeAndRole> Trees;

    /// See documentation of `foldChildrenDelayed` for details.
    struct DelayedFold {
      const syntax::Token *End = nullptr;
      syntax::Tree *Node = nullptr;
    };
    std::map<const syntax::Token *, DelayedFold> DelayedFolds;
  };

  /// For debugging purposes.
@@ -221,6 +295,7 @@ private:

  syntax::Arena &Arena;
  Forest Pending;
  llvm::DenseSet<Decl*> DeclsWithoutSemicolons;
};

namespace {
@@ -231,20 +306,30 @@ public:

  bool shouldTraversePostOrder() const { return true; }

  bool TraverseDecl(Decl *D) {
    if (!D || isa<TranslationUnitDecl>(D))
      return RecursiveASTVisitor::TraverseDecl(D);
    if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext()))
      return true; // Only build top-level decls for now, do not recurse.
    return RecursiveASTVisitor::TraverseDecl(D);
  bool WalkUpFromDeclaratorDecl(DeclaratorDecl *D) {
    // Ensure declarators are covered by SimpleDeclaration.
    Builder.noticeDeclaratorRange(Builder.getRange(D));
    // FIXME: build nodes for the declarator too.
    return true;
  }
  bool WalkUpFromTypedefNameDecl(TypedefNameDecl *D) {
    // Also a declarator.
    Builder.noticeDeclaratorRange(Builder.getRange(D));
    // FIXME: build nodes for the declarator too.
    return true;
  }

  bool VisitDecl(Decl *D) {
    assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) &&
           "expected a top-level decl");
    assert(!D->isImplicit());
    Builder.foldNode(Builder.getRange(D),
                     new (allocator()) syntax::TopLevelDeclaration());
                     new (allocator()) syntax::UnknownDeclaration());
    return true;
  }

  bool WalkUpFromTagDecl(TagDecl *C) {
    // Avoid building UnknownDeclaration here, syntatically 'struct X {}' and
    // similar are part of declaration specifiers and do not introduce a new
    // top-level declaration.
    return true;
  }

@@ -291,7 +376,11 @@ public:
  }

  bool TraverseStmt(Stmt *S) {
    if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) {
    if (auto *DS = llvm::dyn_cast_or_null<DeclStmt>(S)) {
      // We want to consume the semicolon, make sure SimpleDeclaration does not.
      for (auto *D : DS->decls())
        Builder.noticeDeclaratorWithoutSemicolon(D);
    } else if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) {
      // (!) do not recurse into subexpressions.
      // we do not have syntax trees for expressions yet, so we only want to see
      // the first top-level expression.
@@ -429,6 +518,18 @@ void syntax::TreeBuilder::foldNode(llvm::ArrayRef<syntax::Token> Range,
  Pending.foldChildren(Range, New);
}

void syntax::TreeBuilder::noticeDeclaratorRange(
    llvm::ArrayRef<syntax::Token> Range) {
  if (Pending.extendDelayedFold(Range))
    return;
  Pending.foldChildrenDelayed(Range,
                              new (allocator()) syntax::SimpleDeclaration);
}

void syntax::TreeBuilder::noticeDeclaratorWithoutSemicolon(Decl *D) {
  DeclsWithoutSemicolons.insert(D);
}

void syntax::TreeBuilder::markChildToken(SourceLocation Loc, NodeRole Role) {
  if (Loc.isInvalid())
    return;
+4 −2
Original line number Diff line number Diff line
@@ -16,8 +16,6 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
    return OS << "Leaf";
  case NodeKind::TranslationUnit:
    return OS << "TranslationUnit";
  case NodeKind::TopLevelDeclaration:
    return OS << "TopLevelDeclaration";
  case NodeKind::UnknownExpression:
    return OS << "UnknownExpression";
  case NodeKind::UnknownStatement:
@@ -50,6 +48,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
    return OS << "ExpressionStatement";
  case NodeKind::CompoundStatement:
    return OS << "CompoundStatement";
  case NodeKind::UnknownDeclaration:
    return OS << "UnknownDeclaration";
  case NodeKind::SimpleDeclaration:
    return OS << "SimpleDeclaration";
  }
  llvm_unreachable("unknown node kind");
}
+93 −25
Original line number Diff line number Diff line
@@ -130,7 +130,7 @@ void foo() {}
    )cpp",
          R"txt(
*: TranslationUnit
|-TopLevelDeclaration
|-SimpleDeclaration
| |-int
| |-main
| |-(
@@ -138,7 +138,7 @@ void foo() {}
| `-CompoundStatement
|   |-{
|   `-}
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-foo
  |-(
@@ -157,7 +157,7 @@ int main() {
        )cpp",
          R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-int
  |-main
  |-(
@@ -202,7 +202,7 @@ void test() {
)cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -224,7 +224,7 @@ void test() {
      {"void test() { int a = 10; }",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -232,16 +232,18 @@ void test() {
  `-CompoundStatement
    |-{
    |-DeclarationStatement
    | |-int
    | |-a
    | |-=
    | |-10
    | |-SimpleDeclaration
    | | |-int
    | | |-a
    | | |-=
    | | `-UnknownExpression
    | |   `-10
    | `-;
    `-}
)txt"},
      {"void test() { ; }", R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -263,7 +265,7 @@ void test() {
)cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -299,7 +301,7 @@ void test() {
)cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -329,7 +331,7 @@ int test() { return 1; }
      )cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-int
  |-test
  |-(
@@ -352,7 +354,7 @@ void test() {
      )cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -360,18 +362,21 @@ void test() {
  `-CompoundStatement
    |-{
    |-DeclarationStatement
    | |-int
    | |-a
    | |-[
    | |-3
    | |-]
    | |-SimpleDeclaration
    | | |-int
    | | |-a
    | | |-[
    | | |-UnknownExpression
    | | | `-3
    | | `-]
    | `-;
    |-RangeBasedForStatement
    | |-for
    | |-(
    | |-int
    | |-x
    | |-:
    | |-SimpleDeclaration
    | | |-int
    | | |-x
    | | `-:
    | |-UnknownExpression
    | | `-a
    | |-)
@@ -384,7 +389,7 @@ void test() {
      // counterpart.
      {"void main() { foo: return 100; }", R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-main
  |-(
@@ -411,7 +416,7 @@ void test() {
    )cpp",
       R"txt(
*: TranslationUnit
`-TopLevelDeclaration
`-SimpleDeclaration
  |-void
  |-test
  |-(
@@ -444,6 +449,69 @@ void test() {
    |   | `-)
    |   `-;
    `-}
)txt"},
      // Multiple declarators group into a single SimpleDeclaration.
      {R"cpp(
      int *a, b;
  )cpp",
       R"txt(
*: TranslationUnit
`-SimpleDeclaration
  |-int
  |-*
  |-a
  |-,
  |-b
  `-;
  )txt"},
      {R"cpp(
    typedef int *a, b;
  )cpp",
       R"txt(
*: TranslationUnit
`-SimpleDeclaration
  |-typedef
  |-int
  |-*
  |-a
  |-,
  |-b
  `-;
  )txt"},
      // Multiple declarators inside a statement.
      {R"cpp(
void foo() {
      int *a, b;
      typedef int *ta, tb;
}
  )cpp",
       R"txt(
*: TranslationUnit
`-SimpleDeclaration
  |-void
  |-foo
  |-(
  |-)
  `-CompoundStatement
    |-{
    |-DeclarationStatement
    | |-SimpleDeclaration
    | | |-int
    | | |-*
    | | |-a
    | | |-,
    | | `-b
    | `-;
    |-DeclarationStatement
    | |-SimpleDeclaration
    | | |-typedef
    | | |-int
    | | |-*
    | | |-ta
    | | |-,
    | | `-tb
    | `-;
    `-}
  )txt"}};

  for (const auto &T : Cases) {