Commit 8eb8c92e authored by Sam McCall's avatar Sam McCall
Browse files

[clangd] Add library to semantically strip flags by name.

Summary:
This is designed for tweaking compile commands by specifying flags to add/remove
in a config file. Something like:
  CompileFlags: { Remove: -fcolor-diagnostics }

Having users tweak raw argv (e.g. with a regex) is going to end in tears: bugs
around clang-cl, xclang, aliases, joined-vs-separate args etc are inevitable.

This isn't in tooling because of the performance choices: build a big table
up-front to make subsequent actions fast. Maybe it should be though.

Reviewers: adamcz, hokein

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D81958
parent 8a242089
Loading
Loading
Loading
Loading
+268 −0
Original line number Diff line number Diff line
@@ -9,8 +9,12 @@
#include "CompileCommands.h"
#include "Config.h"
#include "support/Logger.h"
#include "clang/Driver/Options.h"
#include "clang/Frontend/CompilerInvocation.h"
#include "clang/Tooling/ArgumentsAdjusters.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -234,5 +238,269 @@ CommandMangler::operator clang::tooling::ArgumentsAdjuster() && {
  };
}

// ArgStripper implementation
namespace {

// Determine total number of args consumed by this option.
// Return answers for {Exact, Prefix} match. 0 means not allowed.
std::pair<unsigned, unsigned> getArgCount(const llvm::opt::Option &Opt) {
  constexpr static unsigned Rest = 10000; // Should be all the rest!
  // Reference is llvm::opt::Option::acceptInternal()
  using llvm::opt::Option;
  switch (Opt.getKind()) {
  case Option::FlagClass:
    return {1, 0};
  case Option::JoinedClass:
  case Option::CommaJoinedClass:
    return {1, 1};
  case Option::GroupClass:
  case Option::InputClass:
  case Option::UnknownClass:
  case Option::ValuesClass:
    return {1, 0};
  case Option::JoinedAndSeparateClass:
    return {2, 2};
  case Option::SeparateClass:
    return {2, 0};
  case Option::MultiArgClass:
    return {1 + Opt.getNumArgs(), 0};
  case Option::JoinedOrSeparateClass:
    return {2, 1};
  case Option::RemainingArgsClass:
    return {Rest, 0};
  case Option::RemainingArgsJoinedClass:
    return {Rest, Rest};
  }
}

// Flag-parsing mode, which affects which flags are available.
enum DriverMode : unsigned char {
  DM_None = 0,
  DM_GCC = 1, // Default mode e.g. when invoked as 'clang'
  DM_CL = 2,  // MS CL.exe compatible mode e.g. when invoked as 'clang-cl'
  DM_CC1 = 4, // When invoked as 'clang -cc1' or after '-Xclang'
  DM_All = 7
};

// Examine args list to determine if we're in GCC, CL-compatible, or cc1 mode.
DriverMode getDriverMode(const std::vector<std::string> &Args) {
  DriverMode Mode = DM_GCC;
  llvm::StringRef Argv0 = Args.front();
  if (Argv0.endswith_lower(".exe"))
    Argv0 = Argv0.drop_back(strlen(".exe"));
  if (Argv0.endswith_lower("cl"))
    Mode = DM_CL;
  for (const llvm::StringRef Arg : Args) {
    if (Arg == "--driver-mode=cl") {
      Mode = DM_CL;
      break;
    }
    if (Arg == "-cc1") {
      Mode = DM_CC1;
      break;
    }
  }
  return Mode;
}

// Returns the set of DriverModes where an option may be used.
unsigned char getModes(const llvm::opt::Option &Opt) {
  // Why is this so complicated?!
  // Reference is clang::driver::Driver::getIncludeExcludeOptionFlagMasks()
  unsigned char Result = DM_None;
  if (Opt.hasFlag(driver::options::CC1Option))
    Result |= DM_CC1;
  if (!Opt.hasFlag(driver::options::NoDriverOption)) {
    if (Opt.hasFlag(driver::options::CLOption)) {
      Result |= DM_CL;
    } else {
      Result |= DM_GCC;
      if (Opt.hasFlag(driver::options::CoreOption)) {
        Result |= DM_CL;
      }
    }
  }
  return Result;
};

} // namespace

llvm::ArrayRef<ArgStripper::Rule> ArgStripper::rulesFor(llvm::StringRef Arg) {
  // All the hard work is done once in a static initializer.
  // We compute a table containing strings to look for and #args to skip.
  // e.g. "-x" => {-x 2 args, -x* 1 arg, --language 2 args, --language=* 1 arg}
  using TableTy =
      llvm::StringMap<llvm::SmallVector<Rule, 4>, llvm::BumpPtrAllocator>;
  static TableTy *Table = [] {
    auto &DriverTable = driver::getDriverOptTable();
    using DriverID = clang::driver::options::ID;

    // Collect sets of aliases, so we can treat -foo and -foo= as synonyms.
    // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I].
    // If PrevAlias[I] is INVALID, then I is canonical.
    DriverID PrevAlias[DriverID::LastOption] = {DriverID::OPT_INVALID};
    DriverID NextAlias[DriverID::LastOption] = {DriverID::OPT_INVALID};
    auto AddAlias = [&](DriverID Self, DriverID T) {
      if (NextAlias[T]) {
        PrevAlias[NextAlias[T]] = Self;
        NextAlias[Self] = NextAlias[T];
      }
      PrevAlias[Self] = T;
      NextAlias[T] = Self;
    };
    // Also grab prefixes for each option, these are not fully exposed.
    const char *const *Prefixes[DriverID::LastOption] = {nullptr};
#define PREFIX(NAME, VALUE) static const char *const NAME[] = VALUE;
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
               HELP, METAVAR, VALUES)                                          \
  if (DriverID::OPT_##ALIAS != DriverID::OPT_INVALID && ALIASARGS == nullptr)  \
    AddAlias(DriverID::OPT_##ID, DriverID::OPT_##ALIAS);                       \
  Prefixes[DriverID::OPT_##ID] = PREFIX;
#include "clang/Driver/Options.inc"
#undef OPTION
#undef PREFIX

    auto Result = std::make_unique<TableTy>();
    // Iterate over distinct options (represented by the canonical alias).
    // Every spelling of this option will get the same set of rules.
    for (unsigned ID = 1 /*Skip INVALID */; ID < DriverID::LastOption; ++ID) {
      if (PrevAlias[ID] || ID == DriverID::OPT_Xclang)
        continue; // Not canonical, or specially handled.
      llvm::SmallVector<Rule, 8> Rules;
      // Iterate over each alias, to add rules for parsing it.
      for (unsigned A = ID; A != DriverID::OPT_INVALID; A = NextAlias[A]) {
        if (Prefixes[A] == nullptr) // option groups.
          continue;
        auto Opt = DriverTable.getOption(A);
        // Exclude - and -foo pseudo-options.
        if (Opt.getName().empty())
          continue;
        auto Modes = getModes(Opt);
        std::pair<unsigned, unsigned> ArgCount = getArgCount(Opt);
        // Iterate over each spelling of the alias, e.g. -foo vs --foo.
        for (auto *Prefix = Prefixes[A]; *Prefix != nullptr; ++Prefix) {
          llvm::SmallString<64> Buf(*Prefix);
          Buf.append(Opt.getName());
          llvm::StringRef Spelling = Result->try_emplace(Buf).first->getKey();
          Rules.emplace_back();
          Rule &R = Rules.back();
          R.Text = Spelling;
          R.Modes = Modes;
          R.ExactArgs = ArgCount.first;
          R.PrefixArgs = ArgCount.second;
          // Concrete priority is the index into the option table.
          // Effectively, earlier entries take priority over later ones.
          assert(ID < std::numeric_limits<decltype(R.Priority)>::max() &&
                 "Rules::Priority overflowed by options table");
          R.Priority = ID;
        }
      }
      // Register the set of rules under each possible name.
      for (const auto &R : Rules)
        Result->find(R.Text)->second.append(Rules.begin(), Rules.end());
    }
#ifndef NDEBUG
    // Dump the table and various measures of its size.
    unsigned RuleCount = 0;
    dlog("ArgStripper Option spelling table");
    for (const auto &Entry : *Result) {
      dlog("{0}", Entry.first());
      RuleCount += Entry.second.size();
      for (const auto &R : Entry.second)
        dlog("  {0} #={1} *={2} Mode={3}", R.Text, R.ExactArgs, R.PrefixArgs,
             int(R.Modes));
    }
    dlog("Table spellings={0} rules={1} string-bytes={2}", Result->size(),
         RuleCount, Result->getAllocator().getBytesAllocated());
#endif
    // The static table will never be destroyed.
    return Result.release();
  }();

  auto It = Table->find(Arg);
  return (It == Table->end()) ? llvm::ArrayRef<Rule>() : It->second;
}

void ArgStripper::strip(llvm::StringRef Arg) {
  auto OptionRules = rulesFor(Arg);
  if (OptionRules.empty()) {
    // Not a recognized flag. Strip it literally.
    Storage.emplace_back(Arg);
    Rules.emplace_back();
    Rules.back().Text = Storage.back();
    Rules.back().ExactArgs = 1;
    if (Rules.back().Text.consume_back("*"))
      Rules.back().PrefixArgs = 1;
    Rules.back().Modes = DM_All;
    Rules.back().Priority = -1; // Max unsigned = lowest priority.
  } else {
    Rules.append(OptionRules.begin(), OptionRules.end());
  }
}

const ArgStripper::Rule *ArgStripper::matchingRule(llvm::StringRef Arg,
                                                   unsigned Mode,
                                                   unsigned &ArgCount) const {
  const ArgStripper::Rule *BestRule = nullptr;
  for (const Rule &R : Rules) {
    // Rule can fail to match if...
    if (!(R.Modes & Mode))
      continue; // not applicable to current driver mode
    if (BestRule && BestRule->Priority < R.Priority)
      continue; // lower-priority than best candidate.
    if (!Arg.startswith(R.Text))
      continue; // current arg doesn't match the prefix string
    bool PrefixMatch = Arg.size() > R.Text.size();
    // Can rule apply as an exact/prefix match?
    if (unsigned Count = PrefixMatch ? R.PrefixArgs : R.ExactArgs) {
      BestRule = &R;
      ArgCount = Count;
    }
    // Continue in case we find a higher-priority rule.
  }
  return BestRule;
}

void ArgStripper::process(std::vector<std::string> &Args) const {
  if (Args.empty())
    return;

  // We're parsing the args list in some mode (e.g. gcc-compatible) but may
  // temporarily switch to another mode with the -Xclang flag.
  DriverMode MainMode = getDriverMode(Args);
  DriverMode CurrentMode = MainMode;

  // Read and write heads for in-place deletion.
  unsigned Read = 0, Write = 0;
  bool WasXclang = false;
  while (Read < Args.size()) {
    unsigned ArgCount = 0;
    if (const Rule *R = matchingRule(Args[Read], CurrentMode, ArgCount)) {
      // Delete it and its args.
      if (WasXclang) {
        assert(Write > 0);
        --Write; // Drop previous -Xclang arg
        CurrentMode = MainMode;
        WasXclang = false;
      }
      // Advance to last arg. An arg may be foo or -Xclang foo.
      for (unsigned I = 1; Read < Args.size() && I < ArgCount; ++I) {
        ++Read;
        if (Read < Args.size() && Args[Read] == "-Xclang")
          ++Read;
      }
    } else {
      // No match, just copy the arg through.
      WasXclang = Args[Read] == "-Xclang";
      CurrentMode = WasXclang ? DM_CC1 : MainMode;
      if (Write != Read)
        Args[Write] = std::move(Args[Read]);
      ++Write;
    }
    ++Read;
  }
  Args.resize(Write);
}

} // namespace clangd
} // namespace clang
+39 −0
Original line number Diff line number Diff line
@@ -50,6 +50,45 @@ private:
  Memoize<llvm::StringMap<std::string>> ResolvedDriversNoFollow;
};

// Removes args from a command-line in a semantically-aware way.
//
// Internally this builds a large (0.5MB) table of clang options on first use.
// Both strip() and process() are fairly cheap after that.
//
// FIXME: this reimplements much of OptTable, it might be nice to expose more.
// The table-building strategy may not make sense outside clangd.
class ArgStripper {
public:
  // Adds the arg to the set which should be removed.
  //
  // Recognized clang flags are stripped semantically. When "-I" is stripped:
  //  - so is its value (either as -Ifoo or -I foo)
  //  - aliases like --include-directory=foo are also stripped
  //  - CL-style /Ifoo will be removed if the args indicate MS-compatible mode
  // Compile args not recognized as flags are removed literally, except:
  //  - strip("ABC*") will remove any arg with an ABC prefix.
  //
  // In either case, the -Xclang prefix will be dropped if present.
  void strip(llvm::StringRef Arg);
  // Remove the targets from a compile command, in-place.
  void process(std::vector<std::string> &Args) const;

private:
  // Deletion rules, to be checked for each arg.
  struct Rule {
    llvm::StringRef Text;    // Rule applies only if arg begins with Text.
    unsigned char Modes = 0; // Rule applies only in specified driver modes.
    uint16_t Priority = 0;   // Lower is better.
    uint16_t ExactArgs = 0;  // Num args consumed when Arg == Text.
    uint16_t PrefixArgs = 0; // Num args consumed when Arg starts with Text.
  };
  static llvm::ArrayRef<Rule> rulesFor(llvm::StringRef Arg);
  const Rule *matchingRule(llvm::StringRef Arg, unsigned Mode,
                           unsigned &ArgCount) const;
  llvm::SmallVector<Rule, 4> Rules;
  std::vector<std::string> Storage; // Store strings not found in option table.
};

} // namespace clangd
} // namespace clang

+160 −0
Original line number Diff line number Diff line
@@ -207,6 +207,166 @@ TEST(CommandMangler, ConfigEdits) {
  EXPECT_THAT(Cmd, ElementsAre(_, "FOO.CC", "--hello", "-fsyntax-only"));
}

static std::string strip(llvm::StringRef Arg, llvm::StringRef Argv) {
  llvm::SmallVector<llvm::StringRef, 8> Parts;
  llvm::SplitString(Argv, Parts);
  std::vector<std::string> Args = {Parts.begin(), Parts.end()};
  ArgStripper S;
  S.strip(Arg);
  S.process(Args);
  return llvm::join(Args, " ");
}

TEST(ArgStripperTest, Spellings) {
  // May use alternate prefixes.
  EXPECT_EQ(strip("-pedantic", "clang -pedantic foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-pedantic", "clang --pedantic foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("--pedantic", "clang -pedantic foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("--pedantic", "clang --pedantic foo.cc"), "clang foo.cc");
  // May use alternate names.
  EXPECT_EQ(strip("-x", "clang -x c++ foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-x", "clang --language=c++ foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("--language=", "clang -x c++ foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("--language=", "clang --language=c++ foo.cc"),
            "clang foo.cc");
}

TEST(ArgStripperTest, UnknownFlag) {
  EXPECT_EQ(strip("-xyzzy", "clang -xyzzy foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-xyz*", "clang -xyzzy foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-xyzzy", "clang -Xclang -xyzzy foo.cc"), "clang foo.cc");
}

TEST(ArgStripperTest, Xclang) {
  // Flags may be -Xclang escaped.
  EXPECT_EQ(strip("-ast-dump", "clang -Xclang -ast-dump foo.cc"),
            "clang foo.cc");
  // Args may be -Xclang escaped.
  EXPECT_EQ(strip("-add-plugin", "clang -Xclang -add-plugin -Xclang z foo.cc"),
            "clang foo.cc");
}

TEST(ArgStripperTest, ClangCL) {
  // /I is a synonym for -I in clang-cl mode only.
  // Not stripped by default.
  EXPECT_EQ(strip("-I", "clang -I /usr/inc /Interesting/file.cc"),
            "clang /Interesting/file.cc");
  // Stripped when invoked as clang-cl.
  EXPECT_EQ(strip("-I", "clang-cl -I /usr/inc /Interesting/file.cc"),
            "clang-cl");
  // Stripped when invoked as CL.EXE
  EXPECT_EQ(strip("-I", "CL.EXE -I /usr/inc /Interesting/file.cc"), "CL.EXE");
  // Stripped when passed --driver-mode=cl.
  EXPECT_EQ(strip("-I", "cc -I /usr/inc /Interesting/file.cc --driver-mode=cl"),
            "cc --driver-mode=cl");
}

TEST(ArgStripperTest, ArgStyles) {
  // Flag
  EXPECT_EQ(strip("-Qn", "clang -Qn foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-Qn", "clang -QnZ foo.cc"), "clang -QnZ foo.cc");
  // Joined
  EXPECT_EQ(strip("-std=", "clang -std= foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-std=", "clang -std=c++11 foo.cc"), "clang foo.cc");
  // Separate
  EXPECT_EQ(strip("-mllvm", "clang -mllvm X foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-mllvm", "clang -mllvmX foo.cc"), "clang -mllvmX foo.cc");
  // RemainingArgsJoined
  EXPECT_EQ(strip("/link", "clang-cl /link b c d foo.cc"), "clang-cl");
  EXPECT_EQ(strip("/link", "clang-cl /linka b c d foo.cc"), "clang-cl");
  // CommaJoined
  EXPECT_EQ(strip("-Wl,", "clang -Wl,x,y foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-Wl,", "clang -Wl, foo.cc"), "clang foo.cc");
  // MultiArg
  EXPECT_EQ(strip("-segaddr", "clang -segaddr a b foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-segaddr", "clang -segaddra b foo.cc"),
            "clang -segaddra b foo.cc");
  // JoinedOrSeparate
  EXPECT_EQ(strip("-G", "clang -GX foo.cc"), "clang foo.cc");
  EXPECT_EQ(strip("-G", "clang -G X foo.cc"), "clang foo.cc");
  // JoinedAndSeparate
  EXPECT_EQ(strip("-plugin-arg-", "clang -cc1 -plugin-arg-X Y foo.cc"),
            "clang -cc1 foo.cc");
  EXPECT_EQ(strip("-plugin-arg-", "clang -cc1 -plugin-arg- Y foo.cc"),
            "clang -cc1 foo.cc");
}

TEST(ArgStripperTest, EndOfList) {
  // When we hit the end-of-args prematurely, we don't crash.
  // We consume the incomplete args if we've matched the target option.
  EXPECT_EQ(strip("-I", "clang -Xclang"), "clang -Xclang");
  EXPECT_EQ(strip("-I", "clang -Xclang -I"), "clang");
  EXPECT_EQ(strip("-I", "clang -I -Xclang"), "clang");
  EXPECT_EQ(strip("-I", "clang -I"), "clang");
}

TEST(ArgStripperTest, Multiple) {
  ArgStripper S;
  S.strip("-o");
  S.strip("-c");
  std::vector<std::string> Args = {"clang", "-o", "foo.o", "foo.cc", "-c"};
  S.process(Args);
  EXPECT_THAT(Args, ElementsAre("clang", "foo.cc"));
}

TEST(ArgStripperTest, Warning) {
  {
    // -W is a flag name
    ArgStripper S;
    S.strip("-W");
    std::vector<std::string> Args = {"clang", "-Wfoo", "-Wno-bar", "-Werror",
                                     "foo.cc"};
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "foo.cc"));
  }
  {
    // -Wfoo is not a flag name, matched literally.
    ArgStripper S;
    S.strip("-Wunused");
    std::vector<std::string> Args = {"clang", "-Wunused", "-Wno-unused",
                                     "foo.cc"};
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "-Wno-unused", "foo.cc"));
  }
}

TEST(ArgStripperTest, Define) {
  {
    // -D is a flag name
    ArgStripper S;
    S.strip("-D");
    std::vector<std::string> Args = {"clang", "-Dfoo", "-Dbar=baz", "foo.cc"};
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "foo.cc"));
  }
  {
    // -Dbar is not: matched literally
    ArgStripper S;
    S.strip("-Dbar");
    std::vector<std::string> Args = {"clang", "-Dfoo", "-Dbar=baz", "foo.cc"};
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "-Dfoo", "-Dbar=baz", "foo.cc"));
    S.strip("-Dfoo");
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "-Dbar=baz", "foo.cc"));
    S.strip("-Dbar=*");
    S.process(Args);
    EXPECT_THAT(Args, ElementsAre("clang", "foo.cc"));
  }
}

TEST(ArgStripperTest, OrderDependent) {
  ArgStripper S;
  // If -include is stripped first, we see -pch as its arg and foo.pch remains.
  // To get this case right, we must process -include-pch first.
  S.strip("-include");
  S.strip("-include-pch");
  std::vector<std::string> Args = {"clang", "-include-pch", "foo.pch",
                                   "foo.cc"};
  S.process(Args);
  EXPECT_THAT(Args, ElementsAre("clang", "foo.cc"));
}

} // namespace
} // namespace clangd
} // namespace clang