Unverified Commit ef0e0adc authored by William Junda Huang's avatar William Junda Huang Committed by GitHub
Browse files

[llvm-profdata] Do not create numerical strings for MD5 function names read...

[llvm-profdata] Do not create numerical strings for MD5 function names read from a Sample Profile. (#66164)

This is phase 2 of the MD5 refactoring on Sample Profile following
https://reviews.llvm.org/D147740
    
In the previous implementation, when an MD5 Sample Profile is read, the
reader first converts the MD5 values to strings and then creates a
StringRef as if the numerical strings were regular function names. Later
on, IPO transformation passes perform string comparisons over these
numerical strings for profile matching. This is inefficient since it
causes many small heap allocations.
In this patch I created a class `ProfileFuncRef` that is similar to
`StringRef` but it can represent a hash value directly without any
conversion, and it will be more efficient (I will attach some benchmark
results later) when being used in associative containers.

ProfileFuncRef guarantees that the same function name in string form or in
MD5 form has the same hash value, which also fixes a few issues in IPO
passes where function matching/lookup only checks the function name
string and therefore returns a no-match if the profile is MD5.

When testing on an internal large profile (> 1 GB, with more than 10
million functions), the full profile load time is reduced from 28 sec to
25 sec on average, and reading the function offset table from 0.78s to 0.7s.
parent e90ec58b
Loading
Loading
Loading
Loading
+213 −0
Original line number Diff line number Diff line
//===--- FunctionId.h - Sample profile function object ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// Defines FunctionId class.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_PROFILEDATA_FUNCTIONID_H
#define LLVM_PROFILEDATA_FUNCTIONID_H

#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

namespace llvm {
namespace sampleprof {

/// Identifies a function read from a sample profile. The identifier takes one
/// of two forms: a string (the function name) or a hash code. The hash code
/// form is the 64-bit MD5 of the function name, which the ExtBinary profile
/// format supports for efficient storage; this class can hold such a value
/// directly, without converting it to a string first.
/// In hash code form the value is kept in LengthOrHashCode and Data is null.
/// In string form the object is laid out like a StringRef: Data points at the
/// characters and LengthOrHashCode is their count.
class FunctionId {

  /// Characters of the name in string form; null in hash code form.
  const char *Data = nullptr;

  /// Length of the name in string form, the hash code otherwise. Declared as
  /// uint64_t rather than size_t so an MD5 value also fits on 32-bit systems.
  uint64_t LengthOrHashCode = 0;

  /// memcmp extended with null-pointer handling for the hash code form.
  /// Identical pointers (including two nulls, i.e. two hash codes) yield 0, in
  /// which case the caller still has to compare LengthOrHashCode. When exactly
  /// one side is null (a hash code), that side orders before the string side.
  /// Otherwise the first \p Length bytes are compared with memcmp.
  /// Null-ness is tested before \p Length is looked at, so a zero-length
  /// string with non-null data is not equivalent to a null pointer here.
  static int compareMemory(const char *Lhs, const char *Rhs, uint64_t Length) {
    if (Lhs == Rhs)
      return 0;
    // At most one side is null at this point; the null side sorts first.
    if (Lhs == nullptr || Rhs == nullptr)
      return Lhs == nullptr ? -1 : 1;
    return ::memcmp(Lhs, Rhs, static_cast<size_t>(Length));
  }

public:
  FunctionId() = default;

  /// Build the string form from \p Str. The characters are referenced, not
  /// copied.
  explicit FunctionId(StringRef Str)
      : Data(Str.data()), LengthOrHashCode(Str.size()) {}

  /// Build the hash code form from \p HashCode, which must be non-zero.
  explicit FunctionId(uint64_t HashCode) : LengthOrHashCode(HashCode) {
    assert(HashCode != 0);
  }

  /// Equality test. For two strings this matches StringRef::equals; for two
  /// hash codes it compares the integer values. A hash code FunctionId never
  /// equals a string FunctionId, whatever their contents.
  bool equals(const FunctionId &Other) const {
    if (LengthOrHashCode != Other.LengthOrHashCode)
      return false;
    return compareMemory(Data, Other.Data, LengthOrHashCode) == 0;
  }

  /// Total order comparison returning a negative, zero or positive value. Two
  /// strings compare like StringRef::compare, two hash codes compare by their
  /// integer values, and a hash code orders before any string.
  int compare(const FunctionId &Other) const {
    const uint64_t Common = std::min(LengthOrHashCode, Other.LengthOrHashCode);
    if (int Order = compareMemory(Data, Other.Data, Common))
      return Order;
    // The shared prefix (or the pair of null pointers) ties; the length or
    // hash code value decides.
    if (LengthOrHashCode < Other.LengthOrHashCode)
      return -1;
    return LengthOrHashCode > Other.LengthOrHashCode ? 1 : 0;
  }

  /// Render as an owning std::string, typically for output: the name itself in
  /// string form, the decimal hash code in hash code form, and an empty string
  /// for an empty object. Mind the lifetime of the returned temporary when
  /// binding it to a StringRef.
  std::string str() const {
    if (Data)
      return std::string(Data, LengthOrHashCode);
    return LengthOrHashCode != 0 ? std::to_string(LengthOrHashCode)
                                 : std::string();
  }

  /// View as a StringRef. Only valid for the string form (or an empty
  /// object); calling this on a hash code is an error and asserts.
  StringRef stringRef() const {
    if (!Data) {
      assert(LengthOrHashCode == 0 &&
             "Cannot convert MD5 FunctionId to StringRef");
      return StringRef();
    }
    return StringRef(Data, LengthOrHashCode);
  }

  friend raw_ostream &operator<<(raw_ostream &OS, const FunctionId &Obj);

  /// Hash code of this object: the stored value in hash code form, otherwise
  /// the MD5 of the string contents. This intentionally differs from std::hash
  /// so that one sample profile function has the same hash code whether it is
  /// held as a string or as an MD5.
  uint64_t getHashCode() const {
    return Data ? MD5Hash(StringRef(Data, LengthOrHashCode))
                : LengthOrHashCode;
  }

  /// True when the length (string form) or the hash code is zero.
  bool empty() const { return !LengthOrHashCode; }

  /// True for the string form, false for the hash code form.
  bool isStringRef() const { return static_cast<bool>(Data); }
};

/// Equality of two FunctionId objects, as decided by FunctionId::equals.
inline bool operator==(const FunctionId &LHS, const FunctionId &RHS) {
  const bool Same = LHS.equals(RHS);
  return Same;
}

/// Inequality of two FunctionId objects; the negation of FunctionId::equals.
inline bool operator!=(const FunctionId &LHS, const FunctionId &RHS) {
  const bool Same = LHS.equals(RHS);
  return !Same;
}

/// True when \p LHS orders strictly before \p RHS under FunctionId::compare.
inline bool operator<(const FunctionId &LHS, const FunctionId &RHS) {
  const int Order = LHS.compare(RHS);
  return Order < 0;
}

/// True when \p LHS does not order after \p RHS under FunctionId::compare.
inline bool operator<=(const FunctionId &LHS, const FunctionId &RHS) {
  const int Order = LHS.compare(RHS);
  return !(Order > 0);
}

/// True when \p LHS orders strictly after \p RHS under FunctionId::compare.
inline bool operator>(const FunctionId &LHS, const FunctionId &RHS) {
  const int Order = LHS.compare(RHS);
  return 0 < Order;
}

/// True when \p LHS does not order before \p RHS under FunctionId::compare.
inline bool operator>=(const FunctionId &LHS, const FunctionId &RHS) {
  const int Order = LHS.compare(RHS);
  return !(Order < 0);
}

/// Stream a FunctionId: the name when it holds a string, the decimal hash
/// code when it holds a non-zero hash code, and nothing when it is empty.
inline raw_ostream &operator<<(raw_ostream &OS, const FunctionId &Obj) {
  if (Obj.Data != nullptr)
    return OS << StringRef(Obj.Data, Obj.LengthOrHashCode);
  // Hash code form: zero means an empty object, which prints nothing.
  if (Obj.LengthOrHashCode == 0)
    return OS;
  return OS << Obj.LengthOrHashCode;
}

/// MD5Hash overload for FunctionId; yields FunctionId::getHashCode().
inline uint64_t MD5Hash(const FunctionId &Obj) {
  const uint64_t Code = Obj.getHashCode();
  return Code;
}

/// hash_value overload for FunctionId; yields FunctionId::getHashCode().
inline uint64_t hash_value(const FunctionId &Obj) {
  const uint64_t Code = Obj.getHashCode();
  return Code;
}

} // end namespace sampleprof

/// DenseMapInfo specialization that lets FunctionId be used as a key in LLVM
/// map containers.
template <> struct DenseMapInfo<sampleprof::FunctionId, void> {

  /// Reserved "empty" key: the hash code FunctionId holding ~0ULL.
  static inline sampleprof::FunctionId getEmptyKey() {
    const uint64_t EmptyCode = ~0ULL;
    return sampleprof::FunctionId(EmptyCode);
  }

  /// Reserved "tombstone" key: the hash code FunctionId holding ~1ULL.
  static inline sampleprof::FunctionId getTombstoneKey() {
    const uint64_t TombstoneCode = ~1ULL;
    return sampleprof::FunctionId(TombstoneCode);
  }

  /// Hash of \p Val: FunctionId::getHashCode() narrowed to unsigned.
  static unsigned getHashValue(const sampleprof::FunctionId &Val) {
    return static_cast<unsigned>(Val.getHashCode());
  }

  /// Key equality, as decided by FunctionId::equals.
  static bool isEqual(const sampleprof::FunctionId &LHS,
                      const sampleprof::FunctionId &RHS) {
    return LHS.equals(RHS);
  }
};

} // end namespace llvm

namespace std {

/// Template specialization for FunctionId so that it can be used in STL
/// containers.
template <> struct hash<llvm::sampleprof::FunctionId> {
  /// Hash of \p Val: FunctionId::getHashCode() converted to size_t.
  size_t operator()(const llvm::sampleprof::FunctionId &Val) const {
    const auto Code = Val.getHashCode();
    return static_cast<size_t>(Code);
  }
};

} // end namespace std

#endif // LLVM_PROFILEDATA_FUNCTIONID_H
+129 −0
Original line number Diff line number Diff line
//===--- HashKeyMap.h - Wrapper for maps using hash value key ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// Defines HashKeyMap template.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_PROFILEDATA_HASHKEYMAP_H
#define LLVM_PROFILEDATA_HASHKEYMAP_H

#include "llvm/ADT/Hashing.h"
#include <iterator>
#include <utility>

namespace llvm {

namespace sampleprof {

/// This class is a wrapper to associative container MapT<KeyT, ValueT> using
/// the hash value of the original key as the new key. This greatly improves the
/// performance of insert and query operations especially when hash values of
/// keys are available a priori, and reduces memory usage if KeyT has a large
/// size.
/// All keys with the same hash value are considered equivalent (i.e. hash
/// collision is silently ignored). Given such feature this class should only be
/// used where it does not affect compilation correctness, for example, when
/// loading a sample profile. The original key is not stored, so if the user
/// needs to preserve it, it should be stored in the mapped type.
/// Assuming the hashing algorithm is uniform, we use the formula
/// 1 - Permute(n, k) / n ^ k where n is the universe size and k is number of
/// elements chosen at random to calculate the probability of collision. With
/// 1,000,000 entries the probability is negligible:
/// 1 - (2^64)!/((2^64-1000000)!*(2^64)^1000000) ~= 3*10^-8.
/// Source: https://en.wikipedia.org/wiki/Birthday_problem
///
/// \param MapT The underlying associative container type.
/// \param KeyT The original key type, which requires the implementation of
///   llvm::hash_value(KeyT).
/// \param ValueT The original mapped type, which has the same requirement as
///   the underlying container.
/// \param MapTArgs Additional template parameters passed to the underlying
///   container.
template <template <typename, typename, typename...> typename MapT,
          typename KeyT, typename ValueT, typename... MapTArgs>
class HashKeyMap :
    public MapT<decltype(hash_value(KeyT())), ValueT, MapTArgs...> {
public:
  using base_type = MapT<decltype(hash_value(KeyT())), ValueT, MapTArgs...>;
  using key_type = decltype(hash_value(KeyT()));
  using original_key_type = KeyT;
  using mapped_type = ValueT;
  using value_type = typename base_type::value_type;

  using iterator = typename base_type::iterator;
  using const_iterator = typename base_type::const_iterator;

  /// Insert a value constructed from \p Args under the precomputed \p Hash,
  /// unless an entry with that hash already exists. \p Key is only used to
  /// assert that \p Hash really is hash_value(Key); it is not stored.
  /// \returns the underlying container's try_emplace result.
  template <typename... Ts>
  std::pair<iterator, bool> try_emplace(const key_type &Hash,
                                        const original_key_type &Key,
                                        Ts &&...Args) {
    assert(Hash == hash_value(Key));
    return base_type::try_emplace(Hash, std::forward<Ts>(Args)...);
  }

  /// Same as above, computing the hash from \p Key.
  template <typename... Ts>
  std::pair<iterator, bool> try_emplace(const original_key_type &Key,
                                        Ts &&...Args) {
    return try_emplace(hash_value(Key), Key, std::forward<Ts>(Args)...);
  }

  /// Alias for try_emplace; accepts the same argument lists.
  template <typename... Ts> std::pair<iterator, bool> emplace(Ts &&...Args) {
    return try_emplace(std::forward<Ts>(Args)...);
  }

  /// Return the value stored for \p Key, inserting a value-initialized one if
  /// there is none yet.
  mapped_type &operator[](const original_key_type &Key) {
    // Emplace with no constructor arguments so that a mapped_type is only
    // constructed (in place) when an insertion actually happens, rather than
    // building and moving a temporary on every call.
    return try_emplace(Key).first->second;
  }

  /// Find the entry whose hash equals hash_value(Key); end() if absent.
  iterator find(const original_key_type &Key) {
    return base_type::find(hash_value(Key));
  }

  /// Const overload of find.
  const_iterator find(const original_key_type &Key) const {
    return base_type::find(hash_value(Key));
  }

  /// Return a copy of the value stored for \p Key, or a value-initialized
  /// mapped_type when there is no such entry. Never inserts.
  mapped_type lookup(const original_key_type &Key) const {
    auto It = base_type::find(hash_value(Key));
    if (It != base_type::end())
      return It->second;
    return mapped_type();
  }

  /// Number of entries stored under hash_value(Key), as reported by the
  /// underlying container.
  size_t count(const original_key_type &Key) const {
    return base_type::count(hash_value(Key));
  }

  /// Remove the entry for \p Ctx if present.
  /// \returns 1 if an entry was removed, 0 otherwise.
  size_t erase(const original_key_type &Ctx) {
    auto It = find(Ctx);
    if (It == base_type::end())
      return 0;
    base_type::erase(It);
    return 1;
  }

  /// Remove the entry at \p It, forwarding to the underlying container.
  iterator erase(const_iterator It) {
    return base_type::erase(It);
  }
};

}

}

#endif // LLVM_PROFILEDATA_HASHKEYMAP_H
+103 −180

File changed.

Preview size limit exceeded, changes collapsed.

+8 −22
Original line number Diff line number Diff line
@@ -409,13 +409,13 @@ public:

  /// Return the samples collected for function \p F.
  FunctionSamples *getSamplesFor(StringRef Fname) {
    auto It = Profiles.find(Fname);
    auto It = Profiles.find(FunctionId(Fname));
    if (It != Profiles.end())
      return &It->second;

    if (Remapper) {
      if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) {
        auto It = Profiles.find(*NameInProfile);
        auto It = Profiles.find(FunctionId(*NameInProfile));
        if (It != Profiles.end())
          return &It->second;
      }
@@ -474,7 +474,7 @@ public:

  /// It includes all the names that have samples either in outline instance
  /// or inline instance.
  virtual std::vector<StringRef> *getNameTable() { return nullptr; }
  virtual std::vector<FunctionId> *getNameTable() { return nullptr; }
  virtual bool dumpSectionInfo(raw_ostream &OS = dbgs()) { return false; };

  /// Return whether names in the profile are all MD5 numbers.
@@ -508,10 +508,6 @@ protected:
  /// Memory buffer holding the profile file.
  std::unique_ptr<MemoryBuffer> Buffer;

  /// Extra name buffer holding names created on demand.
  /// This should only be needed for md5 profiles.
  std::unordered_set<std::string> MD5NameBuffer;

  /// Profile summary information.
  std::unique_ptr<ProfileSummary> Summary;

@@ -595,7 +591,9 @@ public:

  /// It includes all the names that have samples either in outline instance
  /// or inline instance.
  std::vector<StringRef> *getNameTable() override { return &NameTable; }
  std::vector<FunctionId> *getNameTable() override {
    return &NameTable;
  }

protected:
  /// Read a numeric value of type T from the profile.
@@ -637,7 +635,7 @@ protected:
  std::error_code readNameTable();

  /// Read a string indirectly via the name table. Optionally return the index.
  ErrorOr<StringRef> readStringFromTable(size_t *RetIdx = nullptr);
  ErrorOr<FunctionId> readStringFromTable(size_t *RetIdx = nullptr);

  /// Read a context indirectly via the CSNameTable. Optionally return the
  /// index.
@@ -654,19 +652,7 @@ protected:
  const uint8_t *End = nullptr;

  /// Function name table.
  std::vector<StringRef> NameTable;

  /// If MD5 is used in NameTable section, the section saves uint64_t data.
  /// The uint64_t data has to be converted to a string and then the string
  /// will be used to initialize StringRef in NameTable.
  /// Note NameTable contains StringRef so it needs another buffer to own
  /// the string data. MD5StringBuf serves as the string buffer that is
  /// referenced by NameTable (vector of StringRef). We make sure
  /// the lifetime of MD5StringBuf is not shorter than that of NameTable.
  std::vector<std::string> MD5StringBuf;

  /// The starting address of fixed length MD5 name table section.
  const uint8_t *MD5NameMemStart = nullptr;
  std::vector<FunctionId> NameTable;

  /// CSNameTable is used to save full context vectors. It is the backing buffer
  /// for SampleContextFrames.
+8 −8
Original line number Diff line number Diff line
@@ -196,20 +196,20 @@ public:
  std::error_code writeSample(const FunctionSamples &S) override;

protected:
  virtual MapVector<StringRef, uint32_t> &getNameTable() { return NameTable; }
  virtual MapVector<FunctionId, uint32_t> &getNameTable() { return NameTable; }
  virtual std::error_code writeMagicIdent(SampleProfileFormat Format);
  virtual std::error_code writeNameTable();
  std::error_code writeHeader(const SampleProfileMap &ProfileMap) override;
  std::error_code writeSummary();
  virtual std::error_code writeContextIdx(const SampleContext &Context);
  std::error_code writeNameIdx(StringRef FName);
  std::error_code writeNameIdx(FunctionId FName);
  std::error_code writeBody(const FunctionSamples &S);
  inline void stablizeNameTable(MapVector<StringRef, uint32_t> &NameTable,
                                std::set<StringRef> &V);
  inline void stablizeNameTable(MapVector<FunctionId, uint32_t> &NameTable,
                                std::set<FunctionId> &V);
  
  MapVector<StringRef, uint32_t> NameTable;
  MapVector<FunctionId, uint32_t> NameTable;
  
  void addName(StringRef FName);
  void addName(FunctionId FName);
  virtual void addContext(const SampleContext &Context);
  void addNames(const FunctionSamples &S);

Loading