From: Justin Bogner <mail@justinbogner.com>
Date: Fri, 1 Aug 2014 22:50:07 +0000 (+0000)
Subject: InstrProf: Allow multiple functions with the same name
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=4a6fa1390abc6018dc06892517184e33f7197b49;p=oota-llvm.git

InstrProf: Allow multiple functions with the same name

This updates the instrumentation based profiling format so that when
we have multiple functions with the same name (but different function
hashes) we keep all of them instead of rejecting the later ones.

There are a number of scenarios where this can come up where it's more
useful to keep multiple function profiles:

* Name collisions in unrelated libraries that are profiled together.
* Multiple "main" functions from multiple tools built against a common
  library.
* Combining profiles from different build configurations (ie, asserts
  and no-asserts)

The profile format now stores the number of counters between the hash
and the counts themselves, so that multiple sets of counts can be
stored. Since this is backwards incompatible, I've bumped the format
version and added some trivial logic to skip this when reading the old
format.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214585 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index b6073a44bb3..87489befc0e 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -206,12 +206,17 @@ enum class HashT : uint32_t;
 /// Trait for lookups into the on-disk hash table for the binary instrprof
 /// format.
 class InstrProfLookupTrait {
-  std::vector<uint64_t> CountBuffer;
+  std::vector<uint64_t> DataBuffer;
   IndexedInstrProf::HashT HashType;
 public:
   InstrProfLookupTrait(IndexedInstrProf::HashT HashType) : HashType(HashType) {}
 
-  typedef InstrProfRecord data_type;
+  struct data_type {
+    data_type(StringRef Name, ArrayRef<uint64_t> Data)
+        : Name(Name), Data(Data) {}
+    StringRef Name;
+    ArrayRef<uint64_t> Data;
+  };
   typedef StringRef internal_key_type;
   typedef StringRef external_key_type;
   typedef uint64_t hash_value_type;
@@ -234,25 +239,20 @@ public:
     return StringRef((const char *)D, N);
   }
 
-  InstrProfRecord ReadData(StringRef K, const unsigned char *D, offset_type N) {
-    if (N < 2 * sizeof(uint64_t) || N % sizeof(uint64_t)) {
+  data_type ReadData(StringRef K, const unsigned char *D, offset_type N) {
+    DataBuffer.clear();
+    if (N % sizeof(uint64_t))
       // The data is corrupt, don't try to read it.
-      CountBuffer.clear();
-      return InstrProfRecord("", 0, CountBuffer);
-    }
+      return data_type("", DataBuffer);
 
     using namespace support;
-
-    // The first stored value is the hash.
-    uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D);
-    // Each counter follows.
-    unsigned NumCounters = N / sizeof(uint64_t) - 1;
-    CountBuffer.clear();
-    CountBuffer.reserve(NumCounters - 1);
-    for (unsigned I = 0; I < NumCounters; ++I)
-      CountBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D));
-
-    return InstrProfRecord(K, Hash, CountBuffer);
+    // We just treat the data as opaque here. It's simpler to handle in
+    // IndexedInstrProfReader.
+    unsigned NumEntries = N / sizeof(uint64_t);
+    DataBuffer.reserve(NumEntries);
+    for (unsigned I = 0; I < NumEntries; ++I)
+      DataBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D));
+    return data_type(K, DataBuffer);
   }
 };
 typedef OnDiskIterableChainedHashTable<InstrProfLookupTrait>
@@ -267,7 +267,11 @@ private:
   std::unique_ptr<InstrProfReaderIndex> Index;
   /// Iterator over the profile data.
   InstrProfReaderIndex::data_iterator RecordIterator;
-  /// The maximal execution count among all fucntions.
+  /// Offset into our current data set.
+  size_t CurrentOffset;
+  /// The file format version of the profile data.
+  uint64_t FormatVersion;
+  /// The maximal execution count among all functions.
   uint64_t MaxFunctionCount;
 
   IndexedInstrProfReader(const IndexedInstrProfReader &) LLVM_DELETED_FUNCTION;
@@ -275,7 +279,7 @@ private:
     LLVM_DELETED_FUNCTION;
 public:
   IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), Index(nullptr) {}
+      : DataBuffer(std::move(DataBuffer)), Index(nullptr), CurrentOffset(0) {}
 
   /// Return true if the given buffer is in an indexed instrprof format.
   static bool hasFormat(const MemoryBuffer &DataBuffer);
@@ -286,7 +290,7 @@ public:
   std::error_code readNextRecord(InstrProfRecord &Record) override;
 
   /// Fill Counts with the profile data for the given function name.
-  std::error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
+  std::error_code getFunctionCounts(StringRef FuncName, uint64_t FuncHash,
                                     std::vector<uint64_t> &Counts);
   /// Return the maximum of all known function counts.
   uint64_t getMaximumFunctionCount() { return MaxFunctionCount; }
diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h
index 6e68bee30eb..93f4128c073 100644
--- a/include/llvm/ProfileData/InstrProfWriter.h
+++ b/include/llvm/ProfileData/InstrProfWriter.h
@@ -16,6 +16,7 @@
 #define LLVM_PROFILEDATA_INSTRPROF_WRITER_H_
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/DataTypes.h"
@@ -28,13 +29,13 @@ namespace llvm {
 /// Writer for instrumentation based profile data.
 class InstrProfWriter {
 public:
-  struct CounterData {
-    uint64_t Hash;
-    std::vector<uint64_t> Counts;
-  };
+  typedef SmallDenseMap<uint64_t, std::vector<uint64_t>, 1> CounterData;
 private:
   StringMap<CounterData> FunctionData;
+  uint64_t MaxFunctionCount;
 public:
+  InstrProfWriter() : MaxFunctionCount(0) {}
+
   /// Add function counts for the given function. If there are already counts
   /// for this function and the hash and number of counts match, each counter is
   /// summed.
diff --git a/lib/ProfileData/InstrProfIndexed.h b/lib/ProfileData/InstrProfIndexed.h
index 776170407bc..792863d0707 100644
--- a/lib/ProfileData/InstrProfIndexed.h
+++ b/lib/ProfileData/InstrProfIndexed.h
@@ -46,7 +46,7 @@ static inline uint64_t ComputeHash(HashT Type, StringRef K) {
 }
 
 const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81"
-const uint64_t Version = 1;
+const uint64_t Version = 2;
 const HashT HashType = HashT::MD5;
 }
 
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index e8f64614df5..5c1993766aa 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -307,8 +307,8 @@ std::error_code IndexedInstrProfReader::readHeader() {
     return error(instrprof_error::bad_magic);
 
   // Read the version.
-  uint64_t Version = endian::readNext<uint64_t, little, unaligned>(Cur);
-  if (Version != IndexedInstrProf::Version)
+  FormatVersion = endian::readNext<uint64_t, little, unaligned>(Cur);
+  if (FormatVersion > IndexedInstrProf::Version)
     return error(instrprof_error::unsupported_version);
 
   // Read the maximal function count.
@@ -331,18 +331,31 @@ std::error_code IndexedInstrProfReader::readHeader() {
 }
 
 std::error_code IndexedInstrProfReader::getFunctionCounts(
-    StringRef FuncName, uint64_t &FuncHash, std::vector<uint64_t> &Counts) {
-  const auto &Iter = Index->find(FuncName);
+    StringRef FuncName, uint64_t FuncHash, std::vector<uint64_t> &Counts) {
+  auto Iter = Index->find(FuncName);
   if (Iter == Index->end())
     return error(instrprof_error::unknown_function);
 
-  // Found it. Make sure it's valid before giving back a result.
-  const InstrProfRecord &Record = *Iter;
-  if (Record.Name.empty())
-    return error(instrprof_error::malformed);
-  FuncHash = Record.Hash;
-  Counts = Record.Counts;
-  return success();
+  // Found it. Look for counters with the right hash.
+  ArrayRef<uint64_t> Data = (*Iter).Data;
+  uint64_t NumCounts;
+  for (uint64_t I = 0, E = Data.size(); I != E; I += NumCounts) {
+    // The function hash comes first.
+    uint64_t FoundHash = Data[I++];
+    // In v1, we have at least one count. Later, we have the number of counts.
+    if (I == E)
+      return error(instrprof_error::malformed);
+    NumCounts = FormatVersion == 1 ? E - I : Data[I++];
+    // If we have more counts than data, this is bogus.
+    if (I + NumCounts > E)
+      return error(instrprof_error::malformed);
+    // Check for a match and fill the vector if there is one.
+    if (FoundHash == FuncHash) {
+      Counts = Data.slice(I, NumCounts);
+      return success();
+    }
+  }
+  return error(instrprof_error::hash_mismatch);
 }
 
 std::error_code
@@ -351,10 +364,30 @@ IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) {
   if (RecordIterator == Index->data_end())
     return error(instrprof_error::eof);
 
-  // Read the next one.
-  Record = *RecordIterator;
-  ++RecordIterator;
-  if (Record.Name.empty())
+  // Record the current function name.
+  Record.Name = (*RecordIterator).Name;
+
+  ArrayRef<uint64_t> Data = (*RecordIterator).Data;
+  // Valid data starts with a hash and either a count or the number of counts.
+  if (CurrentOffset + 1 > Data.size())
+    return error(instrprof_error::malformed);
+  // First we have a function hash.
+  Record.Hash = Data[CurrentOffset++];
+  // In version 1 we knew the number of counters implicitly, but in newer
+  // versions we store the number of counters next.
+  uint64_t NumCounts =
+      FormatVersion == 1 ? Data.size() - CurrentOffset : Data[CurrentOffset++];
+  if (CurrentOffset + NumCounts > Data.size())
     return error(instrprof_error::malformed);
+  // And finally the counts themselves.
+  Record.Counts = Data.slice(CurrentOffset, NumCounts);
+
+  // If we've exhausted this function's data, increment the record.
+  CurrentOffset += NumCounts;
+  if (CurrentOffset == Data.size()) {
+    ++RecordIterator;
+    CurrentOffset = 0;
+  }
+
   return success();
 }
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index e55c2991813..1c4a4fede28 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -45,7 +45,9 @@ public:
     offset_type N = K.size();
     LE.write<offset_type>(N);
 
-    offset_type M = (1 + V->Counts.size()) * sizeof(uint64_t);
+    offset_type M = 0;
+    for (const auto &Counts : *V)
+      M += (2 + Counts.second.size()) * sizeof(uint64_t);
     LE.write<offset_type>(M);
 
     return std::make_pair(N, M);
@@ -59,9 +61,13 @@ public:
                        offset_type) {
     using namespace llvm::support;
     endian::Writer<little> LE(Out);
-    LE.write<uint64_t>(V->Hash);
-    for (uint64_t I : V->Counts)
-      LE.write<uint64_t>(I);
+
+    for (const auto &Counts : *V) {
+      LE.write<uint64_t>(Counts.first);
+      LE.write<uint64_t>(Counts.second.size());
+      for (uint64_t I : Counts.second)
+        LE.write<uint64_t>(I);
+    }
   }
 };
 }
@@ -70,41 +76,44 @@ std::error_code
 InstrProfWriter::addFunctionCounts(StringRef FunctionName,
                                    uint64_t FunctionHash,
                                    ArrayRef<uint64_t> Counters) {
-  auto Where = FunctionData.find(FunctionName);
-  if (Where == FunctionData.end()) {
-    // If this is the first time we've seen this function, just add it.
-    auto &Data = FunctionData[FunctionName];
-    Data.Hash = FunctionHash;
-    Data.Counts = Counters;
+  auto &CounterData = FunctionData[FunctionName];
+
+  auto Where = CounterData.find(FunctionHash);
+  if (Where == CounterData.end()) {
+    // We've never seen a function with this name and hash, add it.
+    CounterData[FunctionHash] = Counters;
+    // We keep track of the max function count as we go for simplicity.
+    if (Counters[0] > MaxFunctionCount)
+      MaxFunctionCount = Counters[0];
     return instrprof_error::success;
   }
 
-  auto &Data = Where->getValue();
-  // We can only add to existing functions if they match, so we check the hash
-  // and number of counters.
-  if (Data.Hash != FunctionHash)
-    return instrprof_error::hash_mismatch;
-  if (Data.Counts.size() != Counters.size())
+  // We're updating a function we've seen before.
+  auto &FoundCounters = Where->second;
+  // If the number of counters doesn't match we either have bad data or a hash
+  // collision.
+  if (FoundCounters.size() != Counters.size())
     return instrprof_error::count_mismatch;
-  // These match, add up the counters.
+
   for (size_t I = 0, E = Counters.size(); I < E; ++I) {
-    if (Data.Counts[I] + Counters[I] < Data.Counts[I])
+    if (FoundCounters[I] + Counters[I] < FoundCounters[I])
       return instrprof_error::counter_overflow;
-    Data.Counts[I] += Counters[I];
+    FoundCounters[I] += Counters[I];
   }
+  // We keep track of the max function count as we go for simplicity.
+  if (FoundCounters[0] > MaxFunctionCount)
+    MaxFunctionCount = FoundCounters[0];
+
   return instrprof_error::success;
 }
 
 void InstrProfWriter::write(raw_fd_ostream &OS) {
   OnDiskChainedHashTableGenerator<InstrProfRecordTrait> Generator;
-  uint64_t MaxFunctionCount = 0;
 
   // Populate the hash table generator.
-  for (const auto &I : FunctionData) {
+  std::vector<uint64_t> CounterBuffer;
+  for (const auto &I : FunctionData)
     Generator.insert(I.getKey(), &I.getValue());
-    if (I.getValue().Counts[0] > MaxFunctionCount)
-      MaxFunctionCount = I.getValue().Counts[0];
-  }
 
   using namespace llvm::support;
   endian::Writer<little> LE(OS);
diff --git a/test/tools/llvm-profdata/Inputs/compat.profdata.v1 b/test/tools/llvm-profdata/Inputs/compat.profdata.v1
new file mode 100644
index 00000000000..fd17459d091
Binary files /dev/null and b/test/tools/llvm-profdata/Inputs/compat.profdata.v1 differ
diff --git a/test/tools/llvm-profdata/compat.proftext b/test/tools/llvm-profdata/compat.proftext
new file mode 100644
index 00000000000..14da3374b5e
--- /dev/null
+++ b/test/tools/llvm-profdata/compat.proftext
@@ -0,0 +1,47 @@
+# Compatibility tests for older profile format versions. These ensure
+# that we don't break compatibility with an older profile version
+# without noticing it.
+
+# The input file at %S/Inputs/compat.profdata.v1 was generated with
+# llvm-profdata merge from r214548.
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function function_count_only --counts | FileCheck %s -check-prefix=FUNC_COUNT_ONLY
+function_count_only
+0
+1
+97531
+# FUNC_COUNT_ONLY:      Hash: 0x{{0+$}}
+# FUNC_COUNT_ONLY-NEXT: Counters: 1
+# FUNC_COUNT_ONLY-NEXT: Function count: 97531
+# FUNC_COUNT_ONLY-NEXT: Block counts: []
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function "name with spaces" --counts | FileCheck %s -check-prefix=SPACES
+name with spaces
+1024
+2
+0
+0
+# SPACES:      Hash: 0x{{0+}}400
+# SPACES-NEXT: Counters: 2
+# SPACES-NEXT: Function count: 0
+# SPACES-NEXT: Block counts: [0]
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function large_numbers --counts | FileCheck %s -check-prefix=LARGENUM
+large_numbers
+4611686018427387903
+6
+2305843009213693952
+1152921504606846976
+576460752303423488
+288230376151711744
+144115188075855872
+72057594037927936
+# LARGENUM:      Hash: 0x3fffffffffffffff
+# LARGENUM-NEXT: Counters: 6
+# LARGENUM-NEXT: Function count: 2305843009213693952
+# LARGENUM-NEXT: Block counts: [1152921504606846976, 576460752303423488, 288230376151711744, 144115188075855872, 72057594037927936]
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 | FileCheck %s -check-prefix=SUMMARY
+# SUMMARY: Total functions: 3
+# SUMMARY: Maximum function count: 2305843009213693952
+# SUMMARY: Maximum internal block count: 1152921504606846976
diff --git a/test/tools/llvm-profdata/hash-mismatch.proftext b/test/tools/llvm-profdata/hash-mismatch.proftext
index e4f1a4ca431..fe0d4fb4f6b 100644
--- a/test/tools/llvm-profdata/hash-mismatch.proftext
+++ b/test/tools/llvm-profdata/hash-mismatch.proftext
@@ -1,6 +1,18 @@
-# RUN: llvm-profdata merge %s -o %t.out 2>&1 | FileCheck %s
-# CHECK: hash-mismatch.proftext: foo: Function hash mismatch
+# If we see the same function name, but with different hashes, make
+# sure we keep both.
 
+# RUN: llvm-profdata merge %s -o %t 2>&1
+# RUN: llvm-profdata show %t -all-functions -counts > %t.out
+
+# The function ordering is non-deterministic, so we need to do our
+# checks in multiple runs.
+# RUN: FileCheck -check-prefix=FOO3 -check-prefix=BOTH %s -input-file %t.out
+# RUN: FileCheck -check-prefix=FOO4 -check-prefix=BOTH %s -input-file %t.out
+
+# FOO3: Hash: 0x{{0+}}3
+# FOO3-NEXT: Counters: 3
+# FOO3-NEXT: Function count: 1
+# FOO3-NEXT: Block counts: [2, 3]
 foo
 3
 3
@@ -8,6 +20,10 @@ foo
 2
 3
 
+# FOO4: Hash: 0x{{0+}}4
+# FOO4-NEXT: Counters: 4
+# FOO4-NEXT: Function count: 11
+# FOO4-NEXT: Block counts: [22, 33, 44]
 foo
 4
 4
@@ -15,3 +31,7 @@ foo
 22
 33
 44
+
+# BOTH: Total functions: 2
+# BOTH: Maximum function count: 11
+# BOTH: Maximum internal block count: 44