From 6834f1d6c0293570354418aa005b0bc904a98906 Mon Sep 17 00:00:00 2001 From: Chip Turner Date: Fri, 8 Jun 2012 14:53:51 -0700 Subject: [PATCH] Move and refactor code Summary: Moves some string manipulation code like humanify into folly Moves hash-related functions into folly and removes some old includes to clean up some code Adds generic hashing for tuples, similar to pairs Updates all of the build breakages from the above Test Plan: run unit tests Reviewed By: delong.j@fb.com FB internal diff: D490478 --- folly/Hash.h | 59 ++++++++++++++++++- folly/Logging.h | 46 +++++++++++++++ folly/Makefile.am | 1 + folly/String-inl.h | 120 ++++++++++++++++++++++++++++++++++++++ folly/String.h | 68 +++++++++++++++++++++ folly/test/HashTest.cpp | 4 ++ folly/test/StringTest.cpp | 77 ++++++++++++++++++++++++ 7 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 folly/Logging.h diff --git a/folly/Hash.h b/folly/Hash.h index a52da9c6..a6e33046 100644 --- a/folly/Hash.h +++ b/folly/Hash.h @@ -17,9 +17,10 @@ #ifndef FOLLY_BASE_HASH_H_ #define FOLLY_BASE_HASH_H_ -#include #include +#include #include +#include /* * Various hashing functions. @@ -27,6 +28,40 @@ namespace folly { namespace hash { +// This is a general-purpose way to create a single hash from multiple +// hashable objects. It relies on std::hash being available for all +// relevant types and combines those hashes in an order-dependent way +// to yield a new hash. + +// Never used, but gcc demands it. +inline size_t hash_combine() { + return 0; +} + +// This is the Hash128to64 function from Google's cityhash (available +// under the MIT License). We use it to reduce multiple 64 bit hashes +// into a single hash. +inline size_t hash_128_to_64(const size_t upper, const size_t lower) { + // Murmur-inspired hashing. + const size_t kMul = 0x9ddfea08eb382d69ULL; + size_t a = (lower ^ upper) * kMul; + a ^= (a >> 47); + size_t b = (upper ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + +template +size_t hash_combine(const T& t, const Ts&... ts) { + size_t seed = std::hash()(t); + if (sizeof...(ts) == 0) { + return seed; + } + size_t remainder = hash_combine(ts...); + return hash_128_to_64(seed, remainder); +} + ////////////////////////////////////////////////////////////////////// /* @@ -240,4 +275,26 @@ template<> struct hasher { } // namespace folly +// Custom hash functions. +namespace std { + // Hash function for pairs. Requires default hash functions for both + // items in the pair. + template + class hash > { + public: + size_t operator()(const std::pair& x) const { + return folly::hash::hash_combine(x.first, x.second); + } + }; + + // Same as above, but for arbitrary tuples. + template + class hash > { + public: + size_t operator()(const Ts&... ts) const { + return folly::hash::hash_combine(ts...); + } + }; +} // namespace std + #endif diff --git a/folly/Logging.h b/folly/Logging.h new file mode 100644 index 00000000..160ad8eb --- /dev/null +++ b/folly/Logging.h @@ -0,0 +1,46 @@ +/* + * Copyright 2012 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_LOGGING_H_ +#define FOLLY_LOGGING_H_ + +#include +#include + +#ifndef FB_LOG_EVERY_MS +/** + * Issues a LOG(severity) no more often that every + * milliseconds. Example: + * + * FB_LOG_EVERY_MS(INFO, 10000) << "At least ten seconds passed" + * " since you last saw this."; + * + * The implementation uses for statements to introduce variables in + * a nice way that doesn't mess surrounding statements. + */ +#define FB_LOG_EVERY_MS(severity, milliseconds) \ + for (bool LOG_EVERY_MS_once = true; LOG_EVERY_MS_once; ) \ + for (const ::clock_t LOG_EVERY_MS_now = ::clock(); LOG_EVERY_MS_once; ) \ + for (static ::clock_t LOG_EVERY_MS_last; LOG_EVERY_MS_once; \ + LOG_EVERY_MS_once = false) \ + if (1000 * (LOG_EVERY_MS_now - LOG_EVERY_MS_last) < \ + (milliseconds) * CLOCKS_PER_SEC) {} \ + else \ + (LOG_EVERY_MS_last = LOG_EVERY_MS_now, LOG(severity)) + +#endif + +#endif // FOLLY_LOGGING_H_ diff --git a/folly/Makefile.am b/folly/Makefile.am index 68100f93..49573d0d 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -52,6 +52,7 @@ nobase_follyinclude_HEADERS = \ IntrusiveList.h \ json.h \ Likely.h \ + Logging.h \ Malloc.h \ MapUtil.h \ PackedSyncPtr.h \ diff --git a/folly/String-inl.h b/folly/String-inl.h index af4c4dcf..3829f1f8 100644 --- a/folly/String-inl.h +++ b/folly/String-inl.h @@ -298,6 +298,126 @@ void splitTo(const Delim& delimiter, ignoreEmpty); } +template +void backslashify(const String1& input, String2& output, bool hex_style) { + static const char hexValues[] = "0123456789abcdef"; + output.clear(); + output.reserve(3 * input.size()); + for (unsigned char c : input) { + // less than space or greater than '~' are considered unprintable + if (c < 0x20 || c > 0x7e || c == '\\') { + bool hex_append = false; + output.push_back('\\'); + if (hex_style) { + hex_append = true; + } else { + if (c == '\r') output += 'r'; + else if (c == '\n') output += 'n'; + else if (c == '\t') output += 't'; + else if (c == '\a') output += 'a'; + else if (c == '\b') output += 'b'; + else if (c == '\0') output += '0'; + else if (c == '\\') output += '\\'; + else { + hex_append = true; + } + } + if (hex_append) { + output.push_back('x'); + output.push_back(hexValues[(c >> 4) & 0xf]); + output.push_back(hexValues[c & 0xf]); + } + } else { + output += c; + } + } +} + +template +void humanify(const String1& input, String2& output) { + int numUnprintable = 0; + int numPrintablePrefix = 0; + for (unsigned char c : input) { + if (c < 0x20 || c > 0x7e || c == '\\') { + ++numUnprintable; + } + if (numUnprintable == 0) { + ++numPrintablePrefix; + } + } + + // hexlify doubles a string's size; backslashify can potentially + // explode it by 4x. Now, the printable range of the ascii + // "spectrum" is around 95 out of 256 values, so a "random" binary + // string should be around 60% unprintable. We use a 50% hueristic + // here, so if a string is 60% unprintable, then we just use hex + // output. Otherwise we backslash. + // + // UTF8 is completely ignored; as a result, utf8 characters will + // likely be \x escaped (since most common glyphs fit in two bytes). + // This is a tradeoff of complexity/speed instead of a convenience + // that likely would rarely matter. Moreover, this function is more + // about displaying underlying bytes, not about displaying glyphs + // from languages. + if (numUnprintable == 0) { + output = input; + } else if (5 * numUnprintable >= 3 * input.size()) { + // However! If we have a "meaningful" prefix of printable + // characters, say 20% of the string, we backslashify under the + // assumption viewing the prefix as ascii is worth blowing the + // output size up a bit. + if (5 * numPrintablePrefix >= input.size()) { + backslashify(input, output); + } else { + output = "0x"; + hexlify(input, output, true /* append output */); + } + } else { + backslashify(input, output); + } +} + +template +bool hexlify(const InputString& input, OutputString& output, + bool append_output=false) { + if (!append_output) output.clear(); + + static char hexValues[] = "0123456789abcdef"; + int j = output.size(); + output.resize(2 * input.size() + output.size()); + for (int i = 0; i < input.size(); ++i) { + int ch = input[i]; + output[j++] = hexValues[(ch >> 4) & 0xf]; + output[j++] = hexValues[ch & 0xf]; + } + return true; +} + +template +bool unhexlify(const InputString& input, OutputString& output) { + if (input.size() % 2 != 0) { + return false; + } + output.resize(input.size() / 2); + int j = 0; + auto unhex = [](char c) -> int { + return c >= '0' && c <= '9' ? c - '0' : + c >= 'A' && c <= 'F' ? c - 'A' + 10 : + c >= 'a' && c <= 'f' ? c - 'a' + 10 : + -1; + }; + + for (int i = 0; i < input.size(); i += 2) { + int highBits = unhex(input[i]); + int lowBits = unhex(input[i + 1]); + if (highBits < 0 || lowBits < 0) { + return false; + } + output[j++] = (highBits << 4) + lowBits; + } + return true; +} + namespace detail { /** * Hex-dump at most 16 bytes starting at offset from a memory area of size diff --git a/folly/String.h b/folly/String.h index 57dfe876..c193054a 100644 --- a/folly/String.h +++ b/folly/String.h @@ -128,6 +128,74 @@ void stringPrintf(std::string* out, const char* fmt, ...) std::string& stringAppendf(std::string* output, const char* format, ...) __attribute__ ((format (printf, 2, 3))); +/** + * Backslashify a string, that is, replace non-printable characters + * with C-style (but NOT C compliant) "\xHH" encoding. If hex_style + * is false, then shorthand notations like "\0" will be used instead + * of "\x00" for the most common backslash cases. + * + * There are two forms, one returning the input string, and one + * creating output in the specified output string. + * + * This is mainly intended for printing to a terminal, so it is not + * particularly optimized. + * + * Do *not* use this in situations where you expect to be able to feed + * the string to a C or C++ compiler, as there are nuances with how C + * parses such strings that lead to failures. This is for display + * purposed only. If you want a string you can embed for use in C or + * C++, use cEscape instead. This function is for display purposes + * only. + */ +template +void backslashify(const String1& input, String2& output, bool hex_style=false); + +template +String backslashify(const String& input, bool hex_style=false) { + String output; + backslashify(input, output, hex_style); + return output; +} + +/** + * Take a string and "humanify" it -- that is, make it look better. + * Since "better" is subjective, caveat emptor. The basic approach is + * to count the number of unprintable characters. If there are none, + * then the output is the input. If there are relatively few, or if + * there is a long "enough" prefix of printable characters, use + * backslashify. If it is mostly binary, then simply hex encode. + * + * This is an attempt to make a computer smart, and so likely is wrong + * most of the time. + */ +template +void humanify(const String1& input, String2& output); + +template +String humanify(const String& input) { + String output; + humanify(input, output); + return output; +} + +/** + * Same functionality as Python's binascii.hexlify. Returns true + * on successful conversion. + * + * If append_output is true, append data to the output rather than + * replace it. + */ +template +bool hexlify(const InputString& input, OutputString& output, + bool append=false); + +/** + * Same functionality as Python's binascii.unhexlify. Returns true + * on successful conversion. + */ +template +bool unhexlify(const InputString& input, OutputString& output); + /* * A pretty-printer for numbers that appends suffixes of units of the * given type. It prints 4 sig-figs of value with the most diff --git a/folly/test/HashTest.cpp b/folly/test/HashTest.cpp index 6750eab7..f06f8b84 100644 --- a/folly/test/HashTest.cpp +++ b/folly/test/HashTest.cpp @@ -135,3 +135,7 @@ TEST(Hash, hasher) { m.insert(std::make_pair(4, 5)); EXPECT_EQ(get_default(m, 4), 5); } + +TEST(Hash, hash_combine) { + EXPECT_NE(hash_combine(1, 2), hash_combine(2, 1)); +} diff --git a/folly/test/StringTest.cpp b/folly/test/StringTest.cpp index eab3912b..32de3194 100644 --- a/folly/test/StringTest.cpp +++ b/folly/test/StringTest.cpp @@ -604,6 +604,83 @@ TEST(Split, pieces_fbvector) { piecesTest(); } +TEST(String, hexlify) { + string input1 = "0123"; + string output1; + EXPECT_TRUE(hexlify(input1, output1)); + EXPECT_EQ(output1, "30313233"); + + fbstring input2 = "abcdefg"; + input2[1] = 0; + input2[3] = 0xff; + input2[5] = 0xb6; + fbstring output2; + EXPECT_TRUE(hexlify(input2, output2)); + EXPECT_EQ(output2, "610063ff65b667"); +} + +TEST(String, unhexlify) { + string input1 = "30313233"; + string output1; + EXPECT_TRUE(unhexlify(input1, output1)); + EXPECT_EQ(output1, "0123"); + + fbstring input2 = "610063ff65b667"; + fbstring output2; + EXPECT_TRUE(unhexlify(input2, output2)); + EXPECT_EQ(output2.size(), 7); + EXPECT_EQ(output2[0], 'a'); + EXPECT_EQ(output2[1], 0); + EXPECT_EQ(output2[2], 'c'); + EXPECT_EQ(output2[3] & 0xff, 0xff); + EXPECT_EQ(output2[4], 'e'); + EXPECT_EQ(output2[5] & 0xff, 0xb6); + EXPECT_EQ(output2[6], 'g'); + + string input3 = "x"; + string output3; + EXPECT_FALSE(unhexlify(input3, output3)); + + string input4 = "xy"; + string output4; + EXPECT_FALSE(unhexlify(input4, output4)); +} + +TEST(String, backslashify) { + EXPECT_EQ("abc", string("abc")); + EXPECT_EQ("abc", backslashify(string("abc"))); + EXPECT_EQ("abc\\r", backslashify(string("abc\r"))); + EXPECT_EQ("abc\\x0d", backslashify(string("abc\r"), true)); + EXPECT_EQ("\\0\\0", backslashify(string(2, '\0'))); +} + +TEST(String, humanify) { + // Simple cases; output is obvious. + EXPECT_EQ("abc", humanify(string("abc"))); + EXPECT_EQ("abc\\\\r", humanify(string("abc\\r"))); + EXPECT_EQ("0xff", humanify(string("\xff"))); + EXPECT_EQ("abc\\xff", humanify(string("abc\xff"))); + EXPECT_EQ("abc\\b", humanify(string("abc\b"))); + EXPECT_EQ("0x00", humanify(string(1, '\0'))); + EXPECT_EQ("0x0000", humanify(string(2, '\0'))); + + + // Mostly printable, so backslash! 80, 60, and 40% printable, respectively + EXPECT_EQ("aaaa\\xff", humanify(string("aaaa\xff"))); + EXPECT_EQ("aaa\\xff\\xff", humanify(string("aaa\xff\xff"))); + EXPECT_EQ("aa\\xff\\xff\\xff", humanify(string("aa\xff\xff\xff"))); + + // 20% printable, and the printable portion isn't the prefix; hexify! + EXPECT_EQ("0xff61ffffff", humanify(string("\xff" "a\xff\xff\xff"))); + + // Same as previous, except swap first two chars; prefix is + // printable and within the threshold, so backslashify. + EXPECT_EQ("a\\xff\\xff\\xff\\xff", humanify(string("a\xff\xff\xff\xff"))); + + // Just too much unprintable; hex, despite prefix. + EXPECT_EQ("0x61ffffffffff", humanify(string("a\xff\xff\xff\xff\xff"))); +} + ////////////////////////////////////////////////////////////////////// BENCHMARK(splitOnSingleChar, iters) { -- 2.34.1