folly/String-inl.h

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_STRING_INL_H_
  18 #define FOLLY_STRING_INL_H_
  19
  20 #include <stdexcept>
  21 #include <iterator>
  22
  23 #ifndef FOLLY_BASE_STRING_H_
  24 #error This file may only be included from String.h
  25 #endif
  26
  27 namespace folly {
  28
  29 namespace detail {
  30 // Map from character code to value of one-character escape sequence
  31 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  32 // an octal escape sequence, or 'P' if the character is printable and
  33 // should be printed as is.
  34 extern const char cEscapeTable[];
  35 }  // namespace detail
  36
  37 template <class String>
  38 void cEscape(StringPiece str, String& out) {
  39   char esc[4];
  40   esc[0] = '\\';
  41   out.reserve(out.size() + str.size());
  42   auto p = str.begin();
  43   auto last = p;  // last regular character
  44   // We advance over runs of regular characters (printable, not double-quote or
  45   // backslash) and copy them in one go; this is faster than calling push_back
  46   // repeatedly.
  47   while (p != str.end()) {
  48     char c = *p;
  49     unsigned char v = static_cast<unsigned char>(c);
  50     char e = detail::cEscapeTable[v];
  51     if (e == 'P') {  // printable
  52       ++p;
  53     } else if (e == 'O') {  // octal
  54       out.append(&*last, p - last);
  55       esc[1] = '0' + ((v >> 6) & 7);
  56       esc[2] = '0' + ((v >> 3) & 7);
  57       esc[3] = '0' + (v & 7);
  58       out.append(esc, 4);
  59       ++p;
  60       last = p;
  61     } else {  // special 1-character escape
  62       out.append(&*last, p - last);
  63       esc[1] = e;
  64       out.append(esc, 2);
  65       ++p;
  66       last = p;
  67     }
  68   }
  69   out.append(&*last, p - last);
  70 }
  71
  72 namespace detail {
  73 // Map from the character code of the character following a backslash to
  74 // the unescaped character if a valid one-character escape sequence
  75 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  76 // octal escape sequence, 'X' if this is the first character of a
  77 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  78 extern const char cUnescapeTable[];
  79
  80 // Map from the character code to the hex value, or 16 if invalid hex char.
  81 extern const unsigned char hexTable[];
  82 }  // namespace detail
  83
  84 template <class String>
  85 void cUnescape(StringPiece str, String& out, bool strict) {
  86   out.reserve(out.size() + str.size());
  87   auto p = str.begin();
  88   auto last = p;  // last regular character (not part of an escape sequence)
  89   // We advance over runs of regular characters (not backslash) and copy them
  90   // in one go; this is faster than calling push_back repeatedly.
  91   while (p != str.end()) {
  92     char c = *p;
  93     if (c != '\\') {  // normal case
  94       ++p;
  95       continue;
  96     }
  97     out.append(&*last, p - last);
  98     if (p == str.end()) {  // backslash at end of string
  99       if (strict) {
 100         throw std::invalid_argument("incomplete escape sequence");
 101       }
 102       out.push_back('\\');
 103       last = p;
 104       continue;
 105     }
 106     ++p;
 107     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
 108     if (e == 'O') {  // octal
 109       unsigned char val = 0;
 110       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
 111            ++i, ++p) {
 112         val = (val << 3) | (*p - '0');
 113       }
 114       out.push_back(val);
 115       last = p;
 116     } else if (e == 'X') {  // hex
 117       ++p;
 118       if (p == str.end()) {  // \x at end of string
 119         if (strict) {
 120           throw std::invalid_argument("incomplete hex escape sequence");
 121         }
 122         out.append("\\x");
 123         last = p;
 124         continue;
 125       }
 126       unsigned char val = 0;
 127       unsigned char h;
 128       for (; (p != str.end() &&
 129               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
 130            ++p) {
 131         val = (val << 4) | h;
 132       }
 133       out.push_back(val);
 134       last = p;
 135     } else if (e == 'I') {  // invalid
 136       if (strict) {
 137         throw std::invalid_argument("invalid escape sequence");
 138       }
 139       out.push_back('\\');
 140       out.push_back(*p);
 141       ++p;
 142       last = p;
 143     } else {  // standard escape sequence, \' etc
 144       out.push_back(e);
 145       ++p;
 146       last = p;
 147     }
 148   }
 149   out.append(&*last, p - last);
 150 }
 151
 152 namespace detail {
 153
 154 /*
 155  * The following functions are type-overloaded helpers for
 156  * internalSplit().
 157  */
 158 inline size_t delimSize(char)          { return 1; }
 159 inline size_t delimSize(StringPiece s) { return s.size(); }
 160 inline bool atDelim(const char* s, char c) {
 161  return *s == c;
 162 }
 163 inline bool atDelim(const char* s, StringPiece sp) {
 164   return !std::memcmp(s, sp.start(), sp.size());
 165 }
 166
 167 // These are used to short-circuit internalSplit() in the case of
 168 // 1-character strings.
 169 inline char delimFront(char c) {
 170   // This one exists only for compile-time; it should never be called.
 171   std::abort();
 172   return c;
 173 }
 174 inline char delimFront(StringPiece s) {
 175   assert(!s.empty() && s.start() != nullptr);
 176   return *s.start();
 177 }
 178
 179 /*
 180  * These output conversion templates allow us to support multiple
 181  * output string types, even when we are using an arbitrary
 182  * OutputIterator.
 183  */
 184 template<class OutStringT> struct OutputConverter {};
 185
 186 template<> struct OutputConverter<std::string> {
 187   std::string operator()(StringPiece sp) const {
 188     return sp.toString();
 189   }
 190 };
 191
 192 template<> struct OutputConverter<fbstring> {
 193   fbstring operator()(StringPiece sp) const {
 194     return sp.toFbstring();
 195   }
 196 };
 197
 198 template<> struct OutputConverter<StringPiece> {
 199   StringPiece operator()(StringPiece sp) const { return sp; }
 200 };
 201
 202 /*
 203  * Shared implementation for all the split() overloads.
 204  *
 205  * This uses some external helpers that are overloaded to let this
 206  * algorithm be more performant if the deliminator is a single
 207  * character instead of a whole string.
 208  *
 209  * @param ignoreEmpty iff true, don't copy empty segments to output
 210  */
 211 template<class OutStringT, class DelimT, class OutputIterator>
 212 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
 213     bool ignoreEmpty) {
 214   assert(sp.start() != nullptr);
 215
 216   const char* s = sp.start();
 217   const size_t strSize = sp.size();
 218   const size_t dSize = delimSize(delim);
 219
 220   OutputConverter<OutStringT> conv;
 221
 222   if (dSize > strSize || dSize == 0) {
 223     if (!ignoreEmpty || strSize > 0) {
 224       *out++ = conv(sp);
 225     }
 226     return;
 227   }
 228   if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
 229     // Call the char version because it is significantly faster.
 230     return internalSplit<OutStringT>(delimFront(delim), sp, out,
 231       ignoreEmpty);
 232   }
 233
 234   int tokenStartPos = 0;
 235   int tokenSize = 0;
 236   for (int i = 0; i <= strSize - dSize; ++i) {
 237     if (atDelim(&s[i], delim)) {
 238       if (!ignoreEmpty || tokenSize > 0) {
 239         *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 240       }
 241
 242       tokenStartPos = i + dSize;
 243       tokenSize = 0;
 244       i += dSize - 1;
 245     } else {
 246       ++tokenSize;
 247     }
 248   }
 249
 250   if (!ignoreEmpty || tokenSize > 0) {
 251     tokenSize = strSize - tokenStartPos;
 252     *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 253   }
 254 }
 255
 256 template<class String> StringPiece prepareDelim(const String& s) {
 257   return StringPiece(s);
 258 }
 259 inline char prepareDelim(char c) { return c; }
 260
 261 }
 262
 263 //////////////////////////////////////////////////////////////////////
 264
 265 template<class Delim, class String, class OutputType>
 266 void split(const Delim& delimiter,
 267            const String& input,
 268            std::vector<OutputType>& out,
 269            bool ignoreEmpty) {
 270   detail::internalSplit<OutputType>(
 271     detail::prepareDelim(delimiter),
 272     StringPiece(input),
 273     std::back_inserter(out),
 274     ignoreEmpty);
 275 }
 276
 277 template<class Delim, class String, class OutputType>
 278 void split(const Delim& delimiter,
 279            const String& input,
 280            fbvector<OutputType>& out,
 281            bool ignoreEmpty) {
 282   detail::internalSplit<OutputType>(
 283     detail::prepareDelim(delimiter),
 284     StringPiece(input),
 285     std::back_inserter(out),
 286     ignoreEmpty);
 287 }
 288
 289 template<class OutputValueType, class Delim, class String,
 290          class OutputIterator>
 291 void splitTo(const Delim& delimiter,
 292              const String& input,
 293              OutputIterator out,
 294              bool ignoreEmpty) {
 295   detail::internalSplit<OutputValueType>(
 296     detail::prepareDelim(delimiter),
 297     StringPiece(input),
 298     out,
 299     ignoreEmpty);
 300 }
 301
 302 namespace detail {
 303
 304 template <class Iterator>
 305 struct IsStringContainerIterator :
 306   IsSomeString<typename std::iterator_traits<Iterator>::value_type> {
 307 };
 308
 309 template <class Delim, class Iterator, class String>
 310 void internalJoinAppend(Delim delimiter,
 311                         Iterator begin,
 312                         Iterator end,
 313                         String& output) {
 314   assert(begin != end);
 315   if (std::is_same<Delim, StringPiece>::value &&
 316       delimSize(delimiter) == 1) {
 317     internalJoinAppend(delimFront(delimiter), begin, end, output);
 318     return;
 319   }
 320   toAppend(*begin, &output);
 321   while (++begin != end) {
 322     toAppend(delimiter, *begin, &output);
 323   }
 324 }
 325
 326 template <class Delim, class Iterator, class String>
 327 typename std::enable_if<IsStringContainerIterator<Iterator>::value>::type
 328 internalJoin(Delim delimiter,
 329              Iterator begin,
 330              Iterator end,
 331              String& output) {
 332   output.clear();
 333   if (begin == end) {
 334     return;
 335   }
 336   const size_t dsize = delimSize(delimiter);
 337   Iterator it = begin;
 338   size_t size = it->size();
 339   while (++it != end) {
 340     size += dsize + it->size();
 341   }
 342   output.reserve(size);
 343   internalJoinAppend(delimiter, begin, end, output);
 344 }
 345
 346 template <class Delim, class Iterator, class String>
 347 typename std::enable_if<!IsStringContainerIterator<Iterator>::value>::type
 348 internalJoin(Delim delimiter,
 349              Iterator begin,
 350              Iterator end,
 351              String& output) {
 352   output.clear();
 353   if (begin == end) {
 354     return;
 355   }
 356   internalJoinAppend(delimiter, begin, end, output);
 357 }
 358
 359 }  // namespace detail
 360
 361 template <class Delim, class Iterator, class String>
 362 void join(const Delim& delimiter,
 363           Iterator begin,
 364           Iterator end,
 365           String& output) {
 366   detail::internalJoin(
 367     detail::prepareDelim(delimiter),
 368     begin,
 369     end,
 370     output);
 371 }
 372
 373 template <class String1, class String2>
 374 void backslashify(const String1& input, String2& output, bool hex_style) {
 375   static const char hexValues[] = "0123456789abcdef";
 376   output.clear();
 377   output.reserve(3 * input.size());
 378   for (unsigned char c : input) {
 379     // less than space or greater than '~' are considered unprintable
 380     if (c < 0x20 || c > 0x7e || c == '\\') {
 381       bool hex_append = false;
 382       output.push_back('\\');
 383       if (hex_style) {
 384         hex_append = true;
 385       } else {
 386         if (c == '\r') output += 'r';
 387         else if (c == '\n') output += 'n';
 388         else if (c == '\t') output += 't';
 389         else if (c == '\a') output += 'a';
 390         else if (c == '\b') output += 'b';
 391         else if (c == '\0') output += '0';
 392         else if (c == '\\') output += '\\';
 393         else {
 394           hex_append = true;
 395         }
 396       }
 397       if (hex_append) {
 398         output.push_back('x');
 399         output.push_back(hexValues[(c >> 4) & 0xf]);
 400         output.push_back(hexValues[c & 0xf]);
 401       }
 402     } else {
 403       output += c;
 404     }
 405   }
 406 }
 407
 408 template <class String1, class String2>
 409 void humanify(const String1& input, String2& output) {
 410   int numUnprintable = 0;
 411   int numPrintablePrefix = 0;
 412   for (unsigned char c : input) {
 413     if (c < 0x20 || c > 0x7e || c == '\\') {
 414       ++numUnprintable;
 415     }
 416     if (numUnprintable == 0) {
 417       ++numPrintablePrefix;
 418     }
 419   }
 420
 421   // hexlify doubles a string's size; backslashify can potentially
 422   // explode it by 4x.  Now, the printable range of the ascii
 423   // "spectrum" is around 95 out of 256 values, so a "random" binary
 424   // string should be around 60% unprintable.  We use a 50% hueristic
 425   // here, so if a string is 60% unprintable, then we just use hex
 426   // output.  Otherwise we backslash.
 427   //
 428   // UTF8 is completely ignored; as a result, utf8 characters will
 429   // likely be \x escaped (since most common glyphs fit in two bytes).
 430   // This is a tradeoff of complexity/speed instead of a convenience
 431   // that likely would rarely matter.  Moreover, this function is more
 432   // about displaying underlying bytes, not about displaying glyphs
 433   // from languages.
 434   if (numUnprintable == 0) {
 435     output = input;
 436   } else if (5 * numUnprintable >= 3 * input.size()) {
 437     // However!  If we have a "meaningful" prefix of printable
 438     // characters, say 20% of the string, we backslashify under the
 439     // assumption viewing the prefix as ascii is worth blowing the
 440     // output size up a bit.
 441     if (5 * numPrintablePrefix >= input.size()) {
 442       backslashify(input, output);
 443     } else {
 444       output = "0x";
 445       hexlify(input, output, true /* append output */);
 446     }
 447   } else {
 448     backslashify(input, output);
 449   }
 450 }
 451
 452 template<class InputString, class OutputString>
 453 bool hexlify(const InputString& input, OutputString& output,
 454              bool append_output) {
 455   if (!append_output) output.clear();
 456
 457   static char hexValues[] = "0123456789abcdef";
 458   int j = output.size();
 459   output.resize(2 * input.size() + output.size());
 460   for (int i = 0; i < input.size(); ++i) {
 461     int ch = input[i];
 462     output[j++] = hexValues[(ch >> 4) & 0xf];
 463     output[j++] = hexValues[ch & 0xf];
 464   }
 465   return true;
 466 }
 467
 468 template<class InputString, class OutputString>
 469 bool unhexlify(const InputString& input, OutputString& output) {
 470   if (input.size() % 2 != 0) {
 471     return false;
 472   }
 473   output.resize(input.size() / 2);
 474   int j = 0;
 475   auto unhex = [](char c) -> int {
 476     return c >= '0' && c <= '9' ? c - '0' :
 477            c >= 'A' && c <= 'F' ? c - 'A' + 10 :
 478            c >= 'a' && c <= 'f' ? c - 'a' + 10 :
 479            -1;
 480   };
 481
 482   for (int i = 0; i < input.size(); i += 2) {
 483     int highBits = unhex(input[i]);
 484     int lowBits = unhex(input[i + 1]);
 485     if (highBits < 0 || lowBits < 0) {
 486       return false;
 487     }
 488     output[j++] = (highBits << 4) + lowBits;
 489   }
 490   return true;
 491 }
 492
 493 namespace detail {
 494 /**
 495  * Hex-dump at most 16 bytes starting at offset from a memory area of size
 496  * bytes.  Return the number of bytes actually dumped.
 497  */
 498 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
 499                    std::string& line);
 500 }  // namespace detail
 501
 502 template <class OutIt>
 503 void hexDump(const void* ptr, size_t size, OutIt out) {
 504   size_t offset = 0;
 505   std::string line;
 506   while (offset < size) {
 507     offset += detail::hexDumpLine(ptr, offset, size, line);
 508     *out++ = line;
 509   }
 510 }
 511
 512 }  // namespace folly
 513
 514 #endif /* FOLLY_STRING_INL_H_ */
 515