folly/String-inl.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <stdexcept>
  20 #include <iterator>
  21
  22 #ifndef FOLLY_STRING_H_
  23 #error This file may only be included from String.h
  24 #endif
  25
  26 namespace folly {
  27
  28 namespace detail {
  29 // Map from character code to value of one-character escape sequence
  30 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  31 // an octal escape sequence, or 'P' if the character is printable and
  32 // should be printed as is.
  33 extern const char cEscapeTable[];
  34 }  // namespace detail
  35
  36 template <class String>
  37 void cEscape(StringPiece str, String& out) {
  38   char esc[4];
  39   esc[0] = '\\';
  40   out.reserve(out.size() + str.size());
  41   auto p = str.begin();
  42   auto last = p;  // last regular character
  43   // We advance over runs of regular characters (printable, not double-quote or
  44   // backslash) and copy them in one go; this is faster than calling push_back
  45   // repeatedly.
  46   while (p != str.end()) {
  47     char c = *p;
  48     unsigned char v = static_cast<unsigned char>(c);
  49     char e = detail::cEscapeTable[v];
  50     if (e == 'P') {  // printable
  51       ++p;
  52     } else if (e == 'O') {  // octal
  53       out.append(&*last, p - last);
  54       esc[1] = '0' + ((v >> 6) & 7);
  55       esc[2] = '0' + ((v >> 3) & 7);
  56       esc[3] = '0' + (v & 7);
  57       out.append(esc, 4);
  58       ++p;
  59       last = p;
  60     } else {  // special 1-character escape
  61       out.append(&*last, p - last);
  62       esc[1] = e;
  63       out.append(esc, 2);
  64       ++p;
  65       last = p;
  66     }
  67   }
  68   out.append(&*last, p - last);
  69 }
  70
  71 namespace detail {
  72 // Map from the character code of the character following a backslash to
  73 // the unescaped character if a valid one-character escape sequence
  74 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  75 // octal escape sequence, 'X' if this is the first character of a
  76 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  77 extern const char cUnescapeTable[];
  78
  79 // Map from the character code to the hex value, or 16 if invalid hex char.
  80 extern const unsigned char hexTable[];
  81 }  // namespace detail
  82
  83 template <class String>
  84 void cUnescape(StringPiece str, String& out, bool strict) {
  85   out.reserve(out.size() + str.size());
  86   auto p = str.begin();
  87   auto last = p;  // last regular character (not part of an escape sequence)
  88   // We advance over runs of regular characters (not backslash) and copy them
  89   // in one go; this is faster than calling push_back repeatedly.
  90   while (p != str.end()) {
  91     char c = *p;
  92     if (c != '\\') {  // normal case
  93       ++p;
  94       continue;
  95     }
  96     out.append(&*last, p - last);
  97     if (p == str.end()) {  // backslash at end of string
  98       if (strict) {
  99         throw std::invalid_argument("incomplete escape sequence");
 100       }
 101       out.push_back('\\');
 102       last = p;
 103       continue;
 104     }
 105     ++p;
 106     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
 107     if (e == 'O') {  // octal
 108       unsigned char val = 0;
 109       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
 110            ++i, ++p) {
 111         val = (val << 3) | (*p - '0');
 112       }
 113       out.push_back(val);
 114       last = p;
 115     } else if (e == 'X') {  // hex
 116       ++p;
 117       if (p == str.end()) {  // \x at end of string
 118         if (strict) {
 119           throw std::invalid_argument("incomplete hex escape sequence");
 120         }
 121         out.append("\\x");
 122         last = p;
 123         continue;
 124       }
 125       unsigned char val = 0;
 126       unsigned char h;
 127       for (; (p != str.end() &&
 128               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
 129            ++p) {
 130         val = (val << 4) | h;
 131       }
 132       out.push_back(val);
 133       last = p;
 134     } else if (e == 'I') {  // invalid
 135       if (strict) {
 136         throw std::invalid_argument("invalid escape sequence");
 137       }
 138       out.push_back('\\');
 139       out.push_back(*p);
 140       ++p;
 141       last = p;
 142     } else {  // standard escape sequence, \' etc
 143       out.push_back(e);
 144       ++p;
 145       last = p;
 146     }
 147   }
 148   out.append(&*last, p - last);
 149 }
 150
 151 namespace detail {
 152 // Map from character code to escape mode:
 153 // 0 = pass through
 154 // 1 = unused
 155 // 2 = pass through in PATH mode
 156 // 3 = space, replace with '+' in QUERY mode
 157 // 4 = percent-encode
 158 extern const unsigned char uriEscapeTable[];
 159 }  // namespace detail
 160
 161 template <class String>
 162 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
 163   static const char hexValues[] = "0123456789abcdef";
 164   char esc[3];
 165   esc[0] = '%';
 166   // Preallocate assuming that 25% of the input string will be escaped
 167   out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
 168   auto p = str.begin();
 169   auto last = p;  // last regular character
 170   // We advance over runs of passthrough characters and copy them in one go;
 171   // this is faster than calling push_back repeatedly.
 172   unsigned char minEncode = static_cast<unsigned char>(mode);
 173   while (p != str.end()) {
 174     char c = *p;
 175     unsigned char v = static_cast<unsigned char>(c);
 176     unsigned char discriminator = detail::uriEscapeTable[v];
 177     if (LIKELY(discriminator <= minEncode)) {
 178       ++p;
 179     } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
 180       out.append(&*last, p - last);
 181       out.push_back('+');
 182       ++p;
 183       last = p;
 184     } else {
 185       out.append(&*last, p - last);
 186       esc[1] = hexValues[v >> 4];
 187       esc[2] = hexValues[v & 0x0f];
 188       out.append(esc, 3);
 189       ++p;
 190       last = p;
 191     }
 192   }
 193   out.append(&*last, p - last);
 194 }
 195
 196 template <class String>
 197 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
 198   out.reserve(out.size() + str.size());
 199   auto p = str.begin();
 200   auto last = p;
 201   // We advance over runs of passthrough characters and copy them in one go;
 202   // this is faster than calling push_back repeatedly.
 203   while (p != str.end()) {
 204     char c = *p;
 205     switch (c) {
 206     case '%':
 207       {
 208         if (UNLIKELY(std::distance(p, str.end()) < 3)) {
 209           throw std::invalid_argument("incomplete percent encode sequence");
 210         }
 211         auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
 212         auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
 213         if (UNLIKELY(h1 == 16 || h2 == 16)) {
 214           throw std::invalid_argument("invalid percent encode sequence");
 215         }
 216         out.append(&*last, p - last);
 217         out.push_back((h1 << 4) | h2);
 218         p += 3;
 219         last = p;
 220         break;
 221       }
 222     case '+':
 223       if (mode == UriEscapeMode::QUERY) {
 224         out.append(&*last, p - last);
 225         out.push_back(' ');
 226         ++p;
 227         last = p;
 228         break;
 229       }
 230       // else fallthrough
 231     default:
 232       ++p;
 233       break;
 234     }
 235   }
 236   out.append(&*last, p - last);
 237 }
 238
 239 namespace detail {
 240
 241 /*
 242  * The following functions are type-overloaded helpers for
 243  * internalSplit().
 244  */
 245 inline size_t delimSize(char)          { return 1; }
 246 inline size_t delimSize(StringPiece s) { return s.size(); }
 247 inline bool atDelim(const char* s, char c) {
 248  return *s == c;
 249 }
 250 inline bool atDelim(const char* s, StringPiece sp) {
 251   return !std::memcmp(s, sp.start(), sp.size());
 252 }
 253
 254 // These are used to short-circuit internalSplit() in the case of
 255 // 1-character strings.
 256 inline char delimFront(char c) {
 257   // This one exists only for compile-time; it should never be called.
 258   std::abort();
 259   return c;
 260 }
 261 inline char delimFront(StringPiece s) {
 262   assert(!s.empty() && s.start() != nullptr);
 263   return *s.start();
 264 }
 265
 266 /*
 267  * These output conversion templates allow us to support multiple
 268  * output string types, even when we are using an arbitrary
 269  * OutputIterator.
 270  */
 271 template<class OutStringT> struct OutputConverter {};
 272
 273 template<> struct OutputConverter<std::string> {
 274   std::string operator()(StringPiece sp) const {
 275     return sp.toString();
 276   }
 277 };
 278
 279 template<> struct OutputConverter<fbstring> {
 280   fbstring operator()(StringPiece sp) const {
 281     return sp.toFbstring();
 282   }
 283 };
 284
 285 template<> struct OutputConverter<StringPiece> {
 286   StringPiece operator()(StringPiece sp) const { return sp; }
 287 };
 288
 289 /*
 290  * Shared implementation for all the split() overloads.
 291  *
 292  * This uses some external helpers that are overloaded to let this
 293  * algorithm be more performant if the deliminator is a single
 294  * character instead of a whole string.
 295  *
 296  * @param ignoreEmpty iff true, don't copy empty segments to output
 297  */
 298 template<class OutStringT, class DelimT, class OutputIterator>
 299 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
 300     bool ignoreEmpty) {
 301   assert(sp.empty() || sp.start() != nullptr);
 302
 303   const char* s = sp.start();
 304   const size_t strSize = sp.size();
 305   const size_t dSize = delimSize(delim);
 306
 307   OutputConverter<OutStringT> conv;
 308
 309   if (dSize > strSize || dSize == 0) {
 310     if (!ignoreEmpty || strSize > 0) {
 311       *out++ = conv(sp);
 312     }
 313     return;
 314   }
 315   if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
 316     // Call the char version because it is significantly faster.
 317     return internalSplit<OutStringT>(delimFront(delim), sp, out,
 318       ignoreEmpty);
 319   }
 320
 321   size_t tokenStartPos = 0;
 322   size_t tokenSize = 0;
 323   for (size_t i = 0; i <= strSize - dSize; ++i) {
 324     if (atDelim(&s[i], delim)) {
 325       if (!ignoreEmpty || tokenSize > 0) {
 326         *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 327       }
 328
 329       tokenStartPos = i + dSize;
 330       tokenSize = 0;
 331       i += dSize - 1;
 332     } else {
 333       ++tokenSize;
 334     }
 335   }
 336   tokenSize = strSize - tokenStartPos;
 337   if (!ignoreEmpty || tokenSize > 0) {
 338     *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 339   }
 340 }
 341
 342 template<class String> StringPiece prepareDelim(const String& s) {
 343   return StringPiece(s);
 344 }
 345 inline char prepareDelim(char c) { return c; }
 346
 347 template <class Dst>
 348 struct convertTo {
 349   template <class Src>
 350   static Dst from(const Src& src) { return folly::to<Dst>(src); }
 351   static Dst from(const Dst& src) { return src; }
 352 };
 353
 354 template<bool exact,
 355          class Delim,
 356          class OutputType>
 357 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 358 splitFixed(const Delim& delimiter,
 359            StringPiece input,
 360            OutputType& out) {
 361   if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
 362     return false;
 363   }
 364   out = convertTo<OutputType>::from(input);
 365   return true;
 366 }
 367
 368 template<bool exact,
 369          class Delim,
 370          class OutputType,
 371          class... OutputTypes>
 372 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 373 splitFixed(const Delim& delimiter,
 374            StringPiece input,
 375            OutputType& outHead,
 376            OutputTypes&... outTail) {
 377   size_t cut = input.find(delimiter);
 378   if (UNLIKELY(cut == std::string::npos)) {
 379     return false;
 380   }
 381   StringPiece head(input.begin(), input.begin() + cut);
 382   StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
 383                    input.end());
 384   if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
 385     outHead = convertTo<OutputType>::from(head);
 386     return true;
 387   }
 388   return false;
 389 }
 390
 391 }
 392
 393 //////////////////////////////////////////////////////////////////////
 394
 395 template<class Delim, class String, class OutputType>
 396 void split(const Delim& delimiter,
 397            const String& input,
 398            std::vector<OutputType>& out,
 399            bool ignoreEmpty) {
 400   detail::internalSplit<OutputType>(
 401     detail::prepareDelim(delimiter),
 402     StringPiece(input),
 403     std::back_inserter(out),
 404     ignoreEmpty);
 405 }
 406
 407 template<class Delim, class String, class OutputType>
 408 void split(const Delim& delimiter,
 409            const String& input,
 410            fbvector<OutputType>& out,
 411            bool ignoreEmpty) {
 412   detail::internalSplit<OutputType>(
 413     detail::prepareDelim(delimiter),
 414     StringPiece(input),
 415     std::back_inserter(out),
 416     ignoreEmpty);
 417 }
 418
 419 template<class OutputValueType, class Delim, class String,
 420          class OutputIterator>
 421 void splitTo(const Delim& delimiter,
 422              const String& input,
 423              OutputIterator out,
 424              bool ignoreEmpty) {
 425   detail::internalSplit<OutputValueType>(
 426     detail::prepareDelim(delimiter),
 427     StringPiece(input),
 428     out,
 429     ignoreEmpty);
 430 }
 431
 432 template<bool exact,
 433          class Delim,
 434          class OutputType,
 435          class... OutputTypes>
 436 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 437 split(const Delim& delimiter,
 438       StringPiece input,
 439       OutputType& outHead,
 440       OutputTypes&... outTail) {
 441   return detail::splitFixed<exact>(
 442     detail::prepareDelim(delimiter),
 443     input,
 444     outHead,
 445     outTail...);
 446 }
 447
 448 namespace detail {
 449
 450 /*
 451  * If a type can have its string size determined cheaply, we can more
 452  * efficiently append it in a loop (see internalJoinAppend). Note that the
 453  * struct need not conform to the std::string api completely (ex. does not need
 454  * to implement append()).
 455  */
 456 template <class T> struct IsSizableString {
 457   enum { value = IsSomeString<T>::value
 458          || std::is_same<T, StringPiece>::value };
 459 };
 460
 461 template <class Iterator>
 462 struct IsSizableStringContainerIterator :
 463   IsSizableString<typename std::iterator_traits<Iterator>::value_type> {
 464 };
 465
 466 template <class Delim, class Iterator, class String>
 467 void internalJoinAppend(Delim delimiter,
 468                         Iterator begin,
 469                         Iterator end,
 470                         String& output) {
 471   assert(begin != end);
 472   if (std::is_same<Delim, StringPiece>::value &&
 473       delimSize(delimiter) == 1) {
 474     internalJoinAppend(delimFront(delimiter), begin, end, output);
 475     return;
 476   }
 477   toAppend(*begin, &output);
 478   while (++begin != end) {
 479     toAppend(delimiter, *begin, &output);
 480   }
 481 }
 482
 483 template <class Delim, class Iterator, class String>
 484 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
 485 internalJoin(Delim delimiter,
 486              Iterator begin,
 487              Iterator end,
 488              String& output) {
 489   output.clear();
 490   if (begin == end) {
 491     return;
 492   }
 493   const size_t dsize = delimSize(delimiter);
 494   Iterator it = begin;
 495   size_t size = it->size();
 496   while (++it != end) {
 497     size += dsize + it->size();
 498   }
 499   output.reserve(size);
 500   internalJoinAppend(delimiter, begin, end, output);
 501 }
 502
 503 template <class Delim, class Iterator, class String>
 504 typename
 505 std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type
 506 internalJoin(Delim delimiter,
 507              Iterator begin,
 508              Iterator end,
 509              String& output) {
 510   output.clear();
 511   if (begin == end) {
 512     return;
 513   }
 514   internalJoinAppend(delimiter, begin, end, output);
 515 }
 516
 517 }  // namespace detail
 518
 519 template <class Delim, class Iterator, class String>
 520 void join(const Delim& delimiter,
 521           Iterator begin,
 522           Iterator end,
 523           String& output) {
 524   detail::internalJoin(
 525     detail::prepareDelim(delimiter),
 526     begin,
 527     end,
 528     output);
 529 }
 530
 531 template <class String1, class String2>
 532 void backslashify(const String1& input, String2& output, bool hex_style) {
 533   static const char hexValues[] = "0123456789abcdef";
 534   output.clear();
 535   output.reserve(3 * input.size());
 536   for (unsigned char c : input) {
 537     // less than space or greater than '~' are considered unprintable
 538     if (c < 0x20 || c > 0x7e || c == '\\') {
 539       bool hex_append = false;
 540       output.push_back('\\');
 541       if (hex_style) {
 542         hex_append = true;
 543       } else {
 544         if (c == '\r') output += 'r';
 545         else if (c == '\n') output += 'n';
 546         else if (c == '\t') output += 't';
 547         else if (c == '\a') output += 'a';
 548         else if (c == '\b') output += 'b';
 549         else if (c == '\0') output += '0';
 550         else if (c == '\\') output += '\\';
 551         else {
 552           hex_append = true;
 553         }
 554       }
 555       if (hex_append) {
 556         output.push_back('x');
 557         output.push_back(hexValues[(c >> 4) & 0xf]);
 558         output.push_back(hexValues[c & 0xf]);
 559       }
 560     } else {
 561       output += c;
 562     }
 563   }
 564 }
 565
 566 template <class String1, class String2>
 567 void humanify(const String1& input, String2& output) {
 568   size_t numUnprintable = 0;
 569   size_t numPrintablePrefix = 0;
 570   for (unsigned char c : input) {
 571     if (c < 0x20 || c > 0x7e || c == '\\') {
 572       ++numUnprintable;
 573     }
 574     if (numUnprintable == 0) {
 575       ++numPrintablePrefix;
 576     }
 577   }
 578
 579   // hexlify doubles a string's size; backslashify can potentially
 580   // explode it by 4x.  Now, the printable range of the ascii
 581   // "spectrum" is around 95 out of 256 values, so a "random" binary
 582   // string should be around 60% unprintable.  We use a 50% hueristic
 583   // here, so if a string is 60% unprintable, then we just use hex
 584   // output.  Otherwise we backslash.
 585   //
 586   // UTF8 is completely ignored; as a result, utf8 characters will
 587   // likely be \x escaped (since most common glyphs fit in two bytes).
 588   // This is a tradeoff of complexity/speed instead of a convenience
 589   // that likely would rarely matter.  Moreover, this function is more
 590   // about displaying underlying bytes, not about displaying glyphs
 591   // from languages.
 592   if (numUnprintable == 0) {
 593     output = input;
 594   } else if (5 * numUnprintable >= 3 * input.size()) {
 595     // However!  If we have a "meaningful" prefix of printable
 596     // characters, say 20% of the string, we backslashify under the
 597     // assumption viewing the prefix as ascii is worth blowing the
 598     // output size up a bit.
 599     if (5 * numPrintablePrefix >= input.size()) {
 600       backslashify(input, output);
 601     } else {
 602       output = "0x";
 603       hexlify(input, output, true /* append output */);
 604     }
 605   } else {
 606     backslashify(input, output);
 607   }
 608 }
 609
 610 template<class InputString, class OutputString>
 611 bool hexlify(const InputString& input, OutputString& output,
 612              bool append_output) {
 613   if (!append_output) output.clear();
 614
 615   static char hexValues[] = "0123456789abcdef";
 616   auto j = output.size();
 617   output.resize(2 * input.size() + output.size());
 618   for (size_t i = 0; i < input.size(); ++i) {
 619     int ch = input[i];
 620     output[j++] = hexValues[(ch >> 4) & 0xf];
 621     output[j++] = hexValues[ch & 0xf];
 622   }
 623   return true;
 624 }
 625
 626 template<class InputString, class OutputString>
 627 bool unhexlify(const InputString& input, OutputString& output) {
 628   if (input.size() % 2 != 0) {
 629     return false;
 630   }
 631   output.resize(input.size() / 2);
 632   int j = 0;
 633   auto unhex = [](char c) -> int {
 634     return c >= '0' && c <= '9' ? c - '0' :
 635            c >= 'A' && c <= 'F' ? c - 'A' + 10 :
 636            c >= 'a' && c <= 'f' ? c - 'a' + 10 :
 637            -1;
 638   };
 639
 640   for (size_t i = 0; i < input.size(); i += 2) {
 641     int highBits = unhex(input[i]);
 642     int lowBits = unhex(input[i + 1]);
 643     if (highBits < 0 || lowBits < 0) {
 644       return false;
 645     }
 646     output[j++] = (highBits << 4) + lowBits;
 647   }
 648   return true;
 649 }
 650
 651 namespace detail {
 652 /**
 653  * Hex-dump at most 16 bytes starting at offset from a memory area of size
 654  * bytes.  Return the number of bytes actually dumped.
 655  */
 656 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
 657                    std::string& line);
 658 }  // namespace detail
 659
 660 template <class OutIt>
 661 void hexDump(const void* ptr, size_t size, OutIt out) {
 662   size_t offset = 0;
 663   std::string line;
 664   while (offset < size) {
 665     offset += detail::hexDumpLine(ptr, offset, size, line);
 666     *out++ = line;
 667   }
 668 }
 669
 670 }  // namespace folly