2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #ifndef FOLLY_STRING_H_
23 #error This file may only be included from String.h
29 // Map from character code to value of one-character escape sequence
30 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
31 // an octal escape sequence, or 'P' if the character is printable and
32 // should be printed as is.
// Only declared here (extern, unsized array); cEscape() indexes it with the
// input character converted to unsigned char.
33 extern const char cEscapeTable[];
// Appends a C-escaped rendering of `str` to `out`.
// Each input byte is classified through detail::cEscapeTable:
//   'P'  -> printable; it stays part of the pending run [last, p)
//   'O'  -> the pending run is flushed, then esc[1..3] receive the byte's
//           three octal digits (most significant first)
//   else -> the pending run is flushed; per the table's documentation the
//           table value is the one-character escape
// NOTE(review): the embedded original line numbers are not contiguous
// (37 -> 40, 50 -> 52, 56 -> 60, 61 -> 68), so the declarations of `esc`, `p`
// and `c`, the iterator advances and whatever writes `esc` into `out` are not
// visible in this listing.
36 template <class String>
37 void cEscape(StringPiece str, String& out) {
// Reserve room for at least the unescaped length on top of what `out` holds.
40 out.reserve(out.size() + str.size());
42 auto last = p; // last regular character
43 // We advance over runs of regular characters (printable, not double-quote or
44 // backslash) and copy them in one go; this is faster than calling push_back
46 while (p != str.end()) {
// Index the table with the unsigned value so bytes >= 0x80 do not produce a
// negative index.
48 unsigned char v = static_cast<unsigned char>(c);
49 char e = detail::cEscapeTable[v];
50 if (e == 'P') { // printable
52 } else if (e == 'O') { // octal
// Flush the run of regular characters that precedes this byte.
53 out.append(&*last, p - last);
// Octal digits: bits 7-6, bits 5-3, bits 2-0.
54 esc[1] = '0' + ((v >> 6) & 7);
55 esc[2] = '0' + ((v >> 3) & 7);
56 esc[3] = '0' + (v & 7);
60 } else { // special 1-character escape
61 out.append(&*last, p - last);
// Flush whatever regular characters remain after the loop.
68 out.append(&*last, p - last);
72 // Map from the character code of the character following a backslash to
73 // the unescaped character if a valid one-character escape sequence
74 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
75 // octal escape sequence, 'X' if this is the first character of a
76 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
// Only declared here (extern, unsized array); cUnescape() indexes it with the
// character converted to unsigned char.
77 extern const char cUnescapeTable[];
79 // Map from the character code to the hex value, or 16 if invalid hex char.
// Only declared here; cUnescape() and uriUnescape() treat any value < 16 as a
// valid hex digit and 16 as "not a hex digit".
80 extern const unsigned char hexTable[];
// Appends the C-unescaped form of `str` to `out`.
// Runs of non-backslash characters are copied in bulk. After a backslash the
// following character is classified through detail::cUnescapeTable:
// 'O' (octal), 'X' (hex), 'I' (invalid) or a one-character escape.
// Throws std::invalid_argument for a trailing backslash, a "\x" with nothing
// after it, or an invalid escape.
// NOTE(review): presumably the throws happen only when `strict` is true; the
// lines that would test `strict` (and the non-strict fallbacks) are not
// visible in this listing -- the embedded line numbers skip in several places
// (92 -> 96, 99 -> 106, 111 -> 115, 136 -> 142). TODO confirm.
83 template <class String>
84 void cUnescape(StringPiece str, String& out, bool strict) {
// The unescaped text is never longer than the input, so this is an upper
// bound on the bytes appended.
85 out.reserve(out.size() + str.size());
87 auto last = p; // last regular character (not part of an escape sequence)
88 // We advance over runs of regular characters (not backslash) and copy them
89 // in one go; this is faster than calling push_back repeatedly.
90 while (p != str.end()) {
92 if (c != '\\') { // normal case
// Flush the run of regular characters that precedes the backslash.
96 out.append(&*last, p - last);
97 if (p == str.end()) { // backslash at end of string
99 throw std::invalid_argument("incomplete escape sequence");
106 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
107 if (e == 'O') { // octal
// `val` is 8 bits wide: at most three octal digits are consumed and bits
// shifted past the top are discarded.
108 unsigned char val = 0;
109 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
111 val = (val << 3) | (*p - '0');
115 } else if (e == 'X') { // hex
117 if (p == str.end()) { // \x at end of string
119 throw std::invalid_argument("incomplete hex escape sequence");
// The loop condition below puts no limit on the number of hex digits; `val`
// keeps only the low 8 bits of the accumulated value.
125 unsigned char val = 0;
127 for (; (p != str.end() &&
128 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
130 val = (val << 4) | h;
134 } else if (e == 'I') { // invalid
136 throw std::invalid_argument("invalid escape sequence");
142 } else { // standard escape sequence, \' etc
// Flush whatever regular characters remain after the loop.
148 out.append(&*last, p - last);
152 // Map from character code to escape mode:
// NOTE(review): original lines 153-154 (presumably the entries describing
// modes 0 and 1) are missing from this listing.
155 // 2 = pass through in PATH mode
156 // 3 = space, replace with '+' in QUERY mode
157 // 4 = percent-encode
// Only declared here; uriEscape() compares the looked-up value against the
// numeric value of its UriEscapeMode argument.
158 extern const unsigned char uriEscapeTable[];
159 } // namespace detail
// Appends a URI-escaped copy of `str` to `out`.
// Each byte is looked up in detail::uriEscapeTable:
//   value <= numeric value of `mode` -> byte is passed through unchanged
//   value == 3 while mode is QUERY   -> handled specially (per the table's
//                                        documentation: space becomes '+')
//   otherwise                        -> esc[1..2] receive the byte's two
//                                        lowercase hex digits
// NOTE(review): the embedded original line numbers skip (163 -> 166,
// 173 -> 175, 177 -> 179, 180 -> 185, 187 -> 193), so the declarations of
// `esc` and `c`, the iterator advances and the code that writes the
// replacement text into `out` are not visible in this listing.
161 template <class String>
162 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
163 static const char hexValues[] = "0123456789abcdef";
166 // Preallocate assuming that 25% of the input string will be escaped
167 out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
168 auto p = str.begin();
169 auto last = p; // last regular character
170 // We advance over runs of passthrough characters and copy them in one go;
171 // this is faster than calling push_back repeatedly.
// The enum's numeric value doubles as the pass-through threshold.
172 unsigned char minEncode = static_cast<unsigned char>(mode);
173 while (p != str.end()) {
175 unsigned char v = static_cast<unsigned char>(c);
176 unsigned char discriminator = detail::uriEscapeTable[v];
177 if (LIKELY(discriminator <= minEncode)) {
179 } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
// Flush the pass-through run that precedes this byte.
180 out.append(&*last, p - last);
185 out.append(&*last, p - last);
// High nibble, then low nibble.
186 esc[1] = hexValues[v >> 4];
187 esc[2] = hexValues[v & 0x0f];
// Flush whatever pass-through characters remain after the loop.
193 out.append(&*last, p - last);
// Appends the URI-unescaped form of `str` to `out`.
// A three-character escape is decoded by looking its 2nd and 3rd characters
// up in detail::hexTable and appending the byte (h1 << 4) | h2.
// Throws std::invalid_argument when fewer than three characters remain at an
// escape ("incomplete percent encode sequence") or when either digit is not
// hexadecimal ("invalid percent encode sequence").
// NOTE(review): the embedded original line numbers skip (199 -> 201,
// 203 -> 208, 217 -> 223, 224 -> 236), so the declaration of `last`, the
// per-character dispatch (presumably on '%' and, in QUERY mode, '+') and the
// iterator advances are not visible in this listing. TODO confirm.
196 template <class String>
197 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
// Decoding never lengthens the text, so the input size is an upper bound.
198 out.reserve(out.size() + str.size());
199 auto p = str.begin();
201 // We advance over runs of passthrough characters and copy them in one go;
202 // this is faster than calling push_back repeatedly.
203 while (p != str.end()) {
// An escape needs the introducer plus two hex digits.
208 if (UNLIKELY(std::distance(p, str.end()) < 3)) {
209 throw std::invalid_argument("incomplete percent encode sequence");
211 auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
212 auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
// hexTable yields 16 for characters that are not hex digits.
213 if (UNLIKELY(h1 == 16 || h2 == 16)) {
214 throw std::invalid_argument("invalid percent encode sequence");
// Flush the pass-through run, then append the decoded byte.
216 out.append(&*last, p - last);
217 out.push_back((h1 << 4) | h2);
223 if (mode == UriEscapeMode::QUERY) {
224 out.append(&*last, p - last);
// Flush whatever pass-through characters remain after the loop.
236 out.append(&*last, p - last);
242 * The following functions are type-overloaded helpers for
// Length of a delimiter in characters: a single char always counts as 1.
245 inline size_t delimSize(char) { return 1; }
// A StringPiece delimiter's length is its size() (which may be 0).
246 inline size_t delimSize(StringPiece s) { return s.size(); }
// Overloads testing whether the text at `s` begins with the delimiter.
// NOTE(review): the body of the char overload (original lines 248-249) is
// missing from this listing.
247 inline bool atDelim(const char* s, char c) {
// StringPiece overload: compares sp.size() bytes, so the caller must
// guarantee that at least sp.size() bytes are readable at `s`.
250 inline bool atDelim(const char* s, StringPiece sp) {
251 return !std::memcmp(s, sp.start(), sp.size());
254 // These are used to short-circuit internalSplit() in the case of
255 // 1-character strings.
// NOTE(review): the function bodies are only partly visible here (the
// embedded line numbers skip 258-260 and 263+), so what each overload
// returns cannot be confirmed from this listing.
256 inline char delimFront(char c) {
257 // This one exists only for compile-time; it should never be called.
261 inline char delimFront(StringPiece s) {
// Precondition checked in debug builds: non-empty piece with non-null data.
262 assert(!s.empty() && s.start() != nullptr);
267 * These output conversion templates allow us to support multiple
268 * output string types, even when we are using an arbitrary
// Primary template: intentionally empty, so only the explicit
// specializations provide an operator().
271 template<class OutStringT> struct OutputConverter {};
// std::string output: converts the piece via StringPiece::toString().
273 template<> struct OutputConverter<std::string> {
274 std::string operator()(StringPiece sp) const {
275 return sp.toString();
// fbstring output: converts the piece via StringPiece::toFbstring().
279 template<> struct OutputConverter<fbstring> {
280 fbstring operator()(StringPiece sp) const {
281 return sp.toFbstring();
// StringPiece output: returns the piece unchanged, so the result refers to
// the same characters as the argument.
285 template<> struct OutputConverter<StringPiece> {
286 StringPiece operator()(StringPiece sp) const { return sp; }
290 * Shared implementation for all the split() overloads.
292 * This uses some external helpers that are overloaded to let this
293 * algorithm be more performant if the delimiter is a single
294 * character instead of a whole string.
296 * @param ignoreEmpty iff true, don't copy empty segments to output
// Scans `sp` for occurrences of `delim` and writes each token to `out`,
// converted to OutStringT through OutputConverter. Empty tokens are written
// only when `ignoreEmpty` is false.
// NOTE(review): the embedded original line numbers skip (299 -> 301,
// 310 -> 315, 317 -> 321, 326 -> 329, 329 -> 336), so the rest of the
// parameter list, the early-exit bodies and the per-character token
// bookkeeping are not visible in this listing.
298 template<class OutStringT, class DelimT, class OutputIterator>
299 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
301 assert(sp.empty() || sp.start() != nullptr);
303 const char* s = sp.start();
304 const size_t strSize = sp.size();
305 const size_t dSize = delimSize(delim);
307 OutputConverter<OutStringT> conv;
// Delimiter longer than the input, or empty: no split point can exist.
// Handling this first also keeps `strSize - dSize` in the loop bound below
// from wrapping around (both are size_t), provided this branch returns.
309 if (dSize > strSize || dSize == 0) {
310 if (!ignoreEmpty || strSize > 0) {
315 if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
316 // Call the char version because it is significantly faster.
317 return internalSplit<OutStringT>(delimFront(delim), sp, out,
321 size_t tokenStartPos = 0;
322 size_t tokenSize = 0;
// The upper bound guarantees that dSize bytes are readable at &s[i].
323 for (size_t i = 0; i <= strSize - dSize; ++i) {
324 if (atDelim(&s[i], delim)) {
325 if (!ignoreEmpty || tokenSize > 0) {
326 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
// The next token starts right after the delimiter just matched.
329 tokenStartPos = i + dSize;
// Trailing token: everything after the last delimiter.
336 tokenSize = strSize - tokenStartPos;
337 if (!ignoreEmpty || tokenSize > 0) {
338 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
// String-like delimiter: view it as a StringPiece. The result refers to the
// characters of `s`; it does not copy them (assumes StringPiece is a
// non-owning view -- TODO confirm against its definition).
342 template<class String> StringPiece prepareDelim(const String& s) {
343 return StringPiece(s);
// Single-character delimiter: passed through unchanged, so the char overloads
// of the split/join helpers are selected.
345 inline char prepareDelim(char c) { return c; }
// NOTE(review): the enclosing template's header is not visible in this
// listing; splitFixed() calls these as convertTo<OutputType>::from(...).
// Converts `src` to Dst through folly::to<Dst>.
350 static Dst from(const Src& src) { return folly::to<Dst>(src); }
// Source already has the destination type: return a copy, no conversion.
351 static Dst from(const Dst& src) { return src; }
// Base case of the fixed-arity split: one output remains, so the whole of
// `input` is converted into `out`.
// When `exact` is set and the delimiter still occurs in `input`, the visible
// condition fires first (presumably to report failure -- the branch body is
// not visible here; TODO confirm).
// NOTE(review): the template header, the remaining parameters and the return
// statements are missing from this listing (embedded line numbers skip).
357 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
358 splitFixed(const Delim& delimiter,
361 if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
364 out = convertTo<OutputType>::from(input);
// Recursive case of the fixed-arity split: cuts `input` at the first
// occurrence of the delimiter, recurses on the tail for the remaining
// outputs, and assigns the head to `outHead` only if the recursion succeeded.
// NOTE(review): the start of the template header, part of the parameter
// list, the end of the `tail` constructor call and the return statements are
// missing from this listing (embedded line numbers skip).
371 class... OutputTypes>
372 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
373 splitFixed(const Delim& delimiter,
376 OutputTypes&... outTail) {
377 size_t cut = input.find(delimiter);
// No delimiter found although more outputs remain.
378 if (UNLIKELY(cut == std::string::npos)) {
381 StringPiece head(input.begin(), input.begin() + cut);
// The tail begins just past the delimiter.
382 StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
// Recurse first; `outHead` is written only after the tail split succeeded.
384 if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
385 outHead = convertTo<OutputType>::from(head);
393 //////////////////////////////////////////////////////////////////////
// Splits into a std::vector<OutputType>: forwards to detail::internalSplit
// with the normalized delimiter and a back_inserter, so tokens are appended
// to `out` (nothing visible here clears it first).
// NOTE(review): the `input`/`ignoreEmpty` parameters and arguments are on
// lines missing from this listing (embedded line numbers skip).
395 template<class Delim, class String, class OutputType>
396 void split(const Delim& delimiter,
398 std::vector<OutputType>& out,
400 detail::internalSplit<OutputType>(
401 detail::prepareDelim(delimiter),
403 std::back_inserter(out),
// Same as the std::vector overload, for fbvector<OutputType>: forwards to
// detail::internalSplit with a back_inserter on `out`.
// NOTE(review): the `input`/`ignoreEmpty` parameters and arguments are on
// lines missing from this listing (embedded line numbers skip).
407 template<class Delim, class String, class OutputType>
408 void split(const Delim& delimiter,
410 fbvector<OutputType>& out,
412 detail::internalSplit<OutputType>(
413 detail::prepareDelim(delimiter),
415 std::back_inserter(out),
// Splits into an arbitrary output iterator; OutputValueType names the token
// type to produce and is passed to detail::internalSplit as its first
// template argument.
// NOTE(review): the remaining parameters and call arguments are on lines
// missing from this listing (embedded line numbers skip).
419 template<class OutputValueType, class Delim, class String,
420 class OutputIterator>
421 void splitTo(const Delim& delimiter,
425 detail::internalSplit<OutputValueType>(
426 detail::prepareDelim(delimiter),
// Fixed-arity split into individually named outputs: normalizes the
// delimiter and returns the result of detail::splitFixed<exact>.
// NOTE(review): the start of the template header, part of the parameter list
// and the remaining call arguments are missing from this listing.
435 class... OutputTypes>
436 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
437 split(const Delim& delimiter,
440 OutputTypes&... outTail) {
441 return detail::splitFixed<exact>(
442 detail::prepareDelim(delimiter),
451 * If a type can have its string size determined cheaply, we can more
452 * efficiently append it in a loop (see internalJoinAppend). Note that the
453 * struct need not conform to the std::string api completely (ex. does not need
454 * to implement append()).
// Trait: `value` is nonzero when T satisfies IsSomeString or is exactly
// StringPiece.
456 template <class T> struct IsSizableString {
457 enum { value = IsSomeString<T>::value
458 || std::is_same<T, StringPiece>::value };
// Trait: applies IsSizableString to the iterator's value_type; used to pick
// between the two internalJoin overloads.
461 template <class Iterator>
462 struct IsSizableStringContainerIterator :
463 IsSizableString<typename std::iterator_traits<Iterator>::value_type> {
// Appends the elements of [begin, end) to `output`, separated by `delimiter`,
// using toAppend. Requires a non-empty range (asserted).
// NOTE(review): the remaining parameters and the statement that follows the
// single-character redirect (presumably a return) are on lines missing from
// this listing (embedded line numbers skip).
466 template <class Delim, class Iterator, class String>
467 void internalJoinAppend(Delim delimiter,
471 assert(begin != end);
// A one-character StringPiece delimiter is redirected to the char overload.
472 if (std::is_same<Delim, StringPiece>::value &&
473 delimSize(delimiter) == 1) {
474 internalJoinAppend(delimFront(delimiter), begin, end, output);
// First element without a delimiter, then delimiter + element for the rest.
477 toAppend(*begin, &output);
478 while (++begin != end) {
479 toAppend(delimiter, *begin, &output);
// Overload for iterators over sizable strings: sums the element sizes plus
// one delimiter length per gap, passes that total to output.reserve(), then
// appends.
// NOTE(review): the remaining parameters, the declaration of `it` and any
// empty-range handling are on lines missing from this listing. The value
// reserved is the joined length only; it does not add output.size().
483 template <class Delim, class Iterator, class String>
484 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
485 internalJoin(Delim delimiter,
493 const size_t dsize = delimSize(delimiter);
495 size_t size = it->size();
496 while (++it != end) {
497 size += dsize + it->size();
499 output.reserve(size);
500 internalJoinAppend(delimiter, begin, end, output);
// Overload for iterators whose value type is not a sizable string: the
// visible code goes straight to internalJoinAppend without a size pre-pass.
// NOTE(review): the `typename` line, the remaining parameters and any
// empty-range handling are on lines missing from this listing.
503 template <class Delim, class Iterator, class String>
505 std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type
506 internalJoin(Delim delimiter,
514 internalJoinAppend(delimiter, begin, end, output);
517 } // namespace detail
// Public entry point: normalizes the delimiter with detail::prepareDelim and
// delegates to detail::internalJoin.
// NOTE(review): the remaining parameters and call arguments are on lines
// missing from this listing (embedded line numbers skip).
519 template <class Delim, class Iterator, class String>
520 void join(const Delim& delimiter,
524 detail::internalJoin(
525 detail::prepareDelim(delimiter),
// Writes `input` to `output` with every byte below 0x20, above 0x7e, or
// equal to '\\' replaced by a backslash escape: a short form (\r \n \t \a \b
// \0 \\) where one is listed below, or \xHH with lowercase hex digits.
// NOTE(review): the embedded original line numbers skip (533 -> 535,
// 540 -> 544, 550 -> 556, 558 -> 566), so the use of `hex_style`, the
// updates of `hex_append` and the pass-through branch for printable bytes
// are not visible in this listing.
531 template <class String1, class String2>
532 void backslashify(const String1& input, String2& output, bool hex_style) {
533 static const char hexValues[] = "0123456789abcdef";
// Capacity hint only: a \xHH escape is 4 characters per input byte, so the
// output can exceed 3 * input.size().
535 output.reserve(3 * input.size());
// Iterate as unsigned char so bytes >= 0x80 compare as > 0x7e.
536 for (unsigned char c : input) {
537 // less than space or greater than '~' are considered unprintable
538 if (c < 0x20 || c > 0x7e || c == '\\') {
539 bool hex_append = false;
540 output.push_back('\\');
544 if (c == '\r') output += 'r';
545 else if (c == '\n') output += 'n';
546 else if (c == '\t') output += 't';
547 else if (c == '\a') output += 'a';
548 else if (c == '\b') output += 'b';
549 else if (c == '\0') output += '0';
550 else if (c == '\\') output += '\\';
// Hex form: 'x' followed by the high and low nibble.
556 output.push_back('x');
557 output.push_back(hexValues[(c >> 4) & 0xf]);
558 output.push_back(hexValues[c & 0xf]);
// Picks a readable rendering of `input` based on how many of its bytes are
// unprintable (below 0x20, above 0x7e, or a backslash):
//   - at least 60% unprintable, and the leading run of printable bytes is
//     shorter than 20% of the input -> hexlify (appending to `output`)
//   - at least 60% unprintable, but with such a printable prefix
//     -> backslashify
//   - some, but fewer than 60%, unprintable -> backslashify
// NOTE(review): the embedded original line numbers skip (571 -> 574,
// 575 -> 579, 592 -> 594, 600 -> 603), so the counter increment for
// unprintable bytes, the branch taken when nothing is unprintable, and
// whatever precedes the hexlify call are not visible in this listing.
566 template <class String1, class String2>
567 void humanify(const String1& input, String2& output) {
568 size_t numUnprintable = 0;
569 size_t numPrintablePrefix = 0;
570 for (unsigned char c : input) {
571 if (c < 0x20 || c > 0x7e || c == '\\') {
// Still inside the leading run of printable bytes.
574 if (numUnprintable == 0) {
575 ++numPrintablePrefix;
579 // hexlify doubles a string's size; backslashify can potentially
580 // explode it by 4x. Now, the printable range of the ascii
581 // "spectrum" is around 95 out of 256 values, so a "random" binary
582 // string should be around 60% unprintable. We use a 50% heuristic
583 // here, so if a string is 60% unprintable, then we just use hex
584 // output. Otherwise we backslash.
// NOTE(review): the test below (5 * numUnprintable >= 3 * input.size()) is a
// 60% threshold, not 50%.
586 // UTF8 is completely ignored; as a result, utf8 characters will
587 // likely be \x escaped (since most common glyphs fit in two bytes).
588 // This is a tradeoff of complexity/speed instead of a convenience
589 // that likely would rarely matter. Moreover, this function is more
590 // about displaying underlying bytes, not about displaying glyphs
592 if (numUnprintable == 0) {
594 } else if (5 * numUnprintable >= 3 * input.size()) {
595 // However! If we have a "meaningful" prefix of printable
596 // characters, say 20% of the string, we backslashify under the
597 // assumption viewing the prefix as ascii is worth blowing the
598 // output size up a bit.
599 if (5 * numPrintablePrefix >= input.size()) {
600 backslashify(input, output);
603 hexlify(input, output, true /* append output */);
606 backslashify(input, output);
// Writes two lowercase hex digits per input element into `output`.
// When `append_output` is false, `output` is cleared first; otherwise the
// digits are written after the existing contents.
// NOTE(review): the declaration of `ch` (original line 619) and the return
// statement are missing from this listing, so the returned value cannot be
// confirmed here.
610 template<class InputString, class OutputString>
611 bool hexlify(const InputString& input, OutputString& output,
612 bool append_output) {
613 if (!append_output) output.clear();
// NOTE(review): this lookup table is never written in the visible code; it
// could be `static const char`, as in backslashify()/uriEscape().
615 static char hexValues[] = "0123456789abcdef";
// `j` is the write position: the first slot after the existing contents.
616 auto j = output.size();
// Grow once up front; the loop then fills the new slots by index.
617 output.resize(2 * input.size() + output.size());
618 for (size_t i = 0; i < input.size(); ++i) {
620 output[j++] = hexValues[(ch >> 4) & 0xf];
621 output[j++] = hexValues[ch & 0xf];
// Decodes pairs of hex digits (upper or lower case) from `input` into bytes
// of `output`, which is resized to input.size() / 2.
// An odd-length input, or a character that is not a hex digit, takes the
// failure branches below.
// NOTE(review): the bodies of the failure branches, the final arm of the
// `unhex` lambda, the declaration of `j` and the return statements are on
// lines missing from this listing; presumably failure returns false and the
// lambda yields a negative value for non-hex characters. TODO confirm.
626 template<class InputString, class OutputString>
627 bool unhexlify(const InputString& input, OutputString& output) {
// Hex text must come in digit pairs.
628 if (input.size() % 2 != 0) {
631 output.resize(input.size() / 2);
// Maps one hex digit character to its value 0..15.
633 auto unhex = [](char c) -> int {
634 return c >= '0' && c <= '9' ? c - '0' :
635 c >= 'A' && c <= 'F' ? c - 'A' + 10 :
636 c >= 'a' && c <= 'f' ? c - 'a' + 10 :
640 for (size_t i = 0; i < input.size(); i += 2) {
// i + 1 is in range because the size was checked to be even.
641 int highBits = unhex(input[i]);
642 int lowBits = unhex(input[i + 1]);
643 if (highBits < 0 || lowBits < 0) {
646 output[j++] = (highBits << 4) + lowBits;
653 * Hex-dump at most 16 bytes starting at offset from a memory area of size
654 * bytes. Return the number of bytes actually dumped.
// Declaration only; hexDump() calls this with a fourth argument `line` and
// adds the returned count to its running offset.
// NOTE(review): the final parameter line (original line 657) is missing from
// this listing.
656 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
658 } // namespace detail
660 template <class OutIt>
661 void hexDump(const void* ptr, size_t size, OutIt out) {
664 while (offset < size) {
665 offset += detail::hexDumpLine(ptr, offset, size, line);