2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #include <folly/CppAttributes.h>
24 #ifndef FOLLY_STRING_H_
25 #error This file may only be included from String.h
31 // Map from character code to value of one-character escape sequence
32 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33 // an octal escape sequence, or 'P' if the character is printable and
34 // should be printed as is.
35 extern const char cEscapeTable[];
38 template <class String>
39 void cEscape(StringPiece str, String& out) {
42 out.reserve(out.size() + str.size());
44 auto last = p; // last regular character
45 // We advance over runs of regular characters (printable, not double-quote or
46 // backslash) and copy them in one go; this is faster than calling push_back
48 while (p != str.end()) {
50 unsigned char v = static_cast<unsigned char>(c);
51 char e = detail::cEscapeTable[v];
52 if (e == 'P') { // printable
54 } else if (e == 'O') { // octal
55 out.append(&*last, size_t(p - last));
56 esc[1] = '0' + ((v >> 6) & 7);
57 esc[2] = '0' + ((v >> 3) & 7);
58 esc[3] = '0' + (v & 7);
62 } else { // special 1-character escape
63 out.append(&*last, size_t(p - last));
70 out.append(&*last, size_t(p - last));
74 // Map from the character code of the character following a backslash to
75 // the unescaped character if a valid one-character escape sequence
76 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77 // octal escape sequence, 'X' if this is the first character of a
78 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79 extern const char cUnescapeTable[];
81 // Map from the character code to the hex value, or 16 if invalid hex char.
82 extern const unsigned char hexTable[];
85 template <class String>
86 void cUnescape(StringPiece str, String& out, bool strict) {
87 out.reserve(out.size() + str.size());
89 auto last = p; // last regular character (not part of an escape sequence)
90 // We advance over runs of regular characters (not backslash) and copy them
91 // in one go; this is faster than calling push_back repeatedly.
92 while (p != str.end()) {
94 if (c != '\\') { // normal case
98 out.append(&*last, p - last);
99 if (p == str.end()) { // backslash at end of string
101 throw std::invalid_argument("incomplete escape sequence");
108 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
109 if (e == 'O') { // octal
110 unsigned char val = 0;
111 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
113 val = (val << 3) | (*p - '0');
117 } else if (e == 'X') { // hex
119 if (p == str.end()) { // \x at end of string
121 throw std::invalid_argument("incomplete hex escape sequence");
127 unsigned char val = 0;
129 for (; (p != str.end() &&
130 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
132 val = (val << 4) | h;
136 } else if (e == 'I') { // invalid
138 throw std::invalid_argument("invalid escape sequence");
144 } else { // standard escape sequence, \' etc
150 out.append(&*last, p - last);
154 // Map from character code to escape mode:
157 // 2 = pass through in PATH mode
158 // 3 = space, replace with '+' in QUERY mode
159 // 4 = percent-encode
160 extern const unsigned char uriEscapeTable[];
161 } // namespace detail
163 template <class String>
164 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
165 static const char hexValues[] = "0123456789abcdef";
168 // Preallocate assuming that 25% of the input string will be escaped
169 out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
170 auto p = str.begin();
171 auto last = p; // last regular character
172 // We advance over runs of passthrough characters and copy them in one go;
173 // this is faster than calling push_back repeatedly.
174 unsigned char minEncode = static_cast<unsigned char>(mode);
175 while (p != str.end()) {
177 unsigned char v = static_cast<unsigned char>(c);
178 unsigned char discriminator = detail::uriEscapeTable[v];
179 if (LIKELY(discriminator <= minEncode)) {
181 } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
182 out.append(&*last, size_t(p - last));
187 out.append(&*last, size_t(p - last));
188 esc[1] = hexValues[v >> 4];
189 esc[2] = hexValues[v & 0x0f];
195 out.append(&*last, size_t(p - last));
198 template <class String>
199 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
200 out.reserve(out.size() + str.size());
201 auto p = str.begin();
203 // We advance over runs of passthrough characters and copy them in one go;
204 // this is faster than calling push_back repeatedly.
205 while (p != str.end()) {
210 if (UNLIKELY(std::distance(p, str.end()) < 3)) {
211 throw std::invalid_argument("incomplete percent encode sequence");
213 auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
214 auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
215 if (UNLIKELY(h1 == 16 || h2 == 16)) {
216 throw std::invalid_argument("invalid percent encode sequence");
218 out.append(&*last, size_t(p - last));
219 out.push_back((h1 << 4) | h2);
225 if (mode == UriEscapeMode::QUERY) {
226 out.append(&*last, size_t(p - last));
239 out.append(&*last, size_t(p - last));
245 * The following functions are type-overloaded helpers for
248 inline size_t delimSize(char) { return 1; }
249 inline size_t delimSize(StringPiece s) { return s.size(); }
250 inline bool atDelim(const char* s, char c) {
253 inline bool atDelim(const char* s, StringPiece sp) {
254 return !std::memcmp(s, sp.start(), sp.size());
257 // These are used to short-circuit internalSplit() in the case of
258 // 1-character strings.
259 inline char delimFront(char c) {
260 // This one exists only for compile-time; it should never be called.
264 inline char delimFront(StringPiece s) {
265 assert(!s.empty() && s.start() != nullptr);
270 * Shared implementation for all the split() overloads.
272 * This uses some external helpers that are overloaded to let this
273 * algorithm be more performant if the deliminator is a single
274 * character instead of a whole string.
276 * @param ignoreEmpty iff true, don't copy empty segments to output
278 template<class OutStringT, class DelimT, class OutputIterator>
279 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
281 assert(sp.empty() || sp.start() != nullptr);
283 const char* s = sp.start();
284 const size_t strSize = sp.size();
285 const size_t dSize = delimSize(delim);
287 if (dSize > strSize || dSize == 0) {
288 if (!ignoreEmpty || strSize > 0) {
289 *out++ = to<OutStringT>(sp);
293 if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
294 // Call the char version because it is significantly faster.
295 return internalSplit<OutStringT>(delimFront(delim), sp, out,
299 size_t tokenStartPos = 0;
300 size_t tokenSize = 0;
301 for (size_t i = 0; i <= strSize - dSize; ++i) {
302 if (atDelim(&s[i], delim)) {
303 if (!ignoreEmpty || tokenSize > 0) {
304 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
307 tokenStartPos = i + dSize;
314 tokenSize = strSize - tokenStartPos;
315 if (!ignoreEmpty || tokenSize > 0) {
316 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
320 template<class String> StringPiece prepareDelim(const String& s) {
321 return StringPiece(s);
323 inline char prepareDelim(char c) { return c; }
325 template <bool exact, class Delim, class OutputType>
326 bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
328 exact || std::is_same<OutputType, StringPiece>::value ||
329 IsSomeString<OutputType>::value,
330 "split<false>() requires that the last argument be a string type");
331 if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
334 output = folly::to<OutputType>(input);
338 template <bool exact, class Delim, class OutputType, class... OutputTypes>
340 const Delim& delimiter,
343 OutputTypes&... outTail) {
344 size_t cut = input.find(delimiter);
345 if (UNLIKELY(cut == std::string::npos)) {
348 StringPiece head(input.begin(), input.begin() + cut);
349 StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
351 if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
352 outHead = folly::to<OutputType>(head);
360 //////////////////////////////////////////////////////////////////////
362 template<class Delim, class String, class OutputType>
363 void split(const Delim& delimiter,
365 std::vector<OutputType>& out,
367 detail::internalSplit<OutputType>(
368 detail::prepareDelim(delimiter),
370 std::back_inserter(out),
374 template<class Delim, class String, class OutputType>
375 void split(const Delim& delimiter,
377 fbvector<OutputType>& out,
379 detail::internalSplit<OutputType>(
380 detail::prepareDelim(delimiter),
382 std::back_inserter(out),
386 template<class OutputValueType, class Delim, class String,
387 class OutputIterator>
388 void splitTo(const Delim& delimiter,
392 detail::internalSplit<OutputValueType>(
393 detail::prepareDelim(delimiter),
399 template <bool exact, class Delim, class... OutputTypes>
400 typename std::enable_if<
401 AllConvertible<OutputTypes...>::value && sizeof...(OutputTypes) >= 1,
403 split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
404 return detail::splitFixed<exact>(
405 detail::prepareDelim(delimiter), input, outputs...);
411 * If a type can have its string size determined cheaply, we can more
412 * efficiently append it in a loop (see internalJoinAppend). Note that the
413 * struct need not conform to the std::string api completely (ex. does not need
414 * to implement append()).
416 template <class T> struct IsSizableString {
417 enum { value = IsSomeString<T>::value
418 || std::is_same<T, StringPiece>::value };
421 template <class Iterator>
422 struct IsSizableStringContainerIterator :
423 IsSizableString<typename std::iterator_traits<Iterator>::value_type> {
426 template <class Delim, class Iterator, class String>
427 void internalJoinAppend(Delim delimiter,
431 assert(begin != end);
432 if (std::is_same<Delim, StringPiece>::value &&
433 delimSize(delimiter) == 1) {
434 internalJoinAppend(delimFront(delimiter), begin, end, output);
437 toAppend(*begin, &output);
438 while (++begin != end) {
439 toAppend(delimiter, *begin, &output);
443 template <class Delim, class Iterator, class String>
444 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
445 internalJoin(Delim delimiter,
453 const size_t dsize = delimSize(delimiter);
455 size_t size = it->size();
456 while (++it != end) {
457 size += dsize + it->size();
459 output.reserve(size);
460 internalJoinAppend(delimiter, begin, end, output);
463 template <class Delim, class Iterator, class String>
465 std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type
466 internalJoin(Delim delimiter,
474 internalJoinAppend(delimiter, begin, end, output);
477 } // namespace detail
479 template <class Delim, class Iterator, class String>
480 void join(const Delim& delimiter,
484 detail::internalJoin(
485 detail::prepareDelim(delimiter),
491 template <class String1, class String2>
492 void backslashify(const String1& input, String2& output, bool hex_style) {
493 static const char hexValues[] = "0123456789abcdef";
495 output.reserve(3 * input.size());
496 for (unsigned char c : input) {
497 // less than space or greater than '~' are considered unprintable
498 if (c < 0x20 || c > 0x7e || c == '\\') {
499 bool hex_append = false;
500 output.push_back('\\');
504 if (c == '\r') output += 'r';
505 else if (c == '\n') output += 'n';
506 else if (c == '\t') output += 't';
507 else if (c == '\a') output += 'a';
508 else if (c == '\b') output += 'b';
509 else if (c == '\0') output += '0';
510 else if (c == '\\') output += '\\';
516 output.push_back('x');
517 output.push_back(hexValues[(c >> 4) & 0xf]);
518 output.push_back(hexValues[c & 0xf]);
526 template <class String1, class String2>
527 void humanify(const String1& input, String2& output) {
528 size_t numUnprintable = 0;
529 size_t numPrintablePrefix = 0;
530 for (unsigned char c : input) {
531 if (c < 0x20 || c > 0x7e || c == '\\') {
534 if (numUnprintable == 0) {
535 ++numPrintablePrefix;
539 // hexlify doubles a string's size; backslashify can potentially
540 // explode it by 4x. Now, the printable range of the ascii
541 // "spectrum" is around 95 out of 256 values, so a "random" binary
542 // string should be around 60% unprintable. We use a 50% hueristic
543 // here, so if a string is 60% unprintable, then we just use hex
544 // output. Otherwise we backslash.
546 // UTF8 is completely ignored; as a result, utf8 characters will
547 // likely be \x escaped (since most common glyphs fit in two bytes).
548 // This is a tradeoff of complexity/speed instead of a convenience
549 // that likely would rarely matter. Moreover, this function is more
550 // about displaying underlying bytes, not about displaying glyphs
552 if (numUnprintable == 0) {
554 } else if (5 * numUnprintable >= 3 * input.size()) {
555 // However! If we have a "meaningful" prefix of printable
556 // characters, say 20% of the string, we backslashify under the
557 // assumption viewing the prefix as ascii is worth blowing the
558 // output size up a bit.
559 if (5 * numPrintablePrefix >= input.size()) {
560 backslashify(input, output);
563 hexlify(input, output, true /* append output */);
566 backslashify(input, output);
570 template<class InputString, class OutputString>
571 bool hexlify(const InputString& input, OutputString& output,
572 bool append_output) {
573 if (!append_output) output.clear();
575 static char hexValues[] = "0123456789abcdef";
576 auto j = output.size();
577 output.resize(2 * input.size() + output.size());
578 for (size_t i = 0; i < input.size(); ++i) {
580 output[j++] = hexValues[(ch >> 4) & 0xf];
581 output[j++] = hexValues[ch & 0xf];
586 template<class InputString, class OutputString>
587 bool unhexlify(const InputString& input, OutputString& output) {
588 if (input.size() % 2 != 0) {
591 output.resize(input.size() / 2);
594 for (size_t i = 0; i < input.size(); i += 2) {
595 int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
596 int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
597 if ((highBits | lowBits) & 0x10) {
598 // One of the characters wasn't a hex digit
601 output[j++] = (highBits << 4) + lowBits;
608 * Hex-dump at most 16 bytes starting at offset from a memory area of size
609 * bytes. Return the number of bytes actually dumped.
611 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
613 } // namespace detail
615 template <class OutIt>
616 void hexDump(const void* ptr, size_t size, OutIt out) {
619 while (offset < size) {
620 offset += detail::hexDumpLine(ptr, offset, size, line);