2 * Copyright 2012 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef FOLLY_STRING_INL_H_
18 #define FOLLY_STRING_INL_H_
23 #ifndef FOLLY_BASE_STRING_H_
24 #error This file may only be included from String.h
30 // Map from character code to value of one-character escape sequence
31 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
32 // an octal escape sequence, or 'P' if the character is printable and
33 // should be printed as is.
34 extern const char cEscapeTable[];
37 template <class String>
38 void cEscape(StringPiece str, String& out) {
41 out.reserve(out.size() + str.size());
43 auto last = p; // last regular character
44 // We advance over runs of regular characters (printable, not double-quote or
45 // backslash) and copy them in one go; this is faster than calling push_back
47 while (p != str.end()) {
49 unsigned char v = static_cast<unsigned char>(c);
50 char e = detail::cEscapeTable[v];
51 if (e == 'P') { // printable
53 } else if (e == 'O') { // octal
54 out.append(&*last, p - last);
55 esc[1] = '0' + ((v >> 6) & 7);
56 esc[2] = '0' + ((v >> 3) & 7);
57 esc[3] = '0' + (v & 7);
61 } else { // special 1-character escape
62 out.append(&*last, p - last);
69 out.append(&*last, p - last);
73 // Map from the character code of the character following a backslash to
74 // the unescaped character if a valid one-character escape sequence
75 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
76 // octal escape sequence, 'X' if this is the first character of a
77 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
78 extern const char cUnescapeTable[];
80 // Map from the character code to the hex value, or 16 if invalid hex char.
81 extern const unsigned char hexTable[];
84 template <class String>
85 void cUnescape(StringPiece str, String& out, bool strict) {
86 out.reserve(out.size() + str.size());
88 auto last = p; // last regular character (not part of an escape sequence)
89 // We advance over runs of regular characters (not backslash) and copy them
90 // in one go; this is faster than calling push_back repeatedly.
91 while (p != str.end()) {
93 if (c != '\\') { // normal case
97 out.append(&*last, p - last);
98 if (p == str.end()) { // backslash at end of string
100 throw std::invalid_argument("incomplete escape sequence");
107 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
108 if (e == 'O') { // octal
109 unsigned char val = 0;
110 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
112 val = (val << 3) | (*p - '0');
116 } else if (e == 'X') { // hex
118 if (p == str.end()) { // \x at end of string
120 throw std::invalid_argument("incomplete hex escape sequence");
126 unsigned char val = 0;
128 for (; (p != str.end() &&
129 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
131 val = (val << 4) | h;
135 } else if (e == 'I') { // invalid
137 throw std::invalid_argument("invalid escape sequence");
143 } else { // standard escape sequence, \' etc
149 out.append(&*last, p - last);
155 * The following functions are type-overloaded helpers for
158 inline size_t delimSize(char) { return 1; }
159 inline size_t delimSize(StringPiece s) { return s.size(); }
160 inline bool atDelim(const char* s, char c) {
163 inline bool atDelim(const char* s, StringPiece sp) {
164 return !std::memcmp(s, sp.start(), sp.size());
167 // These are used to short-circuit internalSplit() in the case of
168 // 1-character strings.
169 inline char delimFront(char c) {
170 // This one exists only for compile-time; it should never be called.
174 inline char delimFront(StringPiece s) {
175 assert(!s.empty() && s.start() != nullptr);
180 * These output conversion templates allow us to support multiple
181 * output string types, even when we are using an arbitrary
184 template<class OutStringT> struct OutputConverter {};
186 template<> struct OutputConverter<std::string> {
187 std::string operator()(StringPiece sp) const {
188 return sp.toString();
192 template<> struct OutputConverter<fbstring> {
193 fbstring operator()(StringPiece sp) const {
194 return sp.toFbstring();
198 template<> struct OutputConverter<StringPiece> {
199 StringPiece operator()(StringPiece sp) const { return sp; }
203 * Shared implementation for all the split() overloads.
205 * This uses some external helpers that are overloaded to let this
206 * algorithm be more performant if the deliminator is a single
207 * character instead of a whole string.
209 * @param ignoreEmpty iff true, don't copy empty segments to output
211 template<class OutStringT, class DelimT, class OutputIterator>
212 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
214 assert(sp.start() != nullptr);
216 const char* s = sp.start();
217 const size_t strSize = sp.size();
218 const size_t dSize = delimSize(delim);
220 OutputConverter<OutStringT> conv;
222 if (dSize > strSize || dSize == 0) {
223 if (!ignoreEmpty || strSize > 0) {
228 if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
229 // Call the char version because it is significantly faster.
230 return internalSplit<OutStringT>(delimFront(delim), sp, out,
234 int tokenStartPos = 0;
236 for (int i = 0; i <= strSize - dSize; ++i) {
237 if (atDelim(&s[i], delim)) {
238 if (!ignoreEmpty || tokenSize > 0) {
239 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
242 tokenStartPos = i + dSize;
250 if (!ignoreEmpty || tokenSize > 0) {
251 tokenSize = strSize - tokenStartPos;
252 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
256 template<class String> StringPiece prepareDelim(const String& s) {
257 return StringPiece(s);
259 inline char prepareDelim(char c) { return c; }
263 //////////////////////////////////////////////////////////////////////
265 template<class Delim, class String, class OutputType>
266 void split(const Delim& delimiter,
268 std::vector<OutputType>& out,
270 detail::internalSplit<OutputType>(
271 detail::prepareDelim(delimiter),
273 std::back_inserter(out),
277 template<class Delim, class String, class OutputType>
278 void split(const Delim& delimiter,
280 fbvector<OutputType>& out,
282 detail::internalSplit<OutputType>(
283 detail::prepareDelim(delimiter),
285 std::back_inserter(out),
289 template<class OutputValueType, class Delim, class String,
290 class OutputIterator>
291 void splitTo(const Delim& delimiter,
295 detail::internalSplit<OutputValueType>(
296 detail::prepareDelim(delimiter),
304 template <class Iterator>
305 struct IsStringContainerIterator :
306 IsSomeString<typename std::iterator_traits<Iterator>::value_type> {
309 template <class Delim, class Iterator, class String>
310 void internalJoinAppend(Delim delimiter,
314 assert(begin != end);
315 if (std::is_same<Delim, StringPiece>::value &&
316 delimSize(delimiter) == 1) {
317 internalJoinAppend(delimFront(delimiter), begin, end, output);
320 toAppend(*begin, &output);
321 while (++begin != end) {
322 toAppend(delimiter, *begin, &output);
326 template <class Delim, class Iterator, class String>
327 typename std::enable_if<IsStringContainerIterator<Iterator>::value>::type
328 internalJoin(Delim delimiter,
336 const size_t dsize = delimSize(delimiter);
338 size_t size = it->size();
339 while (++it != end) {
340 size += dsize + it->size();
342 output.reserve(size);
343 internalJoinAppend(delimiter, begin, end, output);
346 template <class Delim, class Iterator, class String>
347 typename std::enable_if<!IsStringContainerIterator<Iterator>::value>::type
348 internalJoin(Delim delimiter,
356 internalJoinAppend(delimiter, begin, end, output);
359 } // namespace detail
361 template <class Delim, class Iterator, class String>
362 void join(const Delim& delimiter,
366 detail::internalJoin(
367 detail::prepareDelim(delimiter),
373 template <class String1, class String2>
374 void backslashify(const String1& input, String2& output, bool hex_style) {
375 static const char hexValues[] = "0123456789abcdef";
377 output.reserve(3 * input.size());
378 for (unsigned char c : input) {
379 // less than space or greater than '~' are considered unprintable
380 if (c < 0x20 || c > 0x7e || c == '\\') {
381 bool hex_append = false;
382 output.push_back('\\');
386 if (c == '\r') output += 'r';
387 else if (c == '\n') output += 'n';
388 else if (c == '\t') output += 't';
389 else if (c == '\a') output += 'a';
390 else if (c == '\b') output += 'b';
391 else if (c == '\0') output += '0';
392 else if (c == '\\') output += '\\';
398 output.push_back('x');
399 output.push_back(hexValues[(c >> 4) & 0xf]);
400 output.push_back(hexValues[c & 0xf]);
408 template <class String1, class String2>
409 void humanify(const String1& input, String2& output) {
410 int numUnprintable = 0;
411 int numPrintablePrefix = 0;
412 for (unsigned char c : input) {
413 if (c < 0x20 || c > 0x7e || c == '\\') {
416 if (numUnprintable == 0) {
417 ++numPrintablePrefix;
421 // hexlify doubles a string's size; backslashify can potentially
422 // explode it by 4x. Now, the printable range of the ascii
423 // "spectrum" is around 95 out of 256 values, so a "random" binary
424 // string should be around 60% unprintable. We use a 50% hueristic
425 // here, so if a string is 60% unprintable, then we just use hex
426 // output. Otherwise we backslash.
428 // UTF8 is completely ignored; as a result, utf8 characters will
429 // likely be \x escaped (since most common glyphs fit in two bytes).
430 // This is a tradeoff of complexity/speed instead of a convenience
431 // that likely would rarely matter. Moreover, this function is more
432 // about displaying underlying bytes, not about displaying glyphs
434 if (numUnprintable == 0) {
436 } else if (5 * numUnprintable >= 3 * input.size()) {
437 // However! If we have a "meaningful" prefix of printable
438 // characters, say 20% of the string, we backslashify under the
439 // assumption viewing the prefix as ascii is worth blowing the
440 // output size up a bit.
441 if (5 * numPrintablePrefix >= input.size()) {
442 backslashify(input, output);
445 hexlify(input, output, true /* append output */);
448 backslashify(input, output);
452 template<class InputString, class OutputString>
453 bool hexlify(const InputString& input, OutputString& output,
454 bool append_output) {
455 if (!append_output) output.clear();
457 static char hexValues[] = "0123456789abcdef";
458 int j = output.size();
459 output.resize(2 * input.size() + output.size());
460 for (int i = 0; i < input.size(); ++i) {
462 output[j++] = hexValues[(ch >> 4) & 0xf];
463 output[j++] = hexValues[ch & 0xf];
468 template<class InputString, class OutputString>
469 bool unhexlify(const InputString& input, OutputString& output) {
470 if (input.size() % 2 != 0) {
473 output.resize(input.size() / 2);
475 auto unhex = [](char c) -> int {
476 return c >= '0' && c <= '9' ? c - '0' :
477 c >= 'A' && c <= 'F' ? c - 'A' + 10 :
478 c >= 'a' && c <= 'f' ? c - 'a' + 10 :
482 for (int i = 0; i < input.size(); i += 2) {
483 int highBits = unhex(input[i]);
484 int lowBits = unhex(input[i + 1]);
485 if (highBits < 0 || lowBits < 0) {
488 output[j++] = (highBits << 4) + lowBits;
495 * Hex-dump at most 16 bytes starting at offset from a memory area of size
496 * bytes. Return the number of bytes actually dumped.
498 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
500 } // namespace detail
502 template <class OutIt>
503 void hexDump(const void* ptr, size_t size, OutIt out) {
506 while (offset < size) {
507 offset += detail::hexDumpLine(ptr, offset, size, line);
514 #endif /* FOLLY_STRING_INL_H_ */