2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef FOLLY_GEN_STRING_H_
18 #error This file may only be included from folly/gen/String.h
21 #include <folly/Conv.h>
22 #include <folly/Portability.h>
23 #include <folly/String.h>
30 * Finds the first occurrence of delimiter in "in", advances "in" past the
31 * delimiter. Populates "prefix" with the consumed bytes, including the
34 * Returns the number of trailing bytes of "prefix" that make up the
35 * delimiter, or 0 if the delimiter was not found.
// Overload of splitPrefix that searches `in` for `delimiter` using
// StringPiece::find and, on a hit, points `prefix` at the leading bytes of
// `in`.
37 inline size_t splitPrefix(StringPiece& in,
// `found` is the offset reported by find(), or StringPiece::npos on a miss.
40 size_t found = in.find(delimiter);
41 if (found != StringPiece::npos) {
// `prefix` is set to the range that starts at in.data() and ends at
// in.data() + found.
43 prefix.assign(in.data(), in.data() + found);
52 * As above, but supports multibyte delimiters.
// Overload of splitPrefix that takes the delimiter as a StringPiece, so the
// delimiter may be more than one byte long.
54 inline size_t splitPrefix(StringPiece& in,
56 StringPiece delimiter) {
57 auto found = in.find(delimiter);
58 if (found != StringPiece::npos) {
// Step past the delimiter so that the range assigned to `prefix` ends just
// after it, i.e. `prefix` includes the delimiter bytes.
59 found += delimiter.size();
60 prefix.assign(in.data(), in.data() + found);
// The value returned here is the delimiter's length in bytes.
62 return delimiter.size();
69 * As above, but splits by any of the EOL terms: \r, \n, or \r\n.
// Overload of splitPrefix that looks for the end-of-line characters '\r' and
// '\n' rather than a caller-supplied delimiter.
71 inline size_t splitPrefix(StringPiece& in,
// kCRLF is used both as the character set given to find_first_of() and as the
// two-byte "\r\n" sequence given to removePrefix(); kLenCRLF is its length.
74 const auto kCRLF = "\r\n";
75 const size_t kLenCRLF = 2;
// Offset of the first '\r' or '\n' in `in`, if there is one.
77 auto p = in.find_first_of(kCRLF);
// NOTE(review): the "not found" sentinel is spelled std::string::npos here but
// StringPiece::npos in the other splitPrefix overloads -- confirm both have
// the same value.
78 if (p != std::string::npos) {
// Remember where `in` started: `in` is advanced below and `prefix` is built
// from this saved pointer.
79 const auto in_start = in.data();
82 // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time.
83 if (in.removePrefix(kCRLF)) {
86 in.advance(delim_len);
// `prefix` spans from the saved start up to the current start of `in`.
88 prefix.assign(in_start, in.data());
// Reinterprets a pointer to unsigned bytes as a pointer to char. Only the
// pointer type changes; no data is copied or converted.
95 inline const char* ch(const unsigned char* p) {
96 return reinterpret_cast<const char*>(p);
99 // Chop s into pieces of at most maxLength, feed them to cb
// `cb` is called with a StringPiece and its result is tested for truthiness.
100 template <class Callback>
101 bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) {
// Start from everything left in `s`, then cap the piece at maxLength bytes.
103 auto num_to_add = s.size();
105 num_to_add = std::min<uint64_t>(num_to_add, maxLength);
// Hand the leading num_to_add bytes of `s` to the callback.
107 if (!cb(StringPiece(s.begin(), num_to_add))) {
// Drop those bytes from the front of `s`.
110 s.advance(num_to_add);
115 // Consumes all of buffer, plus n chars from s.
116 template <class Callback>
117 bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) {
// Copy the first n bytes of `s` to the writable tail of `buf`.
// NOTE(review): this needs at least n bytes of tailroom in `buf` -- confirm
// that the space is reserved before this point.
119 memcpy(buf.writableTail(), s.data(), n);
// Pass the buffer's contents (data(), length()) to the callback as a single
// StringPiece.
122 if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) {
129 } // namespace detail
// Hands whatever is currently held in buffer_ to pieceCb_ as one StringPiece.
131 template <class Callback>
132 bool StreamSplitter<Callback>::flush() {
// Invariant: when maxLength_ is nonzero, the buffer holds fewer than
// maxLength_ bytes.
133 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
134 if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) {
137 // We are ready to handle another stream now.
// Consumes one chunk of input. Delimited pieces are forwarded to pieceCb_
// (via the detail:: helpers or directly), and any trailing bytes that do not
// yet form a complete piece are copied into buffer_.
142 template <class Callback>
143 bool StreamSplitter<Callback>::operator()(StringPiece in) {
145 // NB This code assumes a 1-byte delimiter. It's not too hard to support
146 // multibyte delimiters, just remember that maxLength_ chunks can end up
147 // falling in the middle of a delimiter.
// splitPrefix returns a size_t; storing it in a bool keeps only whether it
// was nonzero.
148 bool found = detail::splitPrefix(in, prefix, delimiter_);
// Case 1: bytes are already buffered from an earlier call.
149 if (buffer_.length() != 0) {
151 uint64_t num_to_add = prefix.size();
153 CHECK(buffer_.length() < maxLength_);
154 // Consume as much of prefix as possible without exceeding maxLength_
155 num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
158 // Append part of the prefix to the buffer, and send it to the callback
159 if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
// Whatever is left of `prefix` is sent in chunks of at most maxLength_.
163 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
// Search again so that found/in/prefix describe the next piece.
167 found = detail::splitPrefix(in, prefix, delimiter_);
169 // - we consumed all of buffer_ and all of the first prefix.
170 // - found, in, and prefix reflect the second delimiter_ search
// Case 2: a length limit is set and buffer_ plus `in` reaches it.
171 } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
172 // Send all of buffer_, plus a bit of in, to the callback
// The byte count taken from `in` is exactly what tops the buffer up to
// maxLength_.
173 if (!detail::consumeBufferPlus(
174 pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
178 // - we consumed all of buffer, and the minimal # of bytes from in
180 } // Otherwise: found is false & we cannot invoke the callback this turn
182 // Post-condition: buffer_ is nonempty only if found is false **and**
183 // len(buffer + in) < maxLength_.
185 // Send lines to callback directly from input (no buffer)
186 while (found) { // Buffer guaranteed to be empty
187 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
190 found = detail::splitPrefix(in, prefix, delimiter_);
193 // No more delimiters left; consume 'in' until it is shorter than maxLength_
// NOTE(review): with maxLength_ == 0 this condition is always true -- confirm
// the loop is only reached when maxLength_ is nonzero.
195 while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
196 if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
199 in.advance(maxLength_);
203 if (!in.empty()) { // Buffer may be nonempty
204 // Incomplete line left, append to buffer
// Reserve tailroom for the leftover bytes, copy them in, then extend the
// buffer's length to cover them.
205 buffer_.reserve(0, in.size());
206 memcpy(buffer_.writableTail(), in.data(), in.size());
207 buffer_.append(in.size());
// Same invariant that flush() checks on entry.
209 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
// Operator that re-splits the pieces produced by an upstream source on a
// single-character delimiter. keepDelimiter decides whether the delimiter is
// left on the end of each piece passed downstream.
215 class StringResplitter : public Operator<StringResplitter> {
220 explicit StringResplitter(char delimiter, bool keepDelimiter = false)
221 : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {}
// Generator produced by compose(); it owns the upstream source plus the
// splitting settings.
223 template <class Source>
224 class Generator : public GenImpl<StringPiece, Generator<Source>> {
230 Generator(Source source, char delimiter, bool keepDelimiter)
231 : source_(std::move(source)),
232 delimiter_(delimiter),
233 keepDelimiter_(keepDelimiter) {}
// Runs the upstream source through a stream splitter and then flushes it.
235 template <class Body>
236 bool apply(Body&& body) const {
// The lambda receives each piece found by the splitter.
238 streamSplitter(this->delimiter_, [this, &body](StringPiece s) {
239 // The stream ended with a delimiter; our contract is to swallow
240 // the final empty piece.
// NOTE(review): s.back() needs a non-empty piece -- confirm an emptiness
// check precedes this line.
244 if (s.back() != this->delimiter_) {
247 if (!keepDelimiter_) {
248 s.pop_back(); // Remove the 1-character delimiter
// Feed everything the upstream source yields into the splitter.
252 if (!source_.apply(splitter)) {
// Emit whatever the splitter still has buffered.
255 return splitter.flush();
// This generator is infinite exactly when its source is.
258 static constexpr bool infinite = Source::infinite;
// compose() for an rvalue source: the source is moved into the Generator.
261 template <class Source, class Value, class Gen = Generator<Source>>
262 Gen compose(GenImpl<Value, Source>&& source) const {
263 return Gen(std::move(source.self()), delimiter_, keepDelimiter_);
// compose() for a const source: the source is copied into the Generator.
266 template <class Source, class Value, class Gen = Generator<Source>>
267 Gen compose(const GenImpl<Value, Source>& source) const {
268 return Gen(source.self(), delimiter_, keepDelimiter_);
// Generator source over a StringPiece that is walked piece by piece with
// splitPrefix, using a delimiter of type DelimiterType (char by default).
272 template <class DelimiterType = char>
273 class SplitStringSource
274 : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> {
276 DelimiterType delimiter_;
278 SplitStringSource(const StringPiece source,
279 DelimiterType delimiter)
281 , delimiter_(std::move(delimiter)) { }
283 template <class Body>
284 bool apply(Body&& body) const {
// Work on a local copy of the view so the const member is left untouched.
285 StringPiece rest(source_);
// Loop for as long as splitPrefix reports a nonzero delimiter length.
287 while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) {
288 prefix.subtract(delim_len); // Remove the delimiter
303 * Unsplit - For joining tokens from a generator into a string. This is
304 * the inverse of `split` above.
306 * This type is primarily used through the 'unsplit' function.
// Operator whose compose() returns an Output. The work is delegated to an
// UnsplitBuffer that is given the address of `outputBuffer`.
308 template <class Delimiter, class Output>
309 class Unsplit : public Operator<Unsplit<Delimiter, Output>> {
310 Delimiter delimiter_;
312 explicit Unsplit(const Delimiter& delimiter)
313 : delimiter_(delimiter) {
316 template <class Source, class Value>
317 Output compose(const GenImpl<Value, Source>& source) const {
// Build an UnsplitBuffer over outputBuffer with the same delimiter and run it
// on the source.
319 UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer);
320 unsplitter.compose(source);
326 * UnsplitBuffer - For joining tokens from a generator into a string,
327 * and inserting them into a custom buffer.
329 * This type is primarily used through the 'unsplit' function.
// Operator that appends the values of a source to a caller-supplied output
// buffer using toAppend, with the delimiter appended ahead of a value in one
// of the two toAppend calls.
331 template <class Delimiter, class OutputBuffer>
332 class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> {
333 Delimiter delimiter_;
// Held as a raw pointer and written through in compose().
334 OutputBuffer* outputBuffer_;
336 UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer)
337 : delimiter_(delimiter)
338 , outputBuffer_(outputBuffer) {
342 template <class Source, class Value>
343 void compose(const GenImpl<Value, Source>& source) const {
344 // If the output buffer is empty, we skip inserting the delimiter for the
346 bool skipDelim = outputBuffer_->empty();
// The lambda is applied to every value the source yields.
347 source | [&](Value v) {
// Appends the value on its own.
350 toAppend(std::forward<Value>(v), outputBuffer_);
// Appends the delimiter followed by the value.
352 toAppend(delimiter_, std::forward<Value>(v), outputBuffer_);
360 * Hack for static for-like constructs
// Returns its argument unchanged. The second, defaulted template parameter is
// not used in the body, so callers may supply any type there.
362 template <class Target, class = void>
363 inline Target passthrough(Target target) { return target; }
367 // Clang isn't happy with eatField() hack below.
// Turns off the -Wreturn-stack-address diagnostic from this point onward.
// NOTE(review): confirm a matching diagnostic push/pop surrounds this so the
// suppression does not leak into files that include this header.
368 #pragma GCC diagnostic ignored "-Wreturn-stack-address"
372 * SplitTo - For splitting a record and immediately converting it to a
373 * target tuple type. Primarily used through the 'eachToTuple' helper, like so:
376 * = split("1:a 2:b", ' ')
377 * | eachToTuple<int, string>()
378 * | as<vector<tuple<int, string>>>();
// Function object that splits a line on delimiter_ into one field per entry
// of Targets, converts each field with To<Target>, and constructs a
// TargetContainer from the converted values.
381 template <class TargetContainer, class Delimiter, class... Targets>
383 Delimiter delimiter_;
385 explicit SplitTo(Delimiter delimiter)
386 : delimiter_(delimiter) {}
// Throws std::runtime_error("field count mismatch") when split() reports
// failure.
388 TargetContainer operator()(StringPiece line) const {
// One StringPiece slot per target type.
390 StringPiece fields[sizeof...(Targets)];
391 // HACK(tjackson): Used for referencing fields[] corresponding to variadic
392 // template parameters.
// Each call returns a reference to the next slot and bumps the index `i`.
393 auto eatField = [&]() -> StringPiece& { return fields[i++]; };
// The pack expansion calls eatField() once per target, so split() receives
// one output reference per field.
394 if (!split(delimiter_,
396 detail::passthrough<StringPiece&, Targets>(eatField())...)) {
397 throw std::runtime_error("field count mismatch");
// NOTE(review): eatField() is expanded as parenthesised call arguments both
// here and in the split() call above, and C++ leaves the evaluation order of
// function arguments unspecified. The field-to-target mapping therefore
// depends on both expansions being evaluated in the same order -- confirm on
// the supported compilers.
400 return TargetContainer(To<Targets>()(eatField())...);
406 } // namespace detail