2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef FOLLY_GEN_STRING_H_
18 #error This file may only be included from folly/gen/String.h
21 #include <folly/Conv.h>
22 #include <folly/String.h>
29 * Finds the first occurrence of delimiter in "in", advances "in" past the
30 * delimiter. Populates "prefix" with the consumed bytes, including the delimiter.
33 * Returns the number of trailing bytes of "prefix" that make up the
34 * delimiter, or 0 if the delimiter was not found.
// First splitPrefix overload (presumably the single-byte-delimiter form,
// since the next overload is described as adding multibyte support).
// NOTE(review): this excerpt omits lines of the definition (the remaining
// parameters and the tail of the body); the notes below cover only the
// statements that are visible.
36 inline size_t splitPrefix(StringPiece& in,
// Offset of the first match of `delimiter` inside `in`; npos means no match.
39 size_t found = in.find(delimiter);
40 if (found != StringPiece::npos) {
// Assign to `prefix` the byte range [in.data(), in.data() + found).
42 prefix.assign(in.data(), in.data() + found);
51 * As above, but supports multibyte delimiters.
// Overload taking the delimiter as a StringPiece, so it may span several bytes.
// NOTE(review): some lines of this definition (a parameter and the tail of
// the body) are not part of this excerpt.
53 inline size_t splitPrefix(StringPiece& in,
55 StringPiece delimiter) {
// Offset at which `delimiter` starts inside `in`; npos means no match.
56 auto found = in.find(delimiter);
57 if (found != StringPiece::npos) {
// Move the offset past the delimiter so the range below includes it.
58 found += delimiter.size();
59 prefix.assign(in.data(), in.data() + found);
// On a match, report how many trailing bytes of `prefix` are the delimiter.
61 return delimiter.size();
68 * As above, but splits by any of the EOL terms: \r, \n, or \r\n.
// Overload that splits on end-of-line markers.
// NOTE(review): several lines of this definition are not part of this
// excerpt (remaining parameters, the declaration of `delim_len`, the
// returns); the notes below cover only the statements that are visible.
70 inline size_t splitPrefix(StringPiece& in,
// kCRLF is used twice below: as the character set handed to find_first_of,
// and as the literal two-byte sequence handed to removePrefix.
73 const auto kCRLF = "\r\n";
74 const size_t kLenCRLF = 2;
76 auto p = in.find_first_of(kCRLF);
77 if (p != std::string::npos) {
// Remember where `in` started; `prefix` is later assigned the range from
// here up to wherever `in` ends up pointing.
78 const auto in_start = in.data();
81 // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time.
82 if (in.removePrefix(kCRLF)) {
85 in.advance(delim_len);
// `prefix` = everything between the saved start and the current `in.data()`.
87 prefix.assign(in_start, in.data());
// Casts a pointer to unsigned bytes to a pointer to `char`. Used in this
// file to build StringPieces from the result of an IOBuf's data() call.
94 inline const char* ch(const unsigned char* p) {
95 return reinterpret_cast<const char*>(p);
98 // Chop s into pieces of at most maxLength, feed them to cb
// `s` is taken by reference and is advanced past every piece handed to cb.
// NOTE(review): the loop header, the condition guarding the clamp, and the
// return statements are not part of this excerpt.
99 template <class Callback>
100 bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) {
// Start from "everything that is left" ...
102 auto num_to_add = s.size();
// ... and clamp it to maxLength.
104 num_to_add = std::min<uint64_t>(num_to_add, maxLength);
// Hand the piece to the callback and test for a false result
// (presumably a request to stop - the branch body is not visible here).
106 if (!cb(StringPiece(s.begin(), num_to_add))) {
109 s.advance(num_to_add);
114 // Consumes all of buffer, plus n chars from s.
// NOTE(review): only part of this definition is in this excerpt; the calls
// that make room in `buf`, record the copied bytes, advance `s`, and reset
// `buf` are not visible.
115 template <class Callback>
116 bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) {
// Copy the first n bytes of `s` to the writable tail of `buf`.
// NOTE(review): assumes n <= s.size() and at least n bytes of tailroom in
// `buf` - confirm against the lines missing from this excerpt.
118 memcpy(buf.writableTail(), s.data(), n);
// Present the buffer's bytes to the callback as a single StringPiece.
121 if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) {
128 } // namespace detail
// Hands whatever is currently held in buffer_ to pieceCb_ as one StringPiece.
// NOTE(review): the branch body, the buffer reset and the return statements
// are not part of this excerpt.
130 template <class Callback>
131 bool StreamSplitter<Callback>::flush() {
// Invariant: when a maximum piece length is set (maxLength_ != 0), the
// buffered bytes are always fewer than that maximum.
132 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
133 if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) {
136 // We are ready to handle another stream now.
// Feeds one chunk of input (`in`) through the splitter: pieces are cut at
// delimiter_ via detail::splitPrefix and handed to pieceCb_, and the tail of
// `in` that cannot be emitted yet is copied into buffer_.
// NOTE(review): this excerpt skips lines of the definition (for example the
// declaration of `prefix`, some enclosing conditions, and the bodies of the
// early-exit branches), so the notes below describe only what is visible.
141 template <class Callback>
142 bool StreamSplitter<Callback>::operator()(StringPiece in) {
144 // NB This code assumes a 1-byte delimiter. It's not too hard to support
145 // multibyte delimiters, just remember that maxLength_ chunks can end up
146 // falling in the middle of a delimiter.
// splitPrefix returns a size_t; storing it in a bool keeps only "found or not".
147 bool found = detail::splitPrefix(in, prefix, delimiter_);
// buffer_ already holds bytes (the only visible writer is the append at the
// end of this function).
148 if (buffer_.length() != 0) {
150 uint64_t num_to_add = prefix.size();
// NOTE(review): this CHECK only makes sense for maxLength_ != 0; the guard
// that presumably encloses it is not visible in this excerpt - confirm.
152 CHECK(buffer_.length() < maxLength_);
153 // Consume as much of prefix as possible without exceeding maxLength_
154 num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
157 // Append part of the prefix to the buffer, and send it to the callback
158 if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
// Whatever remains of `prefix` is emitted in pieces of at most maxLength_.
162 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
166 found = detail::splitPrefix(in, prefix, delimiter_);
168 // - we consumed all of buffer_ and all of the first prefix.
169 // - found, in, and prefix reflect the second delimiter_ search
170 } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
171 // Send all of buffer_, plus a bit of in, to the callback
172 if (!detail::consumeBufferPlus(
173 pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
177 // - we consumed all of buffer, and the minimal # of bytes from in
179 } // Otherwise: found is false & we cannot invoke the callback this turn
181 // Post-condition: buffer_ is nonempty only if found is false **and**
182 // len(buffer + in) < maxLength_.
184 // Send lines to callback directly from input (no buffer)
185 while (found) { // Buffer guaranteed to be empty
186 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
189 found = detail::splitPrefix(in, prefix, delimiter_);
192 // No more delimiters left; consume 'in' until it is shorter than maxLength_
// NOTE(review): for maxLength_ == 0 the condition below is always true, so
// this loop must sit behind a maxLength_ != 0 guard; that guard is not
// visible in this excerpt - confirm.
194 while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
195 if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
198 in.advance(maxLength_);
202 if (!in.empty()) { // Buffer may be nonempty
203 // Incomplete line left, append to buffer
// Make sure the tail has room for in.size() more bytes, copy them in, then
// extend the buffer's length to cover the bytes just copied.
204 buffer_.reserve(0, in.size());
205 memcpy(buffer_.writableTail(), in.data(), in.size());
206 buffer_.append(in.size());
// Same invariant that flush() checks on entry.
208 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
// Operator that re-splits a source of string pieces on a single-character
// delimiter. compose() wraps the source in the nested Generator, which
// yields StringPiece values.
// NOTE(review): this excerpt skips lines of the class (member declarations,
// access specifiers, parts of the lambda body and closing braces).
214 class StringResplitter : public Operator<StringResplitter> {
217 explicit StringResplitter(char delimiter) : delimiter_(delimiter) { }
219 template <class Source>
220 class Generator : public GenImpl<StringPiece, Generator<Source>> {
224 Generator(Source source, char delimiter)
225 : source_(std::move(source)), delimiter_(delimiter) { }
// Drives the source through a stream splitter and forwards pieces to `body`.
227 template <class Body>
228 bool apply(Body&& body) const {
// Build the splitter with a callback that post-processes each piece.
// NOTE(review): the result is presumably bound to the `splitter` used
// below; that declaration is not visible in this excerpt.
230 streamSplitter(this->delimiter_, [this, &body](StringPiece s) {
231 // The stream ended with a delimiter; our contract is to swallow
232 // the final empty piece.
// A piece that does not end in the delimiter is treated separately from one
// that does (the branch body is not visible here).
236 if (s.back() != this->delimiter_) {
239 s.pop_back(); // Remove the 1-character delimiter
// Feed every value of the source into the splitter.
242 if (!source_.apply(splitter)) {
// Once the source is exhausted, emit whatever the splitter still buffers.
245 return splitter.flush();
// The generator reports itself infinite exactly when its source does.
248 static constexpr bool infinite = Source::infinite;
// Rvalue source: moved into the new Generator.
251 template<class Source,
253 class Gen = Generator<Source>>
254 Gen compose(GenImpl<Value, Source>&& source) const {
255 return Gen(std::move(source.self()), delimiter_);
// Const lvalue source: copied into the new Generator.
258 template<class Source,
260 class Gen = Generator<Source>>
261 Gen compose(const GenImpl<Value, Source>& source) const {
262 return Gen(source.self(), delimiter_);
// Generator of StringPiece values obtained by splitting a source string on
// a delimiter of type DelimiterType (char by default).
// NOTE(review): this excerpt skips lines of the class (the `source_` member,
// the declaration of `prefix`, the calls into `body`, the handling of the
// text after the last delimiter, and closing braces).
266 template <class DelimiterType = char>
267 class SplitStringSource
268 : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> {
270 DelimiterType delimiter_;
272 SplitStringSource(const StringPiece source,
273 DelimiterType delimiter)
275 , delimiter_(std::move(delimiter)) { }
277 template <class Body>
278 bool apply(Body&& body) const {
// Work on a local copy so the const member is left untouched.
279 StringPiece rest(source_);
// Each iteration peels one delimiter-terminated piece off the front of
// `rest`; splitPrefix yields 0 (false) once no delimiter is left.
281 while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) {
282 prefix.subtract(delim_len); // Remove the delimiter
297 * Unsplit - For joining tokens from a generator into a string. This is
298 * the inverse of `split` above.
300 * This type is primarily used through the 'unsplit' function.
// Operator that joins the values of a source into a single Output, using
// the stored delimiter.
// NOTE(review): this excerpt skips lines of the class (remaining template
// parameters, the declaration of `outputBuffer`, the return statement and
// closing braces).
302 template<class Delimiter,
304 class Unsplit : public Operator<Unsplit<Delimiter, Output>> {
305 Delimiter delimiter_;
307 explicit Unsplit(const Delimiter& delimiter)
308 : delimiter_(delimiter) {
311 template<class Source,
313 Output compose(const GenImpl<Value, Source>& source) const {
// Delegate the joining to an UnsplitBuffer that writes into `outputBuffer`.
315 UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer);
316 unsplitter.compose(source);
322 * UnsplitBuffer - For joining tokens from a generator into a string,
323 * and inserting them into a custom buffer.
325 * This type is primarily used through the 'unsplit' function.
// Operator that appends the values of a source, separated by the stored
// delimiter, to a caller-supplied output buffer. The buffer is held by raw
// pointer; this class never allocates or frees it in the visible code.
// NOTE(review): this excerpt skips lines of the class (remaining template
// parameters, the constructor body, the branch that selects between the two
// toAppend calls, and closing braces).
327 template<class Delimiter,
329 class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> {
330 Delimiter delimiter_;
331 OutputBuffer* outputBuffer_;
333 UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer)
334 : delimiter_(delimiter)
335 , outputBuffer_(outputBuffer) {
339 template<class Source,
341 void compose(const GenImpl<Value, Source>& source) const {
342 // If the output buffer is empty, we skip inserting the delimiter for the
// first element.
344 bool skipDelim = outputBuffer_->empty();
// Visit every value of the source with the lambda below.
345 source | [&](Value v) {
// Variant without a delimiter: append the value only.
348 toAppend(std::forward<Value>(v), outputBuffer_);
// Variant with a delimiter: append the delimiter, then the value.
350 toAppend(delimiter_, std::forward<Value>(v), outputBuffer_);
358 * Hack for static for-like constructs
// Returns its argument unchanged. The second, unnamed template parameter is
// not used by the body; the call site in this file fills it with an element
// of a parameter pack so the call expression can be pack-expanded, giving
// one call per pack element.
360 template<class Target, class=void>
361 inline Target passthrough(Target target) { return target; }
363 #pragma GCC diagnostic push
365 // Clang isn't happy with eatField() hack below.
366 #pragma GCC diagnostic ignored "-Wreturn-stack-address"
370 * ParseToTuple - For splitting a record and immediately converting it to a
371 * target tuple type. Primarily used through the 'eachToTuple' helper, like so:
374 * = split("1:a 2:b", ' ')
375 * | eachToTuple<int, string>()
376 * | as<vector<tuple<int, string>>>();
// Functor that splits one line on the stored delimiter into exactly
// sizeof...(Targets) fields and builds a TargetContainer from the fields,
// converting each one with To<Target>. Throws std::runtime_error when
// split() reports failure.
// NOTE(review): this excerpt skips lines of the class (remaining template
// parameters, the class head, the declaration and any reset of `i`, one
// argument of the split() call, and closing braces).
379 template<class TargetContainer,
383 Delimiter delimiter_;
385 explicit SplitTo(Delimiter delimiter)
386 : delimiter_(delimiter) {}
388 TargetContainer operator()(StringPiece line) const {
// One output slot per target type.
390 StringPiece fields[sizeof...(Targets)];
391 // HACK(tjackson): Used for referencing fields[] corresponding to variadic
392 // template parameters.
// Every call returns a reference to the next slot and bumps the index `i`.
393 auto eatField = [&]() -> StringPiece& { return fields[i++]; };
// First pack expansion: one eatField() call per target, each slot passed
// to split() by reference as an output.
394 if (!split(delimiter_,
396 detail::passthrough<StringPiece&, Targets>(eatField())...)) {
397 throw std::runtime_error("field count mismatch");
// Second pack expansion: one eatField() call per target again, each slot
// converted with To<Target>.
// NOTE(review): `i` has to be back at 0 before this line (the reset is not
// visible here), and both expansions are function-call argument lists, whose
// evaluation order the language leaves unspecified - the pairing of slots
// with targets relies on both being evaluated in the same order; confirm.
400 return TargetContainer(To<Targets>()(eatField())...);
404 #pragma GCC diagnostic pop
406 } // namespace detail