2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/Unicode.h>
18 #include <folly/Conv.h>
22 //////////////////////////////////////////////////////////////////////
/**
 * Encode a single code point as a UTF-8 byte sequence.
 *
 * @param cp  The code point to encode.
 * @return    A string of 1 to 4 bytes holding the UTF-8 encoding of cp, or an
 *            empty string when cp is greater than 0x10FFFF.
 *
 * No validation beyond the upper bound is performed: values in the surrogate
 * range (0xD800-0xDFFF) are encoded as ordinary 3-byte sequences.
 */
std::string codePointToUtf8(char32_t cp) {
  std::string result;

  // Based on description from http://en.wikipedia.org/wiki/UTF-8.
  //
  // Every byte is assembled as an unsigned value first and converted to char
  // exactly once, so no branch depends on an implicit int -> char narrowing.
  if (cp <= 0x7F) {
    // 0xxxxxxx
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (cp >> 6));
  } else if (cp <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    result.resize(3);
    result[2] = static_cast<char>(0x80 | (0x3f & cp));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[0] = static_cast<char>(0xE0 | (cp >> 12));
  } else if (cp <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (cp >> 18));
  }
  // Anything above 0x10FFFF falls through and yields an empty string.

  return result;
}
53 char32_t utf8ToCodePoint(
54 const unsigned char*& p,
55 const unsigned char* const e,
57 /* The following encodings are valid, except for the 5 and 6 byte
61 * 1110xxxx 10xxxxxx 10xxxxxx
62 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
63 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
64 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
67 auto skip = [&] { ++p; return U'\ufffd'; };
70 if (skipOnError) return skip();
71 throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
74 unsigned char fst = *p;
80 static const uint32_t bitMask[] = {
87 // upper control bits are masked out later
90 if ((fst & 0xC0) != 0xC0) {
91 if (skipOnError) return skip();
92 throw std::runtime_error(to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
97 for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
98 unsigned char tmp = p[i];
100 if ((tmp & 0xC0) != 0x80) {
101 if (skipOnError) return skip();
102 throw std::runtime_error(
103 to<std::string>("folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
106 d = (d << 6) | (tmp & 0x3F);
112 // overlong, could have been encoded with i bytes
113 if ((d & ~bitMask[i - 1]) == 0) {
114 if (skipOnError) return skip();
115 throw std::runtime_error(
116 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
119 // check for surrogates only needed for 3 bytes
121 if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
122 if (skipOnError) return skip();
123 throw std::runtime_error(
124 to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
133 if (skipOnError) return skip();
134 throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
137 //////////////////////////////////////////////////////////////////////