URI parsing in folly
author Tudor Bosman <tudorb@fb.com>
Wed, 10 Apr 2013 00:34:52 +0000 (17:34 -0700)
committer Jordan DeLong <jdelong@fb.com>
Sun, 21 Apr 2013 20:21:31 +0000 (13:21 -0700)
Summary: Cleaned up from common/strings/URL.h, and it should be URI, not URL.

Test Plan: tests added

Reviewed By: chip@fb.com

FB internal diff: D768880

folly/String-inl.h
folly/String.h
folly/Uri-inl.h [new file with mode: 0644]
folly/Uri.cpp [new file with mode: 0644]
folly/Uri.h [new file with mode: 0644]
folly/build/generate_escape_tables.py
folly/test/StringTest.cpp
folly/test/UriTest.cpp [new file with mode: 0644]

index b9f7c23700580fc875ec3ed8ae77e9baa80b6c22..8ae2d20af86a37a9172e1f9c7796c934f01b465a 100644 (file)
@@ -149,6 +149,95 @@ void cUnescape(StringPiece str, String& out, bool strict) {
   out.append(&*last, p - last);
 }
 
+namespace detail {
+// Map from character code (as an unsigned char) to escape class:
+// 0 = pass through
+// 1 = unused
+// 2 = pass through in PATH mode
+// 3 = space, replace with '+' in QUERY mode
+// 4 = percent-encode
+//
+// uriEscape() copies a character verbatim when its class is <= the numeric
+// value of the requested UriEscapeMode (ALL = 0, QUERY = 1, PATH = 2), so
+// the class numbers and the enum values have to be kept in sync.
+extern const unsigned char uriEscapeTable[];
+}  // namespace detail
+
+template <class String>
+void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
+  static const char hexValues[] = "0123456789abcdef";
+  char esc[3];
+  esc[0] = '%';
+  // Preallocate assuming that 25% of the input string will be escaped
+  out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
+  auto p = str.begin();
+  auto last = p;  // last regular character
+  // We advance over runs of passthrough characters and copy them in one go;
+  // this is faster than calling push_back repeatedly.
+  unsigned char minEncode = static_cast<unsigned char>(mode);
+  while (p != str.end()) {
+    char c = *p;
+    unsigned char v = static_cast<unsigned char>(c);
+    unsigned char discriminator = detail::uriEscapeTable[v];
+    if (LIKELY(discriminator <= minEncode)) {
+      ++p;
+    } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
+      out.append(&*last, p - last);
+      out.push_back('+');
+      ++p;
+      last = p;
+    } else {
+      out.append(&*last, p - last);
+      esc[1] = hexValues[v >> 4];
+      esc[2] = hexValues[v & 0x0f];
+      out.append(esc, 3);
+      ++p;
+      last = p;
+    }
+  }
+  out.append(&*last, p - last);
+}
+
+template <class String>
+void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
+  out.reserve(out.size() + str.size());
+  auto p = str.begin();
+  auto last = p;
+  // We advance over runs of passthrough characters and copy them in one go;
+  // this is faster than calling push_back repeatedly.
+  while (p != str.end()) {
+    char c = *p;
+    unsigned char v = static_cast<unsigned char>(v);
+    switch (c) {
+    case '%':
+      {
+        if (UNLIKELY(std::distance(p, str.end()) < 3)) {
+          throw std::invalid_argument("incomplete percent encode sequence");
+        }
+        auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
+        auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
+        if (UNLIKELY(h1 == 16 || h2 == 16)) {
+          throw std::invalid_argument("invalid percent encode sequence");
+        }
+        out.append(&*last, p - last);
+        out.push_back((h1 << 4) | h2);
+        p += 3;
+        last = p;
+        break;
+      }
+    case '+':
+      if (mode == UriEscapeMode::QUERY) {
+        out.append(&*last, p - last);
+        out.push_back(' ');
+        ++p;
+        last = p;
+        break;
+      }
+      // else fallthrough
+    default:
+      ++p;
+      break;
+    }
+  }
+  out.append(&*last, p - last);
+}
+
 namespace detail {
 
 /*
index c843eed8b1b5d4768570fc9dd0aa528092e2858e..2edd0da9b666eb3448cbffded1f2299cd944cf7f 100644 (file)
@@ -112,6 +112,56 @@ String cUnescape(StringPiece str, bool strict = true) {
   return out;
 }
 
+/**
+ * Selects which characters uriEscape() leaves alone and whether
+ * uriUnescape() turns '+' back into a space.
+ */
+enum class UriEscapeMode : unsigned char {
+  // The values are meaningful, see generate_escape_tables.py
+  ALL = 0,
+  QUERY = 1,
+  PATH = 2
+};
+/**
+ * URI-escape a string.  Appends the result to the output string.
+ *
+ * Alphanumeric characters and other characters marked as "unreserved" in RFC
+ * 3986 ( -_.~ ) are left unchanged.  In PATH mode, the forward slash (/) is
+ * also left unchanged.  In QUERY mode, spaces are replaced by '+'.  All other
+ * characters are percent-encoded.
+ */
+template <class String>
+void uriEscape(StringPiece str,
+               String& out,
+               UriEscapeMode mode = UriEscapeMode::ALL);
+
+/**
+ * Convenience form of uriEscape: escapes str according to mode and hands
+ * back the escaped text as a freshly built String instead of appending to
+ * a caller-supplied one.
+ */
+template <class String>
+String uriEscape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
+  String escaped;
+  uriEscape(str, escaped, mode);
+  return escaped;
+}
+
+/**
+ * URI-unescape a string.  Appends the result to the output string.
+ *
+ * In QUERY mode, '+' are replaced by space; in the other modes '+' is left
+ * as is.  %XX sequences are decoded if XX is a valid hex sequence; a '%'
+ * followed by fewer than two characters, or by characters that are not hex
+ * digits, makes us throw std::invalid_argument.
+ */
+template <class String>
+void uriUnescape(StringPiece str,
+                 String& out,
+                 UriEscapeMode mode = UriEscapeMode::ALL);
+
+/**
+ * Convenience form of uriUnescape: decodes str according to mode and hands
+ * back the decoded text as a freshly built String instead of appending to
+ * a caller-supplied one.
+ */
+template <class String>
+String uriUnescape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
+  String decoded;
+  uriUnescape(str, decoded, mode);
+  return decoded;
+}
+
 /**
  * stringPrintf is much like printf but deposits its result into a
  * string. Two signatures are supported: the first simply returns the
diff --git a/folly/Uri-inl.h b/folly/Uri-inl.h
new file mode 100644 (file)
index 0000000..71a23cb
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_URI_H_
+#error This file may only be included from folly/Uri.h
+#endif
+
+#include "folly/Conv.h"
+
+namespace folly {
+
+template <class String>
+String Uri::toString() const {
+  String str;
+  toAppend(scheme_, "://", &str);
+  if (!password_.empty()) {
+    toAppend(username_, ":", password_, "@", &str);
+  } else if (!username_.empty()) {
+    toAppend(username_, "@", &str);
+  }
+  toAppend(host_, &str);
+  if (port_ != 0) {
+    toAppend(":", port_, &str);
+  }
+  toAppend(path_, &str);
+  if (!query_.empty()) {
+    toAppend("?", query_, &str);
+  }
+  if (!fragment_.empty()) {
+    toAppend("#", fragment_, &str);
+  }
+  return str;
+}
+
+}  // namespace folly
+
diff --git a/folly/Uri.cpp b/folly/Uri.cpp
new file mode 100644 (file)
index 0000000..9ae5c71
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "folly/Uri.h"
+
+#include <ctype.h>
+#include <boost/regex.hpp>
+
+namespace folly {
+
+namespace {
+
+// Copy the text captured by group number idx of match result m into an
+// fbstring.
+fbstring submatch(const boost::cmatch& m, size_t idx) {
+  const auto& group = m[idx];
+  fbstring text(group.first, group.second);
+  return text;
+}
+
+// Lower-case every character of s in place.
+template <class String>
+void toLower(String& s) {
+  for (auto& c : s) {
+    // tolower() requires a value representable as unsigned char (or EOF);
+    // passing a negative plain char is undefined, so go through
+    // unsigned char first.
+    c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+  }
+}
+
+}  // namespace
+
+/**
+ * Parse str into scheme, optional authority (username, password, host,
+ * port), path, query and fragment.
+ *
+ * Throws std::invalid_argument if the string as a whole, or its authority
+ * part, does not have the expected shape.  The scheme is lower-cased; the
+ * other components are stored as they appear in str.  port_ stays 0 when no
+ * port digits are present.
+ */
+Uri::Uri(StringPiece str) : port_(0) {
+  static const boost::regex uriRegex(
+      "([a-zA-Z][a-zA-Z0-9+.-]*):"  // scheme:
+      "([^?#]*)"                    // authority and path
+      "(?:\\?([^#]*))?"             // ?query
+      "(?:#(.*))?");                // #fragment
+  // "//" followed by the authority (everything up to the next '/') and an
+  // optional path that starts with '/'.
+  static const boost::regex authorityAndPathRegex("//([^/]*)(/.*)?");
+
+  // First pass: split into scheme, authority+path, query and fragment.
+  boost::cmatch match;
+  if (UNLIKELY(!boost::regex_match(str.begin(), str.end(), match, uriRegex))) {
+    throw std::invalid_argument("invalid URI");
+  }
+
+  scheme_ = submatch(match, 1);
+  toLower(scheme_);
+
+  // Second pass: separate the authority from the path, if there is one.
+  StringPiece authorityAndPath(match[2].first, match[2].second);
+  boost::cmatch authorityAndPathMatch;
+  if (!boost::regex_match(authorityAndPath.begin(),
+                          authorityAndPath.end(),
+                          authorityAndPathMatch,
+                          authorityAndPathRegex)) {
+    // Does not start with //, doesn't have authority
+    path_ = authorityAndPath.fbstr();
+  } else {
+    static const boost::regex authorityRegex(
+        "(?:([^@:]*)(?::([^@]*))?@)?"  // username, password
+        "(\\[[^\\]]*\\]|[^\\[:]*)"     // host (IP-literal, dotted-IPv4, or
+                                       // named host)
+        "(?::(\\d*))?");               // port
+
+    // Third pass: break the authority into user info, host and port.
+    auto authority = authorityAndPathMatch[1];
+    boost::cmatch authorityMatch;
+    if (!boost::regex_match(authority.first,
+                            authority.second,
+                            authorityMatch,
+                            authorityRegex)) {
+      throw std::invalid_argument("invalid URI authority");
+    }
+
+    // The regex only admits digits here, but does not bound how many.
+    // NOTE(review): the exception to<uint32_t>() raises for a value that
+    // does not fit is not visible here -- confirm it is consistent with the
+    // std::invalid_argument contract documented on the constructor.
+    StringPiece port(authorityMatch[4].first, authorityMatch[4].second);
+    if (!port.empty()) {
+      port_ = to<uint32_t>(port);
+    }
+
+    username_ = submatch(authorityMatch, 1);
+    password_ = submatch(authorityMatch, 2);
+    host_ = submatch(authorityMatch, 3);
+    path_ = submatch(authorityAndPathMatch, 2);
+  }
+
+  query_ = submatch(match, 3);
+  fragment_ = submatch(match, 4);
+}
+
+}  // namespace folly
diff --git a/folly/Uri.h b/folly/Uri.h
new file mode 100644 (file)
index 0000000..8885bb9
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_URI_H_
+#define FOLLY_URI_H_
+
+#include "folly/String.h"
+
+namespace folly {
+
+/**
+ * Class representing a URI.
+ *
+ * Consider http://www.facebook.com/foo/bar?key=foo#anchor
+ *
+ * The URI is broken down into its parts: scheme ("http"), authority
+ * (ie. host and port, in most cases: "www.facebook.com"), path
+ * ("/foo/bar"), query ("key=foo") and fragment ("anchor").  The scheme is
+ * lower-cased.
+ *
+ * If this Uri represents a URL, note that, to prevent ambiguity, the component
+ * parts are NOT percent-decoded; you should do this yourself with
+ * uriUnescape() (for the authority and path) and uriUnescape(...,
+ * UriEscapeMode::QUERY) (for the query, but probably only after splitting at
+ * '&' to identify the individual parameters).
+ */
+class Uri {
+ public:
+  /**
+   * Parse a Uri from a string.  Throws std::invalid_argument on parse error.
+   */
+  explicit Uri(StringPiece str);
+
+  // Component accessors.  The delimiters that introduce a component (":",
+  // "@", "?", "#") are not part of the returned value.
+  const fbstring& scheme() const { return scheme_; }
+  const fbstring& username() const { return username_; }
+  const fbstring& password() const { return password_; }
+  // An IP-literal host keeps its surrounding brackets, e.g. "[::1]".
+  const fbstring& host() const { return host_; }
+  // 0 when the URI does not specify a port.
+  uint32_t port() const { return port_; }
+  const fbstring& path() const { return path_; }
+  const fbstring& query() const { return query_; }
+  const fbstring& fragment() const { return fragment_; }
+
+  /**
+   * Reassemble the URI from its components into a string of type String.
+   * The result always has "://" after the scheme and leaves out empty
+   * optional components, so it is not necessarily identical to the string
+   * that was parsed.
+   */
+  template <class String>
+  String toString() const;
+
+  // Shorthands for toString() with the two common string types.
+  std::string str() const { return toString<std::string>(); }
+  fbstring fbstr() const { return toString<fbstring>(); }
+
+ private:
+  fbstring scheme_;
+  fbstring username_;
+  fbstring password_;
+  fbstring host_;
+  uint32_t port_;  // 0 means "no port given"
+  fbstring path_;
+  fbstring query_;
+  fbstring fragment_;
+};
+
+}  // namespace folly
+
+#include "folly/Uri-inl.h"
+
+#endif /* FOLLY_URI_H_ */
index e84179e7aa538231981e83844fb2646776a8c20e..0d18978e470b6a808e80aa1f609716d57cdfed81 100755 (executable)
@@ -14,6 +14,7 @@ def generate(f):
     f.write("namespace folly {\n"
             "namespace detail {\n"
             "\n")
+
     f.write("extern const char cEscapeTable[] =\n")
     escapes = dict((
         ('"', '\\"'),
@@ -36,8 +37,7 @@ def generate(f):
         else:
             c = 'P'  # printable
         f.write(c)
-    f.write("\";\n"
-            "\n")
+    f.write("\";\n\n")
 
     f.write("extern const char cUnescapeTable[] =\n")
     for i in range(0, 256):
@@ -56,10 +56,9 @@ def generate(f):
             f.write("X")  # hex
         else:
             f.write("I")  # invalid
-    f.write("\";\n"
-            "\n"
-            "extern const unsigned char hexTable[] = {")
+    f.write("\";\n\n")
 
+    f.write("extern const unsigned char hexTable[] = {")
     for i in range(0, 256):
         if i % 16 == 0:
             f.write("\n  ")
@@ -71,8 +70,31 @@ def generate(f):
             f.write("{0:2d}, ".format(i - ord('A') + 10))
         else:
             f.write("16, ")
-    f.write("\n};\n"
-            "\n")
+    f.write("\n};\n\n")
+
+    # 0 = passthrough
+    # 1 = unused
+    # 2 = safe in path (/)
+    # 3 = space (replace with '+' in query)
+    # 4 = always percent-encode
+    f.write("extern const unsigned char uriEscapeTable[] = {")
+    passthrough = (
+        range(ord('0'), ord('9')) +
+        range(ord('A'), ord('Z')) +
+        range(ord('a'), ord('z')) +
+        map(ord, '-_.~'))
+    for i in range(0, 256):
+        if i % 16 == 0:
+            f.write("\n  ")
+        if i in passthrough:
+            f.write("0, ")
+        elif i == ord('/'):
+            f.write("2, ")
+        elif i == ord(' '):
+            f.write("3, ")
+        else:
+            f.write("4, ")
+    f.write("\n};\n\n")
 
     f.write("}  // namespace detail\n"
             "}  // namespace folly\n")
index 43eed510c82862d4d05c3d66d91fcd08f1e59676..8c6d6d615f2e2977cfb96390fe463a59aa696d14 100644 (file)
@@ -131,25 +131,117 @@ TEST(Escape, cUnescape) {
                std::invalid_argument);
 }
 
+// Escapes the same input in each of the three modes: the default (ALL)
+// percent-encodes ',', ' ' and '/'; PATH keeps the '/'; QUERY turns the
+// space into '+'.
+TEST(Escape, uriEscape) {
+  EXPECT_EQ("hello%2c%20%2fworld", uriEscape<std::string>("hello, /world"));
+  EXPECT_EQ("hello%2c%20/world", uriEscape<std::string>("hello, /world",
+                                                        UriEscapeMode::PATH));
+  EXPECT_EQ("hello%2c+%2fworld", uriEscape<std::string>("hello, /world",
+                                                        UriEscapeMode::QUERY));
+}
+
+// Checks decoding of %XX sequences (both hex cases), that '+' only becomes
+// a space in QUERY mode, and that truncated or non-hex sequences throw.
+TEST(Escape, uriUnescape) {
+  // Input without escapes comes back unchanged.
+  EXPECT_EQ("hello, /world", uriUnescape<std::string>("hello, /world"));
+  EXPECT_EQ("hello, /world", uriUnescape<std::string>("hello%2c%20%2fworld"));
+  // '+' is literal in the default mode...
+  EXPECT_EQ("hello,+/world", uriUnescape<std::string>("hello%2c+%2fworld"));
+  // ...and a space in QUERY mode.
+  EXPECT_EQ("hello, /world", uriUnescape<std::string>("hello%2c+%2fworld",
+                                                      UriEscapeMode::QUERY));
+  // Hex digits are accepted in either case.
+  EXPECT_EQ("hello/", uriUnescape<std::string>("hello%2f"));
+  EXPECT_EQ("hello/", uriUnescape<std::string>("hello%2F"));
+  // Truncated sequences.
+  EXPECT_THROW({uriUnescape<std::string>("hello%");},
+               std::invalid_argument);
+  EXPECT_THROW({uriUnescape<std::string>("hello%2");},
+               std::invalid_argument);
+  // 'g' is not a hex digit.
+  EXPECT_THROW({uriUnescape<std::string>("hello%2g");},
+               std::invalid_argument);
+}
+
 namespace {
-fbstring bmString;
-fbstring bmEscapedString;
-fbstring escapedString;
-fbstring unescapedString;
-const size_t kBmStringLength = 64 << 10;
-const uint32_t kPrintablePercentage = 90;
+// Expects every character of s to have a value in [32, 127].
+void expectPrintable(StringPiece s) {
+  for (char c : s) {
+    EXPECT_LE(32, c);
+    EXPECT_GE(127, c);
+  }
+}
+}  // namespace
 
-void initBenchmark() {
-  bmString.reserve(kBmStringLength);
+// Round-trips every possible two-byte string through uriEscape() and
+// uriUnescape() in the default mode: the escaped form must pass
+// expectPrintable() and unescaping it must give back the original bytes.
+TEST(Escape, uriEscapeAllCombinations) {
+  char c[3];
+  c[2] = '\0';
+  // in views the first two bytes of c, so rewriting c[0] and c[1] below
+  // changes the input without rebuilding the StringPiece.
+  StringPiece in(c, 2);
+  fbstring tmp;
+  fbstring out;
+  for (int i = 0; i < 256; ++i) {
+    c[0] = i;
+    for (int j = 0; j < 256; ++j) {
+      c[1] = j;
+      // Both functions append, so start from empty strings each time.
+      tmp.clear();
+      out.clear();
+      uriEscape(in, tmp);
+      expectPrintable(tmp);
+      uriUnescape(tmp, out);
+      EXPECT_EQ(in, out);
+    }
+  }
+}
+
+namespace {
+// True when v is the character code of an ASCII hex digit: 0-9, A-F or a-f.
+bool isHex(int v) {
+  if (v >= '0' && v <= '9') {
+    return true;
+  }
+  if (v >= 'A' && v <= 'F') {
+    return true;
+  }
+  return v >= 'a' && v <= 'f';
+}
+}  // namespace
+
+// Feeds "%" followed by every possible pair of bytes to uriUnescape().
+// When both bytes are hex digits the output must be the single byte whose
+// value sscanf("%x") reads from the same two characters; any other pair
+// must throw std::invalid_argument.
+TEST(Escape, uriUnescapePercentDecoding) {
+  // c[3] stays '\0' so that c + 1 is a NUL-terminated string for sscanf.
+  char c[4] = {'%', '\0', '\0', '\0'};
+  StringPiece in(c, 3);
+  fbstring out;
+  unsigned int expected = 0;
+  for (int i = 0; i < 256; ++i) {
+    c[1] = i;
+    for (int j = 0; j < 256; ++j) {
+      c[2] = j;
+      if (isHex(i) && isHex(j)) {
+        out.clear();
+        uriUnescape(in, out);
+        EXPECT_EQ(1, out.size());
+        EXPECT_EQ(1, sscanf(c + 1, "%x", &expected));
+        // Compare as unsigned so that bytes >= 0x80 do not sign-extend.
+        unsigned char v = out[0];
+        EXPECT_EQ(expected, v);
+      } else {
+        EXPECT_THROW({uriUnescape(in, out);}, std::invalid_argument);
+      }
+    }
+  }
+}
+
+namespace {
+fbstring cbmString;
+fbstring cbmEscapedString;
+fbstring cEscapedString;
+fbstring cUnescapedString;
+const size_t kCBmStringLength = 64 << 10;
+const uint32_t kCPrintablePercentage = 90;
+
+fbstring uribmString;
+fbstring uribmEscapedString;
+fbstring uriEscapedString;
+fbstring uriUnescapedString;
+const size_t kURIBmStringLength = 256;
+const uint32_t kURIPassThroughPercentage = 50;
 
+void initBenchmark() {
   std::mt19937 rnd;
+
+  // C escape
   std::uniform_int_distribution<uint32_t> printable(32, 126);
   std::uniform_int_distribution<uint32_t> nonPrintable(0, 160);
   std::uniform_int_distribution<uint32_t> percentage(0, 99);
 
-  for (size_t i = 0; i < kBmStringLength; ++i) {
+  cbmString.reserve(kCBmStringLength);
+  for (size_t i = 0; i < kCBmStringLength; ++i) {
     unsigned char c;
-    if (percentage(rnd) < kPrintablePercentage) {
+    if (percentage(rnd) < kCPrintablePercentage) {
       c = printable(rnd);
     } else {
       c = nonPrintable(rnd);
@@ -159,23 +251,55 @@ void initBenchmark() {
         c += (126 - 32) + 1;
       }
     }
-    bmString.push_back(c);
+    cbmString.push_back(c);
+  }
+
+  cbmEscapedString = cEscape<fbstring>(cbmString);
+
+  // URI escape
+  std::uniform_int_distribution<uint32_t> passthrough('a', 'z');
+  std::string encodeChars = " ?!\"',+[]";
+  std::uniform_int_distribution<uint32_t> encode(0, encodeChars.size() - 1);
+
+  uribmString.reserve(kURIBmStringLength);
+  for (size_t i = 0; i < kURIBmStringLength; ++i) {
+    unsigned char c;
+    if (percentage(rnd) < kURIPassThroughPercentage) {
+      c = passthrough(rnd);
+    } else {
+      c = encodeChars[encode(rnd)];
+    }
+    uribmString.push_back(c);
   }
 
-  bmEscapedString = cEscape<fbstring>(bmString);
+  uribmEscapedString = uriEscape<fbstring>(uribmString);
 }
 
 BENCHMARK(BM_cEscape, iters) {
   while (iters--) {
-    escapedString = cEscape<fbstring>(bmString);
-    doNotOptimizeAway(escapedString.size());
+    cEscapedString = cEscape<fbstring>(cbmString);
+    doNotOptimizeAway(cEscapedString.size());
   }
 }
 
 BENCHMARK(BM_cUnescape, iters) {
   while (iters--) {
-    unescapedString = cUnescape<fbstring>(bmEscapedString);
-    doNotOptimizeAway(unescapedString.size());
+    cUnescapedString = cUnescape<fbstring>(cbmEscapedString);
+    doNotOptimizeAway(cUnescapedString.size());
+  }
+}
+
+BENCHMARK(BM_uriEscape, iters) {
+  while (iters--) {
+    uriEscapedString = uriEscape<fbstring>(uribmString);
+    doNotOptimizeAway(uriEscapedString.size());
+  }
+}
+
+BENCHMARK(BM_uriUnescape, iters) {
+  while (iters--) {
+    uriUnescapedString = uriUnescape<fbstring>(uribmEscapedString);
+    doNotOptimizeAway(uriUnescapedString.size());
   }
 }
 
diff --git a/folly/test/UriTest.cpp b/folly/test/UriTest.cpp
new file mode 100644 (file)
index 0000000..97b1463
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "folly/Uri.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+using namespace folly;
+
+namespace {
+
+}  // namespace
+
+// Parses a series of URIs and checks every component accessor, as well as
+// the string that fbstr() reassembles from the components.
+TEST(Uri, Simple) {
+  // Host only, no port, with query and fragment.
+  {
+    fbstring s("http://www.facebook.com/hello/world?query#fragment");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("www.facebook.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/hello/world", u.path());
+    EXPECT_EQ("query", u.query());
+    EXPECT_EQ("fragment", u.fragment());
+    EXPECT_EQ(s, u.fbstr());  // canonical
+  }
+
+  // Named host with an explicit port.
+  {
+    fbstring s("http://www.facebook.com:8080/hello/world?query#fragment");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("www.facebook.com", u.host());
+    EXPECT_EQ(8080, u.port());
+    EXPECT_EQ("/hello/world", u.path());
+    EXPECT_EQ("query", u.query());
+    EXPECT_EQ("fragment", u.fragment());
+    EXPECT_EQ(s, u.fbstr());  // canonical
+  }
+
+  // Dotted IPv4 host with a port.
+  {
+    fbstring s("http://127.0.0.1:8080/hello/world?query#fragment");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("127.0.0.1", u.host());
+    EXPECT_EQ(8080, u.port());
+    EXPECT_EQ("/hello/world", u.path());
+    EXPECT_EQ("query", u.query());
+    EXPECT_EQ("fragment", u.fragment());
+    EXPECT_EQ(s, u.fbstr());  // canonical
+  }
+
+  // Bracketed IP-literal host; the brackets stay part of host().
+  {
+    fbstring s("http://[::1]:8080/hello/world?query#fragment");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("[::1]", u.host());
+    EXPECT_EQ(8080, u.port());
+    EXPECT_EQ("/hello/world", u.path());
+    EXPECT_EQ("query", u.query());
+    EXPECT_EQ("fragment", u.fragment());
+    EXPECT_EQ(s, u.fbstr());  // canonical
+  }
+
+  // Username and password.
+  {
+    fbstring s("http://user:pass@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("user", u.username());
+    EXPECT_EQ("pass", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ(s, u.fbstr());
+  }
+
+  // Username only.
+  {
+    fbstring s("http://user@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("user", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ(s, u.fbstr());
+  }
+
+  // Username with an empty password; the dangling ':' is dropped on output.
+  {
+    fbstring s("http://user:@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("user", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ("http://user@host.com/", u.fbstr());
+  }
+
+  // Password with an empty username.
+  {
+    fbstring s("http://:pass@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("pass", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ(s, u.fbstr());
+  }
+
+  // Empty user info; the lone '@' is dropped on output.
+  {
+    fbstring s("http://@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ("http://host.com/", u.fbstr());
+  }
+
+  // Empty username and empty password; ":@" is dropped on output.
+  {
+    fbstring s("http://:@host.com/");
+    Uri u(s);
+    EXPECT_EQ("http", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("host.com", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ("http://host.com/", u.fbstr());
+  }
+
+  // Empty authority followed by an absolute path.
+  {
+    fbstring s("file:///etc/motd");
+    Uri u(s);
+    EXPECT_EQ("file", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/etc/motd", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ(s, u.fbstr());
+  }
+
+  // No authority at all; output still gets "://" after the scheme.
+  {
+    fbstring s("file:/etc/motd");
+    Uri u(s);
+    EXPECT_EQ("file", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/etc/motd", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ("file:///etc/motd", u.fbstr());
+  }
+
+  // With only two slashes, the first path-looking segment is the host.
+  {
+    fbstring s("file://etc/motd");
+    Uri u(s);
+    EXPECT_EQ("file", u.scheme());
+    EXPECT_EQ("", u.username());
+    EXPECT_EQ("", u.password());
+    EXPECT_EQ("etc", u.host());
+    EXPECT_EQ(0, u.port());
+    EXPECT_EQ("/motd", u.path());
+    EXPECT_EQ("", u.query());
+    EXPECT_EQ("", u.fragment());
+    EXPECT_EQ(s, u.fbstr());
+  }
+
+  // A scheme must not start with a digit.
+  EXPECT_THROW({Uri("2http://www.facebook.com/");},
+               std::invalid_argument);
+}