+static unsigned
+findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
+ const UTF8 *sourceEnd) {
+ UTF8 b1, b2, b3;
+
+ assert(!isLegalUTF8Sequence(source, sourceEnd));
+
+ /*
+ * Unicode 6.3.0, D93b:
+ *
+ * Maximal subpart of an ill-formed subsequence: The longest code unit
+ * subsequence starting at an unconvertible offset that is either:
+ * a. the initial subsequence of a well-formed code unit sequence, or
+ * b. a subsequence of length one.
+ */
+
+ if (source == sourceEnd)
+ return 0;
+
+ /*
+ * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
+ * Byte Sequences.
+ */
+
+ b1 = *source;
+ ++source;
+ if (b1 >= 0xC2 && b1 <= 0xDF) {
+ /*
+ * First byte is valid, but we know that this code unit sequence is
+ * invalid, so the maximal subpart has to end after the first byte.
+ */
+ return 1;
+ }
+
+ if (source == sourceEnd)
+ return 1;
+
+ b2 = *source;
+ ++source;
+
+ if (b1 == 0xE0) {
+ return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 >= 0xE1 && b1 <= 0xEC) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 == 0xED) {
+ return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
+ }
+ if (b1 >= 0xEE && b1 <= 0xEF) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 == 0xF0) {
+ if (b2 >= 0x90 && b2 <= 0xBF) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+ if (b1 >= 0xF1 && b1 <= 0xF3) {
+ if (b2 >= 0x80 && b2 <= 0xBF) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+ if (b1 == 0xF4) {
+ if (b2 >= 0x80 && b2 <= 0x8F) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+
+ assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
+ /*
+ * There are no valid sequences that start with these bytes. Maximal subpart
+ * is defined to have length 1 in these cases.
+ */
+ return 1;
+}
+
+/* --------------------------------------------------------------------- */
+