2 * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
3 * This software is provided 'as-is', without any express or implied
4 * warranty. In no event will the author be held liable for any damages
5 * arising from the use of this software.
6 * Permission is granted to anyone to use this software for any purpose,
7 * including commercial applications, and to alter it and redistribute it
8 * freely, subject to the following restrictions:
9 * 1. The origin of this software must not be misrepresented; you must not
10 * claim that you wrote the original software. If you use this software
11 * in a product, an acknowledgment in the product documentation would be
12 * appreciated but is not required.
13 * 2. Altered source versions must be plainly marked as such, and must not be
14 * misrepresented as being the original software.
15 * 3. This notice may not be removed or altered from any source distribution.
17 * ftoth@exalondelft.nl
19 * https://github.com/htot/crc32c
21 * Modified by Facebook
23 * Original intel whitepaper:
24 * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
25 * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
27 * 32-bit support dropped
28 * use intrinsics instead of inline asm
34 #include <folly/detail/ChecksumDetail.h>
36 #include <folly/CppAttributes.h>
38 #include <boost/preprocessor/arithmetic/add.hpp>
39 #include <boost/preprocessor/arithmetic/sub.hpp>
40 #include <boost/preprocessor/repetition/repeat_from_to.hpp>
45 #if FOLLY_SSE_PREREQ(4, 2)
47 namespace crc32_detail {
49 #define CRCtriplet(crc, buf, offset) \
50 crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
51 crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
52 crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); \
55 #define CRCduplet(crc, buf, offset) \
56 crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
57 crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
59 #define CRCsinglet(crc, buf, offset) \
60 crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); \
63 #define CASEREPEAT_TRIPLET(unused, count, total) \
64 case BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)): \
65 CRCtriplet(crc, next, -BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)));
67 #define CASEREPEAT_SINGLET(unused, count, total) \
68 case BOOST_PP_SUB(total, count): \
69 CRCsinglet(crc0, next, -BOOST_PP_SUB(total, count) * 8);
71 // Numbers taken directly from intel whitepaper.
73 const uint64_t clmul_constants[] = {
74 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
75 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
76 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
77 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
78 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
79 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
80 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
81 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
82 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
83 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
84 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
85 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
86 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
87 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
88 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
89 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
90 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
91 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
92 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
93 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
94 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
95 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
96 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
97 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
98 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
99 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
100 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
101 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
102 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
103 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
104 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
105 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
106 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
107 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
108 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
109 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
110 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
111 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
112 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
113 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
114 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
115 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
116 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
117 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
118 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
119 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
120 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
121 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
122 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
123 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
124 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
125 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
126 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
127 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
128 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
129 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
130 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
131 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
132 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
133 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
134 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
135 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
136 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
137 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
142 * CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
143 * chosen constant and xor's these with the remaining CRC.
150 const uint64_t* next2) {
151 const auto multiplier =
152 *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
153 const auto crc0_xmm = _mm_set_epi64x(0, crc0);
154 const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
155 const auto crc1_xmm = _mm_set_epi64x(0, crc1);
156 const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
157 const auto res = _mm_xor_si128(res0, res1);
158 crc0 = _mm_cvtsi128_si64(res);
159 crc0 = crc0 ^ *((uint64_t*)next2 - 1);
160 crc2 = _mm_crc32_u64(crc2, crc0);
164 // Generates a block that will crc up to 7 bytes of unaligned data.
165 // Always inline to avoid overhead on small crc sizes.
166 FOLLY_ALWAYS_INLINE void align_to_8(
168 uint64_t& crc0, // crc so far, updated on return
169 const unsigned char*& next) { // next data pointer, updated on return
170 uint32_t crc32bit = static_cast<uint32_t>(crc0);
172 crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
173 next += sizeof(uint32_t);
176 crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
177 next += sizeof(uint16_t);
180 crc32bit = _mm_crc32_u8(crc32bit, *(next));
186 // The main loop for large crc sizes. Generates three crc32c
187 // streams, of varying block sizes, using a duff's device.
190 uint64_t& crc0, // crc so far, updated on return
191 const unsigned char*& next, // next data pointer, updated on return
192 size_t n) { // block count
193 uint64_t crc1 = 0, crc2 = 0;
194 // points to the first byte of the next block
195 const uint64_t* next0 = (uint64_t*)next + block_size;
196 const uint64_t* next1 = next0 + block_size;
197 const uint64_t* next2 = next1 + block_size;
199 // Use Duff's device, a for() loop inside a switch()
200 // statement. This needs to execute at least once, round len
201 // down to nearest triplet multiple
202 switch (block_size) {
205 // jumps here for a full block of len 128
206 CRCtriplet(crc, next, -128);
208 // Generates case statements from 127 to 2 of form:
210 // CRCtriplet(crc, next, -127);
212 // MSVC has a max preprocessor expansion depth of 255, which is
213 // exceeded if this is a single statement.
214 BOOST_PP_REPEAT_FROM_TO(0, 64, CASEREPEAT_TRIPLET, 126);
215 BOOST_PP_REPEAT_FROM_TO(0, 62, CASEREPEAT_TRIPLET, 62);
217 // For the last byte, the three crc32c streams must be combined
218 // using carry-less multiplication.
220 CRCduplet(crc, next, -1); // the final triplet is actually only 2
221 crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
225 // points to the first byte of the next block
227 next1 = next0 + 128; // from here on all blocks are 128 long
235 next = (const unsigned char*)next2;
238 } // namespace crc32c_detail
240 /* Compute CRC-32C using the Intel hardware instruction. */
241 FOLLY_TARGET_ATTRIBUTE("sse4.2")
242 uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
243 const unsigned char* next = (const unsigned char*)buf;
249 // if len > 216 then align and use triplets
251 size_t align = (8 - (uintptr_t)next) & 7;
252 crc32_detail::align_to_8(align, crc0, next);
255 count = len / 24; // number of triplets
256 len %= 24; // bytes remaining
257 size_t n = count >> 7; // #blocks = first block + full blocks
258 size_t block_size = count & 127;
259 if (block_size == 0) {
265 // This is a separate function call mainly to stop
266 // clang from spilling registers.
267 crc32_detail::triplet_loop(block_size, crc0, next, n);
270 size_t count2 = len >> 3;
272 next += (count2 * 8);
274 // Generates a duff device for the last 128 bytes of aligned data.
276 // Generates case statements of the form:
278 // CRCsinglet(crc0, next, -27 * 8);
279 BOOST_PP_REPEAT_FROM_TO(0, 27, CASEREPEAT_SINGLET, 27);
284 // compute the crc for up to seven trailing bytes
285 crc32_detail::align_to_8(len, crc0, next);
286 return (uint32_t)crc0;
291 uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
292 throw std::runtime_error("crc32_hw is not implemented on this platform");
297 } // namespace detail