From 8ce1001d4580992de706e8387aba590375a81798 Mon Sep 17 00:00:00 2001 From: inga-lovinde <52715130+inga-lovinde@users.noreply.github.com> Date: Sat, 1 Apr 2017 19:58:34 +0300 Subject: [PATCH] MD5 SIMD scratchpad --- README.md | 17 ++ dotnet/WhiteRabbit/MD5Digest.cs | 271 ++++++++++++++++++++++---------- 2 files changed, 209 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 3c86fcb..1e21d52 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ +NOTES!!! +======== + +This branch is an attempt to further optimize MD5 computation. +With SIMD, one can perform all the arithmetic operations in MD5 on vectors of uints instead of individual uints, effectively computing 4 hashes (or 8 hashes on AVX2) at once. + +However, SIMD support is somewhat limited in C#; for example, it gives access neither to rotate instructions nor even to bit-shift instructions; so, in order to perform a rotate, I had to split the vector back into components, rotate the individual components, and then combine them back into a vector. +Additionally, there is a performance hit at the initialization stage, because creating a vector requires allocating an array of its components first. + +In the last commit, I tried to refactor MD5Digest to operate on vectors instead of individual uints, and I got an overall slowdown of 4x (or 2x, if I replace LeftRotate with a no-op). +Even computing 4 hashes at once (a 4x speedup of MD5Digest) won't compensate for this. + +How MD5 could be optimized further: + +* Using CPU instructions for rotation (implemented in a not-yet-released version of RyuJIT): https://github.com/dotnet/coreclr/pull/1830 +* Computing several MD5 hashes in parallel on each core, using SSE (4 hashes / core) or AVX2 (8 hashes / core). 
However, even bit shifts on vectors are not yet supported by .NET: https://github.com/dotnet/coreclr/issues/3226 + Info ==== diff --git a/dotnet/WhiteRabbit/MD5Digest.cs b/dotnet/WhiteRabbit/MD5Digest.cs index 1075395..bee9dfe 100644 --- a/dotnet/WhiteRabbit/MD5Digest.cs +++ b/dotnet/WhiteRabbit/MD5Digest.cs @@ -2,6 +2,8 @@ namespace WhiteRabbit { + using System.Numerics; + /** * Code taken from BouncyCastle and optimized for specific constraints (e.g. input is always larger than 4 bytes and smaller than 52 bytes). * Further optimization: input could be assumed to be smaller than 27 bytes (original phrase contains 18 letters, so that allows anagrams of 9 words) @@ -11,105 +13,192 @@ namespace WhiteRabbit */ internal static class MD5Digest { + private static readonly Vector K1 = new Vector(0xd76aa478); /* K1..K64 hold, in order, the 64 additive constants that the scalar round code removed further down used as inline literals (0xd76aa478, 0xe8c7b756, ...). NOTE(review): the generic type argument seems to be missing from Vector throughout this patch text (presumably Vector<uint>) - confirm against the repository. */ + private static readonly Vector K2 = new Vector(0xe8c7b756); + private static readonly Vector K3 = new Vector(0x242070db); + private static readonly Vector K4 = new Vector(0xc1bdceee); + private static readonly Vector K5 = new Vector(0xf57c0faf); + private static readonly Vector K6 = new Vector(0x4787c62a); + private static readonly Vector K7 = new Vector(0xa8304613); + private static readonly Vector K8 = new Vector(0xfd469501); + private static readonly Vector K9 = new Vector(0x698098d8); + private static readonly Vector K10 = new Vector(0x8b44f7af); + private static readonly Vector K11 = new Vector(0xffff5bb1); + private static readonly Vector K12 = new Vector(0x895cd7be); + private static readonly Vector K13 = new Vector(0x6b901122); + private static readonly Vector K14 = new Vector(0xfd987193); + private static readonly Vector K15 = new Vector(0xa679438e); + private static readonly Vector K16 = new Vector(0x49b40821); + + private static readonly Vector K17 = new Vector(0xf61e2562); + private static readonly Vector K18 = new Vector(0xc040b340); + private static readonly Vector K19 = new Vector(0x265e5a51); + private static readonly Vector K20 = new 
Vector(0xe9b6c7aa); + private static readonly Vector K21 = new Vector(0xd62f105d); + private static readonly Vector K22 = new Vector(0x2441453); + private static readonly Vector K23 = new Vector(0xd8a1e681); + private static readonly Vector K24 = new Vector(0xe7d3fbc8); + private static readonly Vector K25 = new Vector(0x21e1cde6); + private static readonly Vector K26 = new Vector(0xc33707d6); + private static readonly Vector K27 = new Vector(0xf4d50d87); + private static readonly Vector K28 = new Vector(0x455a14ed); + private static readonly Vector K29 = new Vector(0xa9e3e905); + private static readonly Vector K30 = new Vector(0xfcefa3f8); + private static readonly Vector K31 = new Vector(0x676f02d9); + private static readonly Vector K32 = new Vector(0x8d2a4c8a); + + private static readonly Vector K33 = new Vector(0xfffa3942); + private static readonly Vector K34 = new Vector(0x8771f681); + private static readonly Vector K35 = new Vector(0x6d9d6122); + private static readonly Vector K36 = new Vector(0xfde5380c); + private static readonly Vector K37 = new Vector(0xa4beea44); + private static readonly Vector K38 = new Vector(0x4bdecfa9); + private static readonly Vector K39 = new Vector(0xf6bb4b60); + private static readonly Vector K40 = new Vector(0xbebfbc70); + private static readonly Vector K41 = new Vector(0x289b7ec6); + private static readonly Vector K42 = new Vector(0xeaa127fa); + private static readonly Vector K43 = new Vector(0xd4ef3085); + private static readonly Vector K44 = new Vector(0x4881d05); + private static readonly Vector K45 = new Vector(0xd9d4d039); + private static readonly Vector K46 = new Vector(0xe6db99e5); + private static readonly Vector K47 = new Vector(0x1fa27cf8); + private static readonly Vector K48 = new Vector(0xc4ac5665); + + private static readonly Vector K49 = new Vector(0xf4292244); + private static readonly Vector K50 = new Vector(0x432aff97); + private static readonly Vector K51 = new Vector(0xab9423a7); + private static 
readonly Vector K52 = new Vector(0xfc93a039); + private static readonly Vector K53 = new Vector(0x655b59c3); + private static readonly Vector K54 = new Vector(0x8f0ccc92); + private static readonly Vector K55 = new Vector(0xffeff47d); + private static readonly Vector K56 = new Vector(0x85845dd1); + private static readonly Vector K57 = new Vector(0x6fa87e4f); + private static readonly Vector K58 = new Vector(0xfe2ce6e0); + private static readonly Vector K59 = new Vector(0xa3014314); + private static readonly Vector K60 = new Vector(0x4e0811a1); + private static readonly Vector K61 = new Vector(0xf7537e82); + private static readonly Vector K62 = new Vector(0xbd3af235); + private static readonly Vector K63 = new Vector(0x2ad7d2bb); + private static readonly Vector K64 = new Vector(0xeb86d391); + + private static readonly Vector A0 = new Vector(0x67452301); /* A0..D0 hold the four initial state values that the removed scalar code assigned to a, b, c, d and added back to them when building the result. */ + private static readonly Vector B0 = new Vector(0xefcdab89); + private static readonly Vector C0 = new Vector(0x98badcfe); + private static readonly Vector D0 = new Vector(0x10325476); + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint[] Compute(Phrase input) { - uint a = 0x67452301; - uint b = 0xefcdab89; - uint c = 0x98badcfe; - uint d = 0x10325476; - - a = b + LeftRotate(0xd76aa478 + a + Blend(d, c, b) + input.Buffer[0], 7); - d = a + LeftRotate(0xe8c7b756 + d + Blend(c, b, a) + input.Buffer[1], 12); - c = d + LeftRotate(0x242070db + c + Blend(b, a, d) + input.Buffer[2], 17); - b = c + LeftRotate(0xc1bdceee + b + Blend(a, d, c) + input.Buffer[3], 22); - a = b + LeftRotate(0xf57c0faf + a + Blend(d, c, b) + input.Buffer[4], 7); - d = a + LeftRotate(0x4787c62a + d + Blend(c, b, a) + input.Buffer[5], 12); - c = d + LeftRotate(0xa8304613 + c + Blend(b, a, d) + input.Buffer[6], 17); - b = c + LeftRotate(0xfd469501 + b + Blend(a, d, c), 22); - a = b + LeftRotate(0x698098d8 + a + Blend(d, c, b), 7); - d = a + LeftRotate(0x8b44f7af + d + Blend(c, b, a), 12); - c = d + LeftRotate(0xffff5bb1 + c + 
Blend(b, a, d), 17); - b = c + LeftRotate(0x895cd7be + b + Blend(a, d, c), 22); - a = b + LeftRotate(0x6b901122 + a + Blend(d, c, b), 7); - d = a + LeftRotate(0xfd987193 + d + Blend(c, b, a), 12); - c = d + LeftRotate(0xa679438e + c + Blend(b, a, d) + input.Buffer[7], 17); - b = c + LeftRotate(0x49b40821 + b + Blend(a, d, c), 22); - - a = b + LeftRotate(0xf61e2562 + a + Blend(c, b, d) + input.Buffer[1], 5); - d = a + LeftRotate(0xc040b340 + d + Blend(b, a, c) + input.Buffer[6], 9); - c = d + LeftRotate(0x265e5a51 + c + Blend(a, d, b), 14); - b = c + LeftRotate(0xe9b6c7aa + b + Blend(d, c, a) + input.Buffer[0], 20); - a = b + LeftRotate(0xd62f105d + a + Blend(c, b, d) + input.Buffer[5], 5); - d = a + LeftRotate(0x02441453 + d + Blend(b, a, c), 9); - c = d + LeftRotate(0xd8a1e681 + c + Blend(a, d, b), 14); - b = c + LeftRotate(0xe7d3fbc8 + b + Blend(d, c, a) + input.Buffer[4], 20); - a = b + LeftRotate(0x21e1cde6 + a + Blend(c, b, d), 5); - d = a + LeftRotate(0xc33707d6 + d + Blend(b, a, c) + input.Buffer[7], 9); - c = d + LeftRotate(0xf4d50d87 + c + Blend(a, d, b) + input.Buffer[3], 14); - b = c + LeftRotate(0x455a14ed + b + Blend(d, c, a), 20); - a = b + LeftRotate(0xa9e3e905 + a + Blend(c, b, d), 5); - d = a + LeftRotate(0xfcefa3f8 + d + Blend(b, a, c) + input.Buffer[2], 9); - c = d + LeftRotate(0x676f02d9 + c + Blend(a, d, b), 14); - b = c + LeftRotate(0x8d2a4c8a + b + Blend(d, c, a), 20); - - a = b + LeftRotate(0xfffa3942 + a + Xor(b, c, d) + input.Buffer[5], 4); - d = a + LeftRotate(0x8771f681 + d + Xor(a, b, c), 11); - c = d + LeftRotate(0x6d9d6122 + c + Xor(d, a, b), 16); - b = c + LeftRotate(0xfde5380c + b + Xor(c, d, a) + input.Buffer[7], 23); - a = b + LeftRotate(0xa4beea44 + a + Xor(b, c, d) + input.Buffer[1], 4); - d = a + LeftRotate(0x4bdecfa9 + d + Xor(a, b, c) + input.Buffer[4], 11); - c = d + LeftRotate(0xf6bb4b60 + c + Xor(d, a, b), 16); - b = c + LeftRotate(0xbebfbc70 + b + Xor(c, d, a), 23); - a = b + LeftRotate(0x289b7ec6 + a + Xor(b, c, d), 4); 
- d = a + LeftRotate(0xeaa127fa + d + Xor(a, b, c) + input.Buffer[0], 11); - c = d + LeftRotate(0xd4ef3085 + c + Xor(d, a, b) + input.Buffer[3], 16); - b = c + LeftRotate(0x04881d05 + b + Xor(c, d, a) + input.Buffer[6], 23); - a = b + LeftRotate(0xd9d4d039 + a + Xor(b, c, d), 4); - d = a + LeftRotate(0xe6db99e5 + d + Xor(a, b, c), 11); - c = d + LeftRotate(0x1fa27cf8 + c + Xor(d, a, b), 16); - b = c + LeftRotate(0xc4ac5665 + b + Xor(c, d, a) + input.Buffer[2], 23); - - a = b + LeftRotate(0xf4292244 + a + I(c, b, d) + input.Buffer[0], 6); - d = a + LeftRotate(0x432aff97 + d + I(b, a, c), 10); - c = d + LeftRotate(0xab9423a7 + c + I(a, d, b) + input.Buffer[7], 15); - b = c + LeftRotate(0xfc93a039 + b + I(d, c, a) + input.Buffer[5], 21); - a = b + LeftRotate(0x655b59c3 + a + I(c, b, d), 6); - d = a + LeftRotate(0x8f0ccc92 + d + I(b, a, c) + input.Buffer[3], 10); - c = d + LeftRotate(0xffeff47d + c + I(a, d, b), 15); - b = c + LeftRotate(0x85845dd1 + b + I(d, c, a) + input.Buffer[1], 21); - a = b + LeftRotate(0x6fa87e4f + a + I(c, b, d), 6); - d = a + LeftRotate(0xfe2ce6e0 + d + I(b, a, c), 10); - c = d + LeftRotate(0xa3014314 + c + I(a, d, b) + input.Buffer[6], 15); - b = c + LeftRotate(0x4e0811a1 + b + I(d, c, a), 21); - a = b + LeftRotate(0xf7537e82 + a + I(c, b, d) + input.Buffer[4], 6); - d = a + LeftRotate(0xbd3af235 + d + I(b, a, c), 10); - c = d + LeftRotate(0x2ad7d2bb + c + I(a, d, b) + input.Buffer[2], 15); - b = c + LeftRotate(0xeb86d391 + b + I(d, c, a), 21); + var chunk0 = GetChunk(input, 0); /* chunk0..chunk7 wrap input.Buffer[0] through input.Buffer[7]; the rounds below add chunkN in the same steps where the removed scalar code added input.Buffer[N]. */ + var chunk1 = GetChunk(input, 1); + var chunk2 = GetChunk(input, 2); + var chunk3 = GetChunk(input, 3); + var chunk4 = GetChunk(input, 4); + var chunk5 = GetChunk(input, 5); + var chunk6 = GetChunk(input, 6); + var chunk7 = GetChunk(input, 7); + + var a = A0; + var b = B0; + var c = C0; + var d = D0; + + a = b + LeftRotate(K1 + a + Blend(d, c, b) + chunk0, 7); + d = a + LeftRotate(K2 + d + Blend(c, b, a) + chunk1, 12); + c = d + LeftRotate(K3 + c + Blend(b, a, d) + 
chunk2, 17); + b = c + LeftRotate(K4 + b + Blend(a, d, c) + chunk3, 22); + a = b + LeftRotate(K5 + a + Blend(d, c, b) + chunk4, 7); + d = a + LeftRotate(K6 + d + Blend(c, b, a) + chunk5, 12); + c = d + LeftRotate(K7 + c + Blend(b, a, d) + chunk6, 17); + b = c + LeftRotate(K8 + b + Blend(a, d, c), 22); + a = b + LeftRotate(K9 + a + Blend(d, c, b), 7); + d = a + LeftRotate(K10 + d + Blend(c, b, a), 12); + c = d + LeftRotate(K11 + c + Blend(b, a, d), 17); + b = c + LeftRotate(K12 + b + Blend(a, d, c), 22); + a = b + LeftRotate(K13 + a + Blend(d, c, b), 7); + d = a + LeftRotate(K14 + d + Blend(c, b, a), 12); + c = d + LeftRotate(K15 + c + Blend(b, a, d) + chunk7, 17); + b = c + LeftRotate(K16 + b + Blend(a, d, c), 22); + + a = b + LeftRotate(K17 + a + Blend(c, b, d) + chunk1, 5); + d = a + LeftRotate(K18 + d + Blend(b, a, c) + chunk6, 9); + c = d + LeftRotate(K19 + c + Blend(a, d, b), 14); + b = c + LeftRotate(K20 + b + Blend(d, c, a) + chunk0, 20); + a = b + LeftRotate(K21 + a + Blend(c, b, d) + chunk5, 5); + d = a + LeftRotate(K22 + d + Blend(b, a, c), 9); + c = d + LeftRotate(K23 + c + Blend(a, d, b), 14); + b = c + LeftRotate(K24 + b + Blend(d, c, a) + chunk4, 20); + a = b + LeftRotate(K25 + a + Blend(c, b, d), 5); + d = a + LeftRotate(K26 + d + Blend(b, a, c) + chunk7, 9); + c = d + LeftRotate(K27 + c + Blend(a, d, b) + chunk3, 14); + b = c + LeftRotate(K28 + b + Blend(d, c, a), 20); + a = b + LeftRotate(K29 + a + Blend(c, b, d), 5); + d = a + LeftRotate(K30 + d + Blend(b, a, c) + chunk2, 9); + c = d + LeftRotate(K31 + c + Blend(a, d, b), 14); + b = c + LeftRotate(K32 + b + Blend(d, c, a), 20); + + a = b + LeftRotate(K33 + a + Xor(b, c, d) + chunk5, 4); + d = a + LeftRotate(K34 + d + Xor(a, b, c), 11); + c = d + LeftRotate(K35 + c + Xor(d, a, b), 16); + b = c + LeftRotate(K36 + b + Xor(c, d, a) + chunk7, 23); + a = b + LeftRotate(K37 + a + Xor(b, c, d) + chunk1, 4); + d = a + LeftRotate(K38 + d + Xor(a, b, c) + chunk4, 11); + c = d + LeftRotate(K39 + c + Xor(d, a, 
b), 16); + b = c + LeftRotate(K40 + b + Xor(c, d, a), 23); + a = b + LeftRotate(K41 + a + Xor(b, c, d), 4); + d = a + LeftRotate(K42 + d + Xor(a, b, c) + chunk0, 11); + c = d + LeftRotate(K43 + c + Xor(d, a, b) + chunk3, 16); + b = c + LeftRotate(K44 + b + Xor(c, d, a) + chunk6, 23); + a = b + LeftRotate(K45 + a + Xor(b, c, d), 4); + d = a + LeftRotate(K46 + d + Xor(a, b, c), 11); + c = d + LeftRotate(K47 + c + Xor(d, a, b), 16); + b = c + LeftRotate(K48 + b + Xor(c, d, a) + chunk2, 23); + + a = b + LeftRotate(K49 + a + I(c, b, d) + chunk0, 6); + d = a + LeftRotate(K50 + d + I(b, a, c), 10); + c = d + LeftRotate(K51 + c + I(a, d, b) + chunk7, 15); + b = c + LeftRotate(K52 + b + I(d, c, a) + chunk5, 21); + a = b + LeftRotate(K53 + a + I(c, b, d), 6); + d = a + LeftRotate(K54 + d + I(b, a, c) + chunk3, 10); + c = d + LeftRotate(K55 + c + I(a, d, b), 15); + b = c + LeftRotate(K56 + b + I(d, c, a) + chunk1, 21); + a = b + LeftRotate(K57 + a + I(c, b, d), 6); + d = a + LeftRotate(K58 + d + I(b, a, c), 10); + c = d + LeftRotate(K59 + c + I(a, d, b) + chunk6, 15); + b = c + LeftRotate(K60 + b + I(d, c, a), 21); + a = b + LeftRotate(K61 + a + I(c, b, d) + chunk4, 6); + d = a + LeftRotate(K62 + d + I(b, a, c), 10); + c = d + LeftRotate(K63 + c + I(a, d, b) + chunk2, 15); + b = c + LeftRotate(K64 + b + I(d, c, a), 21); + + a += A0; + b += B0; + c += C0; + d += D0; return new[] { - 0x67452301 + a, - 0xefcdab89 + b, - 0x98badcfe + c, - 0x10325476 + d, + a[0], /* Only element 0 of each state vector is placed in the returned four-element array. */ + b[0], + c[0], + d[0], }; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint Blend(uint a, uint b, uint x) + private static Vector Blend(Vector a, Vector b, Vector x) /* Bitwise select: each result bit is taken from b where the matching bit of x is set, and from a where it is clear. */ { return (x & b) | (~x & a); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint Xor(uint a, uint b, uint c) + private static Vector Xor(Vector a, Vector b, Vector c) /* Bitwise XOR of the three arguments. */ { return a ^ b ^ c; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint I(uint a, uint b, uint c) + private static Vector 
I(Vector a, Vector b, Vector c) /* Computes a XOR (b OR NOT c), bitwise. */ { return a ^ (b | ~c); } @@ -119,5 +208,29 @@ namespace WhiteRabbit { return (x << left) | (x >> 32 - left); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector LeftRotate(Vector x, int left) /* Rotates elements 0..3 of x left by `left` bits with the scalar LeftRotate overload and packs the four results into a new Vector through a newly created array. NOTE(review): this hard-codes four elements; the README text in this patch mentions 8 per vector on AVX2, and the array-based Vector constructor may reject an array shorter than the vector width - confirm. */ + { + return new Vector(new[] + { + LeftRotate(x[0], left), + LeftRotate(x[1], left), + LeftRotate(x[2], left), + LeftRotate(x[3], left), + }); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector GetChunk(Phrase phrase, int offset) /* Builds a Vector from a newly created four-element array in which every element is phrase.Buffer[offset]. NOTE(review): the array length is fixed at four, so this presumably assumes a four-element vector - confirm on wider hardware. */ + { + return new Vector(new[] + { + phrase.Buffer[offset], + phrase.Buffer[offset], + phrase.Buffer[offset], + phrase.Buffer[offset], + }); + } } }