MD5 SIMD scratchpad

simd-md5
Inga 🏳‍🌈 7 years ago
parent 5d2c16668b
commit 8ce1001d45
  1. 17
      README.md
  2. 271
      dotnet/WhiteRabbit/MD5Digest.cs

@ -1,3 +1,20 @@
NOTES!!!
========
This branch is an attempt to further optimize MD5 computation.
With SIMD, one can perform all the arithmetic operations in MD5 on vectors of uints instead of individual uints, effectively computing 4 hashes (or 8 hashes on AVX2) at once.
However, SIMD support is somewhat limited in C#: for example, it exposes neither rotate instructions nor even bit-shift instructions. So, in order to perform a rotate, I had to split the vector back into its components, rotate the individual components, and then combine them back into a vector.
Additionally, there is a performance hit at the initialization stage: to create a vector, one first has to allocate an array of its components.
In the last commit, I tried to refactor MD5Digest to operate on vectors instead of individual uints, and I got an overall 4x slowdown (or 2x, if I replace LeftRotate with a no-op).
Even computing 4 hashes at once (4x speedup of MD5Digest) won't compensate for this.
How MD5 could be optimized further:
* Using CPU instructions for rotation (implemented in a not-yet-released version of RyuJIT): https://github.com/dotnet/coreclr/pull/1830
* Computing several MD5 hashes in parallel on each core, using SSE (4 hashes / core) or AVX2 (8 hashes / core). However, even bit shifts on vectors are not yet supported by .NET: https://github.com/dotnet/coreclr/issues/3226
Info
====

@ -2,6 +2,8 @@
namespace WhiteRabbit
{
using System.Numerics;
/**
* Code taken from BouncyCastle and optimized for specific constraints (e.g. input is always larger than 4 bytes and smaller than 52 bytes).
* Further optimization: input could be assumed to be smaller than 27 bytes (original phrase contains 18 letters, so that allows anagrams of 9 words)
@ -11,105 +13,192 @@ namespace WhiteRabbit
*/
internal static class MD5Digest
{
// Per-step additive constants used by Compute: Kn is the constant added in step n.
// Each one is created with the single-value Vector<uint> constructor, so the same
// constant is present in every lane of the vector.

// Steps 1-16 (used with rotations 7/12/17/22).
private static readonly Vector<uint> K1 = new Vector<uint>(0xd76aa478);
private static readonly Vector<uint> K2 = new Vector<uint>(0xe8c7b756);
private static readonly Vector<uint> K3 = new Vector<uint>(0x242070db);
private static readonly Vector<uint> K4 = new Vector<uint>(0xc1bdceee);
private static readonly Vector<uint> K5 = new Vector<uint>(0xf57c0faf);
private static readonly Vector<uint> K6 = new Vector<uint>(0x4787c62a);
private static readonly Vector<uint> K7 = new Vector<uint>(0xa8304613);
private static readonly Vector<uint> K8 = new Vector<uint>(0xfd469501);
private static readonly Vector<uint> K9 = new Vector<uint>(0x698098d8);
private static readonly Vector<uint> K10 = new Vector<uint>(0x8b44f7af);
private static readonly Vector<uint> K11 = new Vector<uint>(0xffff5bb1);
private static readonly Vector<uint> K12 = new Vector<uint>(0x895cd7be);
private static readonly Vector<uint> K13 = new Vector<uint>(0x6b901122);
private static readonly Vector<uint> K14 = new Vector<uint>(0xfd987193);
private static readonly Vector<uint> K15 = new Vector<uint>(0xa679438e);
private static readonly Vector<uint> K16 = new Vector<uint>(0x49b40821);
// Steps 17-32 (used with rotations 5/9/14/20).
private static readonly Vector<uint> K17 = new Vector<uint>(0xf61e2562);
private static readonly Vector<uint> K18 = new Vector<uint>(0xc040b340);
private static readonly Vector<uint> K19 = new Vector<uint>(0x265e5a51);
private static readonly Vector<uint> K20 = new Vector<uint>(0xe9b6c7aa);
private static readonly Vector<uint> K21 = new Vector<uint>(0xd62f105d);
private static readonly Vector<uint> K22 = new Vector<uint>(0x2441453);
private static readonly Vector<uint> K23 = new Vector<uint>(0xd8a1e681);
private static readonly Vector<uint> K24 = new Vector<uint>(0xe7d3fbc8);
private static readonly Vector<uint> K25 = new Vector<uint>(0x21e1cde6);
private static readonly Vector<uint> K26 = new Vector<uint>(0xc33707d6);
private static readonly Vector<uint> K27 = new Vector<uint>(0xf4d50d87);
private static readonly Vector<uint> K28 = new Vector<uint>(0x455a14ed);
private static readonly Vector<uint> K29 = new Vector<uint>(0xa9e3e905);
private static readonly Vector<uint> K30 = new Vector<uint>(0xfcefa3f8);
private static readonly Vector<uint> K31 = new Vector<uint>(0x676f02d9);
private static readonly Vector<uint> K32 = new Vector<uint>(0x8d2a4c8a);
// Steps 33-48 (used with rotations 4/11/16/23).
private static readonly Vector<uint> K33 = new Vector<uint>(0xfffa3942);
private static readonly Vector<uint> K34 = new Vector<uint>(0x8771f681);
private static readonly Vector<uint> K35 = new Vector<uint>(0x6d9d6122);
private static readonly Vector<uint> K36 = new Vector<uint>(0xfde5380c);
private static readonly Vector<uint> K37 = new Vector<uint>(0xa4beea44);
private static readonly Vector<uint> K38 = new Vector<uint>(0x4bdecfa9);
private static readonly Vector<uint> K39 = new Vector<uint>(0xf6bb4b60);
private static readonly Vector<uint> K40 = new Vector<uint>(0xbebfbc70);
private static readonly Vector<uint> K41 = new Vector<uint>(0x289b7ec6);
private static readonly Vector<uint> K42 = new Vector<uint>(0xeaa127fa);
private static readonly Vector<uint> K43 = new Vector<uint>(0xd4ef3085);
private static readonly Vector<uint> K44 = new Vector<uint>(0x4881d05);
private static readonly Vector<uint> K45 = new Vector<uint>(0xd9d4d039);
private static readonly Vector<uint> K46 = new Vector<uint>(0xe6db99e5);
private static readonly Vector<uint> K47 = new Vector<uint>(0x1fa27cf8);
private static readonly Vector<uint> K48 = new Vector<uint>(0xc4ac5665);
// Steps 49-64 (used with rotations 6/10/15/21).
private static readonly Vector<uint> K49 = new Vector<uint>(0xf4292244);
private static readonly Vector<uint> K50 = new Vector<uint>(0x432aff97);
private static readonly Vector<uint> K51 = new Vector<uint>(0xab9423a7);
private static readonly Vector<uint> K52 = new Vector<uint>(0xfc93a039);
private static readonly Vector<uint> K53 = new Vector<uint>(0x655b59c3);
private static readonly Vector<uint> K54 = new Vector<uint>(0x8f0ccc92);
private static readonly Vector<uint> K55 = new Vector<uint>(0xffeff47d);
private static readonly Vector<uint> K56 = new Vector<uint>(0x85845dd1);
private static readonly Vector<uint> K57 = new Vector<uint>(0x6fa87e4f);
private static readonly Vector<uint> K58 = new Vector<uint>(0xfe2ce6e0);
private static readonly Vector<uint> K59 = new Vector<uint>(0xa3014314);
private static readonly Vector<uint> K60 = new Vector<uint>(0x4e0811a1);
private static readonly Vector<uint> K61 = new Vector<uint>(0xf7537e82);
private static readonly Vector<uint> K62 = new Vector<uint>(0xbd3af235);
private static readonly Vector<uint> K63 = new Vector<uint>(0x2ad7d2bb);
private static readonly Vector<uint> K64 = new Vector<uint>(0xeb86d391);
// Initial values of the a/b/c/d state in Compute; they are also added back onto
// the state after the last step.
private static readonly Vector<uint> A0 = new Vector<uint>(0x67452301);
private static readonly Vector<uint> B0 = new Vector<uint>(0xefcdab89);
private static readonly Vector<uint> C0 = new Vector<uint>(0x98badcfe);
private static readonly Vector<uint> D0 = new Vector<uint>(0x10325476);
// Runs the 64 MD5 steps over the words of input.Buffer and returns the four
// resulting state words (a, b, c, d) as a uint array.
//
// NOTE(review): this listing interleaves two revisions of the body without +/- markers.
// The scalar (uint) revision comes first; the Vector<uint> revision starts at the
// GetChunk calls. The return statement likewise contains the array elements of both.
//
// Only Buffer[0]..Buffer[7] are mixed in; steps without an input term add no message
// word at all. NOTE(review): presumably the omitted words are always zero for the
// short inputs this class targets - confirm against Phrase.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint[] Compute(Phrase input)
{
// Scalar revision: initial state.
uint a = 0x67452301;
uint b = 0xefcdab89;
uint c = 0x98badcfe;
uint d = 0x10325476;
// Steps 1-16: Blend(d, c, b) takes bits of c where b is set and bits of d elsewhere; rotations 7/12/17/22.
a = b + LeftRotate(0xd76aa478 + a + Blend(d, c, b) + input.Buffer[0], 7);
d = a + LeftRotate(0xe8c7b756 + d + Blend(c, b, a) + input.Buffer[1], 12);
c = d + LeftRotate(0x242070db + c + Blend(b, a, d) + input.Buffer[2], 17);
b = c + LeftRotate(0xc1bdceee + b + Blend(a, d, c) + input.Buffer[3], 22);
a = b + LeftRotate(0xf57c0faf + a + Blend(d, c, b) + input.Buffer[4], 7);
d = a + LeftRotate(0x4787c62a + d + Blend(c, b, a) + input.Buffer[5], 12);
c = d + LeftRotate(0xa8304613 + c + Blend(b, a, d) + input.Buffer[6], 17);
b = c + LeftRotate(0xfd469501 + b + Blend(a, d, c), 22);
a = b + LeftRotate(0x698098d8 + a + Blend(d, c, b), 7);
d = a + LeftRotate(0x8b44f7af + d + Blend(c, b, a), 12);
c = d + LeftRotate(0xffff5bb1 + c + Blend(b, a, d), 17);
b = c + LeftRotate(0x895cd7be + b + Blend(a, d, c), 22);
a = b + LeftRotate(0x6b901122 + a + Blend(d, c, b), 7);
d = a + LeftRotate(0xfd987193 + d + Blend(c, b, a), 12);
c = d + LeftRotate(0xa679438e + c + Blend(b, a, d) + input.Buffer[7], 17);
b = c + LeftRotate(0x49b40821 + b + Blend(a, d, c), 22);
// Steps 17-32: Blend(c, b, d) takes bits of b where d is set and bits of c elsewhere; rotations 5/9/14/20.
a = b + LeftRotate(0xf61e2562 + a + Blend(c, b, d) + input.Buffer[1], 5);
d = a + LeftRotate(0xc040b340 + d + Blend(b, a, c) + input.Buffer[6], 9);
c = d + LeftRotate(0x265e5a51 + c + Blend(a, d, b), 14);
b = c + LeftRotate(0xe9b6c7aa + b + Blend(d, c, a) + input.Buffer[0], 20);
a = b + LeftRotate(0xd62f105d + a + Blend(c, b, d) + input.Buffer[5], 5);
d = a + LeftRotate(0x02441453 + d + Blend(b, a, c), 9);
c = d + LeftRotate(0xd8a1e681 + c + Blend(a, d, b), 14);
b = c + LeftRotate(0xe7d3fbc8 + b + Blend(d, c, a) + input.Buffer[4], 20);
a = b + LeftRotate(0x21e1cde6 + a + Blend(c, b, d), 5);
d = a + LeftRotate(0xc33707d6 + d + Blend(b, a, c) + input.Buffer[7], 9);
c = d + LeftRotate(0xf4d50d87 + c + Blend(a, d, b) + input.Buffer[3], 14);
b = c + LeftRotate(0x455a14ed + b + Blend(d, c, a), 20);
a = b + LeftRotate(0xa9e3e905 + a + Blend(c, b, d), 5);
d = a + LeftRotate(0xfcefa3f8 + d + Blend(b, a, c) + input.Buffer[2], 9);
c = d + LeftRotate(0x676f02d9 + c + Blend(a, d, b), 14);
b = c + LeftRotate(0x8d2a4c8a + b + Blend(d, c, a), 20);
// Steps 33-48: XOR of the three other state words; rotations 4/11/16/23.
a = b + LeftRotate(0xfffa3942 + a + Xor(b, c, d) + input.Buffer[5], 4);
d = a + LeftRotate(0x8771f681 + d + Xor(a, b, c), 11);
c = d + LeftRotate(0x6d9d6122 + c + Xor(d, a, b), 16);
b = c + LeftRotate(0xfde5380c + b + Xor(c, d, a) + input.Buffer[7], 23);
a = b + LeftRotate(0xa4beea44 + a + Xor(b, c, d) + input.Buffer[1], 4);
d = a + LeftRotate(0x4bdecfa9 + d + Xor(a, b, c) + input.Buffer[4], 11);
c = d + LeftRotate(0xf6bb4b60 + c + Xor(d, a, b), 16);
b = c + LeftRotate(0xbebfbc70 + b + Xor(c, d, a), 23);
a = b + LeftRotate(0x289b7ec6 + a + Xor(b, c, d), 4);
d = a + LeftRotate(0xeaa127fa + d + Xor(a, b, c) + input.Buffer[0], 11);
c = d + LeftRotate(0xd4ef3085 + c + Xor(d, a, b) + input.Buffer[3], 16);
b = c + LeftRotate(0x04881d05 + b + Xor(c, d, a) + input.Buffer[6], 23);
a = b + LeftRotate(0xd9d4d039 + a + Xor(b, c, d), 4);
d = a + LeftRotate(0xe6db99e5 + d + Xor(a, b, c), 11);
c = d + LeftRotate(0x1fa27cf8 + c + Xor(d, a, b), 16);
b = c + LeftRotate(0xc4ac5665 + b + Xor(c, d, a) + input.Buffer[2], 23);
// Steps 49-64: I(c, b, d) evaluates c ^ (b | ~d); rotations 6/10/15/21.
a = b + LeftRotate(0xf4292244 + a + I(c, b, d) + input.Buffer[0], 6);
d = a + LeftRotate(0x432aff97 + d + I(b, a, c), 10);
c = d + LeftRotate(0xab9423a7 + c + I(a, d, b) + input.Buffer[7], 15);
b = c + LeftRotate(0xfc93a039 + b + I(d, c, a) + input.Buffer[5], 21);
a = b + LeftRotate(0x655b59c3 + a + I(c, b, d), 6);
d = a + LeftRotate(0x8f0ccc92 + d + I(b, a, c) + input.Buffer[3], 10);
c = d + LeftRotate(0xffeff47d + c + I(a, d, b), 15);
b = c + LeftRotate(0x85845dd1 + b + I(d, c, a) + input.Buffer[1], 21);
a = b + LeftRotate(0x6fa87e4f + a + I(c, b, d), 6);
d = a + LeftRotate(0xfe2ce6e0 + d + I(b, a, c), 10);
c = d + LeftRotate(0xa3014314 + c + I(a, d, b) + input.Buffer[6], 15);
b = c + LeftRotate(0x4e0811a1 + b + I(d, c, a), 21);
a = b + LeftRotate(0xf7537e82 + a + I(c, b, d) + input.Buffer[4], 6);
d = a + LeftRotate(0xbd3af235 + d + I(b, a, c), 10);
c = d + LeftRotate(0x2ad7d2bb + c + I(a, d, b) + input.Buffer[2], 15);
b = c + LeftRotate(0xeb86d391 + b + I(d, c, a), 21);
// Vector revision: each input word that is used gets wrapped in a vector by GetChunk.
var chunk0 = GetChunk(input, 0);
var chunk1 = GetChunk(input, 1);
var chunk2 = GetChunk(input, 2);
var chunk3 = GetChunk(input, 3);
var chunk4 = GetChunk(input, 4);
var chunk5 = GetChunk(input, 5);
var chunk6 = GetChunk(input, 6);
var chunk7 = GetChunk(input, 7);
// Vector revision: initial state taken from the A0..D0 fields.
var a = A0;
var b = B0;
var c = C0;
var d = D0;
// Steps 1-16 (vector form of the same sequence; constants come from K1..K16).
a = b + LeftRotate(K1 + a + Blend(d, c, b) + chunk0, 7);
d = a + LeftRotate(K2 + d + Blend(c, b, a) + chunk1, 12);
c = d + LeftRotate(K3 + c + Blend(b, a, d) + chunk2, 17);
b = c + LeftRotate(K4 + b + Blend(a, d, c) + chunk3, 22);
a = b + LeftRotate(K5 + a + Blend(d, c, b) + chunk4, 7);
d = a + LeftRotate(K6 + d + Blend(c, b, a) + chunk5, 12);
c = d + LeftRotate(K7 + c + Blend(b, a, d) + chunk6, 17);
b = c + LeftRotate(K8 + b + Blend(a, d, c), 22);
a = b + LeftRotate(K9 + a + Blend(d, c, b), 7);
d = a + LeftRotate(K10 + d + Blend(c, b, a), 12);
c = d + LeftRotate(K11 + c + Blend(b, a, d), 17);
b = c + LeftRotate(K12 + b + Blend(a, d, c), 22);
a = b + LeftRotate(K13 + a + Blend(d, c, b), 7);
d = a + LeftRotate(K14 + d + Blend(c, b, a), 12);
c = d + LeftRotate(K15 + c + Blend(b, a, d) + chunk7, 17);
b = c + LeftRotate(K16 + b + Blend(a, d, c), 22);
// Steps 17-32 (constants K17..K32).
a = b + LeftRotate(K17 + a + Blend(c, b, d) + chunk1, 5);
d = a + LeftRotate(K18 + d + Blend(b, a, c) + chunk6, 9);
c = d + LeftRotate(K19 + c + Blend(a, d, b), 14);
b = c + LeftRotate(K20 + b + Blend(d, c, a) + chunk0, 20);
a = b + LeftRotate(K21 + a + Blend(c, b, d) + chunk5, 5);
d = a + LeftRotate(K22 + d + Blend(b, a, c), 9);
c = d + LeftRotate(K23 + c + Blend(a, d, b), 14);
b = c + LeftRotate(K24 + b + Blend(d, c, a) + chunk4, 20);
a = b + LeftRotate(K25 + a + Blend(c, b, d), 5);
d = a + LeftRotate(K26 + d + Blend(b, a, c) + chunk7, 9);
c = d + LeftRotate(K27 + c + Blend(a, d, b) + chunk3, 14);
b = c + LeftRotate(K28 + b + Blend(d, c, a), 20);
a = b + LeftRotate(K29 + a + Blend(c, b, d), 5);
d = a + LeftRotate(K30 + d + Blend(b, a, c) + chunk2, 9);
c = d + LeftRotate(K31 + c + Blend(a, d, b), 14);
b = c + LeftRotate(K32 + b + Blend(d, c, a), 20);
// Steps 33-48 (constants K33..K48).
a = b + LeftRotate(K33 + a + Xor(b, c, d) + chunk5, 4);
d = a + LeftRotate(K34 + d + Xor(a, b, c), 11);
c = d + LeftRotate(K35 + c + Xor(d, a, b), 16);
b = c + LeftRotate(K36 + b + Xor(c, d, a) + chunk7, 23);
a = b + LeftRotate(K37 + a + Xor(b, c, d) + chunk1, 4);
d = a + LeftRotate(K38 + d + Xor(a, b, c) + chunk4, 11);
c = d + LeftRotate(K39 + c + Xor(d, a, b), 16);
b = c + LeftRotate(K40 + b + Xor(c, d, a), 23);
a = b + LeftRotate(K41 + a + Xor(b, c, d), 4);
d = a + LeftRotate(K42 + d + Xor(a, b, c) + chunk0, 11);
c = d + LeftRotate(K43 + c + Xor(d, a, b) + chunk3, 16);
b = c + LeftRotate(K44 + b + Xor(c, d, a) + chunk6, 23);
a = b + LeftRotate(K45 + a + Xor(b, c, d), 4);
d = a + LeftRotate(K46 + d + Xor(a, b, c), 11);
c = d + LeftRotate(K47 + c + Xor(d, a, b), 16);
b = c + LeftRotate(K48 + b + Xor(c, d, a) + chunk2, 23);
// Steps 49-64 (constants K49..K64).
a = b + LeftRotate(K49 + a + I(c, b, d) + chunk0, 6);
d = a + LeftRotate(K50 + d + I(b, a, c), 10);
c = d + LeftRotate(K51 + c + I(a, d, b) + chunk7, 15);
b = c + LeftRotate(K52 + b + I(d, c, a) + chunk5, 21);
a = b + LeftRotate(K53 + a + I(c, b, d), 6);
d = a + LeftRotate(K54 + d + I(b, a, c) + chunk3, 10);
c = d + LeftRotate(K55 + c + I(a, d, b), 15);
b = c + LeftRotate(K56 + b + I(d, c, a) + chunk1, 21);
a = b + LeftRotate(K57 + a + I(c, b, d), 6);
d = a + LeftRotate(K58 + d + I(b, a, c), 10);
c = d + LeftRotate(K59 + c + I(a, d, b) + chunk6, 15);
b = c + LeftRotate(K60 + b + I(d, c, a), 21);
a = b + LeftRotate(K61 + a + I(c, b, d) + chunk4, 6);
d = a + LeftRotate(K62 + d + I(b, a, c), 10);
c = d + LeftRotate(K63 + c + I(a, d, b) + chunk2, 15);
b = c + LeftRotate(K64 + b + I(d, c, a), 21);
// Vector revision: add the initial state back onto the final state.
a += A0;
b += B0;
c += C0;
d += D0;
return new[]
{
// Scalar revision: the initial values are added back while building the result.
0x67452301 + a,
0xefcdab89 + b,
0x98badcfe + c,
0x10325476 + d,
// Vector revision: only lane 0 of each state vector is returned.
a[0],
b[0],
c[0],
d[0],
};
}
// Bitwise select: each result bit is taken from b where the corresponding bit of x
// is 1, and from a where it is 0.
// NOTE(review): two signature lines appear because this listing shows both the old
// (uint) and new (Vector<uint>) declaration; the body is shared by both.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint Blend(uint a, uint b, uint x)
private static Vector<uint> Blend(Vector<uint> a, Vector<uint> b, Vector<uint> x)
{
return (x & b) | (~x & a);
}
// Returns the bitwise XOR of the three arguments.
// NOTE(review): two signature lines appear because this listing shows both the old
// (uint) and new (Vector<uint>) declaration; the body is shared by both.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint Xor(uint a, uint b, uint c)
private static Vector<uint> Xor(Vector<uint> a, Vector<uint> b, Vector<uint> c)
{
return a ^ b ^ c;
}
// Returns a XOR (b OR NOT c), evaluated bitwise.
// NOTE(review): two signature lines appear because this listing shows both the old
// (uint) and new (Vector<uint>) declaration; the body is shared by both.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint I(uint a, uint b, uint c)
private static Vector<uint> I(Vector<uint> a, Vector<uint> b, Vector<uint> c)
{
return a ^ (b | ~c);
}
@ -119,5 +208,29 @@ namespace WhiteRabbit
{
return (x << left) | (x >> 32 - left);
}
// Rotates every lane of x left by the given number of bits.
// There is no vector rotate available here, so each lane is rotated with the
// scalar LeftRotate overload and the results are packed back into a vector.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<uint> LeftRotate(Vector<uint> x, int left)
{
    // The lane count is hardware dependent (e.g. 4 with 128-bit registers, 8 with AVX2).
    // The array-based Vector<uint> constructor rejects arrays shorter than
    // Vector<uint>.Count, so the buffer is sized from Count instead of assuming 4.
    var lanes = new uint[Vector<uint>.Count];
    for (var i = 0; i < lanes.Length; i++)
    {
        lanes[i] = LeftRotate(x[i], left);
    }

    return new Vector<uint>(lanes);
}
// Returns a vector in which every lane holds phrase.Buffer[offset].
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<uint> GetChunk(Phrase phrase, int offset)
{
    // The single-value constructor fills all lanes with the same word. Unlike a
    // hand-built 4-element array, it works for any Vector<uint>.Count (the array
    // constructor throws when the array is shorter than Count, e.g. 8 lanes on AVX2)
    // and it avoids allocating a temporary array on every call.
    return new Vector<uint>(phrase.Buffer[offset]);
}
}
}

Loading…
Cancel
Save