Optimized MD5 computation (loop unrolling)

master
Inga 🏳‍🌈 8 years ago
parent c919172ac7
commit b667aa8830
  1. 8
      README.md
  2. 1
      dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp
  3. 99
      dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp

@ -47,10 +47,10 @@ Number of words|Time to check all anagrams no longer than that|Time to solve "ea
---------------|----------------------------------------------|-------------------------|-----------------------------------|-------------------------|--------------------------------------------- ---------------|----------------------------------------------|-------------------------|-----------------------------------|-------------------------|---------------------------------------------
3|0.04s||||4560 3|0.04s||||4560
4|0.45s|||0.08s|7,431,984 4|0.45s|||0.08s|7,431,984
5|11.7s|0.15s|0.06s|0.3s|1,347,437,484 5|10.4s|0.15s|0.06s|0.3s|1,347,437,484
6|6.5 minutes|1.2s|0.2s|2.9s|58,405,904,844 6|5.5 minutes|1s|0.2s|2.5s|58,405,904,844
7|2 hours|6.7s|0.9s|19s|1,070,307,744,114 7|81 minutes|5.7s|0.8s|16.1s|1,070,307,744,114
8|21.5 hours|25s|2.6s|79s|10,893,594,396,594 8|21.5 hours|21s|2.3s|65s|10,893,594,396,594
9||2.5 minutes|13s|9.5 minutes|70,596,864,409,954 9||2.5 minutes|13s|9.5 minutes|70,596,864,409,954
10||5 minutes|21s|17.5 minutes|314,972,701,475,754 10||5 minutes|21s|17.5 minutes|314,972,701,475,754

@ -10,7 +10,6 @@ void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned __int32 * inp
{ {
#if AVX2 #if AVX2
md5(input + 0 * 8 * 8); md5(input + 0 * 8 * 8);
md5(input + 1 * 8 * 8);
#elif SIMD #elif SIMD
md5(input + 0 * 8 * 4); md5(input + 0 * 8 * 4);
md5(input + 1 * 8 * 4); md5(input + 1 * 8 * 4);

@ -7,18 +7,61 @@
#if AVX2 #if AVX2
typedef __m256i MD5Vector; //typedef __m256i MD5Vector;
struct MD5Vector
#define OP_XOR(a, b) _mm256_xor_si256(a, b) {
#define OP_AND(a, b) _mm256_and_si256(a, b) __m256i m_V0;
#define OP_ANDNOT(a, b) _mm256_andnot_si256(a, b) __m256i m_V1;
#define OP_OR(a, b) _mm256_or_si256(a, b) inline MD5Vector() {}
#define OP_ADD(a, b) _mm256_add_epi32(a, b) inline MD5Vector(__m256i C0, __m256i C1) :m_V0(C0), m_V1(C1) {}
#define OP_ROT(a, r) OP_OR(_mm256_slli_epi32(a, r), _mm256_srli_epi32(a, 32 - (r)))
// Bitwise XOR of this vector pair with R; each 256-bit half is combined independently.
__forceinline MD5Vector MXor(MD5Vector R) const
{
    const __m256i half0 = _mm256_xor_si256(m_V0, R.m_V0);
    const __m256i half1 = _mm256_xor_si256(m_V1, R.m_V1);
    return MD5Vector(half0, half1);
}
// Bitwise AND of this vector pair with R; each 256-bit half is combined independently.
__forceinline MD5Vector MAnd(MD5Vector R) const
{
    const __m256i half0 = _mm256_and_si256(m_V0, R.m_V0);
    const __m256i half1 = _mm256_and_si256(m_V1, R.m_V1);
    return MD5Vector(half0, half1);
}
// AND-NOT of this vector pair with R, per 256-bit half.
// This object is passed as the FIRST operand of _mm256_andnot_si256 and R as the second,
// so operand order matters: swapping them would change the result.
__forceinline MD5Vector MAndNot(MD5Vector R) const
{
    const __m256i half0 = _mm256_andnot_si256(m_V0, R.m_V0);
    const __m256i half1 = _mm256_andnot_si256(m_V1, R.m_V1);
    return MD5Vector(half0, half1);
}
// Bitwise OR of this vector pair with R; each 256-bit half is combined independently.
__forceinline MD5Vector MOr(MD5Vector R) const
{
    const __m256i half0 = _mm256_or_si256(m_V0, R.m_V0);
    const __m256i half1 = _mm256_or_si256(m_V1, R.m_V1);
    return MD5Vector(half0, half1);
}
// Adds R to this vector pair using _mm256_add_epi32 (32-bit element addition) on each half.
__forceinline MD5Vector MAdd(MD5Vector R) const
{
    const __m256i half0 = _mm256_add_epi32(m_V0, R.m_V0);
    const __m256i half1 = _mm256_add_epi32(m_V1, R.m_V1);
    return MD5Vector(half0, half1);
}
// Shifts both halves left by `shift` using _mm256_slli_epi32 (operates on 32-bit elements).
__forceinline MD5Vector MShiftLeft(int shift) const
{
    const __m256i half0 = _mm256_slli_epi32(m_V0, shift);
    const __m256i half1 = _mm256_slli_epi32(m_V1, shift);
    return MD5Vector(half0, half1);
}
// Shifts both halves right by `shift` using _mm256_srli_epi32 (operates on 32-bit elements).
__forceinline MD5Vector MShiftRight(int shift) const
{
    const __m256i half0 = _mm256_srli_epi32(m_V0, shift);
    const __m256i half1 = _mm256_srli_epi32(m_V1, shift);
    return MD5Vector(half0, half1);
}
};
// Vector operation macros; each forwards to the matching MD5Vector member function.
// The object argument is parenthesized so that an operand written with lower-precedence
// operators (e.g. a conditional expression) is treated as one object instead of having
// the member call bind to only its last sub-expression.
#define OP_XOR(a, b) (a).MXor(b)
#define OP_AND(a, b) (a).MAnd(b)
#define OP_ANDNOT(a, b) (a).MAndNot(b)
#define OP_OR(a, b) (a).MOr(b)
#define OP_ADD(a, b) (a).MAdd(b)
// Rotate: (a shifted left by r) OR (a shifted right by 32 - r).
// Note that `a` is expanded twice, so it should be free of side effects.
#define OP_ROT(a, r) OP_OR((a).MShiftLeft(r), (a).MShiftRight(32 - (r)))
#define OP_BLEND(a, b, x) OP_OR(OP_AND(x, b), OP_ANDNOT(x, a)) #define OP_BLEND(a, b, x) OP_OR(OP_AND(x, b), OP_ANDNOT(x, a))
#define CREATE_VECTOR(a) _mm256_set1_epi32(a) #define CREATE_VECTOR(a) MD5Vector(_mm256_set1_epi32(a), _mm256_set1_epi32(a))
#define CREATE_VECTOR_FROM_INPUT(input, offset) _mm256_set_epi32( \ #define CREATE_VECTOR_FROM_INPUT(input, offset) MD5Vector(\
_mm256_set_epi32( \
input[offset + 7 * 8], \ input[offset + 7 * 8], \
input[offset + 6 * 8], \ input[offset + 6 * 8], \
input[offset + 5 * 8], \ input[offset + 5 * 8], \
@ -26,17 +69,35 @@ typedef __m256i MD5Vector;
input[offset + 3 * 8], \ input[offset + 3 * 8], \
input[offset + 2 * 8], \ input[offset + 2 * 8], \
input[offset + 1 * 8], \ input[offset + 1 * 8], \
input[offset + 0 * 8]) input[offset + 0 * 8]), \
_mm256_set_epi32( \
input[offset + 15 * 8], \
input[offset + 14 * 8], \
input[offset + 13 * 8], \
input[offset + 12 * 8], \
input[offset + 11 * 8], \
input[offset + 10 * 8], \
input[offset + 9 * 8], \
input[offset + 8 * 8]) \
)
#define WRITE_TO_OUTPUT(a, output) \ #define WRITE_TO_OUTPUT(a, output) \
output[7 + 0 * 8] = a.m256i_u32[0]; \ output[7 + 0 * 8] = a.m_V0.m256i_u32[0]; \
output[7 + 1 * 8] = a.m256i_u32[1]; \ output[7 + 1 * 8] = a.m_V0.m256i_u32[1]; \
output[7 + 2 * 8] = a.m256i_u32[2]; \ output[7 + 2 * 8] = a.m_V0.m256i_u32[2]; \
output[7 + 3 * 8] = a.m256i_u32[3]; \ output[7 + 3 * 8] = a.m_V0.m256i_u32[3]; \
output[7 + 4 * 8] = a.m256i_u32[4]; \ output[7 + 4 * 8] = a.m_V0.m256i_u32[4]; \
output[7 + 5 * 8] = a.m256i_u32[5]; \ output[7 + 5 * 8] = a.m_V0.m256i_u32[5]; \
output[7 + 6 * 8] = a.m256i_u32[6]; \ output[7 + 6 * 8] = a.m_V0.m256i_u32[6]; \
output[7 + 7 * 8] = a.m256i_u32[7]; output[7 + 7 * 8] = a.m_V0.m256i_u32[7]; \
output[7 + 8 * 8] = a.m_V1.m256i_u32[0]; \
output[7 + 9 * 8] = a.m_V1.m256i_u32[1]; \
output[7 + 10 * 8] = a.m_V1.m256i_u32[2]; \
output[7 + 11 * 8] = a.m_V1.m256i_u32[3]; \
output[7 + 12 * 8] = a.m_V1.m256i_u32[4]; \
output[7 + 13 * 8] = a.m_V1.m256i_u32[5]; \
output[7 + 14 * 8] = a.m_V1.m256i_u32[6]; \
output[7 + 15 * 8] = a.m_V1.m256i_u32[7];
#elif SIMD #elif SIMD

Loading…
Cancel
Save