From 5584ea843d93749c6af15fa3278fd42d477a703a Mon Sep 17 00:00:00 2001 From: inga-lovinde <52715130+inga-lovinde@users.noreply.github.com> Date: Wed, 5 Apr 2017 15:25:15 +0300 Subject: [PATCH] Performance fixes --- .../WhiteRabbit.UnmanagedBridge.vcxproj | 1 + dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp | 43 +++++++++++-------- dotnet/WhiteRabbit/MD5Digest.cs | 9 ++-- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.vcxproj b/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.vcxproj index a24e9df..c870702 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.vcxproj +++ b/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.vcxproj @@ -126,6 +126,7 @@ true Speed AssemblyAndSourceCode + StreamingSIMDExtensions2 diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp b/dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp index 2fe4c9b..c33b6bd 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp +++ b/dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp @@ -18,17 +18,21 @@ typedef __m256i MD5Vector; #define OP_BLEND(a, b, x) OP_OR(OP_AND(x, b), OP_ANDNOT(x, a)) #define CREATE_VECTOR(a) _mm256_set1_epi32(a) -#define CREATE_VECTOR_FROM_INPUT(input, offset) _mm256_set1_epi32(input[offset]) +#define CREATE_VECTOR_FROM_INPUT(input, offset) _mm256_set_epi32( \ + input[offset + 0 * 8], \ + input[offset + 1 * 8], \ + input[offset + 2 * 8], \ + input[offset + 3 * 8], \ + input[offset + 4 * 8], \ + input[offset + 5 * 8], \ + input[offset + 6 * 8], \ + input[offset + 7 * 8]) #define WRITE_TO_OUTPUT(a, output) \ - output[0] = a.m256i_u32[0]; \ - output[1] = a.m256i_u32[1]; \ - output[2] = a.m256i_u32[2]; \ - output[3] = a.m256i_u32[3]; \ - output[4] = a.m256i_u32[4]; \ - output[5] = a.m256i_u32[5]; \ - output[6] = a.m256i_u32[6]; \ - output[7] = a.m256i_u32[7]; + ((unsigned __int64*)output)[0] = a.m256i_u64[0]; \ + ((unsigned __int64*)output)[1] = a.m256i_u64[1]; \ + ((unsigned __int64*)output)[2] = a.m256i_u64[2]; \ + ((unsigned __int64*)output)[3] = a.m256i_u64[3]; #elif SIMD @@ -41,6 +45,7 @@ typedef __m128i MD5Vector; #define OP_ADD(a, b) _mm_add_epi32(a, b) #define OP_ROT(a, r) OP_OR(_mm_slli_epi32(a, r), _mm_srli_epi32(a, 32 - (r))) #define OP_BLEND(a, b, x) OP_OR(OP_AND(x, b), OP_ANDNOT(x, a)) +//#define OP_BLEND(a, b, x) OP_XOR(a, OP_AND(x, OP_XOR(b, a))) #define CREATE_VECTOR(a) _mm_set1_epi32(a) #define CREATE_VECTOR_FROM_INPUT(input, offset) _mm_set_epi32( \ @@ -158,19 +163,21 @@ static const MD5Parameters Parameters = { #define Xor(a, b, c) OP_XOR(a, OP_XOR(b, c)) #define I(a, b, c) OP_XOR(a, OP_OR(b, OP_NEG(c))) -#define LeftRotate(r, x) OP_ROT(x, r) +#define StepOuter(r, a, b, x) \ + a = x; \ + a = OP_ADD(b, OP_ROT(a, r)); -#define Step1(r, a, b, c, d, k, w) OP_ADD(b, LeftRotate(r, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w))))) -#define Step1E(r, a, b, c, d, k) OP_ADD(b, LeftRotate(r, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), a)))) +#define Step1(r, a, b, c, d, k, w) StepOuter(r, a, b, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w)))) +#define Step1E(r, a, b, c, d, k) StepOuter(r, a, b, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), a))) -#define Step2(r, a, b, c, d, k, w) OP_ADD(c, LeftRotate(r, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w))))) -#define Step2E(r, a, b, c, d, k) OP_ADD(c, LeftRotate(r, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), a)))) +#define Step2(r, a, b, c, d, k, w) StepOuter(r, a, c, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w)))) +#define Step2E(r, a, b, c, d, k) StepOuter(r, a, c, OP_ADD(Blend(d, c, b), OP_ADD(CREATE_VECTOR(k), a))) -#define Step3(r, a, b, c, d, k, w) OP_ADD(b, LeftRotate(r, OP_ADD(Xor(b, c, d), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w))))) -#define Step3E(r, a, b, c, d, k) OP_ADD(b, LeftRotate(r, OP_ADD(Xor(b, c, d), OP_ADD(CREATE_VECTOR(k), a)))) +#define Step3(r, a, b, c, d, k, w) StepOuter(r, a, b, OP_ADD(Xor(b, c, d), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w)))) +#define Step3E(r, a, b, c, d, k) StepOuter(r, a, b, OP_ADD(Xor(b, c, d), OP_ADD(CREATE_VECTOR(k), a))) -#define Step4(r, a, b, c, d, k, w) OP_ADD(b, LeftRotate(r, OP_ADD(I(c, b, d), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w))))) -#define Step4E(r, a, b, c, d, k) OP_ADD(b, LeftRotate(r, OP_ADD(I(c, b, d), OP_ADD(CREATE_VECTOR(k), a)))) +#define Step4(r, a, b, c, d, k, w) StepOuter(r, a, b, OP_ADD(I(c, b, d), OP_ADD(CREATE_VECTOR(k), OP_ADD(a, w)))) +#define Step4E(r, a, b, c, d, k) StepOuter(r, a, b, OP_ADD(I(c, b, d), OP_ADD(CREATE_VECTOR(k), a))) void md5(unsigned __int32 * input, unsigned __int32 * output) { diff --git a/dotnet/WhiteRabbit/MD5Digest.cs b/dotnet/WhiteRabbit/MD5Digest.cs index 2ff6e02..154210d 100644 --- a/dotnet/WhiteRabbit/MD5Digest.cs +++ b/dotnet/WhiteRabbit/MD5Digest.cs @@ -1,9 +1,8 @@ -using System.Numerics; -using System.Runtime.CompilerServices; -using WhiteRabbitUnmanagedBridge; - -namespace WhiteRabbit +namespace WhiteRabbit { + using System.Runtime.CompilerServices; + using WhiteRabbitUnmanagedBridge; + internal static class MD5Digest { // It only returns first component of MD5 hash