From e8544bbd71a4dffda915efbfd45dee014afc92c6 Mon Sep 17 00:00:00 2001 From: inga-lovinde <52715130+inga-lovinde@users.noreply.github.com> Date: Thu, 27 Apr 2017 12:49:32 +0300 Subject: [PATCH] AVX2 optimizations, loop unrolling --- .../WhiteRabbit.UnmanagedBridge.cpp | 4 +- .../WhiteRabbit.UnmanagedBridge/constants.h | 2 +- .../WhiteRabbit.UnmanagedBridge/phraseset.cpp | 107 +++++------------- .../WhiteRabbit.UnmanagedBridge/phraseset.h | 2 +- 4 files changed, 30 insertions(+), 85 deletions(-) diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp b/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp index b349270..63a43c3 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp +++ b/dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp @@ -10,7 +10,7 @@ void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned __int32 * inp { #if AVX2 md5(input + 0 * 8 * 8, output + 0 * 8); - md5(input + 1 * 8 * 8, output + 1 * 8); + //md5(input + 1 * 8 * 8, output + 1 * 8); #elif SIMD md5(input + 0 * 8 * 4, output + 0 * 4); md5(input + 1 * 8 * 4, output + 1 * 4); @@ -29,5 +29,5 @@ void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned __int32 * inp void WhiteRabbitUnmanagedBridge::MD5Unmanaged::FillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords) { - fillPhraseSet(bufferPointer, allWordsPointer, wordIndexes, permutationsPointer, permutationOffset, numberOfCharacters, numberOfWords); + fillPhraseSet(bufferPointer, (unsigned __int64*)allWordsPointer, wordIndexes, permutationsPointer, permutationOffset, numberOfCharacters, numberOfWords); } diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/constants.h b/dotnet/WhiteRabbit.UnmanagedBridge/constants.h index 12c14e7..92d60e0 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/constants.h +++ b/dotnet/WhiteRabbit.UnmanagedBridge/constants.h @@ -1,3 +1,3 @@ #pragma once -#define PHRASES_PER_SET 16 +#define PHRASES_PER_SET 8 diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.cpp b/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.cpp index d994d3c..44a6d26 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.cpp +++ b/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.cpp @@ -6,105 +6,50 @@ #pragma unmanaged -void fillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords) +#define REPEAT(macro) \ + macro(0); \ + macro(1); \ + macro(2); \ + macro(3); \ + macro(4); \ + macro(5); \ + macro(6); \ + macro(7); + + +void fillPhraseSet(__int64* bufferPointer, unsigned __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords) { unsigned __int64 permutations[PHRASES_PER_SET]; - __int32 cumulativeWordOffsets[PHRASES_PER_SET]; - __int32 permutationOffsetInBytes = permutationOffset * sizeof(*permutations); + unsigned __int64 cumulativeWordOffsets = 0; + + auto avx2buffer = (__m256i*)bufferPointer; #define INIT_DATA(phraseNumber) \ - permutations[phraseNumber] = *(unsigned __int64*)(((char*)permutationsPointer) + permutationOffsetInBytes + phraseNumber * sizeof(*permutations)); \ - cumulativeWordOffsets[phraseNumber] = 0; + permutations[phraseNumber] = permutationsPointer[permutationOffset + phraseNumber]; \ - INIT_DATA(0); - INIT_DATA(1); - INIT_DATA(2); - INIT_DATA(3); - INIT_DATA(4); - INIT_DATA(5); - INIT_DATA(6); - INIT_DATA(7); - INIT_DATA(8); - INIT_DATA(9); - INIT_DATA(10); - INIT_DATA(11); - INIT_DATA(12); - INIT_DATA(13); - INIT_DATA(14); - INIT_DATA(15); + REPEAT(INIT_DATA); #define PROCESS_WORD(phraseNumber) \ { \ - auto currentWord = allWordsPointer + wordIndexes[permutations[phraseNumber] & 15] * 128; \ - permutations[phraseNumber] = permutations[phraseNumber] >> 4; \ - bufferPointer[phraseNumber * 4 + 0] |= currentWord[cumulativeWordOffsets[phraseNumber] + 0]; \ - bufferPointer[phraseNumber * 4 + 1] |= currentWord[cumulativeWordOffsets[phraseNumber] + 1]; \ - bufferPointer[phraseNumber * 4 + 2] |= currentWord[cumulativeWordOffsets[phraseNumber] + 2]; \ - bufferPointer[phraseNumber * 4 + 3] |= currentWord[cumulativeWordOffsets[phraseNumber] + 3]; \ - cumulativeWordOffsets[phraseNumber] += (__int32)currentWord[127]; \ + auto currentWord = allWordsPointer + wordIndexes[permutations[phraseNumber] % 16] * 128; \ + permutations[phraseNumber] >>= 4; \ + avx2buffer[phraseNumber] = _mm256_or_si256(avx2buffer[phraseNumber], *(__m256i*)(currentWord + ((cumulativeWordOffsets >> (8 * (phraseNumber % 8))) % 256))); \ + cumulativeWordOffsets += (((unsigned __int64*)currentWord)[127]) << (8 * (phraseNumber % 8)); \ } for (auto j = 0; j < numberOfWords; j++) { - PROCESS_WORD(0); - PROCESS_WORD(1); - PROCESS_WORD(2); - PROCESS_WORD(3); - PROCESS_WORD(4); - PROCESS_WORD(5); - PROCESS_WORD(6); - PROCESS_WORD(7); - PROCESS_WORD(8); - PROCESS_WORD(9); - PROCESS_WORD(10); - PROCESS_WORD(11); - PROCESS_WORD(12); - PROCESS_WORD(13); - PROCESS_WORD(14); - PROCESS_WORD(15); + REPEAT(PROCESS_WORD); } auto length = numberOfCharacters + numberOfWords - 1; auto lengthInBits = (unsigned __int32)(length << 3); -#define FILL_PHRASE_LAST_BYTE(phraseNumber, byteBuffer) ((unsigned char*)bufferPointer)[length + phraseNumber * 32] = 128; -#define FILL_PHRASE_SET_LENGTH(phraseNumber, uintBuffer, lengthInBits) ((unsigned __int32*)bufferPointer)[7 + phraseNumber * 8] = lengthInBits; - - FILL_PHRASE_LAST_BYTE(0, byteBuffer); - FILL_PHRASE_LAST_BYTE(1, byteBuffer); - FILL_PHRASE_LAST_BYTE(2, byteBuffer); - FILL_PHRASE_LAST_BYTE(3, byteBuffer); - FILL_PHRASE_SET_LENGTH(0, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(1, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(2, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(3, uintBuffer, lengthInBits); - - FILL_PHRASE_LAST_BYTE(4, byteBuffer); - FILL_PHRASE_LAST_BYTE(5, byteBuffer); - FILL_PHRASE_LAST_BYTE(6, byteBuffer); - FILL_PHRASE_LAST_BYTE(7, byteBuffer); - FILL_PHRASE_SET_LENGTH(4, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(5, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(6, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(7, uintBuffer, lengthInBits); - - FILL_PHRASE_LAST_BYTE(8, byteBuffer); - FILL_PHRASE_LAST_BYTE(9, byteBuffer); - FILL_PHRASE_LAST_BYTE(10, byteBuffer); - FILL_PHRASE_LAST_BYTE(11, byteBuffer); - FILL_PHRASE_SET_LENGTH(8, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(9, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(10, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(11, uintBuffer, lengthInBits); +#define FILL_PHRASE_LAST_BYTE(phraseNumber) ((unsigned char*)bufferPointer)[length + phraseNumber * 32] = 128; +#define FILL_PHRASE_SET_LENGTH(phraseNumber) ((unsigned __int32*)bufferPointer)[7 + phraseNumber * 8] = lengthInBits; - FILL_PHRASE_LAST_BYTE(12, byteBuffer); - FILL_PHRASE_LAST_BYTE(13, byteBuffer); - FILL_PHRASE_LAST_BYTE(14, byteBuffer); - FILL_PHRASE_LAST_BYTE(15, byteBuffer); - FILL_PHRASE_SET_LENGTH(12, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(13, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(14, uintBuffer, lengthInBits); - FILL_PHRASE_SET_LENGTH(15, uintBuffer, lengthInBits); + REPEAT(FILL_PHRASE_LAST_BYTE); + REPEAT(FILL_PHRASE_SET_LENGTH); } #pragma managed diff --git a/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.h b/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.h index 858c59a..8417b2b 100644 --- a/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.h +++ b/dotnet/WhiteRabbit.UnmanagedBridge/phraseset.h @@ -1,3 +1,3 @@ #pragma once -void fillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords); +void fillPhraseSet(__int64* bufferPointer, unsigned __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords);