AVX2 optimizations, loop unrolling

master
Inga 🏳‍🌈 8 years ago
parent 7e4c23d467
commit e8544bbd71
  1. 4
      dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp
  2. 2
      dotnet/WhiteRabbit.UnmanagedBridge/constants.h
  3. 107
      dotnet/WhiteRabbit.UnmanagedBridge/phraseset.cpp
  4. 2
      dotnet/WhiteRabbit.UnmanagedBridge/phraseset.h

@ -10,7 +10,7 @@ void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned __int32 * inp
{ {
#if AVX2 #if AVX2
md5(input + 0 * 8 * 8, output + 0 * 8); md5(input + 0 * 8 * 8, output + 0 * 8);
md5(input + 1 * 8 * 8, output + 1 * 8); //md5(input + 1 * 8 * 8, output + 1 * 8);
#elif SIMD #elif SIMD
md5(input + 0 * 8 * 4, output + 0 * 4); md5(input + 0 * 8 * 4, output + 0 * 4);
md5(input + 1 * 8 * 4, output + 1 * 4); md5(input + 1 * 8 * 4, output + 1 * 4);
@ -29,5 +29,5 @@ void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned __int32 * inp
void WhiteRabbitUnmanagedBridge::MD5Unmanaged::FillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords) void WhiteRabbitUnmanagedBridge::MD5Unmanaged::FillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords)
{ {
fillPhraseSet(bufferPointer, allWordsPointer, wordIndexes, permutationsPointer, permutationOffset, numberOfCharacters, numberOfWords); fillPhraseSet(bufferPointer, (unsigned __int64*)allWordsPointer, wordIndexes, permutationsPointer, permutationOffset, numberOfCharacters, numberOfWords);
} }

@ -1,3 +1,3 @@
#pragma once #pragma once
#define PHRASES_PER_SET 16 #define PHRASES_PER_SET 8

@ -6,105 +6,50 @@
#pragma unmanaged #pragma unmanaged
void fillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords) #define REPEAT(macro) \
macro(0); \
macro(1); \
macro(2); \
macro(3); \
macro(4); \
macro(5); \
macro(6); \
macro(7);
void fillPhraseSet(__int64* bufferPointer, unsigned __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords)
{ {
unsigned __int64 permutations[PHRASES_PER_SET]; unsigned __int64 permutations[PHRASES_PER_SET];
__int32 cumulativeWordOffsets[PHRASES_PER_SET]; unsigned __int64 cumulativeWordOffsets = 0;
__int32 permutationOffsetInBytes = permutationOffset * sizeof(*permutations);
auto avx2buffer = (__m256i*)bufferPointer;
#define INIT_DATA(phraseNumber) \ #define INIT_DATA(phraseNumber) \
permutations[phraseNumber] = *(unsigned __int64*)(((char*)permutationsPointer) + permutationOffsetInBytes + phraseNumber * sizeof(*permutations)); \ permutations[phraseNumber] = permutationsPointer[permutationOffset + phraseNumber]; \
cumulativeWordOffsets[phraseNumber] = 0;
INIT_DATA(0); REPEAT(INIT_DATA);
INIT_DATA(1);
INIT_DATA(2);
INIT_DATA(3);
INIT_DATA(4);
INIT_DATA(5);
INIT_DATA(6);
INIT_DATA(7);
INIT_DATA(8);
INIT_DATA(9);
INIT_DATA(10);
INIT_DATA(11);
INIT_DATA(12);
INIT_DATA(13);
INIT_DATA(14);
INIT_DATA(15);
#define PROCESS_WORD(phraseNumber) \ #define PROCESS_WORD(phraseNumber) \
{ \ { \
auto currentWord = allWordsPointer + wordIndexes[permutations[phraseNumber] & 15] * 128; \ auto currentWord = allWordsPointer + wordIndexes[permutations[phraseNumber] % 16] * 128; \
permutations[phraseNumber] = permutations[phraseNumber] >> 4; \ permutations[phraseNumber] >>= 4; \
bufferPointer[phraseNumber * 4 + 0] |= currentWord[cumulativeWordOffsets[phraseNumber] + 0]; \ avx2buffer[phraseNumber] = _mm256_or_si256(avx2buffer[phraseNumber], *(__m256i*)(currentWord + ((cumulativeWordOffsets >> (8 * (phraseNumber % 8))) % 256))); \
bufferPointer[phraseNumber * 4 + 1] |= currentWord[cumulativeWordOffsets[phraseNumber] + 1]; \ cumulativeWordOffsets += (((unsigned __int64*)currentWord)[127]) << (8 * (phraseNumber % 8)); \
bufferPointer[phraseNumber * 4 + 2] |= currentWord[cumulativeWordOffsets[phraseNumber] + 2]; \
bufferPointer[phraseNumber * 4 + 3] |= currentWord[cumulativeWordOffsets[phraseNumber] + 3]; \
cumulativeWordOffsets[phraseNumber] += (__int32)currentWord[127]; \
} }
for (auto j = 0; j < numberOfWords; j++) for (auto j = 0; j < numberOfWords; j++)
{ {
PROCESS_WORD(0); REPEAT(PROCESS_WORD);
PROCESS_WORD(1);
PROCESS_WORD(2);
PROCESS_WORD(3);
PROCESS_WORD(4);
PROCESS_WORD(5);
PROCESS_WORD(6);
PROCESS_WORD(7);
PROCESS_WORD(8);
PROCESS_WORD(9);
PROCESS_WORD(10);
PROCESS_WORD(11);
PROCESS_WORD(12);
PROCESS_WORD(13);
PROCESS_WORD(14);
PROCESS_WORD(15);
} }
auto length = numberOfCharacters + numberOfWords - 1; auto length = numberOfCharacters + numberOfWords - 1;
auto lengthInBits = (unsigned __int32)(length << 3); auto lengthInBits = (unsigned __int32)(length << 3);
#define FILL_PHRASE_LAST_BYTE(phraseNumber, byteBuffer) ((unsigned char*)bufferPointer)[length + phraseNumber * 32] = 128; #define FILL_PHRASE_LAST_BYTE(phraseNumber) ((unsigned char*)bufferPointer)[length + phraseNumber * 32] = 128;
#define FILL_PHRASE_SET_LENGTH(phraseNumber, uintBuffer, lengthInBits) ((unsigned __int32*)bufferPointer)[7 + phraseNumber * 8] = lengthInBits; #define FILL_PHRASE_SET_LENGTH(phraseNumber) ((unsigned __int32*)bufferPointer)[7 + phraseNumber * 8] = lengthInBits;
FILL_PHRASE_LAST_BYTE(0, byteBuffer);
FILL_PHRASE_LAST_BYTE(1, byteBuffer);
FILL_PHRASE_LAST_BYTE(2, byteBuffer);
FILL_PHRASE_LAST_BYTE(3, byteBuffer);
FILL_PHRASE_SET_LENGTH(0, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(1, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(2, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(3, uintBuffer, lengthInBits);
FILL_PHRASE_LAST_BYTE(4, byteBuffer);
FILL_PHRASE_LAST_BYTE(5, byteBuffer);
FILL_PHRASE_LAST_BYTE(6, byteBuffer);
FILL_PHRASE_LAST_BYTE(7, byteBuffer);
FILL_PHRASE_SET_LENGTH(4, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(5, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(6, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(7, uintBuffer, lengthInBits);
FILL_PHRASE_LAST_BYTE(8, byteBuffer);
FILL_PHRASE_LAST_BYTE(9, byteBuffer);
FILL_PHRASE_LAST_BYTE(10, byteBuffer);
FILL_PHRASE_LAST_BYTE(11, byteBuffer);
FILL_PHRASE_SET_LENGTH(8, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(9, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(10, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(11, uintBuffer, lengthInBits);
FILL_PHRASE_LAST_BYTE(12, byteBuffer); REPEAT(FILL_PHRASE_LAST_BYTE);
FILL_PHRASE_LAST_BYTE(13, byteBuffer); REPEAT(FILL_PHRASE_SET_LENGTH);
FILL_PHRASE_LAST_BYTE(14, byteBuffer);
FILL_PHRASE_LAST_BYTE(15, byteBuffer);
FILL_PHRASE_SET_LENGTH(12, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(13, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(14, uintBuffer, lengthInBits);
FILL_PHRASE_SET_LENGTH(15, uintBuffer, lengthInBits);
} }
#pragma managed #pragma managed

@ -1,3 +1,3 @@
#pragma once #pragma once
void fillPhraseSet(__int64* bufferPointer, __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords); void fillPhraseSet(__int64* bufferPointer, unsigned __int64* allWordsPointer, __int32* wordIndexes, unsigned __int64* permutationsPointer, int permutationOffset, int numberOfCharacters, int numberOfWords);

Loading…
Cancel
Save