Further unsafe optimizations

performance-simd-rust
Inga 🏳‍🌈 8 years ago
parent e5c1e743bc
commit 3429ad83cf
  1. 11
      README.md
  2. 110
      WhiteRabbit/MD5Digest.cs
  3. 54
      WhiteRabbit/Phrase.cs
  4. 15
      WhiteRabbit/Program.cs
  5. 21
      WhiteRabbit/StringsProcessor.cs
  6. 2
      WhiteRabbit/WhiteRabbit.csproj

@ -29,6 +29,8 @@ Usage info
WhiteRabbit.exe < wordlist
```
**Note that this code only works correctly on little-endian x64 systems, due to heavy optimizations of MD5 computation!**
Performance
===========
@ -43,12 +45,11 @@ Multi-threaded performance with RyuJIT (.NET 4.6, 64-bit system) on quad-core Sa
* If only phrases of at most 4 words are allowed, then it takes **less than 2 seconds** to find and check all 7433016 anagrams; all hashes are solved in first 0.3 seconds.
* If phrases of 5 words are allowed as well, then it takes around 5 minutes to find and check all 1348876896 anagrams; all hashes are solved in first 7.5 seconds.
Most of the time is spent on MD5 computations for correct anagrams, so there is not a lot to optimize further.
* If phrases of 5 words are allowed as well, then it takes around 4 minutes to find and check all 1348876896 anagrams; all hashes are solved in first 6.5 seconds.
* If phrases of 6 words are allowed as well, then "more difficult" hash is solved in 8 seconds, "easiest" in 48 seconds, and "hard" in less than 2 minutes.
* If phrases of 6 words are allowed as well, then "more difficult" hash is solved in 6 seconds, "easiest" in 38 seconds, and "hard" in 1.5 minutes.
* If phrases of 7 words are allowed as well, then "more difficult" hash is solved in 46 seconds, "easiest" in less than 6 minutes, and "hard" in around 15 minutes.
* If phrases of 7 words are allowed as well, then "more difficult" hash is solved in 39 seconds, "easiest" in less than 5 minutes, and "hard" in 13 minutes.
Note that all measurements were done on a Release build; Debug build is significantly slower.
@ -57,8 +58,6 @@ For comparison, certain other solutions available on GitHub seem to require 3 ho
Conditional compilation symbols
===============================
* Define `BIG_ENDIAN` if you plan to run this on big-endian PC; this will disable certain MD5 optimizations that only produce correct result on little-endian PCs.
* Define `SINGLE_THREADED` to use standard enumerables instead of ParallelEnumerable.
* Define `DEBUG`, or build in debug mode, to get the total number of anagrams (not optimized, memory-hogging).

@ -1,6 +1,8 @@
namespace WhiteRabbit
{
using System;
using System.Linq;
using System.Reflection;
/**
* Code taken from BouncyCastle and optimized for specific constraints (e.g. input is always larger than 4 bytes and smaller than 52 bytes).
@ -11,45 +13,29 @@
*/
internal static class MD5Digest
{
public static uint[] Compute(byte[] input)
public static unsafe uint[] Compute(Phrase input)
{
var length = input.Length;
var xUints = new uint[8]; // it seems that alignment helps
#if BIG_ENDIAN
xUints[0] = LE_To_UInt32(xBytes, 4 * 0);
xUints[1] = LE_To_UInt32(xBytes, 4 * 1);
xUints[2] = LE_To_UInt32(xBytes, 4 * 2);
xUints[3] = LE_To_UInt32(xBytes, 4 * 3);
xUints[4] = LE_To_UInt32(xBytes, 4 * 4);
xUints[5] = LE_To_UInt32(xBytes, 4 * 5);
xUints[6] = LE_To_UInt32(xBytes, 4 * 6);
#else
Buffer.BlockCopy(input, 0, xUints, 0, length);
#endif
xUints[length >> 2] |= (uint)128 << (8 * (length & 3));
var x0 = xUints[0];
var x1 = xUints[1];
var x2 = xUints[2];
var x3 = xUints[3];
var x4 = xUints[4];
var x5 = xUints[5];
var x6 = xUints[6];
var x14 = (uint)(length << 3);
var xUints = stackalloc uint[8]; // it seems that alignment helps
*(long*)xUints = *(long*)input.Buffer;
*(long*)(xUints + 2) = *(long*)(input.Buffer + 8);
*(long*)(xUints + 4) = *(long*)(input.Buffer + 16);
*(long*)(xUints + 6) = *(long*)(input.Buffer + 24);
((byte*)xUints)[31] = 0;
((byte*)xUints)[input.Buffer[31]] = 128;
xUints[7] = (uint)(input.Buffer[31] << 3);
uint a = 0x67452301;
uint b = 0xefcdab89;
uint c = 0x98badcfe;
uint d = 0x10325476;
a = LeftRotate(x0 + 0xd76aa478 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(x1 + 0xe8c7b756 + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
c = LeftRotate(x2 + 0x242070db + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
b = LeftRotate(x3 + 0xc1bdceee + b + ((c & d) | (~c & a)), 22, 32 - 22) + c;
a = LeftRotate(x4 + 0xf57c0faf + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(x5 + 0x4787c62a + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
c = LeftRotate(x6 + 0xa8304613 + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
a = LeftRotate(xUints[0] + 0xd76aa478 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(xUints[1] + 0xe8c7b756 + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
c = LeftRotate(xUints[2] + 0x242070db + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
b = LeftRotate(xUints[3] + 0xc1bdceee + b + ((c & d) | (~c & a)), 22, 32 - 22) + c;
a = LeftRotate(xUints[4] + 0xf57c0faf + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(xUints[5] + 0x4787c62a + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
c = LeftRotate(xUints[6] + 0xa8304613 + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
b = LeftRotate(0xfd469501 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c;
a = LeftRotate(0x698098d8 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(0x8b44f7af + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
@ -57,58 +43,58 @@
b = LeftRotate(0x895cd7be + b + ((c & d) | (~c & a)), 22, 32 - 22) + c;
a = LeftRotate(0x6b901122 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b;
d = LeftRotate(0xfd987193 + d + ((a & b) | (~a & c)), 12, 32 - 12) + a;
c = LeftRotate(x14 + 0xa679438e + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
c = LeftRotate(xUints[7] + 0xa679438e + c + ((d & a) | (~d & b)), 17, 32 - 17) + d;
b = LeftRotate(0x49b40821 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c;
a = LeftRotate(x1 + 0xf61e2562 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
d = LeftRotate(x6 + 0xc040b340 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
a = LeftRotate(xUints[1] + 0xf61e2562 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
d = LeftRotate(xUints[6] + 0xc040b340 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
c = LeftRotate(0x265e5a51 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d;
b = LeftRotate(x0 + 0xe9b6c7aa + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
a = LeftRotate(x5 + 0xd62f105d + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
b = LeftRotate(xUints[0] + 0xe9b6c7aa + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
a = LeftRotate(xUints[5] + 0xd62f105d + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
d = LeftRotate(0x2441453 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
c = LeftRotate(0xd8a1e681 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d;
b = LeftRotate(x4 + 0xe7d3fbc8 + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
b = LeftRotate(xUints[4] + 0xe7d3fbc8 + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
a = LeftRotate(0x21e1cde6 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
d = LeftRotate(x14 + 0xc33707d6 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
c = LeftRotate(x3 + 0xf4d50d87 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d;
d = LeftRotate(xUints[7] + 0xc33707d6 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
c = LeftRotate(xUints[3] + 0xf4d50d87 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d;
b = LeftRotate(0x455a14ed + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
a = LeftRotate(0xa9e3e905 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b;
d = LeftRotate(x2 + 0xfcefa3f8 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
d = LeftRotate(xUints[2] + 0xfcefa3f8 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a;
c = LeftRotate(0x676f02d9 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d;
b = LeftRotate(0x8d2a4c8a + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c;
a = LeftRotate(x5 + 0xfffa3942 + a + (b ^ c ^ d), 4, 32 - 4) + b;
a = LeftRotate(xUints[5] + 0xfffa3942 + a + (b ^ c ^ d), 4, 32 - 4) + b;
d = LeftRotate(0x8771f681 + d + (a ^ b ^ c), 11, 32 - 11) + a;
c = LeftRotate(0x6d9d6122 + c + (d ^ a ^ b), 16, 32 - 16) + d;
b = LeftRotate(x14 + 0xfde5380c + b + (c ^ d ^ a), 23, 32 - 23) + c;
a = LeftRotate(x1 + 0xa4beea44 + a + (b ^ c ^ d), 4, 32 - 4) + b;
d = LeftRotate(x4 + 0x4bdecfa9 + d + (a ^ b ^ c), 11, 32 - 11) + a;
b = LeftRotate(xUints[7] + 0xfde5380c + b + (c ^ d ^ a), 23, 32 - 23) + c;
a = LeftRotate(xUints[1] + 0xa4beea44 + a + (b ^ c ^ d), 4, 32 - 4) + b;
d = LeftRotate(xUints[4] + 0x4bdecfa9 + d + (a ^ b ^ c), 11, 32 - 11) + a;
c = LeftRotate(0xf6bb4b60 + c + (d ^ a ^ b), 16, 32 - 16) + d;
b = LeftRotate(0xbebfbc70 + b + (c ^ d ^ a), 23, 32 - 23) + c;
a = LeftRotate(0x289b7ec6 + a + (b ^ c ^ d), 4, 32 - 4) + b;
d = LeftRotate(x0 + 0xeaa127fa + d + (a ^ b ^ c), 11, 32 - 11) + a;
c = LeftRotate(x3 + 0xd4ef3085 + c + (d ^ a ^ b), 16, 32 - 16) + d;
b = LeftRotate(x6 + 0x4881d05 + b + (c ^ d ^ a), 23, 32 - 23) + c;
d = LeftRotate(xUints[0] + 0xeaa127fa + d + (a ^ b ^ c), 11, 32 - 11) + a;
c = LeftRotate(xUints[3] + 0xd4ef3085 + c + (d ^ a ^ b), 16, 32 - 16) + d;
b = LeftRotate(xUints[6] + 0x4881d05 + b + (c ^ d ^ a), 23, 32 - 23) + c;
a = LeftRotate(0xd9d4d039 + a + (b ^ c ^ d), 4, 32 - 4) + b;
d = LeftRotate(0xe6db99e5 + d + (a ^ b ^ c), 11, 32 - 11) + a;
c = LeftRotate(0x1fa27cf8 + c + (d ^ a ^ b), 16, 32 - 16) + d;
b = LeftRotate(x2 + 0xc4ac5665 + b + (c ^ d ^ a), 23, 32 - 23) + c;
b = LeftRotate(xUints[2] + 0xc4ac5665 + b + (c ^ d ^ a), 23, 32 - 23) + c;
a = LeftRotate(x0 + 0xf4292244 + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
a = LeftRotate(xUints[0] + 0xf4292244 + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
d = LeftRotate(0x432aff97 + d + (b ^ (a | ~c)), 10, 32 - 10) + a;
c = LeftRotate(x14 + 0xab9423a7 + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
b = LeftRotate(x5 + 0xfc93a039 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
c = LeftRotate(xUints[7] + 0xab9423a7 + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
b = LeftRotate(xUints[5] + 0xfc93a039 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
a = LeftRotate(0x655b59c3 + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
d = LeftRotate(x3 + 0x8f0ccc92 + d + (b ^ (a | ~c)), 10, 32 - 10) + a;
d = LeftRotate(xUints[3] + 0x8f0ccc92 + d + (b ^ (a | ~c)), 10, 32 - 10) + a;
c = LeftRotate(0xffeff47d + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
b = LeftRotate(x1 + 0x85845dd1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
b = LeftRotate(xUints[1] + 0x85845dd1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
a = LeftRotate(0x6fa87e4f + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
d = LeftRotate(0xfe2ce6e0 + d + (b ^ (a | ~c)), 10, 32 - 10) + a;
c = LeftRotate(x6 + 0xa3014314 + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
c = LeftRotate(xUints[6] + 0xa3014314 + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
b = LeftRotate(0x4e0811a1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
a = LeftRotate(x4 + 0xf7537e82 + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
a = LeftRotate(xUints[4] + 0xf7537e82 + a + (c ^ (b | ~d)), 6, 32 - 6) + b;
d = LeftRotate(0xbd3af235 + d + (b ^ (a | ~c)), 10, 32 - 10) + a;
c = LeftRotate(x2 + 0x2ad7d2bb + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
c = LeftRotate(xUints[2] + 0x2ad7d2bb + c + (a ^ (d | ~b)), 15, 32 - 15) + d;
b = LeftRotate(0xeb86d391 + b + (d ^ (c | ~a)), 21, 32 - 21) + c;
return new[]
@ -120,15 +106,7 @@
};
}
private static uint LE_To_UInt32(byte[] bs, int off)
{
return (uint)bs[off]
| (uint)bs[off + 1] << 8
| (uint)bs[off + 2] << 16
| (uint)bs[off + 3] << 24;
}
private static uint LeftRotate(uint x, int left, int right)
/// <summary>
/// Combines <paramref name="x"/> shifted left by <paramref name="left"/> bits with
/// <paramref name="x"/> shifted right by <paramref name="right"/> bits.
/// Callers in this class pass <c>right = 32 - left</c>, which makes the result a 32-bit left rotation.
/// </summary>
/// <param name="x">Value to rotate.</param>
/// <param name="left">Number of bits to shift left.</param>
/// <param name="right">Number of bits to shift right.</param>
/// <returns>Bitwise OR of the two shifted values.</returns>
private static uint LeftRotate(uint x, int left, int right)
{
    var highPart = x << left;
    var wrappedPart = x >> right;
    return highPart | wrappedPart;
}

@ -0,0 +1,54 @@
namespace WhiteRabbit
{
    /// <summary>
    /// Fixed-size representation of a phrase: the words, joined by single spaces,
    /// are stored at the start of <see cref="Buffer"/>, and the phrase length in bytes
    /// is stored in the last byte of the buffer (index 31).
    /// </summary>
    internal unsafe struct Phrase
    {
        private const byte SPACE = 32;

        // The last byte of the buffer holds the length, so at most 31 bytes of text fit.
        private const int MaxLength = 31;

        public fixed byte Buffer[32];

        /// <summary>
        /// Builds a phrase from <paramref name="words"/>, separating consecutive words with a space.
        /// </summary>
        /// <param name="words">Words of the phrase, as byte arrays; must contain at least one word.</param>
        /// <param name="numberOfCharacters">Total number of bytes in all words (separators excluded).</param>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// The resulting phrase length is negative or does not fit into the buffer.
        /// </exception>
        public Phrase(byte[][] words, int numberOfCharacters)
        {
            var length = numberOfCharacters + words.Length - 1;

            // The loop below writes through a raw pointer, so an oversized phrase would
            // silently overwrite the length byte or memory past the buffer.
            if (length < 0 || length > MaxLength)
            {
                throw new System.ArgumentOutOfRangeException(
                    nameof(numberOfCharacters),
                    "Phrase length (characters plus separating spaces) must be between 0 and 31 bytes.");
            }

            fixed (byte* bufferPointer = this.Buffer)
            {
                byte* end = bufferPointer + length;
                byte[] currentWord = words[0];
                var j = 0;
                var wordIndex = 0;
                for (var currentPointer = bufferPointer; currentPointer < end; currentPointer++)
                {
                    if (j >= currentWord.Length)
                    {
                        // Current word is exhausted: emit the separator and move on to the next word.
                        *currentPointer = SPACE;
                        j = 0;
                        wordIndex++;
                        currentWord = words[wordIndex];
                    }
                    else
                    {
                        *currentPointer = currentWord[j];
                        j++;
                    }
                }

                bufferPointer[MaxLength] = (byte)length;
            }
        }

        /// <summary>
        /// Copies the phrase text out of the buffer into a new array.
        /// </summary>
        /// <returns>A new array holding the first N bytes of the buffer, where N is the stored length byte.</returns>
        public byte[] GetBytes()
        {
            fixed (byte* bufferPointer = this.Buffer)
            {
                var length = bufferPointer[MaxLength];
                var result = new byte[length];
                for (var i = 0; i < length; i++)
                {
                    result[i] = bufferPointer[i];
                }

                return result;
            }
        }
    }
}

@ -47,19 +47,19 @@
.ForAll(phraseBytes =>
{
Debug.Assert(
sourceChars == ToOrderedChars(Encoding.ASCII.GetString(phraseBytes)),
$"StringsProcessor produced incorrect anagram: {Encoding.ASCII.GetString(phraseBytes)}");
sourceChars == ToOrderedChars(ToString(phraseBytes)),
$"StringsProcessor produced incorrect anagram: {ToString(phraseBytes)}");
var hashVector = ComputeHashVector(phraseBytes);
if (Array.IndexOf(expectedHashesAsVectors, hashVector) >= 0)
{
var phrase = Encoding.ASCII.GetString(phraseBytes);
var phrase = ToString(phraseBytes);
var hash = VectorToHexadecimalString(hashVector);
Console.WriteLine($"Found phrase for {hash}: {phrase}; time from start is {stopwatch.Elapsed}");
}
#if DEBUG
anagramsBag.Add(Encoding.ASCII.GetString(phraseBytes));
anagramsBag.Add(ToString(phraseBytes));
#endif
});
@ -94,7 +94,7 @@
}
// Bouncy Castle is used instead of standard .NET methods for performance reasons
private static Vector<uint> ComputeHashVector(byte[] input)
// Wraps the uint array returned by MD5Digest.Compute for the given phrase in a Vector<uint>.
private static Vector<uint> ComputeHashVector(Phrase input)
{
    var digest = MD5Digest.Compute(input);
    return new Vector<uint>(digest);
}
@ -113,6 +113,11 @@
return hex.Substring(6, 2) + hex.Substring(4, 2) + hex.Substring(2, 2) + hex.Substring(0, 2);
}
// Decodes the bytes returned by Phrase.GetBytes as ASCII text.
private static string ToString(Phrase phrase)
{
    var phraseBytes = phrase.GetBytes();
    return Encoding.ASCII.GetString(phraseBytes);
}
private static IEnumerable<byte[]> ReadInput()
{
string line;

@ -46,7 +46,7 @@
#if SINGLE_THREADED
public IEnumerable<byte[]> GeneratePhrases()
#else
public ParallelQuery<byte[]> GeneratePhrases()
public ParallelQuery<Phrase> GeneratePhrases()
#endif
{
// task of finding anagrams could be reduced to the task of finding sequences of dictionary vectors with the target sum
@ -71,24 +71,9 @@
return words;
}
private byte[] ConvertWordsToPhrase(byte[][] words)
private unsafe Phrase ConvertWordsToPhrase(byte[][] words)
{
var result = new byte[this.NumberOfCharacters + words.Length - 1];
byte[] currentWord = words[0];
Buffer.BlockCopy(currentWord, 0, result, 0, currentWord.Length);
var position = currentWord.Length;
for (var i = 1; i < words.Length; i++)
{
result[position] = SPACE;
position++;
currentWord = words[i];
Buffer.BlockCopy(currentWord, 0, result, position, currentWord.Length);
position += currentWord.Length;
}
return result;
return new Phrase(words, this.NumberOfCharacters);
}
}
}

@ -12,6 +12,7 @@
<TargetFrameworkVersion>v4.6</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
@ -59,6 +60,7 @@
<Compile Include="ByteArrayEqualityComparer.cs" />
<Compile Include="Flattener.cs" />
<Compile Include="MD5Digest.cs" />
<Compile Include="Phrase.cs" />
<Compile Include="PrecomputedPermutationsGenerator.cs" />
<Compile Include="PermutationsGenerator.cs" />
<Compile Include="StringsProcessor.cs" />

Loading…
Cancel
Save