Microoptimization: one part of MD5 is enough for search

unmanaged
Inga 🏳‍🌈 8 years ago
parent 4702fba26b
commit bcd6a1d053
  1. 8
      README.md
  2. 16
      dotnet/WhiteRabbit.UnmanagedBridge/WhiteRabbit.UnmanagedBridge.cpp
  3. 7
      dotnet/WhiteRabbit.UnmanagedBridge/md5.cpp
  4. 20
      dotnet/WhiteRabbit/MD5Digest.cs
  5. 31
      dotnet/WhiteRabbit/Program.cs

@ -43,13 +43,13 @@ Anagrams generation is not parallelized, as even single-threaded performance for
Multi-threaded performance with RyuJIT (.NET 4.6, 64-bit system) on quad-core Sandy Bridge @2.8GHz is as follows (excluding initialization time of 0.2 seconds):
* If only phrases of at most 4 words are allowed, then it takes **1.1 seconds** to find and check all 7,433,016 anagrams; **all hashes are solved in first 0.2 seconds**.
* If only phrases of at most 4 words are allowed, then it takes **0.9 seconds** to find and check all 7,433,016 anagrams; **all hashes are solved in first 0.15 seconds**.
* If phrases of 5 words are allowed as well, then it takes 2:45 minutes to find and check all 1,348,876,896 anagrams; all hashes are solved in first 4 seconds.
* If phrases of 5 words are allowed as well, then it takes around 100 seconds to find and check all 1,348,876,896 anagrams; all hashes are solved in first 2.5 seconds.
* If phrases of 6 words are allowed as well, then it takes less than 2 hours to find and check all 58,837,302,096 anagrams; "more difficult" hash is solved in 3.5 seconds, "easiest" in 21 seconds, and "hard" in 54 seconds.
* If phrases of 6 words are allowed as well, then it takes around 75 minutes to find and check all 58,837,302,096 anagrams; "more difficult" hash is solved in 2.5 seconds, "easiest" in 14 seconds, and "hard" in 35 seconds.
* If phrases of 7 words are allowed as well, then it takes 75 seconds to count all 1,108,328,708,976 anagrams, and around 40 hours to find and check all these anagrams; "more difficult" hash is solved in 20 seconds, "easiest" in less than 2.5 minutes, and "hard" in 6:45 minutes.
* If phrases of 7 words are allowed as well, then it takes 75 seconds to count all 1,108,328,708,976 anagrams, and around 40 hours (speculatively) to find and check all these anagrams; "more difficult" hash is solved in 13 seconds, "easiest" in 1.5 minutes, and "hard" in 4.5 minutes.
Note that all measurements were done on a Release build; a Debug build is significantly slower.

@ -7,12 +7,12 @@
// Bridge entry point: calls md5() on eight consecutive 8-word slices of `input`
// (slice k starts at input + k * 8) and stores the results through `output`.
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were lost, so the
// body appears to contain both revisions of the eight calls. Confirm against the repository
// which set is current before treating this as compilable code.
void WhiteRabbitUnmanagedBridge::MD5Unmanaged::ComputeMD5(unsigned int * input, unsigned int* output)
{
// Presumably the pre-change calls: result k is written starting at output + k * 4,
// i.e. each input gets its own 4-word output slot.
md5(input + 0 * 8, output + 0 * 4);
md5(input + 1 * 8, output + 1 * 4);
md5(input + 2 * 8, output + 2 * 4);
md5(input + 3 * 8, output + 3 * 4);
md5(input + 4 * 8, output + 4 * 4);
md5(input + 5 * 8, output + 5 * 4);
md5(input + 6 * 8, output + 6 * 4);
md5(input + 7 * 8, output + 7 * 4);
// Presumably the post-change calls: result k is written at output + k, one word apart,
// so md5() is expected to store a single word per input here — TODO confirm in md5.cpp.
md5(input + 0 * 8, output + 0);
md5(input + 1 * 8, output + 1);
md5(input + 2 * 8, output + 2);
md5(input + 3 * 8, output + 3);
md5(input + 4 * 8, output + 4);
md5(input + 5 * 8, output + 5);
md5(input + 6 * 8, output + 6);
md5(input + 7 * 8, output + 7);
}

@ -80,7 +80,6 @@ inline MD5Word Step4(MD5Word a, MD5Word b, MD5Word c, MD5Word d, MD5Word k)
void md5(unsigned int * input, unsigned int * output)
{
MD5Word a = 0x67452301;
MD5Word b = 0xefcdab89;
MD5Word c = 0x98badcfe;
@ -150,13 +149,7 @@ void md5(unsigned int * input, unsigned int * output)
c = Step4<15>(c, d, a, b, 0xa3014314, input[6]);
b = Step4<21>(b, c, d, a, 0x4e0811a1);
a = Step4<6>(a, b, c, d, 0xf7537e82, input[4]);
d = Step4<10>(d, a, b, c, 0xbd3af235);
c = Step4<15>(c, d, a, b, 0x2ad7d2bb, input[2]);
b = Step4<21>(b, c, d, a, 0xeb86d391);
output[0] = 0x67452301 + a;
output[1] = 0xefcdab89 + b;
output[2] = 0x98badcfe + c;
output[3] = 0x10325476 + d;
}
#pragma managed

@ -4,30 +4,18 @@ using WhiteRabbitUnmanagedBridge;
namespace WhiteRabbit
{
/**
* Code taken from BouncyCastle and optimized for specific constraints (e.g. input is always larger than 4 bytes and smaller than 52 bytes).
* Further optimization: input could be assumed to be smaller than 27 bytes (original phrase contains 18 letters, so that allows anagrams of 9 words)
* base implementation of MD4 family style digest as outlined in
* "Handbook of Applied Cryptography", pages 344 - 347.
* implementation of MD5 as outlined in "Handbook of Applied Cryptography", pages 346 - 347.
*/
internal static class MD5Digest
{
// NOTE(review): this listing looks like a rendered diff without +/- markers. Two revisions of
// Compute are interleaved below (two signatures, two `result` declarations), so it does not
// compile as shown. Confirm against the repository which lines are current.

// Returns only the first component of each MD5 hash (applies to the uint[] variant below).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
// Presumably the earlier signature: one Vector<uint> per phrase.
public static unsafe Vector<uint>[] Compute(PhraseSet input)
// Presumably the current signature: one uint per phrase.
public static unsafe uint[] Compute(PhraseSet input)
{
// Earlier variant: flat buffer with 4 uints reserved per phrase.
var rawResult = new uint[4 * Constants.PhrasesPerSet];
fixed (uint* resultPointer = rawResult)
// Current variant: one uint per phrase; the array is pinned so the unmanaged code can write into it.
var result = new uint[Constants.PhrasesPerSet];
fixed (uint* resultPointer = result)
{
// Hands the phrase set's buffer and the pinned output pointer to the unmanaged MD5 bridge.
MD5Unmanaged.ComputeMD5(input.Buffer, resultPointer);
}
// Earlier variant only: repack the flat buffer into one Vector<uint> per phrase,
// reading from offset 4 * i of rawResult.
var result = new Vector<uint>[Constants.PhrasesPerSet];
for (var i = 0; i < Constants.PhrasesPerSet; i++)
{
result[i] = new Vector<uint>(rawResult, 4 * i);
}
return result;
}
}

@ -7,6 +7,7 @@
using System.Diagnostics;
using System.Linq;
using System.Numerics;
using System.Security.Cryptography;
using System.Text;
/// <summary>
@ -49,6 +50,8 @@
.Select(hash => new Vector<uint>(HexadecimalStringToUnsignedIntArray(hash)))
.ToArray();
var expectedHashesFirstComponents = expectedHashesAsVectors.Select(vector => vector[0]).ToArray();
var processor = new StringsProcessor(
Encoding.ASCII.GetBytes(sourcePhrase),
maxWordsInPhrase,
@ -64,19 +67,19 @@
stopwatch.Restart();
processor.GeneratePhrases()
.ForAll(phraseBytes =>
.ForAll(phraseSet =>
{
var hashVectors = MD5Digest.Compute(phraseBytes);
var hashesFirstComponents = MD5Digest.Compute(phraseSet);
for (var i = 0; i < Constants.PhrasesPerSet; i++)
{
Debug.Assert(
sourceChars == ToOrderedChars(ToString(phraseBytes, i)),
$"StringsProcessor produced incorrect anagram: {ToString(phraseBytes, i)}");
sourceChars == ToOrderedChars(ToString(phraseSet, i)),
$"StringsProcessor produced incorrect anagram: {ToString(phraseSet, i)}");
if (Array.IndexOf(expectedHashesAsVectors, hashVectors[i]) >= 0)
if (Array.IndexOf(expectedHashesFirstComponents, hashesFirstComponents[i]) >= 0)
{
var phrase = ToString(phraseBytes, i);
var hash = VectorToHexadecimalString(hashVectors[i]);
var phrase = ToString(phraseSet, i);
var hash = ComputeFullMD5(phrase);
Console.WriteLine($"Found phrase for {hash}: {phrase}; time from start is {stopwatch.Elapsed}");
}
}
@ -96,13 +99,15 @@
.ToArray();
}
// NOTE(review): this listing looks like a rendered diff without +/- markers. The next line and the
// first four body lines (which use `hash` and ChangeEndianness) appear to belong to the earlier
// VectorToHexadecimalString helper; the rest is ComputeFullMD5. Confirm against the repository.
private static string VectorToHexadecimalString(Vector<uint> hash)
// We can afford to spend some time here; this code will only run for matched phrases (and for one in several billion non-matched)
// Computes the MD5 hash of `phrase` (encoded as ASCII) and returns it as a hexadecimal string.
private static string ComputeFullMD5(string phrase)
{
// Earlier helper body: format each of the four uint components as 8 hex digits,
// pass each through ChangeEndianness, and concatenate.
var components = Enumerable.Range(0, 4)
.Select(i => hash[i].ToString("x8"))
.Select(ChangeEndianness);
return string.Concat(components);
// ComputeFullMD5 body: hash the ASCII bytes with the framework MD5 implementation.
var phraseBytes = Encoding.ASCII.GetBytes(phrase);
// The provider is disposed as soon as the hash has been formatted.
using (var hashAlgorithm = new MD5CryptoServiceProvider())
{
var resultBytes = hashAlgorithm.ComputeHash(phraseBytes);
// "x2" formats each byte as two lowercase hex digits.
return string.Concat(resultBytes.Select(b => b.ToString("x2")));
}
}
private static string ChangeEndianness(string hex)

Loading…
Cancel
Save