From e5c1e743bc23999b9c7374cab975441ada4b7e9a Mon Sep 17 00:00:00 2001 From: inga-lovinde <52715130+inga-lovinde@users.noreply.github.com> Date: Mon, 27 Mar 2017 11:44:25 +0300 Subject: [PATCH] Further MD5 optimizations --- README.md | 17 +++++-- WhiteRabbit/MD5Digest.cs | 105 ++++++++++++++++++++------------------- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 88e20d7..97e62f8 100644 --- a/README.md +++ b/README.md @@ -41,19 +41,28 @@ Anagrams generation is not parallelized, as even single-threaded performance for Multi-threaded performance with RyuJIT (.NET 4.6, 64-bit system) on quad-core Sandy Bridge @2.8GHz is as follows: -* If only phrases of at most 4 words are allowed, then it takes **around 2 seconds** to find and check all 7433016 anagrams; all hashes are solved in first 0.4 seconds. +* If only phrases of at most 4 words are allowed, then it takes **less than 2 seconds** to find and check all 7433016 anagrams; all hashes are solved in the first 0.3 seconds. -* If phrases of 5 words are allowed as well, then it takes around 5.5 minutes to find and check all 1348876896 anagrams; all hashes are solved in first 8.5 seconds. +* If phrases of 5 words are allowed as well, then it takes around 5 minutes to find and check all 1348876896 anagrams; all hashes are solved in the first 7.5 seconds. Most of the time is spent on MD5 computations for correct anagrams, so there is not a lot to optimize further. -* If phrases of 6 words are allowed as well, then "more difficult" hash is solved in 9 seconds, "easiest" in 50 seconds, and "hard" in 2 minutes. +* If phrases of 6 words are allowed as well, then the "more difficult" hash is solved in 8 seconds, "easiest" in 48 seconds, and "hard" in less than 2 minutes. -* If phrases of 7 words are allowed as well, then "more difficult" hash is solved in 56 seconds, "easiest" in 7 minutes, and "hard" in 18 minutes. 
+* If phrases of 7 words are allowed as well, then the "more difficult" hash is solved in 46 seconds, "easiest" in less than 6 minutes, and "hard" in around 15 minutes. Note that all measurements were done on a Release build; Debug build is significantly slower. For comparison, certain other solutions available on GitHub seem to require 3 hours to find all 3-word anagrams. This solution is faster by 5-7 orders of magnitude (it finds and checks all 4-word anagrams in 1/2000th fraction of time required for other solution just to find all 3-word anagrams, with no MD5 calculations). +Conditional compilation symbols +=============================== + +* Define `BIG_ENDIAN` if you plan to run this on a big-endian PC; this will disable certain MD5 optimizations that only produce correct results on little-endian PCs. + +* Define `SINGLE_THREADED` to use standard enumerables instead of ParallelEnumerable. + +* Define `DEBUG`, or build in debug mode, to get the total number of anagrams (not optimized, memory-hogging). + Implementation notes ==================== diff --git a/WhiteRabbit/MD5Digest.cs b/WhiteRabbit/MD5Digest.cs index cecf2f6..1dcd407 100644 --- a/WhiteRabbit/MD5Digest.cs +++ b/WhiteRabbit/MD5Digest.cs @@ -4,6 +4,7 @@ /** * Code taken from BouncyCastle and optimized for specific constraints (e.g. input is always larger than 4 bytes and smaller than 52 bytes). + * Further optimization: input can be assumed to be smaller than 27 bytes (the original phrase contains 18 letters, so that allows anagrams of up to 9 words) * base implementation of MD4 family style digest as outlined in * "Handbook of Applied Cryptography", pages 344 - 347. * implementation of MD5 as outlined in "Handbook of Applied Cryptography", pages 346 - 347. 
@@ -14,26 +15,28 @@ { var length = input.Length; - var xBytes = new byte[4 * 14]; - Buffer.BlockCopy(input, 0, xBytes, 0, length); - xBytes[length] = 128; + var xUints = new uint[8]; // it seems that alignment helps +#if BIG_ENDIAN + xUints[0] = LE_To_UInt32(xBytes, 4 * 0); + xUints[1] = LE_To_UInt32(xBytes, 4 * 1); + xUints[2] = LE_To_UInt32(xBytes, 4 * 2); + xUints[3] = LE_To_UInt32(xBytes, 4 * 3); + xUints[4] = LE_To_UInt32(xBytes, 4 * 4); + xUints[5] = LE_To_UInt32(xBytes, 4 * 5); + xUints[6] = LE_To_UInt32(xBytes, 4 * 6); +#else + Buffer.BlockCopy(input, 0, xUints, 0, length); +#endif + xUints[length >> 2] |= (uint)128 << (8 * (length & 3)); - var x0 = LE_To_UInt32(xBytes, 4 * 0); - var x1 = LE_To_UInt32(xBytes, 4 * 1); - var x2 = LE_To_UInt32(xBytes, 4 * 2); - var x3 = LE_To_UInt32(xBytes, 4 * 3); - var x4 = LE_To_UInt32(xBytes, 4 * 4); - var x5 = LE_To_UInt32(xBytes, 4 * 5); - var x6 = LE_To_UInt32(xBytes, 4 * 6); - var x7 = LE_To_UInt32(xBytes, 4 * 7); - var x8 = LE_To_UInt32(xBytes, 4 * 8); - var x9 = LE_To_UInt32(xBytes, 4 * 9); - var x10 = LE_To_UInt32(xBytes, 4 * 10); - var x11 = LE_To_UInt32(xBytes, 4 * 11); - var x12 = LE_To_UInt32(xBytes, 4 * 12); - var x13 = LE_To_UInt32(xBytes, 4 * 13); + var x0 = xUints[0]; + var x1 = xUints[1]; + var x2 = xUints[2]; + var x3 = xUints[3]; + var x4 = xUints[4]; + var x5 = xUints[5]; + var x6 = xUints[6]; var x14 = (uint)(length << 3); - uint x15 = 0; uint a = 0x67452301; uint b = 0xefcdab89; @@ -47,66 +50,66 @@ a = LeftRotate(x4 + 0xf57c0faf + a + ((b & c) | (~b & d)), 7, 32 - 7) + b; d = LeftRotate(x5 + 0x4787c62a + d + ((a & b) | (~a & c)), 12, 32 - 12) + a; c = LeftRotate(x6 + 0xa8304613 + c + ((d & a) | (~d & b)), 17, 32 - 17) + d; - b = LeftRotate(x7 + 0xfd469501 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; - a = LeftRotate(x8 + 0x698098d8 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b; - d = LeftRotate(x9 + 0x8b44f7af + d + ((a & b) | (~a & c)), 12, 32 - 12) + a; - c = LeftRotate(x10 + 0xffff5bb1 + c + ((d & a) 
| (~d & b)), 17, 32 - 17) + d; - b = LeftRotate(x11 + 0x895cd7be + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; - a = LeftRotate(x12 + 0x6b901122 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b; - d = LeftRotate(x13 + 0xfd987193 + d + ((a & b) | (~a & c)), 12, 32 - 12) + a; + b = LeftRotate(0xfd469501 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; + a = LeftRotate(0x698098d8 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b; + d = LeftRotate(0x8b44f7af + d + ((a & b) | (~a & c)), 12, 32 - 12) + a; + c = LeftRotate(0xffff5bb1 + c + ((d & a) | (~d & b)), 17, 32 - 17) + d; + b = LeftRotate(0x895cd7be + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; + a = LeftRotate(0x6b901122 + a + ((b & c) | (~b & d)), 7, 32 - 7) + b; + d = LeftRotate(0xfd987193 + d + ((a & b) | (~a & c)), 12, 32 - 12) + a; c = LeftRotate(x14 + 0xa679438e + c + ((d & a) | (~d & b)), 17, 32 - 17) + d; - b = LeftRotate(x15 + 0x49b40821 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; + b = LeftRotate(0x49b40821 + b + ((c & d) | (~c & a)), 22, 32 - 22) + c; a = LeftRotate(x1 + 0xf61e2562 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; d = LeftRotate(x6 + 0xc040b340 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a; - c = LeftRotate(x11 + 0x265e5a51 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; + c = LeftRotate(0x265e5a51 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; b = LeftRotate(x0 + 0xe9b6c7aa + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; a = LeftRotate(x5 + 0xd62f105d + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; - d = LeftRotate(x10 + 0x2441453 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a; - c = LeftRotate(x15 + 0xd8a1e681 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; + d = LeftRotate(0x2441453 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a; + c = LeftRotate(0xd8a1e681 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; b = LeftRotate(x4 + 0xe7d3fbc8 + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; - a = LeftRotate(x9 + 0x21e1cde6 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; + a = LeftRotate(0x21e1cde6 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; 
d = LeftRotate(x14 + 0xc33707d6 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a; c = LeftRotate(x3 + 0xf4d50d87 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; - b = LeftRotate(x8 + 0x455a14ed + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; - a = LeftRotate(x13 + 0xa9e3e905 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; + b = LeftRotate(0x455a14ed + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; + a = LeftRotate(0xa9e3e905 + a + ((b & d) | (c & ~d)), 5, 32 - 5) + b; d = LeftRotate(x2 + 0xfcefa3f8 + d + ((a & c) | (b & ~c)), 9, 32 - 9) + a; - c = LeftRotate(x7 + 0x676f02d9 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; - b = LeftRotate(x12 + 0x8d2a4c8a + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; + c = LeftRotate(0x676f02d9 + c + ((d & b) | (a & ~b)), 14, 32 - 14) + d; + b = LeftRotate(0x8d2a4c8a + b + ((c & a) | (d & ~a)), 20, 32 - 20) + c; a = LeftRotate(x5 + 0xfffa3942 + a + (b ^ c ^ d), 4, 32 - 4) + b; - d = LeftRotate(x8 + 0x8771f681 + d + (a ^ b ^ c), 11, 32 - 11) + a; - c = LeftRotate(x11 + 0x6d9d6122 + c + (d ^ a ^ b), 16, 32 - 16) + d; + d = LeftRotate(0x8771f681 + d + (a ^ b ^ c), 11, 32 - 11) + a; + c = LeftRotate(0x6d9d6122 + c + (d ^ a ^ b), 16, 32 - 16) + d; b = LeftRotate(x14 + 0xfde5380c + b + (c ^ d ^ a), 23, 32 - 23) + c; a = LeftRotate(x1 + 0xa4beea44 + a + (b ^ c ^ d), 4, 32 - 4) + b; d = LeftRotate(x4 + 0x4bdecfa9 + d + (a ^ b ^ c), 11, 32 - 11) + a; - c = LeftRotate(x7 + 0xf6bb4b60 + c + (d ^ a ^ b), 16, 32 - 16) + d; - b = LeftRotate(x10 + 0xbebfbc70 + b + (c ^ d ^ a), 23, 32 - 23) + c; - a = LeftRotate(x13 + 0x289b7ec6 + a + (b ^ c ^ d), 4, 32 - 4) + b; + c = LeftRotate(0xf6bb4b60 + c + (d ^ a ^ b), 16, 32 - 16) + d; + b = LeftRotate(0xbebfbc70 + b + (c ^ d ^ a), 23, 32 - 23) + c; + a = LeftRotate(0x289b7ec6 + a + (b ^ c ^ d), 4, 32 - 4) + b; d = LeftRotate(x0 + 0xeaa127fa + d + (a ^ b ^ c), 11, 32 - 11) + a; c = LeftRotate(x3 + 0xd4ef3085 + c + (d ^ a ^ b), 16, 32 - 16) + d; b = LeftRotate(x6 + 0x4881d05 + b + (c ^ d ^ a), 23, 32 - 23) + c; - a = 
LeftRotate(x9 + 0xd9d4d039 + a + (b ^ c ^ d), 4, 32 - 4) + b; - d = LeftRotate(x12 + 0xe6db99e5 + d + (a ^ b ^ c), 11, 32 - 11) + a; - c = LeftRotate(x15 + 0x1fa27cf8 + c + (d ^ a ^ b), 16, 32 - 16) + d; + a = LeftRotate(0xd9d4d039 + a + (b ^ c ^ d), 4, 32 - 4) + b; + d = LeftRotate(0xe6db99e5 + d + (a ^ b ^ c), 11, 32 - 11) + a; + c = LeftRotate(0x1fa27cf8 + c + (d ^ a ^ b), 16, 32 - 16) + d; b = LeftRotate(x2 + 0xc4ac5665 + b + (c ^ d ^ a), 23, 32 - 23) + c; a = LeftRotate(x0 + 0xf4292244 + a + (c ^ (b | ~d)), 6, 32 - 6) + b; - d = LeftRotate(x7 + 0x432aff97 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; + d = LeftRotate(0x432aff97 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; c = LeftRotate(x14 + 0xab9423a7 + c + (a ^ (d | ~b)), 15, 32 - 15) + d; b = LeftRotate(x5 + 0xfc93a039 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; - a = LeftRotate(x12 + 0x655b59c3 + a + (c ^ (b | ~d)), 6, 32 - 6) + b; + a = LeftRotate(0x655b59c3 + a + (c ^ (b | ~d)), 6, 32 - 6) + b; d = LeftRotate(x3 + 0x8f0ccc92 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; - c = LeftRotate(x10 + 0xffeff47d + c + (a ^ (d | ~b)), 15, 32 - 15) + d; + c = LeftRotate(0xffeff47d + c + (a ^ (d | ~b)), 15, 32 - 15) + d; b = LeftRotate(x1 + 0x85845dd1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; - a = LeftRotate(x8 + 0x6fa87e4f + a + (c ^ (b | ~d)), 6, 32 - 6) + b; - d = LeftRotate(x15 + 0xfe2ce6e0 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; + a = LeftRotate(0x6fa87e4f + a + (c ^ (b | ~d)), 6, 32 - 6) + b; + d = LeftRotate(0xfe2ce6e0 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; c = LeftRotate(x6 + 0xa3014314 + c + (a ^ (d | ~b)), 15, 32 - 15) + d; - b = LeftRotate(x13 + 0x4e0811a1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; + b = LeftRotate(0x4e0811a1 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; a = LeftRotate(x4 + 0xf7537e82 + a + (c ^ (b | ~d)), 6, 32 - 6) + b; - d = LeftRotate(x11 + 0xbd3af235 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; + d = LeftRotate(0xbd3af235 + d + (b ^ (a | ~c)), 10, 32 - 10) + a; c = LeftRotate(x2 + 0x2ad7d2bb + c + (a ^ (d | ~b)), 15, 
32 - 15) + d; - b = LeftRotate(x9 + 0xeb86d391 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; + b = LeftRotate(0xeb86d391 + b + (d ^ (c | ~a)), 21, 32 - 21) + c; return new[] { @@ -125,7 +128,7 @@ | (uint)bs[off + 3] << 24; } - private static uint LeftRotate(uint x, int left, int right) + private static uint LeftRotate(uint x, int left, int right) { return (x << left) | (x >> right); }