Anagram analyzer implemented; solution is fully functional now

4 years ago · e57b327c57
parent 69edfe4e14
commit e57b327c57
9 changed files with 126 additions and 52 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -9,6 +9,7 @@ edition = "2018"
 [dependencies]
 bit_field = "0.10.1"
 crunchy = "0.2.2"
+md5 = "0.7.0"
 packed_simd = { version = "0.3.4", package = "packed_simd_2", features = ["into_bits"] }
 permutohedron = "0.2.4"
 rayon = "1.5.0"
--- a/README.md
+++ b/README.md
@ -6,7 +6,8 @@ where you had to, given the dictionary, and given three MD5 hashes,
 find three-word anagrams of a phrase *"poultry outwits ants"*
 which result in these hashes.

-My original solution was in mixture of C# and plain C (with a bit of Visual C++
+My [original solution](https://github.com/inga-lovinde/TrustPilotChallenge)
+was in a mixture of C# and plain C (with a bit of Visual C++
 as a bridge), and heavily used AVX2 intrinsics for optimization.

 Rust now has a decent API frontend for AVX2 intrinsics 
@ -18,11 +19,9 @@ find all anagrams no longer than N words and no longer than 27 bytes
 which produce given MD5 hashes.

 (The limit on the number of words is necessary, because there are single-letter words
-in the dictionary; and it makes the total number of anagrams astronomically large)
+in the dictionary; and it makes the total number of anagrams astronomically large.)

-This is a working draft, so far the code is extremely dirty (this is my first Rust project),
-and it only lists all anagrams
-and does not yet do actual MD5 calculation.
+Note that this is my first Rust project.

 ## Algorithm description

@ -32,6 +31,7 @@ It also computes eight MD5 hashes at a time *per thread*
 (that is, 128 MD5 hashes at once on a modern 8-core CPU),
 with some further optimizations which further shave off
 several percents from MD5 computation time.
+(The md5 crate dependency is only used to print the results nicely.)

 We could split the problem into three parts: finding all anagrams
 (up to words reordering and replacing some of the words with their single-word anagrams),
@ -158,7 +158,13 @@ it will not severely affect performance.
 How to run to solve the original task for three-word anagrams:

 ```
-cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants"
+cargo run data\words.txt data\hashes.txt 4 "poultry outwits ants"
 ```

-(Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer)
+(Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer.)
+
+In addition to the right solutions it will also output some wrong ones,
+because for performance and transparency reasons only the first 4 bytes (8 hex digits) of hashes are compared.
+This means that for every requested hash there is a 1/2^32 chance of collision per anagram,
+so for 10 requested hashes you will get one false positive every 430 million anagrams, on average,
+which allows one to roughly measure the performance of MD5 calculation.
--- a/data/hashes.txt
+++ b/data/hashes.txt
@ -0,0 +1,10 @@
+e4820b45d2277f3844eac66c903e84be
+23170acc097c24edb98fc5488ab033fe
+4a9f51db2c7eba0c724499f749d3176a
+665e5bcb0c20062fe8abaaf4628bb154
+e8a2cbb6206fc937082bb92e4ed9cd3d
+74a613b8c64fb216dc22d4f2bd4965f4
+ccb5ed231ba04d750c963668391d1e61
+d864ae0e66c89cb78345967cb2f3ab6b
+2b56477105d91076030e877c94dd9776
+732442feac8b5013e16a776486ac5447
--- a/src/anagram_analyzer.rs
+++ b/src/anagram_analyzer.rs
@ -0,0 +1,48 @@
+use packed_simd::u8x32;
+use crate::anagram_logger::log_anagram;
+use crate::dictionary_builder::Dictionary;
+use crate::hash_computer::CHUNK_SIZE;
+use crate::hash_computer::find_hashes;
+use crate::permutations_cache::PermutationsCache;
+
+/// Lazily enumerates every phrase obtained by picking one concrete word for each
+/// entry of `permutation`, in order.
+///
+/// Each entry of `permutation` is an index into `simple_dictionary.words`, whose
+/// element is the list of words that may be used at that position.
+/// `current_phrase` is the phrase assembled so far and `current_phrase_length`
+/// is the byte offset at which the next word is placed. Every chosen word
+/// advances the offset by its `length` plus one byte (presumably the separator
+/// between words; it is produced by `get_simd_word_for_offset`, not here).
+fn generate_vector_substitutions<'a>(simple_dictionary: &'a Dictionary, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box<dyn Iterator<Item = u8x32> + 'a> {
+    let (&word_index, remaining) = match permutation.split_first() {
+        Some(parts) => parts,
+        // Nothing left to substitute: the accumulated phrase is the only result.
+        None => return Box::new(std::iter::once(current_phrase)),
+    };
+
+    let candidates = simple_dictionary.words[word_index].iter();
+    Box::new(candidates.flat_map(move |word_info| {
+        // XOR merges the word, already shifted to the current offset, into the phrase.
+        let extended_phrase = current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length);
+        let extended_length = current_phrase_length + word_info.length + 1;
+        generate_vector_substitutions(simple_dictionary, remaining, extended_phrase, extended_length)
+    }))
+}
+
+/// Hashes one full chunk of phrases and logs every phrase that `find_hashes`
+/// reports as matching one of `hashes_to_find`.
+///
+/// Does nothing when `find_hashes` returns `None`.
+fn process_anagram_chunk(chunk: &[u8x32; CHUNK_SIZE], phrase_length: usize, hashes_to_find: &[u32]) -> () {
+    find_hashes(chunk, phrase_length, hashes_to_find)
+        .into_iter()
+        .flatten()
+        .for_each(|matched_phrase| log_anagram(matched_phrase, phrase_length));
+}
+
+/// Hashes every phrase derived from `anagram_vector` and logs the ones whose
+/// hash is listed in `hashes_to_find`.
+///
+/// All permutations of `anagram_vector` are obtained from `permutations`, each
+/// permutation is expanded into concrete phrases using `dictionary`, and the
+/// phrases are handed to the hasher `CHUNK_SIZE` at a time. `phrase_length` is
+/// the byte length shared by all generated phrases.
+///
+/// A trailing partial chunk is padded with all-zero vectors before it is
+/// processed, and nothing is processed when no phrases are left over, so a
+/// phrase is never hashed (and therefore never logged) twice by one call.
+pub fn analyze_anagrams(anagram_vector: &Vec<usize>, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize, hashes_to_find: &[u32]) -> () {
+    let mut chunk: [u8x32; CHUNK_SIZE] = [u8x32::splat(0); CHUNK_SIZE];
+    let mut chunk_position: usize = 0;
+
+    permutations.get_permuted_vectors(anagram_vector).iter()
+        .flat_map(|permuted_vector| {
+            generate_vector_substitutions(dictionary, permuted_vector, u8x32::splat(0), 0)
+        })
+        .for_each(|anagram| {
+            chunk[chunk_position] = anagram;
+            chunk_position += 1;
+            if chunk_position == CHUNK_SIZE {
+                process_anagram_chunk(&chunk, phrase_length, hashes_to_find);
+                chunk_position = 0;
+            }
+        });
+
+    // Flush the leftover phrases, if any. The slots past `chunk_position` still
+    // hold phrases from the previous chunk, which has already been processed;
+    // overwrite them so that a match among them is not reported a second time.
+    // NOTE(review): a zero padding slot would still be reported in the
+    // extremely unlikely case that its hash collides with a requested one.
+    if chunk_position > 0 {
+        for stale_slot in chunk[chunk_position..].iter_mut() {
+            *stale_slot = u8x32::splat(0);
+        }
+        process_anagram_chunk(&chunk, phrase_length, hashes_to_find);
+    }
+}
--- a/src/anagram_finder.rs
+++ b/src/anagram_finder.rs
@ -1,3 +1,6 @@
+// Finds all subsets of vectors within a given set (ordered by norm) which add up to a required vector
+// Within a subset, order of vectors is consistent with the original order in a set
+
 use crate::dictionary_builder;
 use crate::vector_alphabet;

--- a/src/anagram_logger.rs
+++ b/src/anagram_logger.rs
@ -1,38 +1,15 @@
+use md5;
 use packed_simd::u8x32;
-use crate::dictionary_builder::Dictionary;
-use crate::dictionary_builder::WordInfo;
-use crate::permutations_cache::PermutationsCache;

-fn get_anagram_view_from_simd(simd_vector: u8x32, phrase_length: usize) -> String {
+/// Decodes the first `phrase_length` bytes of `simd_vector` into a `String`.
+///
+/// The conversion is lossy: byte sequences that are not valid UTF-8 are
+/// replaced instead of causing an error. Panics if `phrase_length` exceeds 32,
+/// the size of the byte buffer the vector is written to.
+fn get_anagram_string_from_simd(simd_vector: u8x32, phrase_length: usize) -> String {
+    let mut raw_bytes = [0u8; 32];
+    simd_vector.write_to_slice_unaligned(&mut raw_bytes);
+
+    let phrase_bytes = &raw_bytes[..phrase_length];
+    String::from_utf8_lossy(phrase_bytes).into_owned()
 }

-fn generate_vector_substitutions<'a>(simple_dictionary: &'a Vec<Vec<&WordInfo>>, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box<dyn Iterator<Item = u8x32> + 'a> {
-    if permutation.len() == 0 {
-        return Box::new(std::iter::once(current_phrase.clone()));
-    }
-
-    let result = simple_dictionary[permutation[0]].iter()
-        .flat_map(move |&word_info| {
-            generate_vector_substitutions(&simple_dictionary, &permutation[1..], current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length), current_phrase_length + word_info.length + 1).into_iter()
-        });
-    return Box::new(result);
-}
-
-pub fn log_anagrams(anagram_vector: &Vec<usize>, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize) -> () {
-    let simple_vector: Vec<usize> = (0..anagram_vector.len()).collect();
-    let simple_dictionary: Vec<Vec<&WordInfo>> = (0..anagram_vector.len())
-        .map(|i| dictionary.words[anagram_vector[i]].iter().map(|word_info| word_info).collect())
-        .collect();
-
-    permutations.get_permuted_vectors(&simple_vector).iter()
-        .flat_map(|permuted_vector| {
-            generate_vector_substitutions(&simple_dictionary, &permuted_vector, u8x32::splat(0), 0)
-        })
-        .for_each(|anagram| {
-            println!("{}", get_anagram_view_from_simd(anagram, phrase_length));
-        })
+/// Prints one anagram to stdout as `<md5 hex digest> <phrase>`.
+///
+/// The phrase is decoded from the first `phrase_length` bytes of `simd_vector`,
+/// and the digest is recomputed over that decoded text with the `md5` crate.
+pub fn log_anagram(simd_vector: u8x32, phrase_length: usize) -> () {
+    let phrase = get_anagram_string_from_simd(simd_vector, phrase_length);
+    println!("{:x} {}", md5::compute(phrase.as_bytes()), phrase);
 }
--- a/src/hash_computer.rs
+++ b/src/hash_computer.rs
@ -3,9 +3,10 @@ use packed_simd::u32x8;
 use packed_simd::u8x32;

 pub const MAX_PHRASE_LENGTH: usize = 31;
+pub const CHUNK_SIZE: usize = 8;

 #[allow(unused_assignments)]
-pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8] {
+fn compute_hashes_vector(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> u32x8 {
    let mut a: u32x8 = u32x8::splat(0x67452301);
    let mut b: u32x8 = u32x8::splat(0xefcdab89);
    let mut c: u32x8 = u32x8::splat(0x98badcfe);
@ -174,10 +175,35 @@ pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8]
        // the remaining three iterations are unnecessary,
        // as the value of a after iteration 64 is equal
        // to the value of b after iteration 61
-        a = b + u32x8::splat(0x67452301);
+        return b + u32x8::splat(0x67452301);

-        let mut result: [u32; 8] = [0; 8];
-        a.write_to_slice_unaligned(&mut result);
+    }
+}
+
+/// Hashes all `CHUNK_SIZE` messages at once and returns one 32-bit value per
+/// message, in the same order as `messages`.
+///
+/// The values are the lanes of the vector produced by `compute_hashes_vector`,
+/// copied into a plain array.
+pub fn compute_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> [u32; CHUNK_SIZE] {
+    let mut result = [0u32; CHUNK_SIZE];
+    compute_hashes_vector(messages, messages_length).write_to_slice_unaligned(&mut result);
    result
 }
+
+/// Hashes all `CHUNK_SIZE` messages at once and returns those whose 32-bit hash
+/// value occurs in `hashes_to_find`.
+///
+/// Returns `None` when no message in the chunk matches, otherwise `Some` with
+/// the matching messages in their original order.
+pub fn find_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize, hashes_to_find: &[u32]) -> Option<Vec<u8x32>> {
+    let hashes_vector = compute_hashes_vector(messages, messages_length);
+
+    // Vectorised rejection test: almost every chunk contains no match, so the
+    // per-lane work below is skipped unless some lane equals a requested hash.
+    let chunk_has_match = hashes_to_find.iter()
+        .any(|&wanted| hashes_vector.eq(u32x8::splat(wanted)).any());
+    if !chunk_has_match {
+        return None;
+    }
+
+    let mut lane_hashes = [0u32; CHUNK_SIZE];
+    hashes_vector.write_to_slice_unaligned(&mut lane_hashes);
+
+    let matching_messages = messages.iter()
+        .zip(lane_hashes.iter())
+        .filter(|(_, lane_hash)| hashes_to_find.contains(lane_hash))
+        .map(|(&message, _)| message)
+        .collect();
+
+    Some(matching_messages)
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,5 +1,6 @@
 #![feature(map_into_keys_values)]

+pub mod anagram_analyzer;
 pub mod anagram_finder;
 pub mod anagram_logger;
 pub mod dictionary_builder;
--- a/src/main.rs
+++ b/src/main.rs
@ -4,8 +4,8 @@ use std::cmp;
 use std::env;
 use rayon::prelude::*;

+use trustpilot_challenge_rust::anagram_analyzer;
 use trustpilot_challenge_rust::anagram_finder;
-use trustpilot_challenge_rust::anagram_logger;
 use trustpilot_challenge_rust::dictionary_builder;
 use trustpilot_challenge_rust::hash_computer;
 use trustpilot_challenge_rust::permutations_cache;
@ -20,12 +20,6 @@ fn main() {
    let max_requested_number_of_words = (&args[3]).parse::<usize>().unwrap();
    let phrase = &args[4];

-    /*let message = hash_computer::prepare_messages(phrase);
-    let hashes = hash_computer::compute_hashes(message, phrase.len());
-    for hash in hashes.iter() {
-        println!("{:#08x}", hash);
-    }*/
-
    let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count();
    let max_supported_number_of_words = (hash_computer::MAX_PHRASE_LENGTH - phrase_byte_length_without_spaces) + 1;

@ -40,12 +34,20 @@ fn main() {

    let dictionary = dictionary_builder::build_dictionary(phrase, words);

+    let hashes_strings = read_lines::lines_from_file(hashes_file_path).unwrap();
+    let mut hashes_to_find: Vec<u32> = Vec::new();
+    for hash_string in hashes_strings {
+        let hash: u128 = u128::from_str_radix(&hash_string, 16).unwrap();
+        hashes_to_find.push(((hash >> 96) as u32).to_be());
+    }
+
    for number_of_words in 1..=max_number_of_words {
-        //println!("======= Number of words: {} =======", number_of_words);
        let phrase_length = phrase_byte_length_without_spaces + number_of_words - 1;
        let permutations = permutations_cache::PermutationsCache::new(number_of_words);
-        let result = anagram_finder::find_anagrams(&dictionary, number_of_words);
-        result.par_iter()
-            .for_each(|anagram_vector| anagram_logger::log_anagrams(anagram_vector, &dictionary, &permutations, phrase_length));
+        let anagram_vectors = anagram_finder::find_anagrams(&dictionary, number_of_words);
+        anagram_vectors.par_iter()
+            .for_each(|anagram_vector| {
+                anagram_analyzer::analyze_anagrams(anagram_vector, &dictionary, &permutations, phrase_length, &hashes_to_find)
+            });
    }
 }