diff --git a/Cargo.toml b/Cargo.toml index 8e9de58..5a62a91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ edition = "2018" [dependencies] bit_field = "0.10.1" crunchy = "0.2.2" +md5 = "0.7.0" packed_simd = { version = "0.3.4", package = "packed_simd_2", features = ["into_bits"] } permutohedron = "0.2.4" rayon = "1.5.0" diff --git a/README.md b/README.md index 950eebe..e9c1c83 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ where you had to, given the dictionary, and given three MD5 hashes, find three-word anagrams of a phrase *"poultry outwits ants"* which result in these hashes. -My original solution was in mixture of C# and plain C (with a bit of Visual C++ +My [original solution](https://github.com/inga-lovinde/TrustPilotChallenge) +was in mixture of C# and plain C (with a bit of Visual C++ as a bridge), and heavily used AVX2 intrinsics for optimization. Rust now has a decent API frontend for AVX2 intrinsics @@ -18,11 +19,9 @@ find all anagrams no longer than N words and no longer than 27 bytes which produce given MD5 hashes. (The limit on the number of words is neccessary, because there are single-letter words -in the dictionary; and it makes the total number of anagrams astronomically large) +in the dictionary; and it makes the total number of anagrams astronomically large.) -This is a working draft, so far the code is extremely dirty (this is my first Rust project), -and it only lists all anagrams -and does not yet do actual MD5 calculation. +Note that this is my first Rust project. ## Algorithm description @@ -32,6 +31,7 @@ It also computes eight MD5 hashes at a time *per thread* (that is, 128 MD5 hashes at once on a modern 8-core CPU), with some further optimizations which further shave off several percents from MD5 computation time. 
+(md5 crate dependency is only used to nicely print results) We could split the problem into three parts: finding all anagrams (up to words reordering and replacing some of the words with their single-word anagrams), @@ -158,7 +158,13 @@ it will not severely affect performance. How to run to solve the original task for three-word anagrams: ``` -cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants" +cargo run data\words.txt data\hashes.txt 4 "poultry outwits ants" ``` -(Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer) \ No newline at end of file +(Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer.) + +In addition to the right solutions it will also output some wrong ones, +because for performance and transparency reasons only the first 4 bytes (8 hex digits) of hashes are compared. +This means that for every requested hash there is a 1/2^32 chance of collision, +so for 10 requested hashes you will get one false positive every 430 million anagrams, on average, +which allows one to roughly measure the performance of MD5 calculation. 
\ No newline at end of file diff --git a/data/hashes.txt b/data/hashes.txt new file mode 100644 index 0000000..5f4ce00 --- /dev/null +++ b/data/hashes.txt @@ -0,0 +1,10 @@ +e4820b45d2277f3844eac66c903e84be +23170acc097c24edb98fc5488ab033fe +4a9f51db2c7eba0c724499f749d3176a +665e5bcb0c20062fe8abaaf4628bb154 +e8a2cbb6206fc937082bb92e4ed9cd3d +74a613b8c64fb216dc22d4f2bd4965f4 +ccb5ed231ba04d750c963668391d1e61 +d864ae0e66c89cb78345967cb2f3ab6b +2b56477105d91076030e877c94dd9776 +732442feac8b5013e16a776486ac5447 diff --git a/src/anagram_analyzer.rs b/src/anagram_analyzer.rs new file mode 100644 index 0000000..9b704e7 --- /dev/null +++ b/src/anagram_analyzer.rs @@ -0,0 +1,48 @@ +use packed_simd::u8x32; +use crate::anagram_logger::log_anagram; +use crate::dictionary_builder::Dictionary; +use crate::hash_computer::CHUNK_SIZE; +use crate::hash_computer::find_hashes; +use crate::permutations_cache::PermutationsCache; + +fn generate_vector_substitutions<'a>(simple_dictionary: &'a Dictionary, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box + 'a> { + if permutation.len() == 0 { + return Box::new(std::iter::once(current_phrase.clone())); + } + + let result = simple_dictionary.words[permutation[0]].iter() + .flat_map(move |word_info| { + generate_vector_substitutions(&simple_dictionary, &permutation[1..], current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length), current_phrase_length + word_info.length + 1).into_iter() + }); + return Box::new(result); +} + +fn process_anagram_chunk(chunk: &[u8x32; CHUNK_SIZE], phrase_length: usize, hashes_to_find: &[u32]) -> () { + match find_hashes(chunk, phrase_length, hashes_to_find) { + Some(anagrams) => { + for anagram in anagrams { + log_anagram(anagram, phrase_length); + } + } + _ => () + } +} + +pub fn analyze_anagrams(anagram_vector: &Vec, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize, hashes_to_find: &[u32]) -> () { + let mut chunk: [u8x32; 
CHUNK_SIZE] = [u8x32::splat(0); CHUNK_SIZE]; + let mut chunk_position: usize = 0; + + permutations.get_permuted_vectors(&anagram_vector).iter() + .flat_map(|permuted_vector| { + generate_vector_substitutions(&dictionary, &permuted_vector, u8x32::splat(0), 0) + }) + .for_each(|anagram| { + chunk[chunk_position] = anagram; + chunk_position = (chunk_position + 1) % CHUNK_SIZE; + if chunk_position == 0 { + process_anagram_chunk(&chunk, phrase_length, hashes_to_find); + } + }); + + process_anagram_chunk(&chunk, phrase_length, hashes_to_find); +} \ No newline at end of file diff --git a/src/anagram_finder.rs b/src/anagram_finder.rs index 2237342..7ccc2a3 100644 --- a/src/anagram_finder.rs +++ b/src/anagram_finder.rs @@ -1,3 +1,6 @@ +// Finds all subsets of vectors within a given set (ordered by norm) which add up to a required vector +// Within a subset, order of vectors is consistent with the original order in a set + use crate::dictionary_builder; use crate::vector_alphabet; diff --git a/src/anagram_logger.rs b/src/anagram_logger.rs index 13f91ca..c462a86 100644 --- a/src/anagram_logger.rs +++ b/src/anagram_logger.rs @@ -1,38 +1,15 @@ +use md5; use packed_simd::u8x32; -use crate::dictionary_builder::Dictionary; -use crate::dictionary_builder::WordInfo; -use crate::permutations_cache::PermutationsCache; -fn get_anagram_view_from_simd(simd_vector: u8x32, phrase_length: usize) -> String { +fn get_anagram_string_from_simd(simd_vector: u8x32, phrase_length: usize) -> String { let mut string_bytes: [u8; 32] = [0; 32]; simd_vector.write_to_slice_unaligned(&mut string_bytes); String::from_utf8_lossy(&string_bytes[0..phrase_length]).into_owned() } -fn generate_vector_substitutions<'a>(simple_dictionary: &'a Vec>, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box + 'a> { - if permutation.len() == 0 { - return Box::new(std::iter::once(current_phrase.clone())); - } - - let result = simple_dictionary[permutation[0]].iter() - .flat_map(move 
|&word_info| { - generate_vector_substitutions(&simple_dictionary, &permutation[1..], current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length), current_phrase_length + word_info.length + 1).into_iter() - }); - return Box::new(result); +pub fn log_anagram(simd_vector: u8x32, phrase_length: usize) -> () { + let anagram_string = get_anagram_string_from_simd(simd_vector, phrase_length); + let hash = md5::compute(anagram_string.as_bytes()); + println!("{:x} {}", hash, anagram_string); } - -pub fn log_anagrams(anagram_vector: &Vec, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize) -> () { - let simple_vector: Vec = (0..anagram_vector.len()).collect(); - let simple_dictionary: Vec> = (0..anagram_vector.len()) - .map(|i| dictionary.words[anagram_vector[i]].iter().map(|word_info| word_info).collect()) - .collect(); - - permutations.get_permuted_vectors(&simple_vector).iter() - .flat_map(|permuted_vector| { - generate_vector_substitutions(&simple_dictionary, &permuted_vector, u8x32::splat(0), 0) - }) - .for_each(|anagram| { - println!("{}", get_anagram_view_from_simd(anagram, phrase_length)); - }) -} \ No newline at end of file diff --git a/src/hash_computer.rs b/src/hash_computer.rs index ddc291f..e606844 100644 --- a/src/hash_computer.rs +++ b/src/hash_computer.rs @@ -3,9 +3,10 @@ use packed_simd::u32x8; use packed_simd::u8x32; pub const MAX_PHRASE_LENGTH: usize = 31; +pub const CHUNK_SIZE: usize = 8; #[allow(unused_assignments)] -pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8] { +fn compute_hashes_vector(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> u32x8 { let mut a: u32x8 = u32x8::splat(0x67452301); let mut b: u32x8 = u32x8::splat(0xefcdab89); let mut c: u32x8 = u32x8::splat(0x98badcfe); @@ -174,10 +175,35 @@ pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8] // the remaining three iterations are unnecessary, // as the value of a after iteration 64 
is equal // to the value of b after iteration 61 - a = b + u32x8::splat(0x67452301); + return b + u32x8::splat(0x67452301); - let mut result: [u32; 8] = [0; 8]; - a.write_to_slice_unaligned(&mut result); - result } } + +pub fn compute_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> [u32; CHUNK_SIZE] { + let hashes_vector = compute_hashes_vector(messages, messages_length); + let mut result: [u32; CHUNK_SIZE] = [0; CHUNK_SIZE]; + hashes_vector.write_to_slice_unaligned(&mut result); + result +} + +pub fn find_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize, hashes_to_find: &[u32]) -> Option> { + let hashes_vector = compute_hashes_vector(messages, messages_length); + + let has_matches: bool = hashes_to_find.iter() + .any(|&hash| hashes_vector.eq(u32x8::splat(hash)).any()); + + if !has_matches { + return None; + } + + let mut result: Vec<_> = Vec::new(); + for i in 0..CHUNK_SIZE { + let hash = hashes_vector.extract(i); + if hashes_to_find.contains(&hash) { + result.push(messages[i]); + } + } + + Some(result) +} diff --git a/src/lib.rs b/src/lib.rs index 6275e08..4d3710a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #![feature(map_into_keys_values)] +pub mod anagram_analyzer; pub mod anagram_finder; pub mod anagram_logger; pub mod dictionary_builder; diff --git a/src/main.rs b/src/main.rs index 0847f61..8c3336b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,8 +4,8 @@ use std::cmp; use std::env; use rayon::prelude::*; +use trustpilot_challenge_rust::anagram_analyzer; use trustpilot_challenge_rust::anagram_finder; -use trustpilot_challenge_rust::anagram_logger; use trustpilot_challenge_rust::dictionary_builder; use trustpilot_challenge_rust::hash_computer; use trustpilot_challenge_rust::permutations_cache; @@ -20,12 +20,6 @@ fn main() { let max_requested_number_of_words = (&args[3]).parse::().unwrap(); let phrase = &args[4]; - /*let message = hash_computer::prepare_messages(phrase); - let hashes = 
hash_computer::compute_hashes(message, phrase.len()); - for hash in hashes.iter() { - println!("{:#08x}", hash); - }*/ - let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count(); let max_supported_number_of_words = (hash_computer::MAX_PHRASE_LENGTH - phrase_byte_length_without_spaces) + 1; @@ -40,12 +34,20 @@ fn main() { let dictionary = dictionary_builder::build_dictionary(phrase, words); + let hashes_strings = read_lines::lines_from_file(hashes_file_path).unwrap(); + let mut hashes_to_find: Vec = Vec::new(); + for hash_string in hashes_strings { + let hash: u128 = u128::from_str_radix(&hash_string, 16).unwrap(); + hashes_to_find.push(((hash >> 96) as u32).to_be()); + } + for number_of_words in 1..=max_number_of_words { - //println!("======= Number of words: {} =======", number_of_words); let phrase_length = phrase_byte_length_without_spaces + number_of_words - 1; let permutations = permutations_cache::PermutationsCache::new(number_of_words); - let result = anagram_finder::find_anagrams(&dictionary, number_of_words); - result.par_iter() - .for_each(|anagram_vector| anagram_logger::log_anagrams(anagram_vector, &dictionary, &permutations, phrase_length)); + let anagram_vectors = anagram_finder::find_anagrams(&dictionary, number_of_words); + anagram_vectors.par_iter() + .for_each(|anagram_vector| { + anagram_analyzer::analyze_anagrams(anagram_vector, &dictionary, &permutations, phrase_length, &hashes_to_find) + }); } }