Anagram analyzer implemented; solution is fully functional now

main
Inga 🏳‍🌈 4 years ago
parent 69edfe4e14
commit e57b327c57
  1. 1
      Cargo.toml
  2. 20
      README.md
  3. 10
      data/hashes.txt
  4. 48
      src/anagram_analyzer.rs
  5. 3
      src/anagram_finder.rs
  6. 35
      src/anagram_logger.rs
  7. 36
      src/hash_computer.rs
  8. 1
      src/lib.rs
  9. 24
      src/main.rs

@ -9,6 +9,7 @@ edition = "2018"
[dependencies] [dependencies]
bit_field = "0.10.1" bit_field = "0.10.1"
crunchy = "0.2.2" crunchy = "0.2.2"
md5 = "0.7.0"
packed_simd = { version = "0.3.4", package = "packed_simd_2", features = ["into_bits"] } packed_simd = { version = "0.3.4", package = "packed_simd_2", features = ["into_bits"] }
permutohedron = "0.2.4" permutohedron = "0.2.4"
rayon = "1.5.0" rayon = "1.5.0"

@ -6,7 +6,8 @@ where you had to, given the dictionary, and given three MD5 hashes,
find three-word anagrams of a phrase *"poultry outwits ants"* find three-word anagrams of a phrase *"poultry outwits ants"*
which result in these hashes. which result in these hashes.
My original solution was in mixture of C# and plain C (with a bit of Visual C++ My [original solution](https://github.com/inga-lovinde/TrustPilotChallenge)
was in mixture of C# and plain C (with a bit of Visual C++
as a bridge), and heavily used AVX2 intrinsics for optimization. as a bridge), and heavily used AVX2 intrinsics for optimization.
Rust now has a decent API frontend for AVX2 intrinsics Rust now has a decent API frontend for AVX2 intrinsics
@ -18,11 +19,9 @@ find all anagrams no longer than N words and no longer than 27 bytes
which produce given MD5 hashes. which produce given MD5 hashes.
(The limit on the number of words is neccessary, because there are single-letter words (The limit on the number of words is neccessary, because there are single-letter words
in the dictionary; and it makes the total number of anagrams astronomically large) in the dictionary; and it makes the total number of anagrams astronomically large.)
This is a working draft, so far the code is extremely dirty (this is my first Rust project), Note that this is my first Rust project.
and it only lists all anagrams
and does not yet do actual MD5 calculation.
## Algorithm description ## Algorithm description
@ -32,6 +31,7 @@ It also computes eight MD5 hashes at a time *per thread*
(that is, 128 MD5 hashes at once on a modern 8-core CPU), (that is, 128 MD5 hashes at once on a modern 8-core CPU),
with some further optimizations which further shave off with some further optimizations which further shave off
several percents from MD5 computation time. several percents from MD5 computation time.
(md5 crate dependency is only used to nicely print results)
We could split the problem into three parts: finding all anagrams We could split the problem into three parts: finding all anagrams
(up to words reordering and replacing some of the words with their single-word anagrams), (up to words reordering and replacing some of the words with their single-word anagrams),
@ -158,7 +158,13 @@ it will not severely affect performance.
How to run to solve the original task for three-word anagrams: How to run to solve the original task for three-word anagrams:
``` ```
cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants" cargo run data\words.txt data\hashes.txt 4 "poultry outwits ants"
``` ```
(Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer) (Note that CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer.)
In addition to the right solutions it will also output some wrong ones,
because for performance and transparency reasons only the first 4 bytes of every hash are compared.
This means that for every requested hash there is a 1/2^32 chance of collision per anagram,
so for 10 requested hashes you will get one false positive every ~430 million anagrams, on average,
which allows one to roughly measure the performance of MD5 calculation.

@ -0,0 +1,10 @@
e4820b45d2277f3844eac66c903e84be
23170acc097c24edb98fc5488ab033fe
4a9f51db2c7eba0c724499f749d3176a
665e5bcb0c20062fe8abaaf4628bb154
e8a2cbb6206fc937082bb92e4ed9cd3d
74a613b8c64fb216dc22d4f2bd4965f4
ccb5ed231ba04d750c963668391d1e61
d864ae0e66c89cb78345967cb2f3ab6b
2b56477105d91076030e877c94dd9776
732442feac8b5013e16a776486ac5447

@ -0,0 +1,48 @@
use packed_simd::u8x32;
use crate::anagram_logger::log_anagram;
use crate::dictionary_builder::Dictionary;
use crate::hash_computer::CHUNK_SIZE;
use crate::hash_computer::find_hashes;
use crate::permutations_cache::PermutationsCache;
// Recursively expands one permutation of word slots into every concrete phrase.
// For each word class index in `permutation`, every candidate word is XOR-ed
// into the 32-byte phrase vector at the current byte offset (via
// get_simd_word_for_offset), and the running length advances by the word length
// plus one — presumably reserving a byte for the separating space (TODO confirm
// against get_simd_word_for_offset). Returns a lazy boxed iterator over all
// resulting phrase vectors.
fn generate_vector_substitutions<'a>(simple_dictionary: &'a Dictionary, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box<dyn Iterator<Item = u8x32> + 'a> {
    // Base case: no slots left — yield the accumulated phrase.
    // u8x32 is Copy, so the original `.clone()` was redundant.
    if permutation.is_empty() {
        return Box::new(std::iter::once(current_phrase));
    }

    // The original `.into_iter()` on the recursive result was redundant:
    // Box<dyn Iterator> is already an Iterator and flat_map accepts it directly.
    Box::new(simple_dictionary.words[permutation[0]].iter()
        .flat_map(move |word_info| {
            generate_vector_substitutions(
                simple_dictionary,
                &permutation[1..],
                current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length),
                current_phrase_length + word_info.length + 1,
            )
        }))
}
// Hashes one full chunk of candidate phrases and logs every phrase whose
// (truncated) hash occurs in `hashes_to_find`.
fn process_anagram_chunk(chunk: &[u8x32; CHUNK_SIZE], phrase_length: usize, hashes_to_find: &[u32]) {
    // `if let` replaces the original match with an empty `_ => ()` arm;
    // find_hashes returns None when no lane of the chunk matched.
    if let Some(anagrams) = find_hashes(chunk, phrase_length, hashes_to_find) {
        for anagram in anagrams {
            log_anagram(anagram, phrase_length);
        }
    }
}
// For a single anagram vector, enumerates every permutation of its word slots
// and every single-word-anagram substitution, accumulating the resulting
// phrase vectors into fixed-size chunks so find_hashes can hash CHUNK_SIZE
// messages at once.
pub fn analyze_anagrams(anagram_vector: &Vec<usize>, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize, hashes_to_find: &[u32]) {
    let mut chunk: [u8x32; CHUNK_SIZE] = [u8x32::splat(0); CHUNK_SIZE];
    let mut chunk_position: usize = 0;

    permutations.get_permuted_vectors(&anagram_vector).iter()
        .flat_map(|permuted_vector| {
            generate_vector_substitutions(&dictionary, &permuted_vector, u8x32::splat(0), 0)
        })
        .for_each(|anagram| {
            chunk[chunk_position] = anagram;
            chunk_position = (chunk_position + 1) % CHUNK_SIZE;
            if chunk_position == 0 {
                // Chunk is full — hash all CHUNK_SIZE phrases in one go.
                process_anagram_chunk(&chunk, phrase_length, hashes_to_find);
            }
        });

    // BUG FIX: the original unconditionally flushed the chunk here, which
    // re-hashed an already-processed chunk (duplicating any matches) whenever
    // the total phrase count was an exact multiple of CHUNK_SIZE, and re-hashed
    // stale leftovers from the previous chunk otherwise. Flush only a
    // partially-filled chunk, with the unused tail slots zeroed out.
    if chunk_position != 0 {
        for slot in chunk[chunk_position..].iter_mut() {
            *slot = u8x32::splat(0);
        }
        process_anagram_chunk(&chunk, phrase_length, hashes_to_find);
    }
}

@ -1,3 +1,6 @@
// Finds all subsets of vectors within a given set (ordered by norm) which add up to a required vector
// Within a subset, order of vectors is consistent with the original order in a set
use crate::dictionary_builder; use crate::dictionary_builder;
use crate::vector_alphabet; use crate::vector_alphabet;

@ -1,38 +1,15 @@
use md5;
use packed_simd::u8x32; use packed_simd::u8x32;
use crate::dictionary_builder::Dictionary;
use crate::dictionary_builder::WordInfo;
use crate::permutations_cache::PermutationsCache;
fn get_anagram_view_from_simd(simd_vector: u8x32, phrase_length: usize) -> String { fn get_anagram_string_from_simd(simd_vector: u8x32, phrase_length: usize) -> String {
let mut string_bytes: [u8; 32] = [0; 32]; let mut string_bytes: [u8; 32] = [0; 32];
simd_vector.write_to_slice_unaligned(&mut string_bytes); simd_vector.write_to_slice_unaligned(&mut string_bytes);
String::from_utf8_lossy(&string_bytes[0..phrase_length]).into_owned() String::from_utf8_lossy(&string_bytes[0..phrase_length]).into_owned()
} }
fn generate_vector_substitutions<'a>(simple_dictionary: &'a Vec<Vec<&WordInfo>>, permutation: &'a [usize], current_phrase: u8x32, current_phrase_length: usize) -> Box<dyn Iterator<Item = u8x32> + 'a> { pub fn log_anagram(simd_vector: u8x32, phrase_length: usize) -> () {
if permutation.len() == 0 { let anagram_string = get_anagram_string_from_simd(simd_vector, phrase_length);
return Box::new(std::iter::once(current_phrase.clone())); let hash = md5::compute(anagram_string.as_bytes());
} println!("{:x} {}", hash, anagram_string);
let result = simple_dictionary[permutation[0]].iter()
.flat_map(move |&word_info| {
generate_vector_substitutions(&simple_dictionary, &permutation[1..], current_phrase ^ word_info.get_simd_word_for_offset(current_phrase_length), current_phrase_length + word_info.length + 1).into_iter()
});
return Box::new(result);
} }
pub fn log_anagrams(anagram_vector: &Vec<usize>, dictionary: &Dictionary, permutations: &PermutationsCache, phrase_length: usize) -> () {
let simple_vector: Vec<usize> = (0..anagram_vector.len()).collect();
let simple_dictionary: Vec<Vec<&WordInfo>> = (0..anagram_vector.len())
.map(|i| dictionary.words[anagram_vector[i]].iter().map(|word_info| word_info).collect())
.collect();
permutations.get_permuted_vectors(&simple_vector).iter()
.flat_map(|permuted_vector| {
generate_vector_substitutions(&simple_dictionary, &permuted_vector, u8x32::splat(0), 0)
})
.for_each(|anagram| {
println!("{}", get_anagram_view_from_simd(anagram, phrase_length));
})
}

@ -3,9 +3,10 @@ use packed_simd::u32x8;
use packed_simd::u8x32; use packed_simd::u8x32;
pub const MAX_PHRASE_LENGTH: usize = 31; pub const MAX_PHRASE_LENGTH: usize = 31;
pub const CHUNK_SIZE: usize = 8;
#[allow(unused_assignments)] #[allow(unused_assignments)]
pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8] { fn compute_hashes_vector(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> u32x8 {
let mut a: u32x8 = u32x8::splat(0x67452301); let mut a: u32x8 = u32x8::splat(0x67452301);
let mut b: u32x8 = u32x8::splat(0xefcdab89); let mut b: u32x8 = u32x8::splat(0xefcdab89);
let mut c: u32x8 = u32x8::splat(0x98badcfe); let mut c: u32x8 = u32x8::splat(0x98badcfe);
@ -174,10 +175,35 @@ pub fn compute_hashes(messages: [u8x32; 8], messages_length: usize) -> [u32; 8]
// the remaining three iterations are unnecessary, // the remaining three iterations are unnecessary,
// as the value of a after iteration 64 is equal // as the value of a after iteration 64 is equal
// to the value of b after iteration 61 // to the value of b after iteration 61
a = b + u32x8::splat(0x67452301); return b + u32x8::splat(0x67452301);
let mut result: [u32; 8] = [0; 8];
a.write_to_slice_unaligned(&mut result);
result
} }
} }
// Scalarizes the SIMD hash vector produced by compute_hashes_vector into a
// plain array of CHUNK_SIZE per-message hash values.
pub fn compute_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize) -> [u32; CHUNK_SIZE] {
    let mut hashes: [u32; CHUNK_SIZE] = [0; CHUNK_SIZE];
    compute_hashes_vector(messages, messages_length).write_to_slice_unaligned(&mut hashes);
    hashes
}
// Hashes a chunk of messages and returns those whose hash value occurs in
// `hashes_to_find`, or None when nothing in the chunk matches.
pub fn find_hashes(messages: &[u8x32; CHUNK_SIZE], messages_length: usize, hashes_to_find: &[u32]) -> Option<Vec<u8x32>> {
    let hashes_vector = compute_hashes_vector(messages, messages_length);

    // Fast path: one SIMD equality test per requested hash; bail out early
    // when no lane matches any of them.
    let any_match = hashes_to_find.iter()
        .any(|&hash| hashes_vector.eq(u32x8::splat(hash)).any());
    if !any_match {
        return None;
    }

    // Slow path (rare): extract each lane and collect the matching messages.
    let matches = (0..CHUNK_SIZE)
        .filter(|&lane| hashes_to_find.contains(&hashes_vector.extract(lane)))
        .map(|lane| messages[lane])
        .collect();
    Some(matches)
}

@ -1,5 +1,6 @@
#![feature(map_into_keys_values)] #![feature(map_into_keys_values)]
pub mod anagram_analyzer;
pub mod anagram_finder; pub mod anagram_finder;
pub mod anagram_logger; pub mod anagram_logger;
pub mod dictionary_builder; pub mod dictionary_builder;

@ -4,8 +4,8 @@ use std::cmp;
use std::env; use std::env;
use rayon::prelude::*; use rayon::prelude::*;
use trustpilot_challenge_rust::anagram_analyzer;
use trustpilot_challenge_rust::anagram_finder; use trustpilot_challenge_rust::anagram_finder;
use trustpilot_challenge_rust::anagram_logger;
use trustpilot_challenge_rust::dictionary_builder; use trustpilot_challenge_rust::dictionary_builder;
use trustpilot_challenge_rust::hash_computer; use trustpilot_challenge_rust::hash_computer;
use trustpilot_challenge_rust::permutations_cache; use trustpilot_challenge_rust::permutations_cache;
@ -20,12 +20,6 @@ fn main() {
let max_requested_number_of_words = (&args[3]).parse::<usize>().unwrap(); let max_requested_number_of_words = (&args[3]).parse::<usize>().unwrap();
let phrase = &args[4]; let phrase = &args[4];
/*let message = hash_computer::prepare_messages(phrase);
let hashes = hash_computer::compute_hashes(message, phrase.len());
for hash in hashes.iter() {
println!("{:#08x}", hash);
}*/
let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count(); let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count();
let max_supported_number_of_words = (hash_computer::MAX_PHRASE_LENGTH - phrase_byte_length_without_spaces) + 1; let max_supported_number_of_words = (hash_computer::MAX_PHRASE_LENGTH - phrase_byte_length_without_spaces) + 1;
@ -40,12 +34,20 @@ fn main() {
let dictionary = dictionary_builder::build_dictionary(phrase, words); let dictionary = dictionary_builder::build_dictionary(phrase, words);
let hashes_strings = read_lines::lines_from_file(hashes_file_path).unwrap();
let mut hashes_to_find: Vec<u32> = Vec::new();
for hash_string in hashes_strings {
let hash: u128 = u128::from_str_radix(&hash_string, 16).unwrap();
hashes_to_find.push(((hash >> 96) as u32).to_be());
}
for number_of_words in 1..=max_number_of_words { for number_of_words in 1..=max_number_of_words {
//println!("======= Number of words: {} =======", number_of_words);
let phrase_length = phrase_byte_length_without_spaces + number_of_words - 1; let phrase_length = phrase_byte_length_without_spaces + number_of_words - 1;
let permutations = permutations_cache::PermutationsCache::new(number_of_words); let permutations = permutations_cache::PermutationsCache::new(number_of_words);
let result = anagram_finder::find_anagrams(&dictionary, number_of_words); let anagram_vectors = anagram_finder::find_anagrams(&dictionary, number_of_words);
result.par_iter() anagram_vectors.par_iter()
.for_each(|anagram_vector| anagram_logger::log_anagrams(anagram_vector, &dictionary, &permutations, phrase_length)); .for_each(|anagram_vector| {
anagram_analyzer::analyze_anagrams(anagram_vector, &dictionary, &permutations, phrase_length, &hashes_to_find)
});
} }
} }

Loading…
Cancel
Save