From 57f387737870b49ddd4b0907af698d20bc0844e5 Mon Sep 17 00:00:00 2001 From: inga-lovinde <52715130+inga-lovinde@users.noreply.github.com> Date: Tue, 1 Dec 2020 00:36:49 +0100 Subject: [PATCH] Refactoring; all anagrams are printed --- README.md | 56 +++++++++++++++++++++++++++++++++++-- src/anagram_finder.rs | 17 ++++++++---- src/anagram_logger.rs | 15 ++++++++++ src/dictionary_builder.rs | 58 +++++++++++++++++++++++++++++++++++++++ src/main.rs | 48 ++++++++++++-------------------- src/vector_alphabet.rs | 2 +- 6 files changed, 157 insertions(+), 39 deletions(-) create mode 100644 src/anagram_logger.rs create mode 100644 src/dictionary_builder.rs diff --git a/README.md b/README.md index 4313e63..a8c39a9 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ # TrustPilotChallengeRust -TrustPilot had this challenge (http://followthewhiterabbit.trustpilot.com/) +TrustPilot had this challenge several years ago +(http://followthewhiterabbit.trustpilot.com/) where you had to, given the dictionary, and given three MD5 hashes, -find anagrams of a phrase *"poultry outwits ants"* which result in these hashes. +find three-word anagrams of a phrase *"poultry outwits ants"* +which result in these hashes. My original solution was in mixture of C# and plain C (with a bit of Visual C++ as a bridge), and heavily used AVX2 intrinsics for optimization. @@ -22,8 +24,58 @@ This is a working draft, so far the code is extremely dirty (this is my first Ru and it only lists all anagrams (not including words reordering) and does not yet do actual MD5 calculation. +## Algorithm description + +Notably this solution does not involve string concatenation; strings are only concatenated for debugging purposes. + +We could split the problem into three parts: finding all anagrams +(up to words reordering and replacing some of the words with their single-word anagrams), +finding all anagrams taking into account words order, +and checking their hashes against the supplied list. 
+ +### Finding all anagrams, pt. 1 + +For every string (ignoring spaces) we could define a vector in Z^N space, with its i-th coordinate +matching the number of occurrences of character i in the string. + +Two strings are anagrams of each other if and only if their vectors are the same. + +The vector for a concatenation of two strings is the sum of the vectors for these two strings. + +This means that the task of finding anagrams for a phrase reduces to the task of finding +subsets of vectors (out of the set of vectors for all dictionary words) which add up +to the vector for the original phrase. +Since all coordinates are non-negative, only vectors which are contained in a hyperrectangle +defined by the target vector (that is, which have all coordinates not larger +than the target vector; that is, whose corresponding words can be spelled using the letters of the target phrase, counting multiplicity) +could belong to such subsets. + +Additionally, if the source phrase contains no more than 32 different characters, +and each of these no more than 255 times, we could limit ourselves to u8x32 vectors +instead of vectors in Z^N. +That way we can "concatenate" strings or "compare" them for anagrams in a single CPU cycle. + +The naive solution of finding fixed-length subsets of vectors which add up to a given vector +could be further optimized, resulting in the following algorithm: + +1. Sort all vectors by their norm (length of the original word), largest first; +2. Find all target subsets such that the order of items in the subset is compatible with their order in the sorted list of vectors +2. 
For the number of words N, the requested phrase P, and the offset K (originally 0) check: + * If N is 0 and P is non-zero, there are no solutions; + * If N is 0 and P is zero, there is a trivial solution (empty subset); + * If N is larger than 0, let us find the first vector of a target subset: + * For every vector W starting with offset K + (while its norm times N is not less than the norm of P) + * If the norm of W is not larger than the norm of P and all coordinates of W are not larger than those of P: + * W might be one element of a target subset, and the remaining elements could be found + by solving the task 2 for N-1, P-W and position of W in the list of vectors. + +## How to run + How to run to solve the original task for three-word anagrams: ``` cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants" ``` + +(Note that a CPU with AVX2 support is required; that is, Intel Haswell (2013) or newer, or AMD Excavator (2015) or newer) \ No newline at end of file diff --git a/src/anagram_finder.rs b/src/anagram_finder.rs index 33dbf5f..a0571f0 100644 --- a/src/anagram_finder.rs +++ b/src/anagram_finder.rs @@ -1,6 +1,7 @@ +use crate::dictionary_builder; use crate::vector_alphabet; -pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize, current_words: &[(String, vector_alphabet::VectorWithMetadata)]) -> Vec<Vec<String>> { +fn find_anagrams_recursive(remainder: &vector_alphabet::Vector, remaining_depth: usize, word_vectors: &[vector_alphabet::Vector], offset: usize) -> Vec<Vec<usize>> { if remaining_depth == 0 { if remainder.norm == 0 { return vec![vec![]]; @@ -8,13 +9,15 @@ pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize return vec![]; } - current_words.iter() + word_vectors.iter() .enumerate() - .map(|(index, (word, word_metadata))| match remainder.safe_substract(&word_metadata.vector) { - Some(new_remainder) => find_anagrams(&new_remainder, remaining_depth-1, &current_words[index..]) + .skip(offset) + .take_while(|(_, vector)| vector.norm * 
remaining_depth >= remainder.norm) + .map(|(index, vector)| match remainder.safe_substract(&vector) { + Some(new_remainder) => find_anagrams_recursive(&new_remainder, remaining_depth-1, word_vectors, index) .iter() .map(|partial_phrase| { - vec![word.clone()].iter().chain(partial_phrase).cloned().collect() + vec![index].iter().chain(partial_phrase).cloned().collect() }) .collect(), _ => vec![], @@ -22,3 +25,7 @@ pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize .flatten() .collect() } + +pub fn find_anagrams(dictionary: &dictionary_builder::Dictionary, number_of_words: usize) -> Vec<Vec<usize>> { + find_anagrams_recursive(&dictionary.phrase_vector, number_of_words, &dictionary.vectors, 0) +} diff --git a/src/anagram_logger.rs b/src/anagram_logger.rs new file mode 100644 index 0000000..d1d32ca --- /dev/null +++ b/src/anagram_logger.rs @@ -0,0 +1,15 @@ +use crate::dictionary_builder; + +pub fn get_anagram_view(anagram: Vec<usize>, dictionary: &dictionary_builder::Dictionary) -> String { + anagram.iter() + .map(|&index| { + let word_options = &dictionary.words[index]; + if word_options.len() == 1 { + word_options[0].clone() + } else { + format!("[{}]", word_options.join(",")) + } + }) + .collect::<Vec<_>>() + .join(" ") +} \ No newline at end of file diff --git a/src/dictionary_builder.rs b/src/dictionary_builder.rs new file mode 100644 index 0000000..993f3e0 --- /dev/null +++ b/src/dictionary_builder.rs @@ -0,0 +1,58 @@ +use std::collections::HashMap; +use crate::vector_alphabet; + +pub struct Dictionary { + pub phrase_vector: vector_alphabet::Vector, + pub vectors: Vec<vector_alphabet::Vector>, + pub words: Vec<Vec<String>>, +} + +pub fn build_dictionary(phrase: &String, unique_words: &[String]) -> Dictionary { + let alphabet = vector_alphabet::Alphabet::new(phrase).unwrap(); + + let phrase_with_metadata = alphabet.vectorize(phrase).unwrap(); + + let words_with_vectors: Vec<_> = unique_words + .into_iter() + .map(|word| { + let vector_option = alphabet.vectorize(&word); + match vector_option { + 
Some(vector_with_metadata) => { + if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) { + return Some((word, vector_with_metadata)); + } else { + return None; + } + } + None => { + return None; + } + } + }) + .flatten() + .collect(); + + let mut words_by_vectors: HashMap<_, _> = HashMap::new(); + for (word, vector_with_metadata) in words_with_vectors { + let (_, words_for_vector) = words_by_vectors.entry(vector_with_metadata.key).or_insert((vector_with_metadata.vector, vec![])); + words_for_vector.push(word.clone()); + } + + let mut words_by_vectors: Vec<_> = words_by_vectors.into_values().collect(); + words_by_vectors.sort_by_key(|(vector, _)| vector.norm); + words_by_vectors.reverse(); + + let mut vectors = vec![]; + let mut words_by_vectors_vec = vec![]; + + for (vector, words_by_vector) in words_by_vectors { + vectors.push(vector); + words_by_vectors_vec.push(words_by_vector); + } + + Dictionary { + phrase_vector: phrase_with_metadata.vector, + vectors, + words: words_by_vectors_vec, + } +} diff --git a/src/main.rs b/src/main.rs index 77bd665..74340c1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,11 @@ +#![feature(map_into_keys_values)] + +use std::cmp; use std::env; mod anagram_finder; +mod anagram_logger; +mod dictionary_builder; mod hash_computer; mod read_lines; mod vector_alphabet; @@ -11,46 +16,27 @@ fn main() { let words_file_path = &args[1]; let hashes_file_path = &args[2]; - let max_number_of_words = &args[3].parse::<usize>().unwrap(); + let max_requested_number_of_words = (&args[3]).parse::<usize>().unwrap(); let phrase = &args[4]; let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count(); - let result_byte_length = phrase_byte_length_without_spaces + max_number_of_words - 1; + let max_supported_number_of_words = (hash_computer::MAX_PHRASE_LENGTH - phrase_byte_length_without_spaces) + 1; - if 
max_requested_number_of_words > max_supported_number_of_words { + println!("Requested number of words unsupported; using {} as maximum number of words", max_supported_number_of_words); } - - let alphabet = vector_alphabet::Alphabet::new(phrase).unwrap(); - - let phrase_with_metadata = alphabet.vectorize(phrase).unwrap(); + let max_number_of_words = cmp::min(max_requested_number_of_words, max_supported_number_of_words); let mut words = read_lines::lines_from_file(words_file_path).unwrap(); words.sort(); words.dedup(); - let words_with_vectors: Vec<_> = words - .into_iter() - .map(|word| { - let vector_option = alphabet.vectorize(&word); - match vector_option { - Some(vector_with_metadata) => { - if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) { - return Some((word, vector_with_metadata)); - } else { - return None; - } - } - None => { - return None; - } - } - }) - .flatten() - .collect(); - - let result = anagram_finder::find_anagrams(&phrase_with_metadata.vector, *max_number_of_words, &words_with_vectors); - for result_words in result { - println!("{}", result_words.join(" ")) + let dictionary = dictionary_builder::build_dictionary(phrase, &words); + + for number_of_words in 0..=max_number_of_words { + let result = anagram_finder::find_anagrams(&dictionary, number_of_words); + for anagram in result { + println!("{}", anagram_logger::get_anagram_view(anagram, &dictionary)); + } } } diff --git a/src/vector_alphabet.rs b/src/vector_alphabet.rs index 9f96add..655d6a1 100644 --- a/src/vector_alphabet.rs +++ b/src/vector_alphabet.rs @@ -74,7 +74,7 @@ impl Alphabet { for ch in &chars { match self.chars_to_offsets.get(&ch) { Some(&index) => { - if array[index] > 127 { + if array[index] >= u8::MAX { return None; }