First draft

main
Inga 🏳‍🌈 3 years ago
parent 89246c63da
commit ddd9ae9b33
  1. 2
      .gitignore
  2. 10
      Cargo.toml
  3. 30
      README.md
  4. 14
      data/words-lite.txt
  5. 99175
      data/words.txt
  6. 24
      src/anagram_finder.rs
  7. 1
      src/hash_computer.rs
  8. 56
      src/main.rs
  9. 7
      src/read_lines.rs
  10. 93
      src/vector_alphabet.rs

2
.gitignore vendored

@ -8,3 +8,5 @@ Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
.vs

@ -0,0 +1,10 @@
[package]
name = "hello_cargo"
version = "0.1.0"
authors = ["inga-lovinde <52715130+inga-lovinde@users.noreply.github.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
packed_simd = { version = "0.3.4", package = "packed_simd_2" }

@ -1 +1,29 @@
# TrustPilotChallengeRust
# TrustPilotChallengeRust
TrustPilot had this challenge (http://followthewhiterabbit.trustpilot.com/)
where you had to, given the dictionary, and given three MD5 hashes,
find anagrams of a phrase *"poultry outwits ants"* which result in these hashes.
My original solution was a mixture of C# and plain C (with a bit of Visual C++
as a bridge), and heavily used AVX2 intrinsics for optimization.
Rust now has a decent API frontend for AVX2 intrinsics
(https://rust-lang.github.io/packed_simd/packed_simd_2/, and soon-to-be `std::simd`),
so it makes perfect sense to try and reimplement the same ideas with Rust.
The problem will sound a bit different: given a dictionary and given a string,
find all anagrams no longer than N words and no longer than 27 bytes
which produce given MD5 hashes.
(The limit on the number of words is necessary, because there are single-letter words
in the dictionary; and it makes the total number of anagrams astronomically large)
This is a working draft, so far the code is extremely dirty (this is my first Rust project),
and it only lists all anagrams (not including words reordering)
and does not yet do actual MD5 calculation.
How to run to solve the original task for three-word anagrams:
```
cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants"
```

@ -0,0 +1,14 @@
qq
poultry
outwits
ants
poultrypoultry
outwitsoutwits
antsants
p
o
a
pp
oo
aa
zz

File diff suppressed because it is too large Load Diff

@ -0,0 +1,24 @@
use crate::vector_alphabet;
/// Recursively finds every combination of at most `remaining_depth` words from
/// `current_words` whose combined letter counts exactly consume `remainder`.
///
/// `current_words` pairs each word with its precomputed letter-count vector.
/// Each recursive call only looks at the tail of the slice starting at the
/// current index, so every word combination is produced once, in
/// non-decreasing dictionary order (reorderings are not enumerated).
///
/// Returns one `Vec<String>` per anagram; empty when `remainder` cannot be
/// consumed exactly within the depth budget.
pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize, current_words: &[(String, vector_alphabet::VectorWithMetadata)]) -> Vec<Vec<String>> {
    if remaining_depth == 0 {
        // A complete anagram was assembled iff no letters are left over.
        return if remainder.norm == 0 { vec![vec![]] } else { vec![] };
    }

    current_words
        .iter()
        .enumerate()
        .flat_map(|(index, (word, word_metadata))| {
            match remainder.safe_substract(&word_metadata.vector) {
                Some(new_remainder) => {
                    find_anagrams(&new_remainder, remaining_depth - 1, &current_words[index..])
                        .into_iter() // consume the recursion result: no per-phrase re-clone
                        .map(|partial_phrase| {
                            // Prepend the current word, cloning only the word itself.
                            let mut phrase = Vec::with_capacity(partial_phrase.len() + 1);
                            phrase.push(word.clone());
                            phrase.extend(partial_phrase);
                            phrase
                        })
                        .collect::<Vec<_>>()
                }
                None => vec![],
            }
        })
        .collect()
}

@ -0,0 +1 @@
// Upper bound on the byte length of a candidate result phrase, separating
// spaces included (main.rs checks `non-space bytes + words - 1` against this
// before searching). NOTE(review): why exactly 27 is not evident from this
// file — presumably tied to the planned MD5/SIMD hashing; confirm.
pub const MAX_PHRASE_LENGTH: usize = 27;

@ -0,0 +1,56 @@
use std::env;
mod anagram_finder;
mod hash_computer;
mod read_lines;
mod vector_alphabet;
/// Entry point: parses CLI arguments, filters the dictionary down to words
/// that fit inside the source phrase, and prints every anagram found.
fn main() {
    let args: Vec<_> = env::args().collect();
    println!("{:?}", args);

    // Positional arguments: dictionary file, hashes file, word-count limit, source phrase.
    let words_file_path = &args[1];
    let hashes_file_path = &args[2]; // read but not consumed yet in this draft
    let max_number_of_words = args[3].parse::<usize>().unwrap();
    let phrase = &args[4];

    // Longest possible result = all non-space bytes plus one separating space
    // between each pair of words; reject limits that cannot fit.
    let phrase_byte_length_without_spaces = phrase.bytes().filter(|&b| b != b' ').count();
    let result_byte_length = phrase_byte_length_without_spaces + max_number_of_words - 1;
    if result_byte_length > hash_computer::MAX_PHRASE_LENGTH {
        panic!("Words number limit exceeded")
    }

    let alphabet = vector_alphabet::Alphabet::new(phrase).unwrap();
    let phrase_with_metadata = alphabet.vectorize(phrase).unwrap();

    // Sorted, deduplicated dictionary.
    let mut words = read_lines::lines_from_file(words_file_path).unwrap();
    words.sort();
    words.dedup();

    // Keep only words expressible in this alphabet whose letter counts
    // fit inside the source phrase.
    let words_with_vectors: Vec<_> = words
        .into_iter()
        .filter_map(|word| {
            alphabet.vectorize(&word).and_then(|vector_with_metadata| {
                if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) {
                    Some((word, vector_with_metadata))
                } else {
                    None
                }
            })
        })
        .collect();

    let result = anagram_finder::find_anagrams(
        &phrase_with_metadata.vector,
        max_number_of_words,
        &words_with_vectors,
    );
    for result_words in result {
        println!("{}", result_words.join(" "))
    }
}

@ -0,0 +1,7 @@
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
/// Reads a text file and returns its lines (line terminators stripped).
///
/// Returns the first I/O error encountered while opening or reading.
pub fn lines_from_file(filename: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let file = File::open(filename)?;
    let reader = io::BufReader::new(file);
    let mut lines = Vec::new();
    for line in reader.lines() {
        lines.push(line?);
    }
    Ok(lines)
}

@ -0,0 +1,93 @@
use std::collections::HashMap;
use packed_simd;
/// A letter-count vector over an alphabet of at most 32 distinct characters.
#[derive(Debug)]
pub struct Vector {
    /// Total number of counted characters (sum of all lanes).
    pub norm: usize,
    /// One u8 count per alphabet offset, packed for SIMD comparison/subtraction.
    simd_vector: packed_simd::u8x32,
}
/// A letter-count `Vector` together with the normalized string it was built from.
#[derive(Debug)]
pub struct VectorWithMetadata {
    /// The source phrase's non-space characters, sorted and concatenated
    /// (see `Alphabet::vectorize`).
    pub key: String,
    /// Per-character counts of `key`.
    pub vector: Vector,
}
impl Vector {
    /// Builds a `Vector` from a 32-slot letter-count array and its
    /// precomputed norm (total number of letters).
    fn new(&array: &[u8; 32], norm: usize) -> Vector {
        Vector {
            norm,
            simd_vector: packed_simd::u8x32::from_slice_unaligned(&array),
        }
    }

    /// True when every per-letter count in `self` is <= the matching count
    /// in `other`, i.e. `self` can be assembled from the letters of `other`.
    pub fn is_subset_of(&self, other: &Vector) -> bool {
        // `gt` already yields an m8x32 mask, so no cast is needed:
        // subset iff no lane of self exceeds the corresponding lane of other.
        self.simd_vector.gt(other.simd_vector).none()
    }

    /// Letter-wise subtraction. Returns `None` when `vector_to_substract`
    /// is not a subset of `self` (the u8 lanes would underflow).
    pub fn safe_substract(&self, vector_to_substract: &Vector) -> Option<Vector> {
        if vector_to_substract.is_subset_of(self) {
            Some(Vector {
                norm: self.norm - vector_to_substract.norm,
                simd_vector: self.simd_vector - vector_to_substract.simd_vector,
            })
        } else {
            None
        }
    }
}
/// Maps each distinct character of the source phrase to a lane offset (0..32)
/// inside the letter-count vector.
pub struct Alphabet {
    chars_to_offsets: HashMap<char, usize>,
}
impl Alphabet {
    /// Builds an alphabet from the distinct non-space characters of `phrase`.
    ///
    /// Fails when the phrase uses more than 32 distinct characters, since the
    /// per-character counts must fit into a single u8x32 SIMD vector.
    pub fn new(phrase: &str) -> Result<Alphabet, &'static str> {
        let mut chars: Vec<_> = phrase.chars().filter(|&ch| ch != ' ').collect();
        chars.sort();
        chars.dedup();

        if chars.len() > 32 {
            return Err("Number of different chars should not exceed 32");
        }

        // Assign each distinct character its lane index in sorted order.
        // (The original draft also built a reverse offsets->chars table,
        // but it was never stored or read, so it is dropped here.)
        let chars_to_offsets: HashMap<char, usize> = chars
            .iter()
            .enumerate()
            .map(|(pos, &ch)| (ch, pos))
            .collect();

        Ok(Alphabet { chars_to_offsets })
    }

    /// Converts `phrase` (spaces ignored) into a letter-count vector plus its
    /// normalized key (sorted characters).
    ///
    /// Returns `None` when the phrase contains a character outside this
    /// alphabet, or when any single character occurs more than 128 times
    /// (guarding the u8 lane against overflow).
    pub fn vectorize(&self, phrase: &str) -> Option<VectorWithMetadata> {
        let mut chars: Vec<_> = phrase.chars().filter(|&ch| ch != ' ').collect();
        chars.sort();

        let norm = chars.len();
        let mut array: [u8; 32] = [0; 32];
        for ch in &chars {
            // `?` bails out on characters the alphabet does not contain.
            let &index = self.chars_to_offsets.get(ch)?;
            // Check before incrementing so a lane never exceeds 128.
            if array[index] > 127 {
                return None;
            }
            array[index] += 1;
        }

        let key: String = chars.into_iter().collect();
        Some(VectorWithMetadata {
            key,
            vector: Vector::new(&array, norm),
        })
    }
}
Loading…
Cancel
Save