First draft

4 years ago · ddd9ae9b33
parent 89246c63da
commit ddd9ae9b33
10 changed files with 99411 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,5 @@ Cargo.lock
 # These are backup files generated by rustfmt
 **/*.rs.bk
 .vs
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,10 @@
 [package]
 name = "hello_cargo"
 version = "0.1.0"
 authors = ["inga-lovinde <52715130+inga-lovinde@users.noreply.github.com>"]
 edition = "2018"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
 packed_simd = { version = "0.3.4", package = "packed_simd_2" }
--- a/README.md
+++ b/README.md
@ -1 +1,29 @@
 # TrustPilotChallengeRust
 TrustPilot had this challenge (http://followthewhiterabbit.trustpilot.com/)
 where you had to, given the dictionary, and given three MD5 hashes,
 find anagrams of a phrase *"poultry outwits ants"* which result in these hashes.
 My original solution was a mixture of C# and plain C (with a bit of Visual C++
 as a bridge), and heavily used AVX2 intrinsics for optimization.
 Rust now has a decent API frontend for AVX2 intrinsics 
 (https://rust-lang.github.io/packed_simd/packed_simd_2/, and soon-to-be `std::simd`),
 so it makes perfect sense to try and reimplement the same ideas with Rust.
 The problem will sound a bit different: given a dictionary and given a string,
 find all anagrams no longer than N words and no longer than 27 bytes
 which produce given MD5 hashes.
 (The limit on the number of words is necessary because there are single-letter words
 in the dictionary, which make the total number of anagrams astronomically large.)
 This is a working draft; so far the code is extremely dirty (this is my first Rust project).
 It only lists all anagrams (not including reorderings of the same words)
 and does not yet do the actual MD5 calculation.
 How to run to solve the original task for three-word anagrams:
 ```
 cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants"
 ```
--- a/data/words-lite.txt
+++ b/data/words-lite.txt
@ -0,0 +1,14 @@
 qq
 poultry
 outwits
 ants
 poultrypoultry
 outwitsoutwits
 antsants
 p
 o
 a
 pp
 oo
 aa
 zz
--- a/data/words.txt
+++ b/data/words.txt
--- a/src/anagram_finder.rs
+++ b/src/anagram_finder.rs
@ -0,0 +1,24 @@
 use crate::vector_alphabet;
 /// Lists every multiset of dictionary words that exactly uses up `remainder`.
 ///
 /// * `remainder` - character counts that still have to be covered.
 /// * `remaining_depth` - maximum number of further words that may be used.
 /// * `current_words` - candidate words with their precomputed vectors.
 ///
 /// Each returned phrase contains at most `remaining_depth` words. Words appear in
 /// non-decreasing `current_words` order (the recursion only looks at
 /// `current_words[index..]`), so the same set of words is never reported in two
 /// different orders, while a word may still be repeated.
 ///
 /// Zero-length words (norm 0) are skipped: they cannot reduce the remainder and
 /// would only produce duplicates padded with empty strings.
 pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize, current_words: &[(String, vector_alphabet::VectorWithMetadata)]) -> Vec<Vec<String>> {
    // Nothing left to cover: the words chosen so far already form a full anagram,
    // regardless of how much depth is still available.
    if remainder.norm == 0 {
        return vec![vec![]];
    }
    // Characters remain but no more words may be added: dead end.
    if remaining_depth == 0 {
        return vec![];
    }
    current_words.iter()
        .enumerate()
        .filter(|(_, (_, word_metadata))| word_metadata.vector.norm > 0)
        .filter_map(|(index, (word, word_metadata))| {
            remainder.safe_substract(&word_metadata.vector)
                .map(|new_remainder| (index, word, new_remainder))
        })
        .flat_map(|(index, word, new_remainder)| {
            find_anagrams(&new_remainder, remaining_depth - 1, &current_words[index..])
                .into_iter()
                .map(move |mut partial_phrase| {
                    // Reuse the tail vector instead of cloning every word again.
                    partial_phrase.insert(0, word.clone());
                    partial_phrase
                })
        })
        .collect()
 }
--- a/src/hash_computer.rs
+++ b/src/hash_computer.rs
@ -0,0 +1 @@
 /// Maximum length, in bytes, of a phrase (words plus separating spaces) that the program accepts.
 pub const MAX_PHRASE_LENGTH: usize = 27;
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,56 @@
 use std::env;
 mod anagram_finder;
 mod hash_computer;
 mod read_lines;
 mod vector_alphabet;
 /// Prints `message` to stderr and terminates the process with exit status 1.
 fn exit_with_error(message: &str) -> ! {
    eprintln!("{}", message);
    std::process::exit(1)
 }
 /// Entry point: prints anagrams of a phrase built from dictionary words.
 ///
 /// Expected arguments: `<words-file> <hashes-file> <max-number-of-words> <phrase>`.
 /// The hashes file path is accepted but not read yet. Invalid or missing input is
 /// reported on stderr and the process exits with status 1 instead of panicking.
 fn main() {
    let args: Vec<_> = env::args().collect();
    println!("{:?}", args);
    if args.len() < 5 {
        let program = args.first().map(String::as_str).unwrap_or("hello_cargo");
        exit_with_error(&format!(
            "Usage: {} <words-file> <hashes-file> <max-number-of-words> <phrase>",
            program
        ));
    }
    let words_file_path = &args[1];
    // Not used until MD5 checking is implemented.
    let _hashes_file_path = &args[2];
    let max_number_of_words: usize = args[3].parse().unwrap_or_else(|err| {
        exit_with_error(&format!("Invalid max number of words {:?}: {}", args[3], err))
    });
    let phrase = &args[4];
    let phrase_byte_length_without_spaces = phrase.bytes().filter(|&b| b != b' ').count();
    // Longest possible result: all letters plus one space between each pair of words.
    // saturating_sub keeps an empty phrase with zero words from underflowing.
    let result_byte_length = (phrase_byte_length_without_spaces + max_number_of_words).saturating_sub(1);
    if result_byte_length > hash_computer::MAX_PHRASE_LENGTH {
        exit_with_error("Words number limit exceeded");
    }
    let alphabet = vector_alphabet::Alphabet::new(phrase)
        .unwrap_or_else(|err| exit_with_error(err));
    let phrase_with_metadata = alphabet.vectorize(phrase)
        .unwrap_or_else(|| exit_with_error("Phrase cannot be represented as a vector"));
    let mut words = read_lines::lines_from_file(words_file_path).unwrap_or_else(|err| {
        exit_with_error(&format!("Cannot read words file {:?}: {}", words_file_path, err))
    });
    words.sort();
    words.dedup();
    // Keep only words that can be spelled with the letters of the phrase.
    let words_with_vectors: Vec<_> = words
        .into_iter()
        .filter_map(|word| {
            let vector_with_metadata = alphabet.vectorize(&word)?;
            if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) {
                Some((word, vector_with_metadata))
            } else {
                None
            }
        })
        .collect();
    let result = anagram_finder::find_anagrams(&phrase_with_metadata.vector, max_number_of_words, &words_with_vectors);
    for result_words in result {
        println!("{}", result_words.join(" "))
    }
 }
--- a/src/read_lines.rs
+++ b/src/read_lines.rs
@ -0,0 +1,7 @@
 use std::fs::File;
 use std::io::{self, BufRead};
 use std::path::Path;
 /// Reads the file at `filename` and returns its lines, without line terminators.
 ///
 /// Fails with the underlying I/O error if the file cannot be opened or a line
 /// cannot be read.
 pub fn lines_from_file(filename: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let file = File::open(filename)?;
    let reader = io::BufReader::new(file);
    let mut lines = Vec::new();
    for line in reader.lines() {
        lines.push(line?);
    }
    Ok(lines)
 }
--- a/src/vector_alphabet.rs
+++ b/src/vector_alphabet.rs
@ -0,0 +1,93 @@
 use std::collections::HashMap;
 use packed_simd;
 /// Character-count vector: one `u8` lane per alphabet offset, plus the total count.
 #[derive(Debug)]
 pub struct Vector {
    /// Total number of characters represented by this vector.
    pub norm: usize,
    /// Per-offset occurrence counts packed into 32 byte lanes.
    simd_vector: packed_simd::u8x32,
 }
 /// A `Vector` together with the text it was computed from.
 #[derive(Debug)]
 pub struct VectorWithMetadata {
    /// The non-space characters of the source text, in sorted order.
    pub key: String,
    /// Character counts of the source text.
    pub vector: Vector,
 }
 impl Vector {
    /// Wraps 32 per-offset counts and their precomputed total into a `Vector`.
    fn new(array: &[u8; 32], norm: usize) -> Vector {
        let simd_vector = packed_simd::u8x32::from_slice_unaligned(array);
        Vector { norm, simd_vector }
    }
    /// Returns `true` when no lane of `self` is greater than the matching lane of `other`.
    pub fn is_subset_of(&self, other: &Vector) -> bool {
        let exceeds_mask: packed_simd::m8x32 = self.simd_vector.gt(other.simd_vector);
        exceeds_mask.none()
    }
    /// Subtracts `vector_to_substract` lane by lane.
    ///
    /// Returns `None` when `vector_to_substract` is not a subset of `self`, so the
    /// lane-wise and norm subtractions are only performed when no lane would go below zero.
    pub fn safe_substract(&self, vector_to_substract: &Vector) -> Option<Vector> {
        if !vector_to_substract.is_subset_of(self) {
            return None;
        }
        let norm = self.norm - vector_to_substract.norm;
        let simd_vector = self.simd_vector - vector_to_substract.simd_vector;
        Some(Vector { norm, simd_vector })
    }
 }
 /// Mapping from the distinct non-space characters of a phrase to vector lane offsets.
 pub struct Alphabet {
    /// Lane offset assigned to each character; characters are numbered in sorted order.
    chars_to_offsets: HashMap<char, usize>,
 }
 impl Alphabet {
    /// Builds the alphabet of `phrase`: every distinct non-space character gets an
    /// offset, assigned in sorted character order.
    ///
    /// Returns an error when the phrase contains more than 32 distinct characters,
    /// since a vector only has 32 lanes.
    pub fn new(phrase: &str) -> Result<Alphabet, &'static str> {
        let mut chars: Vec<char> = phrase.chars().filter(|&ch| ch != ' ').collect();
        chars.sort();
        chars.dedup();
        if chars.len() > 32 {
            return Err("Number of different chars should not exceed 32");
        }
        let chars_to_offsets: HashMap<char, usize> = chars
            .into_iter()
            .enumerate()
            .map(|(pos, ch)| (ch, pos))
            .collect();
        Ok(Alphabet { chars_to_offsets })
    }
    /// Counts the non-space characters of `phrase` into a vector over this alphabet.
    ///
    /// Returns `None` when `phrase` contains a character outside the alphabet, or
    /// when a single character occurs more than 128 times.
    pub fn vectorize(&self, phrase: &str) -> Option<VectorWithMetadata> {
        let mut chars: Vec<char> = phrase.chars().filter(|&ch| ch != ' ').collect();
        chars.sort();
        let norm = chars.len();
        let mut array = [0u8; 32];
        for ch in &chars {
            // Unknown character: the text cannot be expressed in this alphabet.
            let index = *self.chars_to_offsets.get(ch)?;
            // Per-character counts are capped; one more occurrence is rejected.
            if array[index] > 127 {
                return None;
            }
            array[index] += 1;
        }
        Some(VectorWithMetadata {
            key: chars.into_iter().collect(),
            vector: Vector::new(&array, norm),
        })
    }
 }