parent
89246c63da
commit
ddd9ae9b33
@ -0,0 +1,10 @@ |
||||
[package] |
||||
name = "hello_cargo" |
||||
version = "0.1.0" |
||||
authors = ["inga-lovinde <52715130+inga-lovinde@users.noreply.github.com>"] |
||||
edition = "2018" |
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html |
||||
|
||||
[dependencies] |
||||
packed_simd = { version = "0.3.4", package = "packed_simd_2" } |
@ -1 +1,29 @@ |
||||
# TrustPilotChallengeRust |
||||
|
||||
TrustPilot had this challenge (http://followthewhiterabbit.trustpilot.com/) |
||||
where you had to, given the dictionary, and given three MD5 hashes, |
||||
find anagrams of a phrase *"poultry outwits ants"* which result in these hashes. |
||||
|
||||
My original solution was in a mixture of C# and plain C (with a bit of Visual C++
||||
as a bridge), and heavily used AVX2 intrinsics for optimization. |
||||
|
||||
Rust now has a decent API frontend for AVX2 intrinsics |
||||
(https://rust-lang.github.io/packed_simd/packed_simd_2/, and soon-to-be `std::simd`), |
||||
so it makes perfect sense to try and reimplement the same ideas with Rust. |
||||
|
||||
The problem will sound a bit different: given a dictionary and given a string, |
||||
find all anagrams of no more than N words and no longer than 27 bytes
||||
which produce given MD5 hashes. |
||||
|
||||
(The limit on the number of words is necessary, because there are single-letter words
||||
in the dictionary; and it makes the total number of anagrams astronomically large) |
||||
|
||||
This is a working draft, so far the code is extremely dirty (this is my first Rust project), |
||||
and it only lists all anagrams (not including words reordering) |
||||
and does not yet do actual MD5 calculation. |
||||
|
||||
How to run to solve the original task for three-word anagrams: |
||||
|
||||
``` |
||||
cargo run data\words.txt data\hashes.txt 3 "poultry outwits ants" |
||||
``` |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@ |
||||
use crate::vector_alphabet; |
||||
|
||||
pub fn find_anagrams(remainder: &vector_alphabet::Vector, remaining_depth: usize, current_words: &[(String, vector_alphabet::VectorWithMetadata)]) -> Vec<Vec<String>> { |
||||
if remaining_depth == 0 { |
||||
if remainder.norm == 0 { |
||||
return vec![vec![]]; |
||||
} |
||||
return vec![]; |
||||
} |
||||
|
||||
current_words.iter() |
||||
.enumerate() |
||||
.map(|(index, (word, word_metadata))| match remainder.safe_substract(&word_metadata.vector) { |
||||
Some(new_remainder) => find_anagrams(&new_remainder, remaining_depth-1, ¤t_words[index..]) |
||||
.iter() |
||||
.map(|partial_phrase| { |
||||
vec![word.clone()].iter().chain(partial_phrase).cloned().collect() |
||||
}) |
||||
.collect(), |
||||
_ => vec![], |
||||
}) |
||||
.flatten() |
||||
.collect() |
||||
} |
@ -0,0 +1 @@ |
||||
/// Maximum supported length, in bytes, of a result phrase (letters plus the
/// single spaces between words). `main` rejects inputs whose worst-case
/// phrase would exceed this. -- NOTE(review): presumably this bound comes
/// from the fixed-size buffer used by the MD5/SIMD hashing code; confirm
/// against the hash_computer implementation once it is complete.
pub const MAX_PHRASE_LENGTH: usize = 27;
@ -0,0 +1,56 @@ |
||||
use std::env; |
||||
|
||||
mod anagram_finder; |
||||
mod hash_computer; |
||||
mod read_lines; |
||||
mod vector_alphabet; |
||||
|
||||
fn main() { |
||||
let args: Vec<_> = env::args().collect(); |
||||
println!("{:?}", args); |
||||
|
||||
let words_file_path = &args[1]; |
||||
let hashes_file_path = &args[2]; |
||||
let max_number_of_words = &args[3].parse::<usize>().unwrap(); |
||||
let phrase = &args[4]; |
||||
|
||||
let phrase_byte_length_without_spaces = phrase.as_bytes().into_iter().filter(|&b| *b != b' ').count(); |
||||
let result_byte_length = phrase_byte_length_without_spaces + max_number_of_words - 1; |
||||
|
||||
if result_byte_length > hash_computer::MAX_PHRASE_LENGTH { |
||||
panic!("Words number limit exceeded") |
||||
} |
||||
|
||||
let alphabet = vector_alphabet::Alphabet::new(phrase).unwrap(); |
||||
|
||||
let phrase_with_metadata = alphabet.vectorize(phrase).unwrap(); |
||||
|
||||
let mut words = read_lines::lines_from_file(words_file_path).unwrap(); |
||||
words.sort(); |
||||
words.dedup(); |
||||
|
||||
let words_with_vectors: Vec<_> = words |
||||
.into_iter() |
||||
.map(|word| { |
||||
let vector_option = alphabet.vectorize(&word); |
||||
match vector_option { |
||||
Some(vector_with_metadata) => { |
||||
if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) { |
||||
return Some((word, vector_with_metadata)); |
||||
} else { |
||||
return None; |
||||
} |
||||
} |
||||
None => { |
||||
return None; |
||||
} |
||||
} |
||||
}) |
||||
.flatten() |
||||
.collect(); |
||||
|
||||
let result = anagram_finder::find_anagrams(&phrase_with_metadata.vector, *max_number_of_words, &words_with_vectors); |
||||
for result_words in result { |
||||
println!("{}", result_words.join(" ")) |
||||
} |
||||
} |
@ -0,0 +1,7 @@ |
||||
use std::fs::File; |
||||
use std::io::{self, BufRead}; |
||||
use std::path::Path; |
||||
|
||||
/// Reads the file at `filename` and returns its lines as owned `String`s
/// (line terminators stripped).
///
/// Any I/O error — failing to open the file or to read a line — is
/// propagated to the caller; the first failing line aborts the whole read.
pub fn lines_from_file(filename: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let file = File::open(filename)?;
    let reader = io::BufReader::new(file);
    reader.lines().collect()
}
@ -0,0 +1,93 @@ |
||||
use std::collections::HashMap; |
||||
use packed_simd; |
||||
|
||||
/// A character-count vector over an alphabet of at most 32 letters, stored
/// in a single SIMD register: byte lane `i` holds how many times alphabet
/// letter `i` occurs in the vectorized phrase.
#[derive(Debug)]
pub struct Vector {
    /// Total number of counted characters (the sum of all lanes).
    pub norm: usize,
    // Per-letter counts, one u8 lane per alphabet offset.
    simd_vector: packed_simd::u8x32,
}
||||
|
||||
/// A count [`Vector`] together with the canonical key it was built from.
#[derive(Debug)]
pub struct VectorWithMetadata {
    /// The phrase's non-space characters, sorted — a canonical form shared
    /// by all anagrams of the same phrase.
    pub key: String,
    /// Character counts of the phrase over the alphabet.
    pub vector: Vector,
}
||||
|
||||
impl Vector {
    /// Constructs a count vector from 32 per-letter counts and their
    /// precomputed total (`norm`).
    fn new(&array: &[u8; 32], norm: usize) -> Vector {
        Vector {
            norm,
            simd_vector: packed_simd::u8x32::from_slice_unaligned(&array),
        }
    }

    /// Returns `true` when every per-letter count in `self` is less than or
    /// equal to the corresponding count in `other` — i.e. `self` can be
    /// spelled out of `other`'s characters.
    pub fn is_subset_of(&self, other: &Vector) -> bool {
        // Lane-wise `self > other`; the subset relation holds iff no lane
        // of self exceeds the matching lane of other.
        let comparison_result = packed_simd::u8x32::gt(self.simd_vector, other.simd_vector);
        // NOTE(review): u8x32::gt should already yield m8x32, which makes
        // the `as` cast look redundant — confirm against the packed_simd
        // API before simplifying.
        packed_simd::m8x32::none(comparison_result as packed_simd::m8x32)
    }

    /// Lane-wise subtraction of `vector_to_substract` from `self`.
    ///
    /// Returns `None` when the argument is not a subset of `self` (the
    /// subtraction would underflow some lane); otherwise returns the
    /// difference with `norm` reduced accordingly.
    pub fn safe_substract(&self, vector_to_substract: &Vector) -> Option<Vector> {
        if vector_to_substract.is_subset_of(self) {
            return Some(Vector {
                norm: self.norm - vector_to_substract.norm,
                simd_vector: self.simd_vector - vector_to_substract.simd_vector
            });
        } else {
            return None;
        }
    }
}
||||
|
||||
/// Maps the distinct characters of a reference phrase to lane offsets
/// (0..32) in the SIMD count vectors.
pub struct Alphabet {
    // Each distinct non-space character of the phrase -> its lane index.
    chars_to_offsets: HashMap<char, usize>,
}
||||
|
||||
impl Alphabet { |
||||
pub fn new(phrase: &str) -> Result<Alphabet, &'static str> { |
||||
let mut chars: Vec<_> = phrase.chars().filter(|&ch| ch != ' ').collect(); |
||||
chars.sort(); |
||||
chars.dedup(); |
||||
|
||||
if chars.len() > 32 { |
||||
return Err("Number of different chars should not exceed 32"); |
||||
} |
||||
|
||||
let mut offsets_to_chars: [char; 32] = [' '; 32]; |
||||
let mut chars_to_offsets: HashMap<char, usize> = HashMap::new(); |
||||
for (pos, ch) in chars.iter().enumerate() { |
||||
chars_to_offsets.insert(*ch, pos); |
||||
offsets_to_chars[pos] = *ch; |
||||
} |
||||
|
||||
Ok(Alphabet { |
||||
chars_to_offsets, |
||||
}) |
||||
} |
||||
|
||||
pub fn vectorize(&self, phrase: &str) -> Option<VectorWithMetadata> { |
||||
let mut chars: Vec<_> = phrase.chars().filter(|&ch| ch != ' ').collect(); |
||||
chars.sort(); |
||||
|
||||
let norm = chars.len(); |
||||
|
||||
let mut array: [u8; 32] = [0; 32]; |
||||
for ch in &chars { |
||||
match self.chars_to_offsets.get(&ch) { |
||||
Some(&index) => { |
||||
if array[index] > 127 { |
||||
return None; |
||||
} |
||||
|
||||
array[index] += 1; |
||||
}, |
||||
_ => return None, |
||||
} |
||||
} |
||||
|
||||
let key: String = chars.into_iter().collect(); |
||||
return Some(VectorWithMetadata { |
||||
key, |
||||
vector: Vector::new(&array, norm), |
||||
}); |
||||
} |
||||
} |
Loading…
Reference in new issue