You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
TrustPilotChallengeRust/src/dictionary_builder.rs

96 lines
3.0 KiB

use std::collections::HashMap;
use packed_simd::u8x32;
use crate::vector_alphabet;
pub struct WordInfo {
simd_words: [u8x32; 32],
pub length: usize,
pub word: String,
}
impl WordInfo {
fn new(word: String) -> WordInfo {
let mut byte_array: [u8; 64] = [0; 64];
let bytes = word.as_bytes();
let length = bytes.len();
byte_array[32 + length] = b' ';
for i in 0..length {
byte_array[32 + i] = bytes[i];
}
let simd_word_zero: u8x32 = u8x32::from_slice_unaligned(&[0; 32]);
let mut simd_words: [u8x32; 32] = [simd_word_zero; 32];
for i in 0..31 {
simd_words[i] = u8x32::from_slice_unaligned(&byte_array[32-i..64-i]);
}
WordInfo {
simd_words,
length,
word,
}
}
pub fn get_simd_word_for_offset(&self, offset: usize) -> u8x32 {
self.simd_words[offset]
}
}
pub struct Dictionary {
pub phrase_vector: vector_alphabet::Vector,
pub vectors: Vec<vector_alphabet::Vector>,
pub words: Vec<Vec<WordInfo>>,
}
3 years ago
impl Dictionary {
pub fn from_phrase_and_words(phrase: &str, unique_words: Vec<String>) -> Dictionary {
let alphabet = vector_alphabet::Alphabet::new(phrase).unwrap();
3 years ago
let phrase_with_metadata = alphabet.vectorize(phrase).unwrap();
3 years ago
let words_with_vectors: Vec<_> = unique_words
.into_iter()
.map(|word| {
let vector_option = alphabet.vectorize(&word);
match vector_option {
Some(vector_with_metadata) => {
if vector_with_metadata.vector.is_subset_of(&phrase_with_metadata.vector) {
return Some((word, vector_with_metadata));
} else {
return None;
}
}
None => {
return None;
}
}
3 years ago
})
.flatten()
.collect();
3 years ago
let mut words_by_vectors: HashMap<_, _> = HashMap::new();
for (word, vector_with_metadata) in words_with_vectors {
let (_, words_for_vector) = words_by_vectors.entry(vector_with_metadata.key).or_insert((vector_with_metadata.vector, vec![]));
words_for_vector.push(WordInfo::new(word));
}
3 years ago
let mut words_by_vectors: Vec<_> = words_by_vectors.into_values().collect();
words_by_vectors.sort_by_cached_key(|(vector, _)| (vector.norm, vector.get_key()));
3 years ago
words_by_vectors.reverse();
3 years ago
let mut vectors = vec![];
let mut words_by_vectors_vec = vec![];
3 years ago
for (vector, words_by_vector) in words_by_vectors {
vectors.push(vector);
words_by_vectors_vec.push(words_by_vector);
}
3 years ago
Dictionary {
phrase_vector: phrase_with_metadata.vector,
vectors,
words: words_by_vectors_vec,
}
}
}