TrustPilotChallenge/dotnet/WhiteRabbit/VectorsProcessor.cs

namespace WhiteRabbit
{
    using System;
    using System.Collections.Generic;
    using System.Collections.Immutable;
    using System.Linq;
    using System.Numerics;

    internal sealed class VectorsProcessor
    {
        // Largest value allowed in any single component of the target vector (checked by the constructor).
        private const byte MaxComponentValue = 8;
        // 840 is the least common multiple of 1..8, so multiplying by it before dividing by a target component
        // in the 1..MaxComponentValue range (see GetVectorNorm) never loses precision to integer division.
        private const int LeastCommonMultiple = 840;

        // Prepares the search structures for the given target.
        //   target          - vector that every generated sequence must add up to; no component may exceed MaxComponentValue.
        //   maxVectorsCount - maximum number of dictionary vectors allowed in a single sequence.
        //   dictionary      - candidate vectors; indexes reported by GenerateSequences refer to positions in this array.
        // Throws ArgumentException when a component of target exceeds MaxComponentValue,
        // and ArgumentNullException when dictionary is null.
        public VectorsProcessor(Vector<byte> target, int maxVectorsCount, Vector<byte>[] dictionary)
        {
            if (Enumerable.Range(0, Vector<byte>.Count).Any(i => target[i] > MaxComponentValue))
            {
                throw new ArgumentException($"Every value should be at most {MaxComponentValue} (at most {MaxComponentValue} same characters allowed in the source string)", nameof(target));
            }

            // Fail with a descriptive exception instead of a NullReferenceException inside FilterVectors.
            if (dictionary == null)
            {
                throw new ArgumentNullException(nameof(dictionary));
            }

            this.Target = target;

            this.MaxVectorsCount = maxVectorsCount;

            // FilterVectors already returns a freshly allocated array (sorted by descending norm),
            // so no extra ToArray() copy is needed before handing it to ImmutableArray.Create.
            this.Dictionary = ImmutableArray.Create(FilterVectors(dictionary, target));

            // normsIndex[n] = position of the first dictionary entry whose norm is <= n.
            // The dictionary is sorted by descending norm, so a single forward-moving offset
            // is enough while n walks down from the norm of the target to zero.
            var normsIndex = new int[GetVectorNorm(target, target) + 1];
            var offset = 0;
            for (var i = normsIndex.Length - 1; i >= 0; i--)
            {
                while (offset < this.Dictionary.Length && this.Dictionary[offset].Norm > i)
                {
                    offset++;
                }

                normsIndex[i] = offset;
            }

            this.NormsIndex = ImmutableArray.Create(normsIndex);
        }

        // Vector that every generated sequence must add up to.
        private Vector<byte> Target { get; }

        // Maximum number of dictionary vectors allowed in a single sequence.
        private int MaxVectorsCount { get; }

        // Dictionary vectors that fit into Target, sorted by descending norm (see FilterVectors).
        private ImmutableArray<VectorInfo> Dictionary { get; }

        // NormsIndex[n] is the index of the first vector in Dictionary whose norm is less than or equal to n
        // (Dictionary.Length when there is no such vector).
        private ImmutableArray<int> NormsIndex { get; }

        // Lazily enumerates every combination of at most MaxVectorsCount dictionary vectors that adds up to Target.
        // Each result lists the chosen vectors as indexes into the dictionary array passed to the constructor.
        // Unless SINGLE_THREADED is defined, the results are exposed as a parallel query.
#if SINGLE_THREADED
        public IEnumerable<int[]> GenerateSequences()
#else
        public ParallelQuery<int[]> GenerateSequences()
#endif
        {
            var targetNorm = GetVectorNorm(this.Target, this.Target);
            var combinations = this.GenerateUnorderedSequences(this.Target, targetNorm, this.MaxVectorsCount, 0);

#if SINGLE_THREADED
            return combinations.Select(combination => combination.ToArray());
#else
            return combinations.AsParallel().Select(combination => combination.ToArray());
#endif
        }

        // We want words with more letters (and among these, words with more "rare" letters) to appear first, to reduce the searching time somewhat.
        // Applying such a sort, we reduce the total number of triplets to check for anagrams from ~62M to ~29M.
        // Total number of quadruplets is reduced from 1468M to mere 311M.
        // And total number of quintuplets becomes reasonable 1412M.
        // Also, it produces the intended results faster (as these are more likely to contain longer words - e.g. "poultry outwits ants" is more likely than "p o u l t r y o u t w i t s a n t s").
        // This method basically gives us the 1-norm of the vector in the space rescaled so that the target is [1, 1, ..., 1],
        // multiplied by LeastCommonMultiple so that the result can be kept as an integer.
        //   vector - vector to measure.
        //   target - vector defining the rescaling; only its leading run of non-zero components is taken into account.
        // Returns the sum of (LeastCommonMultiple * vector[i]) / target[i] over that leading run.
        private static int GetVectorNorm(Vector<byte> vector, Vector<byte> target)
        {
            var norm = 0;

            // The loop stops at the first zero component of target. The explicit upper bound keeps it inside
            // the vector when every component is non-zero; without it the indexer would be called with
            // i == Vector<byte>.Count and throw.
            for (var i = 0; i < Vector<byte>.Count && target[i] != 0; i++)
            {
                norm += (LeastCommonMultiple * vector[i]) / target[i];
            }

            return norm;
        }

        // Keeps only the vectors that fit into target component-wise, pairs each of them with its norm
        // and its position in the input array, and returns them ordered by descending norm.
        private static VectorInfo[] FilterVectors(Vector<byte>[] vectors, Vector<byte> target)
        {
            var fitting = new List<VectorInfo>();
            for (var position = 0; position < vectors.Length; position++)
            {
                var candidate = vectors[position];

                // A vector with any component above the target can never be part of a valid sum.
                if (!Vector.GreaterThanOrEqualAll(target, candidate))
                {
                    continue;
                }

                fitting.Add(new VectorInfo(candidate, GetVectorNorm(candidate, target), position));
            }

            // LINQ ordering is stable, so vectors with equal norms keep their input order.
            return fitting
                .OrderByDescending(info => info.Norm)
                .ToArray();
        }

        // This is where the search spends most of its time, so everything in here has to stay cheap.
        // Vectors of a sequence are always picked in dictionary order: each next vector is taken from the same
        // or a later dictionary position than the previous one.
        // E.g. if dictionary is [x, y, z], then [x, y] can be generated, but [y, x] never will be.
        // That way, the complexity of search goes down by a factor of MaxVectorsCount! (if [x, y] does not add up to the target, there is no point in checking [y, x]).
        //   remainder                 - what is still missing to reach the target.
        //   remainderNorm             - norm of remainder.
        //   allowedRemainingWords     - how many more vectors may be used.
        //   currentDictionaryPosition - dictionary position of the previously chosen vector (0 at the top level).
        // Yields stacks of original dictionary indexes whose vectors add up to remainder.
        private IEnumerable<ImmutableStack<int>> GenerateUnorderedSequences(Vector<byte> remainder, int remainderNorm, int allowedRemainingWords, int currentDictionaryPosition)
        {
            // Everything before this position either has a norm above remainderNorm
            // or precedes the previously chosen vector.
            var firstCandidate = Math.Max(this.NormsIndex[remainderNorm], currentDictionaryPosition);

            if (allowedRemainingWords <= 1)
            {
                // Last vector: only an exact match completes the sequence, and the scan can stop
                // as soon as the norms drop below remainderNorm.
                for (var position = firstCandidate; position < this.Dictionary.Length; position++)
                {
                    var candidate = this.Dictionary[position];
                    if (candidate.Vector == remainder)
                    {
                        yield return ImmutableStack.Create(candidate.Index);
                        continue;
                    }

                    if (candidate.Norm < remainderNorm)
                    {
                        yield break;
                    }
                }

                yield break;
            }

            var wordsLeftAfterThisOne = allowedRemainingWords - 1;

            // Rounded-up share of the remaining norm per allowed word.
            // E.g. if remainder norm is 7, 8 or 9, and allowedRemainingWords is 3,
            // the largest remaining word must have a norm of at least 3.
            var minimalNorm = (remainderNorm + allowedRemainingWords - 1) / allowedRemainingWords;

            for (var position = firstCandidate; position < this.Dictionary.Length; position++)
            {
                var candidate = this.Dictionary[position];
                if (candidate.Vector == remainder)
                {
                    yield return ImmutableStack.Create(candidate.Index);
                    continue;
                }

                // The dictionary is sorted by descending norm, so no later entry can be large enough either.
                if (candidate.Norm < minimalNorm)
                {
                    yield break;
                }

                if (!Vector.LessThanOrEqualAll(candidate.Vector, remainder))
                {
                    continue;
                }

                var nextRemainder = remainder - candidate.Vector;
                var nextRemainderNorm = remainderNorm - candidate.Norm;
                foreach (var tail in this.GenerateUnorderedSequences(nextRemainder, nextRemainderNorm, wordsLeftAfterThisOne, position))
                {
                    yield return tail.Push(candidate.Index);
                }
            }
        }

        // Dictionary entry as used by the search: the vector together with its precomputed norm
        // and its position in the original dictionary array.
        private struct VectorInfo
        {
            // Stores the given values as-is; no validation is performed.
            public VectorInfo(Vector<byte> vector, int norm, int index)
            {
                this.Vector = vector;
                this.Norm = norm;
                this.Index = index;
            }

            // The dictionary vector itself.
            public Vector<byte> Vector { get; }

            // Norm of Vector relative to the target, as computed by GetVectorNorm in FilterVectors.
            public int Norm { get; }

            // Position of Vector in the unfiltered dictionary array passed to FilterVectors.
            public int Index { get; }
        }
    }
}
-												Refactoring

											
										
										
											8 years ago
+								namespace WhiteRabbit
 								{
 								    using System;
 								    using System.Collections.Generic;
 								    using System.Collections.Immutable;
 								    using System.Linq;
 								    using System.Numerics;
-												Minor code cleanup + microoptimization + readme update

											
										
										
											8 years ago
+								    internal sealed class VectorsProcessor
-												Refactoring

											
										
										
											8 years ago
+								    {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								        private const byte MaxComponentValue = 8;
 								        private const int LeastCommonMultiple = 840;
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								        public VectorsProcessor(Vector<byte> target, int maxVectorsCount, Vector<byte>[] dictionary)
-												Refactoring

											
										
										
											8 years ago
+								        {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								            if (Enumerable.Range(0, Vector<byte>.Count).Any(i => target[i] > MaxComponentValue))
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								            {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                throw new ArgumentException($"Every value should be at most {MaxComponentValue} (at most {MaxComponentValue} same characters allowed in the source string)", nameof(target));
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								            }
-												Refactoring

											
										
										
											8 years ago
+								            this.Target = target;
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
-												Refactoring

											
										
										
											8 years ago
+								            this.MaxVectorsCount = maxVectorsCount;
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								            this.Dictionary = ImmutableArray.Create(FilterVectors(dictionary, target).ToArray());
-												microoptimization

											
										
										
											8 years ago
 								            var normsIndex = new int[GetVectorNorm(target, target) + 1];
 								            var offset = 0;
 								            for (var i = normsIndex.Length - 1; i >= 0; i--)
 								            {
 								                while (offset < this.Dictionary.Length && this.Dictionary[offset].Norm > i)
 								                {
 								                    offset++;
 								                }
 								                normsIndex[i] = offset;
 								            }
 								            this.NormsIndex = ImmutableArray.Create(normsIndex);
-												Refactoring

											
										
										
											8 years ago
+								        }
 								        private Vector<byte> Target { get; }
 								        private int MaxVectorsCount { get; }
-												Further optimization

											
										
										
											8 years ago
+								        private ImmutableArray<VectorInfo> Dictionary { get; }
-												Code cleanup; additional information

											
										
										
											8 years ago
-												microoptimization

											
										
										
											8 years ago
+								        // Stores index of the first vector from Dictionary with norm less than or equal to offset
 								        private ImmutableArray<int> NormsIndex { get; }
-												Optimization; GeneratePermutations is called after flattening

											
										
										
											8 years ago
+								        // Produces all sets of vectors with the target sum
-												Improved debugging

											
										
										
											8 years ago
+								#if SINGLE_THREADED
 								        public IEnumerable<int[]> GenerateSequences()
 								#else
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								        public ParallelQuery<int[]> GenerateSequences()
-												Improved debugging

											
										
										
											8 years ago
+								#endif
-												Refactoring

											
										
										
											8 years ago
+								        {
-												microoptimization

											
										
										
											8 years ago
+								            return this.GenerateUnorderedSequences(this.Target, GetVectorNorm(this.Target, this.Target), this.MaxVectorsCount, 0)
-												Improved debugging

											
										
										
											8 years ago
+								#if !SINGLE_THREADED
-												Microoptimizations

											
										
										
											8 years ago
+								                .AsParallel()
-												Improved debugging

											
										
										
											8 years ago
+								#endif
-												Optimization; GeneratePermutations is called after flattening

											
										
										
											8 years ago
+								                .Select(Enumerable.ToArray);
-												Refactoring

											
										
										
											8 years ago
+								        }
-												Optimization

											
										
										
											8 years ago
+								        // We want words with more letters (and among these, words with more "rare" letters) to appear first, to reduce the searching time somewhat.
 								        // Applying such a sort, we reduce the total number of triplets to check for anagrams from ~62M to ~29M.
 								        // Total number of quadruplets is reduced from 1468M to mere 311M.
-												Added information on 5-word anagrams

											
										
										
											8 years ago
+								        // And total number of quintuplets becomes reasonable 1412M.
-												Optimization

											
										
										
											8 years ago
+								        // Also, it produces the intended results faster (as these are more likely to contain longer words - e.g. "poultry outwits ants" is more likely than "p o u l t r y o u t w i t s a n t s").
 								        // This method basically gives us the 1-norm of the vector in the space rescaled so that the target is [1, 1, ..., 1].
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								        private static int GetVectorNorm(Vector<byte> vector, Vector<byte> target)
-												Further optimization

											
										
										
											8 years ago
+								        {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								            var norm = 0;
-												Code cleanup; additional information

											
										
										
											8 years ago
+								            for (var i = 0; target[i] != 0; i++)
-												Optimization

											
										
										
											8 years ago
+								            {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                norm += (LeastCommonMultiple * vector[i]) / target[i];
-												Optimization

											
										
										
											8 years ago
+								            }
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								            return norm;
-												Optimization

											
										
										
											8 years ago
+								        }
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								        private static VectorInfo[] FilterVectors(Vector<byte>[] vectors, Vector<byte> target)
-												Refactoring

											
										
										
											8 years ago
+								        {
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								            return Enumerable.Range(0, vectors.Length)
 								                .Where(i => Vector.GreaterThanOrEqualAll(target, vectors[i]))
 								                .Select(i => new VectorInfo(vectors[i], GetVectorNorm(vectors[i], target), i))
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                .OrderByDescending(vectorInfo => vectorInfo.Norm)
-												Further optimization

											
										
										
											8 years ago
+								                .ToArray();
-												Refactoring

											
										
										
											8 years ago
+								        }
-												Code cleanup

											
										
										
											8 years ago
+								        // This method takes most of the time, so everything related to it must be optimized.
 								        // In every sequence, next vector always goes after the previous one from dictionary.
 								        // E.g. if dictionary is [x, y, z], then only [x, y] sequence could be generated, and [y, x] will never be generated.
 								        // That way, the complexity of search goes down by a factor of MaxVectorsCount! (as if [x, y] does not add up to a required target, there is no point in checking [y, x])
-												microoptimization

											
										
										
											8 years ago
+								        private IEnumerable<ImmutableStack<int>> GenerateUnorderedSequences(Vector<byte> remainder, int remainderNorm, int allowedRemainingWords, int currentDictionaryPosition)
-												Refactoring

											
										
										
											8 years ago
+								        {
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								            if (allowedRemainingWords > 1)
-												Refactoring

											
										
										
											8 years ago
+								            {
-												Microoptimizations

											
										
										
											8 years ago
+								                var newAllowedRemainingWords = allowedRemainingWords - 1;
-												Code cleanup; implementation notes added

											
										
										
											8 years ago
 								                // E.g. if remainder norm is 7, 8 or 9, and allowedRemainingWords is 3,
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								                // we need the largest remaining word to have a norm of at least 3
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                var requiredRemainderPerWord = (remainderNorm + allowedRemainingWords - 1) / allowedRemainingWords;
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
-												microoptimization

											
										
										
											8 years ago
+								                for (var i = Math.Max(this.NormsIndex[remainderNorm], currentDictionaryPosition); i < this.Dictionary.Length; i++)
-												Refactoring

											
										
										
											8 years ago
+								                {
-												microoptimization

											
										
										
											8 years ago
+								                    var currentVectorInfo = this.Dictionary[i];
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                    if (currentVectorInfo.Vector == remainder)
-												Refactoring

											
										
										
											8 years ago
+								                    {
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								                        yield return ImmutableStack.Create(currentVectorInfo.Index);
-												Refactoring

											
										
										
											8 years ago
+								                    }
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                    else if (currentVectorInfo.Norm < requiredRemainderPerWord)
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								                    {
 								                        break;
 								                    }
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                    else if (Vector.LessThanOrEqualAll(currentVectorInfo.Vector, remainder))
-												Refactoring

											
										
										
											8 years ago
+								                    {
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                        var newRemainder = remainder - currentVectorInfo.Vector;
 								                        var newRemainderNorm = remainderNorm - currentVectorInfo.Norm;
-												microoptimization

											
										
										
											8 years ago
+								                        foreach (var result in this.GenerateUnorderedSequences(newRemainder, newRemainderNorm, newAllowedRemainingWords, i))
-												Refactoring

											
										
										
											8 years ago
+								                        {
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								                            yield return result.Push(currentVectorInfo.Index);
-												Refactoring

											
										
										
											8 years ago
+								                        }
 								                    }
 								                }
 								            }
-												New optimization: there is no point in checking too small vectors

											
										
										
											8 years ago
+								            else
-												Refactoring

											
										
										
											8 years ago
+								            {
-												microoptimization

											
										
										
											8 years ago
+								                for (var i = Math.Max(this.NormsIndex[remainderNorm], currentDictionaryPosition); i < this.Dictionary.Length; i++)
-												Refactoring

											
										
										
											8 years ago
+								                {
-												microoptimization

											
										
										
											8 years ago
+								                    var currentVectorInfo = this.Dictionary[i];
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                    if (currentVectorInfo.Vector == remainder)
-												Refactoring

											
										
										
											8 years ago
+								                    {
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								                        yield return ImmutableStack.Create(currentVectorInfo.Index);
-												Refactoring

											
										
										
											8 years ago
+								                    }
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								                    else if (currentVectorInfo.Norm < remainderNorm)
-												Further optimization

											
										
										
											8 years ago
+								                    {
 								                        break;
 								                    }
-												Refactoring

											
										
										
											8 years ago
+								                }
 								            }
 								        }
-												Further optimization

											
										
										
											8 years ago
+								        private struct VectorInfo
 								        {
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								            public VectorInfo(Vector<byte> vector, int norm, int index)
-												Further optimization

											
										
										
											8 years ago
+								            {
 								                this.Vector = vector;
 								                this.Norm = norm;
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
+								                this.Index = index;
-												Further optimization

											
										
										
											8 years ago
+								            }
 								            public Vector<byte> Vector { get; }
-												As used vector norm is linear, dot product is not needed

											
										
										
											8 years ago
+								            public int Norm { get; }
-												Improved performance (dictionary => array)

											
										
										
											8 years ago
 								            public int Index { get; }
-												Further optimization

											
										
										
											8 years ago
+								        }
-												Refactoring

											
										
										
											8 years ago
+								    }
 								}