day 8, part 2 (much faster, but incorrect)

1 year ago · c949603386
parent 17a9896398
commit c949603386
3 changed files with 193 additions and 55 deletions
--- a/day08-hard/README.md
+++ b/day08-hard/README.md
@ -0,0 +1,69 @@
+The problem's phase space consists of pairs ("current step (modulo number of directions)", "current node"),
+for a total of ~200 thousand states for the puzzle input.
+
+For every state, there is a defined transition to the single next state.
+
+There are six starting states (all pairs with "current step" being zero and "current node" ending with 'A').
+And 263*6 ending states (all pairs with any "current step", and "current node" ending with Z).
+
+Transitions are periodic: since the successor of every state is uniquely defined and there is a finite number of states,
+no matter which state we start from, we will eventually find ourselves in a loop of length at most ~200k.
+There might be several non-intersecting loops.
+
+One way to solve the problem would be to use some complicated math in order to compute the result.
+Another, to brute force the result naively, by doing what the puzzle describes:
+running several "ghosts", one from each starting state, and on every step checking if all the current states are "ending".
+
+In order for brute force to work as fast as possible,
+we need to reduce the number of conditions, dereferences and computations within the loop.
+
+There is only so much that we can do regarding the storage
+(200k states means at least 18 bits per state to store the next state, times 200k that's 450KB,
+way larger than any L1 cache).
+
+For simplicity, here I store states in array of 270*1024 u32 (i.e. one megabyte),
+still just a bit more than a modern L2 cache per core;
+and the array layout is optimized for access: index is "current step" * 1024 + "current node",
+so on every step we stay more or less in the same region of the array
+(we traverse 1k entries, or 4KB of memory, on average for every step).
+
+For simplicity, in order to check that the state is "final", I slightly renumber the list of nodes;
+nodes that end with Z get the high three bits of their 10-bit index set to 1
+(since the total number of nodes in the puzzle input is 770).
+Unfortunately, the puzzle input contains collisions
+(there are "final" nodes on lines 320 and 694, with the same last seven bits),
+so I had to manually reorder the puzzle input;
+it was easier to move all nodes ending with Z to the end of the file,
+to make sure that there will be no collisions.
+This way, the state is final iff it has its eighth, ninth and tenth bits set.
+It's also easy enough to check all six current states at once
+(just bitwise-and them all, bitwise-and the result with a `0b1110000000` mask, and check that the result matches the mask).
+
+So ultimately, every step is just six bitwise-ands, one comparison
+(which is only true once we found the result, meaning that there is no performance penalty for branch misprediction),
+and six dereferences and assignments.
+
+The resulting performance is over 100 million steps per second (single-threaded),
+meaning that we get to ~250 billion steps in just half an hour.
+
+Unfortunately, the result it produces (around 250 billion) is apparently incorrect;
+it is not accepted by the AoC website.
+There must be a bug somewhere, even though it works correctly on the (modified) sample input.
+
+Another option, with math, would be to iterate over all possible direction numbers,
+and for every direction number (out of 270), and for each permutation of final nodes (6^6~=47k) compute:
+For each one out of the six starting states, how many steps does it take to get to this node? And to get to it again?
+(Answering that question with brute-forcing would require on the order of 200k operations for every starting state and final state,
+and another 200k for every final state, so that's about 200k*(270*6 + 270*6*6) ~= 2 billion operations
+to precompute all ~10k values,
+but it can be optimized if we would identify the shape of transitions,
+and untangle the transition matrix into a set of loops, and of paths leading to these loops).
+
+The answer to such a question would have a form of a_i+b_i*k, for some a and b, for every integer k>=0.
+Knowing a and b, for each of the six questions, we could use arithmetic to find A and B such that
+for every k>=0, A+Bk steps from the starting states produce exactly this configuration.
+With A being the first time when we reach this configuration.
+
+And then we would just need to find the smallest A for all ~10 million configurations.
+
+But I can't be bothered to do this now.
--- a/day08-hard/sample.in
+++ b/day08-hard/sample.in
@ -1,10 +1,12 @@
-LR
+RL

-PPA = (PPB, XXX)
+PPA = (PPL, PPL)
+PPL = (PPB, XXX)
 PPB = (XXX, PPZ)
-PPZ = (PPB, XXX)
-QQA = (QQB, XXX)
+QQA = (QQL, QQL)
+QQL = (QQB, XXX)
 QQB = (QQC, QQC)
 QQC = (QQZ, QQZ)
-QQZ = (QQB, QQB)
-XXX = (XXX, XXX)
+XXX = (XXX, XXX)
+PPZ = (PPB, XXX)
+QQZ = (QQB, QQB)
--- a/day08-hard/src/main.zig
+++ b/day08-hard/src/main.zig
@ -28,6 +28,10 @@ fn StackList(comptime T: type, comptime capacity_type: type, comptime capacity:
            return (&self.mem)[0..self.length];
        }

+        fn getSlice(self: *const Self) []const T { // Read-only view of the stored elements.
+            return self.mem[0..self.length]; // only the first `length` slots of `mem` are exposed
+        }
+
        fn getLoopedValue(self: *const Self, index: usize) T {
            return self.mem[index % self.length];
        }
@ -45,23 +49,21 @@ fn StackList(comptime T: type, comptime capacity_type: type, comptime capacity:
    };
 }

+const MAX_DIRECTIONS = 270;
 const SIXTEEN_BITS = 65535;
 const FIVE_BITS = 31;
-const DEFAULT_NODE_VALUE = (SIXTEEN_BITS << 16) | SIXTEEN_BITS;
+const END_LINE_MASK = 512 | 256 | 128;

-const Directions = StackList(usize, usize, 1000);
+const Direction = enum(u8) { Left, Right };
+const Directions = StackList(Direction, usize, MAX_DIRECTIONS);

 fn parseDirections(line: []const u8) Directions {
    var result = Directions.init();

-    const left: usize = SIXTEEN_BITS << 16;
-    const right: usize = SIXTEEN_BITS;
-
-    var index: usize = 0;
-    while (index < line.len) : (index += 1) {
-        result.add(switch (line[index]) {
-            'L' => left,
-            'R' => right,
+    for (line) |char| {
+        result.add(switch (char) {
+            'L' => .Left,
+            'R' => .Right,
            else => unreachable,
        });
    }
@ -69,10 +71,17 @@ fn parseDirections(line: []const u8) Directions {
    return result;
 }

-const Nodes = [32 * 32 * 32]usize;
+const Node = struct { // One entry of the label-indexed `Nodes` table.
+    line_index: u16, // index passed to parseNodeLine; END_LINE_MASK is OR-ed in when the name ends in 'Z'
+    current_label: u16, // parseNodeLabel() of the node's own three-letter name
+    left_next_label: u16, // parseNodeLabel() of the successor taken on .Left
+    right_next_label: u16, // parseNodeLabel() of the successor taken on .Right
+};

-fn parseNodeNumber(line: []const u8) usize {
-    var result: usize = 0;
+const Nodes = [32 * 32 * 32]Node;
+
+fn parseNodeLabel(line: []const u8) u16 {
+    var result: u16 = 0;
    for (line) |char| {
        result = (result << 5) + (char - 'A');
    }
@ -80,56 +89,112 @@ fn parseNodeNumber(line: []const u8) usize {
    return result;
 }

-fn parseNodeLine(line: []const u8, state: *Nodes) void {
-    const current_node_number = parseNodeNumber(line[0..3]);
-    state[current_node_number] = (parseNodeNumber(line[7..10]) << 16) | parseNodeNumber(line[12..15]);
+fn parseNodeLine(nodes: *Nodes, line: []const u8, line_index: u16) void { // Parses one "XXX = (LLL, RRR)" line and stores it in `nodes` under its own label.
+    const current_node = Node{
+        .line_index = if (line[2] == 'Z') (line_index | END_LINE_MASK) else line_index, // tag names ending in 'Z' so they can be detected with a single mask test
+        .current_label = parseNodeLabel(line[0..3]), // the node's own name
+        .left_next_label = parseNodeLabel(line[7..10]), // first name inside the parentheses
+        .right_next_label = parseNodeLabel(line[12..15]), // second name inside the parentheses
+    };
+    nodes[current_node.current_label] = current_node; // the table is indexed by the encoded label
 }

-fn solve(nodes: Nodes, directions: Directions) usize {
-    var current = StackList(usize, usize, 1023).init();
-    {
-        var node_number: usize = 0;
-        while (node_number < nodes.len) : (node_number += 32) {
-            if (nodes[node_number] != DEFAULT_NODE_VALUE) {
-                current.add(node_number);
-            }
+const TransitionMap = [MAX_DIRECTIONS * 1024]u32;
+
+/// Builds the flat successor table used by the main loop.
+///
+/// A state is `(direction_index << 10) | line_index`; the entry stored at that
+/// index is the state reached after following `directions[direction_index]`
+/// from that node, with the direction index advanced cyclically.
+/// Entries for states that no parsed node occupies stay zero.
+fn createTransitions(nodes: *const Nodes, directions: *const Directions) TransitionMap {
+    var result = std.mem.zeroes(TransitionMap);
+    for (nodes.*) |node| {
+        // Unused slots of the zero-initialised table are recognised by
+        // `line_index == 0` (main numbers parsed lines starting from 3).
+        // `current_label == 0` must not be used for this: "AAA" encodes to
+        // label 0, so that test silently dropped the AAA node.
+        if (node.line_index == 0) {
+            continue;
+        }
+
+        //std.debug.print("Computing transitions for {d}\n", .{node.line_index});
+
+        for (directions.getSlice(), 0..) |direction, direction_index| {
+            const key = (direction_index << 10) | node.line_index;
+            const next_direction_index = (direction_index + 1) % directions.length;
+            const next = (@as(u32, @intCast(next_direction_index)) << 10) | nodes[
+                switch (direction) {
+                    .Left => node.left_next_label,
+                    .Right => node.right_next_label,
+                }
+            ].line_index;
+            result[key] = next;
+            //std.debug.print("Saved transition from {d} to {d}\n", .{ key, next });
        }
    }

-    std.debug.print("Total number of starting points: {d}\n", .{current.length});
+    return result;
+}
+
+const CurrentNodes = [6]u32;

-    var i: usize = 0;
-    while (true) : (i += 1) {
-        if (i & SIXTEEN_BITS == 0) {
-            var debug_string = StackList(u8, u8, 255).init();
-            for (current.getMutableSlice()) |current_number| {
-                debug_string.add(@as(u8, @intCast((current_number >> 10) & FIVE_BITS)) + 'A');
-                debug_string.add(@as(u8, @intCast((current_number >> 5) & FIVE_BITS)) + 'A');
-                debug_string.add(@as(u8, @intCast((current_number >> 0) & FIVE_BITS)) + 'A');
-                debug_string.add(' ');
+/// Collects the `line_index` of every node whose label ends in 'A'.
+///
+/// When fewer nodes are found than `CurrentNodes` holds, the unused entries
+/// repeat the first one found, so they follow the same path as entry 0.
+/// Panics if more starting nodes exist than `CurrentNodes` can hold.
+fn getStartingNodes(nodes: *const Nodes) CurrentNodes {
+    var starting_nodes = std.mem.zeroes(CurrentNodes);
+    var current_index: usize = 0;
+    for (nodes.*) |node| {
+        // Occupancy is tested via `line_index`, not `current_label`, because
+        // the label of "AAA" is 0 and AAA is itself a starting node.
+        if (node.line_index != 0 and node.current_label & FIVE_BITS == 0) {
+            if (current_index >= starting_nodes.len) {
+                @panic("more starting nodes than CurrentNodes can hold");
+            }
+
+            if (current_index == 0) {
+                for (&starting_nodes) |*starting_node| {
+                    starting_node.* = node.line_index;
+                }
+            } else {
+                starting_nodes[current_index] = node.line_index;
            }
-            std.debug.print("Points at step {d}: {s}\n", .{ i, debug_string.getMutableSlice() });
+
+            current_index += 1;
        }
+    }

-        var are_all_z = true;
-        for (current.getMutableSlice()) |*current_number| {
-            if (current_number.* & FIVE_BITS != 'Z' - 'A') {
-                are_all_z = false;
-            }
+    return starting_nodes;
+}

-            const mask = nodes[current_number.*] & directions.getLoopedValue(i);
-            current_number.* = (mask | (mask >> 16)) & SIXTEEN_BITS;
+fn solve(nodes: *const Nodes, directions: *const Directions) u64 { // Steps six states in lockstep through the transition table; returns the step count at which all six are final.
+    std.debug.print("Inside solve\n", .{});
+    const transitions = createTransitions(nodes, directions);
+    const starting = getStartingNodes(nodes);
+    //var current: CurrentNodes = .{ 203941, 204125, 203913, 204388, 204337, 203941 };
+
+    std.debug.print("Starting points: {any}\n", .{starting});
+    const DEBUG_MASK = (1 << 27) - 1; // progress is printed whenever the low 27 bits of the step counter are zero
+
+    var current0 = starting[0];
+    var current1 = starting[1];
+    var current2 = starting[2];
+    var current3 = starting[3];
+    var current4 = starting[4];
+    var current5 = starting[5];
+    var i: usize = 0;
+    while (true) : (i += 1) {
+        // Main loop, with hundreds of billions of iterations, so it is performance-critical.
+        if (i & DEBUG_MASK == 0) {
+            std.debug.print(
+                "Points at step {d}: {d} {d} {d} {d} {d} {d} (raw: {any})\n",
+                .{ i, current0 & 1023, current1 & 1023, current2 & 1023, current3 & 1023, current4 & 1023, current5 & 1023, .{ current0, current1, current2, current3, current4, current5 } },
+            );
        }

-        if (are_all_z) {
+        if (current0 & current1 & current2 & current3 & current4 & current5 & @as(u32, END_LINE_MASK) == @as(u32, END_LINE_MASK)) { // true only when every state has all END_LINE_MASK bits set
+            std.debug.print(
+                "Points at step {d}: {d} {d} {d} {d} {d} {d} (raw: {any})\n",
+                .{ i, current0 & 1023, current1 & 1023, current2 & 1023, current3 & 1023, current4 & 1023, current5 & 1023, .{ current0, current1, current2, current3, current4, current5 } },
+            );
            break;
        }
+
+        current0 = transitions[@as(usize, current0)]; // each state advances by one table lookup
+        current1 = transitions[@as(usize, current1)];
+        current2 = transitions[@as(usize, current2)];
+        current3 = transitions[@as(usize, current3)];
+        current4 = transitions[@as(usize, current4)];
+        current5 = transitions[@as(usize, current5)];
    }

    return i;
 }

 pub fn main() !void {
+    std.debug.print("First line of main\n", .{});
    const stdout = std.io.getStdOut().writer();

    const raw_in = std.io.getStdIn();
@ -141,14 +206,16 @@ pub fn main() !void {
    var directions = parseDirections(first_line);
    _ = try reader.readUntilDelimiterOrEof(&line_buffer, '\n');

-    var nodes: Nodes = undefined;
-    for (&nodes) |*node| {
-        node.* = DEFAULT_NODE_VALUE;
-    }
+    std.debug.print("Creating nodes\n", .{});
+    var nodes = std.mem.zeroes(Nodes);
+    std.debug.print("Created nodes\n", .{});
+    var line_index: u16 = 3;
    while (try reader.readUntilDelimiterOrEof(&line_buffer, '\n')) |line| {
-        parseNodeLine(line, &nodes);
+        parseNodeLine(&nodes, line, line_index);
+        line_index += 1;
    }

-    const result = solve(nodes, directions);
+    std.debug.print("Calling solve\n", .{});
+    const result = solve(&nodes, &directions);
    try stdout.print("{d}\n", .{result});
 }