From c9496033863f779489888215a83a43fdb1d04d93 Mon Sep 17 00:00:00 2001 From: Inga Date: Sat, 9 Dec 2023 00:42:23 +0000 Subject: [PATCH] day 8, part 2 (much faster, but incorrect) --- day08-hard/README.md | 69 +++++++++++++++++ day08-hard/sample.in | 14 ++-- day08-hard/src/main.zig | 165 ++++++++++++++++++++++++++++------------ 3 files changed, 193 insertions(+), 55 deletions(-) create mode 100644 day08-hard/README.md diff --git a/day08-hard/README.md b/day08-hard/README.md new file mode 100644 index 0000000..8faaf1f --- /dev/null +++ b/day08-hard/README.md @@ -0,0 +1,69 @@ +The problem's phase space consists of pairs ("current step (modulo number of directions)", "current node"), +with a total of ~200 thousand states for the puzzle input. + +For every state, there is a defined transition to the single next state. + +There are six starting states (all pairs with "current step" being zero and "current node" ending with 'A'). +And 263*6 ending states (all pairs with any "current step", and "current node" ending with 'Z'). + +Transitions are periodic; since the successor of every state is uniquely defined, and there is a finite number of states, +this means that no matter at what state we start, we will eventually find ourselves in a loop of length less than 200k. +There might be several non-intersecting loops. + +One way to solve the problem would be to use some complicated math in order to compute the result. +Another, to brute force the result naively, by doing what the puzzle describes: +running several "ghosts", one from each starting state, and on every step checking if all the current states are "ending". + +In order for brute force to work as fast as possible, +we need to reduce the number of conditions, dereferences and computations within the loop. + +There is only so much that we can do regarding the storage +(200k states means at least 18 bits per state to store the next state, times 200k that's 450KB, +way larger than any L1 cache). 
+ +For simplicity, here I store states in an array of 270*1024 u32 (i.e. one megabyte), +still just a bit more than a modern L2 cache per core; +and the array layout is optimized for access: index is "current step" * 1024 + "current node", +so on every step we stay more or less in the same region of the array +(we traverse 1k entries, or 4KB of memory, on average for every step). + +For simplicity, in order to check that the state is "final", I slightly renumber the list of nodes; +nodes that end with Z get the high three bits of their 10-bit index set to 1 +(since the total number of nodes in the puzzle input is 770). +Unfortunately, the puzzle input contains collisions +(there are "final" nodes on lines 320 and 694, with the same last seven bits), +so I had to manually reorder the puzzle input; +it was easier to move all nodes ending with Z to the end of the file, +to make sure that there will be no collisions. +This way, the state is final iff it has its eighth, ninth and tenth bits set. +It's also easy enough to check all six current states at once +(just bitwise-and them all, bitwise-and the result with a `0b1110000000` mask, and check that the result matches the mask). + +So ultimately, every step is just six bitwise-ands, one comparison +(which is only true once we have found the result, meaning that there is no performance penalty for branch misprediction), +and six dereferences and assignments. + +The resulting performance is over 100 million steps per second (single-threaded), +meaning that we get to ~250 billion steps in just half an hour. + +Unfortunately, the result it produces (~250 billion) is apparently incorrect; +it is not accepted by AoC website. +Must be some bug somewhere, even though it works correctly on the (modified) sample input. 
+ +Another option, with math, would be to iterate over all possible direction numbers, +and for every direction number (out of 270), and for each combination of final nodes (6^6~=47k) compute: +For each one out of the six starting states, how many steps does it take to get to this node? And to get to it again? +(Answering that question with brute-forcing would require on the order of 200k operations for every starting state and final state, +and another 200k for every final state, so that's about 200k*(270*6 + 270*6*6) ~= 2 billion operations +to precompute all ~10k values, +but it could be optimized if we identified the shape of transitions, +and untangled the transition matrix into a set of loops, and of paths leading to these loops). + +The answer to such a question would have the form a_i+b_i*k, for some a and b, for every integer k>=0. +Knowing a and b, for each of the six questions, we could use arithmetic to find A and B such that +for every k>=0, A+Bk steps from the starting states produce exactly this configuration. +With A being the first time when we reach this configuration. + +And then we would just need to find the smallest A across all ~10 million configurations. + +But I can't be bothered to do this now. 
\ No newline at end of file diff --git a/day08-hard/sample.in b/day08-hard/sample.in index f84a0c6..e8b3c9c 100644 --- a/day08-hard/sample.in +++ b/day08-hard/sample.in @@ -1,10 +1,12 @@ -LR +RL -PPA = (PPB, XXX) +PPA = (PPL, PPL) +PPL = (PPB, XXX) PPB = (XXX, PPZ) -PPZ = (PPB, XXX) -QQA = (QQB, XXX) +QQA = (QQL, QQL) +QQL = (QQB, XXX) QQB = (QQC, QQC) QQC = (QQZ, QQZ) -QQZ = (QQB, QQB) -XXX = (XXX, XXX) \ No newline at end of file +XXX = (XXX, XXX) +PPZ = (PPB, XXX) +QQZ = (QQB, QQB) \ No newline at end of file diff --git a/day08-hard/src/main.zig b/day08-hard/src/main.zig index 7ce4620..62b9f35 100644 --- a/day08-hard/src/main.zig +++ b/day08-hard/src/main.zig @@ -28,6 +28,10 @@ fn StackList(comptime T: type, comptime capacity_type: type, comptime capacity: return (&self.mem)[0..self.length]; } + fn getSlice(self: *const Self) []const T { + return self.mem[0..self.length]; + } + fn getLoopedValue(self: *const Self, index: usize) T { return self.mem[index % self.length]; } @@ -45,23 +49,21 @@ fn StackList(comptime T: type, comptime capacity_type: type, comptime capacity: }; } +const MAX_DIRECTIONS = 270; const SIXTEEN_BITS = 65535; const FIVE_BITS = 31; -const DEFAULT_NODE_VALUE = (SIXTEEN_BITS << 16) | SIXTEEN_BITS; +const END_LINE_MASK = 512 | 256 | 128; -const Directions = StackList(usize, usize, 1000); +const Direction = enum(u8) { Left, Right }; +const Directions = StackList(Direction, usize, MAX_DIRECTIONS); fn parseDirections(line: []const u8) Directions { var result = Directions.init(); - const left: usize = SIXTEEN_BITS << 16; - const right: usize = SIXTEEN_BITS; - - var index: usize = 0; - while (index < line.len) : (index += 1) { - result.add(switch (line[index]) { - 'L' => left, - 'R' => right, + for (line) |char| { + result.add(switch (char) { + 'L' => .Left, + 'R' => .Right, else => unreachable, }); } @@ -69,10 +71,17 @@ fn parseDirections(line: []const u8) Directions { return result; } -const Nodes = [32 * 32 * 32]usize; +const Node = struct { + 
line_index: u16, + current_label: u16, + left_next_label: u16, + right_next_label: u16, +}; -fn parseNodeNumber(line: []const u8) usize { - var result: usize = 0; +const Nodes = [32 * 32 * 32]Node; + +fn parseNodeLabel(line: []const u8) u16 { + var result: u16 = 0; for (line) |char| { result = (result << 5) + (char - 'A'); } @@ -80,56 +89,112 @@ fn parseNodeNumber(line: []const u8) usize { return result; } -fn parseNodeLine(line: []const u8, state: *Nodes) void { - const current_node_number = parseNodeNumber(line[0..3]); - state[current_node_number] = (parseNodeNumber(line[7..10]) << 16) | parseNodeNumber(line[12..15]); +fn parseNodeLine(nodes: *Nodes, line: []const u8, line_index: u16) void { + const current_node = Node{ + .line_index = if (line[2] == 'Z') (line_index | END_LINE_MASK) else line_index, + .current_label = parseNodeLabel(line[0..3]), + .left_next_label = parseNodeLabel(line[7..10]), + .right_next_label = parseNodeLabel(line[12..15]), + }; + nodes[current_node.current_label] = current_node; } -fn solve(nodes: Nodes, directions: Directions) usize { - var current = StackList(usize, usize, 1023).init(); - { - var node_number: usize = 0; - while (node_number < nodes.len) : (node_number += 32) { - if (nodes[node_number] != DEFAULT_NODE_VALUE) { - current.add(node_number); - } +const TransitionMap = [MAX_DIRECTIONS * 1024]u32; + +fn createTransitions(nodes: *const Nodes, directions: *const Directions) TransitionMap { + var result = std.mem.zeroes(TransitionMap); + for (nodes.*) |node| { + if (node.current_label == 0) { + continue; + } + + //std.debug.print("Computing transitions for {d}\n", .{node.line_index}); + + for (directions.getSlice(), 0..) 
|direction, direction_index| { + const key = (direction_index << 10) | node.line_index; + const next_direction_index = (direction_index + 1) % directions.length; + const next = (@as(u32, @intCast(next_direction_index)) << 10) | nodes[ + switch (direction) { + .Left => node.left_next_label, + .Right => node.right_next_label, + } + ].line_index; + result[key] = next; + //std.debug.print("Saved transition from {d} to {d}\n", .{ key, next }); } } - std.debug.print("Total number of starting points: {d}\n", .{current.length}); + return result; +} + +const CurrentNodes = [6]u32; - var i: usize = 0; - while (true) : (i += 1) { - if (i & SIXTEEN_BITS == 0) { - var debug_string = StackList(u8, u8, 255).init(); - for (current.getMutableSlice()) |current_number| { - debug_string.add(@as(u8, @intCast((current_number >> 10) & FIVE_BITS)) + 'A'); - debug_string.add(@as(u8, @intCast((current_number >> 5) & FIVE_BITS)) + 'A'); - debug_string.add(@as(u8, @intCast((current_number >> 0) & FIVE_BITS)) + 'A'); - debug_string.add(' '); +fn getStartingNodes(nodes: *const Nodes) CurrentNodes { + var starting_nodes = std.mem.zeroes(CurrentNodes); + var current_index: usize = 0; + for (nodes.*) |node| { + if (node.current_label != 0 and node.current_label & FIVE_BITS == 0) { + if (current_index == 0) { + for (&starting_nodes) |*starting_node| { + starting_node.* = node.line_index; + } + } else { + starting_nodes[current_index] = node.line_index; } - std.debug.print("Points at step {d}: {s}\n", .{ i, debug_string.getMutableSlice() }); + + current_index += 1; } + } - var are_all_z = true; - for (current.getMutableSlice()) |*current_number| { - if (current_number.* & FIVE_BITS != 'Z' - 'A') { - are_all_z = false; - } + return starting_nodes; +} - const mask = nodes[current_number.*] & directions.getLoopedValue(i); - current_number.* = (mask | (mask >> 16)) & SIXTEEN_BITS; +fn solve(nodes: *const Nodes, directions: *const Directions) u64 { + std.debug.print("Inside solve\n", .{}); + const 
transitions = createTransitions(nodes, directions); + const starting = getStartingNodes(nodes); + //var current: CurrentNodes = .{ 203941, 204125, 203913, 204388, 204337, 203941 }; + + std.debug.print("Starting points: {any}\n", .{starting}); + const DEBUG_MASK = (1 << 27) - 1; + + var current0 = starting[0]; + var current1 = starting[1]; + var current2 = starting[2]; + var current3 = starting[3]; + var current4 = starting[4]; + var current5 = starting[5]; + var i: usize = 0; + while (true) : (i += 1) { + // Main loop, with hundreds of billions of iterations, so it is performance-critical. + if (i & DEBUG_MASK == 0) { + std.debug.print( + "Points at step {d}: {d} {d} {d} {d} {d} {d} (raw: {any})\n", + .{ i, current0 & 1023, current1 & 1023, current2 & 1023, current3 & 1023, current4 & 1023, current5 & 1023, .{ current0, current1, current2, current3, current4, current5 } }, + ); } - if (are_all_z) { + if (current0 & current1 & current2 & current3 & current4 & current5 & @as(u32, END_LINE_MASK) == @as(u32, END_LINE_MASK)) { + std.debug.print( + "Points at step {d}: {d} {d} {d} {d} {d} {d} (raw: {any})\n", + .{ i, current0 & 1023, current1 & 1023, current2 & 1023, current3 & 1023, current4 & 1023, current5 & 1023, .{ current0, current1, current2, current3, current4, current5 } }, + ); break; } + + current0 = transitions[@as(usize, current0)]; + current1 = transitions[@as(usize, current1)]; + current2 = transitions[@as(usize, current2)]; + current3 = transitions[@as(usize, current3)]; + current4 = transitions[@as(usize, current4)]; + current5 = transitions[@as(usize, current5)]; } return i; } pub fn main() !void { + std.debug.print("First line of main\n", .{}); const stdout = std.io.getStdOut().writer(); const raw_in = std.io.getStdIn(); @@ -141,14 +206,16 @@ pub fn main() !void { var directions = parseDirections(first_line); _ = try reader.readUntilDelimiterOrEof(&line_buffer, '\n'); - var nodes: Nodes = undefined; - for (&nodes) |*node| { - node.* = 
DEFAULT_NODE_VALUE; - } + std.debug.print("Creating nodes\n", .{}); + var nodes = std.mem.zeroes(Nodes); + std.debug.print("Created nodes\n", .{}); + var line_index: u16 = 3; while (try reader.readUntilDelimiterOrEof(&line_buffer, '\n')) |line| { - parseNodeLine(line, &nodes); + parseNodeLine(&nodes, line, line_index); + line_index += 1; } - const result = solve(nodes, directions); + std.debug.print("Calling solve\n", .{}); + const result = solve(&nodes, &directions); try stdout.print("{d}\n", .{result}); }