all: do some restructuring

I don't like big monolithic source files, so let's restructure a bit.
parser.zig is still bigger than I would like it to be, but there isn't
a good way to break up the two state machine parsers, which take up
most of the space. This is the last junk commit before I am seriously
going to implement the "streaming" parser. Which is the last change
before implementing deserialization to object. I am definitely not
just spinning my wheels here.
This commit is contained in:
torque 2023-09-24 18:22:12 -07:00
parent 8684fab23c
commit 38e47b39dc
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
6 changed files with 639 additions and 649 deletions

View File

@ -4,7 +4,7 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{}); const target = b.standardTargetOptions(.{});
const nice = b.addModule("nice", .{ const nice = b.addModule("nice", .{
.source_file = .{ .path = "src/config.zig" }, .source_file = .{ .path = "src/nice.zig" },
}); });
add_examples(b, .{ add_examples(b, .{

105
src/linebuffer.zig Normal file
View File

@ -0,0 +1,105 @@
const std = @import("std");
/// A subrange of a backing buffer, stored as offset + length rather than as a
/// slice, so the window description survives buffer reallocation/compaction.
pub const IndexSlice = struct { start: usize, len: usize };
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    // Scanning state. Its `buffer` field is the allocation owned by this
    // LineBuffer (see initCapacity), which is why the @constCast writes
    // below are safe.
    internal: FixedLineBuffer,
    // Number of buffer bytes holding live data. Invariant:
    // used == internal.window.start + internal.window.len.
    used: usize,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .internal = .{
                .buffer = try allocator.alloc(u8, capacity),
                .window = .{ .start = 0, .len = 0 },
            },
            .used = 0,
        };
    }

    /// Release the backing buffer. The LineBuffer must not be used afterwards.
    /// (Added: the original had no way to free the allocation.)
    pub fn deinit(self: LineBuffer) void {
        self.allocator.free(self.internal.buffer);
    }

    /// Append `data` to the scan window, reallocating or compacting the
    /// backing buffer as needed. Invalidates slices previously returned by
    /// `nextLine`.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;
        // TODO: check for usize overflow here if we want Maximum Robustness
        const new_window_len = self.internal.window.len + data.len;

        // data cannot fit in the buffer with our scan window, so we have to realloc
        if (new_window_len > self.internal.buffer.len) {
            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
            // on every invocation but will cause the buffer to oversize
            // bug fix: realloc returns the (possibly moved) allocation; the
            // original discarded the result and kept the stale slice.
            self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), new_window_len);
            self.rehome();
            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
            self.used = new_window_len;
            self.internal.window.len = new_window_len;
        }
        // data will fit, but needs to be moved in the buffer
        else if (self.internal.window.start + new_window_len > self.internal.buffer.len) {
            self.rehome();
            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
            self.used = new_window_len;
            self.internal.window.len = new_window_len;
        }
        // data can simply be appended
        else {
            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
            // bug fix: the original never committed the appended bytes, so
            // they were invisible to nextLine and clobbered by the next feed.
            self.used += data.len;
            self.internal.window.len = new_window_len;
        }
    }

    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        return self.internal.nextLine();
    }

    // Compact the scan window back to the start of the buffer, discarding
    // already-consumed bytes so the tail is free for new data.
    fn rehome(self: *LineBuffer) void {
        self.internal.rehome();
        self.used = self.internal.window.len;
    }
};
/// Iterates newline-delimited lines over a fixed, caller-owned byte slice.
/// Also serves as the scanning core of LineBuffer.
pub const FixedLineBuffer = struct {
    buffer: []const u8,
    window: IndexSlice,

    pub fn init(data: []const u8) FixedLineBuffer {
        return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
    }

    /// Return the next complete line (newline excluded), or null when no
    /// newline remains in the window.
    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        self.window.start += split + 1;
        self.window.len -= split + 1;
        return window[0..split];
    }

    // move the current scan window to the beginning of the buffer. This internal
    // method is only invoked by LineBuffer, which owns mutable backing memory,
    // so the @constCast does not write through genuinely-const data.
    // bug fix: the original declared the wrong receiver type (*LineBuffer) and
    // a usize return type while returning void.
    fn rehome(self: *FixedLineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const dest = @constCast(self.buffer);

        // if the window is longer than its starting index, the memory move will be
        // overlapping, so we can't use memcpy
        if (self.window.len > self.window.start)
            std.mem.copyForwards(u8, dest, window)
        else
            @memcpy(dest.ptr, window);

        self.window.start = 0;
    }
};

67
src/nice.zig Normal file
View File

@ -0,0 +1,67 @@
// Heavily inspired by, but not quite compatible with, NestedText. Key differences:
//
// - Doesn't support multiline keys (this means map keys cannot start with
// ' ', \t, #, {, [, |, or >, and they cannot contain :)
// - Allows using tabs for indentation (but not mixed tabs/spaces)
// - Indentation must be quantized consistently throughout the document. e.g.
// every nested layer being exactly 2 spaces past its parent. Tabs may
// only use one tab per indentation level.
// - Allows flow-style lists, maps, and strings on the same line as map keys or
// list items (i.e. the following are legal):
//
// key: {inline: map}
// key: [inline, list]
// key: > inline string
// - {map: item}
// - [list, item]
// - > inline string
//
// The string case retains the possibility of having an inline map value starting
// with {, [, or >
// - map keys and list item dashes must be followed by a value or an indented
// section to reduce parser quantum state. This means that
//
// foo:
// bar: baz
//
// or
//
// -
// - qux
//
// are not valid. This can be represented with an inline empty string after foo:
//
// foo: >
// bar: baz
//
// or
//
// - >
// - qux
//
// - newlines are strictly LF, if the parser finds CR, it is an error
// - blank lines may not contain any whitespace characters except the single LF
// - Additional string indicator `|` for soft-wrapped strings, i.e.
//
// key: | this is not special
// key:
// | these lines are
// | soft-wrapped
//
// soft-wrapped lines are joined with a ' ' instead of a newline character.
// Like multiline strings, the final space is stripped (I guess this is a very
// janky way to add trailing whitespace to a string).
//
// - terminated strings to allow trailing whitespace:
// | this string has trailing whitespace |
// > and so does this one |
// - The parser is both strict and probably sloppy and may have weird edge
// cases since I'm slinging code, not writing a spec. For example, tabs are
// not trimmed from the values of inline lists/maps
const std = @import("std");
pub const buffers = @import("./linebuffer.zig");
pub const tokenizer = @import("./tokenizer.zig");
pub const parser = @import("./parser.zig");
pub const Parser = parser.Parser;

View File

@ -1,69 +1,8 @@
// Heavily inspired by, but not quite compatible with, NestedText. Key differences:
//
// - Doesn't support multiline keys (this means map keys cannot start with
// ' ', \t, #, {, [, |, or >, and they cannot contain :)
// - Allows using tabs for indentation (but not mixed tabs/spaces)
// - Indentation must be quantized consistently throughout the document. e.g.
// every nested layer being exactly 2 spaces past its parent. Tabs may
// only use one tab per indentation level.
// - Allows flow-style lists, maps, and strings on the same line as map keys or
// list items (i.e. the following are legal):
//
// key: {inline: map}
// key: [inline, list]
// key: > inline string
// - {map: item}
// - [list, item]
// - > inline string
//
// The string case retains the possibility of having an inline map value starting
// with {, [, or >
// - inline lists and maps cannot contain other inline structures. This may
// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful
// - map keys and list item dashes must be followed by a value or an indented
// section to reduce parser quantum state. This means that
//
// foo:
// bar: baz
//
// or
//
// -
// - qux
//
// are not valid. This can be represented with an inline empty string after foo:
//
// foo: >
// bar: baz
//
// or
//
// - >
// - qux
//
// - newlines are strictly LF, if the parser finds CR, it is an error
// - blank lines may not contain any whitespace characters except the single LF
// - Additional string indicator `|` for soft-wrapped strings, i.e.
//
// key: | this is not special
// key:
// | these lines are
// | soft-wrapped
//
// soft-wrapped lines are joined with a ' ' instead of a newline character.
// Like multiline strings, the final space is stripped (I guess this is a very
// janky way to add trailing whitespace to a string).
//
// - terminated strings to allow trailing whitespace:
// | this string has trailing whitespace |
// > and so does this one |
// - The parser is both strict and probably sloppy and may have weird edge
// cases since I'm slinging code, not writing a spec. For example, tabs are
// not trimmed from the values of inline lists/maps
const std = @import("std"); const std = @import("std");
pub const IndexSlice = struct { start: usize, len: usize }; const buffers = @import("./linebuffer.zig");
const tokenizer = @import("./tokenizer.zig");
const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize, row: usize,
@ -71,481 +10,51 @@ pub const Diagnostics = struct {
message: []const u8, message: []const u8,
}; };
pub const LineBuffer = struct { pub const Error = error{
allocator: std.mem.Allocator, UnexpectedIndent,
buffer: []u8, UnexpectedValue,
used: usize, ExtraContent,
window: IndexSlice, EmptyDocument,
DuplicateKey,
BadMapEntry,
BadState,
BadToken,
Fail,
} || tokenizer.Error || std.mem.Allocator.Error;
pub const default_capacity: usize = 4096; pub const DuplicateKeyBehavior = enum {
pub const Error = std.mem.Allocator.Error; use_first,
use_last,
fail,
};
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer { pub const DefaultObject = enum {
return initCapacity(allocator, default_capacity); scalar,
} string,
list,
map,
fail,
};
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer { const ParseState = enum { initial, value, done };
pub const Document = struct {
arena: std.heap.ArenaAllocator,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{ return .{
.allocator = allocator, .arena = std.heap.ArenaAllocator.init(alloc),
.buffer = try allocator.alloc(u8, capacity), .root = undefined,
.used = 0,
.window = .{ .start = 0, .len = 0 },
}; };
} }
pub fn feed(self: *LineBuffer, data: []const u8) Error!void { pub fn printDebug(self: Document) void {
if (data.len == 0) return; return self.root.printDebug();
// TODO: check for usize overflow here if we want Maximum Robustness
const new_window_len = self.window.len + data.len;
// data cannot fit in the buffer with our scan window, so we have to realloc
if (new_window_len > self.buffer.len) {
// TODO: adopt an overallocation strategy? Will potentially avoid allocating
// on every invocation but will cause the buffer to oversize
try self.allocator.realloc(self.buffer, new_window_len);
self.rehome();
@memcpy(self.buffer[self.used..].ptr, data);
self.used = new_window_len;
self.window.len = new_window_len;
}
// data will fit, but needs to be moved in the buffer
else if (self.window.start + new_window_len > self.buffer.len) {
self.rehome();
@memcpy(self.buffer[self.used..].ptr, data);
self.used = new_window_len;
self.window.len = new_window_len;
}
// data can simply be appended
else {
@memcpy(self.buffer[self.used..].ptr, data);
}
} }
/// The memory returned by this function is valid until the next call to `feed`. pub fn deinit(self: Document) void {
/// The resulting slice does not include the newline character. self.arena.deinit();
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
if (self.window.start >= self.buffer.len or self.window.len == 0)
return null;
const window = self.buffer[self.window.start..][0..self.window.len];
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
fn rehome(self: *LineBuffer) void {
if (self.window.start == 0) return;
const window = self.buffer[self.window.start..][0..self.window.len];
if (self.window.len > self.window.start)
std.mem.copyForwards(u8, self.buffer, window)
else
@memcpy(self.buffer.ptr, window);
self.window.start = 0;
self.used = window.len;
}
};
pub const FixedLineBuffer = struct {
buffer: []const u8,
window: IndexSlice,
pub fn init(data: []const u8) FixedLineBuffer {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
}
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
if (self.window.start >= self.buffer.len or self.window.len == 0)
return null;
const window = self.buffer[self.window.start..][0..self.window.len];
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
};
// How the current document indents. Locked in by the first indented line the
// tokenizer sees: .spaces records the detected quantum width (spaces per
// level), .tabs means one tab per level, and .immaterial means no indented
// line has been observed yet. Mixing spaces and tabs is an error.
const IndentationType = union(enum) {
    immaterial: void,
    spaces: usize,
    tabs: void,
};
// The value (if any) found on the same line as a list dash or a map key.
const InlineItem = union(enum) {
    empty: void,
    scalar: []const u8,
    // '>' string: continuation lines are joined with a newline
    line_string: []const u8,
    // '|' string: continuation lines are joined with a space
    space_string: []const u8,
    // '[...]': raw text handed to the flow parser (closing ] retained)
    flow_list: []const u8,
    // '{...}': raw text handed to the flow parser (closing } retained)
    flow_map: []const u8,

    // The join character for the string variants. Calling this on any other
    // variant is a programming error (unreachable).
    fn lineEnding(self: InlineItem) u8 {
        return switch (self) {
            .line_string => '\n',
            .space_string => ' ',
            else => unreachable,
        };
    }
};
// Classification of a single non-blank line after its indentation has been
// stripped.
const LineContents = union(enum) {
    comment: []const u8,
    // a bare value with no `- ` or `key: ` introducer
    in_line: InlineItem,
    // `- ` item; payload is whatever followed the dash
    list_item: InlineItem,
    // `key: value` item
    map_item: struct { key: []const u8, val: InlineItem },
};
// we can dedent multiple levels at once. Example:
//
// foo:
// bar:
// > a
// > string
// baz: [qux]
//
// capturing this is conceptually simple, but implementing it without complex
// indentation tracking requires quantizing the indentation. This means our
// IndentationType will also need to track the number of spaces used for
// indentation, as detected. Then every line we have to check indent rem the
// quantization level == 0 (otherwise we broke quantization) and compute indent
// div the quantization level to give us our effective indentation level.
const ShiftDirection = enum { indent, dedent, none };

// Indentation of a line relative to the previous line, in quantized levels.
// Indenting may only go one level at a time (enforced by the tokenizer), but a
// dedent can pop several levels at once, hence the payload.
const RelativeIndent = union(ShiftDirection) {
    indent: void,
    dedent: usize,
    none: void,
};

// A fully tokenized line: its relative indentation, its classified contents,
// and the indentation-stripped text it was parsed from.
const Line = struct {
    indent: RelativeIndent,
    contents: LineContents,
    raw: []const u8,
};
/// A line-oriented tokenizer, generic over its line source. `Buffer` must
/// provide `nextLine() ?[]const u8` (see LineBuffer / FixedLineBuffer). Each
/// call to `next` yields one classified `Line` with its relative indentation.
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        // indentation style of the document; fixed by the first indented line.
        indentation: IndentationType = .immaterial,
        // quantized indentation level of the previously returned line.
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        // current line number, for diagnostics.
        row: usize = 0,

        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
        };

        /// Return the next classified line, or null when the buffer has no
        /// more complete lines. Comment lines are returned but reported with
        /// `.none` indentation so they do not perturb indent tracking.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                // scan leading whitespace, locking in the indentation style
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .immaterial => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .immaterial => self.indentation = .tabs,
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // line was all whitespace: blank lines must be completely empty
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // convert raw indentation width into a quantized level count
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;
                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return .{
                            .indent = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        return .{
                            .indent = relative,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        // a dash must be alone or followed by a space
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        return if (line.len == 1) .{
                            .indent = relative,
                            .contents = .{ .list_item = .empty },
                            .raw = line,
                        } else .{
                            .indent = relative,
                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        // scan for a `: ` separator marking a map item
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                if (idx + 1 == line.len) return .{
                                    .indent = relative,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };
                                if (line[idx + 1] != ' ') return error.BadToken;

                                return .{
                                    .indent = relative,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
                            }
                        }
                        // no separator: the whole line is a bare scalar
                        return .{
                            .indent = relative,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line, i.e. what remains after any
        /// `- ` or `key: ` prefix has been removed.
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    // a trailing `|` terminates the string (preserving whitespace
                    // before it); otherwise trailing whitespace is an error.
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };

                    return if (char == '>')
                        .{ .line_string = slice }
                    else
                        .{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return .{ .scalar = buf };
                },
            }
        }
    };
}
pub const Value = union(enum) {
pub const String = std.ArrayList(u8);
pub const Map = std.StringArrayHashMap(Value);
pub const List = std.ArrayList(Value);
pub const TagType = @typeInfo(Value).Union.tag_type.?;
scalar: String,
string: String,
list: List,
flow_list: List,
map: Map,
flow_map: Map,
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .scalar, input);
}
pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .string, input);
}
inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
var res = @unionInit(Value, @tagName(classification), try String.initCapacity(alloc, input.len));
@field(res, @tagName(classification)).appendSliceAssumeCapacity(input);
return res;
}
pub inline fn newScalar(alloc: std.mem.Allocator) Value {
return .{ .scalar = String.init(alloc) };
}
pub inline fn newString(alloc: std.mem.Allocator) Value {
return .{ .string = String.init(alloc) };
}
pub inline fn newList(alloc: std.mem.Allocator) Value {
return .{ .list = List.init(alloc) };
}
pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
return .{ .flow_list = List.init(alloc) };
}
pub inline fn newMap(alloc: std.mem.Allocator) Value {
return .{ .map = Map.init(alloc) };
}
pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
return .{ .flow_map = Map.init(alloc) };
}
pub fn printDebug(self: Value) void {
self.printRecursive(0);
std.debug.print("\n", .{});
}
fn printRecursive(self: Value, indent: usize) void {
switch (self) {
.scalar, .string => |str| {
if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| {
var lines = std.mem.splitScalar(u8, str.items, '\n');
std.debug.print("\n", .{});
while (lines.next()) |line| {
std.debug.print(
"{[empty]s: >[indent]}{[line]s}{[nl]s}",
.{
.empty = "",
.indent = indent,
.line = line,
.nl = if (lines.peek() == null) "" else "\n",
},
);
}
} else {
std.debug.print("{s}", .{str.items});
}
},
.list, .flow_list => |list| {
if (list.items.len == 0) {
std.debug.print("[]", .{});
return;
}
std.debug.print("[\n", .{});
for (list.items, 0..) |value, idx| {
std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx });
value.printRecursive(indent + 2);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}]",
.{ .empty = "", .indent = indent },
);
},
.map, .flow_map => |map| {
if (map.count() == 0) {
std.debug.print("{{}}", .{});
return;
}
std.debug.print("{{\n", .{});
var iter = map.iterator();
while (iter.next()) |entry| {
std.debug.print(
"{[empty]s: >[indent]}{[key]s}: ",
.{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
);
entry.value_ptr.printRecursive(indent + 4);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}}}",
.{ .empty = "", .indent = indent },
);
},
}
} }
}; };
@ -559,64 +68,13 @@ pub const Parser = struct {
.message = "all is well", .message = "all is well",
}, },
pub const Error = error{
UnexpectedIndent,
UnexpectedValue,
ExtraContent,
EmptyDocument,
DuplicateKey,
BadMapEntry,
BadState,
BadToken,
Fail,
} || LineTokenizer(FixedLineBuffer).Error || std.mem.Allocator.Error;
pub const DuplicateKeyBehavior = enum {
use_first,
use_last,
fail,
};
pub const DefaultObject = enum {
string,
list,
map,
fail,
};
pub const ParseState = enum {
initial,
value,
done,
};
pub const Document = struct {
arena: std.heap.ArenaAllocator,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{
.arena = std.heap.ArenaAllocator.init(alloc),
.root = undefined,
};
}
pub fn printDebug(self: Document) void {
return self.root.printDebug();
}
pub fn deinit(self: Document) void {
self.arena.deinit();
}
};
pub const State = struct { pub const State = struct {
pub const Stack = std.ArrayList(*Value); pub const Stack = std.ArrayList(*Value);
document: Document, document: Document,
value_stack: Stack, value_stack: Stack,
state: ParseState = .initial, state: enum { initial, value, done } = .initial,
expect_shift: ShiftDirection = .none, expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null, dangling_key: ?[]const u8 = null,
pub fn init(alloc: std.mem.Allocator) State { pub fn init(alloc: std.mem.Allocator) State {
@ -637,13 +95,13 @@ pub const Parser = struct {
const arena_alloc = document.arena.allocator(); const arena_alloc = document.arena.allocator();
var state: ParseState = .initial; var state: ParseState = .initial;
var expect_shift: ShiftDirection = .none; var expect_shift: tokenizer.ShiftDirection = .none;
var dangling_key: ?[]const u8 = null; var dangling_key: ?[]const u8 = null;
var stack = std.ArrayList(*Value).init(arena_alloc); var stack = std.ArrayList(*Value).init(arena_alloc);
defer stack.deinit(); defer stack.deinit();
var tok: LineTokenizer(FixedLineBuffer) = .{ var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
.buffer = FixedLineBuffer.init(buffer), .buffer = buffers.FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics, .diagnostics = &self.diagnostics,
}; };
@ -656,7 +114,7 @@ pub const Parser = struct {
flipflop: while (flip) : (flop = true) { flipflop: while (flip) : (flop = true) {
switch (state) { switch (state) {
.initial => { .initial => {
if (line.indent == .indent) return error.UnexpectedIndent; if (line.shift == .indent) return error.UnexpectedIndent;
switch (line.contents) { switch (line.contents) {
// we filter out comments above // we filter out comments above
@ -737,14 +195,14 @@ pub const Parser = struct {
// switch is embedded. // switch is embedded.
.scalar, .flow_list, .flow_map => unreachable, .scalar, .flow_list, .flow_map => unreachable,
.string => |*string| { .string => |*string| {
if (line.indent == .indent) if (line.shift == .indent)
return error.UnexpectedIndent; return error.UnexpectedIndent;
if (!flop and line.indent == .dedent) { if (!flop and line.shift == .dedent) {
// kick off the last trailing space or newline // kick off the last trailing space or newline
_ = string.pop(); _ = string.pop();
var dedent_depth = line.indent.dedent; var dedent_depth = line.shift.dedent;
while (dedent_depth > 0) : (dedent_depth -= 1) while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop(); _ = stack.pop();
@ -772,7 +230,7 @@ pub const Parser = struct {
// //
// the first line here creates the expect_shift, but the second line // the first line here creates the expect_shift, but the second line
// is a valid continuation of the list despite not being indented // is a valid continuation of the list despite not being indented
if (!flop and (expect_shift == .indent and line.indent != .indent)) if (!flop and (expect_shift == .indent and line.shift != .indent))
try list.append(Value.newScalar(arena_alloc)); try list.append(Value.newScalar(arena_alloc));
// Consider: // Consider:
@ -782,11 +240,11 @@ pub const Parser = struct {
// - inline scalar // - inline scalar
// //
// the own-line scalar will not push the stack but the next list item will be a dedent // the own-line scalar will not push the stack but the next list item will be a dedent
if (!flop and line.indent == .dedent) { if (!flop and line.shift == .dedent) {
// if line.indent.dedent is 1 and we're expecting it, the stack will not be popped, // if line.shift.dedent is 1 and we're expecting it, the stack will not be popped,
// but we will continue loop flipflop. However, flop will be set to false on the next // but we will continue loop flipflop. However, flop will be set to false on the next
// trip, so this if prong will not be run again. // trip, so this if prong will not be run again.
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1) while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop(); _ = stack.pop();
@ -799,7 +257,7 @@ pub const Parser = struct {
.in_line => |in_line| { .in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when // assert that this line has been indented. this is required for an inline value when
// the stack is in list mode. // the stack is in list mode.
if (expect_shift != .indent or line.indent != .indent) if (expect_shift != .indent or line.shift != .indent)
return error.UnexpectedValue; return error.UnexpectedValue;
expect_shift = .dedent; expect_shift = .dedent;
@ -819,7 +277,7 @@ pub const Parser = struct {
} }
}, },
.list_item => |value| { .list_item => |value| {
if (flop or (line.indent == .none or line.indent == .dedent)) { if (flop or (line.shift == .none or line.shift == .dedent)) {
expect_shift = .none; expect_shift = .none;
switch (value) { switch (value) {
.empty => expect_shift = .indent, .empty => expect_shift = .indent,
@ -828,7 +286,7 @@ pub const Parser = struct {
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)), .flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)), .flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
} }
} else if (line.indent == .indent) { } else if (line.shift == .indent) {
if (expect_shift != .indent) return error.UnexpectedIndent; if (expect_shift != .indent) return error.UnexpectedIndent;
const new_list = try appendListGetValue(list, Value.newList(arena_alloc)); const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
@ -847,7 +305,7 @@ pub const Parser = struct {
// //
// dedenting back to the list stack level requires list_item // dedenting back to the list stack level requires list_item
if (line.indent != .indent) if (line.shift != .indent)
return error.UnexpectedValue; return error.UnexpectedValue;
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc)); const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
@ -865,7 +323,7 @@ pub const Parser = struct {
// //
// the first line here creates the expect_shift, but the second line // the first line here creates the expect_shift, but the second line
// is a valid continuation of the map despite not being indented // is a valid continuation of the map despite not being indented
if (!flop and (expect_shift == .indent and line.indent != .indent)) { if (!flop and (expect_shift == .indent and line.shift != .indent)) {
try putMap( try putMap(
map, map,
dangling_key orelse return error.Fail, dangling_key orelse return error.Fail,
@ -875,8 +333,8 @@ pub const Parser = struct {
dangling_key = null; dangling_key = null;
} }
if (!flop and line.indent == .dedent) { if (!flop and line.shift == .dedent) {
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1) while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop(); _ = stack.pop();
@ -889,7 +347,7 @@ pub const Parser = struct {
.in_line => |in_line| { .in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when // assert that this line has been indented. this is required for an inline value when
// the stack is in map mode. // the stack is in map mode.
if (expect_shift != .indent or line.indent != .indent or dangling_key == null) if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
return error.UnexpectedValue; return error.UnexpectedValue;
expect_shift = .dedent; expect_shift = .dedent;
@ -921,7 +379,7 @@ pub const Parser = struct {
// //
// dedenting back to the map stack level requires map_item // dedenting back to the map stack level requires map_item
if (expect_shift != .indent or line.indent != .indent or dangling_key == null) if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
return error.UnexpectedValue; return error.UnexpectedValue;
const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior); const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
@ -931,7 +389,7 @@ pub const Parser = struct {
continue :flipflop; continue :flipflop;
}, },
.map_item => |pair| { .map_item => |pair| {
if (flop or (line.indent == .none or line.indent == .dedent)) { if (flop or (line.shift == .none or line.shift == .dedent)) {
expect_shift = .none; expect_shift = .none;
const dupekey = try arena_alloc.dupe(u8, pair.key); const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) { switch (pair.val) {
@ -944,7 +402,7 @@ pub const Parser = struct {
.flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior), .flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
.flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior), .flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior),
} }
} else if (line.indent == .indent) { } else if (line.shift == .indent) {
if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue; if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue;
const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior); const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
@ -967,6 +425,7 @@ pub const Parser = struct {
switch (state) { switch (state) {
.initial => switch (self.default_object) { .initial => switch (self.default_object) {
.scalar => document.root = .{ .scalar = std.ArrayList(u8).init(arena_alloc) },
.string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) }, .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
.list => document.root = Value.newList(arena_alloc), .list => document.root = Value.newList(arena_alloc),
.map => document.root = Value.newMap(arena_alloc), .map => document.root = Value.newMap(arena_alloc),
@ -1256,47 +715,4 @@ pub const Parser = struct {
return gop.value_ptr; return gop.value_ptr;
} }
/// Tokenize `buf` and dump every parsed line to stderr for debugging.
pub fn dumpBufLines(self: *Parser, buf: []const u8) Error!void {
    var tokenizer: LineTokenizer = .{ .buffer = buf, .diagnostics = &self.diagnostics };
    while (try tokenizer.next()) |parsed| dumpLine(parsed);
}
/// Render one tokenized line to stderr in the form
/// `<shift>[ (levels)]: <contents-tag> => <payload>`.
fn dumpLine(line: LineTokenizer.Line) void {
    var dedent_buf: [64]u8 = .{0} ** 64;
    var key_buf: [2048]u8 = .{0} ** 2048;
    var val_buf: [2048]u8 = .{0} ** 2048;

    // dedents carry a level count; render it in parentheses.
    const shiftstr = if (line.indent == .dedent)
        std.fmt.bufPrint(&dedent_buf, " ({d})", .{line.indent.dedent}) catch unreachable
    else
        "";

    // stringify the payload first so the final print is a single call.
    const contents_str = switch (line.contents) {
        .comment => |str| str,
        .in_line, .list_item => |scalar| switch (scalar) {
            .empty => "[empty]",
            .scalar,
            .string,
            .flow_list,
            .flow_map,
            => |str| std.fmt.bufPrint(&key_buf, "{s} => {s}", .{ @tagName(scalar), str }) catch unreachable,
        },
        .map_item => |pair| blk: {
            const valstr = switch (pair.val) {
                .empty => "[empty]",
                .scalar,
                .string,
                .flow_list,
                .flow_map,
                => |str| std.fmt.bufPrint(&val_buf, "{s} => {s}", .{ @tagName(pair.val), str }) catch unreachable,
            };
            break :blk std.fmt.bufPrint(&key_buf, "{s} : {s}", .{ pair.key, valstr }) catch unreachable;
        },
    };

    std.debug.print("{s}{s}: {s} => {s}\n", .{
        @tagName(line.indent), shiftstr, @tagName(line.contents), contents_str,
    });
}
}; };

151
src/parser/value.zig Normal file
View File

@ -0,0 +1,151 @@
const std = @import("std");
/// A dynamically-typed document node. `scalar` and `string` own their
/// bytes; `list`/`map` own nested values. The `flow_*` variants mirror
/// `list`/`map` but record that the value came from inline ("flow")
/// syntax; `recursiveEqualsExact` treats them as distinct from the
/// block-syntax variants.
pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
    pub const Map = std.StringArrayHashMap(Value);
    pub const List = std.ArrayList(Value);
    // The enum backing this union, used for tag-only comparisons.
    pub const TagType = @typeInfo(Value).Union.tag_type.?;

    scalar: String,
    string: String,
    list: List,
    flow_list: List,
    map: Map,
    flow_map: Map,

    /// Allocate a `.scalar` containing a copy of `input`.
    pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .scalar, input);
    }

    /// Allocate a `.string` containing a copy of `input`.
    pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .string, input);
    }

    /// Shared implementation of `fromScalar`/`fromString`: build the
    /// requested variant with exact capacity, then copy `input` in.
    inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
        var res = @unionInit(Value, @tagName(classification), try String.initCapacity(alloc, input.len));
        @field(res, @tagName(classification)).appendSliceAssumeCapacity(input);
        return res;
    }

    /// Create an empty `.scalar`.
    pub inline fn newScalar(alloc: std.mem.Allocator) Value {
        return .{ .scalar = String.init(alloc) };
    }

    /// Create an empty `.string`.
    pub inline fn newString(alloc: std.mem.Allocator) Value {
        return .{ .string = String.init(alloc) };
    }

    /// Create an empty `.list`.
    pub inline fn newList(alloc: std.mem.Allocator) Value {
        return .{ .list = List.init(alloc) };
    }

    /// Create an empty `.flow_list`.
    pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
        return .{ .flow_list = List.init(alloc) };
    }

    /// Create an empty `.map`.
    pub inline fn newMap(alloc: std.mem.Allocator) Value {
        return .{ .map = Map.init(alloc) };
    }

    /// Create an empty `.flow_map`.
    pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
        return .{ .flow_map = Map.init(alloc) };
    }

    /// Deep equality that also requires identical tags: `.list` never
    /// equals `.flow_list`, and `.scalar` never equals `.string`, even
    /// with identical contents. Maps must hold equal keys and values in
    /// the same insertion order.
    pub fn recursiveEqualsExact(self: Value, other: Value) bool {
        // tag mismatch short-circuits before any payload comparison.
        if (@as(TagType, self) != other) return false;
        switch (self) {
            inline .scalar, .string => |str, tag| return std.mem.eql(u8, str.items, @field(other, @tagName(tag)).items),
            inline .list, .flow_list => |lst, tag| {
                const olst = @field(other, @tagName(tag));
                if (lst.items.len != olst.items.len) return false;
                for (lst.items, olst.items) |this, that| if (!this.recursiveEqualsExact(that)) return false;
                return true;
            },
            inline .map, .flow_map => |map, tag| {
                const omap = @field(other, @tagName(tag));
                if (map.count() != omap.count()) return false;
                var iter = map.iterator();
                var oiter = omap.iterator();
                // this loop structure enforces that the maps are in the same order
                while (iter.next()) |this| {
                    const that = oiter.next() orelse return false;
                    if (!std.mem.eql(u8, this.key_ptr.*, that.key_ptr.*) or !this.value_ptr.recursiveEqualsExact(that.value_ptr.*)) return false;
                }
                // the maps are equal if we have also consumed all of the values from
                // other.
                return oiter.next() == null;
            },
        }
    }

    /// Dump this value to stderr, followed by a trailing newline.
    pub fn printDebug(self: Value) void {
        self.printRecursive(0);
        std.debug.print("\n", .{});
    }

    /// Recursive worker for `printDebug`. `indent` is the current left
    /// margin in spaces; list elements indent by 2 more, map values by 4.
    fn printRecursive(self: Value, indent: usize) void {
        switch (self) {
            .scalar, .string => |str| {
                // multi-line strings are printed one line at a time so
                // each continuation line can carry the indent margin.
                if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| {
                    var lines = std.mem.splitScalar(u8, str.items, '\n');
                    std.debug.print("\n", .{});
                    while (lines.next()) |line| {
                        std.debug.print(
                            "{[empty]s: >[indent]}{[line]s}{[nl]s}",
                            .{
                                .empty = "",
                                .indent = indent,
                                .line = line,
                                // no trailing newline after the final line.
                                .nl = if (lines.peek() == null) "" else "\n",
                            },
                        );
                    }
                } else {
                    std.debug.print("{s}", .{str.items});
                }
            },
            .list, .flow_list => |list| {
                if (list.items.len == 0) {
                    std.debug.print("[]", .{});
                    return;
                }

                std.debug.print("[\n", .{});
                for (list.items, 0..) |value, idx| {
                    std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx });
                    value.printRecursive(indent + 2);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}]",
                    .{ .empty = "", .indent = indent },
                );
            },
            .map, .flow_map => |map| {
                if (map.count() == 0) {
                    std.debug.print("{{}}", .{});
                    return;
                }

                std.debug.print("{{\n", .{});
                var iter = map.iterator();
                while (iter.next()) |entry| {
                    std.debug.print(
                        "{[empty]s: >[indent]}{[key]s}: ",
                        .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
                    );
                    entry.value_ptr.printRecursive(indent + 4);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}}}",
                    .{ .empty = "", .indent = indent },
                );
            },
        }
    }
};

251
src/tokenizer.zig Normal file
View File

@ -0,0 +1,251 @@
const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
/// Errors the line tokenizer can produce while scanning a document.
/// `Impossible` marks states the tokenizer believes unreachable.
pub const Error = error{
    BadToken,
    MixedIndentation,
    UnquantizedIndentation,
    TooMuchIndentation,
    MissingNewline,
    TrailingWhitespace,
    Impossible,
};
/// The whitespace style detected from the document's leading indentation.
/// `spaces` carries the number of spaces per indentation level (0 until
/// the first indented line fixes the quantum).
pub const DetectedIndentation = union(enum) {
    unknown: void,
    spaces: usize,
    tabs: void,
};
/// The value portion of a line: nothing, a plain scalar, one line of a
/// multi-line string (`line_string`/`space_string`), or the raw text of
/// a flow collection (closing bracket retained for the flow parser).
pub const InlineItem = union(enum) {
    empty: void,
    scalar: []const u8,
    line_string: []const u8,
    space_string: []const u8,
    flow_list: []const u8,
    flow_map: []const u8,

    /// The character used to join consecutive string lines: newline for
    /// `line_string`, space for `space_string`. Calling this on any
    /// other variant is illegal (checked `unreachable`).
    pub fn lineEnding(self: InlineItem) u8 {
        return switch (self) {
            .line_string => '\n',
            .space_string => ' ',
            else => unreachable,
        };
    }
};
/// What a line contains once its indentation has been stripped.
pub const LineContents = union(enum) {
    comment: []const u8,
    in_line: InlineItem,
    list_item: InlineItem,
    map_item: struct { key: []const u8, val: InlineItem },
};
/// Relative indentation change of a line versus the previous line.
pub const ShiftDirection = enum { indent, dedent, none };
/// `ShiftDirection` with payload: a dedent records how many levels were
/// dropped at once.
pub const LineShift = union(ShiftDirection) {
    indent: void,
    // we can dedent multiple levels at once.
    dedent: usize,
    none: void,
};
/// One tokenized line: its indentation shift, its parsed contents, and
/// the raw text with leading indentation removed.
pub const Line = struct {
    shift: LineShift,
    contents: LineContents,
    raw: []const u8,
};
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a `nextLine` method
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        // indentation style is fixed by the first indented line and must
        // stay consistent for the rest of the document.
        indentation: DetectedIndentation = .unknown,
        // quantized indentation level of the previous tokenized line.
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        // count of lines tokenized so far.
        row: usize = 0,

        /// Consume and tokenize the next line from the buffer. Returns
        /// null when the buffer is exhausted. Truly empty lines are
        /// skipped; whitespace-only lines are `error.TrailingWhitespace`.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                // measure leading whitespace, enforcing one indentation
                // character (spaces or tabs) for the whole document.
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // the loop ran off the end: every character was
                    // whitespace. Skip empty lines, reject lines that
                    // contain only whitespace.
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // convert raw indent width into whole indentation levels.
                // The first space-indented line fixes the quantum.
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                const shift: LineShift = if (quantized > self.last_indent) rel: {
                    // indenting is only allowed one level at a time.
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // update state on every return path out of this iteration.
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // force comments to be followed by a space. This makes them
                        // behave the same way as strings, actually.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return .{
                            .shift = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        // a list item is either a bare `-` or `- <value>`.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        return if (line.len == 1) .{
                            .shift = shift,
                            .contents = .{ .list_item = .empty },
                            .raw = line,
                        } else .{
                            .shift = shift,
                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        // scan for a `:` map separator; if none is found,
                        // the whole line is a bare scalar.
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                // `key:` at end of line means an empty value.
                                if (idx + 1 == line.len) return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };

                                // the separator must be `: ` (colon-space).
                                if (line[idx + 1] != ' ') return error.BadToken;

                                return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
                            }
                        }

                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line: string leaders (`>` for
        /// line strings, `|` for space strings), flow collections
        /// (`[...]`, `{...}`), or a plain scalar. Empty input is `.empty`.
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    // the leader must be followed by a space (or stand alone).
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        // a trailing `|` is stripped from the returned slice.
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };

                    return if (char == '>')
                        .{ .line_string = slice }
                    else
                        .{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return .{ .scalar = buf };
                },
            }
        }
    };
}