From 38e47b39dc4d3c0c8c2fcb50c2f458021cb86407 Mon Sep 17 00:00:00 2001 From: torque Date: Sun, 24 Sep 2023 18:22:12 -0700 Subject: [PATCH] all: do some restructuring I don't like big monolithic source files, so let's restructure a bit. parser.zig is still bigger than I would like it to be, but there isn't a good way to break up the two state machine parsers, which take up most of the space. This is the last junk commit before I am seriously going to implement the "streaming" parser. Which is the last change before implementing deserialization to object. I am definitely not just spinning my wheels here. --- build.zig | 2 +- src/linebuffer.zig | 105 +++++ src/nice.zig | 67 ++++ src/{config.zig => parser.zig} | 712 +++------------------------------ src/parser/value.zig | 151 +++++++ src/tokenizer.zig | 251 ++++++++++++ 6 files changed, 639 insertions(+), 649 deletions(-) create mode 100644 src/linebuffer.zig create mode 100644 src/nice.zig rename src/{config.zig => parser.zig} (57%) create mode 100644 src/parser/value.zig create mode 100644 src/tokenizer.zig diff --git a/build.zig b/build.zig index 2fcea8d..caf97f7 100644 --- a/build.zig +++ b/build.zig @@ -4,7 +4,7 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const nice = b.addModule("nice", .{ - .source_file = .{ .path = "src/config.zig" }, + .source_file = .{ .path = "src/nice.zig" }, }); add_examples(b, .{ diff --git a/src/linebuffer.zig b/src/linebuffer.zig new file mode 100644 index 0000000..1f50e13 --- /dev/null +++ b/src/linebuffer.zig @@ -0,0 +1,105 @@ +const std = @import("std"); + +pub const IndexSlice = struct { start: usize, len: usize }; + +pub const LineBuffer = struct { + allocator: std.mem.Allocator, + internal: FixedLineBuffer, + used: usize, + + pub const default_capacity: usize = 4096; + pub const Error = std.mem.Allocator.Error; + + pub fn init(allocator: std.mem.Allocator) Error!LineBuffer { + return initCapacity(allocator, default_capacity); + } + + pub fn 
initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer { + return .{ + .allocator = allocator, + .internal = .{ + .buffer = try allocator.alloc(u8, capacity), + .window = .{ .start = 0, .len = 0 }, + }, + .used = 0, + }; + } + + pub fn feed(self: *LineBuffer, data: []const u8) Error!void { + if (data.len == 0) return; + // TODO: check for usize overflow here if we want Maximum Robustness + const new_window_len = self.internal.window.len + data.len; + + // data cannot fit in the buffer with our scan window, so we have to realloc + if (new_window_len > self.internal.buffer.len) { + // TODO: adopt an overallocation strategy? Will potentially avoid allocating + // on every invocation but will cause the buffer to oversize + try self.allocator.realloc(self.internal.buffer, new_window_len); + self.rehome(); + @memcpy(self.internal.buffer[self.used..].ptr, data); + self.used = new_window_len; + self.internal.window.len = new_window_len; + } + // data will fit, but needs to be moved in the buffer + else if (self.internal.window.start + new_window_len > self.internal.buffer.len) { + self.rehome(); + @memcpy(self.internal.buffer[self.used..].ptr, data); + self.used = new_window_len; + self.internal.window.len = new_window_len; + } + // data can simply be appended + else { + @memcpy(self.internal.buffer[self.used..].ptr, data); + } + } + + /// The memory returned by this function is valid until the next call to `feed`. + /// The resulting slice does not include the newline character. 
+ pub fn nextLine(self: *LineBuffer) ?[]const u8 { + return self.internal.nextLine(); + } + + fn rehome(self: *LineBuffer) void { + self.internal.rehome(); + self.used = self.internal.window.len; + } +}; + +pub const FixedLineBuffer = struct { + buffer: []const u8, + window: IndexSlice, + + pub fn init(data: []const u8) FixedLineBuffer { + return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; + } + + pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 { + if (self.window.start >= self.buffer.len or self.window.len == 0) + return null; + + const window = self.buffer[self.window.start..][0..self.window.len]; + const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; + + self.window.start += split + 1; + self.window.len -= split + 1; + + return window[0..split]; + } + + // move the current scan window to the beginning of the buffer. This internal + // method is used by LineBuffer. + fn rehome(self: *LineBuffer) usize { + if (self.window.start == 0) return; + + const window = self.buffer[self.window.start..][0..self.window.len]; + + // if the window is longer than its starting index, the memory move will be + // overlapping, so we can't use memcpy + if (self.window.len > self.window.start) + std.mem.copyForwards(u8, self.buffer, window) + else + @memcpy(self.buffer.ptr, window); + + self.window.start = 0; + } +}; diff --git a/src/nice.zig b/src/nice.zig new file mode 100644 index 0000000..3920756 --- /dev/null +++ b/src/nice.zig @@ -0,0 +1,67 @@ +// Heavily inspired by, but not quite compatible with, NestedText. Key differences: +// +// - Doesn't support multiline keys (this means map keys cannot start with +// ' ', \t, #, {, [, |, or >, and they cannot contain :) +// - Allows using tabs for indentation (but not mixed tabs/spaces) +// - Indentation must be quantized consistently throughout the document. e.g. +// every nested layer being exactly 2 spaces past its parent. Tabs may +// only use one tab per indentation level. 
+// - Allows flow-style lists, maps, and strings on the same line as map keys or +// list items (i.e. the following are legal): +// +// key: {inline: map} +// key: [inline, list] +// key: > inline string +// - {map: item} +// - [list, item] +// - > inline string +// +// The string case retains the possibility of having an inline map value starting +// with {, [, or > +// - a map keys and list item dashes must be followed by a value or an indented +// section to reduce parser quantum state. This means that +// +// foo: +// bar: baz +// +// or +// +// - +// - qux +// +// are not valid. This can be represented with an inline empty string after foo: +// +// foo: > +// bar: baz +// +// or +// +// - > +// - qux +// +// - newlines are strictly LF, if the parser finds CR, it is an error +// - blank lines may not contain any whitespace characters except the single LF +// - Additional string indicator `|` for soft-wrapped strings, i.e. +// +// key: | this is not special +// key: +// | these lines are +// | soft-wrapped +// +// soft-wrapped lines are joined with a ' ' instead of a newline character. +// Like multiline strings, the final space is stripped (I guess this is a very +// janky way to add trailing whitespace to a string). +// +// - terminated strings to allow trailing whitespace: +// | this string has trailing whitespace | +// > and so does this one | +// - The parser is both strict and probably sloppy and may have weird edge +// cases since I'm slinging code, not writing a spec. 
For example, tabs are +// not trimmed from the values of inline lists/maps + +const std = @import("std"); + +pub const buffers = @import("./linebuffer.zig"); +pub const tokenizer = @import("./tokenizer.zig"); +pub const parser = @import("./parser.zig"); +pub const Parser = parser.Parser; diff --git a/src/config.zig b/src/parser.zig similarity index 57% rename from src/config.zig rename to src/parser.zig index 3d297ec..34ec496 100644 --- a/src/config.zig +++ b/src/parser.zig @@ -1,69 +1,8 @@ -// Heavily inspired by, but not quite compatible with, NestedText. Key differences: -// -// - Doesn't support multiline keys (this means map keys cannot start with -// ' ', \t, #, {, [, |, or >, and they cannot contain :) -// - Allows using tabs for indentation (but not mixed tabs/spaces) -// - Indentation must be quantized consistently throughout the document. e.g. -// every nested layer being exactly 2 spaces past its parent. Tabs may -// only use one tab per indentation level. -// - Allows flow-style lists, maps, and strings on the same line as map keys or -// list items (i.e. the following are legal): -// -// key: {inline: map} -// key: [inline, list] -// key: > inline string -// - {map: item} -// - [list, item] -// - > inline string -// -// The string case retains the possibility of having an inline map value starting -// with {, [, or > -// - inline lists and maps cannot contain other inline structures. This may -// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful -// - a map keys and list item dashes must be followed by a value or an indented -// section to reduce parser quantum state. This means that -// -// foo: -// bar: baz -// -// or -// -// - -// - qux -// -// are not valid. 
This can be represented with an inline empty string after foo: -// -// foo: > -// bar: baz -// -// or -// -// - > -// - qux -// -// - newlines are strictly LF, if the parser finds CR, it is an error -// - blank lines may not contain any whitespace characters except the single LF -// - Additional string indicator `|` for soft-wrapped strings, i.e. -// -// key: | this is not special -// key: -// | these lines are -// | soft-wrapped -// -// soft-wrapped lines are joined with a ' ' instead of a newline character. -// Like multiline strings, the final space is stripped (I guess this is a very -// janky way to add trailing whitespace to a string). -// -// - terminated strings to allow trailing whitespace: -// | this string has trailing whitespace | -// > and so does this one | -// - The parser is both strict and probably sloppy and may have weird edge -// cases since I'm slinging code, not writing a spec. For example, tabs are -// not trimmed from the values of inline lists/maps - const std = @import("std"); -pub const IndexSlice = struct { start: usize, len: usize }; +const buffers = @import("./linebuffer.zig"); +const tokenizer = @import("./tokenizer.zig"); +const Value = @import("./parser/value.zig").Value; pub const Diagnostics = struct { row: usize, @@ -71,481 +10,51 @@ pub const Diagnostics = struct { message: []const u8, }; -pub const LineBuffer = struct { - allocator: std.mem.Allocator, - buffer: []u8, - used: usize, - window: IndexSlice, +pub const Error = error{ + UnexpectedIndent, + UnexpectedValue, + ExtraContent, + EmptyDocument, + DuplicateKey, + BadMapEntry, + BadState, + BadToken, + Fail, +} || tokenizer.Error || std.mem.Allocator.Error; - pub const default_capacity: usize = 4096; - pub const Error = std.mem.Allocator.Error; +pub const DuplicateKeyBehavior = enum { + use_first, + use_last, + fail, +}; - pub fn init(allocator: std.mem.Allocator) Error!LineBuffer { - return initCapacity(allocator, default_capacity); - } +pub const DefaultObject = enum { + 
scalar, + string, + list, + map, + fail, +}; - pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer { +const ParseState = enum { initial, value, done }; + +pub const Document = struct { + arena: std.heap.ArenaAllocator, + root: Value, + + pub fn init(alloc: std.mem.Allocator) Document { return .{ - .allocator = allocator, - .buffer = try allocator.alloc(u8, capacity), - .used = 0, - .window = .{ .start = 0, .len = 0 }, + .arena = std.heap.ArenaAllocator.init(alloc), + .root = undefined, }; } - pub fn feed(self: *LineBuffer, data: []const u8) Error!void { - if (data.len == 0) return; - // TODO: check for usize overflow here if we want Maximum Robustness - const new_window_len = self.window.len + data.len; - - // data cannot fit in the buffer with our scan window, so we have to realloc - if (new_window_len > self.buffer.len) { - // TODO: adopt an overallocation strategy? Will potentially avoid allocating - // on every invocation but will cause the buffer to oversize - try self.allocator.realloc(self.buffer, new_window_len); - self.rehome(); - @memcpy(self.buffer[self.used..].ptr, data); - self.used = new_window_len; - self.window.len = new_window_len; - } - // data will fit, but needs to be moved in the buffer - else if (self.window.start + new_window_len > self.buffer.len) { - self.rehome(); - @memcpy(self.buffer[self.used..].ptr, data); - self.used = new_window_len; - self.window.len = new_window_len; - } - // data can simply be appended - else { - @memcpy(self.buffer[self.used..].ptr, data); - } + pub fn printDebug(self: Document) void { + return self.root.printDebug(); } - /// The memory returned by this function is valid until the next call to `feed`. - /// The resulting slice does not include the newline character. 
- pub fn nextLine(self: *LineBuffer) ?[]const u8 { - if (self.window.start >= self.buffer.len or self.window.len == 0) - return null; - - const window = self.buffer[self.window.start..][0..self.window.len]; - const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; - - self.window.start += split + 1; - self.window.len -= split + 1; - - return window[0..split]; - } - - fn rehome(self: *LineBuffer) void { - if (self.window.start == 0) return; - - const window = self.buffer[self.window.start..][0..self.window.len]; - - if (self.window.len > self.window.start) - std.mem.copyForwards(u8, self.buffer, window) - else - @memcpy(self.buffer.ptr, window); - - self.window.start = 0; - self.used = window.len; - } -}; - -pub const FixedLineBuffer = struct { - buffer: []const u8, - window: IndexSlice, - - pub fn init(data: []const u8) FixedLineBuffer { - return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; - } - - pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 { - if (self.window.start >= self.buffer.len or self.window.len == 0) - return null; - - const window = self.buffer[self.window.start..][0..self.window.len]; - const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; - - self.window.start += split + 1; - self.window.len -= split + 1; - - return window[0..split]; - } -}; - -const IndentationType = union(enum) { - immaterial: void, - spaces: usize, - tabs: void, -}; - -const InlineItem = union(enum) { - empty: void, - scalar: []const u8, - line_string: []const u8, - space_string: []const u8, - - flow_list: []const u8, - flow_map: []const u8, - - fn lineEnding(self: InlineItem) u8 { - return switch (self) { - .line_string => '\n', - .space_string => ' ', - else => unreachable, - }; - } -}; - -const LineContents = union(enum) { - comment: []const u8, - - in_line: InlineItem, - list_item: InlineItem, - map_item: struct { key: []const u8, val: InlineItem }, -}; - -// we can dedent multiple levels at once. 
Example: -// -// foo: -// bar: -// > a -// > string -// baz: [qux] -// -// capturing this is conceptually simple, but implementing it without complex -// indentation tracking requires quantizing the indentation. This means our -// IndentationType will also need to track the number of spaces used for -// indentation, as detected. Then every line we have to check indent rem the -// quantization level == 0 (otherwise we broke quantization) and compute indent -// div the quantization level to give us our effective indentation level. - -const ShiftDirection = enum { indent, dedent, none }; -const RelativeIndent = union(ShiftDirection) { - indent: void, - dedent: usize, - none: void, -}; - -const Line = struct { - indent: RelativeIndent, - contents: LineContents, - raw: []const u8, -}; - -pub fn LineTokenizer(comptime Buffer: type) type { - return struct { - buffer: Buffer, - index: usize = 0, - indentation: IndentationType = .immaterial, - last_indent: usize = 0, - diagnostics: *Diagnostics, - row: usize = 0, - - const Error = error{ - BadToken, - MixedIndentation, - UnquantizedIndentation, - TooMuchIndentation, - MissingNewline, - TrailingWhitespace, - Impossible, - }; - - pub fn next(self: *@This()) Error!?Line { - lineloop: while (self.buffer.nextLine()) |raw_line| { - var indent: usize = 0; - for (raw_line, 0..) |char, idx| { - switch (char) { - ' ' => { - switch (self.indentation) { - // There's a weird coupling here because we can't set this until - // all spaces have been consumed. I also thought about ignoring - // spaces on comment lines since those don't affect the - // relative indent/dedent, but then we would allow comments - // to ignore our indent quantum, which I dislike due to it making - // ugly documents. 
- .immaterial => self.indentation = .{ .spaces = 0 }, - .spaces => {}, - .tabs => return error.MixedIndentation, - } - }, - '\t' => { - switch (self.indentation) { - .immaterial => self.indentation = .tabs, - .spaces => return error.MixedIndentation, - .tabs => {}, - } - }, - '\r' => { - return error.BadToken; - }, - else => { - indent = idx; - break; - }, - } - } else { - if (raw_line.len > 0) return error.TrailingWhitespace; - continue :lineloop; - } - - var quantized: usize = if (self.indentation == .spaces) quant: { - if (self.indentation.spaces == 0) { - self.indentation.spaces = indent; - } - if (@rem(indent, self.indentation.spaces) != 0) - return error.UnquantizedIndentation; - - break :quant @divExact(indent, self.indentation.spaces); - } else indent; - - const relative: RelativeIndent = if (quantized > self.last_indent) rel: { - if ((quantized - self.last_indent) > 1) - return error.TooMuchIndentation; - break :rel .indent; - } else if (quantized < self.last_indent) - .{ .dedent = self.last_indent - quantized } - else - .none; - - defer { - self.row += 1; - self.last_indent = quantized; - } - - const line = raw_line[indent..]; - - // this should not be possible, as empty lines are caught earlier. - if (line.len == 0) return error.Impossible; - - switch (line[0]) { - '#' => { - // simply lie about indentation when the line is a comment. - quantized = self.last_indent; - return .{ - .indent = .none, - .contents = .{ .comment = line[1..] }, - .raw = line, - }; - }, - '|', '>', '[', '{' => { - return .{ - .indent = relative, - .contents = .{ .in_line = try detectInlineItem(line) }, - .raw = line, - }; - }, - '-' => { - if (line.len > 1 and line[1] != ' ') return error.BadToken; - - return if (line.len == 1) .{ - .indent = relative, - .contents = .{ .list_item = .empty }, - .raw = line, - } else .{ - .indent = relative, - .contents = .{ .list_item = try detectInlineItem(line[2..]) }, - .raw = line, - }; - }, - else => { - for (line, 0..) 
|char, idx| { - if (char == ':') { - if (idx + 1 == line.len) return .{ - .indent = relative, - .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, - .raw = line, - }; - - if (line[idx + 1] != ' ') return error.BadToken; - - return .{ - .indent = relative, - .contents = .{ .map_item = .{ - .key = line[0..idx], - .val = try detectInlineItem(line[idx + 2 ..]), - } }, - .raw = line, - }; - } - } - - return .{ - .indent = relative, - .contents = .{ .in_line = .{ .scalar = line } }, - .raw = line, - }; - }, - } - - // somehow everything else has failed - return error.Impossible; - } - return null; - } - - fn detectInlineItem(buf: []const u8) Error!InlineItem { - if (buf.len == 0) return .empty; - - switch (buf[0]) { - '>', '|' => |char| { - if (buf.len > 1 and buf[1] != ' ') return error.BadToken; - - const slice: []const u8 = switch (buf[buf.len - 1]) { - ' ', '\t' => return error.TrailingWhitespace, - '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)], - else => buf[@min(2, buf.len)..buf.len], - }; - - return if (char == '>') - .{ .line_string = slice } - else - .{ .space_string = slice }; - }, - '[' => { - if (buf.len < 2 or buf[buf.len - 1] != ']') - return error.BadToken; - - // keep the closing ] for the flow parser - return .{ .flow_list = buf[1..] }; - }, - '{' => { - if (buf.len < 2 or buf[buf.len - 1] != '}') - return error.BadToken; - - // keep the closing } fpr the flow parser - return .{ .flow_map = buf[1..] 
}; - }, - else => { - if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') - return error.TrailingWhitespace; - - return .{ .scalar = buf }; - }, - } - } - }; -} - -pub const Value = union(enum) { - pub const String = std.ArrayList(u8); - pub const Map = std.StringArrayHashMap(Value); - pub const List = std.ArrayList(Value); - pub const TagType = @typeInfo(Value).Union.tag_type.?; - - scalar: String, - string: String, - list: List, - flow_list: List, - map: Map, - flow_map: Map, - - pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value { - return try _fromScalarOrString(alloc, .scalar, input); - } - - pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value { - return try _fromScalarOrString(alloc, .string, input); - } - - inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value { - var res = @unionInit(Value, @tagName(classification), try String.initCapacity(alloc, input.len)); - @field(res, @tagName(classification)).appendSliceAssumeCapacity(input); - return res; - } - - pub inline fn newScalar(alloc: std.mem.Allocator) Value { - return .{ .scalar = String.init(alloc) }; - } - - pub inline fn newString(alloc: std.mem.Allocator) Value { - return .{ .string = String.init(alloc) }; - } - - pub inline fn newList(alloc: std.mem.Allocator) Value { - return .{ .list = List.init(alloc) }; - } - - pub inline fn newFlowList(alloc: std.mem.Allocator) Value { - return .{ .flow_list = List.init(alloc) }; - } - - pub inline fn newMap(alloc: std.mem.Allocator) Value { - return .{ .map = Map.init(alloc) }; - } - - pub inline fn newFlowMap(alloc: std.mem.Allocator) Value { - return .{ .flow_map = Map.init(alloc) }; - } - - pub fn printDebug(self: Value) void { - self.printRecursive(0); - std.debug.print("\n", .{}); - } - - fn printRecursive(self: Value, indent: usize) void { - switch (self) { - .scalar, .string => |str| { - if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| 
{ - var lines = std.mem.splitScalar(u8, str.items, '\n'); - std.debug.print("\n", .{}); - while (lines.next()) |line| { - std.debug.print( - "{[empty]s: >[indent]}{[line]s}{[nl]s}", - .{ - .empty = "", - .indent = indent, - .line = line, - .nl = if (lines.peek() == null) "" else "\n", - }, - ); - } - } else { - std.debug.print("{s}", .{str.items}); - } - }, - .list, .flow_list => |list| { - if (list.items.len == 0) { - std.debug.print("[]", .{}); - return; - } - - std.debug.print("[\n", .{}); - for (list.items, 0..) |value, idx| { - std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx }); - value.printRecursive(indent + 2); - std.debug.print(",\n", .{}); - } - std.debug.print( - "{[empty]s: >[indent]}]", - .{ .empty = "", .indent = indent }, - ); - }, - .map, .flow_map => |map| { - if (map.count() == 0) { - std.debug.print("{{}}", .{}); - return; - } - - std.debug.print("{{\n", .{}); - - var iter = map.iterator(); - - while (iter.next()) |entry| { - std.debug.print( - "{[empty]s: >[indent]}{[key]s}: ", - .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* }, - ); - entry.value_ptr.printRecursive(indent + 4); - std.debug.print(",\n", .{}); - } - std.debug.print( - "{[empty]s: >[indent]}}}", - .{ .empty = "", .indent = indent }, - ); - }, - } + pub fn deinit(self: Document) void { + self.arena.deinit(); } }; @@ -559,64 +68,13 @@ pub const Parser = struct { .message = "all is well", }, - pub const Error = error{ - UnexpectedIndent, - UnexpectedValue, - ExtraContent, - EmptyDocument, - DuplicateKey, - BadMapEntry, - BadState, - BadToken, - Fail, - } || LineTokenizer(FixedLineBuffer).Error || std.mem.Allocator.Error; - - pub const DuplicateKeyBehavior = enum { - use_first, - use_last, - fail, - }; - - pub const DefaultObject = enum { - string, - list, - map, - fail, - }; - - pub const ParseState = enum { - initial, - value, - done, - }; - - pub const Document = struct { - arena: std.heap.ArenaAllocator, - root: 
Value, - - pub fn init(alloc: std.mem.Allocator) Document { - return .{ - .arena = std.heap.ArenaAllocator.init(alloc), - .root = undefined, - }; - } - - pub fn printDebug(self: Document) void { - return self.root.printDebug(); - } - - pub fn deinit(self: Document) void { - self.arena.deinit(); - } - }; - pub const State = struct { pub const Stack = std.ArrayList(*Value); document: Document, value_stack: Stack, - state: ParseState = .initial, - expect_shift: ShiftDirection = .none, + state: enum { initial, value, done } = .initial, + expect_shift: tokenizer.ShiftDirection = .none, dangling_key: ?[]const u8 = null, pub fn init(alloc: std.mem.Allocator) State { @@ -637,13 +95,13 @@ pub const Parser = struct { const arena_alloc = document.arena.allocator(); var state: ParseState = .initial; - var expect_shift: ShiftDirection = .none; + var expect_shift: tokenizer.ShiftDirection = .none; var dangling_key: ?[]const u8 = null; var stack = std.ArrayList(*Value).init(arena_alloc); defer stack.deinit(); - var tok: LineTokenizer(FixedLineBuffer) = .{ - .buffer = FixedLineBuffer.init(buffer), + var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{ + .buffer = buffers.FixedLineBuffer.init(buffer), .diagnostics = &self.diagnostics, }; @@ -656,7 +114,7 @@ pub const Parser = struct { flipflop: while (flip) : (flop = true) { switch (state) { .initial => { - if (line.indent == .indent) return error.UnexpectedIndent; + if (line.shift == .indent) return error.UnexpectedIndent; switch (line.contents) { // we filter out comments above @@ -737,14 +195,14 @@ pub const Parser = struct { // switch is embedded. 
.scalar, .flow_list, .flow_map => unreachable, .string => |*string| { - if (line.indent == .indent) + if (line.shift == .indent) return error.UnexpectedIndent; - if (!flop and line.indent == .dedent) { + if (!flop and line.shift == .dedent) { // kick off the last trailing space or newline _ = string.pop(); - var dedent_depth = line.indent.dedent; + var dedent_depth = line.shift.dedent; while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -772,7 +230,7 @@ pub const Parser = struct { // // the first line here creates the expect_shift, but the second line // is a valid continuation of the list despite not being indented - if (!flop and (expect_shift == .indent and line.indent != .indent)) + if (!flop and (expect_shift == .indent and line.shift != .indent)) try list.append(Value.newScalar(arena_alloc)); // Consider: @@ -782,11 +240,11 @@ pub const Parser = struct { // - inline scalar // // the own-line scalar will not push the stack but the next list item will be a dedent - if (!flop and line.indent == .dedent) { - // if line.indent.dedent is 1 and we're expecting it, the stack will not be popped, + if (!flop and line.shift == .dedent) { + // if line.shift.dedent is 1 and we're expecting it, the stack will not be popped, // but we will continue loop flipflop. However, flop will be set to false on the next // trip, so this if prong will not be run again. - var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); + var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent); while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -799,7 +257,7 @@ pub const Parser = struct { .in_line => |in_line| { // assert that this line has been indented. this is required for an inline value when // the stack is in list mode. 
- if (expect_shift != .indent or line.indent != .indent) + if (expect_shift != .indent or line.shift != .indent) return error.UnexpectedValue; expect_shift = .dedent; @@ -819,7 +277,7 @@ pub const Parser = struct { } }, .list_item => |value| { - if (flop or (line.indent == .none or line.indent == .dedent)) { + if (flop or (line.shift == .none or line.shift == .dedent)) { expect_shift = .none; switch (value) { .empty => expect_shift = .indent, @@ -828,7 +286,7 @@ pub const Parser = struct { .flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)), .flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)), } - } else if (line.indent == .indent) { + } else if (line.shift == .indent) { if (expect_shift != .indent) return error.UnexpectedIndent; const new_list = try appendListGetValue(list, Value.newList(arena_alloc)); @@ -847,7 +305,7 @@ pub const Parser = struct { // // dedenting back to the list stack level requires list_item - if (line.indent != .indent) + if (line.shift != .indent) return error.UnexpectedValue; const new_map = try appendListGetValue(list, Value.newMap(arena_alloc)); @@ -865,7 +323,7 @@ pub const Parser = struct { // // the first line here creates the expect_shift, but the second line // is a valid continuation of the map despite not being indented - if (!flop and (expect_shift == .indent and line.indent != .indent)) { + if (!flop and (expect_shift == .indent and line.shift != .indent)) { try putMap( map, dangling_key orelse return error.Fail, @@ -875,8 +333,8 @@ pub const Parser = struct { dangling_key = null; } - if (!flop and line.indent == .dedent) { - var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); + if (!flop and line.shift == .dedent) { + var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent); while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -889,7 +347,7 @@ pub const Parser = struct { 
.in_line => |in_line| { // assert that this line has been indented. this is required for an inline value when // the stack is in map mode. - if (expect_shift != .indent or line.indent != .indent or dangling_key == null) + if (expect_shift != .indent or line.shift != .indent or dangling_key == null) return error.UnexpectedValue; expect_shift = .dedent; @@ -921,7 +379,7 @@ pub const Parser = struct { // // dedenting back to the map stack level requires map_item - if (expect_shift != .indent or line.indent != .indent or dangling_key == null) + if (expect_shift != .indent or line.shift != .indent or dangling_key == null) return error.UnexpectedValue; const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior); @@ -931,7 +389,7 @@ pub const Parser = struct { continue :flipflop; }, .map_item => |pair| { - if (flop or (line.indent == .none or line.indent == .dedent)) { + if (flop or (line.shift == .none or line.shift == .dedent)) { expect_shift = .none; const dupekey = try arena_alloc.dupe(u8, pair.key); switch (pair.val) { @@ -944,7 +402,7 @@ pub const Parser = struct { .flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior), .flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior), } - } else if (line.indent == .indent) { + } else if (line.shift == .indent) { if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue; const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior); @@ -967,6 +425,7 @@ pub const Parser = struct { switch (state) { .initial => switch (self.default_object) { + .scalar => document.root = .{ .scalar = std.ArrayList(u8).init(arena_alloc) }, .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) }, .list => document.root = Value.newList(arena_alloc), .map => document.root = 
/// Intermediate representation of a parsed document. A `Value` is a tagged
/// union distinguishing scalars from strings and block-style collections from
/// flow-style collections, since the parser treats those forms differently.
pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
    pub const Map = std.StringArrayHashMap(Value);
    pub const List = std.ArrayList(Value);
    pub const TagType = @typeInfo(Value).Union.tag_type.?;

    scalar: String,
    string: String,
    list: List,
    flow_list: List,
    map: Map,
    flow_map: Map,

    /// Allocate a `.scalar` value holding a copy of `input`.
    pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .scalar, input);
    }

    /// Allocate a `.string` value holding a copy of `input`.
    pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .string, input);
    }

    /// Shared implementation for `fromScalar` and `fromString`: copy `input`
    /// into a String sized exactly to fit, then wrap it in the requested tag.
    inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
        var storage = try String.initCapacity(alloc, input.len);
        storage.appendSliceAssumeCapacity(input);
        return @unionInit(Value, @tagName(classification), storage);
    }

    /// Create an empty `.scalar` value.
    pub inline fn newScalar(alloc: std.mem.Allocator) Value {
        return Value{ .scalar = String.init(alloc) };
    }

    /// Create an empty `.string` value.
    pub inline fn newString(alloc: std.mem.Allocator) Value {
        return Value{ .string = String.init(alloc) };
    }

    /// Create an empty block-style `.list` value.
    pub inline fn newList(alloc: std.mem.Allocator) Value {
        return Value{ .list = List.init(alloc) };
    }

    /// Create an empty flow-style `.flow_list` value.
    pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
        return Value{ .flow_list = List.init(alloc) };
    }

    /// Create an empty block-style `.map` value.
    pub inline fn newMap(alloc: std.mem.Allocator) Value {
        return Value{ .map = Map.init(alloc) };
    }

    /// Create an empty flow-style `.flow_map` value.
    pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
        return Value{ .flow_map = Map.init(alloc) };
    }

    /// Deep structural equality. "Exact" means tags must match precisely:
    /// `.scalar` never equals `.string`, and `.list` never equals
    /// `.flow_list`, even when the contents are identical. Maps are compared
    /// order-sensitively.
    pub fn recursiveEqualsExact(self: Value, other: Value) bool {
        if (@as(TagType, self) != @as(TagType, other)) return false;
        switch (self) {
            inline .scalar, .string => |self_str, tag| {
                return std.mem.eql(u8, self_str.items, @field(other, @tagName(tag)).items);
            },
            inline .list, .flow_list => |self_list, tag| {
                const other_list = @field(other, @tagName(tag));

                if (self_list.items.len != other_list.items.len) return false;
                for (self_list.items, other_list.items) |left, right| {
                    if (!left.recursiveEqualsExact(right)) return false;
                }
                return true;
            },
            inline .map, .flow_map => |self_map, tag| {
                const other_map = @field(other, @tagName(tag));

                if (self_map.count() != other_map.count()) return false;
                var self_entries = self_map.iterator();
                var other_entries = other_map.iterator();
                // iterating both maps in lockstep enforces that entries
                // appear in the same order, not merely the same key set.
                while (self_entries.next()) |left| {
                    const right = other_entries.next() orelse return false;
                    if (!std.mem.eql(u8, left.key_ptr.*, right.key_ptr.*)) return false;
                    if (!left.value_ptr.recursiveEqualsExact(right.value_ptr.*)) return false;
                }
                // equal only if `other` has also been fully consumed.
                return other_entries.next() == null;
            },
        }
    }

    /// Pretty-print this value to stderr, followed by a newline.
    pub fn printDebug(self: Value) void {
        self.printRecursive(0);
        std.debug.print("\n", .{});
    }

    /// Recursive worker for `printDebug`. `indent` is the current leading
    /// column used to align nested collection entries.
    fn printRecursive(self: Value, indent: usize) void {
        switch (self) {
            .scalar, .string => |text| {
                // single-line strings print inline; multi-line strings are
                // re-indented line by line so they align with the structure.
                if (std.mem.indexOfScalar(u8, text.items, '\n') == null) {
                    std.debug.print("{s}", .{text.items});
                    return;
                }
                var line_iter = std.mem.splitScalar(u8, text.items, '\n');
                std.debug.print("\n", .{});
                while (line_iter.next()) |segment| {
                    std.debug.print(
                        "{[empty]s: >[indent]}{[line]s}{[nl]s}",
                        .{
                            .empty = "",
                            .indent = indent,
                            .line = segment,
                            .nl = if (line_iter.peek() == null) "" else "\n",
                        },
                    );
                }
            },
            .list, .flow_list => |list_value| {
                if (list_value.items.len == 0) {
                    std.debug.print("[]", .{});
                    return;
                }

                std.debug.print("[\n", .{});
                for (list_value.items, 0..) |element, index| {
                    std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = index });
                    element.printRecursive(indent + 2);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}]",
                    .{ .empty = "", .indent = indent },
                );
            },
            .map, .flow_map => |map_value| {
                if (map_value.count() == 0) {
                    std.debug.print("{{}}", .{});
                    return;
                }

                std.debug.print("{{\n", .{});

                var entries = map_value.iterator();

                while (entries.next()) |entry| {
                    std.debug.print(
                        "{[empty]s: >[indent]}{[key]s}: ",
                        .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
                    );
                    entry.value_ptr.printRecursive(indent + 4);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}}}",
                    .{ .empty = "", .indent = indent },
                );
            },
        }
    }
};
const Diagnostics = @import("./parser.zig").Diagnostics;

/// Errors produced while splitting the input into classified lines.
/// NOTE(review): `MissingNewline` is declared but not raised anywhere in this
/// file — presumably reserved for the buffer layer; confirm before removing.
pub const Error = error{
    BadToken,
    MixedIndentation,
    UnquantizedIndentation,
    TooMuchIndentation,
    MissingNewline,
    TrailingWhitespace,
    Impossible,
};

/// Indentation style inferred from the first indented line. Once detected,
/// mixing the other style is an error (`MixedIndentation`).
pub const DetectedIndentation = union(enum) {
    unknown: void,
    spaces: usize,
    tabs: void,
};

/// The value portion of a line: either empty, a raw scalar, one of the two
/// string continuation forms, or the body of a flow collection.
pub const InlineItem = union(enum) {
    empty: void,
    scalar: []const u8,
    line_string: []const u8,
    space_string: []const u8,

    flow_list: []const u8,
    flow_map: []const u8,

    /// Character used to join consecutive string lines: '\n' for line
    /// strings, ' ' for space strings. Calling this on any other variant is
    /// a programming error (`unreachable`).
    pub fn lineEnding(self: InlineItem) u8 {
        return switch (self) {
            .line_string => '\n',
            .space_string => ' ',
            else => unreachable,
        };
    }
};

/// Structural classification of a line's contents.
pub const LineContents = union(enum) {
    comment: []const u8,

    in_line: InlineItem,
    list_item: InlineItem,
    map_item: struct { key: []const u8, val: InlineItem },
};

/// Relative indentation change between consecutive lines.
pub const ShiftDirection = enum { indent, dedent, none };

/// Like `ShiftDirection`, but a dedent carries how many levels were popped.
pub const LineShift = union(ShiftDirection) {
    indent: void,
    // we can dedent multiple levels at once.
    dedent: usize,
    none: void,
};

/// One tokenized line: its indentation shift, classified contents, and the
/// underlying text (with leading indentation stripped).
pub const Line = struct {
    shift: LineShift,
    contents: LineContents,
    raw: []const u8,
};

// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a `nextLine` method
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        // indentation style is detected lazily from the first indented line.
        indentation: DetectedIndentation = .unknown,
        // indentation level (in quanta, not characters) of the previous line.
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        row: usize = 0,

        /// Produce the next classified line, or null when the buffer is
        /// exhausted. Whitespace-only lines are skipped; lines with trailing
        /// whitespace, carriage returns, mixed or unquantized indentation,
        /// or multi-level indents are rejected with an error.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                // scan leading whitespace to measure the indent and detect
                // the indentation style.
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // the loop ran off the end: the line is all whitespace.
                    // it is skipped, unless it is non-empty, which would be
                    // trailing whitespace.
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // convert the raw character indent into a level count. for
                // spaces, the first indented line fixes the quantum and all
                // later indents must be exact multiples of it; for tabs, one
                // tab is one level.
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                // only single-level indents are allowed, but dedents may pop
                // several levels at once.
                const shift: LineShift = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // commit row/indent bookkeeping on every exit path below.
                // note: `quantized` may still be reassigned (comment lines).
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // force comments to be followed by a space. This makes them
                        // behave the same way as strings, actually.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return .{
                            .shift = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        // a list item marker must be bare or followed by a space.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        return if (line.len == 1) .{
                            .shift = shift,
                            .contents = .{ .list_item = .empty },
                            .raw = line,
                        } else .{
                            .shift = shift,
                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        // scan for a key separator; a ':' at end-of-line means an
                        // empty value, otherwise it must be followed by a space.
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                if (idx + 1 == line.len) return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };

                                if (line[idx + 1] != ' ') return error.BadToken;

                                return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
                            }
                        }

                        // no ':' anywhere: the whole line is a bare scalar.
                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line. `buf` is the text after a
        /// `- ` list marker or `: ` key separator (or the whole line for
        /// in-line values).
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    // the string marker must be bare or followed by a space.
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    // a trailing '|' is stripped from the content; trailing
                    // blanks are rejected.
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };

                    return if (char == '>')
                        .{ .line_string = slice }
                    else
                        .{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return .{ .scalar = buf };
                },
            }
        }
    };
}