1 changed files with 281 additions and 363 deletions
--- a/src/config.zig
+++ b/src/config.zig
@ -63,123 +63,38 @@
 const std = @import("std");
 /// A sub-range expressed as a start index plus a length rather than as a
 /// pointer-carrying slice. `LineBuffer.window` uses it to index into
 /// `LineBuffer.buffer`.
 pub const IndexSlice = struct { start: usize, len: usize };
 /// Out-of-band detail about a problem found in the input. The tokenizer holds a
 /// `*Diagnostics` in its `diagnostics` field.
 /// NOTE(review): nothing in this section writes to these fields, so the field
 /// descriptions below are inferred from names only and should be confirmed.
 pub const Diagnostics = struct {
    /// presumably the row (line number) the problem was found on -- TODO confirm
    /// whether this is zero- or one-based.
    row: usize,
    /// presumably: `absolute` is an offset from the start of the document,
    /// `line_offset` an offset from the start of the row, and `length` the size
    /// of the offending span -- TODO confirm units (bytes?) against the writer.
    span: struct { absolute: usize, line_offset: usize, length: usize },
    /// Human-readable description. NOTE(review): ownership/lifetime of this
    /// slice is not shown here.
    message: []const u8,
 };
-pub const LineBuffer = struct {
+pub const LineTokenizer = struct {
    allocator: std.mem.Allocator,
    buffer: []u8,
    used: usize,
    window: IndexSlice,
    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;
    /// Creates a line buffer whose backing storage holds `default_capacity`
    /// bytes. Fails with `error.OutOfMemory` if the allocation fails.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        const capacity: usize = default_capacity;
        return LineBuffer.initCapacity(allocator, capacity);
    }
    /// Creates a line buffer backed by `capacity` bytes obtained from
    /// `allocator`. The buffer starts out empty: nothing is used and the scan
    /// window has zero length. Fails with `error.OutOfMemory` if the
    /// allocation fails.
    /// NOTE(review): no `deinit` is visible in this section; the caller
    /// presumably releases `buffer` through the same allocator.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        const storage = try allocator.alloc(u8, capacity);
        const empty_window = IndexSlice{ .start = 0, .len = 0 };
        return LineBuffer{
            .allocator = allocator,
            .buffer = storage,
            .used = 0,
            .window = empty_window,
        };
    }
    /// Appends `data` to the end of the unconsumed scan window so that later
    /// calls to `nextLine` can see it.
    ///
    /// Depending on the space available this will either
    ///   * grow the backing buffer (when the window plus `data` cannot fit at all),
    ///   * slide the window to the front of the buffer (when it fits, but not
    ///     after the window's current position), or
    ///   * append in place.
    ///
    /// Growing and sliding both move bytes around, so any slice previously
    /// returned by `nextLine` must be considered invalid after this call.
    ///
    /// Returns `error.OutOfMemory` if the buffer cannot be grown, or if the
    /// combined length would overflow `usize`. On error the buffered contents
    /// are left intact (they may have been moved to the front of the buffer).
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;

        // A length that does not fit in usize can never be allocated, so
        // report it the same way as any other failed allocation.
        const new_window_len = std.math.add(usize, self.window.len, data.len) catch
            return error.OutOfMemory;

        if (new_window_len > self.buffer.len) {
            // data cannot fit in the buffer with our scan window, so we have to
            // realloc. Compact first so the window sits at offset 0, then keep
            // the slice realloc hands back: the old one is no longer valid.
            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
            //       on every invocation but will cause the buffer to oversize
            self.rehome();
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // data will fit, but only once the window is moved to the front
            self.rehome();
        }
        // otherwise data can simply be appended after the window

        // Write directly after the last unconsumed byte. Slicing the
        // destination to exactly data.len keeps the copy bounds-checked.
        const write_at = self.window.start + self.window.len;
        @memcpy(self.buffer[write_at..][0..data.len], data);

        // Every path must record the new bytes, otherwise nextLine never sees
        // them and the next feed would overwrite them.
        self.used = write_at + data.len;
        self.window.len = new_window_len;
    }
    /// Pops the next complete line off the front of the scan window.
    ///
    /// Returns `null` when the window is empty or holds no `'\n'` yet; in that
    /// case nothing is consumed, so a partial line stays buffered.
    /// The returned slice excludes the newline character and points into the
    /// internal buffer: it is only valid until the next call to `feed`.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len) return null;

        const begin = self.window.start;
        const pending = self.buffer[begin .. begin + self.window.len];

        if (std.mem.indexOfScalar(u8, pending, '\n')) |newline_at| {
            // consume the line together with its terminating newline
            const consumed = newline_at + 1;
            self.window.start = begin + consumed;
            self.window.len -= consumed;
            return pending[0..newline_at];
        }
        return null;
    }
    /// Slides the unconsumed window down to offset 0 of the buffer, reclaiming
    /// the space occupied by lines that were already handed out. Does nothing
    /// when the window already starts at 0.
    fn rehome(self: *LineBuffer) void {
        const start = self.window.start;
        if (start == 0) return;

        const len = self.window.len;
        const live = self.buffer[start .. start + len];

        if (len <= start) {
            // source and destination ranges are disjoint
            @memcpy(self.buffer.ptr, live);
        } else {
            // ranges overlap; a front-to-back copy is safe because the
            // destination lies before the source
            std.mem.copyForwards(u8, self.buffer, live);
        }

        self.window.start = 0;
        self.used = len;
    }
 };
 pub const FixedLineBuffer = struct {
    buffer: []const u8,
-    window: IndexSlice,
+    index: usize = 0,
    indentation: IndentationType = .immaterial,
    last_indent: usize = 0,
    diagnostics: *Diagnostics,
-    pub fn init(data: []const u8) FixedLineBuffer {
+    row: usize = 0,
        return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
    }
-    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
+    const Error = error{
-        if (self.window.start >= self.buffer.len or self.window.len == 0)
+        BadToken,
-            return null;
+        MixedIndentation,
        UnquantizedIndentation,
        TooMuchIndentation,
        MissingNewline,
        TrailingWhitespace,
        Impossible,
    };
-        const window = self.buffer[self.window.start..][0..self.window.len];
+    const IndentationType = union(enum) {
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
        self.window.start += split + 1;
        self.window.len -= split + 1;
        return window[0..split];
    }
 };
 const IndentationType = union(enum) {
        immaterial: void,
        spaces: usize,
        tabs: void,
-};
+    };
-const InlineItem = union(enum) {
+    const InlineItem = union(enum) {
        empty: void,
        scalar: []const u8,
        line_string: []const u8,
@ -195,67 +110,51 @@ const InlineItem = union(enum) {
                else => unreachable,
            };
        }
-};
+    };
-const LineContents = union(enum) {
+    const LineContents = union(enum) {
        comment: []const u8,
        in_line: InlineItem,
        list_item: InlineItem,
        map_item: struct { key: []const u8, val: InlineItem },
-};
+    };
-// we can dedent multiple levels at once. Example:
+    // we can dedent multiple levels at once. Example:
-//
+    //
-// foo:
+    // foo:
-//   bar:
+    //   bar:
-//     > a
+    //     > a
-//     > string
+    //     > string
-// baz: [qux]
+    // baz: [qux]
-//
+    //
-// capturing this is conceptually simple, but implementing it without complex
+    // capturing this is conceptually simple, but implementing it without complex
-// indentation tracking requires quantizing the indentation. This means our
+    // indentation tracking requires quantizing the indentation. This means our
-// IndentationType will also need to track the number of spaces used for
+    // IndentationType will also need to track the number of spaces used for
-// indentation, as detected. Then every line we have to check indent rem the
+    // indentation, as detected. Then every line we have to check indent rem the
-// quantization level == 0 (otherwise we broke quantization) and compute indent
+    // quantization level == 0 (otherwise we broke quantization) and compute indent
-// div the quantization level to give us our effective indentation level.
+    // div the quantization level to give us our effective indentation level.
-const ShiftDirection = enum { indent, dedent, none };
+    const ShiftDirection = enum { indent, dedent, none };
-const RelativeIndent = union(ShiftDirection) {
+    const RelativeIndent = union(ShiftDirection) {
        indent: void,
        dedent: usize,
        none: void,
-};
+    };
-const Line = struct {
+    const Line = struct {
        indent: RelativeIndent,
        contents: LineContents,
        raw: []const u8,
 };
 pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        indentation: IndentationType = .immaterial,
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        row: usize = 0,
        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
    };
-        pub fn next(self: *@This()) Error!?Line {
+    pub fn next(self: *LineTokenizer) Error!?Line {
-            lineloop: while (self.buffer.nextLine()) |raw_line| {
+        if (self.index == self.buffer.len) return null;
        var indent: usize = 0;
-                for (raw_line, 0..) |char, idx| {
+        var offset: usize = 0;
        for (self.buffer[self.index..], 0..) |char, idx| {
            switch (char) {
                ' ' => {
                    switch (self.indentation) {
@ -269,6 +168,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        .spaces => {},
                        .tabs => return error.MixedIndentation,
                    }
                    indent += 1;
                },
                '\t' => {
                    switch (self.indentation) {
@ -276,28 +176,40 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        .spaces => return error.MixedIndentation,
                        .tabs => {},
                    }
                    indent += 1;
                },
                '\r' => {
                    return error.BadToken;
                },
-                        else => {
+                '\n' => {
-                            indent = idx;
+                    // don't even emit anything for empty rows.
-                            break;
+                    self.row += 1;
                    offset = idx + 1;
                    // if it's too hard to deal with, Just Make It An Error!!!
                    // an empty line with whitespace on it is garbage. It can mess with
                    // the indentation detection grossly in a way that is annoying to
                    // deal with. Besides, having whitespace-only lines in a document
                    // is essentially terrorism, with which negotiations are famously
                    // not permitted.
                    if (indent > 0) return error.TrailingWhitespace;
                },
                else => break,
            }
        } else {
-                    if (raw_line.len > 0) return error.TrailingWhitespace;
+            std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
-                    continue :lineloop;
+            self.index = self.buffer.len;
            // this prong will get hit when the document only consists of whitespace
            return null;
        }
-                var quantized: usize = if (self.indentation == .spaces) quant: {
+        var quantized: usize = if (self.indentation == .spaces) blk: {
            if (self.indentation.spaces == 0) {
                self.indentation.spaces = indent;
            }
            if (@rem(indent, self.indentation.spaces) != 0)
                return error.UnquantizedIndentation;
-                    break :quant @divExact(indent, self.indentation.spaces);
+            break :blk @divExact(indent, self.indentation.spaces);
        } else indent;
        const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
@ -309,12 +221,16 @@ pub fn LineTokenizer(comptime Buffer: type) type {
        else
            .none;
        offset += indent;
        defer {
            self.row += 1;
            self.last_indent = quantized;
            self.index += offset;
        }
-                const line = raw_line[indent..];
+        const line = try consumeLine(self.buffer[self.index + offset ..]);
        offset += line.len + 1;
        // this should not be possible, as empty lines are caught earlier.
        if (line.len == 0) return error.Impossible;
@ -378,11 +294,6 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                };
            },
        }
                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
    }
    fn detectInlineItem(buf: []const u8) Error!InlineItem {
@ -425,12 +336,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
            },
        }
    }
-    };
+
-}
+    fn consumeLine(buf: []const u8) ![]const u8 {
        // Returns the prefix of `buf` up to, but not including, the first '\n'.
        // The result is a sub-slice of `buf`; nothing is copied.
        for (buf, 0..) |char, idx| {
            switch (char) {
                '\n' => return buf[0..idx],
                // a carriage return anywhere before the newline is rejected
                '\r' => return error.BadToken,
                else => {},
            }
        }
        // reached the end of `buf` without finding a newline
        return error.MissingNewline;
    }
 };
 pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
-    pub const Map = std.StringArrayHashMap(Value);
+    pub const Map = std.StringHashMap(Value);
    pub const List = std.ArrayList(Value);
    pub const TagType = @typeInfo(Value).Union.tag_type.?;
@ -567,7 +489,7 @@ pub const Parser = struct {
        DuplicateKey,
        BadMapEntry,
        Fail,
-    } || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
+    } || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
    pub const DuplicateKeyBehavior = enum {
        use_first,
@ -614,7 +536,7 @@ pub const Parser = struct {
        document: Document,
        value_stack: Stack,
        state: ParseState = .initial,
-        expect_shift: ShiftDirection = .none,
+        expect_shift: LineTokenizer.ShiftDirection = .none,
        dangling_key: ?[]const u8 = null,
        pub fn init(alloc: std.mem.Allocator) State {
@ -635,16 +557,12 @@ pub const Parser = struct {
        const arena_alloc = document.arena.allocator();
        var state: ParseState = .initial;
-        var expect_shift: ShiftDirection = .none;
+        var expect_shift: LineTokenizer.ShiftDirection = .none;
        var dangling_key: ?[]const u8 = null;
        var stack = std.ArrayList(*Value).init(arena_alloc);
        defer stack.deinit();
-        var tok: LineTokenizer(FixedLineBuffer) = .{
+        var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
            .buffer = FixedLineBuffer.init(buffer),
            .diagnostics = &self.diagnostics,
        };
        while (try tok.next()) |line| {
            if (line.contents == .comment) continue;
@ -727,7 +645,7 @@ pub const Parser = struct {
                                        // key somewhere until we can consume the
                                        // value. More parser state to lug along.
-                                        dangling_key = try arena_alloc.dupe(u8, pair.key);
+                                        dangling_key = pair.key;
                                        state = .value;
                                    },
                                    .scalar => |str| {
@ -897,7 +815,7 @@ pub const Parser = struct {
                                    switch (pair.val) {
                                        .empty => {
-                                            dangling_key = try arena_alloc.dupe(u8, pair.key);
+                                            dangling_key = pair.key;
                                            expect_shift = .indent;
                                        },
                                        .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
@ -995,7 +913,7 @@ pub const Parser = struct {
                                        .none, .dedent => switch (pair.val) {
                                            .empty => {
                                                expect_shift = .indent;
-                                                dangling_key = try arena_alloc.dupe(u8, pair.key);
+                                                dangling_key = pair.key;
                                            },
                                            .scalar => |str| try putMap(map, pair.key, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
                                            .line_string, .space_string => |str| try putMap(map, pair.key, try Value.fromString(arena_alloc, str), self.dupe_behavior),
@ -1013,7 +931,7 @@ pub const Parser = struct {
                                            switch (pair.val) {
                                                .empty => {
                                                    expect_shift = .indent;
-                                                    dangling_key = try arena_alloc.dupe(u8, pair.key);
+                                                    dangling_key = pair.key;
                                                },
                                                .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
                                                .line_string, .space_string => |str| try new_map.map.put(pair.key, try Value.fromString(arena_alloc, str)),
@ -1334,7 +1252,7 @@ pub const FlowParser = struct {
                .consuming_map_key => switch (char) {
                    ':' => {
                        const tip = try getStackTip(self.stack);
-                        dangling_key = try self.alloc.dupe(u8, self.buffer[tip.item_start..idx]);
+                        dangling_key = self.buffer[tip.item_start..idx];
                        self.state = .want_map_value;
                    },