config: refactor LineTokenizer to use an internal line buffer

The goal here is to support a streaming parser. However, I did decide to leave the flow item parser state machine fully buffered (i.e. not streaming). This is not JSON, and in general documents should consist of many shorter lines, so this buffering strategy should work reasonably well. I have not actually tried the streaming implementation of this yet.
2023-09-21 23:34:17 -07:00
parent ab580fa80a
commit 6415571d01
1 changed files with 328 additions and 246 deletions
--- a/src/config.zig
+++ b/src/config.zig
@@ -63,29 +63,114 @@
 const std = @import("std");
 /// A sub-range of some buffer, stored as a start index plus a length rather
 /// than as a pointer-based slice.
 pub const IndexSlice = struct { start: usize, len: usize };
 /// Out-of-band detail about a reported problem: a row, a span, and a message.
 /// NOTE(review): nothing visible in this view fills these fields in; the
 /// per-field notes below are inferred from the names only — confirm.
 pub const Diagnostics = struct {
    /// presumably the row (line) of the input the problem was found on — confirm
    row: usize,
    /// presumably locates the offending text (offset into the whole input,
    /// offset within its line, and length) — confirm
    span: struct { absolute: usize, line_offset: usize, length: usize },
    /// presumably a human-readable description; who owns this slice is not
    /// visible here — confirm
    message: []const u8,
 };
-pub const LineTokenizer = struct {
+pub const LineBuffer = struct {
    /// Allocator that `buffer` was obtained from; `feed` uses it to grow `buffer`.
    allocator: std.mem.Allocator,
    /// Backing storage for fed data.
    buffer: []u8,
    /// Offset into `buffer` at which `feed` writes incoming data.
    used: usize,
    /// The region of `buffer` that `nextLine` has not consumed yet.
    window: IndexSlice,

    /// Size in bytes of the buffer allocated by `init`.
    pub const default_capacity: usize = 4096;
    /// Errors returned by `init`, `initCapacity`, and `feed`.
    pub const Error = std.mem.Allocator.Error;
    /// Create an empty line buffer backed by `default_capacity` bytes obtained
    /// from `allocator`. Fails only if that allocation fails.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        const line_buffer = try LineBuffer.initCapacity(allocator, default_capacity);
        return line_buffer;
    }
    /// Create an empty line buffer whose backing storage initially holds
    /// `capacity` bytes obtained from `allocator`.
    ///
    /// The returned value owns that storage; release it with `deinit`.
    /// Returns `error.OutOfMemory` if the allocation fails.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Release the backing storage. Slices previously returned by `nextLine`
    /// point into that storage and must not be used afterwards, nor may the
    /// buffer itself be used again.
    pub fn deinit(self: *LineBuffer) void {
        self.allocator.free(self.buffer);
        self.* = undefined;
    }
    /// Append `data` to the end of the unread window so that later calls to
    /// `nextLine` can see it, growing or compacting the backing buffer when
    /// the data does not fit behind the current window.
    ///
    /// Slices previously returned by `nextLine` must be treated as invalid
    /// after this call: the buffer may be reallocated or its contents moved.
    /// `data` must not alias the internal buffer.
    ///
    /// Returns `error.OutOfMemory` if the buffer cannot be grown, or if the
    /// required size does not fit in a `usize`. On error the buffer is left
    /// unchanged.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;

        // An impossibly large request is reported the same way as a failed
        // allocation, since it could never be satisfied anyway.
        const new_window_len = std.math.add(usize, self.window.len, data.len) catch
            return error.OutOfMemory;

        if (new_window_len > self.buffer.len) {
            // data cannot fit in the buffer alongside the scan window, so grow it.
            // realloc may move the allocation, so the returned slice must be kept.
            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
            //       on every invocation but will cause the buffer to oversize
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
            self.rehome();
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // data will fit, but only once the window is moved to the front.
            self.rehome();
        }
        // otherwise data can simply be appended behind the window.

        // In every case `used` now marks the end of the window and there are
        // at least `data.len` free bytes after it.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }
    /// Pop the next complete line off the front of the unread window.
    ///
    /// Returns `null` when the window is empty or holds no '\n' yet; in that
    /// case nothing is consumed. The returned slice excludes the newline
    /// character and stays valid only until the next call to `feed`.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len) return null;

        const pending = self.buffer[self.window.start..][0..self.window.len];
        if (std.mem.indexOfScalar(u8, pending, '\n')) |newline_idx| {
            // Consume the line together with its terminating newline.
            const consumed = newline_idx + 1;
            self.window.start += consumed;
            self.window.len -= consumed;
            return pending[0..newline_idx];
        }
        return null;
    }
    /// Move the unread window to the start of the buffer, making the space
    /// before it available again. Does nothing if the window is already at
    /// the start.
    fn rehome(self: *LineBuffer) void {
        const start = self.window.start;
        if (start == 0) return;

        const len = self.window.len;
        const live = self.buffer[start..][0..len];
        if (len <= start) {
            // Source and destination cannot overlap, so a plain copy is fine.
            @memcpy(self.buffer.ptr, live);
        } else {
            // The ranges overlap; copy front-to-back since the destination
            // lies before the source.
            std.mem.copyForwards(u8, self.buffer, live);
        }

        self.window.start = 0;
        self.used = len;
    }
 };
 pub const FixedLineBuffer = struct {
    buffer: []const u8,
-    index: usize = 0,
+    window: IndexSlice,
    indentation: IndentationType = .immaterial,
    last_indent: usize = 0,
    diagnostics: *Diagnostics,
-    row: usize = 0,
+    pub fn init(data: []const u8) FixedLineBuffer {
        return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
    }
-    const Error = error{
+    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
-        BadToken,
+        if (self.window.start >= self.buffer.len or self.window.len == 0)
-        MixedIndentation,
+            return null;
-        UnquantizedIndentation,
+
-        TooMuchIndentation,
+        const window = self.buffer[self.window.start..][0..self.window.len];
-        MissingNewline,
+        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
-        TrailingWhitespace,
+
-        Impossible,
+        self.window.start += split + 1;
        self.window.len -= split + 1;
        return window[0..split];
    }
 };
 const IndentationType = union(enum) {
@@ -148,13 +233,29 @@ pub const LineTokenizer = struct {
    raw: []const u8,
 };
-    pub fn next(self: *LineTokenizer) Error!?Line {
+pub fn LineTokenizer(comptime Buffer: type) type {
-        if (self.index == self.buffer.len) return null;
+    return struct {
        buffer: Buffer,
        index: usize = 0,
        indentation: IndentationType = .immaterial,
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        row: usize = 0,
        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
        };
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
-        var offset: usize = 0;
+                for (raw_line, 0..) |char, idx| {
        for (self.buffer[self.index..], 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
@@ -168,7 +269,6 @@ pub const LineTokenizer = struct {
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                    indent += 1;
                        },
                        '\t' => {
                            switch (self.indentation) {
@@ -176,40 +276,28 @@ pub const LineTokenizer = struct {
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                    indent += 1;
                        },
                        '\r' => {
                            return error.BadToken;
                        },
-                '\n' => {
+                        else => {
-                    // don't even emit anything for empty rows.
+                            indent = idx;
-                    self.row += 1;
+                            break;
                    offset = idx + 1;
                    // if it's too hard to deal with, Just Make It An Error!!!
                    // an empty line with whitespace on it is garbage. It can mess with
                    // the indentation detection grossly in a way that is annoying to
                    // deal with. Besides, having whitespace-only lines in a document
                    // is essentially terrorism, with which negotiations are famously
                    // not permitted.
                    if (indent > 0) return error.TrailingWhitespace;
                        },
                else => break,
                    }
                } else {
-            std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
+                    if (raw_line.len > 0) return error.TrailingWhitespace;
-            self.index = self.buffer.len;
+                    continue :lineloop;
            // this prong will get hit when the document only consists of whitespace
            return null;
                }
-        var quantized: usize = if (self.indentation == .spaces) blk: {
+                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;
-            break :blk @divExact(indent, self.indentation.spaces);
+                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;
                const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
@@ -221,16 +309,12 @@ pub const LineTokenizer = struct {
                else
                    .none;
        offset += indent;
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
            self.index += offset;
                }
-        const line = try consumeLine(self.buffer[self.index + offset ..]);
+                const line = raw_line[indent..];
        offset += line.len + 1;
                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;
@@ -294,6 +378,11 @@ pub const LineTokenizer = struct {
                        };
                    },
                }
                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
@@ -336,19 +425,8 @@ pub const LineTokenizer = struct {
                },
            }
        }
    fn consumeLine(buf: []const u8) ![]const u8 {
        for (buf, 0..) |char, idx| {
            switch (char) {
                '\n' => return buf[0..idx],
                '\r' => return error.BadToken,
                else => {},
            }
        }
        return error.MissingNewline;
    }
    };
 }
 pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
@@ -489,7 +567,7 @@ pub const Parser = struct {
        DuplicateKey,
        BadMapEntry,
        Fail,
-    } || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
+    } || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
    pub const DuplicateKeyBehavior = enum {
        use_first,
@@ -536,7 +614,7 @@ pub const Parser = struct {
        document: Document,
        value_stack: Stack,
        state: ParseState = .initial,
-        expect_shift: LineTokenizer.ShiftDirection = .none,
+        expect_shift: ShiftDirection = .none,
        dangling_key: ?[]const u8 = null,
        pub fn init(alloc: std.mem.Allocator) State {
@@ -557,12 +635,16 @@ pub const Parser = struct {
        const arena_alloc = document.arena.allocator();
        var state: ParseState = .initial;
-        var expect_shift: LineTokenizer.ShiftDirection = .none;
+        var expect_shift: ShiftDirection = .none;
        var dangling_key: ?[]const u8 = null;
        var stack = std.ArrayList(*Value).init(arena_alloc);
        defer stack.deinit();
-        var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
+        var tok: LineTokenizer(FixedLineBuffer) = .{
            .buffer = FixedLineBuffer.init(buffer),
            .diagnostics = &self.diagnostics,
        };
        while (try tok.next()) |line| {
            if (line.contents == .comment) continue;