config: refactor LineTokenizer to use an internal line buffer

The goal here is to support a streaming parser. However, I did decide to leave the flow item parser state machine fully buffered (i.e. not streaming). This is not JSON, and in general documents should consist of many shorter lines, so this buffering strategy should work reasonably well. I have not actually tried the streaming implementation of this yet.
2023-09-21 23:34:17 -07:00
parent b08d712616
commit a0107ab9fd
1 changed files with 328 additions and 246 deletions
--- a/src/config.zig
+++ b/src/config.zig
@@ -63,38 +63,123 @@
 const std = @import("std");
 pub const IndexSlice = struct { start: usize, len: usize };
 pub const Diagnostics = struct {
    row: usize,
    span: struct { absolute: usize, line_offset: usize, length: usize },
    message: []const u8,
 };
-pub const LineTokenizer = struct {
+pub const LineBuffer = struct {
-    buffer: []const u8,
+    allocator: std.mem.Allocator,
-    index: usize = 0,
+    buffer: []u8,
-    indentation: IndentationType = .immaterial,
+    used: usize,
-    last_indent: usize = 0,
+    window: IndexSlice,
    diagnostics: *Diagnostics,
-    row: usize = 0,
+    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;
-    const Error = error{
+    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
-        BadToken,
+        return initCapacity(allocator, default_capacity);
-        MixedIndentation,
+    }
-        UnquantizedIndentation,
+
-        TooMuchIndentation,
+    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
-        MissingNewline,
+        return .{
-        TrailingWhitespace,
+            .allocator = allocator,
-        Impossible,
+            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }
-    const IndentationType = union(enum) {
+    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        // Appends `data` to the end of the unconsumed scan window, growing or
        // compacting the backing buffer first when there is no room at the
        // tail. Returns an allocator error if the buffer has to grow and the
        // reallocation fails; nothing has been modified in that case.
        if (data.len == 0) return;
        // TODO: check for usize overflow here if we want Maximum Robustness
        const new_window_len = self.window.len + data.len;

        if (new_window_len > self.buffer.len) {
            // data cannot fit in the buffer with our scan window, so we have
            // to realloc. realloc hands back the (possibly relocated) slice,
            // which has to be stored or the buffer never actually grows.
            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
            //       on every invocation but will cause the buffer to oversize
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
            self.rehome();
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // data will fit, but only once the window has been slid back to
            // the start of the buffer
            self.rehome();
        }

        // In every case the new bytes land directly after the current window
        // and the bookkeeping advances; previously the plain-append path
        // copied the bytes without recording them.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }
    /// Pops the next newline-terminated line off the front of the scan window,
    /// or returns null when the window holds no complete line.
    /// The returned slice excludes the newline character and points into the
    /// internal buffer; it is valid until the next call to `feed`.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len) return null;

        const pending = self.buffer[self.window.start..][0..self.window.len];
        if (std.mem.indexOfScalar(u8, pending, '\n')) |newline| {
            // step the window past the line and its terminating newline
            const consumed = newline + 1;
            self.window.start += consumed;
            self.window.len -= consumed;
            return pending[0..newline];
        }
        return null;
    }
    /// Slides the unconsumed scan window down to the start of the buffer so
    /// that the space in front of it can be reused. Does nothing when the
    /// window already starts at index 0.
    fn rehome(self: *LineBuffer) void {
        const start = self.window.start;
        if (start == 0) return;

        const live = self.buffer[start..][0..self.window.len];
        if (live.len <= start) {
            // the window ends before its own start offset is reached from 0,
            // so source and destination cannot overlap
            @memcpy(self.buffer.ptr, live);
        } else {
            // source and destination overlap; copy front to back
            std.mem.copyForwards(u8, self.buffer, live);
        }

        self.used = live.len;
        self.window.start = 0;
    }
 };
 pub const FixedLineBuffer = struct {
    /// The complete input; never copied or modified by this type.
    buffer: []const u8,
    /// The not-yet-consumed region of `buffer`.
    window: IndexSlice,

    /// Creates a line source whose window spans all of `data`.
    pub fn init(data: []const u8) FixedLineBuffer {
        return FixedLineBuffer{
            .buffer = data,
            .window = IndexSlice{ .start = 0, .len = data.len },
        };
    }

    /// Returns the next newline-terminated line, without its newline, as a
    /// slice of the original input. Returns null once the window is empty or
    /// contains no further newline, so trailing text that lacks a final
    /// newline is never returned.
    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len) return null;

        const pending = self.buffer[self.window.start..][0..self.window.len];
        if (std.mem.indexOfScalar(u8, pending, '\n')) |newline| {
            // step the window past the line and its terminating newline
            const consumed = newline + 1;
            self.window.start += consumed;
            self.window.len -= consumed;
            return pending[0..newline];
        }
        return null;
    }
 };
 const IndentationType = union(enum) {
    immaterial: void,
    spaces: usize,
    tabs: void,
-    };
+};
-    const InlineItem = union(enum) {
+const InlineItem = union(enum) {
    empty: void,
    scalar: []const u8,
    line_string: []const u8,
@@ -110,51 +195,67 @@ pub const LineTokenizer = struct {
            else => unreachable,
        };
    }
-    };
+};
-    const LineContents = union(enum) {
+const LineContents = union(enum) {
    comment: []const u8,
    in_line: InlineItem,
    list_item: InlineItem,
    map_item: struct { key: []const u8, val: InlineItem },
-    };
+};
-    // we can dedent multiple levels at once. Example:
+// we can dedent multiple levels at once. Example:
-    //
+//
-    // foo:
+// foo:
-    //   bar:
+//   bar:
-    //     > a
+//     > a
-    //     > string
+//     > string
-    // baz: [qux]
+// baz: [qux]
-    //
+//
-    // capturing this is conceptually simple, but implementing it without complex
+// capturing this is conceptually simple, but implementing it without complex
-    // indentation tracking requires quantizing the indentation. This means our
+// indentation tracking requires quantizing the indentation. This means our
-    // IndentationType will also need to track the number of spaces used for
+// IndentationType will also need to track the number of spaces used for
-    // indentation, as detected. Then every line we have to check indent rem the
+// indentation, as detected. Then every line we have to check indent rem the
-    // quantization level == 0 (otherwise we broke quantization) and compute indent
+// quantization level == 0 (otherwise we broke quantization) and compute indent
-    // div the quantization level to give us our effective indentation level.
+// div the quantization level to give us our effective indentation level.
-    const ShiftDirection = enum { indent, dedent, none };
+const ShiftDirection = enum { indent, dedent, none };
-    const RelativeIndent = union(ShiftDirection) {
+const RelativeIndent = union(ShiftDirection) {
    indent: void,
    dedent: usize,
    none: void,
-    };
+};
-    const Line = struct {
+const Line = struct {
    indent: RelativeIndent,
    contents: LineContents,
    raw: []const u8,
 };
 pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        indentation: IndentationType = .immaterial,
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        row: usize = 0,
        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
        };
-    pub fn next(self: *LineTokenizer) Error!?Line {
+        pub fn next(self: *@This()) Error!?Line {
-        if (self.index == self.buffer.len) return null;
+            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
-        var offset: usize = 0;
+                for (raw_line, 0..) |char, idx| {
        for (self.buffer[self.index..], 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
@@ -168,7 +269,6 @@ pub const LineTokenizer = struct {
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                    indent += 1;
                        },
                        '\t' => {
                            switch (self.indentation) {
@@ -176,40 +276,28 @@ pub const LineTokenizer = struct {
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                    indent += 1;
                        },
                        '\r' => {
                            return error.BadToken;
                        },
-                '\n' => {
+                        else => {
-                    // don't even emit anything for empty rows.
+                            indent = idx;
-                    self.row += 1;
+                            break;
                    offset = idx + 1;
                    // if it's too hard to deal with, Just Make It An Error!!!
                    // an empty line with whitespace on it is garbage. It can mess with
                    // the indentation detection grossly in a way that is annoying to
                    // deal with. Besides, having whitespace-only lines in a document
                    // is essentially terrorism, with which negotiations are famously
                    // not permitted.
                    if (indent > 0) return error.TrailingWhitespace;
                        },
                else => break,
                    }
                } else {
-            std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
+                    if (raw_line.len > 0) return error.TrailingWhitespace;
-            self.index = self.buffer.len;
+                    continue :lineloop;
            // this prong will get hit when the document only consists of whitespace
            return null;
                }
-        var quantized: usize = if (self.indentation == .spaces) blk: {
+                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;
-            break :blk @divExact(indent, self.indentation.spaces);
+                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;
                const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
@@ -221,16 +309,12 @@ pub const LineTokenizer = struct {
                else
                    .none;
        offset += indent;
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
            self.index += offset;
                }
-        const line = try consumeLine(self.buffer[self.index + offset ..]);
+                const line = raw_line[indent..];
        offset += line.len + 1;
                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;
@@ -294,6 +378,11 @@ pub const LineTokenizer = struct {
                        };
                    },
                }
                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
@@ -336,19 +425,8 @@ pub const LineTokenizer = struct {
                },
            }
        }
-
+    };
-    fn consumeLine(buf: []const u8) ![]const u8 {
+}
        for (buf, 0..) |char, idx| {
            switch (char) {
                '\n' => return buf[0..idx],
                '\r' => return error.BadToken,
                else => {},
            }
        }
        return error.MissingNewline;
    }
 };
 pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
@@ -489,7 +567,7 @@ pub const Parser = struct {
        DuplicateKey,
        BadMapEntry,
        Fail,
-    } || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
+    } || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
    pub const DuplicateKeyBehavior = enum {
        use_first,
@@ -536,7 +614,7 @@ pub const Parser = struct {
        document: Document,
        value_stack: Stack,
        state: ParseState = .initial,
-        expect_shift: LineTokenizer.ShiftDirection = .none,
+        expect_shift: ShiftDirection = .none,
        dangling_key: ?[]const u8 = null,
        pub fn init(alloc: std.mem.Allocator) State {
@@ -557,12 +635,16 @@ pub const Parser = struct {
        const arena_alloc = document.arena.allocator();
        var state: ParseState = .initial;
-        var expect_shift: LineTokenizer.ShiftDirection = .none;
+        var expect_shift: ShiftDirection = .none;
        var dangling_key: ?[]const u8 = null;
        var stack = std.ArrayList(*Value).init(arena_alloc);
        defer stack.deinit();
-        var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
+        var tok: LineTokenizer(FixedLineBuffer) = .{
            .buffer = FixedLineBuffer.init(buffer),
            .diagnostics = &self.diagnostics,
        };
        while (try tok.next()) |line| {
            if (line.contents == .comment) continue;