parser: start the arduous journey of hooking up diagnostics

The errors in the line buffer and tokenizer now have diagnostics. The
line number is trivial to keep track of due to the line buffer, but
the column index requires quite a bit of juggling, as we pass
successively trimmed down buffers to the internals of the parser.
There will probably be some column index counting problems in the
future. Also, handling the diagnostics is a bit awkward, since it's a
mandatory out-parameter of the parse functions now. The user must
provide a valid diagnostics object that survives for the life of the
parser.
This commit is contained in:
torque 2023-09-27 23:44:06 -07:00
parent 3258e7fdb5
commit 01f98f9aff
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
7 changed files with 185 additions and 45 deletions

View File

@ -15,7 +15,16 @@ pub fn main() !void {
var needfree = true;
defer if (needfree) allocator.free(data);
const document = try nice.parseBuffer(allocator, data, .{});
var diagnostics = nice.Diagnostics{};
const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
std.debug.print("{s}:{d} col:{d}: {s}\n", .{
args[1],
diagnostics.row,
diagnostics.line_offset,
diagnostics.message,
});
return err;
};
defer document.deinit();
// free data memory to ensure that the parsed document is not holding

View File

@ -16,6 +16,7 @@ pub fn main() !void {
defer file.close();
var parser = try nice.StreamParser.init(allocator, .{});
defer parser.deinit();
errdefer parser.parse_state.document.deinit();
while (true) {
var buf = [_]u8{0} ** 1024;
const len = try file.read(&buf);

View File

@ -1,5 +1,7 @@
const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
pub const IndexSlice = struct { start: usize, len: usize };
pub const Error = error{
@ -45,14 +47,15 @@ pub fn LineBuffer(comptime options: Strictness) type {
pub const default_capacity: usize = 4096;
pub fn init(allocator: std.mem.Allocator) !@This() {
return initCapacity(allocator, default_capacity);
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() {
return initCapacity(allocator, diagnostics, default_capacity);
}
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() {
return .{
.allocator = allocator,
.internal = .{
.diagnostics = diagnostics,
.buffer = try allocator.alloc(u8, capacity),
.window = .{ .start = 0, .len = 0 },
},
@ -60,6 +63,10 @@ pub fn LineBuffer(comptime options: Strictness) type {
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.internal.diagnostics;
}
pub fn empty(self: @This()) bool {
return self.internal.empty();
}
@ -111,9 +118,18 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return struct {
buffer: []const u8,
window: IndexSlice,
diagnostics: *Diagnostics,
pub fn init(data: []const u8) @This() {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() {
return .{
.buffer = data,
.window = .{ .start = 0, .len = data.len },
.diagnostics = diagnostics,
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.diagnostics;
}
pub fn empty(self: @This()) bool {
@ -131,16 +147,33 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
const split: usize = split: {
for (window, 0..) |char, idx| {
if (comptime options.check_carriage_return)
if (char == '\r') return error.IllegalCarriageReturn;
if (char == '\r') {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found a carriage return";
return error.IllegalCarriageReturn;
};
if (comptime options.check_nonprinting_ascii)
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found nonprinting ascii characters";
return error.IllegalNonprintingAscii;
};
if (comptime options.check_trailing_whitespace) {
if (char == '\n') {
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found trailing spaces";
return error.IllegalTrailingSpace;
}
break :split idx;
}
} else {
@ -150,12 +183,41 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return null;
};
self.diagnostics.row += 1;
self.diagnostics.line_offset = 0;
self.window.start += split + 1;
self.window.len -= split + 1;
if (comptime options.validate_utf8) {
const line = window[0..split];
return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
var idx: usize = 0;
while (idx < line.len) {
if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
if (idx + cp_len > line.len) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "truncated UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "invalid UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
idx += cp_len;
} else |_| {
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "invalid UTF-8 sequence start byte";
return error.InputIsNotValidUtf8;
}
}
return line;
} else {
return window[0..split];
}

View File

@ -68,3 +68,4 @@ pub const parseBuffer = parser.parseBuffer;
pub const StreamParser = parser.StreamParser;
pub const Document = parser.Document;
pub const Value = parser.Value;
pub const Diagnostics = parser.Diagnostics;

View File

@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct {
row: usize = 0,
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
line_offset: usize = 0,
length: usize = 0,
message: []const u8 = "no problems",
};
pub const Error = error{
UnexpectedIndent,
UnexpectedValue,
ExtraContent,
EmptyDocument,
DuplicateKey,
BadMapEntry,
@ -42,15 +42,13 @@ pub const Options = struct {
default_object: enum { string, list, map, fail } = .fail,
};
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
var state = State.init(allocator);
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
var state = State.init(allocator, diagnostics);
defer state.deinit();
errdefer state.document.deinit();
var diagnostics = Diagnostics{};
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
.diagnostics = &diagnostics,
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
};
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
@ -65,7 +63,6 @@ pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State,
parse_options: Options = .{},
diagnostics: Diagnostics = .{},
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
const diagnostics = try allocator.create(Diagnostics);
@ -74,16 +71,15 @@ pub const StreamParser = struct {
return .{
.linetok = .{
.buffer = try buffers.ValidatingLineBuffer.init(allocator),
.diagnostics = diagnostics,
.buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
},
.parse_state = State.init(allocator),
.parse_state = State.init(allocator, diagnostics),
.parse_options = options,
};
}
pub fn deinit(self: StreamParser) void {
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics);
self.linetok.buffer.deinit();
self.parse_state.deinit();
}

View File

@ -4,6 +4,7 @@ const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Value = @import("./value.zig").Value;
pub const Document = struct {
@ -42,14 +43,16 @@ pub const State = struct {
pub const Stack = std.ArrayList(*Value);
document: Document,
diagnostics: *Diagnostics,
value_stack: Stack,
mode: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null,
pub fn init(allocator: std.mem.Allocator) State {
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
return .{
.document = Document.init(allocator),
.diagnostics = diagnostics,
.value_stack = Stack.init(allocator),
};
}

View File

@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
pub const Error = error{
BadToken,
ExtraContent,
MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
UnquantizedIndentation,
TrailingWhitespace,
Impossible,
};
@ -60,18 +60,19 @@ pub const Line = struct {
};
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a `nextLine` method
// technically be anything with a conformant interface.
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: DetectedIndentation = .unknown,
last_indent: usize = 0,
diagnostics: *Diagnostics,
row: usize = 0,
pub fn finish(self: @This()) !void {
if (!self.buffer.empty()) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document has extra content or is missing the final LF character";
return error.ExtraContent;
}
}
@ -91,13 +92,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// ugly documents.
.unknown => self.indentation = .{ .spaces = 0 },
.spaces => {},
.tabs => return error.MixedIndentation,
.tabs => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
}
},
'\t' => {
switch (self.indentation) {
.unknown => self.indentation = .tabs,
.spaces => return error.MixedIndentation,
.spaces => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.tabs => {},
}
},
@ -110,7 +121,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
},
}
} else {
if (raw_line.len > 0) return error.TrailingWhitespace;
if (raw_line.len > 0) {
self.buffer.diag().line_offset = raw_line.len - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
continue :lineloop;
}
@ -118,15 +134,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (self.indentation.spaces == 0) {
self.indentation.spaces = indent;
}
if (@rem(indent, self.indentation.spaces) != 0)
if (@rem(indent, self.indentation.spaces) != 0) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains incorrectly quantized indentation";
return error.UnquantizedIndentation;
}
break :quant @divExact(indent, self.indentation.spaces);
} else indent;
const shift: LineShift = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1)
if ((quantized - self.last_indent) > 1) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains too much indentation";
return error.TooMuchIndentation;
}
break :rel .indent;
} else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized }
@ -134,10 +158,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.none;
defer {
self.row += 1;
self.last_indent = quantized;
}
// update the diagnostics so that the parser can use them without
// knowing about the whitespace.
self.buffer.diag().line_offset = indent;
const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier.
@ -147,7 +173,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'#' => {
// force comments to be followed by a space. This makes them
// behave the same way as strings, actually.
if (line.len > 1 and line[1] != ' ') return error.BadToken;
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
return error.BadToken;
}
// simply lie about indentation when the line is a comment.
quantized = self.last_indent;
@ -160,12 +191,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'|', '>', '[', '{' => {
return .{
.shift = shift,
.contents = .{ .in_line = try detectInlineItem(line) },
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'-' => {
if (line.len > 1 and line[1] != ' ') return error.BadToken;
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
}
// blindly add 2 here because an empty item cannot fail in
// the value, only if a bogus dedent has occurred
self.buffer.diag().line_offset += 2;
return if (line.len == 1) .{
.shift = shift,
@ -173,26 +213,33 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line,
} else .{
.shift = shift,
.contents = .{ .list_item = try detectInlineItem(line[2..]) },
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
.raw = line,
};
},
else => {
for (line, 0..) |char, idx| {
if (char == ':') {
self.buffer.diag().line_offset += idx + 2;
if (idx + 1 == line.len) return .{
.shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
if (line[idx + 1] != ' ') return error.BadToken;
if (line[idx + 1] != ' ') {
self.buffer.diag().line_offset += idx + 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
return error.BadToken;
}
return .{
.shift = shift,
.contents = .{ .map_item = .{
.key = line[0..idx],
.val = try detectInlineItem(line[idx + 2 ..]),
.val = try self.detectInlineItem(line[idx + 2 ..]),
} },
.raw = line,
};
@ -208,12 +255,16 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}
// somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible;
}
return null;
}
fn detectInlineItem(buf: []const u8) Error!InlineItem {
// TODO: it's impossible to get the right diagnostic offset in this function at the moment
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty;
switch (buf[0]) {
@ -221,7 +272,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => return error.TrailingWhitespace,
' ', '\t' => {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len],
};
@ -232,22 +288,34 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.{ .space_string = slice };
},
'[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']')
if (buf.len < 2 or buf[buf.len - 1] != ']') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
return error.BadToken;
}
// keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] };
},
'{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}')
if (buf.len < 2 or buf[buf.len - 1] != '}') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
return error.BadToken;
}
// keep the closing } for the flow parser
return .{ .flow_map = buf[1..] };
},
else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
return .{ .scalar = buf };
},