From 01f98f9aff46bb4c6d510a6dff4d5208736f18e8 Mon Sep 17 00:00:00 2001
From: torque <torque@epicyclic.dev>
Date: Wed, 27 Sep 2023 23:44:06 -0700
Subject: [PATCH] parser: start the arduous journey of hooking up diagnostics

The errors in the line buffer and tokenizer now have diagnostics. The
line number is trivial to keep track of due to the line buffer, but
the column index requires quite a bit of juggling, as we pass
successively trimmed-down buffers to the internals of the parser.
There will probably be some column index counting problems in the
future. Also, handling the diagnostics is a bit awkward, since it's a
mandatory out-parameter of the parse functions now. The user must
provide a valid diagnostics object that survives for the life of the
parser.
---
 examples/parse.zig   |  11 ++++-
 examples/stream.zig  |   1 +
 src/linebuffer.zig   |  80 +++++++++++++++++++++++++++----
 src/nice.zig         |   1 +
 src/parser.zig       |  20 ++++----
 src/parser/state.zig |   5 +-
 src/tokenizer.zig    | 112 ++++++++++++++++++++++++++++++++++---------
 7 files changed, 185 insertions(+), 45 deletions(-)

diff --git a/examples/parse.zig b/examples/parse.zig
index 1c890ac..315600c 100644
--- a/examples/parse.zig
+++ b/examples/parse.zig
@@ -15,7 +15,16 @@ pub fn main() !void {
     var needfree = true;
     defer if (needfree) allocator.free(data);
 
-    const document = try nice.parseBuffer(allocator, data, .{});
+    var diagnostics = nice.Diagnostics{};
+    const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
+        std.debug.print("{s}:{d} col:{d}: {s}\n", .{
+            args[1],
+            diagnostics.row,
+            diagnostics.line_offset,
+            diagnostics.message,
+        });
+        return err;
+    };
     defer document.deinit();
 
     // free data memory to ensure that the parsed document is not holding
diff --git a/examples/stream.zig b/examples/stream.zig
index 10c9bcd..9d0c8de 100644
--- a/examples/stream.zig
+++ b/examples/stream.zig
@@ -16,6 +16,7 @@ pub fn main() !void {
         defer file.close();
         var parser = try nice.StreamParser.init(allocator, .{});
         defer parser.deinit();
+        errdefer parser.parse_state.document.deinit();
         while (true) {
             var buf = [_]u8{0} ** 1024;
             const len = try file.read(&buf);
diff --git a/src/linebuffer.zig b/src/linebuffer.zig
index c68d5b6..07eb891 100644
--- a/src/linebuffer.zig
+++ b/src/linebuffer.zig
@@ -1,5 +1,7 @@
 const std = @import("std");
 
+const Diagnostics = @import("./parser.zig").Diagnostics;
+
 pub const IndexSlice = struct { start: usize, len: usize };
 
 pub const Error = error{
@@ -45,14 +47,15 @@ pub fn LineBuffer(comptime options: Strictness) type {
 
         pub const default_capacity: usize = 4096;
 
-        pub fn init(allocator: std.mem.Allocator) !@This() {
-            return initCapacity(allocator, default_capacity);
+        pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() {
+            return initCapacity(allocator, diagnostics, default_capacity);
         }
 
-        pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
+        pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() {
             return .{
                 .allocator = allocator,
                 .internal = .{
+                    .diagnostics = diagnostics,
                     .buffer = try allocator.alloc(u8, capacity),
                     .window = .{ .start = 0, .len = 0 },
                 },
@@ -60,6 +63,10 @@ pub fn LineBuffer(comptime options: Strictness) type {
             };
         }
 
+        pub fn diag(self: @This()) *Diagnostics {
+            return self.internal.diagnostics;
+        }
+
         pub fn empty(self: @This()) bool {
             return self.internal.empty();
         }
@@ -111,9 +118,18 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
     return struct {
         buffer: []const u8,
         window: IndexSlice,
+        diagnostics: *Diagnostics,
 
-        pub fn init(data: []const u8) @This() {
-            return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
+        pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() {
+            return .{
+                .buffer = data,
+                .window = .{ .start = 0, .len = data.len },
+                .diagnostics = diagnostics,
+            };
+        }
+
+        pub fn diag(self: @This()) *Diagnostics {
+            return self.diagnostics;
         }
 
         pub fn empty(self: @This()) bool {
@@ -131,16 +147,33 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
             const split: usize = split: {
                 for (window, 0..) |char, idx| {
                     if (comptime options.check_carriage_return)
-                        if (char == '\r') return error.IllegalCarriageReturn;
+                        if (char == '\r') {
+                            self.diagnostics.row += 1;
+                            self.diagnostics.line_offset = idx;
+                            self.diagnostics.length = 1;
+                            self.diagnostics.message = "found a carriage return";
+                            return error.IllegalCarriageReturn;
+                        };
 
                     if (comptime options.check_nonprinting_ascii)
-                        if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
+                        if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) {
+                            self.diagnostics.row += 1;
+                            self.diagnostics.line_offset = idx;
+                            self.diagnostics.length = 1;
+                            self.diagnostics.message = "found nonprinting ascii characters";
                             return error.IllegalNonprintingAscii;
+                        };
 
                     if (comptime options.check_trailing_whitespace) {
                         if (char == '\n') {
-                            if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
+                            if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) {
+                                self.diagnostics.row += 1;
+                                self.diagnostics.line_offset = idx - 1; // the offending whitespace byte, not the LF at idx
+                                self.diagnostics.length = 1;
+                                self.diagnostics.message = "found trailing spaces";
+                                return error.IllegalTrailingSpace;
+                            }
+
                             break :split idx;
                         }
                     } else {
@@ -150,12 +183,41 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
                 return null;
             };
 
+            self.diagnostics.row += 1;
+            self.diagnostics.line_offset = 0;
+
             self.window.start += split + 1;
             self.window.len -= split + 1;
 
             if (comptime options.validate_utf8) {
                 const line = window[0..split];
-                return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
+
+                var idx: usize = 0;
+                while (idx < line.len) {
+                    if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
+                        if (idx + cp_len > line.len) {
+                            self.diagnostics.line_offset = idx;
+                            self.diagnostics.length = cp_len;
+                            self.diagnostics.message = "truncated UTF-8 sequence";
+                            return error.InputIsNotValidUtf8;
+                        }
+
+                        if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
+                            self.diagnostics.line_offset = idx;
+                            self.diagnostics.length = cp_len;
+                            self.diagnostics.message = "invalid UTF-8 sequence";
+                            return error.InputIsNotValidUtf8;
+                        }
+                        idx += cp_len;
+                    } else |_| {
+                        self.diagnostics.line_offset = idx;
+                        self.diagnostics.length = 1;
+                        self.diagnostics.message = "invalid UTF-8 sequence start byte";
+                        return error.InputIsNotValidUtf8;
+                    }
+                }
+
+                return line;
             } else {
                 return window[0..split];
             }
diff --git a/src/nice.zig b/src/nice.zig
index d025f35..44124f8 100644
--- a/src/nice.zig
+++ b/src/nice.zig
@@ -68,3 +68,4 @@ pub const parseBuffer = parser.parseBuffer;
 pub const StreamParser = parser.StreamParser;
 pub const Document = parser.Document;
 pub const Value = parser.Value;
+pub const Diagnostics = parser.Diagnostics;
diff --git a/src/parser.zig b/src/parser.zig
index 457c45d..16bab9d 100644
--- a/src/parser.zig
+++ b/src/parser.zig
@@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
 
 pub const Diagnostics = struct {
     row: usize = 0,
-    span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
+    line_offset: usize = 0,
+    length: usize = 0,
     message: []const u8 = "no problems",
 };
 
 pub const Error = error{
     UnexpectedIndent,
     UnexpectedValue,
-    ExtraContent,
     EmptyDocument,
     DuplicateKey,
     BadMapEntry,
@@ -42,15 +42,13 @@ pub const Options = struct {
     default_object: enum { string, list, map, fail } = .fail,
 };
 
-pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
-    var state = State.init(allocator);
+pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
+    var state = State.init(allocator, diagnostics);
     defer state.deinit();
     errdefer state.document.deinit();
 
-    var diagnostics = Diagnostics{};
     var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
-        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
-        .diagnostics = &diagnostics,
+        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
     };
 
     while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
@@ -65,7 +63,6 @@ pub const StreamParser = struct {
     linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
     parse_state: State,
     parse_options: Options = .{},
-    diagnostics: Diagnostics = .{},
 
     pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
         const diagnostics = try allocator.create(Diagnostics);
@@ -74,16 +71,15 @@ pub const StreamParser = struct {
 
         return .{
             .linetok = .{
-                .buffer = try buffers.ValidatingLineBuffer.init(allocator),
-                .diagnostics = diagnostics,
+                .buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
             },
-            .parse_state = State.init(allocator),
+            .parse_state = State.init(allocator, diagnostics),
             .parse_options = options,
         };
     }
 
     pub fn deinit(self: StreamParser) void {
-        self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
+        self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics);
         self.linetok.buffer.deinit();
         self.parse_state.deinit();
     }
diff --git a/src/parser/state.zig b/src/parser/state.zig
index 55a71d9..d62e7e8 100644
--- a/src/parser/state.zig
+++ b/src/parser/state.zig
@@ -4,6 +4,7 @@ const tokenizer = @import("../tokenizer.zig");
 const Error = @import("../parser.zig").Error;
 const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
 const Options = @import("../parser.zig").Options;
+const Diagnostics = @import("../parser.zig").Diagnostics;
 const Value = @import("./value.zig").Value;
 
 pub const Document = struct {
@@ -42,14 +43,16 @@ pub const State = struct {
     pub const Stack = std.ArrayList(*Value);
 
     document: Document,
+    diagnostics: *Diagnostics,
     value_stack: Stack,
     mode: enum { initial, value, done } = .initial,
     expect_shift: tokenizer.ShiftDirection = .none,
     dangling_key: ?[]const u8 = null,
 
-    pub fn init(allocator: std.mem.Allocator) State {
+    pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
         return .{
             .document = Document.init(allocator),
+            .diagnostics = diagnostics,
             .value_stack = Stack.init(allocator),
         };
     }
diff --git a/src/tokenizer.zig b/src/tokenizer.zig
index 14788e2..1379260 100644
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
 
 pub const Error = error{
     BadToken,
+    ExtraContent,
     MixedIndentation,
-    UnquantizedIndentation,
     TooMuchIndentation,
-    MissingNewline,
+    UnquantizedIndentation,
     TrailingWhitespace,
     Impossible,
 };
@@ -60,18 +60,19 @@ pub const Line = struct {
 };
 
 // buffer is expected to be either LineBuffer or FixedLineBuffer, but can
-// technically be anything with a `nextLine` method
+// technically be anything with a conformant interface.
 pub fn LineTokenizer(comptime Buffer: type) type {
     return struct {
         buffer: Buffer,
         index: usize = 0,
         indentation: DetectedIndentation = .unknown,
         last_indent: usize = 0,
-        diagnostics: *Diagnostics,
-        row: usize = 0,
 
         pub fn finish(self: @This()) !void {
             if (!self.buffer.empty()) {
+                self.buffer.diag().line_offset = 0;
+                self.buffer.diag().length = 1;
+                self.buffer.diag().message = "the document has extra content or is missing the final LF character";
                 return error.ExtraContent;
             }
         }
@@ -91,13 +92,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                                 // ugly documents.
                                 .unknown => self.indentation = .{ .spaces = 0 },
                                 .spaces => {},
-                                .tabs => return error.MixedIndentation,
+                                .tabs => {
+                                    self.buffer.diag().line_offset = idx;
+                                    self.buffer.diag().length = 1;
+                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
+                                    return error.MixedIndentation;
+                                },
                             }
                         },
                         '\t' => {
                             switch (self.indentation) {
                                 .unknown => self.indentation = .tabs,
-                                .spaces => return error.MixedIndentation,
+                                .spaces => {
+                                    self.buffer.diag().line_offset = idx;
+                                    self.buffer.diag().length = 1;
+                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
+                                    return error.MixedIndentation;
+                                },
                                 .tabs => {},
                             }
                         },
@@ -110,7 +121,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                         },
                     }
                 } else {
-                    if (raw_line.len > 0) return error.TrailingWhitespace;
+                    if (raw_line.len > 0) {
+                        self.buffer.diag().line_offset = raw_line.len - 1;
+                        self.buffer.diag().length = 1;
+                        self.buffer.diag().message = "this line contains trailing whitespace";
+                        return error.TrailingWhitespace;
+                    }
                     continue :lineloop;
                 }
 
@@ -118,15 +134,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                     if (self.indentation.spaces == 0) {
                         self.indentation.spaces = indent;
                     }
-                    if (@rem(indent, self.indentation.spaces) != 0)
+                    if (@rem(indent, self.indentation.spaces) != 0) {
+                        self.buffer.diag().line_offset = 0;
+                        self.buffer.diag().length = indent;
+                        self.buffer.diag().message = "this line contains incorrectly quantized indentation";
                         return error.UnquantizedIndentation;
+                    }
 
                     break :quant @divExact(indent, self.indentation.spaces);
                 } else indent;
 
                 const shift: LineShift = if (quantized > self.last_indent) rel: {
-                    if ((quantized - self.last_indent) > 1)
+                    if ((quantized - self.last_indent) > 1) {
+                        self.buffer.diag().line_offset = 0;
+                        self.buffer.diag().length = indent;
+                        self.buffer.diag().message = "this line contains too much indentation";
                         return error.TooMuchIndentation;
+                    }
                     break :rel .indent;
                 } else if (quantized < self.last_indent)
                     .{ .dedent = self.last_indent - quantized }
@@ -134,10 +158,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                     .none;
 
                 defer {
-                    self.row += 1;
                     self.last_indent = quantized;
                 }
 
+                // update the diagnostics so that the parser can use them without
+                // knowing about the whitespace.
+                self.buffer.diag().line_offset = indent;
                 const line = raw_line[indent..];
 
                 // this should not be possible, as empty lines are caught earlier.
@@ -147,7 +173,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                     '#' => {
                         // force comments to be followed by a space. This makes them
                         // behave the same way as strings, actually.
-                        if (line.len > 1 and line[1] != ' ') return error.BadToken;
+                        if (line.len > 1 and line[1] != ' ') {
+                            self.buffer.diag().line_offset += 1;
+                            self.buffer.diag().length = 1;
+                            self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
+                            return error.BadToken;
+                        }
 
                         // simply lie about indentation when the line is a comment.
                         quantized = self.last_indent;
@@ -160,12 +191,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                     '|', '>', '[', '{' => {
                         return .{
                             .shift = shift,
-                            .contents = .{ .in_line = try detectInlineItem(line) },
+                            .contents = .{ .in_line = try self.detectInlineItem(line) },
                             .raw = line,
                         };
                     },
                     '-' => {
-                        if (line.len > 1 and line[1] != ' ') return error.BadToken;
+                        if (line.len > 1 and line[1] != ' ') {
+                            self.buffer.diag().line_offset += 1;
+                            self.buffer.diag().length = 1;
+                            self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
+                            return error.BadToken;
+                        }
+
+                        // blindly add 2 here: an empty item has no value that could
+                        // fail, so the offset is only used if a bogus dedent occurred
+                        self.buffer.diag().line_offset += 2;
 
                         return if (line.len == 1) .{
                             .shift = shift,
@@ -173,26 +213,33 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                             .raw = line,
                         } else .{
                             .shift = shift,
-                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
+                            .contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
                             .raw = line,
                         };
                     },
                     else => {
                         for (line, 0..) |char, idx| {
                             if (char == ':') {
+                                self.buffer.diag().line_offset += idx + 2;
+
                                 if (idx + 1 == line.len) return .{
                                     .shift = shift,
                                     .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                     .raw = line,
                                 };
 
-                                if (line[idx + 1] != ' ') return error.BadToken;
+                                if (line[idx + 1] != ' ') {
+                                    self.buffer.diag().line_offset -= 1; // offset was already advanced by idx + 2 above; step back to the byte after ':'
+                                    self.buffer.diag().length = 1;
+                                    self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
+                                    return error.BadToken;
+                                }
 
                                 return .{
                                     .shift = shift,
                                     .contents = .{ .map_item = .{
                                         .key = line[0..idx],
-                                        .val = try detectInlineItem(line[idx + 2 ..]),
+                                        .val = try self.detectInlineItem(line[idx + 2 ..]),
                                     } },
                                     .raw = line,
                                 };
@@ -208,12 +255,16 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                 }
 
                 // somehow everything else has failed
+                self.buffer.diag().line_offset = 0;
+                self.buffer.diag().length = raw_line.len;
+                self.buffer.diag().message = "this document contains an unknown error. Please report this.";
                 return error.Impossible;
             }
             return null;
         }
 
-        fn detectInlineItem(buf: []const u8) Error!InlineItem {
+        // TODO: it's impossible to get the right diagnostic offset in this function at the moment
+        fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
             if (buf.len == 0) return .empty;
 
             switch (buf[0]) {
@@ -221,7 +272,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                     if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
 
                     const slice: []const u8 = switch (buf[buf.len - 1]) {
-                        ' ', '\t' => return error.TrailingWhitespace,
+                        ' ', '\t' => {
+                            self.buffer.diag().line_offset = 0;
+                            self.buffer.diag().length = 1;
+                            self.buffer.diag().message = "this line contains trailing whitespace";
+                            return error.TrailingWhitespace;
+                        },
                         '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                         else => buf[@min(2, buf.len)..buf.len],
                     };
@@ -232,22 +288,34 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                         .{ .space_string = slice };
                 },
                 '[' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != ']')
+                    if (buf.len < 2 or buf[buf.len - 1] != ']') {
+                        self.buffer.diag().line_offset = 0;
+                        self.buffer.diag().length = 1;
+                        self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
                         return error.BadToken;
+                    }
 
                     // keep the closing ] for the flow parser
                     return .{ .flow_list = buf[1..] };
                 },
                 '{' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != '}')
+                    if (buf.len < 2 or buf[buf.len - 1] != '}') {
+                        self.buffer.diag().line_offset = 0;
+                        self.buffer.diag().length = 1;
+                        self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
                         return error.BadToken;
+                    }
 
                     // keep the closing } fpr the flow parser
                     return .{ .flow_map = buf[1..] };
                 },
                 else => {
-                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
+                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
+                        self.buffer.diag().line_offset = 0;
+                        self.buffer.diag().length = 1;
+                        self.buffer.diag().message = "this line contains trailing whitespace";
                         return error.TrailingWhitespace;
+                    }
 
                     return .{ .scalar = buf };
                 },