7 changed files with 45 additions and 206 deletions
--- a/examples/parse.zig
+++ b/examples/parse.zig
@ -15,16 +15,7 @@ pub fn main() !void {
    var needfree = true;
    defer if (needfree) allocator.free(data);
-    var diagnostics = nice.Diagnostics{};
+    const document = try nice.parseBuffer(allocator, data, .{});
    const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
        std.debug.print("{s}:{d} col:{d}: {s}\n", .{
            args[1],
            diagnostics.row,
            diagnostics.line_offset,
            diagnostics.message,
        });
        return err;
    };
    defer document.deinit();
    // free data memory to ensure that the parsed document is not holding
--- a/examples/stream.zig
+++ b/examples/stream.zig
@ -16,7 +16,6 @@ pub fn main() !void {
        defer file.close();
        var parser = try nice.StreamParser.init(allocator, .{});
        defer parser.deinit();
        errdefer parser.parse_state.document.deinit();
        while (true) {
            var buf = [_]u8{0} ** 1024;
            const len = try file.read(&buf);
--- a/src/linebuffer.zig
+++ b/src/linebuffer.zig
@ -1,7 +1,5 @@
 const std = @import("std");
 const Diagnostics = @import("./parser.zig").Diagnostics;
 pub const IndexSlice = struct { start: usize, len: usize };
 pub const Error = error{
@ -47,15 +45,14 @@ pub fn LineBuffer(comptime options: Strictness) type {
        pub const default_capacity: usize = 4096;
-        pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() {
+        pub fn init(allocator: std.mem.Allocator) !@This() {
-            return initCapacity(allocator, diagnostics, default_capacity);
+            return initCapacity(allocator, default_capacity);
        }
-        pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() {
+        pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
            return .{
                .allocator = allocator,
                .internal = .{
                    .diagnostics = diagnostics,
                    .buffer = try allocator.alloc(u8, capacity),
                    .window = .{ .start = 0, .len = 0 },
                },
@ -63,14 +60,6 @@ pub fn LineBuffer(comptime options: Strictness) type {
            };
        }
        pub fn diag(self: @This()) *Diagnostics {
            return self.internal.diagnostics;
        }
        pub fn empty(self: @This()) bool {
            return self.internal.empty();
        }
        pub fn deinit(self: @This()) void {
            self.allocator.free(self.internal.buffer);
        }
@ -118,24 +107,9 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
    return struct {
        buffer: []const u8,
        window: IndexSlice,
        diagnostics: *Diagnostics,
-        pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() {
+        pub fn init(data: []const u8) @This() {
-            return .{
+            return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
                .buffer = data,
                .window = .{ .start = 0, .len = data.len },
                .diagnostics = diagnostics,
            };
        }
        pub fn diag(self: @This()) *Diagnostics {
            return self.diagnostics;
        }
        pub fn empty(self: @This()) bool {
            // we can't check the overall buffer size because the dynamic buffer may be
            // overallocated
            return self.window.len == 0;
        }
        pub fn nextLine(self: *@This()) !?[]const u8 {
@ -147,33 +121,16 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
            const split: usize = split: {
                for (window, 0..) |char, idx| {
                    if (comptime options.check_carriage_return)
-                        if (char == '\r') {
+                        if (char == '\r') return error.IllegalCarriageReturn;
                            self.diagnostics.row += 1;
                            self.diagnostics.line_offset = idx;
                            self.diagnostics.length = 1;
                            self.diagnostics.message = "found a carriage return";
                            return error.IllegalCarriageReturn;
                        };
                    if (comptime options.check_nonprinting_ascii)
-                        if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) {
+                        if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
                            self.diagnostics.row += 1;
                            self.diagnostics.line_offset = idx;
                            self.diagnostics.length = 1;
                            self.diagnostics.message = "found nonprinting ascii characters";
                            return error.IllegalNonprintingAscii;
                        };
                    if (comptime options.check_trailing_whitespace) {
                        if (char == '\n') {
-                            if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) {
+                            if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
                                self.diagnostics.row += 1;
                                self.diagnostics.line_offset = idx;
                                self.diagnostics.length = 1;
                                self.diagnostics.message = "found trailing spaces";
                                return error.IllegalTrailingSpace;
                            }
                            break :split idx;
                        }
                    } else {
@ -183,41 +140,12 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
                return null;
            };
            self.diagnostics.row += 1;
            self.diagnostics.line_offset = 0;
            self.window.start += split + 1;
            self.window.len -= split + 1;
            if (comptime options.validate_utf8) {
                const line = window[0..split];
-
+                return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
                var idx: usize = 0;
                while (idx < line.len) {
                    if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
                        if (idx + cp_len > line.len) {
                            self.diagnostics.line_offset = idx;
                            self.diagnostics.length = cp_len;
                            self.diagnostics.message = "truncated UTF-8 sequence";
                            return error.InputIsNotValidUtf8;
                        }
                        if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
                            self.diagnostics.line_offset = idx;
                            self.diagnostics.length = cp_len;
                            self.diagnostics.message = "invalid UTF-8 sequence";
                            return error.InputIsNotValidUtf8;
                        }
                        idx += cp_len;
                    } else |_| {
                        self.diagnostics.line_offset = idx;
                        self.diagnostics.length = 1;
                        self.diagnostics.message = "invalid UTF-8 sequence start byte";
                        return error.InputIsNotValidUtf8;
                    }
                }
                return line;
            } else {
                return window[0..split];
            }
--- a/src/nice.zig
+++ b/src/nice.zig
@ -68,4 +68,3 @@ pub const parseBuffer = parser.parseBuffer;
 pub const StreamParser = parser.StreamParser;
 pub const Document = parser.Document;
 pub const Value = parser.Value;
 pub const Diagnostics = parser.Diagnostics;
--- a/src/parser.zig
+++ b/src/parser.zig
@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
 pub const Diagnostics = struct {
    row: usize = 0,
-    line_offset: usize = 0,
+    span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
    length: usize = 0,
    message: []const u8 = "no problems",
 };
 pub const Error = error{
    UnexpectedIndent,
    UnexpectedValue,
    ExtraContent,
    EmptyDocument,
    DuplicateKey,
    BadMapEntry,
@ -42,20 +42,18 @@ pub const Options = struct {
    default_object: enum { string, list, map, fail } = .fail,
 };
-pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
+pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
-    var state = State.init(allocator, diagnostics);
+    var state = State.init(allocator);
    defer state.deinit();
    errdefer state.document.deinit();
    var diagnostics = Diagnostics{};
    var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
-        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
+        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
        .diagnostics = &diagnostics,
    };
    while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
    // state doesn't have access to the tokenizer, which is the only thing that can
    // error if unparsed lines remain in the buffer by the time that "finish" is
    // called.
    try tok.finish();
    return try state.finish(options);
 }
@ -63,6 +61,7 @@ pub const StreamParser = struct {
    linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
    parse_state: State,
    parse_options: Options = .{},
    diagnostics: Diagnostics = .{},
    pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
        const diagnostics = try allocator.create(Diagnostics);
@ -71,15 +70,16 @@ pub const StreamParser = struct {
        return .{
            .linetok = .{
-                .buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
+                .buffer = try buffers.ValidatingLineBuffer.init(allocator),
                .diagnostics = diagnostics,
            },
-            .parse_state = State.init(allocator, diagnostics),
+            .parse_state = State.init(allocator),
            .parse_options = options,
        };
    }
    pub fn deinit(self: StreamParser) void {
-        self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics);
+        self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
        self.linetok.buffer.deinit();
        self.parse_state.deinit();
    }
@ -90,7 +90,6 @@ pub const StreamParser = struct {
    }
    pub fn finish(self: *StreamParser) !Document {
        try self.linetok.finish();
        return try self.parse_state.finish(self.parse_options);
    }
 };
--- a/src/parser/state.zig
+++ b/src/parser/state.zig
@ -4,7 +4,6 @@ const tokenizer = @import("../tokenizer.zig");
 const Error = @import("../parser.zig").Error;
 const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
 const Options = @import("../parser.zig").Options;
 const Diagnostics = @import("../parser.zig").Diagnostics;
 const Value = @import("./value.zig").Value;
 pub const Document = struct {
@ -43,16 +42,14 @@ pub const State = struct {
    pub const Stack = std.ArrayList(*Value);
    document: Document,
    diagnostics: *Diagnostics,
    value_stack: Stack,
    mode: enum { initial, value, done } = .initial,
    expect_shift: tokenizer.ShiftDirection = .none,
    dangling_key: ?[]const u8 = null,
-    pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
+    pub fn init(allocator: std.mem.Allocator) State {
        return .{
            .document = Document.init(allocator),
            .diagnostics = diagnostics,
            .value_stack = Stack.init(allocator),
        };
    }
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
 pub const Error = error{
    BadToken,
    ExtraContent,
    MixedIndentation,
    TooMuchIndentation,
    UnquantizedIndentation,
    TooMuchIndentation,
    MissingNewline,
    TrailingWhitespace,
    Impossible,
 };
@ -60,22 +60,15 @@ pub const Line = struct {
 };
 // buffer is expected to be either LineBuffer or FixedLineBuffer, but can
-// technically be anything with a conformant interface.
+// technically be anything with a `nextLine` method
 pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        indentation: DetectedIndentation = .unknown,
        last_indent: usize = 0,
-
+        diagnostics: *Diagnostics,
-        pub fn finish(self: @This()) !void {
+        row: usize = 0,
            if (!self.buffer.empty()) {
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "the document has extra content or is missing the final LF character";
                return error.ExtraContent;
            }
        }
        pub fn next(self: *@This()) !?Line {
            lineloop: while (try self.buffer.nextLine()) |raw_line| {
@ -92,23 +85,13 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
-                                .tabs => {
+                                .tabs => return error.MixedIndentation,
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
-                                .spaces => {
+                                .spaces => return error.MixedIndentation,
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                                .tabs => {},
                            }
                        },
@ -121,12 +104,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        },
                    }
                } else {
-                    if (raw_line.len > 0) {
+                    if (raw_line.len > 0) return error.TrailingWhitespace;
                        self.buffer.diag().line_offset = raw_line.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }
                    continue :lineloop;
                }
@ -134,23 +112,15 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
-                    if (@rem(indent, self.indentation.spaces) != 0) {
+                    if (@rem(indent, self.indentation.spaces) != 0)
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains incorrectly quantized indentation";
                        return error.UnquantizedIndentation;
                    }
                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;
                const shift: LineShift = if (quantized > self.last_indent) rel: {
-                    if ((quantized - self.last_indent) > 1) {
+                    if ((quantized - self.last_indent) > 1)
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains too much indentation";
                        return error.TooMuchIndentation;
                    }
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
@ -158,12 +128,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    .none;
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }
                // update the diagnostics so that the parser can use them without
                // knowing about the whitespace.
                self.buffer.diag().line_offset = indent;
                const line = raw_line[indent..];
                // this should not be possible, as empty lines are caught earlier.
@ -173,12 +141,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    '#' => {
                        // force comments to be followed by a space. This makes them
                        // behave the same way as strings, actually.
-                        if (line.len > 1 and line[1] != ' ') {
+                        if (line.len > 1 and line[1] != ' ') return error.BadToken;
                            self.buffer.diag().line_offset += 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
                            return error.BadToken;
                        }
                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
@ -191,21 +154,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    '|', '>', '[', '{' => {
                        return .{
                            .shift = shift,
-                            .contents = .{ .in_line = try self.detectInlineItem(line) },
+                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
-                        if (line.len > 1 and line[1] != ' ') {
+                        if (line.len > 1 and line[1] != ' ') return error.BadToken;
                            self.buffer.diag().line_offset += 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
                            return error.BadToken;
                        }
                        // blindly add 2 here because an empty item cannot fail in
                        // the value, only if a bogus dedent has occurred
                        self.buffer.diag().line_offset += 2;
                        return if (line.len == 1) .{
                            .shift = shift,
@ -213,33 +167,26 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                            .raw = line,
                        } else .{
                            .shift = shift,
-                            .contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
+                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                self.buffer.diag().line_offset += idx + 2;
                                if (idx + 1 == line.len) return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };
-                                if (line[idx + 1] != ' ') {
+                                if (line[idx + 1] != ' ') return error.BadToken;
                                    self.buffer.diag().line_offset += idx + 1;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
                                    return error.BadToken;
                                }
                                return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
-                                        .val = try self.detectInlineItem(line[idx + 2 ..]),
+                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
@ -255,16 +202,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                }
                // somehow everything else has failed
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = raw_line.len;
                self.buffer.diag().message = "this document contains an unknown error. Please report this.";
                return error.Impossible;
            }
            return null;
        }
-        // TODO: it's impossible to get the right diagnostic offset in this function at the moment
+        fn detectInlineItem(buf: []const u8) Error!InlineItem {
        fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;
            switch (buf[0]) {
@ -272,12 +215,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
-                        ' ', '\t' => {
+                        ' ', '\t' => return error.TrailingWhitespace,
                            self.buffer.diag().line_offset = 0;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains trailing whitespace";
                            return error.TrailingWhitespace;
                        },
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };
@ -288,34 +226,22 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        .{ .space_string = slice };
                },
                '[' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != ']') {
+                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
                        return error.BadToken;
                    }
                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != '}') {
+                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
                        return error.BadToken;
                    }
                    // keep the closing } fpr the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
-                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
+                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }
                    return .{ .scalar = buf };
                },