diff --git a/examples/parse.zig b/examples/parse.zig index 1c890ac..315600c 100644 --- a/examples/parse.zig +++ b/examples/parse.zig @@ -15,7 +15,16 @@ pub fn main() !void { var needfree = true; defer if (needfree) allocator.free(data); - const document = try nice.parseBuffer(allocator, data, .{}); + var diagnostics = nice.Diagnostics{}; + const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| { + std.debug.print("{s}:{d} col:{d}: {s}\n", .{ + args[1], + diagnostics.row, + diagnostics.line_offset, + diagnostics.message, + }); + return err; + }; defer document.deinit(); // free data memory to ensure that the parsed document is not holding diff --git a/examples/stream.zig b/examples/stream.zig index 10c9bcd..9d0c8de 100644 --- a/examples/stream.zig +++ b/examples/stream.zig @@ -16,6 +16,7 @@ pub fn main() !void { defer file.close(); var parser = try nice.StreamParser.init(allocator, .{}); defer parser.deinit(); + errdefer parser.parse_state.document.deinit(); while (true) { var buf = [_]u8{0} ** 1024; const len = try file.read(&buf); diff --git a/src/linebuffer.zig b/src/linebuffer.zig index c68d5b6..07eb891 100644 --- a/src/linebuffer.zig +++ b/src/linebuffer.zig @@ -1,5 +1,7 @@ const std = @import("std"); +const Diagnostics = @import("./parser.zig").Diagnostics; + pub const IndexSlice = struct { start: usize, len: usize }; pub const Error = error{ @@ -45,14 +47,15 @@ pub fn LineBuffer(comptime options: Strictness) type { pub const default_capacity: usize = 4096; - pub fn init(allocator: std.mem.Allocator) !@This() { - return initCapacity(allocator, default_capacity); + pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() { + return initCapacity(allocator, diagnostics, default_capacity); } - pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() { + pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() { return .{ .allocator = allocator, 
.internal = .{ + .diagnostics = diagnostics, .buffer = try allocator.alloc(u8, capacity), .window = .{ .start = 0, .len = 0 }, }, @@ -60,6 +63,10 @@ pub fn LineBuffer(comptime options: Strictness) type { }; } + pub fn diag(self: @This()) *Diagnostics { + return self.internal.diagnostics; + } + pub fn empty(self: @This()) bool { return self.internal.empty(); } @@ -111,9 +118,18 @@ pub fn FixedLineBuffer(comptime options: Strictness) type { return struct { buffer: []const u8, window: IndexSlice, + diagnostics: *Diagnostics, - pub fn init(data: []const u8) @This() { - return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; + pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() { + return .{ + .buffer = data, + .window = .{ .start = 0, .len = data.len }, + .diagnostics = diagnostics, + }; + } + + pub fn diag(self: @This()) *Diagnostics { + return self.diagnostics; } pub fn empty(self: @This()) bool { @@ -131,16 +147,33 @@ pub fn FixedLineBuffer(comptime options: Strictness) type { const split: usize = split: { for (window, 0..) 
|char, idx| { if (comptime options.check_carriage_return) - if (char == '\r') return error.IllegalCarriageReturn; + if (char == '\r') { + self.diagnostics.row += 1; + self.diagnostics.line_offset = idx; + self.diagnostics.length = 1; + self.diagnostics.message = "found a carriage return"; + return error.IllegalCarriageReturn; + }; if (comptime options.check_nonprinting_ascii) - if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) + if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) { + self.diagnostics.row += 1; + self.diagnostics.line_offset = idx; + self.diagnostics.length = 1; + self.diagnostics.message = "found nonprinting ascii characters"; return error.IllegalNonprintingAscii; + }; if (comptime options.check_trailing_whitespace) { if (char == '\n') { - if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) + if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) { + self.diagnostics.row += 1; + self.diagnostics.line_offset = idx; + self.diagnostics.length = 1; + self.diagnostics.message = "found trailing spaces"; return error.IllegalTrailingSpace; + } + break :split idx; } } else { @@ -150,12 +183,41 @@ pub fn FixedLineBuffer(comptime options: Strictness) type { return null; }; + self.diagnostics.row += 1; + self.diagnostics.line_offset = 0; + self.window.start += split + 1; self.window.len -= split + 1; if (comptime options.validate_utf8) { const line = window[0..split]; - return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8; + + var idx: usize = 0; + while (idx < line.len) { + if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| { + if (idx + cp_len > line.len) { + self.diagnostics.line_offset = idx; + self.diagnostics.length = cp_len; + self.diagnostics.message = "truncated UTF-8 sequence"; + return error.InputIsNotValidUtf8; + } + + if (std.meta.isError(std.unicode.utf8Decode(line[idx .. 
idx + cp_len]))) { + self.diagnostics.line_offset = idx; + self.diagnostics.length = cp_len; + self.diagnostics.message = "invalid UTF-8 sequence"; + return error.InputIsNotValidUtf8; + } + idx += cp_len; + } else |_| { + self.diagnostics.line_offset = idx; + self.diagnostics.length = 1; + self.diagnostics.message = "invalid UTF-8 sequence start byte"; + return error.InputIsNotValidUtf8; + } + } + + return line; } else { return window[0..split]; } diff --git a/src/nice.zig b/src/nice.zig index d025f35..44124f8 100644 --- a/src/nice.zig +++ b/src/nice.zig @@ -68,3 +68,4 @@ pub const parseBuffer = parser.parseBuffer; pub const StreamParser = parser.StreamParser; pub const Document = parser.Document; pub const Value = parser.Value; +pub const Diagnostics = parser.Diagnostics; diff --git a/src/parser.zig b/src/parser.zig index 457c45d..16bab9d 100644 --- a/src/parser.zig +++ b/src/parser.zig @@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value; pub const Diagnostics = struct { row: usize = 0, - span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{}, + line_offset: usize = 0, + length: usize = 0, message: []const u8 = "no problems", }; pub const Error = error{ UnexpectedIndent, UnexpectedValue, - ExtraContent, EmptyDocument, DuplicateKey, BadMapEntry, @@ -42,15 +42,13 @@ pub const Options = struct { default_object: enum { string, list, map, fail } = .fail, }; -pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document { - var state = State.init(allocator); +pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document { + var state = State.init(allocator, diagnostics); defer state.deinit(); errdefer state.document.deinit(); - var diagnostics = Diagnostics{}; var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{ - .buffer = buffers.ValidatingFixedLineBuffer.init(buffer), - .diagnostics = &diagnostics, + 
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics), }; while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior); @@ -65,7 +63,6 @@ pub const StreamParser = struct { linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer), parse_state: State, parse_options: Options = .{}, - diagnostics: Diagnostics = .{}, pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser { const diagnostics = try allocator.create(Diagnostics); @@ -74,16 +71,15 @@ pub const StreamParser = struct { return .{ .linetok = .{ - .buffer = try buffers.ValidatingLineBuffer.init(allocator), - .diagnostics = diagnostics, + .buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics), }, - .parse_state = State.init(allocator), + .parse_state = State.init(allocator, diagnostics), .parse_options = options, }; } pub fn deinit(self: StreamParser) void { - self.linetok.buffer.allocator.destroy(self.linetok.diagnostics); + self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics); self.linetok.buffer.deinit(); self.parse_state.deinit(); } diff --git a/src/parser/state.zig b/src/parser/state.zig index 55a71d9..d62e7e8 100644 --- a/src/parser/state.zig +++ b/src/parser/state.zig @@ -4,6 +4,7 @@ const tokenizer = @import("../tokenizer.zig"); const Error = @import("../parser.zig").Error; const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior; const Options = @import("../parser.zig").Options; +const Diagnostics = @import("../parser.zig").Diagnostics; const Value = @import("./value.zig").Value; pub const Document = struct { @@ -42,14 +43,16 @@ pub const State = struct { pub const Stack = std.ArrayList(*Value); document: Document, + diagnostics: *Diagnostics, value_stack: Stack, mode: enum { initial, value, done } = .initial, expect_shift: tokenizer.ShiftDirection = .none, dangling_key: ?[]const u8 = null, - pub fn init(allocator: std.mem.Allocator) State { + pub fn init(allocator: std.mem.Allocator, 
diagnostics: *Diagnostics) State { return .{ .document = Document.init(allocator), + .diagnostics = diagnostics, .value_stack = Stack.init(allocator), }; } } diff --git a/src/tokenizer.zig b/src/tokenizer.zig index 14788e2..1379260 100644 --- a/src/tokenizer.zig +++ b/src/tokenizer.zig @@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics; pub const Error = error{ BadToken, + ExtraContent, MixedIndentation, - UnquantizedIndentation, TooMuchIndentation, - MissingNewline, + UnquantizedIndentation, TrailingWhitespace, Impossible, }; @@ -60,18 +60,19 @@ pub const Line = struct { }; // buffer is expected to be either LineBuffer or FixedLineBuffer, but can -// technically be anything with a `nextLine` method +// technically be anything that provides `nextLine`, `empty`, and `diag` methods. pub fn LineTokenizer(comptime Buffer: type) type { return struct { buffer: Buffer, index: usize = 0, indentation: DetectedIndentation = .unknown, last_indent: usize = 0, - diagnostics: *Diagnostics, - row: usize = 0, pub fn finish(self: @This()) !void { if (!self.buffer.empty()) { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = 1; + self.buffer.diag().message = "the document has extra content or is missing the final LF character"; return error.ExtraContent; } } @@ -91,13 +92,23 @@ pub fn LineTokenizer(comptime Buffer: type) type { // ugly documents.
.unknown => self.indentation = .{ .spaces = 0 }, .spaces => {}, - .tabs => return error.MixedIndentation, + .tabs => { + self.buffer.diag().line_offset = idx; + self.buffer.diag().length = 1; + self.buffer.diag().message = "the document contains mixed tab/space indentation"; + return error.MixedIndentation; + }, } }, '\t' => { switch (self.indentation) { .unknown => self.indentation = .tabs, - .spaces => return error.MixedIndentation, + .spaces => { + self.buffer.diag().line_offset = idx; + self.buffer.diag().length = 1; + self.buffer.diag().message = "the document contains mixed tab/space indentation"; + return error.MixedIndentation; + }, .tabs => {}, } }, @@ -110,7 +121,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { }, } } else { - if (raw_line.len > 0) return error.TrailingWhitespace; + if (raw_line.len > 0) { + self.buffer.diag().line_offset = raw_line.len - 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains trailing whitespace"; + return error.TrailingWhitespace; + } continue :lineloop; } @@ -118,15 +134,23 @@ pub fn LineTokenizer(comptime Buffer: type) type { if (self.indentation.spaces == 0) { self.indentation.spaces = indent; } - if (@rem(indent, self.indentation.spaces) != 0) + if (@rem(indent, self.indentation.spaces) != 0) { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = indent; + self.buffer.diag().message = "this line contains incorrectly quantized indentation"; return error.UnquantizedIndentation; + } break :quant @divExact(indent, self.indentation.spaces); } else indent; const shift: LineShift = if (quantized > self.last_indent) rel: { - if ((quantized - self.last_indent) > 1) + if ((quantized - self.last_indent) > 1) { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = indent; + self.buffer.diag().message = "this line contains too much indentation"; return error.TooMuchIndentation; + } break :rel .indent; } else if (quantized < self.last_indent) .{ .dedent = 
self.last_indent - quantized } @@ -134,10 +158,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { .none; defer { - self.row += 1; self.last_indent = quantized; } + // update the diagnostics so that the parser can use them without + // knowing about the whitespace. + self.buffer.diag().line_offset = indent; const line = raw_line[indent..]; // this should not be possible, as empty lines are caught earlier. @@ -147,7 +173,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { '#' => { // force comments to be followed by a space. This makes them // behave the same way as strings, actually. - if (line.len > 1 and line[1] != ' ') return error.BadToken; + if (line.len > 1 and line[1] != ' ') { + self.buffer.diag().line_offset += 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the start of comment character '#'"; + return error.BadToken; + } // simply lie about indentation when the line is a comment. quantized = self.last_indent; @@ -160,12 +191,21 @@ pub fn LineTokenizer(comptime Buffer: type) type { '|', '>', '[', '{' => { return .{ .shift = shift, - .contents = .{ .in_line = try detectInlineItem(line) }, + .contents = .{ .in_line = try self.detectInlineItem(line) }, .raw = line, }; }, '-' => { - if (line.len > 1 and line[1] != ' ') return error.BadToken; + if (line.len > 1 and line[1] != ' ') { + self.buffer.diag().line_offset += 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the list entry character '-'"; + return error.BadToken; + } + + // blindly add 2 here because an empty item cannot fail in + // the value, only if a bogus dedent has occurred + self.buffer.diag().line_offset += 2; return if (line.len == 1) .{ .shift = shift, @@ -173,26 +213,33 @@ pub fn LineTokenizer(comptime Buffer: type) type { .raw = line, } else .{ .shift = shift, - .contents = .{ .list_item = try detectInlineItem(line[2..]) }, + .contents = .{ .list_item = try 
self.detectInlineItem(line[2..]) }, .raw = line, }; }, else => { for (line, 0..) |char, idx| { if (char == ':') { + self.buffer.diag().line_offset += idx + 2; + if (idx + 1 == line.len) return .{ .shift = shift, .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, .raw = line, }; - if (line[idx + 1] != ' ') return error.BadToken; + if (line[idx + 1] != ' ') { + self.buffer.diag().line_offset += idx + 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'"; + return error.BadToken; + } return .{ .shift = shift, .contents = .{ .map_item = .{ .key = line[0..idx], - .val = try detectInlineItem(line[idx + 2 ..]), + .val = try self.detectInlineItem(line[idx + 2 ..]), } }, .raw = line, }; @@ -208,12 +255,16 @@ pub fn LineTokenizer(comptime Buffer: type) type { } // somehow everything else has failed + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = raw_line.len; + self.buffer.diag().message = "this document contains an unknown error. Please report this."; return error.Impossible; } return null; } - fn detectInlineItem(buf: []const u8) Error!InlineItem { + // TODO: it's impossible to get the right diagnostic offset in this function at the moment + fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem { if (buf.len == 0) return .empty; switch (buf[0]) { @@ -221,7 +272,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { if (buf.len > 1 and buf[1] != ' ') return error.BadToken; const slice: []const u8 = switch (buf[buf.len - 1]) { - ' ', '\t' => return error.TrailingWhitespace, + ' ', '\t' => { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains trailing whitespace"; + return error.TrailingWhitespace; + }, '|' => buf[@min(2, buf.len) .. 
buf.len - @intFromBool(buf.len > 1)], else => buf[@min(2, buf.len)..buf.len], }; @@ -232,22 +288,34 @@ pub fn LineTokenizer(comptime Buffer: type) type { .{ .space_string = slice }; }, '[' => { - if (buf.len < 2 or buf[buf.len - 1] != ']') + if (buf.len < 2 or buf[buf.len - 1] != ']') { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'"; return error.BadToken; + } // keep the closing ] for the flow parser return .{ .flow_list = buf[1..] }; }, '{' => { - if (buf.len < 2 or buf[buf.len - 1] != '}') + if (buf.len < 2 or buf[buf.len - 1] != '}') { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'"; return error.BadToken; + } // keep the closing } fpr the flow parser return .{ .flow_map = buf[1..] }; }, else => { - if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') + if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') { + self.buffer.diag().line_offset = 0; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains trailing whitespace"; return error.TrailingWhitespace; + } return .{ .scalar = buf }; },