parser: start the arduous journey of hooking up diagnostics

The errors in the line buffer and tokenizer now have diagnostics. The
line number is trivial to keep track of thanks to the line buffer, but
the column index requires quite a bit of juggling, since we pass
successively trimmed-down buffers to the internals of the parser.
There will probably be some column index counting problems in the
future. Also, handling the diagnostics is a bit awkward, since it's a
mandatory out-parameter of the parse functions now. The user must
provide a valid diagnostics object that survives for the life of the
parser.
This commit is contained in:
torque 2023-09-27 23:44:06 -07:00
parent 3258e7fdb5
commit 01f98f9aff
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
7 changed files with 185 additions and 45 deletions

View File

@ -15,7 +15,16 @@ pub fn main() !void {
var needfree = true; var needfree = true;
defer if (needfree) allocator.free(data); defer if (needfree) allocator.free(data);
const document = try nice.parseBuffer(allocator, data, .{}); var diagnostics = nice.Diagnostics{};
const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
std.debug.print("{s}:{d} col:{d}: {s}\n", .{
args[1],
diagnostics.row,
diagnostics.line_offset,
diagnostics.message,
});
return err;
};
defer document.deinit(); defer document.deinit();
// free data memory to ensure that the parsed document is not holding // free data memory to ensure that the parsed document is not holding

View File

@ -16,6 +16,7 @@ pub fn main() !void {
defer file.close(); defer file.close();
var parser = try nice.StreamParser.init(allocator, .{}); var parser = try nice.StreamParser.init(allocator, .{});
defer parser.deinit(); defer parser.deinit();
errdefer parser.parse_state.document.deinit();
while (true) { while (true) {
var buf = [_]u8{0} ** 1024; var buf = [_]u8{0} ** 1024;
const len = try file.read(&buf); const len = try file.read(&buf);

View File

@ -1,5 +1,7 @@
const std = @import("std"); const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
pub const IndexSlice = struct { start: usize, len: usize }; pub const IndexSlice = struct { start: usize, len: usize };
pub const Error = error{ pub const Error = error{
@ -45,14 +47,15 @@ pub fn LineBuffer(comptime options: Strictness) type {
pub const default_capacity: usize = 4096; pub const default_capacity: usize = 4096;
pub fn init(allocator: std.mem.Allocator) !@This() { pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() {
return initCapacity(allocator, default_capacity); return initCapacity(allocator, diagnostics, default_capacity);
} }
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() { pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() {
return .{ return .{
.allocator = allocator, .allocator = allocator,
.internal = .{ .internal = .{
.diagnostics = diagnostics,
.buffer = try allocator.alloc(u8, capacity), .buffer = try allocator.alloc(u8, capacity),
.window = .{ .start = 0, .len = 0 }, .window = .{ .start = 0, .len = 0 },
}, },
@ -60,6 +63,10 @@ pub fn LineBuffer(comptime options: Strictness) type {
}; };
} }
pub fn diag(self: @This()) *Diagnostics {
return self.internal.diagnostics;
}
pub fn empty(self: @This()) bool { pub fn empty(self: @This()) bool {
return self.internal.empty(); return self.internal.empty();
} }
@ -111,9 +118,18 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return struct { return struct {
buffer: []const u8, buffer: []const u8,
window: IndexSlice, window: IndexSlice,
diagnostics: *Diagnostics,
pub fn init(data: []const u8) @This() { pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; return .{
.buffer = data,
.window = .{ .start = 0, .len = data.len },
.diagnostics = diagnostics,
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.diagnostics;
} }
pub fn empty(self: @This()) bool { pub fn empty(self: @This()) bool {
@ -131,16 +147,33 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
const split: usize = split: { const split: usize = split: {
for (window, 0..) |char, idx| { for (window, 0..) |char, idx| {
if (comptime options.check_carriage_return) if (comptime options.check_carriage_return)
if (char == '\r') return error.IllegalCarriageReturn; if (char == '\r') {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found a carriage return";
return error.IllegalCarriageReturn;
};
if (comptime options.check_nonprinting_ascii) if (comptime options.check_nonprinting_ascii)
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found nonprinting ascii characters";
return error.IllegalNonprintingAscii; return error.IllegalNonprintingAscii;
};
if (comptime options.check_trailing_whitespace) { if (comptime options.check_trailing_whitespace) {
if (char == '\n') { if (char == '\n') {
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found trailing spaces";
return error.IllegalTrailingSpace; return error.IllegalTrailingSpace;
}
break :split idx; break :split idx;
} }
} else { } else {
@ -150,12 +183,41 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return null; return null;
}; };
self.diagnostics.row += 1;
self.diagnostics.line_offset = 0;
self.window.start += split + 1; self.window.start += split + 1;
self.window.len -= split + 1; self.window.len -= split + 1;
if (comptime options.validate_utf8) { if (comptime options.validate_utf8) {
const line = window[0..split]; const line = window[0..split];
return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
var idx: usize = 0;
while (idx < line.len) {
if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
if (idx + cp_len > line.len) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "truncated UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "invalid UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
idx += cp_len;
} else |_| {
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "invalid UTF-8 sequence start byte";
return error.InputIsNotValidUtf8;
}
}
return line;
} else { } else {
return window[0..split]; return window[0..split];
} }

View File

@ -68,3 +68,4 @@ pub const parseBuffer = parser.parseBuffer;
pub const StreamParser = parser.StreamParser; pub const StreamParser = parser.StreamParser;
pub const Document = parser.Document; pub const Document = parser.Document;
pub const Value = parser.Value; pub const Value = parser.Value;
pub const Diagnostics = parser.Diagnostics;

View File

@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize = 0, row: usize = 0,
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{}, line_offset: usize = 0,
length: usize = 0,
message: []const u8 = "no problems", message: []const u8 = "no problems",
}; };
pub const Error = error{ pub const Error = error{
UnexpectedIndent, UnexpectedIndent,
UnexpectedValue, UnexpectedValue,
ExtraContent,
EmptyDocument, EmptyDocument,
DuplicateKey, DuplicateKey,
BadMapEntry, BadMapEntry,
@ -42,15 +42,13 @@ pub const Options = struct {
default_object: enum { string, list, map, fail } = .fail, default_object: enum { string, list, map, fail } = .fail,
}; };
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document { pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
var state = State.init(allocator); var state = State.init(allocator, diagnostics);
defer state.deinit(); defer state.deinit();
errdefer state.document.deinit(); errdefer state.document.deinit();
var diagnostics = Diagnostics{};
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{ var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer), .buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
.diagnostics = &diagnostics,
}; };
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior); while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
@ -65,7 +63,6 @@ pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer), linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State, parse_state: State,
parse_options: Options = .{}, parse_options: Options = .{},
diagnostics: Diagnostics = .{},
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser { pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
const diagnostics = try allocator.create(Diagnostics); const diagnostics = try allocator.create(Diagnostics);
@ -74,16 +71,15 @@ pub const StreamParser = struct {
return .{ return .{
.linetok = .{ .linetok = .{
.buffer = try buffers.ValidatingLineBuffer.init(allocator), .buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
.diagnostics = diagnostics,
}, },
.parse_state = State.init(allocator), .parse_state = State.init(allocator, diagnostics),
.parse_options = options, .parse_options = options,
}; };
} }
pub fn deinit(self: StreamParser) void { pub fn deinit(self: StreamParser) void {
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics); self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics);
self.linetok.buffer.deinit(); self.linetok.buffer.deinit();
self.parse_state.deinit(); self.parse_state.deinit();
} }

View File

@ -4,6 +4,7 @@ const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error; const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior; const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options; const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Value = @import("./value.zig").Value; const Value = @import("./value.zig").Value;
pub const Document = struct { pub const Document = struct {
@ -42,14 +43,16 @@ pub const State = struct {
pub const Stack = std.ArrayList(*Value); pub const Stack = std.ArrayList(*Value);
document: Document, document: Document,
diagnostics: *Diagnostics,
value_stack: Stack, value_stack: Stack,
mode: enum { initial, value, done } = .initial, mode: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none, expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null, dangling_key: ?[]const u8 = null,
pub fn init(allocator: std.mem.Allocator) State { pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
return .{ return .{
.document = Document.init(allocator), .document = Document.init(allocator),
.diagnostics = diagnostics,
.value_stack = Stack.init(allocator), .value_stack = Stack.init(allocator),
}; };
} }

View File

@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
pub const Error = error{ pub const Error = error{
BadToken, BadToken,
ExtraContent,
MixedIndentation, MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation, TooMuchIndentation,
MissingNewline, UnquantizedIndentation,
TrailingWhitespace, TrailingWhitespace,
Impossible, Impossible,
}; };
@ -60,18 +60,19 @@ pub const Line = struct {
}; };
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can // buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a `nextLine` method // technically be anything with a conformant interface.
pub fn LineTokenizer(comptime Buffer: type) type { pub fn LineTokenizer(comptime Buffer: type) type {
return struct { return struct {
buffer: Buffer, buffer: Buffer,
index: usize = 0, index: usize = 0,
indentation: DetectedIndentation = .unknown, indentation: DetectedIndentation = .unknown,
last_indent: usize = 0, last_indent: usize = 0,
diagnostics: *Diagnostics,
row: usize = 0,
pub fn finish(self: @This()) !void { pub fn finish(self: @This()) !void {
if (!self.buffer.empty()) { if (!self.buffer.empty()) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document has extra content or is missing the final LF character";
return error.ExtraContent; return error.ExtraContent;
} }
} }
@ -91,13 +92,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// ugly documents. // ugly documents.
.unknown => self.indentation = .{ .spaces = 0 }, .unknown => self.indentation = .{ .spaces = 0 },
.spaces => {}, .spaces => {},
.tabs => return error.MixedIndentation, .tabs => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
} }
}, },
'\t' => { '\t' => {
switch (self.indentation) { switch (self.indentation) {
.unknown => self.indentation = .tabs, .unknown => self.indentation = .tabs,
.spaces => return error.MixedIndentation, .spaces => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.tabs => {}, .tabs => {},
} }
}, },
@ -110,7 +121,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}, },
} }
} else { } else {
if (raw_line.len > 0) return error.TrailingWhitespace; if (raw_line.len > 0) {
self.buffer.diag().line_offset = raw_line.len - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
continue :lineloop; continue :lineloop;
} }
@ -118,15 +134,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (self.indentation.spaces == 0) { if (self.indentation.spaces == 0) {
self.indentation.spaces = indent; self.indentation.spaces = indent;
} }
if (@rem(indent, self.indentation.spaces) != 0) if (@rem(indent, self.indentation.spaces) != 0) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains incorrectly quantized indentation";
return error.UnquantizedIndentation; return error.UnquantizedIndentation;
}
break :quant @divExact(indent, self.indentation.spaces); break :quant @divExact(indent, self.indentation.spaces);
} else indent; } else indent;
const shift: LineShift = if (quantized > self.last_indent) rel: { const shift: LineShift = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1) if ((quantized - self.last_indent) > 1) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains too much indentation";
return error.TooMuchIndentation; return error.TooMuchIndentation;
}
break :rel .indent; break :rel .indent;
} else if (quantized < self.last_indent) } else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized } .{ .dedent = self.last_indent - quantized }
@ -134,10 +158,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.none; .none;
defer { defer {
self.row += 1;
self.last_indent = quantized; self.last_indent = quantized;
} }
// update the diagnostics so that the parser can use them without
// knowing about the whitespace.
self.buffer.diag().line_offset = indent;
const line = raw_line[indent..]; const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
@ -147,7 +173,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'#' => { '#' => {
// force comments to be followed by a space. This makes them // force comments to be followed by a space. This makes them
// behave the same way as strings, actually. // behave the same way as strings, actually.
if (line.len > 1 and line[1] != ' ') return error.BadToken; if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
return error.BadToken;
}
// simply lie about indentation when the line is a comment. // simply lie about indentation when the line is a comment.
quantized = self.last_indent; quantized = self.last_indent;
@ -160,12 +191,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'|', '>', '[', '{' => { '|', '>', '[', '{' => {
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .in_line = try detectInlineItem(line) }, .contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line, .raw = line,
}; };
}, },
'-' => { '-' => {
if (line.len > 1 and line[1] != ' ') return error.BadToken; if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
}
// blindly add 2 here because an empty item cannot fail in
// the value, only if a bogus dedent has occurred
self.buffer.diag().line_offset += 2;
return if (line.len == 1) .{ return if (line.len == 1) .{
.shift = shift, .shift = shift,
@ -173,26 +213,33 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
} else .{ } else .{
.shift = shift, .shift = shift,
.contents = .{ .list_item = try detectInlineItem(line[2..]) }, .contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
.raw = line, .raw = line,
}; };
}, },
else => { else => {
for (line, 0..) |char, idx| { for (line, 0..) |char, idx| {
if (char == ':') { if (char == ':') {
self.buffer.diag().line_offset += idx + 2;
if (idx + 1 == line.len) return .{ if (idx + 1 == line.len) return .{
.shift = shift, .shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line, .raw = line,
}; };
if (line[idx + 1] != ' ') return error.BadToken; if (line[idx + 1] != ' ') {
self.buffer.diag().line_offset += idx + 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
return error.BadToken;
}
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .map_item = .{ .contents = .{ .map_item = .{
.key = line[0..idx], .key = line[0..idx],
.val = try detectInlineItem(line[idx + 2 ..]), .val = try self.detectInlineItem(line[idx + 2 ..]),
} }, } },
.raw = line, .raw = line,
}; };
@ -208,12 +255,16 @@ pub fn LineTokenizer(comptime Buffer: type) type {
} }
// somehow everything else has failed // somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible; return error.Impossible;
} }
return null; return null;
} }
fn detectInlineItem(buf: []const u8) Error!InlineItem { // TODO: it's impossible to get the right diagnostic offset in this function at the moment
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty; if (buf.len == 0) return .empty;
switch (buf[0]) { switch (buf[0]) {
@ -221,7 +272,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken; if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) { const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => return error.TrailingWhitespace, ' ', '\t' => {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)], '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len], else => buf[@min(2, buf.len)..buf.len],
}; };
@ -232,22 +288,34 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.{ .space_string = slice }; .{ .space_string = slice };
}, },
'[' => { '[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']') if (buf.len < 2 or buf[buf.len - 1] != ']') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
return error.BadToken; return error.BadToken;
}
// keep the closing ] for the flow parser // keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] }; return .{ .flow_list = buf[1..] };
}, },
'{' => { '{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}') if (buf.len < 2 or buf[buf.len - 1] != '}') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
return error.BadToken; return error.BadToken;
}
// keep the closing } fpr the flow parser // keep the closing } fpr the flow parser
return .{ .flow_map = buf[1..] }; return .{ .flow_map = buf[1..] };
}, },
else => { else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace; return error.TrailingWhitespace;
}
return .{ .scalar = buf }; return .{ .scalar = buf };
}, },