Compare commits

..

No commits in common. "01f98f9aff46bb4c6d510a6dff4d5208736f18e8" and "0e60719c856025ae0e3feda8fdd7dd989d3e51ae" have entirely different histories.

7 changed files with 45 additions and 206 deletions

View File

@ -15,16 +15,7 @@ pub fn main() !void {
var needfree = true; var needfree = true;
defer if (needfree) allocator.free(data); defer if (needfree) allocator.free(data);
var diagnostics = nice.Diagnostics{}; const document = try nice.parseBuffer(allocator, data, .{});
const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
std.debug.print("{s}:{d} col:{d}: {s}\n", .{
args[1],
diagnostics.row,
diagnostics.line_offset,
diagnostics.message,
});
return err;
};
defer document.deinit(); defer document.deinit();
// free data memory to ensure that the parsed document is not holding // free data memory to ensure that the parsed document is not holding

View File

@ -16,7 +16,6 @@ pub fn main() !void {
defer file.close(); defer file.close();
var parser = try nice.StreamParser.init(allocator, .{}); var parser = try nice.StreamParser.init(allocator, .{});
defer parser.deinit(); defer parser.deinit();
errdefer parser.parse_state.document.deinit();
while (true) { while (true) {
var buf = [_]u8{0} ** 1024; var buf = [_]u8{0} ** 1024;
const len = try file.read(&buf); const len = try file.read(&buf);

View File

@ -1,7 +1,5 @@
const std = @import("std"); const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
pub const IndexSlice = struct { start: usize, len: usize }; pub const IndexSlice = struct { start: usize, len: usize };
pub const Error = error{ pub const Error = error{
@ -47,15 +45,14 @@ pub fn LineBuffer(comptime options: Strictness) type {
pub const default_capacity: usize = 4096; pub const default_capacity: usize = 4096;
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() { pub fn init(allocator: std.mem.Allocator) !@This() {
return initCapacity(allocator, diagnostics, default_capacity); return initCapacity(allocator, default_capacity);
} }
pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() { pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
return .{ return .{
.allocator = allocator, .allocator = allocator,
.internal = .{ .internal = .{
.diagnostics = diagnostics,
.buffer = try allocator.alloc(u8, capacity), .buffer = try allocator.alloc(u8, capacity),
.window = .{ .start = 0, .len = 0 }, .window = .{ .start = 0, .len = 0 },
}, },
@ -63,14 +60,6 @@ pub fn LineBuffer(comptime options: Strictness) type {
}; };
} }
pub fn diag(self: @This()) *Diagnostics {
return self.internal.diagnostics;
}
pub fn empty(self: @This()) bool {
return self.internal.empty();
}
pub fn deinit(self: @This()) void { pub fn deinit(self: @This()) void {
self.allocator.free(self.internal.buffer); self.allocator.free(self.internal.buffer);
} }
@ -118,24 +107,9 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return struct { return struct {
buffer: []const u8, buffer: []const u8,
window: IndexSlice, window: IndexSlice,
diagnostics: *Diagnostics,
pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() { pub fn init(data: []const u8) @This() {
return .{ return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
.buffer = data,
.window = .{ .start = 0, .len = data.len },
.diagnostics = diagnostics,
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.diagnostics;
}
pub fn empty(self: @This()) bool {
// we can't check the overall buffer size because the dynamic buffer may be
// overallocated
return self.window.len == 0;
} }
pub fn nextLine(self: *@This()) !?[]const u8 { pub fn nextLine(self: *@This()) !?[]const u8 {
@ -147,33 +121,16 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
const split: usize = split: { const split: usize = split: {
for (window, 0..) |char, idx| { for (window, 0..) |char, idx| {
if (comptime options.check_carriage_return) if (comptime options.check_carriage_return)
if (char == '\r') { if (char == '\r') return error.IllegalCarriageReturn;
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found a carriage return";
return error.IllegalCarriageReturn;
};
if (comptime options.check_nonprinting_ascii) if (comptime options.check_nonprinting_ascii)
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) { if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found nonprinting ascii characters";
return error.IllegalNonprintingAscii; return error.IllegalNonprintingAscii;
};
if (comptime options.check_trailing_whitespace) { if (comptime options.check_trailing_whitespace) {
if (char == '\n') { if (char == '\n') {
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) { if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found trailing spaces";
return error.IllegalTrailingSpace; return error.IllegalTrailingSpace;
}
break :split idx; break :split idx;
} }
} else { } else {
@ -183,41 +140,12 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return null; return null;
}; };
self.diagnostics.row += 1;
self.diagnostics.line_offset = 0;
self.window.start += split + 1; self.window.start += split + 1;
self.window.len -= split + 1; self.window.len -= split + 1;
if (comptime options.validate_utf8) { if (comptime options.validate_utf8) {
const line = window[0..split]; const line = window[0..split];
return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
var idx: usize = 0;
while (idx < line.len) {
if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
if (idx + cp_len > line.len) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "truncated UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "invalid UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
idx += cp_len;
} else |_| {
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "invalid UTF-8 sequence start byte";
return error.InputIsNotValidUtf8;
}
}
return line;
} else { } else {
return window[0..split]; return window[0..split];
} }

View File

@ -68,4 +68,3 @@ pub const parseBuffer = parser.parseBuffer;
pub const StreamParser = parser.StreamParser; pub const StreamParser = parser.StreamParser;
pub const Document = parser.Document; pub const Document = parser.Document;
pub const Value = parser.Value; pub const Value = parser.Value;
pub const Diagnostics = parser.Diagnostics;

View File

@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize = 0, row: usize = 0,
line_offset: usize = 0, span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
length: usize = 0,
message: []const u8 = "no problems", message: []const u8 = "no problems",
}; };
pub const Error = error{ pub const Error = error{
UnexpectedIndent, UnexpectedIndent,
UnexpectedValue, UnexpectedValue,
ExtraContent,
EmptyDocument, EmptyDocument,
DuplicateKey, DuplicateKey,
BadMapEntry, BadMapEntry,
@ -42,20 +42,18 @@ pub const Options = struct {
default_object: enum { string, list, map, fail } = .fail, default_object: enum { string, list, map, fail } = .fail,
}; };
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document { pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
var state = State.init(allocator, diagnostics); var state = State.init(allocator);
defer state.deinit(); defer state.deinit();
errdefer state.document.deinit(); errdefer state.document.deinit();
var diagnostics = Diagnostics{};
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{ var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics), .buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
.diagnostics = &diagnostics,
}; };
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior); while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
// state doesn't have access to the tokenizer, which is the only thing that can
// error if unparsed lines remain in the buffer by the time that "finish" is
// called.
try tok.finish();
return try state.finish(options); return try state.finish(options);
} }
@ -63,6 +61,7 @@ pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer), linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State, parse_state: State,
parse_options: Options = .{}, parse_options: Options = .{},
diagnostics: Diagnostics = .{},
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser { pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
const diagnostics = try allocator.create(Diagnostics); const diagnostics = try allocator.create(Diagnostics);
@ -71,15 +70,16 @@ pub const StreamParser = struct {
return .{ return .{
.linetok = .{ .linetok = .{
.buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics), .buffer = try buffers.ValidatingLineBuffer.init(allocator),
.diagnostics = diagnostics,
}, },
.parse_state = State.init(allocator, diagnostics), .parse_state = State.init(allocator),
.parse_options = options, .parse_options = options,
}; };
} }
pub fn deinit(self: StreamParser) void { pub fn deinit(self: StreamParser) void {
self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics); self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
self.linetok.buffer.deinit(); self.linetok.buffer.deinit();
self.parse_state.deinit(); self.parse_state.deinit();
} }
@ -90,7 +90,6 @@ pub const StreamParser = struct {
} }
pub fn finish(self: *StreamParser) !Document { pub fn finish(self: *StreamParser) !Document {
try self.linetok.finish();
return try self.parse_state.finish(self.parse_options); return try self.parse_state.finish(self.parse_options);
} }
}; };

View File

@ -4,7 +4,6 @@ const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error; const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior; const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options; const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Value = @import("./value.zig").Value; const Value = @import("./value.zig").Value;
pub const Document = struct { pub const Document = struct {
@ -43,16 +42,14 @@ pub const State = struct {
pub const Stack = std.ArrayList(*Value); pub const Stack = std.ArrayList(*Value);
document: Document, document: Document,
diagnostics: *Diagnostics,
value_stack: Stack, value_stack: Stack,
mode: enum { initial, value, done } = .initial, mode: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none, expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null, dangling_key: ?[]const u8 = null,
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State { pub fn init(allocator: std.mem.Allocator) State {
return .{ return .{
.document = Document.init(allocator), .document = Document.init(allocator),
.diagnostics = diagnostics,
.value_stack = Stack.init(allocator), .value_stack = Stack.init(allocator),
}; };
} }

View File

@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
pub const Error = error{ pub const Error = error{
BadToken, BadToken,
ExtraContent,
MixedIndentation, MixedIndentation,
TooMuchIndentation,
UnquantizedIndentation, UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace, TrailingWhitespace,
Impossible, Impossible,
}; };
@ -60,22 +60,15 @@ pub const Line = struct {
}; };
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can // buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a conformant interface. // technically be anything with a `nextLine` method
pub fn LineTokenizer(comptime Buffer: type) type { pub fn LineTokenizer(comptime Buffer: type) type {
return struct { return struct {
buffer: Buffer, buffer: Buffer,
index: usize = 0, index: usize = 0,
indentation: DetectedIndentation = .unknown, indentation: DetectedIndentation = .unknown,
last_indent: usize = 0, last_indent: usize = 0,
diagnostics: *Diagnostics,
pub fn finish(self: @This()) !void { row: usize = 0,
if (!self.buffer.empty()) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document has extra content or is missing the final LF character";
return error.ExtraContent;
}
}
pub fn next(self: *@This()) !?Line { pub fn next(self: *@This()) !?Line {
lineloop: while (try self.buffer.nextLine()) |raw_line| { lineloop: while (try self.buffer.nextLine()) |raw_line| {
@ -92,23 +85,13 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// ugly documents. // ugly documents.
.unknown => self.indentation = .{ .spaces = 0 }, .unknown => self.indentation = .{ .spaces = 0 },
.spaces => {}, .spaces => {},
.tabs => { .tabs => return error.MixedIndentation,
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
} }
}, },
'\t' => { '\t' => {
switch (self.indentation) { switch (self.indentation) {
.unknown => self.indentation = .tabs, .unknown => self.indentation = .tabs,
.spaces => { .spaces => return error.MixedIndentation,
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.tabs => {}, .tabs => {},
} }
}, },
@ -121,12 +104,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}, },
} }
} else { } else {
if (raw_line.len > 0) { if (raw_line.len > 0) return error.TrailingWhitespace;
self.buffer.diag().line_offset = raw_line.len - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
continue :lineloop; continue :lineloop;
} }
@ -134,23 +112,15 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (self.indentation.spaces == 0) { if (self.indentation.spaces == 0) {
self.indentation.spaces = indent; self.indentation.spaces = indent;
} }
if (@rem(indent, self.indentation.spaces) != 0) { if (@rem(indent, self.indentation.spaces) != 0)
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains incorrectly quantized indentation";
return error.UnquantizedIndentation; return error.UnquantizedIndentation;
}
break :quant @divExact(indent, self.indentation.spaces); break :quant @divExact(indent, self.indentation.spaces);
} else indent; } else indent;
const shift: LineShift = if (quantized > self.last_indent) rel: { const shift: LineShift = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1) { if ((quantized - self.last_indent) > 1)
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains too much indentation";
return error.TooMuchIndentation; return error.TooMuchIndentation;
}
break :rel .indent; break :rel .indent;
} else if (quantized < self.last_indent) } else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized } .{ .dedent = self.last_indent - quantized }
@ -158,12 +128,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.none; .none;
defer { defer {
self.row += 1;
self.last_indent = quantized; self.last_indent = quantized;
} }
// update the diagnostics so that the parser can use them without
// knowing about the whitespace.
self.buffer.diag().line_offset = indent;
const line = raw_line[indent..]; const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
@ -173,12 +141,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'#' => { '#' => {
// force comments to be followed by a space. This makes them // force comments to be followed by a space. This makes them
// behave the same way as strings, actually. // behave the same way as strings, actually.
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') return error.BadToken;
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
return error.BadToken;
}
// simply lie about indentation when the line is a comment. // simply lie about indentation when the line is a comment.
quantized = self.last_indent; quantized = self.last_indent;
@ -191,21 +154,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'|', '>', '[', '{' => { '|', '>', '[', '{' => {
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) }, .contents = .{ .in_line = try detectInlineItem(line) },
.raw = line, .raw = line,
}; };
}, },
'-' => { '-' => {
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') return error.BadToken;
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
}
// blindly add 2 here because an empty item cannot fail in
// the value, only if a bogus dedent has occurred
self.buffer.diag().line_offset += 2;
return if (line.len == 1) .{ return if (line.len == 1) .{
.shift = shift, .shift = shift,
@ -213,33 +167,26 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
} else .{ } else .{
.shift = shift, .shift = shift,
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) }, .contents = .{ .list_item = try detectInlineItem(line[2..]) },
.raw = line, .raw = line,
}; };
}, },
else => { else => {
for (line, 0..) |char, idx| { for (line, 0..) |char, idx| {
if (char == ':') { if (char == ':') {
self.buffer.diag().line_offset += idx + 2;
if (idx + 1 == line.len) return .{ if (idx + 1 == line.len) return .{
.shift = shift, .shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line, .raw = line,
}; };
if (line[idx + 1] != ' ') { if (line[idx + 1] != ' ') return error.BadToken;
self.buffer.diag().line_offset += idx + 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
return error.BadToken;
}
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .map_item = .{ .contents = .{ .map_item = .{
.key = line[0..idx], .key = line[0..idx],
.val = try self.detectInlineItem(line[idx + 2 ..]), .val = try detectInlineItem(line[idx + 2 ..]),
} }, } },
.raw = line, .raw = line,
}; };
@ -255,16 +202,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
} }
// somehow everything else has failed // somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible; return error.Impossible;
} }
return null; return null;
} }
// TODO: it's impossible to get the right diagnostic offset in this function at the moment fn detectInlineItem(buf: []const u8) Error!InlineItem {
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty; if (buf.len == 0) return .empty;
switch (buf[0]) { switch (buf[0]) {
@ -272,12 +215,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken; if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) { const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => { ' ', '\t' => return error.TrailingWhitespace,
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)], '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len], else => buf[@min(2, buf.len)..buf.len],
}; };
@ -288,34 +226,22 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.{ .space_string = slice }; .{ .space_string = slice };
}, },
'[' => { '[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']') { if (buf.len < 2 or buf[buf.len - 1] != ']')
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
return error.BadToken; return error.BadToken;
}
// keep the closing ] for the flow parser // keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] }; return .{ .flow_list = buf[1..] };
}, },
'{' => { '{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}') { if (buf.len < 2 or buf[buf.len - 1] != '}')
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
return error.BadToken; return error.BadToken;
}
// keep the closing } for the flow parser // keep the closing } for the flow parser
return .{ .flow_map = buf[1..] }; return .{ .flow_map = buf[1..] };
}, },
else => { else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') { if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace; return error.TrailingWhitespace;
}
return .{ .scalar = buf }; return .{ .scalar = buf };
}, },