Compare commits

..

No commits in common. "01f98f9aff46bb4c6d510a6dff4d5208736f18e8" and "0e60719c856025ae0e3feda8fdd7dd989d3e51ae" have entirely different histories.

7 changed files with 45 additions and 206 deletions

View File

@ -15,16 +15,7 @@ pub fn main() !void {
var needfree = true;
defer if (needfree) allocator.free(data);
var diagnostics = nice.Diagnostics{};
const document = nice.parseBuffer(allocator, data, &diagnostics, .{}) catch |err| {
std.debug.print("{s}:{d} col:{d}: {s}\n", .{
args[1],
diagnostics.row,
diagnostics.line_offset,
diagnostics.message,
});
return err;
};
const document = try nice.parseBuffer(allocator, data, .{});
defer document.deinit();
// free data memory to ensure that the parsed document is not holding

View File

@ -16,7 +16,6 @@ pub fn main() !void {
defer file.close();
var parser = try nice.StreamParser.init(allocator, .{});
defer parser.deinit();
errdefer parser.parse_state.document.deinit();
while (true) {
var buf = [_]u8{0} ** 1024;
const len = try file.read(&buf);

View File

@ -1,7 +1,5 @@
const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
pub const IndexSlice = struct { start: usize, len: usize };
pub const Error = error{
@ -47,15 +45,14 @@ pub fn LineBuffer(comptime options: Strictness) type {
pub const default_capacity: usize = 4096;
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !@This() {
return initCapacity(allocator, diagnostics, default_capacity);
pub fn init(allocator: std.mem.Allocator) !@This() {
return initCapacity(allocator, default_capacity);
}
pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !@This() {
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
return .{
.allocator = allocator,
.internal = .{
.diagnostics = diagnostics,
.buffer = try allocator.alloc(u8, capacity),
.window = .{ .start = 0, .len = 0 },
},
@ -63,14 +60,6 @@ pub fn LineBuffer(comptime options: Strictness) type {
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.internal.diagnostics;
}
pub fn empty(self: @This()) bool {
return self.internal.empty();
}
pub fn deinit(self: @This()) void {
self.allocator.free(self.internal.buffer);
}
@ -118,24 +107,9 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return struct {
buffer: []const u8,
window: IndexSlice,
diagnostics: *Diagnostics,
pub fn init(data: []const u8, diagnostics: *Diagnostics) @This() {
return .{
.buffer = data,
.window = .{ .start = 0, .len = data.len },
.diagnostics = diagnostics,
};
}
pub fn diag(self: @This()) *Diagnostics {
return self.diagnostics;
}
pub fn empty(self: @This()) bool {
// we can't check the overall buffer size because the dynamic buffer may be
// overallocated
return self.window.len == 0;
pub fn init(data: []const u8) @This() {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
}
pub fn nextLine(self: *@This()) !?[]const u8 {
@ -147,33 +121,16 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
const split: usize = split: {
for (window, 0..) |char, idx| {
if (comptime options.check_carriage_return)
if (char == '\r') {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found a carriage return";
return error.IllegalCarriageReturn;
};
if (char == '\r') return error.IllegalCarriageReturn;
if (comptime options.check_nonprinting_ascii)
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F)) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found nonprinting ascii characters";
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
return error.IllegalNonprintingAscii;
};
if (comptime options.check_trailing_whitespace) {
if (char == '\n') {
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t')) {
self.diagnostics.row += 1;
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "found trailing spaces";
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
return error.IllegalTrailingSpace;
}
break :split idx;
}
} else {
@ -183,41 +140,12 @@ pub fn FixedLineBuffer(comptime options: Strictness) type {
return null;
};
self.diagnostics.row += 1;
self.diagnostics.line_offset = 0;
self.window.start += split + 1;
self.window.len -= split + 1;
if (comptime options.validate_utf8) {
const line = window[0..split];
var idx: usize = 0;
while (idx < line.len) {
if (std.unicode.utf8ByteSequenceLength(line[idx])) |cp_len| {
if (idx + cp_len > line.len) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "truncated UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
if (std.meta.isError(std.unicode.utf8Decode(line[idx .. idx + cp_len]))) {
self.diagnostics.line_offset = idx;
self.diagnostics.length = cp_len;
self.diagnostics.message = "invalid UTF-8 sequence";
return error.InputIsNotValidUtf8;
}
idx += cp_len;
} else |_| {
self.diagnostics.line_offset = idx;
self.diagnostics.length = 1;
self.diagnostics.message = "invalid UTF-8 sequence start byte";
return error.InputIsNotValidUtf8;
}
}
return line;
return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
} else {
return window[0..split];
}

View File

@ -68,4 +68,3 @@ pub const parseBuffer = parser.parseBuffer;
pub const StreamParser = parser.StreamParser;
pub const Document = parser.Document;
pub const Value = parser.Value;
pub const Diagnostics = parser.Diagnostics;

View File

@ -8,14 +8,14 @@ pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct {
row: usize = 0,
line_offset: usize = 0,
length: usize = 0,
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
message: []const u8 = "no problems",
};
pub const Error = error{
UnexpectedIndent,
UnexpectedValue,
ExtraContent,
EmptyDocument,
DuplicateKey,
BadMapEntry,
@ -42,20 +42,18 @@ pub const Options = struct {
default_object: enum { string, list, map, fail } = .fail,
};
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
var state = State.init(allocator, diagnostics);
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
var state = State.init(allocator);
defer state.deinit();
errdefer state.document.deinit();
var diagnostics = Diagnostics{};
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
.diagnostics = &diagnostics,
};
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
// state doesn't have access to the tokenizer, which is the only thing that can
// error if unparsed lines remain in the buffer by the time that "finish" is
// called.
try tok.finish();
return try state.finish(options);
}
@ -63,6 +61,7 @@ pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State,
parse_options: Options = .{},
diagnostics: Diagnostics = .{},
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
const diagnostics = try allocator.create(Diagnostics);
@ -71,15 +70,16 @@ pub const StreamParser = struct {
return .{
.linetok = .{
.buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
.buffer = try buffers.ValidatingLineBuffer.init(allocator),
.diagnostics = diagnostics,
},
.parse_state = State.init(allocator, diagnostics),
.parse_state = State.init(allocator),
.parse_options = options,
};
}
pub fn deinit(self: StreamParser) void {
self.linetok.buffer.allocator.destroy(self.parse_state.diagnostics);
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
self.linetok.buffer.deinit();
self.parse_state.deinit();
}
@ -90,7 +90,6 @@ pub const StreamParser = struct {
}
pub fn finish(self: *StreamParser) !Document {
try self.linetok.finish();
return try self.parse_state.finish(self.parse_options);
}
};

View File

@ -4,7 +4,6 @@ const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Value = @import("./value.zig").Value;
pub const Document = struct {
@ -43,16 +42,14 @@ pub const State = struct {
pub const Stack = std.ArrayList(*Value);
document: Document,
diagnostics: *Diagnostics,
value_stack: Stack,
mode: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null,
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
pub fn init(allocator: std.mem.Allocator) State {
return .{
.document = Document.init(allocator),
.diagnostics = diagnostics,
.value_stack = Stack.init(allocator),
};
}

View File

@ -4,10 +4,10 @@ const Diagnostics = @import("./parser.zig").Diagnostics;
pub const Error = error{
BadToken,
ExtraContent,
MixedIndentation,
TooMuchIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace,
Impossible,
};
@ -60,22 +60,15 @@ pub const Line = struct {
};
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a conformant interface.
// technically be anything with a `nextLine` method
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: DetectedIndentation = .unknown,
last_indent: usize = 0,
pub fn finish(self: @This()) !void {
if (!self.buffer.empty()) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document has extra content or is missing the final LF character";
return error.ExtraContent;
}
}
diagnostics: *Diagnostics,
row: usize = 0,
pub fn next(self: *@This()) !?Line {
lineloop: while (try self.buffer.nextLine()) |raw_line| {
@ -92,23 +85,13 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// ugly documents.
.unknown => self.indentation = .{ .spaces = 0 },
.spaces => {},
.tabs => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.tabs => return error.MixedIndentation,
}
},
'\t' => {
switch (self.indentation) {
.unknown => self.indentation = .tabs,
.spaces => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.spaces => return error.MixedIndentation,
.tabs => {},
}
},
@ -121,12 +104,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
},
}
} else {
if (raw_line.len > 0) {
self.buffer.diag().line_offset = raw_line.len - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
if (raw_line.len > 0) return error.TrailingWhitespace;
continue :lineloop;
}
@ -134,23 +112,15 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (self.indentation.spaces == 0) {
self.indentation.spaces = indent;
}
if (@rem(indent, self.indentation.spaces) != 0) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains incorrectly quantized indentation";
if (@rem(indent, self.indentation.spaces) != 0)
return error.UnquantizedIndentation;
}
break :quant @divExact(indent, self.indentation.spaces);
} else indent;
const shift: LineShift = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains too much indentation";
if ((quantized - self.last_indent) > 1)
return error.TooMuchIndentation;
}
break :rel .indent;
} else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized }
@ -158,12 +128,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.none;
defer {
self.row += 1;
self.last_indent = quantized;
}
// update the diagnostics so that the parser can use them without
// knowing about the whitespace.
self.buffer.diag().line_offset = indent;
const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier.
@ -173,12 +141,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'#' => {
// force comments to be followed by a space. This makes them
// behave the same way as strings, actually.
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
return error.BadToken;
}
if (line.len > 1 and line[1] != ' ') return error.BadToken;
// simply lie about indentation when the line is a comment.
quantized = self.last_indent;
@ -191,21 +154,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
'|', '>', '[', '{' => {
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.contents = .{ .in_line = try detectInlineItem(line) },
.raw = line,
};
},
'-' => {
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
}
// blindly add 2 here because an empty item cannot fail in
// the value, only if a bogus dedent has occurred
self.buffer.diag().line_offset += 2;
if (line.len > 1 and line[1] != ' ') return error.BadToken;
return if (line.len == 1) .{
.shift = shift,
@ -213,33 +167,26 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line,
} else .{
.shift = shift,
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
.contents = .{ .list_item = try detectInlineItem(line[2..]) },
.raw = line,
};
},
else => {
for (line, 0..) |char, idx| {
if (char == ':') {
self.buffer.diag().line_offset += idx + 2;
if (idx + 1 == line.len) return .{
.shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
if (line[idx + 1] != ' ') {
self.buffer.diag().line_offset += idx + 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
return error.BadToken;
}
if (line[idx + 1] != ' ') return error.BadToken;
return .{
.shift = shift,
.contents = .{ .map_item = .{
.key = line[0..idx],
.val = try self.detectInlineItem(line[idx + 2 ..]),
.val = try detectInlineItem(line[idx + 2 ..]),
} },
.raw = line,
};
@ -255,16 +202,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}
// somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible;
}
return null;
}
// TODO: it's impossible to get the right diagnostic offset in this function at the moment
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
fn detectInlineItem(buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty;
switch (buf[0]) {
@ -272,12 +215,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
' ', '\t' => return error.TrailingWhitespace,
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len],
};
@ -288,34 +226,22 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.{ .space_string = slice };
},
'[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
if (buf.len < 2 or buf[buf.len - 1] != ']')
return error.BadToken;
}
// keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] };
},
'{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
if (buf.len < 2 or buf[buf.len - 1] != '}')
return error.BadToken;
}
// keep the closing } for the flow parser
return .{ .flow_map = buf[1..] };
},
else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
return error.TrailingWhitespace;
}
return .{ .scalar = buf };
},