diff --git a/examples/parse.zig b/examples/parse.zig
index 69a6bd2..1c890ac 100644
--- a/examples/parse.zig
+++ b/examples/parse.zig
@@ -15,9 +15,7 @@ pub fn main() !void {
     var needfree = true;
     defer if (needfree) allocator.free(data);
 
-    var parser = nice.Parser{ .allocator = allocator };
-
-    const document = try parser.parseBuffer(data);
+    const document = try nice.parseBuffer(allocator, data, .{});
     defer document.deinit();
 
     // free data memory to ensure that the parsed document is not holding
diff --git a/src/linebuffer.zig b/src/linebuffer.zig
index 3d15497..f48a265 100644
--- a/src/linebuffer.zig
+++ b/src/linebuffer.zig
@@ -2,106 +2,176 @@ const std = @import("std");
 
 pub const IndexSlice = struct { start: usize, len: usize };
 
-pub const LineBuffer = struct {
-    allocator: std.mem.Allocator,
-    internal: FixedLineBuffer,
-    used: usize,
-
-    pub const default_capacity: usize = 4096;
-    pub const Error = std.mem.Allocator.Error;
-
-    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
-        return initCapacity(allocator, default_capacity);
-    }
-
-    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
-        return .{
-            .allocator = allocator,
-            .internal = .{
-                .buffer = try allocator.alloc(u8, capacity),
-                .window = .{ .start = 0, .len = 0 },
-            },
-            .used = 0,
-        };
-    }
-
-    pub fn deinit(self: LineBuffer) void {
-        self.allocator.free(self.internal.buffer);
-    }
-
-    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
-        if (data.len == 0) return;
-        // TODO: check for usize overflow here if we want Maximum Robustness
-        const new_window_len = self.internal.window.len + data.len;
-
-        // data cannot fit in the buffer with our scan window, so we have to realloc
-        if (new_window_len > self.internal.buffer.len) {
-            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
-            // on every invocation but will cause the buffer to oversize
-            self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), new_window_len);
-            self.rehome();
-            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
-        }
-        // data will fit, but needs to be moved in the buffer
-        else if (self.internal.window.start + new_window_len > self.internal.buffer.len) {
-            self.rehome();
-            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
-        }
-        // data can simply be appended
-        else {
-            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
-        }
-        self.used += data.len;
-        self.internal.window.len = new_window_len;
-    }
-
-    /// The memory returned by this function is valid until the next call to `feed`.
-    /// The resulting slice does not include the newline character.
-    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
-        return self.internal.nextLine();
-    }
-
-    fn rehome(self: *LineBuffer) void {
-        self.internal.rehome();
-        self.used = self.internal.window.len;
-    }
+/// Validation errors `nextLine` can return; each corresponds to a `Strictness` check.
+pub const Error = error{
+    IllegalCarriageReturn,
+    IllegalTrailingSpace,
+    IllegalNonprintingAscii,
+    InputIsNotValidUtf8,
 };
 
-pub const FixedLineBuffer = struct {
-    buffer: []const u8,
-    window: IndexSlice,
-
-    pub fn init(data: []const u8) FixedLineBuffer {
-        return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
-    }
-
-    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
-        if (self.window.start >= self.buffer.len or self.window.len == 0)
-            return null;
-
-        const window = self.buffer[self.window.start..][0..self.window.len];
-        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
-
-        self.window.start += split + 1;
-        self.window.len -= split + 1;
-
-        return window[0..split];
-    }
-
-    // move the current scan window to the beginning of the buffer. This internal
-    // method is used by LineBuffer.
-    fn rehome(self: *FixedLineBuffer) void {
-        if (self.window.start == 0) return;
-
-        const window = self.buffer[self.window.start..][0..self.window.len];
-
-        // if the window is longer than its starting index, the memory move will be
-        // overlapping, so we can't use memcpy
-        if (self.window.len > self.window.start)
-            std.mem.copyForwards(u8, @constCast(self.buffer), window)
-        else
-            @memcpy(@constCast(self.buffer.ptr), window);
-
-        self.window.start = 0;
-    }
+/// Compile-time switches selecting which checks `nextLine` applies to the input.
+pub const Strictness = struct {
+    check_carriage_return: bool = true,
+    check_trailing_whitespace: bool = true,
+    check_nonprinting_ascii: bool = true,
+    validate_utf8: bool = false,
 };
+
+pub const ValidatingLineBuffer = LineBuffer(.{
+    .validate_utf8 = true,
+});
+pub const StrictLineBuffer = LineBuffer(.{});
+pub const SloppyLineBuffer = LineBuffer(.{
+    .check_carriage_return = false,
+    .check_trailing_whitespace = false,
+    .check_nonprinting_ascii = false,
+    .validate_utf8 = false,
+});
+pub const ValidatingFixedLineBuffer = FixedLineBuffer(.{
+    .validate_utf8 = true,
+});
+pub const StrictFixedLineBuffer = FixedLineBuffer(.{});
+pub const SloppyFixedLineBuffer = FixedLineBuffer(.{
+    .check_carriage_return = false,
+    .check_trailing_whitespace = false,
+    .check_nonprinting_ascii = false,
+    .validate_utf8 = false,
+});
+
+/// Allocator-backed line buffer: `feed` appends input, `nextLine` yields lines.
+pub fn LineBuffer(comptime options: Strictness) type {
+    return struct {
+        allocator: std.mem.Allocator,
+        internal: FixedLineBuffer(options),
+        used: usize,
+
+        pub const default_capacity: usize = 4096;
+
+        pub fn init(allocator: std.mem.Allocator) !@This() {
+            return initCapacity(allocator, default_capacity);
+        }
+
+        pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
+            return .{
+                .allocator = allocator,
+                .internal = .{
+                    .buffer = try allocator.alloc(u8, capacity),
+                    .window = .{ .start = 0, .len = 0 },
+                },
+                .used = 0,
+            };
+        }
+
+        pub fn deinit(self: @This()) void {
+            self.allocator.free(self.internal.buffer);
+        }
+
+        pub fn feed(self: *@This(), data: []const u8) !void {
+            if (data.len == 0) return;
+            // TODO: check for usize overflow here if we want Maximum Robustness
+            const new_window_len = self.internal.window.len + data.len;
+
+            // data cannot fit in the buffer with our scan window, so we have to realloc
+            if (new_window_len > self.internal.buffer.len) {
+                // TODO: adopt an overallocation strategy? Will potentially avoid allocating
+                // on every invocation but will cause the buffer to oversize
+                self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), new_window_len);
+                self.rehome();
+                @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
+            }
+            // data will fit, but needs to be moved in the buffer
+            else if (self.internal.window.start + new_window_len > self.internal.buffer.len) {
+                self.rehome();
+                @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
+            }
+            // data can simply be appended
+            else {
+                @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
+            }
+            self.used += data.len;
+            self.internal.window.len = new_window_len;
+        }
+
+        /// The memory returned by this function is valid until the next call to `feed`.
+        /// The resulting slice does not include the newline character.
+        pub fn nextLine(self: *@This()) Error!?[]const u8 {
+            return self.internal.nextLine();
+        }
+
+        fn rehome(self: *@This()) void {
+            self.internal.rehome();
+            self.used = self.internal.window.len;
+        }
+    };
+}
+
+/// Line scanner over a caller-supplied buffer, applying the checks in `options`.
+pub fn FixedLineBuffer(comptime options: Strictness) type {
+    return struct {
+        buffer: []const u8,
+        window: IndexSlice,
+
+        pub fn init(data: []const u8) @This() {
+            return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
+        }
+
+        /// Returns the next line without its newline, or null when the window holds
+        /// no complete line. Fails with an `Error` if an enabled check rejects input.
+        pub fn nextLine(self: *@This()) Error!?[]const u8 {
+            if (self.window.start >= self.buffer.len or self.window.len == 0)
+                return null;
+
+            const window = self.buffer[self.window.start..][0..self.window.len];
+
+            const split: usize = split: {
+                for (window, 0..) |char, idx| {
+                    if (comptime options.check_carriage_return)
+                        if (char == '\r') return error.IllegalCarriageReturn;
+
+                    if (comptime options.check_nonprinting_ascii)
+                        if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
+                            return error.IllegalNonprintingAscii;
+
+                    if (comptime options.check_trailing_whitespace) {
+                        if (char == '\n') {
+                            if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
+                                return error.IllegalTrailingSpace;
+                            break :split idx;
+                        }
+                    } else {
+                        if (char == '\n') break :split idx;
+                    }
+                }
+                return null;
+            };
+
+            self.window.start += split + 1;
+            self.window.len -= split + 1;
+
+            if (comptime options.validate_utf8) {
+                const line = window[0..split];
+                return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
+            } else {
+                return window[0..split];
+            }
+        }
+
+        // move the current scan window to the beginning of the buffer. This internal
+        // method is used by LineBuffer.
+        fn rehome(self: *@This()) void {
+            if (self.window.start == 0) return;
+
+            const window = self.buffer[self.window.start..][0..self.window.len];
+
+            // if the window is longer than its starting index, the memory move will be
+            // overlapping, so we can't use memcpy
+            if (self.window.len > self.window.start)
+                std.mem.copyForwards(u8, @constCast(self.buffer), window)
+            else
+                @memcpy(@constCast(self.buffer.ptr), window);
+
+            self.window.start = 0;
+        }
+    };
+}
diff --git a/src/nice.zig b/src/nice.zig
index 181f20f..d025f35 100644
--- a/src/nice.zig
+++ b/src/nice.zig
@@ -64,7 +64,7 @@ const std = @import("std");
 pub const buffers = @import("./linebuffer.zig");
 pub const tokenizer = @import("./tokenizer.zig");
 pub const parser = @import("./parser.zig");
-pub const Parser = parser.Parser;
+pub const parseBuffer = parser.parseBuffer;
 pub const StreamParser = parser.StreamParser;
 pub const Document = parser.Document;
 pub const Value = parser.Value;
diff --git a/src/parser.zig b/src/parser.zig
index a10e181..f7e0a1e 100644
--- a/src/parser.zig
+++ b/src/parser.zig
@@ -7,9 +7,9 @@ pub const Document = @import("./parser/state.zig").Document;
 pub const Value = @import("./parser/value.zig").Value;
 
 pub const Diagnostics = struct {
-    row: usize,
-    span: struct { absolute: usize, line_offset: usize, length: usize },
-    message: []const u8,
+    row: usize = 0,
+    span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
+    message: []const u8 = "no problems",
 };
 
 pub const Error = error{
@@ -42,64 +42,54 @@ pub const Options = struct {
     default_object: enum { string, list, map, fail } = .fail,
 };
 
-pub const Parser = struct {
-    allocator: std.mem.Allocator,
-    options: Options = .{},
-    diagnostics: Diagnostics = .{
-        .row = 0,
-        .span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
-        .message = "all is well",
-    },
+pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
+    var state = State.init(allocator);
+    defer state.deinit();
+    errdefer state.document.deinit();
 
-    pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
-        var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
-            .buffer = buffers.FixedLineBuffer.init(buffer),
-            .diagnostics = &self.diagnostics,
-        };
+    var diagnostics = Diagnostics{};
+    var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
+        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
+        .diagnostics = &diagnostics,
+    };
 
-        var state = State.init(self.allocator);
-        defer state.deinit();
-        errdefer state.document.deinit();
-
-        // TODO: pass the diagnostics pointer as well
-        while (try tok.next()) |line| try state.parseLine(line, self.options.duplicate_key_behavior);
-
-        return try state.finish(self.options);
-    }
-};
+    while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
+    return try state.finish(options);
+}
 
 pub const StreamParser = struct {
-    linetok: tokenizer.LineTokenizer(buffers.LineBuffer),
-    state: State,
-    options: Options = .{},
-    diagnostics: Diagnostics = .{
-        .row = 0,
-        .span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
-        .message = "all is well",
-    },
+    linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
+    parse_state: State,
+    parse_options: Options = .{},
+    diagnostics: Diagnostics = .{},
 
     pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
+        const diagnostics = try allocator.create(Diagnostics);
+        errdefer allocator.destroy(diagnostics);
+        diagnostics.* = Diagnostics{};
+
         return .{
             .linetok = .{
-                .buffer = try buffers.LineBuffer.init(allocator),
-                .diagnostics = &@as(*StreamParser, @ptrFromInt(@returnAddress())).diagnostics,
+                .buffer = try buffers.ValidatingLineBuffer.init(allocator),
+                .diagnostics = diagnostics,
             },
-            .state = State.init(allocator),
-            .options = options,
+            .parse_state = State.init(allocator),
+            .parse_options = options,
         };
     }
 
     pub fn deinit(self: StreamParser) void {
+        self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
         self.linetok.buffer.deinit();
-        self.state.deinit();
+        self.parse_state.deinit();
     }
 
-    pub fn feed(self: *StreamParser, data: []const u8) Error!void {
+    pub fn feed(self: *StreamParser, data: []const u8) !void {
         try self.linetok.buffer.feed(data);
-        while (try self.linetok.next()) |line| try self.state.parseLine(line, self.options.duplicate_key_behavior);
+        while (try self.linetok.next()) |line| try self.parse_state.parseLine(line, self.parse_options.duplicate_key_behavior);
     }
 
-    pub fn finish(self: *StreamParser) Error!Document {
-        return try self.state.finish(self.options);
+    pub fn finish(self: *StreamParser) !Document {
+        return try self.parse_state.finish(self.parse_options);
     }
 };
diff --git a/src/tokenizer.zig b/src/tokenizer.zig
index af3786d..e52ba4a 100644
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@@ -70,8 +70,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
         diagnostics: *Diagnostics,
         row: usize = 0,
 
-        pub fn next(self: *@This()) Error!?Line {
-            lineloop: while (self.buffer.nextLine()) |raw_line| {
+        pub fn next(self: *@This()) !?Line {
+            lineloop: while (try self.buffer.nextLine()) |raw_line| {
                 var indent: usize = 0;
                 for (raw_line, 0..) |char, idx| {
                     switch (char) {