linebuffer: add strictness options
When the buffer was separated from the tokenizer, we lost some validation, including the very aggressive carriage return detection. This commit restores that validation in full force and adds some additional validation on top of it.
This commit is contained in:
parent
7f82c24584
commit
0e60719c85
@ -15,9 +15,7 @@ pub fn main() !void {
|
||||
var needfree = true;
|
||||
defer if (needfree) allocator.free(data);
|
||||
|
||||
var parser = nice.Parser{ .allocator = allocator };
|
||||
|
||||
const document = try parser.parseBuffer(data);
|
||||
const document = try nice.parseBuffer(allocator, data, .{});
|
||||
defer document.deinit();
|
||||
|
||||
// free data memory to ensure that the parsed document is not holding
|
||||
|
@ -2,19 +2,54 @@ const std = @import("std");
|
||||
|
||||
pub const IndexSlice = struct { start: usize, len: usize };
|
||||
|
||||
pub const LineBuffer = struct {
|
||||
/// Errors a line buffer can report while scanning its input.
///
/// `FixedLineBuffer.nextLine` returns the `Illegal*` names and
/// `InputIsNotValidUtf8`. The un-prefixed names were the only ones declared
/// here previously, which meant this set did not cover the errors actually
/// returned; they are retained so existing references keep compiling.
pub const Error = error{
    CarriageReturn,
    TrailingWhitespace,
    NonprintingAscii,
    InputIsNotValidUtf8,
    // Names actually returned by the line scanner.
    IllegalCarriageReturn,
    IllegalTrailingSpace,
    IllegalNonprintingAscii,
};
|
||||
|
||||
/// Compile-time switches selecting which checks a line buffer performs on
/// each line it scans. Passed as the `comptime options` argument of
/// `LineBuffer` and `FixedLineBuffer`.
pub const Strictness = struct {
|
||||
/// When set, any '\r' byte encountered while scanning fails with
/// `error.IllegalCarriageReturn`.
check_carriage_return: bool = true,
|
||||
/// When set, a space or tab immediately before the terminating '\n' fails
/// with `error.IllegalTrailingSpace`.
check_trailing_whitespace: bool = true,
|
||||
/// When set, any byte below ' ' (other than '\n' and '\t') or equal to
/// 0x7F fails with `error.IllegalNonprintingAscii`.
check_nonprinting_ascii: bool = true,
|
||||
/// When set, each line is passed through `std.unicode.utf8ValidateSlice`
/// before being returned; invalid input fails with
/// `error.InputIsNotValidUtf8`. Off by default.
validate_utf8: bool = false,
|
||||
};
|
||||
|
||||
pub const ValidatingLineBuffer = LineBuffer(.{
|
||||
.validate_utf8 = true,
|
||||
});
|
||||
pub const StrictLineBuffer = LineBuffer(.{});
|
||||
pub const SloppyLineBuffer = LineBuffer(.{
|
||||
.check_carriage_return = false,
|
||||
.check_trailing_whitespace = false,
|
||||
.check_nonprinting_ascii = false,
|
||||
.validate_utf8 = false,
|
||||
});
|
||||
pub const ValidatingFixedLineBuffer = FixedLineBuffer(.{
|
||||
.validate_utf8 = true,
|
||||
});
|
||||
pub const StrictFixedLineBuffer = FixedLineBuffer(.{});
|
||||
pub const SloppyFixedLineBuffer = FixedLineBuffer(.{
|
||||
.check_carriage_return = false,
|
||||
.check_trailing_whitespace = false,
|
||||
.check_nonprinting_ascii = false,
|
||||
.validate_utf8 = false,
|
||||
});
|
||||
|
||||
pub fn LineBuffer(comptime options: Strictness) type {
|
||||
return struct {
|
||||
allocator: std.mem.Allocator,
|
||||
internal: FixedLineBuffer,
|
||||
internal: FixedLineBuffer(options),
|
||||
used: usize,
|
||||
|
||||
pub const default_capacity: usize = 4096;
|
||||
pub const Error = std.mem.Allocator.Error;
|
||||
|
||||
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
|
||||
pub fn init(allocator: std.mem.Allocator) !@This() {
|
||||
return initCapacity(allocator, default_capacity);
|
||||
}
|
||||
|
||||
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
|
||||
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
|
||||
return .{
|
||||
.allocator = allocator,
|
||||
.internal = .{
|
||||
@ -25,11 +60,11 @@ pub const LineBuffer = struct {
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(self: LineBuffer) void {
|
||||
pub fn deinit(self: @This()) void {
|
||||
self.allocator.free(self.internal.buffer);
|
||||
}
|
||||
|
||||
pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
|
||||
pub fn feed(self: *@This(), data: []const u8) !void {
|
||||
if (data.len == 0) return;
|
||||
// TODO: check for usize overflow here if we want Maximum Robustness
|
||||
const new_window_len = self.internal.window.len + data.len;
|
||||
@ -57,40 +92,68 @@ pub const LineBuffer = struct {
|
||||
|
||||
/// The memory returned by this function is valid until the next call to `feed`.
|
||||
/// The resulting slice does not include the newline character.
|
||||
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
|
||||
pub fn nextLine(self: *@This()) !?[]const u8 {
|
||||
return self.internal.nextLine();
|
||||
}
|
||||
|
||||
fn rehome(self: *LineBuffer) void {
|
||||
fn rehome(self: *@This()) void {
|
||||
self.internal.rehome();
|
||||
self.used = self.internal.window.len;
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
pub const FixedLineBuffer = struct {
|
||||
pub fn FixedLineBuffer(comptime options: Strictness) type {
|
||||
return struct {
|
||||
buffer: []const u8,
|
||||
window: IndexSlice,
|
||||
|
||||
pub fn init(data: []const u8) FixedLineBuffer {
|
||||
pub fn init(data: []const u8) @This() {
|
||||
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
|
||||
}
|
||||
|
||||
/// Return the next newline-terminated line in the scan window, without the
/// newline itself, or `null` when the window is empty or contains no '\n'.
///
/// The checks enabled in `options` run on every byte scanned. Those checks
/// return their error before the window is advanced; UTF-8 validation (when
/// enabled) runs on the finished line after the window has already moved
/// past it.
pub fn nextLine(self: *@This()) !?[]const u8 {
    if (self.window.len == 0 or self.window.start >= self.buffer.len)
        return null;

    const window = self.buffer[self.window.start..][0..self.window.len];

    const newline_at: usize = scan: {
        for (window, 0..) |byte, pos| {
            if (comptime options.check_carriage_return) {
                if (byte == '\r') return error.IllegalCarriageReturn;
            }

            if (comptime options.check_nonprinting_ascii) {
                // '\n' and '\t' are the only control bytes tolerated.
                const is_control = byte < ' ' or byte == 0x7F;
                if (is_control and byte != '\n' and byte != '\t')
                    return error.IllegalNonprintingAscii;
            }

            if (byte != '\n') continue;

            if (comptime options.check_trailing_whitespace) {
                if (pos > 0) {
                    const previous = window[pos - 1];
                    if (previous == ' ' or previous == '\t')
                        return error.IllegalTrailingSpace;
                }
            }

            break :scan pos;
        }

        // No complete line is available yet.
        return null;
    };

    // Step over the line and its terminating newline.
    const consumed = newline_at + 1;
    self.window.start += consumed;
    self.window.len -= consumed;

    const line = window[0..newline_at];

    if (comptime options.validate_utf8) {
        if (!std.unicode.utf8ValidateSlice(line)) return error.InputIsNotValidUtf8;
    }

    return line;
}
|
||||
|
||||
// move the current scan window to the beginning of the buffer. This internal
|
||||
// method is used by LineBuffer.
|
||||
fn rehome(self: *FixedLineBuffer) void {
|
||||
fn rehome(self: *@This()) void {
|
||||
if (self.window.start == 0) return;
|
||||
|
||||
const window = self.buffer[self.window.start..][0..self.window.len];
|
||||
@ -104,4 +167,5 @@ pub const FixedLineBuffer = struct {
|
||||
|
||||
self.window.start = 0;
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
@ -64,7 +64,7 @@ const std = @import("std");
|
||||
pub const buffers = @import("./linebuffer.zig");
|
||||
pub const tokenizer = @import("./tokenizer.zig");
|
||||
pub const parser = @import("./parser.zig");
|
||||
pub const Parser = parser.Parser;
|
||||
pub const parseBuffer = parser.parseBuffer;
|
||||
pub const StreamParser = parser.StreamParser;
|
||||
pub const Document = parser.Document;
|
||||
pub const Value = parser.Value;
|
||||
|
@ -7,9 +7,9 @@ pub const Document = @import("./parser/state.zig").Document;
|
||||
pub const Value = @import("./parser/value.zig").Value;
|
||||
|
||||
pub const Diagnostics = struct {
|
||||
row: usize,
|
||||
span: struct { absolute: usize, line_offset: usize, length: usize },
|
||||
message: []const u8,
|
||||
row: usize = 0,
|
||||
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
|
||||
message: []const u8 = "no problems",
|
||||
};
|
||||
|
||||
pub const Error = error{
|
||||
@ -42,64 +42,54 @@ pub const Options = struct {
|
||||
default_object: enum { string, list, map, fail } = .fail,
|
||||
};
|
||||
|
||||
pub const Parser = struct {
|
||||
allocator: std.mem.Allocator,
|
||||
options: Options = .{},
|
||||
diagnostics: Diagnostics = .{
|
||||
.row = 0,
|
||||
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
|
||||
.message = "all is well",
|
||||
},
|
||||
|
||||
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
|
||||
var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
|
||||
.buffer = buffers.FixedLineBuffer.init(buffer),
|
||||
.diagnostics = &self.diagnostics,
|
||||
};
|
||||
|
||||
var state = State.init(self.allocator);
|
||||
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
|
||||
var state = State.init(allocator);
|
||||
defer state.deinit();
|
||||
errdefer state.document.deinit();
|
||||
|
||||
// TODO: pass the diagnostics pointer as well
|
||||
while (try tok.next()) |line| try state.parseLine(line, self.options.duplicate_key_behavior);
|
||||
var diagnostics = Diagnostics{};
|
||||
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
|
||||
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
|
||||
.diagnostics = &diagnostics,
|
||||
};
|
||||
|
||||
return try state.finish(self.options);
|
||||
}
|
||||
};
|
||||
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
|
||||
return try state.finish(options);
|
||||
}
|
||||
|
||||
pub const StreamParser = struct {
|
||||
linetok: tokenizer.LineTokenizer(buffers.LineBuffer),
|
||||
state: State,
|
||||
options: Options = .{},
|
||||
diagnostics: Diagnostics = .{
|
||||
.row = 0,
|
||||
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
|
||||
.message = "all is well",
|
||||
},
|
||||
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
|
||||
parse_state: State,
|
||||
parse_options: Options = .{},
|
||||
diagnostics: Diagnostics = .{},
|
||||
|
||||
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
|
||||
const diagnostics = try allocator.create(Diagnostics);
|
||||
errdefer allocator.destroy(diagnostics);
|
||||
diagnostics.* = Diagnostics{};
|
||||
|
||||
return .{
|
||||
.linetok = .{
|
||||
.buffer = try buffers.LineBuffer.init(allocator),
|
||||
.diagnostics = &@as(*StreamParser, @ptrFromInt(@returnAddress())).diagnostics,
|
||||
.buffer = try buffers.ValidatingLineBuffer.init(allocator),
|
||||
.diagnostics = diagnostics,
|
||||
},
|
||||
.state = State.init(allocator),
|
||||
.options = options,
|
||||
.parse_state = State.init(allocator),
|
||||
.parse_options = options,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(self: StreamParser) void {
|
||||
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
|
||||
self.linetok.buffer.deinit();
|
||||
self.state.deinit();
|
||||
self.parse_state.deinit();
|
||||
}
|
||||
|
||||
pub fn feed(self: *StreamParser, data: []const u8) Error!void {
|
||||
pub fn feed(self: *StreamParser, data: []const u8) !void {
|
||||
try self.linetok.buffer.feed(data);
|
||||
while (try self.linetok.next()) |line| try self.state.parseLine(line, self.options.duplicate_key_behavior);
|
||||
while (try self.linetok.next()) |line| try self.parse_state.parseLine(line, self.parse_options.duplicate_key_behavior);
|
||||
}
|
||||
|
||||
pub fn finish(self: *StreamParser) Error!Document {
|
||||
return try self.state.finish(self.options);
|
||||
pub fn finish(self: *StreamParser) !Document {
|
||||
return try self.parse_state.finish(self.parse_options);
|
||||
}
|
||||
};
|
||||
|
@ -70,8 +70,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
|
||||
diagnostics: *Diagnostics,
|
||||
row: usize = 0,
|
||||
|
||||
pub fn next(self: *@This()) Error!?Line {
|
||||
lineloop: while (self.buffer.nextLine()) |raw_line| {
|
||||
pub fn next(self: *@This()) !?Line {
|
||||
lineloop: while (try self.buffer.nextLine()) |raw_line| {
|
||||
var indent: usize = 0;
|
||||
for (raw_line, 0..) |char, idx| {
|
||||
switch (char) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user