linebuffer: add strictness options

When the buffer was separated from the tokenizer, we lost some
validation, including really aggressive carriage return detection.
This commit restores that validation in full force and adds some
additional checks on top of it.
This commit is contained in:
torque 2023-09-26 00:06:39 -07:00
parent 7f82c24584
commit 0e60719c85
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
5 changed files with 201 additions and 149 deletions

View File

@ -15,9 +15,7 @@ pub fn main() !void {
var needfree = true;
defer if (needfree) allocator.free(data);
var parser = nice.Parser{ .allocator = allocator };
const document = try parser.parseBuffer(data);
const document = try nice.parseBuffer(allocator, data, .{});
defer document.deinit();
// free data memory to ensure that the parsed document is not holding

View File

@ -2,106 +2,170 @@ const std = @import("std");
/// A region of a buffer described by a starting offset and a length rather
/// than by a pointer.
pub const IndexSlice = struct {
    /// Offset of the first element of the region.
    start: usize,
    /// Number of elements in the region.
    len: usize,
};
pub const LineBuffer = struct {
allocator: std.mem.Allocator,
internal: FixedLineBuffer,
used: usize,
pub const default_capacity: usize = 4096;
pub const Error = std.mem.Allocator.Error;
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
return initCapacity(allocator, default_capacity);
}
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
return .{
.allocator = allocator,
.internal = .{
.buffer = try allocator.alloc(u8, capacity),
.window = .{ .start = 0, .len = 0 },
},
.used = 0,
};
}
pub fn deinit(self: LineBuffer) void {
self.allocator.free(self.internal.buffer);
}
pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
if (data.len == 0) return;
// TODO: check for usize overflow here if we want Maximum Robustness
const new_window_len = self.internal.window.len + data.len;
// data cannot fit in the buffer with our scan window, so we have to realloc
if (new_window_len > self.internal.buffer.len) {
// TODO: adopt an overallocation strategy? Will potentially avoid allocating
// on every invocation but will cause the buffer to oversize
self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), new_window_len);
self.rehome();
@memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
}
// data will fit, but needs to be moved in the buffer
else if (self.internal.window.start + new_window_len > self.internal.buffer.len) {
self.rehome();
@memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
}
// data can simply be appended
else {
@memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
}
self.used += data.len;
self.internal.window.len = new_window_len;
}
/// The memory returned by this function is valid until the next call to `feed`.
/// The resulting slice does not include the newline character.
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
return self.internal.nextLine();
}
fn rehome(self: *LineBuffer) void {
self.internal.rehome();
self.used = self.internal.window.len;
}
/// Validation errors associated with the line buffers in this file.
pub const Error = error{
    // Names that `nextLine` actually returns.
    IllegalCarriageReturn,
    IllegalTrailingSpace,
    IllegalNonprintingAscii,
    InputIsNotValidUtf8,
    // Earlier spellings, kept so that existing references to them still
    // compile. `nextLine` does not return these.
    CarriageReturn,
    TrailingWhitespace,
    NonprintingAscii,
};
pub const FixedLineBuffer = struct {
buffer: []const u8,
window: IndexSlice,
pub fn init(data: []const u8) FixedLineBuffer {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
}
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
if (self.window.start >= self.buffer.len or self.window.len == 0)
return null;
const window = self.buffer[self.window.start..][0..self.window.len];
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
// move the current scan window to the beginning of the buffer. This internal
// method is used by LineBuffer.
fn rehome(self: *FixedLineBuffer) void {
if (self.window.start == 0) return;
const window = self.buffer[self.window.start..][0..self.window.len];
// if the window is longer than its starting index, the memory move will be
// overlapping, so we can't use memcpy
if (self.window.len > self.window.start)
std.mem.copyForwards(u8, @constCast(self.buffer), window)
else
@memcpy(@constCast(self.buffer.ptr), window);
self.window.start = 0;
}
/// Compile-time switches selecting which checks `nextLine` performs on the
/// bytes it scans.
pub const Strictness = struct {
/// Fail with error.IllegalCarriageReturn when a '\r' byte is scanned.
check_carriage_return: bool = true,
/// Fail with error.IllegalTrailingSpace when a line's '\n' is directly
/// preceded by a space or a tab.
check_trailing_whitespace: bool = true,
/// Fail with error.IllegalNonprintingAscii on ASCII control bytes and on
/// DEL (0x7F). '\n' and '\t' are always allowed.
check_nonprinting_ascii: bool = true,
/// Validate each returned line as UTF-8, failing with
/// error.InputIsNotValidUtf8. Off by default.
validate_utf8: bool = false,
};
// Option presets shared by the growable and the fixed buffer families, so the
// two variants of each flavor are always configured identically.

/// Every default check, plus UTF-8 validation of each returned line.
const validating_options: Strictness = .{
    .validate_utf8 = true,
};

/// The defaults declared on `Strictness`, unmodified.
const strict_options: Strictness = .{};

/// Every check switched off.
const sloppy_options: Strictness = .{
    .check_carriage_return = false,
    .check_trailing_whitespace = false,
    .check_nonprinting_ascii = false,
    .validate_utf8 = false,
};

/// Growable buffer with the default checks and UTF-8 validation.
pub const ValidatingLineBuffer = LineBuffer(validating_options);
/// Growable buffer with the default checks.
pub const StrictLineBuffer = LineBuffer(strict_options);
/// Growable buffer with no checks.
pub const SloppyLineBuffer = LineBuffer(sloppy_options);

/// Fixed buffer with the default checks and UTF-8 validation.
pub const ValidatingFixedLineBuffer = FixedLineBuffer(validating_options);
/// Fixed buffer with the default checks.
pub const StrictFixedLineBuffer = FixedLineBuffer(strict_options);
/// Fixed buffer with no checks.
pub const SloppyFixedLineBuffer = FixedLineBuffer(sloppy_options);
/// Build a growable line buffer type whose validation is selected at compile
/// time by `options`.
///
/// Bytes handed to `feed` are copied into storage obtained from `allocator`;
/// line splitting and validation are delegated to `FixedLineBuffer(options)`.
pub fn LineBuffer(comptime options: Strictness) type {
    return struct {
        /// Allocator that owns `internal.buffer`.
        allocator: std.mem.Allocator,
        /// Splitter operating on the owned storage.
        internal: FixedLineBuffer(options),
        /// Offset into the storage at which the next `feed` writes.
        used: usize,

        const Self = @This();

        /// Storage size used by `init`.
        pub const default_capacity: usize = 4096;

        /// Create a buffer with `default_capacity` bytes of storage.
        pub fn init(allocator: std.mem.Allocator) !Self {
            return initCapacity(allocator, default_capacity);
        }

        /// Create a buffer with `capacity` bytes of storage and an empty scan
        /// window. Fails if the allocation fails.
        pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !Self {
            const storage = try allocator.alloc(u8, capacity);
            return .{
                .allocator = allocator,
                .internal = .{
                    .buffer = storage,
                    .window = .{ .start = 0, .len = 0 },
                },
                .used = 0,
            };
        }

        /// Release the storage.
        pub fn deinit(self: Self) void {
            self.allocator.free(self.internal.buffer);
        }

        /// Append a copy of `data` to the unconsumed bytes. Empty input is a
        /// no-op. The storage is reallocated when the unconsumed bytes plus
        /// `data` exceed it; otherwise the unconsumed bytes are moved to the
        /// front when `data` would not fit behind them.
        pub fn feed(self: *Self, data: []const u8) !void {
            if (data.len == 0) return;

            // TODO: check for usize overflow here if we want Maximum Robustness
            const needed = self.internal.window.len + data.len;
            const capacity = self.internal.buffer.len;

            if (needed > capacity) {
                // The unconsumed bytes plus the new data cannot fit at all.
                // TODO: adopt an overallocation strategy? Will potentially avoid allocating
                // on every invocation but will cause the buffer to oversize
                self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), needed);
                self.rehome();
            } else if (self.internal.window.start + needed > capacity) {
                // Everything fits, but only once the window sits at offset 0.
                self.rehome();
            }

            // In every case the new bytes land directly after the existing ones.
            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);

            self.used += data.len;
            self.internal.window.len = needed;
        }

        /// The memory returned by this function is valid until the next call to `feed`.
        /// The resulting slice does not include the newline character.
        pub fn nextLine(self: *Self) !?[]const u8 {
            return self.internal.nextLine();
        }

        // Move the unconsumed bytes to the front of the storage and point
        // `used` just past them.
        fn rehome(self: *Self) void {
            self.internal.rehome();
            self.used = self.internal.window.len;
        }
    };
}
/// Build a line splitter type over a byte slice, with validation selected at
/// compile time by `options`.
///
/// Lines returned by `nextLine` are sub-slices of `buffer`; nothing is copied
/// or allocated here.
pub fn FixedLineBuffer(comptime options: Strictness) type {
    return struct {
        /// Bytes being split into lines.
        buffer: []const u8,
        /// Region of `buffer` that has not yet been returned as a line.
        window: IndexSlice,

        const Self = @This();

        /// Wrap `data`, with the scan window covering all of it.
        pub fn init(data: []const u8) Self {
            return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
        }

        /// Return the next '\n'-terminated line in the window, without the
        /// '\n', and advance the window past it.
        ///
        /// Returns null when the window is empty or contains no '\n'. Bytes
        /// after the last '\n' stay in the window and are not returned.
        ///
        /// Errors, each only when the matching `options` flag is enabled:
        ///  - error.IllegalCarriageReturn: a '\r' was scanned.
        ///  - error.IllegalNonprintingAscii: a byte below ' ' other than '\n',
        ///    '\t' or '\r', or the byte 0x7F, was scanned.
        ///  - error.IllegalTrailingSpace: the '\n' is directly preceded by a
        ///    space or a tab.
        ///  - error.InputIsNotValidUtf8: the line is not valid UTF-8.
        ///
        /// The first three are raised during the scan, so they can be reported
        /// before any '\n' has been found, and the window is not advanced. The
        /// UTF-8 error is raised after the window has moved past the line.
        pub fn nextLine(self: *Self) !?[]const u8 {
            if (self.window.start >= self.buffer.len or self.window.len == 0)
                return null;

            const window = self.buffer[self.window.start..][0..self.window.len];
            const split: usize = split: {
                for (window, 0..) |char, idx| {
                    if (comptime options.check_carriage_return)
                        if (char == '\r') return error.IllegalCarriageReturn;

                    // '\r' is exempt here: it is governed solely by
                    // check_carriage_return, so switching that option off
                    // really does permit carriage returns.
                    if (comptime options.check_nonprinting_ascii)
                        if ((char != '\n' and char != '\t' and char != '\r') and (char < ' ' or char == 0x7F))
                            return error.IllegalNonprintingAscii;

                    if (char != '\n') continue;

                    if (comptime options.check_trailing_whitespace)
                        if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
                            return error.IllegalTrailingSpace;

                    break :split idx;
                }
                return null;
            };

            // Consume the line together with its newline.
            self.window.start += split + 1;
            self.window.len -= split + 1;

            if (comptime options.validate_utf8) {
                const line = window[0..split];
                return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
            } else {
                return window[0..split];
            }
        }

        // move the current scan window to the beginning of the buffer. This internal
        // method is used by LineBuffer. It writes through `buffer` by casting
        // away const.
        fn rehome(self: *Self) void {
            if (self.window.start == 0) return;

            const window = self.buffer[self.window.start..][0..self.window.len];

            // if the window is longer than its starting index, the memory move will be
            // overlapping, so we can't use memcpy
            if (self.window.len > self.window.start)
                std.mem.copyForwards(u8, @constCast(self.buffer), window)
            else
                @memcpy(@constCast(self.buffer.ptr), window);

            self.window.start = 0;
        }
    };
}

View File

@ -64,7 +64,7 @@ const std = @import("std");
pub const buffers = @import("./linebuffer.zig");
pub const tokenizer = @import("./tokenizer.zig");
pub const parser = @import("./parser.zig");
pub const Parser = parser.Parser;
pub const parseBuffer = parser.parseBuffer;
pub const StreamParser = parser.StreamParser;
pub const Document = parser.Document;
pub const Value = parser.Value;

View File

@ -7,9 +7,9 @@ pub const Document = @import("./parser/state.zig").Document;
pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct {
row: usize,
span: struct { absolute: usize, line_offset: usize, length: usize },
message: []const u8,
row: usize = 0,
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
message: []const u8 = "no problems",
};
pub const Error = error{
@ -42,64 +42,54 @@ pub const Options = struct {
default_object: enum { string, list, map, fail } = .fail,
};
pub const Parser = struct {
allocator: std.mem.Allocator,
options: Options = .{},
diagnostics: Diagnostics = .{
.row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well",
},
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
var state = State.init(allocator);
defer state.deinit();
errdefer state.document.deinit();
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
.buffer = buffers.FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics,
};
var diagnostics = Diagnostics{};
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
.diagnostics = &diagnostics,
};
var state = State.init(self.allocator);
defer state.deinit();
errdefer state.document.deinit();
// TODO: pass the diagnostics pointer as well
while (try tok.next()) |line| try state.parseLine(line, self.options.duplicate_key_behavior);
return try state.finish(self.options);
}
};
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
return try state.finish(options);
}
pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.LineBuffer),
state: State,
options: Options = .{},
diagnostics: Diagnostics = .{
.row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well",
},
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State,
parse_options: Options = .{},
diagnostics: Diagnostics = .{},
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
const diagnostics = try allocator.create(Diagnostics);
errdefer allocator.destroy(diagnostics);
diagnostics.* = Diagnostics{};
return .{
.linetok = .{
.buffer = try buffers.LineBuffer.init(allocator),
.diagnostics = &@as(*StreamParser, @ptrFromInt(@returnAddress())).diagnostics,
.buffer = try buffers.ValidatingLineBuffer.init(allocator),
.diagnostics = diagnostics,
},
.state = State.init(allocator),
.options = options,
.parse_state = State.init(allocator),
.parse_options = options,
};
}
pub fn deinit(self: StreamParser) void {
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
self.linetok.buffer.deinit();
self.state.deinit();
self.parse_state.deinit();
}
pub fn feed(self: *StreamParser, data: []const u8) Error!void {
pub fn feed(self: *StreamParser, data: []const u8) !void {
try self.linetok.buffer.feed(data);
while (try self.linetok.next()) |line| try self.state.parseLine(line, self.options.duplicate_key_behavior);
while (try self.linetok.next()) |line| try self.parse_state.parseLine(line, self.parse_options.duplicate_key_behavior);
}
pub fn finish(self: *StreamParser) Error!Document {
return try self.state.finish(self.options);
pub fn finish(self: *StreamParser) !Document {
return try self.parse_state.finish(self.parse_options);
}
};

View File

@ -70,8 +70,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
diagnostics: *Diagnostics,
row: usize = 0,
pub fn next(self: *@This()) Error!?Line {
lineloop: while (self.buffer.nextLine()) |raw_line| {
pub fn next(self: *@This()) !?Line {
lineloop: while (try self.buffer.nextLine()) |raw_line| {
var indent: usize = 0;
for (raw_line, 0..) |char, idx| {
switch (char) {