linebuffer: add strictness options
When the line buffer was separated from the tokenizer, we lost some validation, including the really aggressive carriage return detection. This commit brings that validation back in full force and adds some additional validation on top of it.
This commit is contained in:
parent
7f82c24584
commit
0e60719c85
@ -15,9 +15,7 @@ pub fn main() !void {
|
|||||||
var needfree = true;
|
var needfree = true;
|
||||||
defer if (needfree) allocator.free(data);
|
defer if (needfree) allocator.free(data);
|
||||||
|
|
||||||
var parser = nice.Parser{ .allocator = allocator };
|
const document = try nice.parseBuffer(allocator, data, .{});
|
||||||
|
|
||||||
const document = try parser.parseBuffer(data);
|
|
||||||
defer document.deinit();
|
defer document.deinit();
|
||||||
|
|
||||||
// free data memory to ensure that the parsed document is not holding
|
// free data memory to ensure that the parsed document is not holding
|
||||||
|
@ -2,19 +2,54 @@ const std = @import("std");
|
|||||||
|
|
||||||
pub const IndexSlice = struct { start: usize, len: usize };
|
pub const IndexSlice = struct { start: usize, len: usize };
|
||||||
|
|
||||||
pub const LineBuffer = struct {
|
pub const Error = error{
|
||||||
|
CarriageReturn,
|
||||||
|
TrailingWhitespace,
|
||||||
|
NonprintingAscii,
|
||||||
|
InputIsNotValidUtf8,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub const Strictness = struct {
|
||||||
|
check_carriage_return: bool = true,
|
||||||
|
check_trailing_whitespace: bool = true,
|
||||||
|
check_nonprinting_ascii: bool = true,
|
||||||
|
validate_utf8: bool = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub const ValidatingLineBuffer = LineBuffer(.{
|
||||||
|
.validate_utf8 = true,
|
||||||
|
});
|
||||||
|
pub const StrictLineBuffer = LineBuffer(.{});
|
||||||
|
pub const SloppyLineBuffer = LineBuffer(.{
|
||||||
|
.check_carriage_return = false,
|
||||||
|
.check_trailing_whitespace = false,
|
||||||
|
.check_nonprinting_ascii = false,
|
||||||
|
.validate_utf8 = false,
|
||||||
|
});
|
||||||
|
pub const ValidatingFixedLineBuffer = FixedLineBuffer(.{
|
||||||
|
.validate_utf8 = true,
|
||||||
|
});
|
||||||
|
pub const StrictFixedLineBuffer = FixedLineBuffer(.{});
|
||||||
|
pub const SloppyFixedLineBuffer = FixedLineBuffer(.{
|
||||||
|
.check_carriage_return = false,
|
||||||
|
.check_trailing_whitespace = false,
|
||||||
|
.check_nonprinting_ascii = false,
|
||||||
|
.validate_utf8 = false,
|
||||||
|
});
|
||||||
|
|
||||||
|
pub fn LineBuffer(comptime options: Strictness) type {
|
||||||
|
return struct {
|
||||||
allocator: std.mem.Allocator,
|
allocator: std.mem.Allocator,
|
||||||
internal: FixedLineBuffer,
|
internal: FixedLineBuffer(options),
|
||||||
used: usize,
|
used: usize,
|
||||||
|
|
||||||
pub const default_capacity: usize = 4096;
|
pub const default_capacity: usize = 4096;
|
||||||
pub const Error = std.mem.Allocator.Error;
|
|
||||||
|
|
||||||
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
|
pub fn init(allocator: std.mem.Allocator) !@This() {
|
||||||
return initCapacity(allocator, default_capacity);
|
return initCapacity(allocator, default_capacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
|
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !@This() {
|
||||||
return .{
|
return .{
|
||||||
.allocator = allocator,
|
.allocator = allocator,
|
||||||
.internal = .{
|
.internal = .{
|
||||||
@ -25,11 +60,11 @@ pub const LineBuffer = struct {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deinit(self: LineBuffer) void {
|
pub fn deinit(self: @This()) void {
|
||||||
self.allocator.free(self.internal.buffer);
|
self.allocator.free(self.internal.buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
|
pub fn feed(self: *@This(), data: []const u8) !void {
|
||||||
if (data.len == 0) return;
|
if (data.len == 0) return;
|
||||||
// TODO: check for usize overflow here if we want Maximum Robustness
|
// TODO: check for usize overflow here if we want Maximum Robustness
|
||||||
const new_window_len = self.internal.window.len + data.len;
|
const new_window_len = self.internal.window.len + data.len;
|
||||||
@ -57,40 +92,68 @@ pub const LineBuffer = struct {
|
|||||||
|
|
||||||
/// The memory returned by this function is valid until the next call to `feed`.
|
/// The memory returned by this function is valid until the next call to `feed`.
|
||||||
/// The resulting slice does not include the newline character.
|
/// The resulting slice does not include the newline character.
|
||||||
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
|
pub fn nextLine(self: *@This()) !?[]const u8 {
|
||||||
return self.internal.nextLine();
|
return self.internal.nextLine();
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rehome(self: *LineBuffer) void {
|
fn rehome(self: *@This()) void {
|
||||||
self.internal.rehome();
|
self.internal.rehome();
|
||||||
self.used = self.internal.window.len;
|
self.used = self.internal.window.len;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
pub const FixedLineBuffer = struct {
|
pub fn FixedLineBuffer(comptime options: Strictness) type {
|
||||||
|
return struct {
|
||||||
buffer: []const u8,
|
buffer: []const u8,
|
||||||
window: IndexSlice,
|
window: IndexSlice,
|
||||||
|
|
||||||
pub fn init(data: []const u8) FixedLineBuffer {
|
pub fn init(data: []const u8) @This() {
|
||||||
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
|
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
|
pub fn nextLine(self: *@This()) !?[]const u8 {
|
||||||
if (self.window.start >= self.buffer.len or self.window.len == 0)
|
if (self.window.start >= self.buffer.len or self.window.len == 0)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
const window = self.buffer[self.window.start..][0..self.window.len];
|
const window = self.buffer[self.window.start..][0..self.window.len];
|
||||||
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
|
|
||||||
|
const split: usize = split: {
|
||||||
|
for (window, 0..) |char, idx| {
|
||||||
|
if (comptime options.check_carriage_return)
|
||||||
|
if (char == '\r') return error.IllegalCarriageReturn;
|
||||||
|
|
||||||
|
if (comptime options.check_nonprinting_ascii)
|
||||||
|
if ((char != '\n' and char != '\t') and (char < ' ' or char == 0x7F))
|
||||||
|
return error.IllegalNonprintingAscii;
|
||||||
|
|
||||||
|
if (comptime options.check_trailing_whitespace) {
|
||||||
|
if (char == '\n') {
|
||||||
|
if (idx > 0 and (window[idx - 1] == ' ' or window[idx - 1] == '\t'))
|
||||||
|
return error.IllegalTrailingSpace;
|
||||||
|
break :split idx;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (char == '\n') break :split idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
};
|
||||||
|
|
||||||
self.window.start += split + 1;
|
self.window.start += split + 1;
|
||||||
self.window.len -= split + 1;
|
self.window.len -= split + 1;
|
||||||
|
|
||||||
|
if (comptime options.validate_utf8) {
|
||||||
|
const line = window[0..split];
|
||||||
|
return if (std.unicode.utf8ValidateSlice(line)) line else error.InputIsNotValidUtf8;
|
||||||
|
} else {
|
||||||
return window[0..split];
|
return window[0..split];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// move the current scan window to the beginning of the buffer. This internal
|
// move the current scan window to the beginning of the buffer. This internal
|
||||||
// method is used by LineBuffer.
|
// method is used by LineBuffer.
|
||||||
fn rehome(self: *FixedLineBuffer) void {
|
fn rehome(self: *@This()) void {
|
||||||
if (self.window.start == 0) return;
|
if (self.window.start == 0) return;
|
||||||
|
|
||||||
const window = self.buffer[self.window.start..][0..self.window.len];
|
const window = self.buffer[self.window.start..][0..self.window.len];
|
||||||
@ -105,3 +168,4 @@ pub const FixedLineBuffer = struct {
|
|||||||
self.window.start = 0;
|
self.window.start = 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
@ -64,7 +64,7 @@ const std = @import("std");
|
|||||||
pub const buffers = @import("./linebuffer.zig");
|
pub const buffers = @import("./linebuffer.zig");
|
||||||
pub const tokenizer = @import("./tokenizer.zig");
|
pub const tokenizer = @import("./tokenizer.zig");
|
||||||
pub const parser = @import("./parser.zig");
|
pub const parser = @import("./parser.zig");
|
||||||
pub const Parser = parser.Parser;
|
pub const parseBuffer = parser.parseBuffer;
|
||||||
pub const StreamParser = parser.StreamParser;
|
pub const StreamParser = parser.StreamParser;
|
||||||
pub const Document = parser.Document;
|
pub const Document = parser.Document;
|
||||||
pub const Value = parser.Value;
|
pub const Value = parser.Value;
|
||||||
|
@ -7,9 +7,9 @@ pub const Document = @import("./parser/state.zig").Document;
|
|||||||
pub const Value = @import("./parser/value.zig").Value;
|
pub const Value = @import("./parser/value.zig").Value;
|
||||||
|
|
||||||
pub const Diagnostics = struct {
|
pub const Diagnostics = struct {
|
||||||
row: usize,
|
row: usize = 0,
|
||||||
span: struct { absolute: usize, line_offset: usize, length: usize },
|
span: struct { absolute: usize = 0, line_offset: usize = 0, length: usize = 0 } = .{},
|
||||||
message: []const u8,
|
message: []const u8 = "no problems",
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const Error = error{
|
pub const Error = error{
|
||||||
@ -42,64 +42,54 @@ pub const Options = struct {
|
|||||||
default_object: enum { string, list, map, fail } = .fail,
|
default_object: enum { string, list, map, fail } = .fail,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const Parser = struct {
|
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, options: Options) !Document {
|
||||||
allocator: std.mem.Allocator,
|
var state = State.init(allocator);
|
||||||
options: Options = .{},
|
|
||||||
diagnostics: Diagnostics = .{
|
|
||||||
.row = 0,
|
|
||||||
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
|
|
||||||
.message = "all is well",
|
|
||||||
},
|
|
||||||
|
|
||||||
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
|
|
||||||
var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
|
|
||||||
.buffer = buffers.FixedLineBuffer.init(buffer),
|
|
||||||
.diagnostics = &self.diagnostics,
|
|
||||||
};
|
|
||||||
|
|
||||||
var state = State.init(self.allocator);
|
|
||||||
defer state.deinit();
|
defer state.deinit();
|
||||||
errdefer state.document.deinit();
|
errdefer state.document.deinit();
|
||||||
|
|
||||||
// TODO: pass the diagnostics pointer as well
|
var diagnostics = Diagnostics{};
|
||||||
while (try tok.next()) |line| try state.parseLine(line, self.options.duplicate_key_behavior);
|
var tok: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
|
||||||
|
.buffer = buffers.ValidatingFixedLineBuffer.init(buffer),
|
||||||
return try state.finish(self.options);
|
.diagnostics = &diagnostics,
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
while (try tok.next()) |line| try state.parseLine(line, options.duplicate_key_behavior);
|
||||||
|
return try state.finish(options);
|
||||||
|
}
|
||||||
|
|
||||||
pub const StreamParser = struct {
|
pub const StreamParser = struct {
|
||||||
linetok: tokenizer.LineTokenizer(buffers.LineBuffer),
|
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
|
||||||
state: State,
|
parse_state: State,
|
||||||
options: Options = .{},
|
parse_options: Options = .{},
|
||||||
diagnostics: Diagnostics = .{
|
diagnostics: Diagnostics = .{},
|
||||||
.row = 0,
|
|
||||||
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
|
|
||||||
.message = "all is well",
|
|
||||||
},
|
|
||||||
|
|
||||||
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
|
pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
|
||||||
|
const diagnostics = try allocator.create(Diagnostics);
|
||||||
|
errdefer allocator.destroy(diagnostics);
|
||||||
|
diagnostics.* = Diagnostics{};
|
||||||
|
|
||||||
return .{
|
return .{
|
||||||
.linetok = .{
|
.linetok = .{
|
||||||
.buffer = try buffers.LineBuffer.init(allocator),
|
.buffer = try buffers.ValidatingLineBuffer.init(allocator),
|
||||||
.diagnostics = &@as(*StreamParser, @ptrFromInt(@returnAddress())).diagnostics,
|
.diagnostics = diagnostics,
|
||||||
},
|
},
|
||||||
.state = State.init(allocator),
|
.parse_state = State.init(allocator),
|
||||||
.options = options,
|
.parse_options = options,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deinit(self: StreamParser) void {
|
pub fn deinit(self: StreamParser) void {
|
||||||
|
self.linetok.buffer.allocator.destroy(self.linetok.diagnostics);
|
||||||
self.linetok.buffer.deinit();
|
self.linetok.buffer.deinit();
|
||||||
self.state.deinit();
|
self.parse_state.deinit();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn feed(self: *StreamParser, data: []const u8) Error!void {
|
pub fn feed(self: *StreamParser, data: []const u8) !void {
|
||||||
try self.linetok.buffer.feed(data);
|
try self.linetok.buffer.feed(data);
|
||||||
while (try self.linetok.next()) |line| try self.state.parseLine(line, self.options.duplicate_key_behavior);
|
while (try self.linetok.next()) |line| try self.parse_state.parseLine(line, self.parse_options.duplicate_key_behavior);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(self: *StreamParser) Error!Document {
|
pub fn finish(self: *StreamParser) !Document {
|
||||||
return try self.state.finish(self.options);
|
return try self.parse_state.finish(self.parse_options);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -70,8 +70,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
|
|||||||
diagnostics: *Diagnostics,
|
diagnostics: *Diagnostics,
|
||||||
row: usize = 0,
|
row: usize = 0,
|
||||||
|
|
||||||
pub fn next(self: *@This()) Error!?Line {
|
pub fn next(self: *@This()) !?Line {
|
||||||
lineloop: while (self.buffer.nextLine()) |raw_line| {
|
lineloop: while (try self.buffer.nextLine()) |raw_line| {
|
||||||
var indent: usize = 0;
|
var indent: usize = 0;
|
||||||
for (raw_line, 0..) |char, idx| {
|
for (raw_line, 0..) |char, idx| {
|
||||||
switch (char) {
|
switch (char) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user