nice-data/src/linebuffer.zig

172 lines
6.1 KiB
Zig
Raw Normal View History

const std = @import("std");
/// A region of some backing slice described by index rather than by pointer:
/// the offset of its first element and the number of elements it spans.
pub const IndexSlice = struct {
    /// Offset of the first element of the region within the backing slice.
    start: usize,
    /// Number of elements in the region.
    len: usize,
};
/// Errors related to validating input while splitting it into lines.
///
/// The `Illegal*` members and `InputIsNotValidUtf8` are the error values that
/// `FixedLineBuffer.nextLine` (and, through it, `LineBuffer.nextLine`) actually
/// returns. The unprefixed members predate that naming and are kept only so
/// that code referring to them through this set continues to compile; nothing
/// in this file returns them.
pub const Error = error{
    /// Legacy name; see `IllegalCarriageReturn`.
    CarriageReturn,
    /// Legacy name; see `IllegalTrailingSpace`.
    TrailingWhitespace,
    /// Legacy name; see `IllegalNonprintingAscii`.
    NonprintingAscii,
    /// A `'\r'` byte was found while `Strictness.check_carriage_return` is on.
    IllegalCarriageReturn,
    /// A space or tab directly precedes a newline while
    /// `Strictness.check_trailing_whitespace` is on.
    IllegalTrailingSpace,
    /// A byte below `' '` other than newline or tab, or the byte 0x7F, was
    /// found while `Strictness.check_nonprinting_ascii` is on.
    IllegalNonprintingAscii,
    /// A line failed UTF-8 validation while `Strictness.validate_utf8` is on.
    InputIsNotValidUtf8,
};
/// Compile-time switches selecting which checks `FixedLineBuffer.nextLine`
/// performs while scanning for the next newline.
pub const Strictness = struct {
    /// Reject any `'\r'` byte encountered during the scan.
    check_carriage_return: bool = true,
    /// Reject a newline that is immediately preceded by a space or a tab.
    check_trailing_whitespace: bool = true,
    /// Reject bytes below `' '` (other than newline and tab) and the byte 0x7F.
    check_nonprinting_ascii: bool = true,
    /// Run `std.unicode.utf8ValidateSlice` on each line before returning it.
    validate_utf8: bool = false,
};
// Option presets shared by the growable and fixed-size aliases below.

// Every default check, plus UTF-8 validation of each returned line.
const validating_options: Strictness = .{ .validate_utf8 = true };

// The `Strictness` defaults, unchanged.
const strict_options: Strictness = .{};

// Every check switched off.
const sloppy_options: Strictness = .{
    .check_carriage_return = false,
    .check_trailing_whitespace = false,
    .check_nonprinting_ascii = false,
    .validate_utf8 = false,
};

/// Growable line buffer using the default checks plus UTF-8 validation.
pub const ValidatingLineBuffer = LineBuffer(validating_options);

/// Growable line buffer using the default `Strictness` settings.
pub const StrictLineBuffer = LineBuffer(strict_options);

/// Growable line buffer with every check disabled.
pub const SloppyLineBuffer = LineBuffer(sloppy_options);

/// Fixed line buffer using the default checks plus UTF-8 validation.
pub const ValidatingFixedLineBuffer = FixedLineBuffer(validating_options);

/// Fixed line buffer using the default `Strictness` settings.
pub const StrictFixedLineBuffer = FixedLineBuffer(strict_options);

/// Fixed line buffer with every check disabled.
pub const SloppyFixedLineBuffer = FixedLineBuffer(sloppy_options);
/// Returns a line-splitting buffer type that owns its storage and grows it on
/// demand. Input is supplied incrementally with `feed`; complete lines are
/// pulled out with `nextLine`. Scanning and validation are delegated to an
/// embedded `FixedLineBuffer(options)`.
pub fn LineBuffer(comptime options: Strictness) type {
    return struct {
        /// Allocator used to allocate, grow and free the backing storage.
        allocator: std.mem.Allocator,
        /// The scanner; its `buffer` field is the storage owned by this struct.
        internal: FixedLineBuffer(options),
        /// Offset into the storage at which the next fed bytes are written.
        used: usize,

        const Self = @This();

        /// Capacity, in bytes, used by `init`.
        pub const default_capacity: usize = 4096;

        /// Creates an empty buffer with `default_capacity` bytes of storage.
        pub fn init(allocator: std.mem.Allocator) !Self {
            return initCapacity(allocator, default_capacity);
        }

        /// Creates an empty buffer with `capacity` bytes of storage. Fails if
        /// the allocation fails. Release the result with `deinit`.
        pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !Self {
            const storage = try allocator.alloc(u8, capacity);
            return .{
                .allocator = allocator,
                .internal = .{
                    .buffer = storage,
                    .window = .{ .start = 0, .len = 0 },
                },
                .used = 0,
            };
        }

        /// Frees the backing storage.
        pub fn deinit(self: Self) void {
            self.allocator.free(self.internal.buffer);
        }

        /// Appends `data` to the bytes that have not yet been returned as
        /// lines. Feeding an empty slice does nothing. May reallocate the
        /// storage, in which case an allocation failure is returned and the
        /// buffer is left unchanged.
        pub fn feed(self: *Self, data: []const u8) !void {
            if (data.len == 0) return;

            // TODO: check for usize overflow here if we want Maximum Robustness
            const grown_len = self.internal.window.len + data.len;
            const capacity = self.internal.buffer.len;

            if (grown_len > capacity) {
                // The pending bytes plus `data` exceed the storage: grow it to
                // exactly the required size, then slide the pending bytes to
                // the front so `data` fits behind them.
                // TODO: adopt an overallocation strategy? Will potentially avoid allocating
                // on every invocation but will cause the buffer to oversize
                self.internal.buffer = try self.allocator.realloc(
                    @constCast(self.internal.buffer),
                    grown_len,
                );
                self.rehome();
            } else if (self.internal.window.start + grown_len > capacity) {
                // Enough total room, but not behind the window's current
                // position: reclaim the already-consumed prefix.
                self.rehome();
            }
            // Otherwise `data` can simply be appended where it is.

            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);
            self.used += data.len;
            self.internal.window.len = grown_len;
        }

        /// The memory returned by this function is valid until the next call to `feed`.
        /// The resulting slice does not include the newline character.
        pub fn nextLine(self: *Self) !?[]const u8 {
            return self.internal.nextLine();
        }

        // Slides the pending bytes to the front of the storage and moves the
        // write offset to just past them.
        fn rehome(self: *Self) void {
            self.internal.rehome();
            self.used = self.internal.window.len;
        }
    };
}
/// Returns a line-splitting view type over a caller-provided byte slice.
/// `options` selects, at compile time, which checks `nextLine` performs.
pub fn FixedLineBuffer(comptime options: Strictness) type {
    return struct {
        /// The bytes being scanned.
        buffer: []const u8,
        /// The part of `buffer` that has not yet been returned as lines.
        window: IndexSlice,

        const Self = @This();

        /// Creates a buffer whose scan window covers all of `data`.
        pub fn init(data: []const u8) Self {
            return .{
                .buffer = data,
                .window = .{ .start = 0, .len = data.len },
            };
        }

        /// Returns the next newline-terminated line, without its newline, and
        /// advances the window past it. Returns null when the window is empty
        /// or contains no newline. The enabled checks are applied to every
        /// byte scanned up to and including the newline; a failed check
        /// returns its error without advancing the window. UTF-8 validation,
        /// when enabled, runs after the window has been advanced.
        pub fn nextLine(self: *Self) !?[]const u8 {
            if (self.window.len == 0 or self.window.start >= self.buffer.len)
                return null;

            const pending = self.buffer[self.window.start..][0..self.window.len];

            const newline_idx: usize = scan: {
                for (pending, 0..) |byte, idx| {
                    if (comptime options.check_carriage_return) {
                        if (byte == '\r') return error.IllegalCarriageReturn;
                    }

                    if (comptime options.check_nonprinting_ascii) {
                        const is_control = byte < ' ' or byte == 0x7F;
                        if (is_control and byte != '\n' and byte != '\t')
                            return error.IllegalNonprintingAscii;
                    }

                    if (byte != '\n') continue;

                    if (comptime options.check_trailing_whitespace) {
                        if (idx > 0) {
                            const previous = pending[idx - 1];
                            if (previous == ' ' or previous == '\t')
                                return error.IllegalTrailingSpace;
                        }
                    }
                    break :scan idx;
                }
                // No newline in the window: there is no complete line yet.
                return null;
            };

            // Consume the line together with its terminating newline.
            const consumed = newline_idx + 1;
            self.window.start += consumed;
            self.window.len -= consumed;

            const line = pending[0..newline_idx];
            if (comptime options.validate_utf8) {
                if (!std.unicode.utf8ValidateSlice(line))
                    return error.InputIsNotValidUtf8;
            }
            return line;
        }

        // Moves the current scan window to the beginning of the buffer. This
        // internal method is used by LineBuffer, which owns writable storage;
        // that is why casting away const is acceptable here.
        fn rehome(self: *Self) void {
            const start = self.window.start;
            if (start == 0) return;

            const len = self.window.len;
            const live = self.buffer[start..][0..len];
            const dest = @constCast(self.buffer);

            if (len > start) {
                // Source and destination overlap, which @memcpy does not
                // permit; copy front to back instead.
                std.mem.copyForwards(u8, dest, live);
            } else {
                @memcpy(dest.ptr, live);
            }
            self.window.start = 0;
        }
    };
}