// nice-data/src/linebuffer.zig
//
// 254 lines
// 9.5 KiB
// Zig

// Copyright 2023 torque@epicyclic.dev
//
// Licensed under the MIT/Expat license. You may not use this file except in
// compliance with the license. You may obtain a copy of the license at
//
// https://spdx.org/licenses/MIT.html
//
// This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied.
const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
/// A span described by a starting index and a length rather than by pointers.
/// Used as the scan window into a line buffer's backing storage.
pub const IndexSlice = struct {
    /// Index of the first byte covered by the span.
    start: usize,
    /// Number of bytes covered by the span.
    len: usize,
};
/// Errors associated with line validation.
///
/// NOTE(review): the `nextLine` implementation in this file returns
/// `error.IllegalCarriageReturn`, `error.IllegalNonprintingAscii` and
/// `error.IllegalTrailingSpace`, none of which are members of this set; only
/// `InputIsNotValidUtf8` is shared. Confirm which naming is intended before
/// relying on this set to describe `nextLine` failures.
pub const Error = error{
    CarriageReturn,
    TrailingWhitespace,
    NonprintingAscii,
    InputIsNotValidUtf8,
};
/// Compile-time selection of the checks a line buffer performs while scanning.
pub const Strictness = struct {
    /// Reject input that contains a carriage return (`\r`).
    check_carriage_return: bool = true,
    /// Reject a space or tab that immediately precedes a newline.
    check_trailing_whitespace: bool = true,
    /// Reject bytes below 0x20 other than newline and tab, and the byte 0x7F.
    check_nonprinting_ascii: bool = true,
    /// Validate every returned line as UTF-8.
    validate_utf8: bool = false,
};
/// Growable line buffer with the default checks plus UTF-8 validation.
pub const ValidatingLineBuffer = LineBuffer(.{ .validate_utf8 = true });

/// Growable line buffer using the default `Strictness` settings.
pub const StrictLineBuffer = LineBuffer(.{});

/// Growable line buffer with every check switched off.
pub const SloppyLineBuffer = LineBuffer(.{
    .validate_utf8 = false,
    .check_nonprinting_ascii = false,
    .check_trailing_whitespace = false,
    .check_carriage_return = false,
});
/// Fixed-storage line buffer with the default checks plus UTF-8 validation.
pub const ValidatingFixedLineBuffer = FixedLineBuffer(.{ .validate_utf8 = true });

/// Fixed-storage line buffer using the default `Strictness` settings.
pub const StrictFixedLineBuffer = FixedLineBuffer(.{});

/// Fixed-storage line buffer with every check switched off.
pub const SloppyFixedLineBuffer = FixedLineBuffer(.{
    .validate_utf8 = false,
    .check_nonprinting_ascii = false,
    .check_trailing_whitespace = false,
    .check_carriage_return = false,
});
/// Build a growable line buffer type that performs the checks chosen by
/// `options`.
///
/// The returned type owns a heap buffer obtained from `allocator`, accumulates
/// the bytes passed to `feed`, and hands out lines through `nextLine` by
/// delegating to an embedded `FixedLineBuffer(options)`.
pub fn LineBuffer(comptime options: Strictness) type {
    return struct {
        /// Allocator that owns `internal.buffer`.
        allocator: std.mem.Allocator,
        /// Fixed-buffer scanner operating on the heap storage.
        internal: FixedLineBuffer(options),
        /// Offset in the buffer at which the next fed bytes are written.
        used: usize,

        const Self = @This();

        /// Number of bytes allocated by `init`.
        pub const default_capacity: usize = 4096;

        /// Create an empty buffer with `default_capacity` bytes of storage.
        /// Fails if the allocation fails.
        pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !Self {
            return initCapacity(allocator, diagnostics, default_capacity);
        }

        /// Create an empty buffer with `capacity` bytes of storage. Fails if
        /// the allocation fails. Release the storage with `deinit`.
        pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !Self {
            const storage = try allocator.alloc(u8, capacity);
            return .{
                .allocator = allocator,
                .internal = .{
                    .buffer = storage,
                    .window = .{ .start = 0, .len = 0 },
                    .diagnostics = diagnostics,
                },
                .used = 0,
            };
        }

        /// The diagnostics object supplied at construction.
        pub fn diag(self: Self) *Diagnostics {
            return self.internal.diagnostics;
        }

        /// True when the scan window holds no bytes.
        pub fn empty(self: Self) bool {
            return self.internal.empty();
        }

        /// Free the backing storage.
        pub fn deinit(self: Self) void {
            self.allocator.free(self.internal.buffer);
        }

        /// Append `data` to the bytes awaiting line extraction. Feeding an
        /// empty slice is a no-op. The storage is reallocated when the pending
        /// bytes plus `data` exceed its size, in which case the call fails if
        /// the reallocation fails.
        pub fn feed(self: *Self, data: []const u8) !void {
            if (data.len == 0) return;

            const buf = &self.internal;
            // TODO: check for usize overflow here if we want Maximum Robustness
            const grown_len = buf.window.len + data.len;

            if (grown_len > buf.buffer.len) {
                // data cannot fit in the buffer with our scan window, so we
                // have to realloc
                // TODO: adopt an overallocation strategy? Will potentially
                // avoid allocating on every invocation but will cause the
                // buffer to oversize
                buf.buffer = try self.allocator.realloc(@constCast(buf.buffer), grown_len);
                self.rehome();
            } else if (buf.window.start + grown_len > buf.buffer.len) {
                // data will fit, but needs to be moved in the buffer
                self.rehome();
            }
            // otherwise the data can simply be appended

            @memcpy(@constCast(buf.buffer[self.used..].ptr), data);
            self.used += data.len;
            buf.window.len = grown_len;
        }

        /// The memory returned by this function is valid until the next call
        /// to `feed`. The resulting slice does not include the newline
        /// character.
        pub fn nextLine(self: *Self) !?[]const u8 {
            return self.internal.nextLine();
        }

        // Slide the pending bytes to the front of the storage and point the
        // write offset just past them.
        fn rehome(self: *Self) void {
            self.internal.rehome();
            self.used = self.internal.window.len;
        }
    };
}
/// Build a line buffer type over a caller-provided byte slice that performs
/// the checks chosen by `options`.
///
/// The type tracks a scan window into `buffer` and hands out one
/// newline-terminated line per `nextLine` call. Details of a failed check are
/// written to the `Diagnostics` pointer supplied at `init`.
pub fn FixedLineBuffer(comptime options: Strictness) type {
    return struct {
        /// Backing bytes. Declared const, but `rehome` writes through it via
        /// `@constCast`.
        buffer: []const u8,
        /// The part of `buffer` that has not yet been returned as lines.
        window: IndexSlice,
        /// Receives row, offset, length and message when a check fails.
        diagnostics: *Diagnostics,

        const Self = @This();

        /// Wrap `data`; the scan window initially covers all of it.
        pub fn init(data: []const u8, diagnostics: *Diagnostics) Self {
            return .{
                .buffer = data,
                .window = .{ .start = 0, .len = data.len },
                .diagnostics = diagnostics,
            };
        }

        /// The diagnostics object supplied at construction.
        pub fn diag(self: Self) *Diagnostics {
            return self.diagnostics;
        }

        /// True when the scan window holds no bytes.
        pub fn empty(self: Self) bool {
            // we can't check the overall buffer size because the dynamic
            // buffer may be overallocated
            return self.window.len == 0;
        }

        /// Return the next line, without its terminating newline, as a slice
        /// into `buffer`. Returns `null` when the window is empty or contains
        /// no newline, so bytes after the last newline are never returned.
        ///
        /// Depending on `options`, fails with `error.IllegalCarriageReturn`,
        /// `error.IllegalNonprintingAscii`, `error.IllegalTrailingSpace` or
        /// `error.InputIsNotValidUtf8`. The first three are raised while
        /// scanning and leave the window untouched; the UTF-8 error is raised
        /// after the window has already advanced past the offending line.
        pub fn nextLine(self: *Self) !?[]const u8 {
            if (self.window.len == 0 or self.window.start >= self.buffer.len)
                return null;

            const report = self.diagnostics;
            const window = self.buffer[self.window.start..][0..self.window.len];

            // locate the newline that ends the current line, validating each
            // byte on the way
            const split: usize = split: {
                for (window, 0..) |byte, pos| {
                    if (comptime options.check_carriage_return) {
                        if (byte == '\r') {
                            report.row += 1;
                            report.line_offset = pos;
                            report.length = 1;
                            report.message = "found a carriage return";
                            return error.IllegalCarriageReturn;
                        }
                    }

                    if (comptime options.check_nonprinting_ascii) {
                        const is_control = byte < ' ' or byte == 0x7F;
                        if (is_control and byte != '\n' and byte != '\t') {
                            report.row += 1;
                            report.line_offset = pos;
                            report.length = 1;
                            report.message = "found nonprinting ascii characters";
                            return error.IllegalNonprintingAscii;
                        }
                    }

                    if (byte != '\n') continue;

                    if (comptime options.check_trailing_whitespace) {
                        if (pos > 0) {
                            const prev = window[pos - 1];
                            if (prev == ' ' or prev == '\t') {
                                report.row += 1;
                                report.line_offset = pos;
                                report.length = 1;
                                report.message = "found trailing spaces";
                                return error.IllegalTrailingSpace;
                            }
                        }
                    }
                    break :split pos;
                }
                // no newline in the window yet
                return null;
            };

            // a full line was found: count it and consume it together with
            // its newline before any further validation
            report.row += 1;
            report.line_offset = 0;
            self.window.start += split + 1;
            self.window.len -= split + 1;

            const line = window[0..split];
            if (comptime options.validate_utf8) {
                var cursor: usize = 0;
                while (cursor < line.len) {
                    const cp_len = std.unicode.utf8ByteSequenceLength(line[cursor]) catch {
                        report.line_offset = cursor;
                        report.length = 1;
                        report.message = "invalid UTF-8 sequence start byte";
                        return error.InputIsNotValidUtf8;
                    };
                    if (cursor + cp_len > line.len) {
                        report.line_offset = cursor;
                        report.length = cp_len;
                        report.message = "truncated UTF-8 sequence";
                        return error.InputIsNotValidUtf8;
                    }
                    _ = std.unicode.utf8Decode(line[cursor..][0..cp_len]) catch {
                        report.line_offset = cursor;
                        report.length = cp_len;
                        report.message = "invalid UTF-8 sequence";
                        return error.InputIsNotValidUtf8;
                    };
                    cursor += cp_len;
                }
            }
            return line;
        }

        // move the current scan window to the beginning of the buffer. This
        // internal method is used by LineBuffer.
        fn rehome(self: *Self) void {
            const start = self.window.start;
            if (start == 0) return;

            const live = self.buffer[start..][0..self.window.len];
            const dest = @constCast(self.buffer);
            // if the window is longer than its starting index, the memory
            // move will be overlapping, so we can't use memcpy
            if (live.len > start) {
                std.mem.copyForwards(u8, dest, live);
            } else {
                @memcpy(dest.ptr, live);
            }
            self.window.start = 0;
        }
    };
}