2023-10-22 16:16:57 -07:00
|
|
|
// Copyright 2023 torque@epicyclic.dev
|
|
|
|
//
|
|
|
|
// Licensed under the MIT/Expat license. You may not use this file except in
|
|
|
|
// compliance with the license. You may obtain a copy of the license at
|
|
|
|
//
|
|
|
|
// https://spdx.org/licenses/MIT.html
|
|
|
|
//
|
|
|
|
// This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
|
|
|
|
// CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
2023-09-24 18:22:12 -07:00
|
|
|
const std = @import("std");
|
|
|
|
|
2023-09-27 23:44:06 -07:00
|
|
|
const Diagnostics = @import("./parser.zig").Diagnostics;
|
|
|
|
|
2023-09-24 18:22:12 -07:00
|
|
|
/// A region of a buffer described by indices instead of pointers, so that it
/// stays meaningful when the underlying buffer is reallocated or its contents
/// are moved.
pub const IndexSlice = struct {
    /// Index of the first byte of the region.
    start: usize,
    /// Number of bytes in the region.
    len: usize,
};
|
|
|
|
|
2023-09-26 00:06:39 -07:00
|
|
|
/// Errors related to line validation.
///
/// `FixedLineBuffer.nextLine` reports failed strictness checks using the
/// `Illegal*` names (`IllegalCarriageReturn`, `IllegalTrailingSpace`,
/// `IllegalNonprintingAscii`) and reports failed UTF-8 validation as
/// `InputIsNotValidUtf8`. Those names were previously missing from this set,
/// so a line buffer error could not be coerced to `Error`. The shorter
/// `CarriageReturn`, `TrailingWhitespace` and `NonprintingAscii` names are
/// kept so that any existing reference to them continues to compile.
pub const Error = error{
    CarriageReturn,
    TrailingWhitespace,
    NonprintingAscii,
    InputIsNotValidUtf8,
    IllegalCarriageReturn,
    IllegalTrailingSpace,
    IllegalNonprintingAscii,
};
|
|
|
|
|
|
|
|
/// Compile-time switches selecting which checks a line buffer performs while
/// it scans for the next line.
pub const Strictness = struct {
    /// Fail when a carriage return (`'\r'`) is encountered.
    check_carriage_return: bool = true,

    /// Fail when the newline ending a line is immediately preceded by a space
    /// or a tab.
    check_trailing_whitespace: bool = true,

    /// Fail on bytes below `' '` other than `'\n'` and `'\t'`, and on `0x7F`.
    check_nonprinting_ascii: bool = true,

    /// Validate each extracted line as UTF-8 before returning it.
    validate_utf8: bool = false,
};
|
|
|
|
|
|
|
|
// Option presets shared by the growable and the fixed buffer aliases below.

/// Every default check plus UTF-8 validation.
const validating_options: Strictness = .{
    .validate_utf8 = true,
};

/// The default checks only.
const strict_options: Strictness = .{};

/// No checks at all: lines are only split on newlines.
const sloppy_options: Strictness = .{
    .check_carriage_return = false,
    .check_trailing_whitespace = false,
    .check_nonprinting_ascii = false,
    .validate_utf8 = false,
};

/// Growable line buffer that performs the default checks and validates UTF-8.
pub const ValidatingLineBuffer = LineBuffer(validating_options);
/// Growable line buffer that performs the default checks.
pub const StrictLineBuffer = LineBuffer(strict_options);
/// Growable line buffer that performs no checks.
pub const SloppyLineBuffer = LineBuffer(sloppy_options);

/// Fixed line buffer that performs the default checks and validates UTF-8.
pub const ValidatingFixedLineBuffer = FixedLineBuffer(validating_options);
/// Fixed line buffer that performs the default checks.
pub const StrictFixedLineBuffer = FixedLineBuffer(strict_options);
/// Fixed line buffer that performs no checks.
pub const SloppyFixedLineBuffer = FixedLineBuffer(sloppy_options);
|
|
|
|
|
|
|
|
/// Returns a line buffer type that owns a heap-allocated buffer which grows
/// as data is fed into it. Line scanning and the checks selected by `options`
/// are delegated to an embedded `FixedLineBuffer(options)`.
pub fn LineBuffer(comptime options: Strictness) type {
    return struct {
        /// Allocator that owns `internal.buffer`.
        allocator: std.mem.Allocator,
        /// Scanner operating on the owned buffer.
        internal: FixedLineBuffer(options),
        /// Index one past the last byte of fed data in `internal.buffer`.
        used: usize,

        const Self = @This();

        /// Buffer size, in bytes, allocated by `init`.
        pub const default_capacity: usize = 4096;

        /// Creates a buffer with `default_capacity` bytes of storage.
        pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) !Self {
            return Self.initCapacity(allocator, diagnostics, default_capacity);
        }

        /// Creates a buffer with `capacity` bytes of storage. Fails only if
        /// the allocation fails. Release the storage with `deinit`.
        pub fn initCapacity(allocator: std.mem.Allocator, diagnostics: *Diagnostics, capacity: usize) !Self {
            const storage = try allocator.alloc(u8, capacity);

            return Self{
                .allocator = allocator,
                .internal = .{
                    .diagnostics = diagnostics,
                    .buffer = storage,
                    // nothing has been fed yet, so the scan window is empty
                    .window = .{ .start = 0, .len = 0 },
                },
                .used = 0,
            };
        }

        /// Returns the diagnostics object this buffer reports into.
        pub fn diag(self: Self) *Diagnostics {
            return self.internal.diagnostics;
        }

        /// Returns true when there is no unconsumed data in the buffer.
        pub fn empty(self: Self) bool {
            return self.internal.empty();
        }

        /// Frees the owned buffer.
        pub fn deinit(self: Self) void {
            self.allocator.free(self.internal.buffer);
        }

        /// Appends `data` after the unconsumed bytes, first compacting or
        /// growing the buffer when there is not enough room behind them.
        /// Feeding an empty slice is a no-op. Fails only if growing the buffer
        /// fails.
        pub fn feed(self: *Self, data: []const u8) !void {
            if (data.len == 0) return;

            // TODO: check for usize overflow here if we want Maximum Robustness
            const combined_len = self.internal.window.len + data.len;
            const capacity = self.internal.buffer.len;

            if (combined_len > capacity) {
                // The unconsumed bytes plus the new data cannot fit, so the
                // buffer has to grow.
                // TODO: adopt an overallocation strategy? Will potentially
                // avoid allocating on every invocation but will cause the
                // buffer to oversize
                self.internal.buffer = try self.allocator.realloc(@constCast(self.internal.buffer), combined_len);
                self.rehome();
            } else if (self.internal.window.start + combined_len > capacity) {
                // Everything fits, but only once the unconsumed bytes have
                // been moved to the front of the buffer.
                self.rehome();
            }
            // Otherwise the data can simply be appended where it is.

            @memcpy(@constCast(self.internal.buffer[self.used..].ptr), data);

            self.used += data.len;
            self.internal.window.len = combined_len;
        }

        /// The memory returned by this function is valid until the next call to `feed`.
        /// The resulting slice does not include the newline character.
        pub fn nextLine(self: *Self) !?[]const u8 {
            return self.internal.nextLine();
        }

        // Moves the unconsumed bytes to the start of the buffer and updates
        // `used` to point just past them.
        fn rehome(self: *Self) void {
            self.internal.rehome();
            self.used = self.internal.window.len;
        }
    };
}
|
2023-09-24 18:22:12 -07:00
|
|
|
|
2023-09-26 00:06:39 -07:00
|
|
|
/// Returns a line buffer type that scans a caller-provided byte slice for
/// newline-terminated lines, applying the checks selected by `options`. It
/// never allocates.
pub fn FixedLineBuffer(comptime options: Strictness) type {
    return struct {
        /// The memory being scanned.
        buffer: []const u8,
        /// The not-yet-consumed region of `buffer`.
        window: IndexSlice,
        /// Receives the row and error details as lines are scanned.
        diagnostics: *Diagnostics,

        const Self = @This();

        /// Creates a buffer whose scan window covers all of `data`.
        pub fn init(data: []const u8, diagnostics: *Diagnostics) Self {
            return Self{
                .buffer = data,
                .window = IndexSlice{ .start = 0, .len = data.len },
                .diagnostics = diagnostics,
            };
        }

        /// Returns the diagnostics object this buffer reports into.
        pub fn diag(self: Self) *Diagnostics {
            return self.diagnostics;
        }

        /// Returns true when there is no unconsumed data left.
        pub fn empty(self: Self) bool {
            // we can't check the overall buffer size because the dynamic
            // buffer may be overallocated
            return self.window.len == 0;
        }

        /// Returns the next newline-terminated line without its newline, or
        /// null when the scan window holds no complete line. On success the
        /// line is consumed and the diagnostics row is advanced.
        ///
        /// When a scan check (carriage return, nonprinting ASCII, trailing
        /// whitespace) fails, the diagnostics are filled in and the window is
        /// left untouched. UTF-8 validation runs after the line has been
        /// consumed.
        pub fn nextLine(self: *Self) !?[]const u8 {
            const exhausted = self.window.start >= self.buffer.len or self.window.len == 0;
            if (exhausted) return null;

            const window = self.buffer[self.window.start..][0..self.window.len];

            // Find the index of the terminating newline, checking every byte
            // on the way there.
            const split: usize = for (window, 0..) |byte, offset| {
                if (comptime options.check_carriage_return) {
                    if (byte == '\r') {
                        self.diagnostics.row += 1;
                        self.diagnostics.line_offset = offset;
                        self.diagnostics.length = 1;
                        self.diagnostics.message = "found a carriage return";
                        return error.IllegalCarriageReturn;
                    }
                }

                if (comptime options.check_nonprinting_ascii) {
                    const is_control = byte < ' ' or byte == 0x7F;
                    // newlines and tabs are the only permitted control bytes
                    if (is_control and byte != '\n' and byte != '\t') {
                        self.diagnostics.row += 1;
                        self.diagnostics.line_offset = offset;
                        self.diagnostics.length = 1;
                        self.diagnostics.message = "found nonprinting ascii characters";
                        return error.IllegalNonprintingAscii;
                    }
                }

                if (byte != '\n') continue;

                if (comptime options.check_trailing_whitespace) {
                    if (offset > 0) {
                        const previous = window[offset - 1];
                        if (previous == ' ' or previous == '\t') {
                            self.diagnostics.row += 1;
                            self.diagnostics.line_offset = offset;
                            self.diagnostics.length = 1;
                            self.diagnostics.message = "found trailing spaces";
                            return error.IllegalTrailingSpace;
                        }
                    }
                }

                break offset;
            } else return null;

            self.diagnostics.row += 1;
            self.diagnostics.line_offset = 0;

            // consume the line together with its newline
            const consumed = split + 1;
            self.window.start += consumed;
            self.window.len -= consumed;

            const line = window[0..split];

            if (comptime options.validate_utf8) {
                var cursor: usize = 0;
                while (cursor < line.len) {
                    const sequence_len = std.unicode.utf8ByteSequenceLength(line[cursor]) catch {
                        self.diagnostics.line_offset = cursor;
                        self.diagnostics.length = 1;
                        self.diagnostics.message = "invalid UTF-8 sequence start byte";
                        return error.InputIsNotValidUtf8;
                    };

                    const sequence_end = cursor + sequence_len;
                    if (sequence_end > line.len) {
                        self.diagnostics.line_offset = cursor;
                        self.diagnostics.length = sequence_len;
                        self.diagnostics.message = "truncated UTF-8 sequence";
                        return error.InputIsNotValidUtf8;
                    }

                    _ = std.unicode.utf8Decode(line[cursor..sequence_end]) catch {
                        self.diagnostics.line_offset = cursor;
                        self.diagnostics.length = sequence_len;
                        self.diagnostics.message = "invalid UTF-8 sequence";
                        return error.InputIsNotValidUtf8;
                    };

                    cursor = sequence_end;
                }
            }

            return line;
        }

        // Move the current scan window to the beginning of the buffer. This
        // internal method is used by LineBuffer.
        fn rehome(self: *Self) void {
            const start = self.window.start;
            if (start == 0) return;

            const live = self.buffer[start..][0..self.window.len];

            // If the window is longer than its starting index, the source and
            // destination regions overlap, so memcpy cannot be used.
            const overlapping = live.len > start;
            if (overlapping) {
                std.mem.copyForwards(u8, @constCast(self.buffer), live);
            } else {
                @memcpy(@constCast(self.buffer.ptr), live);
            }

            self.window.start = 0;
        }
    };
}
|