config: refactor LineTokenizer to use an internal line buffer
The goal here is to support a streaming parser. However, I did decide to leave the flow item parser state machine fully buffered (i.e. not streaming). This is not JSON, and in general documents should consist of many short lines, so this buffering strategy should work reasonably well. I have not actually tried the streaming implementation of this yet.
This commit is contained in:
parent
ab580fa80a
commit
6415571d01
202
src/config.zig
202
src/config.zig
@ -63,29 +63,114 @@
|
||||
|
||||
const std = @import("std");
|
||||
|
||||
/// A region of some backing buffer described by position rather than by
/// pointer: `len` bytes beginning at index `start`.
pub const IndexSlice = struct {
    start: usize,
    len: usize,
};
|
||||
|
||||
/// Location and message describing a problem found in a document.
/// NOTE(review): nothing visible here populates these fields; the per-field
/// notes below are inferred from the names and should be confirmed.
pub const Diagnostics = struct {
    /// Presumably the row (line number) the diagnostic refers to.
    row: usize,
    /// Presumably the offending span: an absolute offset into the document,
    /// an offset within the line, and a length.
    span: struct {
        absolute: usize,
        line_offset: usize,
        length: usize,
    },
    /// Human-readable description of the problem.
    message: []const u8,
};
|
||||
|
||||
pub const LineTokenizer = struct {
|
||||
/// A growable byte buffer that accepts input in arbitrary chunks via `feed`
/// and hands back complete, newline-terminated lines via `nextLine`.
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    /// Backing storage. Its length is the current capacity.
    buffer: []u8,
    /// Number of bytes of `buffer` in use, counted from index 0. Always equal
    /// to `window.start + window.len`.
    used: usize,
    /// The part of `buffer` that has been fed but not yet returned by `nextLine`.
    window: IndexSlice,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    /// Creates a buffer with `default_capacity` bytes of storage.
    /// Release it with `deinit`.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    /// Creates a buffer with `capacity` bytes of storage.
    /// Release it with `deinit`.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Frees the backing storage. The buffer must not be used afterwards.
    pub fn deinit(self: *LineBuffer) void {
        self.allocator.free(self.buffer);
        self.* = undefined;
    }

    /// Appends `data` to the unconsumed window, compacting and/or growing the
    /// backing storage as required. Slices previously returned by `nextLine`
    /// are invalidated by this call.
    ///
    /// Returns `error.OutOfMemory` if the storage cannot be grown, or if the
    /// combined length would overflow `usize`.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;

        // A window whose length cannot be represented can never be allocated,
        // so report overflow as an allocation failure.
        const new_window_len = std.math.add(usize, self.window.len, data.len) catch
            return error.OutOfMemory;

        if (new_window_len > self.buffer.len) {
            // The unconsumed window plus the new data exceeds the capacity:
            // grow the storage, then slide the window to the front.
            // TODO: adopt an overallocation strategy? Will potentially avoid
            // allocating on every invocation but will cause the buffer to oversize
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
            self.rehome();
        } else if (new_window_len > self.buffer.len - self.window.start) {
            // Enough capacity overall, but not behind the window's current
            // position: reclaim the already-consumed prefix.
            self.rehome();
        }

        // In every case there is now room for `data` directly after `used`.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }

    /// Returns the next complete line, or null if the window does not (yet)
    /// contain a newline.
    ///
    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        // Consume the line together with its terminating newline.
        self.window.start += split + 1;
        self.window.len -= split + 1;

        return window[0..split];
    }

    /// Moves the unconsumed window to the start of the buffer so that the
    /// space occupied by already-consumed bytes can be reused.
    fn rehome(self: *LineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];

        // Source and destination overlap exactly when the window is longer
        // than the gap in front of it; @memcpy requires non-overlapping
        // operands, so use a forward copy in that case.
        if (self.window.len > self.window.start)
            std.mem.copyForwards(u8, self.buffer, window)
        else
            @memcpy(self.buffer[0..window.len], window);

        self.window.start = 0;
        self.used = window.len;
    }
};
|
||||
|
||||
pub const FixedLineBuffer = struct {
|
||||
buffer: []const u8,
|
||||
index: usize = 0,
|
||||
indentation: IndentationType = .immaterial,
|
||||
last_indent: usize = 0,
|
||||
diagnostics: *Diagnostics,
|
||||
window: IndexSlice,
|
||||
|
||||
row: usize = 0,
|
||||
/// Wraps `data` without copying it; the scan window initially spans all of it.
pub fn init(data: []const u8) FixedLineBuffer {
    const whole: IndexSlice = .{ .start = 0, .len = data.len };
    return .{ .buffer = data, .window = whole };
}
|
||||
|
||||
const Error = error{
|
||||
BadToken,
|
||||
MixedIndentation,
|
||||
UnquantizedIndentation,
|
||||
TooMuchIndentation,
|
||||
MissingNewline,
|
||||
TrailingWhitespace,
|
||||
Impossible,
|
||||
/// Returns the next line of the buffer without its terminating newline, or
/// null once no newline remains in the unconsumed window. Trailing text that
/// is not newline-terminated is therefore never yielded.
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
    const exhausted = self.window.start >= self.buffer.len or self.window.len == 0;
    if (exhausted) return null;

    const remaining = self.buffer[self.window.start..][0..self.window.len];

    if (std.mem.indexOfScalar(u8, remaining, '\n')) |newline_at| {
        // Step past the line and the newline that ends it.
        const consumed = newline_at + 1;
        self.window.start += consumed;
        self.window.len -= consumed;
        return remaining[0..newline_at];
    }

    return null;
}
|
||||
};
|
||||
|
||||
const IndentationType = union(enum) {
|
||||
@ -148,13 +233,29 @@ pub const LineTokenizer = struct {
|
||||
raw: []const u8,
|
||||
};
|
||||
|
||||
pub fn next(self: *LineTokenizer) Error!?Line {
|
||||
if (self.index == self.buffer.len) return null;
|
||||
pub fn LineTokenizer(comptime Buffer: type) type {
|
||||
return struct {
|
||||
buffer: Buffer,
|
||||
index: usize = 0,
|
||||
indentation: IndentationType = .immaterial,
|
||||
last_indent: usize = 0,
|
||||
diagnostics: *Diagnostics,
|
||||
row: usize = 0,
|
||||
|
||||
const Error = error{
|
||||
BadToken,
|
||||
MixedIndentation,
|
||||
UnquantizedIndentation,
|
||||
TooMuchIndentation,
|
||||
MissingNewline,
|
||||
TrailingWhitespace,
|
||||
Impossible,
|
||||
};
|
||||
|
||||
pub fn next(self: *@This()) Error!?Line {
|
||||
lineloop: while (self.buffer.nextLine()) |raw_line| {
|
||||
var indent: usize = 0;
|
||||
var offset: usize = 0;
|
||||
|
||||
for (self.buffer[self.index..], 0..) |char, idx| {
|
||||
for (raw_line, 0..) |char, idx| {
|
||||
switch (char) {
|
||||
' ' => {
|
||||
switch (self.indentation) {
|
||||
@ -168,7 +269,6 @@ pub const LineTokenizer = struct {
|
||||
.spaces => {},
|
||||
.tabs => return error.MixedIndentation,
|
||||
}
|
||||
indent += 1;
|
||||
},
|
||||
'\t' => {
|
||||
switch (self.indentation) {
|
||||
@ -176,40 +276,28 @@ pub const LineTokenizer = struct {
|
||||
.spaces => return error.MixedIndentation,
|
||||
.tabs => {},
|
||||
}
|
||||
indent += 1;
|
||||
},
|
||||
'\r' => {
|
||||
return error.BadToken;
|
||||
},
|
||||
'\n' => {
|
||||
// don't even emit anything for empty rows.
|
||||
self.row += 1;
|
||||
offset = idx + 1;
|
||||
// if it's too hard to deal with, Just Make It An Error!!!
|
||||
// an empty line with whitespace on it is garbage. It can mess with
|
||||
// the indentation detection grossly in a way that is annoying to
|
||||
// deal with. Besides, having whitespace-only lines in a document
|
||||
// is essentially terrorism, with which negotiations are famously
|
||||
// not permitted.
|
||||
if (indent > 0) return error.TrailingWhitespace;
|
||||
else => {
|
||||
indent = idx;
|
||||
break;
|
||||
},
|
||||
else => break,
|
||||
}
|
||||
} else {
|
||||
std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
|
||||
self.index = self.buffer.len;
|
||||
// this prong will get hit when the document only consists of whitespace
|
||||
return null;
|
||||
if (raw_line.len > 0) return error.TrailingWhitespace;
|
||||
continue :lineloop;
|
||||
}
|
||||
|
||||
var quantized: usize = if (self.indentation == .spaces) blk: {
|
||||
var quantized: usize = if (self.indentation == .spaces) quant: {
|
||||
if (self.indentation.spaces == 0) {
|
||||
self.indentation.spaces = indent;
|
||||
}
|
||||
if (@rem(indent, self.indentation.spaces) != 0)
|
||||
return error.UnquantizedIndentation;
|
||||
|
||||
break :blk @divExact(indent, self.indentation.spaces);
|
||||
break :quant @divExact(indent, self.indentation.spaces);
|
||||
} else indent;
|
||||
|
||||
const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
|
||||
@ -221,16 +309,12 @@ pub const LineTokenizer = struct {
|
||||
else
|
||||
.none;
|
||||
|
||||
offset += indent;
|
||||
|
||||
defer {
|
||||
self.row += 1;
|
||||
self.last_indent = quantized;
|
||||
self.index += offset;
|
||||
}
|
||||
|
||||
const line = try consumeLine(self.buffer[self.index + offset ..]);
|
||||
offset += line.len + 1;
|
||||
const line = raw_line[indent..];
|
||||
|
||||
// this should not be possible, as empty lines are caught earlier.
|
||||
if (line.len == 0) return error.Impossible;
|
||||
@ -294,6 +378,11 @@ pub const LineTokenizer = struct {
|
||||
};
|
||||
},
|
||||
}
|
||||
|
||||
// somehow everything else has failed
|
||||
return error.Impossible;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
fn detectInlineItem(buf: []const u8) Error!InlineItem {
|
||||
@ -336,19 +425,8 @@ pub const LineTokenizer = struct {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the prefix of `buf` up to, but not including, its first newline.
///
/// Fails with `error.BadToken` if a carriage return occurs before that
/// newline, and with `error.MissingNewline` if `buf` contains neither.
fn consumeLine(buf: []const u8) ![]const u8 {
    // Whichever of the two characters comes first decides the outcome.
    const stop = std.mem.indexOfAny(u8, buf, "\n\r") orelse
        return error.MissingNewline;

    if (buf[stop] == '\r') return error.BadToken;
    return buf[0..stop];
}
|
||||
};
|
||||
}
|
||||
|
||||
pub const Value = union(enum) {
|
||||
pub const String = std.ArrayList(u8);
|
||||
@ -489,7 +567,7 @@ pub const Parser = struct {
|
||||
DuplicateKey,
|
||||
BadMapEntry,
|
||||
Fail,
|
||||
} || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
|
||||
} || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
|
||||
|
||||
pub const DuplicateKeyBehavior = enum {
|
||||
use_first,
|
||||
@ -536,7 +614,7 @@ pub const Parser = struct {
|
||||
document: Document,
|
||||
value_stack: Stack,
|
||||
state: ParseState = .initial,
|
||||
expect_shift: LineTokenizer.ShiftDirection = .none,
|
||||
expect_shift: ShiftDirection = .none,
|
||||
dangling_key: ?[]const u8 = null,
|
||||
|
||||
pub fn init(alloc: std.mem.Allocator) State {
|
||||
@ -557,12 +635,16 @@ pub const Parser = struct {
|
||||
const arena_alloc = document.arena.allocator();
|
||||
|
||||
var state: ParseState = .initial;
|
||||
var expect_shift: LineTokenizer.ShiftDirection = .none;
|
||||
var expect_shift: ShiftDirection = .none;
|
||||
var dangling_key: ?[]const u8 = null;
|
||||
var stack = std.ArrayList(*Value).init(arena_alloc);
|
||||
defer stack.deinit();
|
||||
|
||||
var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
|
||||
var tok: LineTokenizer(FixedLineBuffer) = .{
|
||||
.buffer = FixedLineBuffer.init(buffer),
|
||||
.diagnostics = &self.diagnostics,
|
||||
};
|
||||
|
||||
while (try tok.next()) |line| {
|
||||
if (line.contents == .comment) continue;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user