config: refactor LineTokenizer to use an internal line buffer

The goal here is to support a streaming parser. However, I did decide
to leave the flow item parser state machine as fully buffered
(i.e. not streaming). This is not JSON and in general documents should
consist of many, shorter lines, so this buffering strategy should work
reasonably well. I have not actually tried the streaming
implementation of this yet.
This commit is contained in:
torque 2023-09-21 23:34:17 -07:00
parent b08d712616
commit a0107ab9fd
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk

View File

@ -63,38 +63,123 @@
const std = @import("std"); const std = @import("std");
pub const IndexSlice = struct { start: usize, len: usize };
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize, row: usize,
span: struct { absolute: usize, line_offset: usize, length: usize }, span: struct { absolute: usize, line_offset: usize, length: usize },
message: []const u8, message: []const u8,
}; };
/// A growable byte buffer that accepts input in arbitrary chunks via `feed`
/// and hands back complete, newline-terminated lines via `nextLine`.
///
/// `window` is the span of `buffer` that has been fed but not yet returned as
/// a line. `used` is the offset at which the next fed byte is written; it is
/// kept equal to `window.start + window.len`.
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    buffer: []u8,
    used: usize,
    window: IndexSlice,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    /// Create a line buffer backed by `default_capacity` bytes of storage.
    /// Release it with `deinit`.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    /// Create a line buffer backed by `capacity` bytes of storage allocated
    /// from `allocator`. Release it with `deinit`.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Free the backing storage. The buffer, and any slice previously returned
    /// by `nextLine`, must not be used afterwards.
    pub fn deinit(self: *LineBuffer) void {
        self.allocator.free(self.buffer);
        self.* = undefined;
    }

    /// Append `data` to the unconsumed window, sliding the window to the front
    /// of the buffer and/or growing the buffer when `data` does not fit behind
    /// it. May invalidate slices previously returned by `nextLine`.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;
        // TODO: check for usize overflow here if we want Maximum Robustness
        const new_window_len = self.window.len + data.len;

        if (new_window_len > self.buffer.len) {
            // data cannot fit in the buffer together with our scan window, so
            // we have to realloc. Slide the window to the front first so that
            // the live bytes sit at the start of the (possibly moved) storage.
            // TODO: adopt an overallocation strategy? Will potentially avoid
            // allocating on every invocation but will cause the buffer to
            // oversize
            self.rehome();
            // realloc may move the allocation: the returned slice must replace
            // the old one.
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // data will fit, but only once the window is moved to the front
            self.rehome();
        }

        // In every case there is now room for `data` at `used`. The
        // bookkeeping must advance on the plain-append path as well, or the
        // new bytes would never become visible to `nextLine`.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }

    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    ///
    /// Returns null when the window holds no newline; any partial line stays
    /// in the window so that a later `feed` can complete it.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        // consume the line together with its terminating newline
        self.window.start += split + 1;
        self.window.len -= split + 1;

        return window[0..split];
    }

    /// Move the unconsumed window to the start of the buffer so that the space
    /// in front of it can be reused.
    fn rehome(self: *LineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];

        if (self.window.len > self.window.start) {
            // source and destination overlap; the destination is lower in
            // memory, so a forward copy is the safe direction
            std.mem.copyForwards(u8, self.buffer, window);
        } else {
            @memcpy(self.buffer[0..window.len], window);
        }

        self.window.start = 0;
        self.used = window.len;
    }
};
/// Line splitter over a caller-provided byte slice. Nothing is copied or
/// allocated: every line handed out is a sub-slice of the data given to `init`.
pub const FixedLineBuffer = struct {
    buffer: []const u8,
    window: IndexSlice,

    /// Wrap `data`; the scan window initially covers all of it.
    pub fn init(data: []const u8) FixedLineBuffer {
        return .{
            .buffer = data,
            .window = .{ .start = 0, .len = data.len },
        };
    }

    /// Return the next line without its newline character, or null once no
    /// newline remains in the window. Bytes following the last newline are
    /// never returned.
    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
        const exhausted = self.window.start >= self.buffer.len or self.window.len == 0;
        if (exhausted) return null;

        const remaining = self.buffer[self.window.start..][0..self.window.len];
        if (std.mem.indexOfScalar(u8, remaining, '\n')) |newline_at| {
            // step past the line and the newline that terminated it
            const consumed = newline_at + 1;
            self.window.start += consumed;
            self.window.len -= consumed;
            return remaining[0..newline_at];
        }
        return null;
    }
};
const IndentationType = union(enum) {
immaterial: void, immaterial: void,
spaces: usize, spaces: usize,
tabs: void, tabs: void,
}; };
const InlineItem = union(enum) { const InlineItem = union(enum) {
empty: void, empty: void,
scalar: []const u8, scalar: []const u8,
line_string: []const u8, line_string: []const u8,
@ -110,51 +195,67 @@ pub const LineTokenizer = struct {
else => unreachable, else => unreachable,
}; };
} }
}; };
const LineContents = union(enum) { const LineContents = union(enum) {
comment: []const u8, comment: []const u8,
in_line: InlineItem, in_line: InlineItem,
list_item: InlineItem, list_item: InlineItem,
map_item: struct { key: []const u8, val: InlineItem }, map_item: struct { key: []const u8, val: InlineItem },
}; };
// we can dedent multiple levels at once. Example: // we can dedent multiple levels at once. Example:
// //
// foo: // foo:
// bar: // bar:
// > a // > a
// > string // > string
// baz: [qux] // baz: [qux]
// //
// capturing this is conceptually simple, but implementing it without complex // capturing this is conceptually simple, but implementing it without complex
// indentation tracking requires quantizing the indentation. This means our // indentation tracking requires quantizing the indentation. This means our
// IndentationType will also need to track the number of spaces used for // IndentationType will also need to track the number of spaces used for
// indentation, as detected. Then every line we have to check indent rem the // indentation, as detected. Then every line we have to check indent rem the
// quantization level == 0 (otherwise we broke quantization) and compute indent // quantization level == 0 (otherwise we broke quantization) and compute indent
// div the quantization level to give us our effective indentation level. // div the quantization level to give us our effective indentation level.
const ShiftDirection = enum { indent, dedent, none }; const ShiftDirection = enum { indent, dedent, none };
const RelativeIndent = union(ShiftDirection) { const RelativeIndent = union(ShiftDirection) {
indent: void, indent: void,
dedent: usize, dedent: usize,
none: void, none: void,
}; };
const Line = struct { const Line = struct {
indent: RelativeIndent, indent: RelativeIndent,
contents: LineContents, contents: LineContents,
raw: []const u8, raw: []const u8,
};
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: IndentationType = .immaterial,
last_indent: usize = 0,
diagnostics: *Diagnostics,
row: usize = 0,
const Error = error{
BadToken,
MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace,
Impossible,
}; };
pub fn next(self: *LineTokenizer) Error!?Line { pub fn next(self: *@This()) Error!?Line {
if (self.index == self.buffer.len) return null; lineloop: while (self.buffer.nextLine()) |raw_line| {
var indent: usize = 0; var indent: usize = 0;
var offset: usize = 0; for (raw_line, 0..) |char, idx| {
for (self.buffer[self.index..], 0..) |char, idx| {
switch (char) { switch (char) {
' ' => { ' ' => {
switch (self.indentation) { switch (self.indentation) {
@ -168,7 +269,6 @@ pub const LineTokenizer = struct {
.spaces => {}, .spaces => {},
.tabs => return error.MixedIndentation, .tabs => return error.MixedIndentation,
} }
indent += 1;
}, },
'\t' => { '\t' => {
switch (self.indentation) { switch (self.indentation) {
@ -176,40 +276,28 @@ pub const LineTokenizer = struct {
.spaces => return error.MixedIndentation, .spaces => return error.MixedIndentation,
.tabs => {}, .tabs => {},
} }
indent += 1;
}, },
'\r' => { '\r' => {
return error.BadToken; return error.BadToken;
}, },
'\n' => { else => {
// don't even emit anything for empty rows. indent = idx;
self.row += 1; break;
offset = idx + 1;
// if it's too hard to deal with, Just Make It An Error!!!
// an empty line with whitespace on it is garbage. It can mess with
// the indentation detection grossly in a way that is annoying to
// deal with. Besides, having whitespace-only lines in a document
// is essentially terrorism, with which negotiations are famously
// not permitted.
if (indent > 0) return error.TrailingWhitespace;
}, },
else => break,
} }
} else { } else {
std.debug.assert(self.buffer.len == self.index + indent + offset + 1); if (raw_line.len > 0) return error.TrailingWhitespace;
self.index = self.buffer.len; continue :lineloop;
// this prong will get hit when the document only consists of whitespace
return null;
} }
var quantized: usize = if (self.indentation == .spaces) blk: { var quantized: usize = if (self.indentation == .spaces) quant: {
if (self.indentation.spaces == 0) { if (self.indentation.spaces == 0) {
self.indentation.spaces = indent; self.indentation.spaces = indent;
} }
if (@rem(indent, self.indentation.spaces) != 0) if (@rem(indent, self.indentation.spaces) != 0)
return error.UnquantizedIndentation; return error.UnquantizedIndentation;
break :blk @divExact(indent, self.indentation.spaces); break :quant @divExact(indent, self.indentation.spaces);
} else indent; } else indent;
const relative: RelativeIndent = if (quantized > self.last_indent) rel: { const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
@ -221,16 +309,12 @@ pub const LineTokenizer = struct {
else else
.none; .none;
offset += indent;
defer { defer {
self.row += 1; self.row += 1;
self.last_indent = quantized; self.last_indent = quantized;
self.index += offset;
} }
const line = try consumeLine(self.buffer[self.index + offset ..]); const line = raw_line[indent..];
offset += line.len + 1;
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible; if (line.len == 0) return error.Impossible;
@ -294,6 +378,11 @@ pub const LineTokenizer = struct {
}; };
}, },
} }
// somehow everything else has failed
return error.Impossible;
}
return null;
} }
fn detectInlineItem(buf: []const u8) Error!InlineItem { fn detectInlineItem(buf: []const u8) Error!InlineItem {
@ -336,19 +425,8 @@ pub const LineTokenizer = struct {
}, },
} }
} }
};
fn consumeLine(buf: []const u8) ![]const u8 { }
for (buf, 0..) |char, idx| {
switch (char) {
'\n' => return buf[0..idx],
'\r' => return error.BadToken,
else => {},
}
}
return error.MissingNewline;
}
};
pub const Value = union(enum) { pub const Value = union(enum) {
pub const String = std.ArrayList(u8); pub const String = std.ArrayList(u8);
@ -489,7 +567,7 @@ pub const Parser = struct {
DuplicateKey, DuplicateKey,
BadMapEntry, BadMapEntry,
Fail, Fail,
} || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error; } || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
pub const DuplicateKeyBehavior = enum { pub const DuplicateKeyBehavior = enum {
use_first, use_first,
@ -536,7 +614,7 @@ pub const Parser = struct {
document: Document, document: Document,
value_stack: Stack, value_stack: Stack,
state: ParseState = .initial, state: ParseState = .initial,
expect_shift: LineTokenizer.ShiftDirection = .none, expect_shift: ShiftDirection = .none,
dangling_key: ?[]const u8 = null, dangling_key: ?[]const u8 = null,
pub fn init(alloc: std.mem.Allocator) State { pub fn init(alloc: std.mem.Allocator) State {
@ -557,12 +635,16 @@ pub const Parser = struct {
const arena_alloc = document.arena.allocator(); const arena_alloc = document.arena.allocator();
var state: ParseState = .initial; var state: ParseState = .initial;
var expect_shift: LineTokenizer.ShiftDirection = .none; var expect_shift: ShiftDirection = .none;
var dangling_key: ?[]const u8 = null; var dangling_key: ?[]const u8 = null;
var stack = std.ArrayList(*Value).init(arena_alloc); var stack = std.ArrayList(*Value).init(arena_alloc);
defer stack.deinit(); defer stack.deinit();
var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics }; var tok: LineTokenizer(FixedLineBuffer) = .{
.buffer = FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics,
};
while (try tok.next()) |line| { while (try tok.next()) |line| {
if (line.contents == .comment) continue; if (line.contents == .comment) continue;