config: refactor LineTokenizer to use an internal line buffer
The goal here is to support a streaming parser. However, I did decide to leave the flow item parser state machine as fully buffered (i.e. not streaming). This is not JSON and in general documents should consist of many shorter lines, so this buffering strategy should work reasonably well. I have not actually tried the streaming implementation of this yet.
This commit is contained in:
parent
ab580fa80a
commit
6415571d01
202
src/config.zig
202
src/config.zig
@ -63,29 +63,114 @@
|
|||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
|
|
||||||
|
pub const IndexSlice = struct { start: usize, len: usize };
|
||||||
|
|
||||||
pub const Diagnostics = struct {
|
pub const Diagnostics = struct {
|
||||||
row: usize,
|
row: usize,
|
||||||
span: struct { absolute: usize, line_offset: usize, length: usize },
|
span: struct { absolute: usize, line_offset: usize, length: usize },
|
||||||
message: []const u8,
|
message: []const u8,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const LineTokenizer = struct {
|
/// Growable line buffer intended for streaming input: bytes are appended with
/// `feed` and complete lines are pulled back out with `nextLine`.
///
/// The buffer owns its backing storage; release it with `deinit`.
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    /// Backing storage, allocated from `allocator`.
    buffer: []u8,
    /// Number of leading bytes of `buffer` that hold data. This is always
    /// `window.start + window.len`.
    used: usize,
    /// Region of `buffer` that has been fed but not yet returned by `nextLine`.
    window: IndexSlice,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    /// Creates a buffer with `default_capacity` bytes of storage.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    /// Creates a buffer with `capacity` bytes of storage.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Releases the backing storage. The buffer must not be used afterwards.
    pub fn deinit(self: *LineBuffer) void {
        self.allocator.free(self.buffer);
        self.* = undefined;
    }

    /// Appends `data` to the unread window, growing or compacting the backing
    /// storage as needed. `data` is copied, so the caller keeps ownership of it.
    ///
    /// Any slice previously returned by `nextLine` is invalidated.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;

        // A window length that does not fit in usize can never be allocated,
        // so report the overflow as an allocation failure.
        const new_window_len = std.math.add(usize, self.window.len, data.len) catch
            return error.OutOfMemory;

        if (new_window_len > self.buffer.len) {
            // The unread window plus the new data cannot fit in the current
            // storage at all, so it has to grow.
            // TODO: adopt an overallocation strategy? Will potentially avoid
            // allocating on every invocation but will cause the buffer to
            // oversize.
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
            self.rehome();
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // There is enough total room, but only once the consumed prefix
            // has been reclaimed.
            self.rehome();
        }

        // At this point `buffer[used..]` has room for all of `data`.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }

    /// Returns the next complete line, or null if the unread window does not
    /// contain a newline (more data must be supplied via `feed`).
    ///
    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        // Consume the line together with its terminating newline.
        self.window.start += split + 1;
        self.window.len -= split + 1;

        return window[0..split];
    }

    /// Moves the unread window to the front of `buffer`, reclaiming the space
    /// occupied by lines that were already consumed.
    fn rehome(self: *LineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];

        // Source and destination overlap when the window is longer than the
        // gap in front of it; @memcpy requires non-overlapping operands.
        if (self.window.len > self.window.start)
            std.mem.copyForwards(u8, self.buffer, window)
        else
            @memcpy(self.buffer[0..window.len], window);

        self.window.start = 0;
        self.used = window.len;
    }
};
|
||||||
|
|
||||||
|
/// Line source over a caller-provided byte slice. `window` tracks the part of
/// `buffer` that has not yet been returned by `nextLine`.
pub const FixedLineBuffer = struct {
|
||||||
buffer: []const u8,
|
buffer: []const u8,
|
||||||
index: usize = 0,
|
window: IndexSlice,
|
||||||
indentation: IndentationType = .immaterial,
|
|
||||||
last_indent: usize = 0,
|
|
||||||
diagnostics: *Diagnostics,
|
|
||||||
|
|
||||||
row: usize = 0,
|
/// Wraps `data` without copying it; the window initially spans all of `data`.
pub fn init(data: []const u8) FixedLineBuffer {
|
||||||
|
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
|
||||||
|
}
|
||||||
|
|
||||||
const Error = error{
|
/// Returns the next line without its '\n' terminator and advances the window
/// past that terminator. Returns null when the window is empty or holds no
/// '\n'.
/// NOTE(review): bytes after the final '\n' are therefore never returned;
/// confirm that dropping an unterminated last line is intended.
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
|
||||||
BadToken,
|
if (self.window.start >= self.buffer.len or self.window.len == 0)
|
||||||
MixedIndentation,
|
return null;
|
||||||
UnquantizedIndentation,
|
|
||||||
TooMuchIndentation,
|
const window = self.buffer[self.window.start..][0..self.window.len];
|
||||||
MissingNewline,
|
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
|
||||||
TrailingWhitespace,
|
|
||||||
Impossible,
|
self.window.start += split + 1;
|
||||||
|
self.window.len -= split + 1;
|
||||||
|
|
||||||
|
return window[0..split];
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const IndentationType = union(enum) {
|
const IndentationType = union(enum) {
|
||||||
@ -148,13 +233,29 @@ pub const LineTokenizer = struct {
|
|||||||
raw: []const u8,
|
raw: []const u8,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn next(self: *LineTokenizer) Error!?Line {
|
pub fn LineTokenizer(comptime Buffer: type) type {
|
||||||
if (self.index == self.buffer.len) return null;
|
return struct {
|
||||||
|
buffer: Buffer,
|
||||||
|
index: usize = 0,
|
||||||
|
indentation: IndentationType = .immaterial,
|
||||||
|
last_indent: usize = 0,
|
||||||
|
diagnostics: *Diagnostics,
|
||||||
|
row: usize = 0,
|
||||||
|
|
||||||
|
const Error = error{
|
||||||
|
BadToken,
|
||||||
|
MixedIndentation,
|
||||||
|
UnquantizedIndentation,
|
||||||
|
TooMuchIndentation,
|
||||||
|
MissingNewline,
|
||||||
|
TrailingWhitespace,
|
||||||
|
Impossible,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn next(self: *@This()) Error!?Line {
|
||||||
|
lineloop: while (self.buffer.nextLine()) |raw_line| {
|
||||||
var indent: usize = 0;
|
var indent: usize = 0;
|
||||||
var offset: usize = 0;
|
for (raw_line, 0..) |char, idx| {
|
||||||
|
|
||||||
for (self.buffer[self.index..], 0..) |char, idx| {
|
|
||||||
switch (char) {
|
switch (char) {
|
||||||
' ' => {
|
' ' => {
|
||||||
switch (self.indentation) {
|
switch (self.indentation) {
|
||||||
@ -168,7 +269,6 @@ pub const LineTokenizer = struct {
|
|||||||
.spaces => {},
|
.spaces => {},
|
||||||
.tabs => return error.MixedIndentation,
|
.tabs => return error.MixedIndentation,
|
||||||
}
|
}
|
||||||
indent += 1;
|
|
||||||
},
|
},
|
||||||
'\t' => {
|
'\t' => {
|
||||||
switch (self.indentation) {
|
switch (self.indentation) {
|
||||||
@ -176,40 +276,28 @@ pub const LineTokenizer = struct {
|
|||||||
.spaces => return error.MixedIndentation,
|
.spaces => return error.MixedIndentation,
|
||||||
.tabs => {},
|
.tabs => {},
|
||||||
}
|
}
|
||||||
indent += 1;
|
|
||||||
},
|
},
|
||||||
'\r' => {
|
'\r' => {
|
||||||
return error.BadToken;
|
return error.BadToken;
|
||||||
},
|
},
|
||||||
'\n' => {
|
else => {
|
||||||
// don't even emit anything for empty rows.
|
indent = idx;
|
||||||
self.row += 1;
|
break;
|
||||||
offset = idx + 1;
|
|
||||||
// if it's too hard to deal with, Just Make It An Error!!!
|
|
||||||
// an empty line with whitespace on it is garbage. It can mess with
|
|
||||||
// the indentation detection grossly in a way that is annoying to
|
|
||||||
// deal with. Besides, having whitespace-only lines in a document
|
|
||||||
// is essentially terrorism, with which negotiations are famously
|
|
||||||
// not permitted.
|
|
||||||
if (indent > 0) return error.TrailingWhitespace;
|
|
||||||
},
|
},
|
||||||
else => break,
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
|
if (raw_line.len > 0) return error.TrailingWhitespace;
|
||||||
self.index = self.buffer.len;
|
continue :lineloop;
|
||||||
// this prong will get hit when the document only consists of whitespace
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var quantized: usize = if (self.indentation == .spaces) blk: {
|
var quantized: usize = if (self.indentation == .spaces) quant: {
|
||||||
if (self.indentation.spaces == 0) {
|
if (self.indentation.spaces == 0) {
|
||||||
self.indentation.spaces = indent;
|
self.indentation.spaces = indent;
|
||||||
}
|
}
|
||||||
if (@rem(indent, self.indentation.spaces) != 0)
|
if (@rem(indent, self.indentation.spaces) != 0)
|
||||||
return error.UnquantizedIndentation;
|
return error.UnquantizedIndentation;
|
||||||
|
|
||||||
break :blk @divExact(indent, self.indentation.spaces);
|
break :quant @divExact(indent, self.indentation.spaces);
|
||||||
} else indent;
|
} else indent;
|
||||||
|
|
||||||
const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
|
const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
|
||||||
@ -221,16 +309,12 @@ pub const LineTokenizer = struct {
|
|||||||
else
|
else
|
||||||
.none;
|
.none;
|
||||||
|
|
||||||
offset += indent;
|
|
||||||
|
|
||||||
defer {
|
defer {
|
||||||
self.row += 1;
|
self.row += 1;
|
||||||
self.last_indent = quantized;
|
self.last_indent = quantized;
|
||||||
self.index += offset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const line = try consumeLine(self.buffer[self.index + offset ..]);
|
const line = raw_line[indent..];
|
||||||
offset += line.len + 1;
|
|
||||||
|
|
||||||
// this should not be possible, as empty lines are caught earlier.
|
// this should not be possible, as empty lines are caught earlier.
|
||||||
if (line.len == 0) return error.Impossible;
|
if (line.len == 0) return error.Impossible;
|
||||||
@ -294,6 +378,11 @@ pub const LineTokenizer = struct {
|
|||||||
};
|
};
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// somehow everything else has failed
|
||||||
|
return error.Impossible;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn detectInlineItem(buf: []const u8) Error!InlineItem {
|
fn detectInlineItem(buf: []const u8) Error!InlineItem {
|
||||||
@ -336,19 +425,8 @@ pub const LineTokenizer = struct {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn consumeLine(buf: []const u8) ![]const u8 {
|
|
||||||
for (buf, 0..) |char, idx| {
|
|
||||||
switch (char) {
|
|
||||||
'\n' => return buf[0..idx],
|
|
||||||
'\r' => return error.BadToken,
|
|
||||||
else => {},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return error.MissingNewline;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
pub const Value = union(enum) {
|
pub const Value = union(enum) {
|
||||||
pub const String = std.ArrayList(u8);
|
pub const String = std.ArrayList(u8);
|
||||||
@ -489,7 +567,7 @@ pub const Parser = struct {
|
|||||||
DuplicateKey,
|
DuplicateKey,
|
||||||
BadMapEntry,
|
BadMapEntry,
|
||||||
Fail,
|
Fail,
|
||||||
} || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
|
} || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
|
||||||
|
|
||||||
pub const DuplicateKeyBehavior = enum {
|
pub const DuplicateKeyBehavior = enum {
|
||||||
use_first,
|
use_first,
|
||||||
@ -536,7 +614,7 @@ pub const Parser = struct {
|
|||||||
document: Document,
|
document: Document,
|
||||||
value_stack: Stack,
|
value_stack: Stack,
|
||||||
state: ParseState = .initial,
|
state: ParseState = .initial,
|
||||||
expect_shift: LineTokenizer.ShiftDirection = .none,
|
expect_shift: ShiftDirection = .none,
|
||||||
dangling_key: ?[]const u8 = null,
|
dangling_key: ?[]const u8 = null,
|
||||||
|
|
||||||
pub fn init(alloc: std.mem.Allocator) State {
|
pub fn init(alloc: std.mem.Allocator) State {
|
||||||
@ -557,12 +635,16 @@ pub const Parser = struct {
|
|||||||
const arena_alloc = document.arena.allocator();
|
const arena_alloc = document.arena.allocator();
|
||||||
|
|
||||||
var state: ParseState = .initial;
|
var state: ParseState = .initial;
|
||||||
var expect_shift: LineTokenizer.ShiftDirection = .none;
|
var expect_shift: ShiftDirection = .none;
|
||||||
var dangling_key: ?[]const u8 = null;
|
var dangling_key: ?[]const u8 = null;
|
||||||
var stack = std.ArrayList(*Value).init(arena_alloc);
|
var stack = std.ArrayList(*Value).init(arena_alloc);
|
||||||
defer stack.deinit();
|
defer stack.deinit();
|
||||||
|
|
||||||
var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
|
var tok: LineTokenizer(FixedLineBuffer) = .{
|
||||||
|
.buffer = FixedLineBuffer.init(buffer),
|
||||||
|
.diagnostics = &self.diagnostics,
|
||||||
|
};
|
||||||
|
|
||||||
while (try tok.next()) |line| {
|
while (try tok.next()) |line| {
|
||||||
if (line.contents == .comment) continue;
|
if (line.contents == .comment) continue;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user