Compare commits

..

No commits in common. "95a15adad72e628a14ed039e85a3a54e0482174f" and "6c1eb176be0852eaac540218f66664e0476021b2" have entirely different histories.

View File

@ -63,123 +63,38 @@
const std = @import("std"); const std = @import("std");
/// A (start, length) pair describing a sub-range of some buffer by index
/// rather than by pointer.
pub const IndexSlice = struct {
    /// Index of the first element of the range.
    start: usize,
    /// Number of elements in the range.
    len: usize,
};
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize, row: usize,
span: struct { absolute: usize, line_offset: usize, length: usize }, span: struct { absolute: usize, line_offset: usize, length: usize },
message: []const u8, message: []const u8,
}; };
pub const LineBuffer = struct { pub const LineTokenizer = struct {
allocator: std.mem.Allocator,
buffer: []u8,
used: usize,
window: IndexSlice,
pub const default_capacity: usize = 4096;
pub const Error = std.mem.Allocator.Error;
/// Creates a `LineBuffer` whose backing storage is `default_capacity` bytes,
/// allocated from `allocator`.
///
/// Returns an allocator error if the storage cannot be allocated.
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
    const line_buffer = try initCapacity(allocator, default_capacity);
    return line_buffer;
}
/// Creates a `LineBuffer` with `capacity` bytes of backing storage allocated
/// from `allocator`. The buffer starts out empty: nothing is marked as used
/// and the scan window has zero length.
///
/// Returns an allocator error if the storage cannot be allocated.
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
    const storage = try allocator.alloc(u8, capacity);
    return LineBuffer{
        .allocator = allocator,
        .buffer = storage,
        .used = 0,
        .window = .{ .start = 0, .len = 0 },
    };
}
/// Appends `data` to the unconsumed bytes held by the buffer so that later
/// calls to `nextLine` can scan it.
///
/// The pending window may be moved to the front of the buffer, and the buffer
/// may be reallocated, so slices previously returned by `nextLine` must not be
/// used after this call.
///
/// Returns `error.OutOfMemory` if the buffer has to grow and the allocator
/// fails, or if the combined length does not fit in a `usize`. On error the
/// already-buffered bytes are left intact (though possibly moved).
pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
    if (data.len == 0) return;

    // A length that overflows usize could never be allocated anyway, so
    // report it through the only error this function can return.
    const new_window_len = std.math.add(usize, self.window.len, data.len) catch
        return error.OutOfMemory;

    if (new_window_len > self.buffer.len) {
        // The pending bytes plus `data` cannot fit even after compaction.
        // Compact first so that `used == window.len`, then grow to exactly
        // the required size. The slice returned by `realloc` must be stored:
        // the old one may have been freed or moved.
        // TODO: adopt an overallocation strategy? Will potentially avoid
        // allocating on every invocation but will cause the buffer to oversize.
        self.rehome();
        self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
    } else if (new_window_len > self.buffer.len - self.window.start) {
        // Everything fits, but only once the consumed prefix is reclaimed.
        self.rehome();
    }

    // In every case there is now room for `data` directly after the bytes
    // already in use. Slicing the destination to `data.len` lets safe builds
    // bounds-check the copy.
    @memcpy(self.buffer[self.used..][0..data.len], data);
    self.used += data.len;
    self.window.len = new_window_len;
}
/// Pops the next complete line from the scan window, or returns `null` when
/// the window is empty or does not yet contain a `'\n'`.
///
/// The returned slice excludes the newline character and points into the
/// internal buffer; it is valid until the next call to `feed`.
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
    if (self.window.len == 0 or self.window.start >= self.buffer.len) return null;

    const pending = self.buffer[self.window.start..][0..self.window.len];
    if (std.mem.indexOfScalar(u8, pending, '\n')) |newline_at| {
        // Consume the line together with its terminating newline.
        const consumed = newline_at + 1;
        self.window.start += consumed;
        self.window.len -= consumed;
        return pending[0..newline_at];
    }
    return null;
}
/// Moves the unconsumed scan window to the start of the buffer, reclaiming
/// the space occupied by already-consumed bytes. Does nothing when the window
/// already begins at index 0. Afterwards `window.start` is 0 and `used`
/// equals the window length.
fn rehome(self: *LineBuffer) void {
    const start = self.window.start;
    if (start == 0) return;

    const len = self.window.len;
    const live = self.buffer[start..][0..len];
    if (len <= start) {
        // Source and destination cannot overlap: the window is no longer
        // than the gap in front of it.
        @memcpy(self.buffer.ptr, live);
    } else {
        // Ranges overlap; copy front-to-back since the destination precedes
        // the source.
        std.mem.copyForwards(u8, self.buffer, live);
    }

    self.window.start = 0;
    self.used = len;
}
};
pub const FixedLineBuffer = struct {
buffer: []const u8, buffer: []const u8,
window: IndexSlice, index: usize = 0,
indentation: IndentationType = .immaterial,
last_indent: usize = 0,
diagnostics: *Diagnostics,
pub fn init(data: []const u8) FixedLineBuffer { row: usize = 0,
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
}
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 { const Error = error{
if (self.window.start >= self.buffer.len or self.window.len == 0) BadToken,
return null; MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace,
Impossible,
};
const window = self.buffer[self.window.start..][0..self.window.len]; const IndentationType = union(enum) {
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
};
const IndentationType = union(enum) {
immaterial: void, immaterial: void,
spaces: usize, spaces: usize,
tabs: void, tabs: void,
}; };
const InlineItem = union(enum) { const InlineItem = union(enum) {
empty: void, empty: void,
scalar: []const u8, scalar: []const u8,
line_string: []const u8, line_string: []const u8,
@ -195,67 +110,51 @@ const InlineItem = union(enum) {
else => unreachable, else => unreachable,
}; };
} }
}; };
const LineContents = union(enum) { const LineContents = union(enum) {
comment: []const u8, comment: []const u8,
in_line: InlineItem, in_line: InlineItem,
list_item: InlineItem, list_item: InlineItem,
map_item: struct { key: []const u8, val: InlineItem }, map_item: struct { key: []const u8, val: InlineItem },
}; };
// we can dedent multiple levels at once. Example: // we can dedent multiple levels at once. Example:
// //
// foo: // foo:
// bar: // bar:
// > a // > a
// > string // > string
// baz: [qux] // baz: [qux]
// //
// capturing this is conceptually simple, but implementing it without complex // capturing this is conceptually simple, but implementing it without complex
// indentation tracking requires quantizing the indentation. This means our // indentation tracking requires quantizing the indentation. This means our
// IndentationType will also need to track the number of spaces used for // IndentationType will also need to track the number of spaces used for
// indentation, as detected. Then every line we have to check indent rem the // indentation, as detected. Then every line we have to check indent rem the
// quantization level == 0 (otherwise we broke quantization) and compute indent // quantization level == 0 (otherwise we broke quantization) and compute indent
// div the quantization level to give us our effective indentation level. // div the quantization level to give us our effective indentation level.
const ShiftDirection = enum { indent, dedent, none }; const ShiftDirection = enum { indent, dedent, none };
const RelativeIndent = union(ShiftDirection) { const RelativeIndent = union(ShiftDirection) {
indent: void, indent: void,
dedent: usize, dedent: usize,
none: void, none: void,
}; };
const Line = struct { const Line = struct {
indent: RelativeIndent, indent: RelativeIndent,
contents: LineContents, contents: LineContents,
raw: []const u8, raw: []const u8,
};
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: IndentationType = .immaterial,
last_indent: usize = 0,
diagnostics: *Diagnostics,
row: usize = 0,
const Error = error{
BadToken,
MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace,
Impossible,
}; };
pub fn next(self: *@This()) Error!?Line { pub fn next(self: *LineTokenizer) Error!?Line {
lineloop: while (self.buffer.nextLine()) |raw_line| { if (self.index == self.buffer.len) return null;
var indent: usize = 0; var indent: usize = 0;
for (raw_line, 0..) |char, idx| { var offset: usize = 0;
for (self.buffer[self.index..], 0..) |char, idx| {
switch (char) { switch (char) {
' ' => { ' ' => {
switch (self.indentation) { switch (self.indentation) {
@ -269,6 +168,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.spaces => {}, .spaces => {},
.tabs => return error.MixedIndentation, .tabs => return error.MixedIndentation,
} }
indent += 1;
}, },
'\t' => { '\t' => {
switch (self.indentation) { switch (self.indentation) {
@ -276,28 +176,40 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.spaces => return error.MixedIndentation, .spaces => return error.MixedIndentation,
.tabs => {}, .tabs => {},
} }
indent += 1;
}, },
'\r' => { '\r' => {
return error.BadToken; return error.BadToken;
}, },
else => { '\n' => {
indent = idx; // don't even emit anything for empty rows.
break; self.row += 1;
offset = idx + 1;
// if it's too hard to deal with, Just Make It An Error!!!
// an empty line with whitespace on it is garbage. It can mess with
// the indentation detection grossly in a way that is annoying to
// deal with. Besides, having whitespace-only lines in a document
// is essentially terrorism, with which negotiations are famously
// not permitted.
if (indent > 0) return error.TrailingWhitespace;
}, },
else => break,
} }
} else { } else {
if (raw_line.len > 0) return error.TrailingWhitespace; std.debug.assert(self.buffer.len == self.index + indent + offset + 1);
continue :lineloop; self.index = self.buffer.len;
// this prong will get hit when the document only consists of whitespace
return null;
} }
var quantized: usize = if (self.indentation == .spaces) quant: { var quantized: usize = if (self.indentation == .spaces) blk: {
if (self.indentation.spaces == 0) { if (self.indentation.spaces == 0) {
self.indentation.spaces = indent; self.indentation.spaces = indent;
} }
if (@rem(indent, self.indentation.spaces) != 0) if (@rem(indent, self.indentation.spaces) != 0)
return error.UnquantizedIndentation; return error.UnquantizedIndentation;
break :quant @divExact(indent, self.indentation.spaces); break :blk @divExact(indent, self.indentation.spaces);
} else indent; } else indent;
const relative: RelativeIndent = if (quantized > self.last_indent) rel: { const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
@ -309,12 +221,16 @@ pub fn LineTokenizer(comptime Buffer: type) type {
else else
.none; .none;
offset += indent;
defer { defer {
self.row += 1; self.row += 1;
self.last_indent = quantized; self.last_indent = quantized;
self.index += offset;
} }
const line = raw_line[indent..]; const line = try consumeLine(self.buffer[self.index + offset ..]);
offset += line.len + 1;
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible; if (line.len == 0) return error.Impossible;
@ -378,11 +294,6 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}; };
}, },
} }
// somehow everything else has failed
return error.Impossible;
}
return null;
} }
fn detectInlineItem(buf: []const u8) Error!InlineItem { fn detectInlineItem(buf: []const u8) Error!InlineItem {
@ -425,12 +336,23 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}, },
} }
} }
};
} fn consumeLine(buf: []const u8) ![]const u8 {
for (buf, 0..) |char, idx| {
switch (char) {
'\n' => return buf[0..idx],
'\r' => return error.BadToken,
else => {},
}
}
return error.MissingNewline;
}
};
pub const Value = union(enum) { pub const Value = union(enum) {
pub const String = std.ArrayList(u8); pub const String = std.ArrayList(u8);
pub const Map = std.StringArrayHashMap(Value); pub const Map = std.StringHashMap(Value);
pub const List = std.ArrayList(Value); pub const List = std.ArrayList(Value);
pub const TagType = @typeInfo(Value).Union.tag_type.?; pub const TagType = @typeInfo(Value).Union.tag_type.?;
@ -567,7 +489,7 @@ pub const Parser = struct {
DuplicateKey, DuplicateKey,
BadMapEntry, BadMapEntry,
Fail, Fail,
} || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error; } || LineTokenizer.Error || FlowParser.Error || std.mem.Allocator.Error;
pub const DuplicateKeyBehavior = enum { pub const DuplicateKeyBehavior = enum {
use_first, use_first,
@ -614,7 +536,7 @@ pub const Parser = struct {
document: Document, document: Document,
value_stack: Stack, value_stack: Stack,
state: ParseState = .initial, state: ParseState = .initial,
expect_shift: ShiftDirection = .none, expect_shift: LineTokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null, dangling_key: ?[]const u8 = null,
pub fn init(alloc: std.mem.Allocator) State { pub fn init(alloc: std.mem.Allocator) State {
@ -635,16 +557,12 @@ pub const Parser = struct {
const arena_alloc = document.arena.allocator(); const arena_alloc = document.arena.allocator();
var state: ParseState = .initial; var state: ParseState = .initial;
var expect_shift: ShiftDirection = .none; var expect_shift: LineTokenizer.ShiftDirection = .none;
var dangling_key: ?[]const u8 = null; var dangling_key: ?[]const u8 = null;
var stack = std.ArrayList(*Value).init(arena_alloc); var stack = std.ArrayList(*Value).init(arena_alloc);
defer stack.deinit(); defer stack.deinit();
var tok: LineTokenizer(FixedLineBuffer) = .{ var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics };
.buffer = FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics,
};
while (try tok.next()) |line| { while (try tok.next()) |line| {
if (line.contents == .comment) continue; if (line.contents == .comment) continue;
@ -727,7 +645,7 @@ pub const Parser = struct {
// key somewhere until we can consume the // key somewhere until we can consume the
// value. More parser state to lug along. // value. More parser state to lug along.
dangling_key = try arena_alloc.dupe(u8, pair.key); dangling_key = pair.key;
state = .value; state = .value;
}, },
.scalar => |str| { .scalar => |str| {
@ -897,7 +815,7 @@ pub const Parser = struct {
switch (pair.val) { switch (pair.val) {
.empty => { .empty => {
dangling_key = try arena_alloc.dupe(u8, pair.key); dangling_key = pair.key;
expect_shift = .indent; expect_shift = .indent;
}, },
.scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)), .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
@ -995,7 +913,7 @@ pub const Parser = struct {
.none, .dedent => switch (pair.val) { .none, .dedent => switch (pair.val) {
.empty => { .empty => {
expect_shift = .indent; expect_shift = .indent;
dangling_key = try arena_alloc.dupe(u8, pair.key); dangling_key = pair.key;
}, },
.scalar => |str| try putMap(map, pair.key, try Value.fromScalar(arena_alloc, str), self.dupe_behavior), .scalar => |str| try putMap(map, pair.key, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
.line_string, .space_string => |str| try putMap(map, pair.key, try Value.fromString(arena_alloc, str), self.dupe_behavior), .line_string, .space_string => |str| try putMap(map, pair.key, try Value.fromString(arena_alloc, str), self.dupe_behavior),
@ -1013,7 +931,7 @@ pub const Parser = struct {
switch (pair.val) { switch (pair.val) {
.empty => { .empty => {
expect_shift = .indent; expect_shift = .indent;
dangling_key = try arena_alloc.dupe(u8, pair.key); dangling_key = pair.key;
}, },
.scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)), .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try new_map.map.put(pair.key, try Value.fromString(arena_alloc, str)), .line_string, .space_string => |str| try new_map.map.put(pair.key, try Value.fromString(arena_alloc, str)),
@ -1334,7 +1252,7 @@ pub const FlowParser = struct {
.consuming_map_key => switch (char) { .consuming_map_key => switch (char) {
':' => { ':' => {
const tip = try getStackTip(self.stack); const tip = try getStackTip(self.stack);
dangling_key = try self.alloc.dupe(u8, self.buffer[tip.item_start..idx]); dangling_key = self.buffer[tip.item_start..idx];
self.state = .want_map_value; self.state = .want_map_value;
}, },