nice-data/src/config.zig

// Heavily inspired by, but not quite compatible with, NestedText. Key differences:
//
// - Doesn't support multiline keys (this means map keys cannot start with
// ' ', \t, #, {, [, |, or >, and they cannot contain :)
// - Allows using tabs for indentation (but not mixed tabs/spaces)
// - Indentation must be quantized consistently throughout the document, e.g.
// every nested layer being exactly 2 spaces past its parent. Tab indentation
// must use exactly one tab per level.
// - Allows flow-style lists, maps, and strings on the same line as map keys or
// list items (i.e. the following are legal):
//
// key: {inline: map}
// key: [inline, list]
// key: > inline string
// - {map: item}
// - [list, item]
// - > inline string
//
// The string case retains the possibility of having an inline string value that
// starts with {, [, or >
// - inline lists and maps cannot contain other inline structures. This may
// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful
// - map keys and list item dashes must be followed by a value or an indented
// section to reduce parser quantum state. This means that
//
// foo:
// bar: baz
//
// or
//
// -
// - qux
//
// are not valid. This can be represented with an inline empty string after foo:
//
// foo: >
// bar: baz
//
// or
//
// - >
// - qux
//
// - newlines are strictly LF; if the parser finds CR, it is an error
// - blank lines may not contain any whitespace characters except the single LF
// - Additional string indicator `|` for soft-wrapped strings, i.e.
//
// key: | this is not special
// key:
// | these lines are
// | soft-wrapped
//
// soft-wrapped lines are joined with a ' ' instead of a newline character.
// Like multiline strings, the final space is stripped (I guess this is a very
// janky way to add trailing whitespace to a string).
//
// - terminated strings to allow trailing whitespace:
// | this string has trailing whitespace |
// > and so does this one |
// - The parser is both strict and probably sloppy and may have weird edge
// cases since I'm slinging code, not writing a spec. For example, tabs are
// not trimmed from the values of inline lists/maps
const std = @import("std");
pub const IndexSlice = struct { start: usize, len: usize };
pub const Diagnostics = struct {
row: usize,
span: struct { absolute: usize, line_offset: usize, length: usize },
message: []const u8,
};
pub const LineBuffer = struct {
allocator: std.mem.Allocator,
buffer: []u8,
used: usize,
window: IndexSlice,
pub const default_capacity: usize = 4096;
pub const Error = std.mem.Allocator.Error;
pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
return initCapacity(allocator, default_capacity);
}
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
return .{
.allocator = allocator,
.buffer = try allocator.alloc(u8, capacity),
.used = 0,
.window = .{ .start = 0, .len = 0 },
};
}
pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
if (data.len == 0) return;
// TODO: check for usize overflow here if we want Maximum Robustness
const new_window_len = self.window.len + data.len;
// data cannot fit in the buffer with our scan window, so we have to realloc
if (new_window_len > self.buffer.len) {
// TODO: adopt an overallocation strategy? Will potentially avoid allocating
// on every invocation but will cause the buffer to oversize
self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
self.rehome();
@memcpy(self.buffer[self.used..].ptr, data);
self.used = new_window_len;
self.window.len = new_window_len;
}
// data will fit, but needs to be moved in the buffer
else if (self.window.start + new_window_len > self.buffer.len) {
self.rehome();
@memcpy(self.buffer[self.used..].ptr, data);
self.used = new_window_len;
self.window.len = new_window_len;
}
// data can simply be appended
else {
    @memcpy(self.buffer[self.used..].ptr, data);
    // grow the bookkeeping so the appended bytes are visible to nextLine
    self.used += data.len;
    self.window.len = new_window_len;
}
}
/// The memory returned by this function is valid until the next call to `feed`.
/// The resulting slice does not include the newline character.
pub fn nextLine(self: *LineBuffer) ?[]const u8 {
if (self.window.start >= self.buffer.len or self.window.len == 0)
return null;
const window = self.buffer[self.window.start..][0..self.window.len];
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
fn rehome(self: *LineBuffer) void {
if (self.window.start == 0) return;
const window = self.buffer[self.window.start..][0..self.window.len];
if (self.window.len > self.window.start)
std.mem.copyForwards(u8, self.buffer, window)
else
@memcpy(self.buffer.ptr, window);
self.window.start = 0;
self.used = window.len;
}
};
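
// A minimal usage sketch (not part of the original file; assumes std.testing and
// the feed bookkeeping above): feed a chunk containing two complete lines plus a
// partial one, then drain the complete lines. LineBuffer has no deinit in this
// revision, so the test frees the backing buffer directly.
test "LineBuffer sketch: feed then nextLine" {
    var lb = try LineBuffer.init(std.testing.allocator);
    defer std.testing.allocator.free(lb.buffer);
    try lb.feed("first\nsecond\npartial");
    try std.testing.expectEqualStrings("first", lb.nextLine().?);
    try std.testing.expectEqualStrings("second", lb.nextLine().?);
    // the trailing fragment has no newline yet, so it is not returned
    try std.testing.expect(lb.nextLine() == null);
}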
pub const FixedLineBuffer = struct {
buffer: []const u8,
window: IndexSlice,
pub fn init(data: []const u8) FixedLineBuffer {
return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
}
pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
if (self.window.start >= self.buffer.len or self.window.len == 0)
return null;
const window = self.buffer[self.window.start..][0..self.window.len];
const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;
self.window.start += split + 1;
self.window.len -= split + 1;
return window[0..split];
}
};
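
// A tiny sketch (assumes std.testing): only newline-terminated lines are
// yielded; a trailing fragment without '\n' is never returned.
test "FixedLineBuffer sketch" {
    var fixed = FixedLineBuffer.init("one\ntwo\nleftover");
    try std.testing.expectEqualStrings("one", fixed.nextLine().?);
    try std.testing.expectEqualStrings("two", fixed.nextLine().?);
    try std.testing.expect(fixed.nextLine() == null);
}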
const IndentationType = union(enum) {
immaterial: void,
spaces: usize,
tabs: void,
};
const InlineItem = union(enum) {
empty: void,
scalar: []const u8,
line_string: []const u8,
space_string: []const u8,
flow_list: []const u8,
flow_map: []const u8,
fn lineEnding(self: InlineItem) u8 {
return switch (self) {
.line_string => '\n',
.space_string => ' ',
else => unreachable,
};
}
};
const LineContents = union(enum) {
comment: []const u8,
in_line: InlineItem,
list_item: InlineItem,
map_item: struct { key: []const u8, val: InlineItem },
};
// we can dedent multiple levels at once. Example:
//
// foo:
// bar:
// > a
// > string
// baz: [qux]
//
// capturing this is conceptually simple, but implementing it without complex
// indentation tracking requires quantizing the indentation. This means our
// IndentationType will also need to track the number of spaces used for
// indentation, as detected. Then every line we have to check indent rem the
// quantization level == 0 (otherwise we broke quantization) and compute indent
// div the quantization level to give us our effective indentation level.
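//
// For example, once the quantum has been detected as 2 spaces, an indent of 6
// spaces is accepted (6 rem 2 == 0) and corresponds to effective level 3
// (6 div 2), while an indent of 3 spaces breaks quantization and is an error.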
const ShiftDirection = enum { indent, dedent, none };
const RelativeIndent = union(ShiftDirection) {
indent: void,
dedent: usize,
none: void,
};
const Line = struct {
indent: RelativeIndent,
contents: LineContents,
raw: []const u8,
};
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: IndentationType = .immaterial,
last_indent: usize = 0,
diagnostics: *Diagnostics,
row: usize = 0,
const Error = error{
BadToken,
MixedIndentation,
UnquantizedIndentation,
TooMuchIndentation,
MissingNewline,
TrailingWhitespace,
Impossible,
};
pub fn next(self: *@This()) Error!?Line {
lineloop: while (self.buffer.nextLine()) |raw_line| {
var indent: usize = 0;
for (raw_line, 0..) |char, idx| {
switch (char) {
' ' => {
switch (self.indentation) {
// There's a weird coupling here because we can't set this until
// all spaces have been consumed. I also thought about ignoring
// spaces on comment lines since those don't affect the
// relative indent/dedent, but then we would allow comments
// to ignore our indent quantum, which I dislike due to it making
// ugly documents.
.immaterial => self.indentation = .{ .spaces = 0 },
.spaces => {},
.tabs => return error.MixedIndentation,
}
},
'\t' => {
switch (self.indentation) {
.immaterial => self.indentation = .tabs,
.spaces => return error.MixedIndentation,
.tabs => {},
}
},
'\r' => {
return error.BadToken;
},
else => {
indent = idx;
break;
},
}
} else {
if (raw_line.len > 0) return error.TrailingWhitespace;
continue :lineloop;
}
var quantized: usize = if (self.indentation == .spaces) quant: {
if (self.indentation.spaces == 0) {
self.indentation.spaces = indent;
}
if (@rem(indent, self.indentation.spaces) != 0)
return error.UnquantizedIndentation;
break :quant @divExact(indent, self.indentation.spaces);
} else indent;
const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1)
return error.TooMuchIndentation;
break :rel .indent;
} else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized }
else
.none;
defer {
self.row += 1;
self.last_indent = quantized;
}
const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible;
switch (line[0]) {
'#' => {
// simply lie about indentation when the line is a comment.
quantized = self.last_indent;
return .{
.indent = .none,
.contents = .{ .comment = line[1..] },
.raw = line,
};
},
'|', '>', '[', '{' => {
return .{
.indent = relative,
.contents = .{ .in_line = try detectInlineItem(line) },
.raw = line,
};
},
'-' => {
if (line.len > 1 and line[1] != ' ') return error.BadToken;
return if (line.len == 1) .{
.indent = relative,
.contents = .{ .list_item = .empty },
.raw = line,
} else .{
.indent = relative,
.contents = .{ .list_item = try detectInlineItem(line[2..]) },
.raw = line,
};
},
else => {
for (line, 0..) |char, idx| {
if (char == ':') {
if (idx + 1 == line.len) return .{
.indent = relative,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
if (line[idx + 1] != ' ') return error.BadToken;
return .{
.indent = relative,
.contents = .{ .map_item = .{
.key = line[0..idx],
.val = try detectInlineItem(line[idx + 2 ..]),
} },
.raw = line,
};
}
}
return .{
.indent = relative,
.contents = .{ .in_line = .{ .scalar = line } },
.raw = line,
};
},
}
// somehow everything else has failed
return error.Impossible;
}
return null;
}
fn detectInlineItem(buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty;
switch (buf[0]) {
'>', '|' => |char| {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => return error.TrailingWhitespace,
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len],
};
return if (char == '>')
.{ .line_string = slice }
else
.{ .space_string = slice };
},
'[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']')
return error.BadToken;
// keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] };
},
'{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}')
return error.BadToken;
// keep the closing } for the flow parser
return .{ .flow_map = buf[1..] };
},
else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
return error.TrailingWhitespace;
return .{ .scalar = buf };
},
}
}
};
}
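
// A short sketch of driving the tokenizer over a fixed buffer (assumes
// std.testing; the Diagnostics value only exists to satisfy the pointer field).
test "LineTokenizer sketch: map key then indented list item" {
    var diagnostics = Diagnostics{
        .row = 0,
        .span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
        .message = "",
    };
    var tok = LineTokenizer(FixedLineBuffer){
        .buffer = FixedLineBuffer.init("key:\n  - value\n"),
        .diagnostics = &diagnostics,
    };
    const first = (try tok.next()).?;
    try std.testing.expect(first.contents == .map_item);
    try std.testing.expectEqualStrings("key", first.contents.map_item.key);
    try std.testing.expect(first.contents.map_item.val == .empty);
    const second = (try tok.next()).?;
    try std.testing.expect(second.indent == .indent);
    try std.testing.expect(second.contents == .list_item);
    try std.testing.expectEqualStrings("value", second.contents.list_item.scalar);
    try std.testing.expect((try tok.next()) == null);
}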
pub const Value = union(enum) {
pub const String = std.ArrayList(u8);
pub const Map = std.StringArrayHashMap(Value);
pub const List = std.ArrayList(Value);
pub const TagType = @typeInfo(Value).Union.tag_type.?;
scalar: String,
string: String,
list: List,
flow_list: List,
map: Map,
flow_map: Map,
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .scalar, input);
}
pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .string, input);
}
inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
var res = @unionInit(Value, @tagName(classification), try String.initCapacity(alloc, input.len));
@field(res, @tagName(classification)).appendSliceAssumeCapacity(input);
return res;
}
pub inline fn newScalar(alloc: std.mem.Allocator) Value {
return .{ .scalar = String.init(alloc) };
}
pub inline fn newString(alloc: std.mem.Allocator) Value {
return .{ .string = String.init(alloc) };
}
pub inline fn newList(alloc: std.mem.Allocator) Value {
return .{ .list = List.init(alloc) };
}
pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
return .{ .flow_list = List.init(alloc) };
}
pub inline fn newMap(alloc: std.mem.Allocator) Value {
return .{ .map = Map.init(alloc) };
}
pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
return .{ .flow_map = Map.init(alloc) };
}
pub fn printDebug(self: Value) void {
self.printRecursive(0);
std.debug.print("\n", .{});
}
fn printRecursive(self: Value, indent: usize) void {
switch (self) {
.scalar, .string => |str| {
if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| {
var lines = std.mem.splitScalar(u8, str.items, '\n');
std.debug.print("\n", .{});
while (lines.next()) |line| {
std.debug.print(
"{[empty]s: >[indent]}{[line]s}{[nl]s}",
.{
.empty = "",
.indent = indent,
.line = line,
.nl = if (lines.peek() == null) "" else "\n",
},
);
}
} else {
std.debug.print("{s}", .{str.items});
}
},
.list, .flow_list => |list| {
if (list.items.len == 0) {
std.debug.print("[]", .{});
return;
}
std.debug.print("[\n", .{});
for (list.items, 0..) |value, idx| {
std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx });
value.printRecursive(indent + 2);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}]",
.{ .empty = "", .indent = indent },
);
},
.map, .flow_map => |map| {
if (map.count() == 0) {
std.debug.print("{{}}", .{});
return;
}
std.debug.print("{{\n", .{});
var iter = map.iterator();
while (iter.next()) |entry| {
std.debug.print(
"{[empty]s: >[indent]}{[key]s}: ",
.{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
);
entry.value_ptr.printRecursive(indent + 4);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}}}",
.{ .empty = "", .indent = indent },
);
},
}
}
};
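
// A tiny sketch (assumes std.testing) of the constructors above: fromScalar
// copies its input into an arena-owned String tagged .scalar.
test "Value sketch: fromScalar copies its input" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    const val = try Value.fromScalar(arena.allocator(), "hello");
    try std.testing.expect(val == .scalar);
    try std.testing.expectEqualStrings("hello", val.scalar.items);
}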
pub const Parser = struct {
allocator: std.mem.Allocator,
dupe_behavior: DuplicateKeyBehavior = .fail,
default_object: DefaultObject = .fail,
diagnostics: Diagnostics = .{
.row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well",
},
pub const Error = error{
UnexpectedIndent,
UnexpectedValue,
ExtraContent,
EmptyDocument,
DuplicateKey,
BadMapEntry,
BadState,
BadToken,
Fail,
} || LineTokenizer(FixedLineBuffer).Error || std.mem.Allocator.Error;
pub const DuplicateKeyBehavior = enum {
use_first,
use_last,
fail,
};
pub const DefaultObject = enum {
string,
list,
map,
fail,
};
pub const ParseState = enum {
initial,
value,
done,
};
pub const Document = struct {
arena: std.heap.ArenaAllocator,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{
.arena = std.heap.ArenaAllocator.init(alloc),
.root = undefined,
};
}
pub fn printDebug(self: Document) void {
return self.root.printDebug();
}
pub fn deinit(self: Document) void {
self.arena.deinit();
}
};
pub const State = struct {
pub const Stack = std.ArrayList(*Value);
document: Document,
value_stack: Stack,
state: ParseState = .initial,
expect_shift: ShiftDirection = .none,
dangling_key: ?[]const u8 = null,
pub fn init(alloc: std.mem.Allocator) State {
return .{
.document = Document.init(alloc),
.value_stack = Stack.init(alloc),
};
}
pub fn deinit(self: State) void {
self.value_stack.deinit();
}
};
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
var document = Document.init(self.allocator);
errdefer document.deinit();
const arena_alloc = document.arena.allocator();
var state: ParseState = .initial;
var expect_shift: ShiftDirection = .none;
var dangling_key: ?[]const u8 = null;
var stack = std.ArrayList(*Value).init(arena_alloc);
defer stack.deinit();
var tok: LineTokenizer(FixedLineBuffer) = .{
.buffer = FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics,
};
while (try tok.next()) |line| {
if (line.contents == .comment) continue;
var flip = true;
var flop = false;
// this is needed to give us a second go round when the line is dedented
flipflop: while (flip) : (flop = true) {
switch (state) {
.initial => {
if (line.indent == .indent) return error.UnexpectedIndent;
switch (line.contents) {
// we filter out comments above
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
// empty scalars are only emitted for a list_item or a map_item
.empty => unreachable,
.scalar => |str| {
document.root = try Value.fromScalar(arena_alloc, str);
// this is a cheesy hack. If the document consists
// solely of a scalar, the finalizer will try to
// chop a line ending off of it, so we need to add
// a sacrificial padding character to avoid
// chopping off something that matters.
try document.root.scalar.append(' ');
state = .done;
},
.line_string, .space_string => |str| {
document.root = try Value.fromString(arena_alloc, str);
try document.root.string.append(in_line.lineEnding());
try stack.append(&document.root);
state = .value;
},
.flow_list => |str| {
document.root = try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior);
state = .done;
},
.flow_map => |str| {
document.root = try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior);
state = .done;
},
},
.list_item => |value| {
document.root = Value.newList(arena_alloc);
try stack.append(&document.root);
state = .value;
switch (value) {
.empty => expect_shift = .indent,
.scalar => |str| try document.root.list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try document.root.list.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
},
.map_item => |pair| {
document.root = Value.newMap(arena_alloc);
try stack.append(&document.root);
state = .value;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
expect_shift = .indent;
// If the key is on its own line, we don't have
// an associated value until we parse the next
// line. We need to store a reference to this
// key somewhere until we can consume the
// value. More parser state to lug along.
dangling_key = dupekey;
},
.scalar => |str| try document.root.map.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try document.root.map.put(dupekey, try Value.fromString(arena_alloc, str)),
.flow_list => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
},
}
},
.value => switch (stack.getLast().*) {
// these three states are never reachable here. flow_list and
// flow_map are parsed with a separate state machine. These
// value types can only be present by themselves as the first
// line of the document, in which case the document consists
// only of that single line: this parser jumps immediately into
// the .done state, bypassing the .value state in which this
// switch is embedded.
.scalar, .flow_list, .flow_map => unreachable,
.string => |*string| {
if (line.indent == .indent)
return error.UnexpectedIndent;
if (!flop and line.indent == .dedent) {
// kick off the last trailing space or newline
_ = string.pop();
var dedent_depth = line.indent.dedent;
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
.empty => unreachable,
.line_string, .space_string => |str| {
try string.appendSlice(str);
try string.append(in_line.lineEnding());
},
else => return error.UnexpectedValue,
},
else => return error.UnexpectedValue,
}
},
.list => |*list| {
// detect that the previous item was actually empty
//
// -
// - something
//
// the first line here creates the expect_shift, but the second line
// is a valid continuation of the list despite not being indented
if (!flop and (expect_shift == .indent and line.indent != .indent))
try list.append(Value.newScalar(arena_alloc));
// Consider:
//
// -
// own-line scalar
// - inline scalar
//
// the own-line scalar will not push the stack but the next list item will be a dedent
if (!flop and line.indent == .dedent) {
// if line.indent.dedent is 1 and we're expecting it, the stack will not be popped,
// but we will continue the flipflop loop. However, flop will be set to true on the
// next trip, so this (!flop) prong will not be run again.
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in list mode.
if (expect_shift != .indent or line.indent != .indent)
return error.UnexpectedValue;
expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
.line_string, .space_string => |str| {
// string pushes the stack
const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
try stack.append(new_string);
try new_string.string.append(in_line.lineEnding());
expect_shift = .none;
},
}
},
.list_item => |value| {
if (flop or (line.indent == .none or line.indent == .dedent)) {
expect_shift = .none;
switch (value) {
.empty => expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
} else if (line.indent == .indent) {
if (expect_shift != .indent) return error.UnexpectedIndent;
const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
try stack.append(new_list);
expect_shift = .none;
continue :flipflop;
} else unreachable;
},
.map_item => {
// this prong cannot be hit on dedent in a valid way.
//
// -
// map: value
// second: value
// third: value
//
// dedenting back to the list stack level requires list_item
if (line.indent != .indent)
return error.UnexpectedValue;
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
try stack.append(new_map);
expect_shift = .none;
continue :flipflop;
},
}
},
.map => |*map| {
// detect that the previous item was actually empty
//
// foo:
// bar: baz
//
// the first line here creates the expect_shift, but the second line
// is a valid continuation of the map despite not being indented
if (!flop and (expect_shift == .indent and line.indent != .indent)) {
try putMap(
map,
dangling_key orelse return error.Fail,
Value.newScalar(arena_alloc),
self.dupe_behavior,
);
dangling_key = null;
}
if (!flop and line.indent == .dedent) {
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in map mode.
if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
return error.UnexpectedValue;
expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try putMap(map, dangling_key.?, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
.flow_list => |str| try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
.flow_map => |str| {
try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior);
},
.line_string, .space_string => |str| {
// string pushes the stack
const new_string = try putMapGetValue(map, dangling_key.?, try Value.fromString(arena_alloc, str), self.dupe_behavior);
try new_string.string.append(in_line.lineEnding());
try stack.append(new_string);
expect_shift = .none;
},
}
dangling_key = null;
},
.list_item => {
// this prong cannot be hit on dedent in a valid way.
//
// map:
// - value
// - invalid
//
// dedenting back to the map stack level requires map_item
if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
return error.UnexpectedValue;
const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
try stack.append(new_list);
dangling_key = null;
expect_shift = .none;
continue :flipflop;
},
.map_item => |pair| {
if (flop or (line.indent == .none or line.indent == .dedent)) {
expect_shift = .none;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
expect_shift = .indent;
dangling_key = dupekey;
},
.scalar => |str| try putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
.line_string, .space_string => |str| try putMap(map, dupekey, try Value.fromString(arena_alloc, str), self.dupe_behavior),
.flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
.flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior),
}
} else if (line.indent == .indent) {
if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue;
const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
try stack.append(new_map);
dangling_key = null;
continue :flipflop;
} else unreachable;
},
}
},
},
.done => return error.ExtraContent,
}
// this is specifically performed at the end of the loop body so that
// `continue :flipflop` skips setting it.
flip = false;
}
}
switch (state) {
.initial => switch (self.default_object) {
.string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
.list => document.root = Value.newList(arena_alloc),
.map => document.root = Value.newMap(arena_alloc),
.fail => return error.EmptyDocument,
},
.value => switch (stack.getLast().*) {
// remove the final trailing newline or space
.scalar, .string => |*string| _ = string.popOrNull(),
// if we have a dangling -, attach an empty string to it
.list => |*list| if (expect_shift == .indent) try list.append(Value.newScalar(arena_alloc)),
// if we have a dangling "key:", attach an empty string to it
.map => |*map| if (dangling_key) |dk| try putMap(map, dk, Value.newScalar(arena_alloc), self.dupe_behavior),
.flow_list, .flow_map => {},
},
.done => {},
}
return document;
}
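
// A sketch (not part of the original tests; assumes std.testing) of the '>'
// line-string behavior described at the top of the file: the two indented lines
// are joined with '\n' and the trailing line ending is stripped by the finalizer
// above.
test "parseBuffer sketch: multiline '>' string under a key" {
    var parser = Parser{ .allocator = std.testing.allocator };
    const doc = try parser.parseBuffer("greeting:\n  > hello\n  > world\n");
    defer doc.deinit();
    try std.testing.expectEqualStrings(
        "hello\nworld",
        doc.root.map.get("greeting").?.string.items,
    );
}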
const FlowStack: type = std.ArrayList(*Value);
inline fn getStackTip(stack: FlowStack) Error!*Value {
if (stack.items.len == 0) return error.BadState;
return stack.items[stack.items.len - 1];
}
inline fn popStack(stack: *FlowStack) Error!FlowParseState {
if (stack.popOrNull() == null)
return error.BadState;
const parent = stack.getLastOrNull() orelse return .done;
return switch (parent.*) {
.flow_list => .want_list_separator,
.flow_map => .want_map_separator,
else => return error.BadState,
};
}
const FlowParseState = enum {
want_list_item,
consuming_list_item,
want_list_separator,
want_map_key,
consuming_map_key,
want_map_value,
consuming_map_value,
want_map_separator,
done,
};
pub fn parseFlow(
alloc: std.mem.Allocator,
contents: []const u8,
root_type: Value.TagType,
dupe_behavior: DuplicateKeyBehavior,
) Error!Value {
var root: Value = switch (root_type) {
.flow_list => Value.newFlowList(alloc),
.flow_map => Value.newFlowMap(alloc),
else => return error.BadState,
};
var state: FlowParseState = switch (root_type) {
.flow_list => .want_list_item,
.flow_map => .want_map_key,
else => unreachable,
};
var stack = try FlowStack.initCapacity(alloc, 1);
stack.appendAssumeCapacity(&root);
// used to distinguish between [] and [ ], and it also tracks
// a continuous value between different states
var item_start: usize = 0;
var dangling_key: ?[]const u8 = null;
charloop: for (contents, 0..) |char, idx| {
switch (state) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
',' => {
// empty value
const tip = try getStackTip(stack);
try tip.flow_list.append(Value.newScalar(alloc));
item_start = idx + 1;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowMap(alloc),
);
item_start = idx;
try stack.append(new_map);
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowList(alloc),
);
item_start = idx + 1;
try stack.append(new_list);
state = .want_list_item;
},
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
if (finished.flow_list.items.len > 0 or idx > item_start)
try finished.flow_list.append(Value.newScalar(alloc));
state = try popStack(&stack);
},
else => {
item_start = idx;
state = .consuming_list_item;
},
},
.consuming_list_item => switch (char) {
',' => {
const tip = try getStackTip(stack);
try tip.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
);
item_start = idx + 1;
state = .want_list_item;
},
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
try finished.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
);
state = try popStack(&stack);
},
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => {
item_start = idx;
state = .want_list_item;
},
']' => state = try popStack(&stack),
else => return error.BadToken,
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return error.BadToken,
':' => {
// we have an empty map key
dangling_key = "";
state = .want_map_value;
},
'}' => state = try popStack(&stack),
else => {
item_start = idx;
state = .consuming_map_key;
},
},
.consuming_map_key => switch (char) {
':' => {
dangling_key = try alloc.dupe(u8, contents[item_start..idx]);
state = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
',' => {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
dangling_key = null;
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.putMapGetValue(
&tip.flow_map,
dangling_key.?,
Value.newFlowList(alloc),
dupe_behavior,
);
try stack.append(new_list);
dangling_key = null;
item_start = idx + 1;
state = .want_list_item;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.putMapGetValue(
&tip.flow_map,
dangling_key.?,
Value.newFlowMap(alloc),
dupe_behavior,
);
try stack.append(new_map);
dangling_key = null;
state = .want_map_key;
},
'}' => {
// the value is an empty string and this map is closed
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
dangling_key = null;
state = try popStack(&stack);
},
else => {
item_start = idx;
state = .consuming_map_value;
},
},
.consuming_map_value => switch (char) {
',', '}' => |term| {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
try Value.fromScalar(alloc, contents[item_start..idx]),
dupe_behavior,
);
dangling_key = null;
state = .want_map_key;
if (term == '}') state = try popStack(&stack);
},
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => state = .want_map_key,
'}' => state = try popStack(&stack),
else => return error.BadToken,
},
// the root value was closed but there are characters remaining
// in the buffer
.done => return error.BadState,
}
}
// we ran out of characters while still in the middle of an object
if (state != .done) return error.BadState;
return root;
}
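
// A sketch of calling the flow parser directly (assumes std.testing). Note that
// the leading '[' has already been removed by the line tokenizer, while the
// closing ']' is retained, matching detectInlineItem above.
test "parseFlow sketch: two-item flow list" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    const root = try parseFlow(arena.allocator(), "a, b]", .flow_list, .fail);
    try std.testing.expectEqual(@as(usize, 2), root.flow_list.items.len);
    try std.testing.expectEqualStrings("a", root.flow_list.items[0].scalar.items);
    try std.testing.expectEqualStrings("b", root.flow_list.items[1].scalar.items);
}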
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
try list.append(value);
return &list.items[list.items.len - 1];
}
inline fn putMap(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!void {
_ = try putMapGetValue(map, key, value, dupe_behavior);
}
inline fn putMapGetValue(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!*Value {
const gop = try map.getOrPut(key);
if (gop.found_existing)
switch (dupe_behavior) {
.fail => return error.DuplicateKey,
.use_first => {},
.use_last => gop.value_ptr.* = value,
}
else
gop.value_ptr.* = value;
return gop.value_ptr;
}
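
// A sketch of the duplicate key handling above (assumes std.testing):
// .use_first keeps the original value, .use_last overwrites it, and .fail errors.
test "putMapGetValue sketch: duplicate key behaviors" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    var map = Value.Map.init(arena.allocator());
    _ = try putMapGetValue(&map, "key", try Value.fromScalar(arena.allocator(), "one"), .fail);
    _ = try putMapGetValue(&map, "key", try Value.fromScalar(arena.allocator(), "two"), .use_first);
    try std.testing.expectEqualStrings("one", map.get("key").?.scalar.items);
    _ = try putMapGetValue(&map, "key", try Value.fromScalar(arena.allocator(), "two"), .use_last);
    try std.testing.expectEqualStrings("two", map.get("key").?.scalar.items);
    try std.testing.expectError(
        error.DuplicateKey,
        putMapGetValue(&map, "key", Value.newScalar(arena.allocator()), .fail),
    );
}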
pub fn dumpBufLines(self: *Parser, buf: []const u8) Error!void {
var tok: LineTokenizer(FixedLineBuffer) = .{
    .buffer = FixedLineBuffer.init(buf),
    .diagnostics = &self.diagnostics,
};
while (try tok.next()) |line| {
dumpLine(line);
}
}
fn dumpLine(line: Line) void {
var dedbuf: [64]u8 = .{0} ** 64;
var keybuf: [2048]u8 = .{0} ** 2048;
var valbuf: [2048]u8 = .{0} ** 2048;
const shiftstr = if (line.indent == .dedent)
std.fmt.bufPrint(&dedbuf, " ({d})", .{line.indent.dedent}) catch unreachable
else
"";
std.debug.print("{s}{s}: {s} => {s}\n", .{
@tagName(line.indent), shiftstr, @tagName(line.contents), switch (line.contents) {
.comment => |str| str,
.in_line, .list_item => |scalar| switch (scalar) {
.empty => "[empty]",
.scalar,
.line_string,
.space_string,
.flow_list,
.flow_map,
=> |str| std.fmt.bufPrint(&keybuf, "{s} => {s}", .{ @tagName(scalar), str }) catch unreachable,
},
.map_item => |map| std.fmt.bufPrint(&keybuf, "{s} : {s}", .{
map.key,
switch (map.val) {
.empty => "[empty]",
.scalar,
.line_string,
.space_string,
.flow_list,
.flow_map,
=> |str| std.fmt.bufPrint(&valbuf, "{s} => {s}", .{ @tagName(map.val), str }) catch unreachable,
},
}) catch unreachable,
},
});
}
};
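
// A minimal end-to-end sketch (not part of the original tests; assumes
// std.testing): parse a small document with a scalar value and a nested list.
test "Parser sketch: scalar and nested list" {
    var parser = Parser{ .allocator = std.testing.allocator };
    const doc = try parser.parseBuffer("key: value\nfruit:\n  - apple\n  - banana\n");
    defer doc.deinit();
    try std.testing.expectEqual(@as(usize, 2), doc.root.map.count());
    try std.testing.expectEqualStrings("value", doc.root.map.get("key").?.scalar.items);
    const fruit = doc.root.map.get("fruit").?;
    try std.testing.expectEqual(@as(usize, 2), fruit.list.items.len);
    try std.testing.expectEqualStrings("apple", fruit.list.items[0].scalar.items);
    try std.testing.expectEqualStrings("banana", fruit.list.items[1].scalar.items);
}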