2023-09-13 00:11:45 -07:00
|
|
|
// Heavily inspired by, but not quite compatible with, NestedText. Key differences:
|
|
|
|
//
|
|
|
|
// - Doesn't support multiline keys (this means map keys cannot start with
|
2023-09-17 19:28:07 -07:00
|
|
|
// ' ', \t, #, {, [, |, or >, and they cannot contain :)
|
2023-09-13 00:11:45 -07:00
|
|
|
// - Allows using tabs for indentation (but not mixed tabs/spaces)
|
|
|
|
// - Indentation must be quantized consistently throughout the document. e.g.
|
|
|
|
// every nested layer being exactly 2 spaces past its parent. Tabs may
|
|
|
|
// only use one tab per indentation level.
|
|
|
|
// - Allows flow-style lists, maps, and strings on the same line as map keys or
|
|
|
|
// list items (i.e. the following are legal):
|
|
|
|
//
|
|
|
|
// key: {inline: map}
|
|
|
|
// key: [inline, list]
|
|
|
|
// key: > inline string
|
|
|
|
// - {map: item}
|
|
|
|
// - [list, item]
|
|
|
|
// - > inline string
|
|
|
|
//
|
|
|
|
// The string case retains the possibility of having an inline map value starting
|
|
|
|
// with {, [, or >
|
2023-09-14 23:38:24 -07:00
|
|
|
// - inline lists and maps cannot contain other inline structures. This may
|
|
|
|
// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful
|
2023-09-13 00:11:45 -07:00
|
|
|
// - map keys and list item dashes must be followed by a value or an indented
|
|
|
|
// section to reduce parser quantum state. This means that
|
|
|
|
//
|
|
|
|
// foo:
|
|
|
|
// bar: baz
|
|
|
|
//
|
2023-09-14 23:38:24 -07:00
|
|
|
// or
|
2023-09-13 00:11:45 -07:00
|
|
|
//
|
|
|
|
// -
|
|
|
|
// - qux
|
|
|
|
//
|
|
|
|
// are not valid. This can be represented with an inline empty string after foo:
|
|
|
|
//
|
|
|
|
// foo: >
|
|
|
|
// bar: baz
|
|
|
|
//
|
|
|
|
// or
|
|
|
|
//
|
|
|
|
// - >
|
|
|
|
// - qux
|
|
|
|
//
|
|
|
|
// - newlines are strictly LF, if the parser finds CR, it is an error
|
|
|
|
// - blank lines may not contain any whitespace characters except the single LF
|
|
|
|
// - Additional string indicator `|` for soft-wrapped strings, i.e.
|
|
|
|
//
|
|
|
|
// key: | this is not special
|
|
|
|
// key:
|
|
|
|
// | these lines are
|
|
|
|
// | soft-wrapped
|
2023-09-14 23:38:24 -07:00
|
|
|
//
|
2023-09-13 00:11:45 -07:00
|
|
|
// soft-wrapped lines are joined with a ' ' instead of a newline character.
|
|
|
|
// Like multiline strings, the final space is stripped (I guess this is a very
|
|
|
|
// janky way to add trailing whitespace to a string).
|
|
|
|
//
|
2023-09-17 23:09:26 -07:00
|
|
|
// - terminated strings to allow trailing whitespace:
|
|
|
|
// | this string has trailing whitespace |
|
|
|
|
// > and so does this one |
|
2023-09-13 00:11:45 -07:00
|
|
|
// - The parser is both strict and probably sloppy and may have weird edge
|
2023-09-14 23:38:24 -07:00
|
|
|
// cases since I'm slinging code, not writing a spec. For example, tabs are
|
|
|
|
// not trimmed from the values of inline lists/maps
|
2023-09-13 00:11:45 -07:00
|
|
|
|
|
|
|
const std = @import("std");
|
|
|
|
|
2023-09-21 23:34:17 -07:00
|
|
|
/// A window into a backing buffer, expressed as a start offset plus a length.
pub const IndexSlice = struct { start: usize, len: usize };
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
/// Location and description of a parsing problem, for error reporting.
/// NOTE(review): nothing in this file populates these fields yet (the
/// tokenizer holds a `*Diagnostics` but never assigns to it) — confirm the
/// intended wiring before relying on them.
pub const Diagnostics = struct {
    /// Row (line) number where the problem was found.
    row: usize,
    /// Byte position of the problem: absolute offset into the input, offset
    /// within the line, and length of the offending span.
    span: struct { absolute: usize, line_offset: usize, length: usize },
    /// Human-readable description of the problem.
    message: []const u8,
};
|
|
|
|
|
2023-09-21 23:34:17 -07:00
|
|
|
/// A growable, allocator-backed line buffer. Arbitrary chunks of input are
/// pushed in via `feed`, and complete newline-terminated lines are pulled
/// out via `nextLine`. The scan window tracks the region of `buffer` that
/// has been fed but not yet consumed.
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    /// Backing storage. May be reallocated (and therefore moved) by `feed`.
    buffer: []u8,
    /// Number of bytes of `buffer` that contain valid (fed) data.
    used: usize,
    /// The unconsumed region of `buffer`.
    window: IndexSlice,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    /// Allocate a line buffer with the default capacity.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    /// Allocate a line buffer with the given initial capacity, in bytes.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Append a chunk of input to the scan window, growing or compacting
    /// the buffer as needed. Invalidates any slice previously returned by
    /// `nextLine`.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;
        // TODO: check for usize overflow here if we want Maximum Robustness
        const new_window_len = self.window.len + data.len;

        // data cannot fit in the buffer with our scan window, so we have to realloc
        if (new_window_len > self.buffer.len) {
            // TODO: adopt an overallocation strategy? Will potentially avoid allocating
            // on every invocation but will cause the buffer to oversize
            // BUG FIX: realloc returns the new (possibly moved) slice; the
            // previous code discarded it, leaving `self.buffer` pointing at
            // freed memory whenever the allocation moved.
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);

            self.rehome();
            @memcpy(self.buffer[self.used..].ptr, data);
            self.used = new_window_len;
            self.window.len = new_window_len;
        }
        // data will fit, but needs to be moved in the buffer
        else if (self.window.start + new_window_len > self.buffer.len) {
            self.rehome();
            @memcpy(self.buffer[self.used..].ptr, data);
            self.used = new_window_len;
            self.window.len = new_window_len;
        }
        // data can simply be appended
        else {
            @memcpy(self.buffer[self.used..].ptr, data);
            // BUG FIX: the appended bytes must be accounted for, otherwise
            // they remain invisible to `nextLine`. (`used` may differ from
            // the window length here because the window may not start at 0.)
            self.used += data.len;
            self.window.len = new_window_len;
        }
    }

    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        // Only complete lines are returned; a partial trailing line stays in
        // the window until more data is fed.
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        self.window.start += split + 1;
        self.window.len -= split + 1;

        return window[0..split];
    }

    /// Compact the buffer by moving the live window back to offset 0,
    /// reclaiming the space occupied by already-consumed lines.
    fn rehome(self: *LineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];

        // copyForwards tolerates the source/destination overlap that occurs
        // when the window is longer than the gap being reclaimed; otherwise
        // the regions are disjoint and a plain memcpy suffices.
        if (self.window.len > self.window.start)
            std.mem.copyForwards(u8, self.buffer, window)
        else
            @memcpy(self.buffer.ptr, window);

        self.window.start = 0;
        self.used = window.len;
    }
};
|
|
|
|
|
|
|
|
/// A read-only line scanner over a complete in-memory document. Companion
/// to `LineBuffer` for when the whole input is already available.
pub const FixedLineBuffer = struct {
    buffer: []const u8,
    window: IndexSlice,

    /// Wrap `data` for line-by-line scanning. `data` is borrowed, not copied.
    pub fn init(data: []const u8) FixedLineBuffer {
        return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } };
    }

    /// Return the next newline-terminated line, without the newline itself,
    /// or null when no complete line remains in the window.
    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len)
            return null;

        const remaining = self.buffer[self.window.start..][0..self.window.len];
        const newline_idx = std.mem.indexOfScalar(u8, remaining, '\n') orelse return null;

        const consumed = newline_idx + 1;
        self.window.start += consumed;
        self.window.len -= consumed;

        return remaining[0..newline_idx];
    }
};
|
|
|
|
|
|
|
|
/// The indentation style of the document, as detected from the input.
const IndentationType = union(enum) {
    /// No indentation has been seen yet; nothing is decided.
    immaterial: void,
    /// Space indentation. Payload is the detected quantum (spaces per
    /// level); it is 0 until the first space-indented line fixes it.
    spaces: usize,
    /// Tab indentation (one tab per level; no quantum needed).
    tabs: void,
};
|
|
|
|
|
|
|
|
/// The value portion of a single line: whatever follows a map key, a list
/// dash, or stands alone on the line. All slice payloads alias the input.
const InlineItem = union(enum) {
    /// No value present on the line.
    empty: void,
    /// A bare (unquoted, unintroduced) value.
    scalar: []const u8,
    /// A `>`-introduced string piece; pieces are joined with '\n'.
    line_string: []const u8,
    /// A `|`-introduced (soft-wrapped) string piece; pieces are joined with ' '.
    space_string: []const u8,

    /// Contents of an inline `[...]` list, retaining the trailing ']'.
    flow_list: []const u8,
    /// Contents of an inline `{...}` map, retaining the trailing '}'.
    flow_map: []const u8,

    /// The join character used when accumulating multiline strings. Only
    /// meaningful for the string variants; calling it on anything else is a
    /// programmer error (`unreachable`).
    fn lineEnding(self: InlineItem) u8 {
        return switch (self) {
            .line_string => '\n',
            .space_string => ' ',
            else => unreachable,
        };
    }
};
|
|
|
|
|
|
|
|
/// Classification of a single line's contents after indentation is stripped.
const LineContents = union(enum) {
    /// A `#` comment line; payload is everything after the '#'.
    comment: []const u8,

    /// A bare value occupying the whole line (no key, no dash).
    in_line: InlineItem,
    /// A `-` list entry; payload is the item after the dash.
    list_item: InlineItem,
    /// A `key: value` map entry.
    map_item: struct { key: []const u8, val: InlineItem },
};
|
|
|
|
|
|
|
|
// we can dedent multiple levels at once. Example:
|
|
|
|
//
|
|
|
|
// foo:
|
|
|
|
// bar:
|
|
|
|
// > a
|
|
|
|
// > string
|
|
|
|
// baz: [qux]
|
|
|
|
//
|
|
|
|
// capturing this is conceptually simple, but implementing it without complex
|
|
|
|
// indentation tracking requires quantizing the indentation. This means our
|
|
|
|
// IndentationType will also need to track the number of spaces used for
|
|
|
|
// indentation, as detected. Then every line we have to check indent rem the
|
|
|
|
// quantization level == 0 (otherwise we broke quantization) and compute indent
|
|
|
|
// div the quantization level to give us our effective indentation level.
|
|
|
|
|
|
|
|
/// Direction of indentation change between consecutive lines.
const ShiftDirection = enum { indent, dedent, none };

/// A line's indentation relative to the previous line. A dedent may pop
/// several levels at once (see the quantization discussion above), hence
/// the payload.
const RelativeIndent = union(ShiftDirection) {
    /// Exactly one quantized level deeper (more than one is an error).
    indent: void,
    /// Number of quantized levels removed.
    dedent: usize,
    /// Same level as the previous line.
    none: void,
};
|
|
|
|
|
|
|
|
/// One tokenized line: its indentation relative to the previous line, its
/// classified contents, and the raw text with leading indentation stripped.
const Line = struct {
    indent: RelativeIndent,
    contents: LineContents,
    /// The line text after the indentation prefix (aliases the input).
    raw: []const u8,
};
|
|
|
|
|
|
|
|
/// Returns a line tokenizer over `Buffer`, which must provide
/// `nextLine() ?[]const u8` (satisfied by LineBuffer and FixedLineBuffer).
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        /// Detected indentation style; starts undecided.
        indentation: IndentationType = .immaterial,
        /// Quantized indentation level of the previously returned line.
        last_indent: usize = 0,
        /// NOTE(review): held but never written in this file — confirm how
        /// diagnostics are meant to be populated.
        diagnostics: *Diagnostics,
        /// Count of lines consumed so far.
        row: usize = 0,

        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
        };

        /// Consume and classify the next line. Returns null when the buffer
        /// has no complete line left. Empty lines are skipped; blank lines
        /// containing whitespace are an error.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                // Measure the leading indentation and validate its style
                // against what has been seen so far.
                var indent: usize = 0;
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .immaterial => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .immaterial => self.indentation = .tabs,
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                        },
                        // carriage returns are forbidden anywhere
                        '\r' => {
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // The for-else runs when no non-whitespace byte broke the
                    // loop: the line is all whitespace. Blank lines must be
                    // completely empty.
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // Convert the raw indentation width to a quantized level
                // count. The first space-indented line fixes the quantum
                // (spaces-per-level) for the rest of the document.
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                // A line may indent at most one level past its parent, but
                // may dedent several levels at once.
                const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // Runs on every return path below, including the comment
                // branch (which overwrites `quantized` first).
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return .{
                            .indent = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        // an explicit string or flow collection at line start
                        return .{
                            .indent = relative,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        // a list dash must stand alone or be followed by ' '
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        return if (line.len == 1) .{
                            .indent = relative,
                            .contents = .{ .list_item = .empty },
                            .raw = line,
                        } else .{
                            .indent = relative,
                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        // Scan for a map separator; the first ':' wins since
                        // keys may not contain one.
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                // `key:` with nothing after the colon
                                if (idx + 1 == line.len) return .{
                                    .indent = relative,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };

                                // the separator must be exactly ": "
                                if (line[idx + 1] != ' ') return error.BadToken;

                                return .{
                                    .indent = relative,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
                            }
                        }

                        // no ':' found: the whole line is a bare scalar
                        return .{
                            .indent = relative,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line (what follows `- ` or
        /// `key: `, or the whole line for an in-line value). All returned
        /// slices alias `buf`.
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    // the introducer must stand alone or be followed by ' '
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        // terminated string: drop the trailing '|', keeping
                        // everything (including whitespace) before it
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };

                    return if (char == '>')
                        .{ .line_string = slice }
                    else
                        .{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return .{ .scalar = buf };
                },
            }
        }
    };
}
|
2023-09-13 00:11:45 -07:00
|
|
|
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
pub const Value = union(enum) {
|
|
|
|
pub const String = std.ArrayList(u8);
|
config: use std.StringArrayHashMap for the map type
As I was thinking about this, I realized that data serialization is
much more of a bear than deserialization. Or, more accurately, trying
to make stable round trip serialization a goal puts heavier demands on
deserialization, including preserving input order.
I think there may be a mountain hiding under this molehill, though,
because the goals of having a format that is designed to be
handwritten and also machine written are at odds with each other.
Right now, the parser does not preserve comments at all. But even if
we did (they could easily become a special type of string), comment
indentation is ignored. Comments are not directly a child of any other
part of the document, they're awkward text that exists interspersed
throughout it.
With the current design, there are some essentially unsolvable
problems, like comments interspersed throughout multiline strings. The
string is processed into a single object in the output, so there can't
be weird magic data interleaved with it because it loses the concept
of being interleaved entirely (this is a bigger issue for space
strings, which don't even preserve a unique way to reserialize them.
Line strings at least contain a character (the newline) that can
appear nowhere else but at a break in the string). Obviously this isn't
technically impossible, but it would require a change to the way that
values are modeled.
And even if we did take the approach of associating a comment with,
say, the value that follows it (which I think is a reasonable thing to
do, ignoring the interleaved comment situation described above), if
software reads in data, changes it, and writes it back out, how do we
account for deleted items? Does the comment get deleted with the item?
Does it become a dangling comment that just gets shoved somewhere in
the document? How are comments that come after everything else in the
document handled?
From a pure data perspective, it's fairly obvious why JSON omits
comments: they're trivial to parse, but there's not a strategy for
emitting them that will always be correct, especially in a format that
doesn't give a hoot about linebreaks. It may be interesting to look at
fancy TOML (barf) parsers to see how they handle comments, though I
assume the general technique is to store their row position in the
original document and track when a line is added or removed.
Ultimately, I think the use case of a format to be written by humans
and read by computers is still useful. That's my intended use case for
this and why I started it, but its application as a configuration file
format is probably hamstrung muchly by software not being able to
write it back. On the other hand, there's a lot of successful software
I use where the config files are not written directly by the software
at all, so maybe it's entirely fine to declare this as being out of
scope and not worrying about it further. At the very least it's almost
certainly less of an issue than erroring on carriage returns. Also the
fact that certain keys are simply unrepresentable.
As a side note, I guess what they say about commit message length being
inversely proportional to the change length is true. Hope you enjoyed
the blog over this 5 character change.
2023-09-22 00:30:08 -07:00
|
|
|
pub const Map = std.StringArrayHashMap(Value);
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
pub const List = std.ArrayList(Value);
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
pub const TagType = @typeInfo(Value).Union.tag_type.?;
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
scalar: String,
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
/// Explicit string contents. Same payload type as the scalar variant, but kept
/// under a distinct tag so an explicit string (e.g. `> null`) is never coerced
/// the way a bare scalar is.
string: String,
|
|
|
|
/// Block-style (indented) list of child Values.
list: List,
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessarily exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to choose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Flow-style (`[a, b]`) list. Same payload type as `list`; the separate tag
/// records flow formatting so it can be preserved when serializing.
flow_list: List,
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
/// Block-style (indented) map of child Values.
map: Map,
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Flow-style (`{key: value}`) map. Same payload type as `map`; the separate
/// tag records flow formatting so it can be preserved when serializing.
flow_map: Map,
|
|
|
|
|
|
|
|
/// Build a `.scalar` Value by copying `input` into freshly allocated storage.
/// Caller owns the result; allocation failure propagates.
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
    const result = try _fromScalarOrString(alloc, .scalar, input);
    return result;
}
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
|
|
|
|
/// Build a `.string` Value by copying `input` into freshly allocated storage.
/// Tagged `.string` (not `.scalar`) so explicit strings keep their identity.
/// Caller owns the result; allocation failure propagates.
pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
    const result = try _fromScalarOrString(alloc, .string, input);
    return result;
}
|
|
|
|
|
|
|
|
/// Shared backing for `fromScalar`/`fromString`: allocates a String with
/// capacity for exactly `input.len` bytes, copies `input` into it, and wraps
/// it in the union field selected at comptime by `classification`.
inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
    // Comptime-known field name drives both the union init and the field access.
    const field_name = @tagName(classification);
    var result = @unionInit(Value, field_name, try String.initCapacity(alloc, input.len));
    @field(result, field_name).appendSliceAssumeCapacity(input);
    return result;
}
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Create an empty `.scalar` Value whose backing String uses `alloc`.
pub inline fn newScalar(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .scalar = String.init(alloc) };
    return value;
}
|
|
|
|
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
/// Create an empty `.string` Value whose backing String uses `alloc`.
pub inline fn newString(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .string = String.init(alloc) };
    return value;
}
|
|
|
|
|
|
|
|
/// Create an empty block-style `.list` Value backed by `alloc`.
pub inline fn newList(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .list = List.init(alloc) };
    return value;
}
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Create an empty flow-style `.flow_list` Value backed by `alloc`.
pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .flow_list = List.init(alloc) };
    return value;
}
|
|
|
|
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
/// Create an empty block-style `.map` Value backed by `alloc`.
pub inline fn newMap(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .map = Map.init(alloc) };
    return value;
}
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Create an empty flow-style `.flow_map` Value backed by `alloc`.
pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
    const value: Value = .{ .flow_map = Map.init(alloc) };
    return value;
}
|
|
|
|
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
/// Dump this Value to stderr for debugging, then emit a trailing newline.
pub fn printDebug(self: Value) void {
    // Explicit namespaced call; equivalent to `self.printRecursive(0)`.
    printRecursive(self, 0);
    std.debug.print("\n", .{});
}
|
|
|
|
|
|
|
|
fn printRecursive(self: Value, indent: usize) void {
|
|
|
|
switch (self) {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.scalar, .string => |str| {
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| {
|
|
|
|
var lines = std.mem.splitScalar(u8, str.items, '\n');
|
|
|
|
std.debug.print("\n", .{});
|
|
|
|
while (lines.next()) |line| {
|
|
|
|
std.debug.print(
|
|
|
|
"{[empty]s: >[indent]}{[line]s}{[nl]s}",
|
|
|
|
.{
|
|
|
|
.empty = "",
|
|
|
|
.indent = indent,
|
|
|
|
.line = line,
|
|
|
|
.nl = if (lines.peek() == null) "" else "\n",
|
|
|
|
},
|
|
|
|
);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
std.debug.print("{s}", .{str.items});
|
|
|
|
}
|
|
|
|
},
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessarily exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to choose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.list, .flow_list => |list| {
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, the code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
if (list.items.len == 0) {
|
|
|
|
std.debug.print("[]", .{});
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
std.debug.print("[\n", .{});
|
|
|
|
for (list.items, 0..) |value, idx| {
|
|
|
|
std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx });
|
|
|
|
value.printRecursive(indent + 2);
|
|
|
|
std.debug.print(",\n", .{});
|
|
|
|
}
|
|
|
|
std.debug.print(
|
|
|
|
"{[empty]s: >[indent]}]",
|
|
|
|
.{ .empty = "", .indent = indent },
|
|
|
|
);
|
|
|
|
},
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.map, .flow_map => |map| {
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
if (map.count() == 0) {
|
|
|
|
std.debug.print("{{}}", .{});
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
std.debug.print("{{\n", .{});
|
|
|
|
|
|
|
|
var iter = map.iterator();
|
|
|
|
|
|
|
|
while (iter.next()) |entry| {
|
|
|
|
std.debug.print(
|
|
|
|
"{[empty]s: >[indent]}{[key]s}: ",
|
|
|
|
.{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
|
|
|
|
);
|
|
|
|
entry.value_ptr.printRecursive(indent + 4);
|
|
|
|
std.debug.print(",\n", .{});
|
|
|
|
}
|
|
|
|
std.debug.print(
|
|
|
|
"{[empty]s: >[indent]}}}",
|
|
|
|
.{ .empty = "", .indent = indent },
|
|
|
|
);
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
pub const Parser = struct {
    /// Backing allocator handed to each parsed Document's arena.
    allocator: std.mem.Allocator,
    /// Policy applied when a map key appears more than once.
    dupe_behavior: DuplicateKeyBehavior = .fail,
    /// NOTE(review): appears to select the fallback object kind for a
    /// degenerate document — confirm against the finalizer.
    default_object: DefaultObject = .fail,
    /// Last-error location/message, updated as the tokenizer and parser
    /// run. Starts out as a benign placeholder.
    diagnostics: Diagnostics = .{
        .row = 0,
        .span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
        .message = "all is well",
    },
|
|
|
|
|
|
|
|
/// Every error the parser itself can produce, merged with the tokenizer's
/// error set and allocator failure.
pub const Error = error{
    /// A line was indented where no deeper nesting is possible.
    UnexpectedIndent,
    /// A value appeared where the current state cannot accept one.
    UnexpectedValue,
    /// Content was found after the document was already complete.
    ExtraContent,
    /// The input contained no document at all.
    EmptyDocument,
    /// A map key was repeated and dupe_behavior is .fail.
    DuplicateKey,
    BadMapEntry,
    /// Internal state-machine inconsistency.
    BadState,
    BadToken,
    /// Generic failure (see DefaultObject.fail / DuplicateKeyBehavior.fail).
    Fail,
} || LineTokenizer(FixedLineBuffer).Error || std.mem.Allocator.Error;
|
2023-09-13 00:11:45 -07:00
|
|
|
|
|
|
|
/// Policy for handling a map key that appears more than once in a document.
pub const DuplicateKeyBehavior = enum {
    /// Keep the value from the first occurrence of the key.
    use_first,
    /// Keep the value from the last occurrence of the key.
    use_last,
    /// Treat the duplicate as an error (error.DuplicateKey).
    fail,
};
|
|
|
|
|
|
|
|
/// NOTE(review): appears to choose what kind of value an empty or
/// ambiguous document should default to — confirm against the finalizer.
pub const DefaultObject = enum {
    string,
    list,
    map,
    /// Do not default; report an error instead.
    fail,
};
|
|
|
|
|
|
|
|
/// Top-level states of the line-oriented parsing state machine.
pub const ParseState = enum {
    /// No content consumed yet; the next line starts the document root.
    initial,
    /// A root container/string exists and subsequent lines extend it.
    value,
    /// The document is complete; any further content is an error.
    done,
};
|
|
|
|
|
|
|
|
/// A parsed document: a value tree whose memory is owned by one arena.
pub const Document = struct {
    /// Arena that owns every allocation made while building `root`.
    arena: std.heap.ArenaAllocator,
    /// Root of the value tree. Left `undefined` by `init`; the parser
    /// must assign it before the document is read.
    root: Value,

    /// Create an empty document backed by a fresh arena over `alloc`.
    /// `root` is deliberately left uninitialized.
    pub fn init(alloc: std.mem.Allocator) Document {
        return .{
            .arena = std.heap.ArenaAllocator.init(alloc),
            .root = undefined,
        };
    }

    /// Dump the value tree (delegates to the root value's debug printer).
    pub fn printDebug(self: Document) void {
        return self.root.printDebug();
    }

    /// Free every allocation owned by this document in one shot.
    pub fn deinit(self: Document) void {
        self.arena.deinit();
    }
};
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessarily exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to choose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
/// Mutable parser state bundled for the (future) reentrant interface.
pub const State = struct {
    /// Stack of pointers into the document being built; the top is the
    /// container currently receiving values.
    pub const Stack = std.ArrayList(*Value);

    document: Document,
    value_stack: Stack,
    state: ParseState = .initial,
    /// Whether the next line is required to indent/dedent.
    expect_shift: ShiftDirection = .none,
    /// A map key seen on its own line, held until its value arrives.
    dangling_key: ?[]const u8 = null,

    /// Initialize with an empty document and an empty stack, both
    /// drawing from `alloc`.
    pub fn init(alloc: std.mem.Allocator) State {
        return .{
            .document = Document.init(alloc),
            .value_stack = Stack.init(alloc),
        };
    }

    /// Release the stack only. NOTE(review): `document` is not deinited
    /// here — presumably ownership of the finished document transfers to
    /// the caller; confirm at the call site.
    pub fn deinit(self: State) void {
        self.value_stack.deinit();
    }
};
|
|
|
|
|
2023-09-14 23:38:24 -07:00
|
|
|
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
|
config: start doing some code cleanup
I was pretty sloppy with the code organization while writing out the
state machines because my focus was on thinking through the parsing
process and logic there. However, The code was not in good shape to
continue implementing code features (not document features). This is
the first of probably several commits that will work on cleaning up
some things.
Value has been promoted to the top level namespace, and Document has an
initializer function. Referencing Value.List and Value.Map are much
cleaner now. Type aliases are good.
For the flow parser, `popStack` does not have to access anything except
the current stack. This can be passed in as a parameter. This means
that `parse` is ready to be refactored to take a buffer and an
allocator.
The main next steps for code improvement are:
1. reentrant/streaming parser. I am planning to leave it as
line-buffered, though I could go further. Line-buffered has two main
benefits: the tokenizer doesn't need to be refactored significantly,
and the flow parser doesn't need to be made reentrant. I may
reevaluate this as I am implementing it, however, as those changes
may be simpler than I think.
2. Actually implement the error diagnostics info. I have some skeleton
structure in place for this, so it should just be doing the work of
getting it hooked up.
3. Parse into object. Metaprogramming, let's go. It will be interesting
to try to do this non-recursively, as well (curious to see if it
results in code bloat).
4. Object to Document. This is probably going to be annoying, since
there are a variety of edge cases that will have to be handled. And
lots of objects that cannot be represented as documents.
5. Serialize Document. One thing the parser does not preserve is
whether a Value was flow-style or not, so it will be impossible to
do round-trip formatting preservation. That's currently a non-goal,
and I haven't decided yet if flow-style output should be based on
some heuristic (number/length of values in container) or just never
emitted. Lack of round-trip preservation does make using this as a
general purpose config format a lot more dubious, so I will have to
think about this some more.
6. Document to JSON. Why not? I will hand roll this and it will suck.
And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
|
|
|
var document = Document.init(self.allocator);
|
2023-09-13 00:11:45 -07:00
|
|
|
errdefer document.deinit();
|
|
|
|
const arena_alloc = document.arena.allocator();
|
|
|
|
|
|
|
|
var state: ParseState = .initial;
|
2023-09-21 23:34:17 -07:00
|
|
|
var expect_shift: ShiftDirection = .none;
|
2023-09-17 19:28:07 -07:00
|
|
|
var dangling_key: ?[]const u8 = null;
|
2023-09-13 00:11:45 -07:00
|
|
|
var stack = std.ArrayList(*Value).init(arena_alloc);
|
|
|
|
defer stack.deinit();
|
|
|
|
|
2023-09-21 23:34:17 -07:00
|
|
|
var tok: LineTokenizer(FixedLineBuffer) = .{
|
|
|
|
.buffer = FixedLineBuffer.init(buffer),
|
|
|
|
.diagnostics = &self.diagnostics,
|
|
|
|
};
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
while (try tok.next()) |line| {
|
|
|
|
if (line.contents == .comment) continue;
|
|
|
|
|
|
|
|
var flip = true;
|
|
|
|
var flop = false;
|
|
|
|
// this is needed to give us a second go round when the line is dedented
|
|
|
|
flipflop: while (flip) : (flop = true) {
|
|
|
|
switch (state) {
|
|
|
|
.initial => {
|
|
|
|
if (line.indent == .indent) return error.UnexpectedIndent;
|
|
|
|
|
|
|
|
switch (line.contents) {
|
|
|
|
// we filter out comments above
|
|
|
|
.comment => unreachable,
|
|
|
|
.in_line => |in_line| switch (in_line) {
|
|
|
|
// empty scalars are only emitted for a list_item or a map_item
|
|
|
|
.empty => unreachable,
|
|
|
|
.scalar => |str| {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
document.root = try Value.fromScalar(arena_alloc, str);
|
2023-09-17 23:09:26 -07:00
|
|
|
// this is a cheesy hack. If the document consists
|
|
|
|
// solely of a scalar, the finalizer will try to
|
|
|
|
// chop a line ending off of it, so we need to add
|
|
|
|
// a sacrificial padding character to avoid
|
|
|
|
// chopping off something that matters.
|
|
|
|
try document.root.string.append(' ');
|
2023-09-13 00:11:45 -07:00
|
|
|
state = .done;
|
|
|
|
},
|
2023-09-17 23:09:26 -07:00
|
|
|
.line_string, .space_string => |str| {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
document.root = try Value.fromString(arena_alloc, str);
|
2023-09-17 23:09:26 -07:00
|
|
|
try document.root.string.append(in_line.lineEnding());
|
2023-09-14 23:38:24 -07:00
|
|
|
try stack.append(&document.root);
|
|
|
|
state = .value;
|
|
|
|
},
|
2023-09-13 00:11:45 -07:00
|
|
|
.flow_list => |str| {
|
2023-09-23 13:29:49 -07:00
|
|
|
document.root = try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior);
|
2023-09-13 00:11:45 -07:00
|
|
|
state = .done;
|
|
|
|
},
|
|
|
|
.flow_map => |str| {
|
2023-09-23 13:29:49 -07:00
|
|
|
document.root = try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior);
|
2023-09-13 00:11:45 -07:00
|
|
|
state = .done;
|
|
|
|
},
|
|
|
|
},
|
|
|
|
.list_item => |value| {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
document.root = Value.newList(arena_alloc);
|
2023-09-13 00:11:45 -07:00
|
|
|
try stack.append(&document.root);
|
2023-09-23 14:17:31 -07:00
|
|
|
state = .value;
|
2023-09-13 00:11:45 -07:00
|
|
|
|
|
|
|
switch (value) {
|
2023-09-23 14:17:31 -07:00
|
|
|
.empty => expect_shift = .indent,
|
|
|
|
.scalar => |str| try document.root.list.append(try Value.fromScalar(arena_alloc, str)),
|
|
|
|
.line_string, .space_string => |str| try document.root.list.append(try Value.fromString(arena_alloc, str)),
|
|
|
|
.flow_list => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
|
|
|
|
.flow_map => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
|
2023-09-13 00:11:45 -07:00
|
|
|
}
|
|
|
|
},
|
|
|
|
.map_item => |pair| {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
document.root = Value.newMap(arena_alloc);
|
2023-09-13 00:11:45 -07:00
|
|
|
try stack.append(&document.root);
|
2023-09-23 14:17:31 -07:00
|
|
|
state = .value;
|
2023-09-13 00:11:45 -07:00
|
|
|
|
2023-09-23 14:17:31 -07:00
|
|
|
const dupekey = try arena_alloc.dupe(u8, pair.key);
|
2023-09-13 00:11:45 -07:00
|
|
|
switch (pair.val) {
|
|
|
|
.empty => {
|
|
|
|
expect_shift = .indent;
|
|
|
|
// If the key is on its own line, we don't have
|
|
|
|
// an associated value until we parse the next
|
|
|
|
// line. We need to store a reference to this
|
|
|
|
// key somewhere until we can consume the
|
|
|
|
// value. More parser state to lug along.
|
|
|
|
|
2023-09-23 14:17:31 -07:00
|
|
|
dangling_key = dupekey;
|
2023-09-13 00:11:45 -07:00
|
|
|
},
|
2023-09-23 14:17:31 -07:00
|
|
|
.scalar => |str| try document.root.map.put(dupekey, try Value.fromScalar(arena_alloc, str)),
|
|
|
|
.line_string, .space_string => |str| try document.root.map.put(dupekey, try Value.fromString(arena_alloc, str)),
|
|
|
|
.flow_list => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
|
|
|
|
.flow_map => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
|
2023-09-13 00:11:45 -07:00
|
|
|
}
|
|
|
|
},
|
|
|
|
}
|
|
|
|
},
|
2023-09-14 23:38:24 -07:00
|
|
|
.value => switch (stack.getLast().*) {
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
// these three states are never reachable here. flow_list and
|
|
|
|
// flow_map are parsed with a separate state machine. These
|
2023-09-23 01:07:04 -07:00
|
|
|
// value types can only be present by themselves as the first
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
// line of the document, in which case the document consists
|
|
|
|
// only of that single line: this parser jumps immediately into
|
|
|
|
// the .done state, bypassing the .value state in which this
|
|
|
|
// switch is embedded.
|
|
|
|
.scalar, .flow_list, .flow_map => unreachable,
|
2023-09-13 00:11:45 -07:00
|
|
|
.string => |*string| {
|
2023-09-17 23:09:26 -07:00
|
|
|
if (line.indent == .indent)
|
|
|
|
return error.UnexpectedIndent;
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
if (!flop and line.indent == .dedent) {
|
2023-09-17 23:09:26 -07:00
|
|
|
// kick off the last trailing space or newline
|
2023-09-17 19:28:07 -07:00
|
|
|
_ = string.pop();
|
2023-09-13 00:11:45 -07:00
|
|
|
|
|
|
|
var dedent_depth = line.indent.dedent;
|
|
|
|
while (dedent_depth > 0) : (dedent_depth -= 1)
|
|
|
|
_ = stack.pop();
|
|
|
|
|
|
|
|
continue :flipflop;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (line.contents) {
|
|
|
|
.comment => unreachable,
|
|
|
|
.in_line => |in_line| switch (in_line) {
|
|
|
|
.empty => unreachable,
|
2023-09-17 23:09:26 -07:00
|
|
|
.line_string, .space_string => |str| {
|
2023-09-14 23:38:24 -07:00
|
|
|
try string.appendSlice(str);
|
2023-09-17 23:09:26 -07:00
|
|
|
try string.append(in_line.lineEnding());
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
2023-09-13 00:11:45 -07:00
|
|
|
else => return error.UnexpectedValue,
|
|
|
|
},
|
|
|
|
else => return error.UnexpectedValue,
|
|
|
|
}
|
|
|
|
},
|
|
|
|
.list => |*list| {
|
2023-09-17 23:09:26 -07:00
|
|
|
// detect that the previous item was actually empty
|
|
|
|
//
|
|
|
|
// -
|
|
|
|
// - something
|
|
|
|
//
|
|
|
|
// the first line here creates the expect_shift, but the second line
|
|
|
|
// is a valid continuation of the list despite not being indented
|
2023-09-23 01:07:04 -07:00
|
|
|
if (!flop and (expect_shift == .indent and line.indent != .indent))
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
try list.append(Value.newScalar(arena_alloc));
|
2023-09-14 23:38:24 -07:00
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
// Consider:
|
|
|
|
//
|
2023-09-17 23:09:26 -07:00
|
|
|
// -
|
|
|
|
// own-line scalar
|
|
|
|
// - inline scalar
|
2023-09-13 00:11:45 -07:00
|
|
|
//
|
|
|
|
// the own-line scalar will not push the stack but the next list item will be a dedent
|
|
|
|
if (!flop and line.indent == .dedent) {
|
|
|
|
// if line.indent.dedent is 1 and we're expecting it, the stack will not be popped,
|
|
|
|
// but we will continue loop flipflop. However, flop will be set to false on the next
|
|
|
|
// trip, so this if prong will not be run again.
|
|
|
|
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
|
|
|
|
|
|
|
|
while (dedent_depth > 0) : (dedent_depth -= 1)
|
|
|
|
_ = stack.pop();
|
|
|
|
|
|
|
|
continue :flipflop;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (line.contents) {
|
|
|
|
.comment => unreachable,
|
|
|
|
.in_line => |in_line| {
|
|
|
|
// assert that this line has been indented. this is required for an inline value when
|
|
|
|
// the stack is in list mode.
|
2023-09-14 23:38:24 -07:00
|
|
|
if (expect_shift != .indent or line.indent != .indent)
|
|
|
|
return error.UnexpectedValue;
|
2023-09-13 00:11:45 -07:00
|
|
|
|
2023-09-14 23:38:24 -07:00
|
|
|
expect_shift = .dedent;
|
2023-09-13 00:11:45 -07:00
|
|
|
switch (in_line) {
|
|
|
|
.empty => unreachable,
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
|
2023-09-23 13:29:49 -07:00
|
|
|
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
|
|
|
|
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
|
2023-09-17 23:09:26 -07:00
|
|
|
.line_string, .space_string => |str| {
|
2023-09-13 00:11:45 -07:00
|
|
|
// string pushes the stack
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
|
2023-09-23 01:07:04 -07:00
|
|
|
try stack.append(new_string);
|
2023-09-14 23:38:24 -07:00
|
|
|
|
2023-09-17 23:09:26 -07:00
|
|
|
try new_string.string.append(in_line.lineEnding());
|
2023-09-13 00:11:45 -07:00
|
|
|
expect_shift = .none;
|
|
|
|
},
|
2023-09-14 23:38:24 -07:00
|
|
|
}
|
|
|
|
},
|
|
|
|
.list_item => |value| {
|
2023-09-23 01:07:04 -07:00
|
|
|
if (flop or (line.indent == .none or line.indent == .dedent)) {
|
|
|
|
expect_shift = .none;
|
|
|
|
switch (value) {
|
|
|
|
.empty => expect_shift = .indent,
|
|
|
|
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
|
|
|
|
.line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
|
2023-09-23 13:29:49 -07:00
|
|
|
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
|
|
|
|
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
|
2023-09-23 01:07:04 -07:00
|
|
|
}
|
|
|
|
} else if (line.indent == .indent) {
|
|
|
|
if (expect_shift != .indent) return error.UnexpectedIndent;
|
|
|
|
|
|
|
|
const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
|
|
|
|
try stack.append(new_list);
|
|
|
|
expect_shift = .none;
|
|
|
|
continue :flipflop;
|
|
|
|
} else unreachable;
|
2023-09-13 00:11:45 -07:00
|
|
|
},
|
2023-09-23 01:07:04 -07:00
|
|
|
.map_item => {
|
2023-09-14 23:38:24 -07:00
|
|
|
// this prong cannot be hit on dedent in a valid way.
|
|
|
|
//
|
|
|
|
// -
|
|
|
|
// map: value
|
|
|
|
// second: value
|
|
|
|
// third: value
|
|
|
|
//
|
|
|
|
// dedenting back to the list stack level requires list_item
|
|
|
|
|
|
|
|
if (line.indent != .indent)
|
|
|
|
return error.UnexpectedValue;
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
|
2023-09-14 23:38:24 -07:00
|
|
|
try stack.append(new_map);
|
|
|
|
expect_shift = .none;
|
2023-09-23 01:07:04 -07:00
|
|
|
continue :flipflop;
|
2023-09-13 00:11:45 -07:00
|
|
|
},
|
|
|
|
}
|
|
|
|
},
|
2023-09-14 23:38:24 -07:00
|
|
|
.map => |*map| {
|
2023-09-17 23:09:26 -07:00
|
|
|
// detect that the previous item was actually empty
|
|
|
|
//
|
|
|
|
// foo:
|
|
|
|
// bar: baz
|
|
|
|
//
|
|
|
|
// the first line here creates the expect_shift, but the second line
|
|
|
|
// is a valid continuation of the map despite not being indented
|
2023-09-23 01:07:04 -07:00
|
|
|
if (!flop and (expect_shift == .indent and line.indent != .indent)) {
|
2023-09-17 19:28:07 -07:00
|
|
|
try putMap(
|
2023-09-14 23:38:24 -07:00
|
|
|
map,
|
2023-09-17 19:28:07 -07:00
|
|
|
dangling_key orelse return error.Fail,
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
Value.newScalar(arena_alloc),
|
2023-09-17 19:28:07 -07:00
|
|
|
self.dupe_behavior,
|
2023-09-14 23:38:24 -07:00
|
|
|
);
|
2023-09-17 19:28:07 -07:00
|
|
|
dangling_key = null;
|
2023-09-14 23:38:24 -07:00
|
|
|
}
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
if (!flop and line.indent == .dedent) {
|
2023-09-14 23:38:24 -07:00
|
|
|
var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
|
|
|
|
|
|
|
|
while (dedent_depth > 0) : (dedent_depth -= 1)
|
|
|
|
_ = stack.pop();
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
continue :flipflop;
|
|
|
|
}
|
2023-09-14 23:38:24 -07:00
|
|
|
|
|
|
|
switch (line.contents) {
|
|
|
|
.comment => unreachable,
|
|
|
|
.in_line => |in_line| {
|
|
|
|
// assert that this line has been indented. this is required for an inline value when
|
|
|
|
// the stack is in map mode.
|
2023-09-17 19:28:07 -07:00
|
|
|
if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
|
2023-09-14 23:38:24 -07:00
|
|
|
return error.UnexpectedValue;
|
|
|
|
|
|
|
|
expect_shift = .dedent;
|
|
|
|
|
|
|
|
switch (in_line) {
|
|
|
|
.empty => unreachable,
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.scalar => |str| try putMap(map, dangling_key.?, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
|
2023-09-23 13:29:49 -07:00
|
|
|
.flow_list => |str| try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
|
2023-09-14 23:38:24 -07:00
|
|
|
.flow_map => |str| {
|
2023-09-23 13:29:49 -07:00
|
|
|
try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior);
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
2023-09-17 23:09:26 -07:00
|
|
|
.line_string, .space_string => |str| {
|
2023-09-14 23:38:24 -07:00
|
|
|
// string pushes the stack
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
const new_string = try putMapGetValue(map, dangling_key.?, try Value.fromString(arena_alloc, str), self.dupe_behavior);
|
2023-09-17 23:09:26 -07:00
|
|
|
try new_string.string.append(in_line.lineEnding());
|
2023-09-14 23:38:24 -07:00
|
|
|
try stack.append(new_string);
|
|
|
|
expect_shift = .none;
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2023-09-17 19:28:07 -07:00
|
|
|
dangling_key = null;
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
2023-09-23 01:07:04 -07:00
|
|
|
.list_item => {
|
2023-09-14 23:38:24 -07:00
|
|
|
// this prong cannot be hit on dedent in a valid way.
|
|
|
|
//
|
|
|
|
// map:
|
|
|
|
// - value
|
|
|
|
// - invalid
|
|
|
|
//
|
|
|
|
// dedenting back to the map stack level requires map_item
|
|
|
|
|
2023-09-17 19:28:07 -07:00
|
|
|
if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
|
2023-09-14 23:38:24 -07:00
|
|
|
return error.UnexpectedValue;
|
|
|
|
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
|
2023-09-14 23:38:24 -07:00
|
|
|
try stack.append(new_list);
|
2023-09-17 19:28:07 -07:00
|
|
|
dangling_key = null;
|
2023-09-14 23:38:24 -07:00
|
|
|
expect_shift = .none;
|
2023-09-23 01:07:04 -07:00
|
|
|
continue :flipflop;
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
|
|
|
.map_item => |pair| {
|
2023-09-23 01:07:04 -07:00
|
|
|
if (flop or (line.indent == .none or line.indent == .dedent)) {
|
|
|
|
expect_shift = .none;
|
2023-09-23 14:17:31 -07:00
|
|
|
const dupekey = try arena_alloc.dupe(u8, pair.key);
|
2023-09-23 01:07:04 -07:00
|
|
|
switch (pair.val) {
|
2023-09-14 23:38:24 -07:00
|
|
|
.empty => {
|
|
|
|
expect_shift = .indent;
|
2023-09-23 14:17:31 -07:00
|
|
|
dangling_key = dupekey;
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
2023-09-23 14:17:31 -07:00
|
|
|
.scalar => |str| try putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
|
|
|
|
.line_string, .space_string => |str| try putMap(map, dupekey, try Value.fromString(arena_alloc, str), self.dupe_behavior),
|
|
|
|
.flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
|
|
|
|
.flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior),
|
2023-09-23 01:07:04 -07:00
|
|
|
}
|
|
|
|
} else if (line.indent == .indent) {
|
|
|
|
if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue;
|
|
|
|
|
|
|
|
const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
|
|
|
|
try stack.append(new_map);
|
|
|
|
dangling_key = null;
|
|
|
|
continue :flipflop;
|
|
|
|
} else unreachable;
|
2023-09-14 23:38:24 -07:00
|
|
|
},
|
|
|
|
}
|
2023-09-13 00:11:45 -07:00
|
|
|
},
|
|
|
|
},
|
|
|
|
.done => return error.ExtraContent,
|
|
|
|
}
|
|
|
|
|
2023-09-14 23:38:24 -07:00
|
|
|
// this is specifically performed at the end of the loop body so that
|
|
|
|
// `continue :flipflop` skips setting it.
|
|
|
|
flip = false;
|
|
|
|
}
|
2023-09-13 00:11:45 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
switch (state) {
|
|
|
|
.initial => switch (self.default_object) {
|
|
|
|
.string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.list => document.root = Value.newList(arena_alloc),
|
|
|
|
.map => document.root = Value.newMap(arena_alloc),
|
2023-09-13 00:11:45 -07:00
|
|
|
.fail => return error.EmptyDocument,
|
|
|
|
},
|
2023-09-14 23:38:24 -07:00
|
|
|
.value => switch (stack.getLast().*) {
|
|
|
|
// remove the final trailing newline or space
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.scalar, .string => |*string| _ = string.popOrNull(),
|
2023-09-14 23:38:24 -07:00
|
|
|
// if we have a dangling -, attach an empty string to it
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.list => |*list| if (expect_shift == .indent) try list.append(Value.newScalar(arena_alloc)),
|
2023-09-17 19:28:07 -07:00
|
|
|
// if we have a dangling "key:", attach an empty string to it
|
config: differentiate fields in Value
This makes handling Value very slightly more work, but it provides
useful metadata that can be used to perform better conversion and
serialization.
The motivation behind the "scalar" type is that in general, only
scalars can be coerced to other types. For example, a scalar `null`
and a string `> null` have the same in-memory representation. If they
are treated identically, this precludes unambiguously converting an
optional string whose contents are "null". With the two disambiguated,
we can choose to convert `null` to the null object and `> null` to a
string of contents "null". This ambiguity does not necessary exist for
the standard boolean values `true` and `false`, but it does allow the
conversion to be more strict, and it will theoretically result in
documents that read more naturally.
The motivation behind exposing flow_list and flow_map is that it will
allow preserving document formatting round trip (well, this isn't
strictly true: single line explicit strings neither remember whether
they were line strings or space strings, and they don't remember if
they were indented. However, that is much less information to lose).
The following formulations will parse to the same indistinguishable
value:
key: > value
key:
> value
key: | value
key:
| value
I think that's okay. It's a lot easier to chose a canonical form for
this case than it is for a map/list without any hints regarding its
origin.
2023-09-19 00:14:29 -07:00
|
|
|
.map => |*map| if (dangling_key) |dk| try putMap(map, dk, Value.newScalar(arena_alloc), self.dupe_behavior),
|
|
|
|
.flow_list, .flow_map => {},
|
2023-09-13 00:11:45 -07:00
|
|
|
},
|
|
|
|
.done => {},
|
|
|
|
}
|
|
|
|
|
|
|
|
return document;
|
|
|
|
}
|
|
|
|
|
2023-09-23 17:27:21 -07:00
|
|
|
// Stack of pointers to the flow containers that are currently open; the tip
// is the container new values get appended to. NOTE(review): the pointers
// alias the parents' backing storage — this appears safe because a parent is
// never appended to while one of its children is still open, but confirm if
// the state machine changes.
const FlowStack: type = std.ArrayList(*Value);
|
2023-09-17 19:47:18 -07:00
|
|
|
|
2023-09-23 17:27:21 -07:00
|
|
|
/// Return a pointer to the container currently on top of the flow stack.
/// Calling this with an empty stack is a parser-state error.
inline fn getStackTip(stack: FlowStack) Error!*Value {
    return stack.getLastOrNull() orelse return error.BadState;
}
|
|
|
|
|
2023-09-23 13:29:49 -07:00
|
|
|
/// Pop the just-finished container off the flow stack and report which parse
/// state the parser should resume in, based on the container that is now
/// exposed at the top. An empty stack after the pop means the root container
/// was closed, so parsing is `.done`.
inline fn popStack(stack: *FlowStack) Error!FlowParseState {
    // there must be something to pop, or the parser state is corrupt
    _ = stack.popOrNull() orelse return error.BadState;

    const exposed = stack.getLastOrNull() orelse return .done;

    switch (exposed.*) {
        .flow_list => return .want_list_separator,
        .flow_map => return .want_map_separator,
        // only flow containers may ever be on the stack
        else => return error.BadState,
    }
}
|
|
|
|
|
2023-09-23 13:29:49 -07:00
|
|
|
/// States for the iterative flow-style (inline `[...]` / `{...}`) parser in
/// `parseFlow`. The `want_*` states are scanning for the start of the next
/// token; the `consuming_*` states are accumulating the bytes of a scalar.
const FlowParseState = enum {
    // between '[' or ',' and the first character of a list item
    want_list_item,
    // inside the bytes of a list scalar
    consuming_list_item,
    // a nested container just closed; expecting ',' or ']'
    want_list_separator,
    // between '{' or ',' and the first character of a map key
    want_map_key,
    // inside the bytes of a map key (terminated by ':')
    consuming_map_key,
    // after ':'; expecting the map value
    want_map_value,
    // inside the bytes of a map value scalar
    consuming_map_value,
    // a nested container just closed; expecting ',' or '}'
    want_map_separator,
    // the root container has been closed; any further character is an error
    done,
};
|
|
|
|
|
|
|
|
/// Parse a flow-style (inline) value — `[a, b, ...]` or `{k: v, ...}` — from
/// `contents` without recursion, using an explicit stack of open containers.
///
/// `root_type` must be `.flow_list` or `.flow_map`; any other tag returns
/// `error.BadState`. `dupe_behavior` decides how duplicate map keys are
/// resolved. All allocations use `alloc`; returns the fully parsed root value.
/// `contents` must contain exactly one complete flow value (trailing
/// characters after the root closes, or running out of input mid-object,
/// are `error.BadState`).
pub fn parseFlow(
    alloc: std.mem.Allocator,
    contents: []const u8,
    root_type: Value.TagType,
    dupe_behavior: DuplicateKeyBehavior,
) Error!Value {
    var root: Value = switch (root_type) {
        .flow_list => Value.newFlowList(alloc),
        .flow_map => Value.newFlowMap(alloc),
        else => return error.BadState,
    };
    var state: FlowParseState = switch (root_type) {
        .flow_list => .want_list_item,
        .flow_map => .want_map_key,
        // any other tag already returned BadState above
        else => unreachable,
    };
    var stack = try FlowStack.initCapacity(alloc, 1);
    stack.appendAssumeCapacity(&root);
    // used to distinguish between [] and [ ], and it also tracks
    // a continuous value between different states
    var item_start: usize = 0;
    // key seen for the current map entry, held until its value is stored
    var dangling_key: ?[]const u8 = null;

    charloop: for (contents, 0..) |char, idx| {
        switch (state) {
            .want_list_item => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    // empty value
                    const tip = try getStackTip(stack);
                    try tip.flow_list.append(Value.newScalar(alloc));
                    item_start = idx + 1;
                },
                '{' => {
                    // open a nested map as the next list item
                    const tip = try getStackTip(stack);

                    const new_map = try Parser.appendListGetValue(
                        &tip.flow_list,
                        Value.newFlowMap(alloc),
                    );

                    item_start = idx;
                    try stack.append(new_map);
                    state = .want_map_key;
                },
                '[' => {
                    // open a nested list as the next list item
                    const tip = try getStackTip(stack);

                    const new_list = try Parser.appendListGetValue(
                        &tip.flow_list,
                        Value.newFlowList(alloc),
                    );

                    item_start = idx + 1;
                    try stack.append(new_list);
                    state = .want_list_item;
                },
                ']' => {
                    // closing here means a trailing (possibly empty) item;
                    // the length/offset check distinguishes [] from [ ]
                    const finished = stack.getLastOrNull() orelse return error.BadState;
                    if (finished.flow_list.items.len > 0 or idx > item_start)
                        try finished.flow_list.append(Value.newScalar(alloc));
                    state = try popStack(&stack);
                },
                else => {
                    // first byte of a scalar item
                    item_start = idx;
                    state = .consuming_list_item;
                },
            },
            .consuming_list_item => switch (char) {
                ',' => {
                    // scalar runs [item_start..idx)
                    const tip = try getStackTip(stack);

                    try tip.flow_list.append(
                        try Value.fromScalar(alloc, contents[item_start..idx]),
                    );
                    item_start = idx + 1;

                    state = .want_list_item;
                },
                ']' => {
                    // final scalar of this list, then close it
                    const finished = stack.getLastOrNull() orelse return error.BadState;
                    try finished.flow_list.append(
                        try Value.fromScalar(alloc, contents[item_start..idx]),
                    );
                    state = try popStack(&stack);
                },
                else => continue :charloop,
            },
            .want_list_separator => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    item_start = idx;
                    state = .want_list_item;
                },
                ']' => state = try popStack(&stack),
                else => return error.BadToken,
            },
            .want_map_key => switch (char) {
                ' ', '\t' => continue :charloop,
                // forbid these characters so that flow dictionary keys cannot start
                // with characters that regular dictionary keys cannot start with
                // (even though they're unambiguous in this specific context).
                '{', '[', '#', '-', '>', '|', ',' => return error.BadToken,
                ':' => {
                    // we have an empty map key
                    dangling_key = "";
                    state = .want_map_value;
                },
                '}' => state = try popStack(&stack),
                else => {
                    item_start = idx;
                    state = .consuming_map_key;
                },
            },
            .consuming_map_key => switch (char) {
                ':' => {
                    // keys are duped so they outlive the input buffer slice
                    dangling_key = try alloc.dupe(u8, contents[item_start..idx]);
                    state = .want_map_value;
                },
                else => continue :charloop,
            },
            .want_map_value => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    // empty value for the dangling key
                    const tip = try getStackTip(stack);
                    try Parser.putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newScalar(alloc),
                        dupe_behavior,
                    );

                    dangling_key = null;
                    state = .want_map_key;
                },
                '[' => {
                    // the value is a nested list
                    const tip = try getStackTip(stack);

                    const new_list = try Parser.putMapGetValue(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newFlowList(alloc),
                        dupe_behavior,
                    );

                    try stack.append(new_list);
                    dangling_key = null;
                    item_start = idx + 1;
                    state = .want_list_item;
                },
                '{' => {
                    // the value is a nested map
                    const tip = try getStackTip(stack);

                    const new_map = try Parser.putMapGetValue(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newFlowMap(alloc),
                        dupe_behavior,
                    );

                    try stack.append(new_map);
                    dangling_key = null;
                    state = .want_map_key;
                },
                '}' => {
                    // the value is an empty string and this map is closed
                    const tip = try getStackTip(stack);
                    try Parser.putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newScalar(alloc),
                        dupe_behavior,
                    );

                    dangling_key = null;
                    state = try popStack(&stack);
                },
                else => {
                    item_start = idx;
                    state = .consuming_map_value;
                },
            },
            .consuming_map_value => switch (char) {
                ',', '}' => |term| {
                    // store the scalar value; dangling_key is always set on
                    // entry to this state (see .want_map_value transitions)
                    const tip = try getStackTip(stack);
                    try Parser.putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        try Value.fromScalar(alloc, contents[item_start..idx]),
                        dupe_behavior,
                    );
                    dangling_key = null;
                    state = .want_map_key;
                    if (term == '}') state = try popStack(&stack);
                },
                else => continue :charloop,
            },
            .want_map_separator => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => state = .want_map_key,
                '}' => state = try popStack(&stack),
                else => return error.BadToken,
            },
            // the root value was closed but there are characters remaining
            // in the buffer
            .done => return error.BadState,
        }
    }
    // we ran out of characters while still in the middle of an object
    if (state != .done) return error.BadState;

    return root;
}
|
|
|
|
|
|
|
|
/// Append `value` to `list` and return a pointer to its slot inside the
/// list's backing storage. NOTE(review): the pointer is invalidated if the
/// list grows again — callers must not hold it across another append.
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
    const slot = try list.addOne();
    slot.* = value;
    return slot;
}
|
|
|
|
|
|
|
|
/// Insert `key` => `value` into `map`, resolving duplicate keys according to
/// `dupe_behavior`. Thin wrapper over `putMapGetValue` that discards the
/// returned value pointer.
inline fn putMap(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!void {
    _ = try putMapGetValue(map, key, value, dupe_behavior);
}
|
|
|
|
|
|
|
|
/// Insert `key` => `value` into `map` and return a pointer to the stored
/// value. When the key already exists, `dupe_behavior` decides whether to
/// error out, keep the original value, or overwrite it with the new one.
/// NOTE(review): the returned pointer is invalidated if the map is resized.
inline fn putMapGetValue(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!*Value {
    const entry = try map.getOrPut(key);

    if (!entry.found_existing) {
        entry.value_ptr.* = value;
    } else switch (dupe_behavior) {
        // reject the document outright
        .fail => return error.DuplicateKey,
        // keep the value that was stored first
        .use_first => {},
        // clobber the existing value
        .use_last => entry.value_ptr.* = value,
    }

    return entry.value_ptr;
}
|
|
|
|
|
|
|
|
/// Debug helper: tokenize `buf` line by line and dump each tokenized line to
/// stderr via `dumpLine`. Tokenizer errors propagate to the caller.
pub fn dumpBufLines(self: *Parser, buf: []const u8) Error!void {
    var tokenizer: LineTokenizer = .{
        .buffer = buf,
        .diagnostics = &self.diagnostics,
    };
    while (try tokenizer.next()) |tokenized_line|
        dumpLine(tokenized_line);
}
|
|
|
|
|
|
|
|
/// Debug-print one tokenized line to stderr in the form
/// `<indent><(dedent count)>: <content tag> => <payload>`.
/// Formatting is done into fixed stack buffers; lines whose key/value text
/// exceeds the buffer sizes will hit the `catch unreachable` — this is a
/// debug-only helper and accepts that limitation.
fn dumpLine(line: LineTokenizer.Line) void {
    // scratch buffers for the dedent count, key, and value renderings
    var dedbuf: [64]u8 = .{0} ** 64;
    var keybuf: [2048]u8 = .{0} ** 2048;
    var valbuf: [2048]u8 = .{0} ** 2048;

    // only dedents carry a count worth displaying
    const shiftstr = if (line.indent == .dedent)
        std.fmt.bufPrint(&dedbuf, " ({d})", .{line.indent.dedent}) catch unreachable
    else
        "";

    std.debug.print("{s}{s}: {s} => {s}\n", .{
        @tagName(line.indent), shiftstr, @tagName(line.contents), switch (line.contents) {
            .comment => |str| str,
            // in-line values and list items share the same scalar payload shape
            .in_line, .list_item => |scalar| switch (scalar) {
                .empty => "[empty]",
                .scalar,
                .string,
                .flow_list,
                .flow_map,
                => |str| std.fmt.bufPrint(&keybuf, "{s} => {s}", .{ @tagName(scalar), str }) catch unreachable,
            },
            // map items render as "key : <value tag> => <value text>"
            .map_item => |map| std.fmt.bufPrint(&keybuf, "{s} : {s}", .{
                map.key,
                switch (map.val) {
                    .empty => "[empty]",
                    .scalar,
                    .string,
                    .flow_list,
                    .flow_map,
                    => |str| std.fmt.bufPrint(&valbuf, "{s} => {s}", .{ @tagName(map.val), str }) catch unreachable,
                },
            }) catch unreachable,
        },
    });
}
|
|
|
|
};
|