nice-data/src/config.zig

1435 lines
62 KiB
Zig
Raw Normal View History

2023-09-13 00:11:45 -07:00
// Heavily inspired by, but not quite compatible with, NestedText. Key differences:
//
// - Doesn't support multiline keys (this means map keys cannot start with
// ' ', \t, #, {, [, |, or >, and they cannot contain :)
2023-09-13 00:11:45 -07:00
// - Allows using tabs for indentation (but not mixed tabs/spaces)
// - Indentation must be quantized consistently throughout the document. e.g.
// every nested layer being exactly 2 spaces past its parent. Tabs may
// only use one tab per indentation level.
// - Allows flow-style lists, maps, and strings on the same line as map keys or
// list items (i.e. the following are legal):
//
// key: {inline: map}
// key: [inline, list]
// key: > inline string
// - {map: item}
// - [list, item]
// - > inline string
//
// The string case retains the possibility of having an inline map value starting
// with {, [, or >
// - inline lists and maps cannot contain other inline structures. This may
// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful
2023-09-13 00:11:45 -07:00
// - map keys and list item dashes must be followed by a value or an indented
// section to reduce parser quantum state. This means that
//
// foo:
// bar: baz
//
// or
2023-09-13 00:11:45 -07:00
//
// -
// - qux
//
// are not valid. This can be represented with an inline empty string after foo:
//
// foo: >
// bar: baz
//
// or
//
// - >
// - qux
//
// - newlines are strictly LF, if the parser finds CR, it is an error
// - blank lines may not contain any whitespace characters except the single LF
// - Additional string indicator `|` for soft-wrapped strings, i.e.
//
// key: | this is not special
// key:
// | these lines are
// | soft-wrapped
//
2023-09-13 00:11:45 -07:00
// soft-wrapped lines are joined with a ' ' instead of a newline character.
// Like multiline strings, the final space is stripped (I guess this is a very
// janky way to add trailing whitespace to a string).
//
2023-09-17 23:09:26 -07:00
// - terminated strings to allow trailing whitespace:
// | this string has trailing whitespace |
// > and so does this one |
2023-09-13 00:11:45 -07:00
// - The parser is both strict and probably sloppy and may have weird edge
// cases since I'm slinging code, not writing a spec. For example, tabs are
// not trimmed from the values of inline lists/maps
2023-09-13 00:11:45 -07:00
const std = @import("std");
/// A region of a buffer described by a starting offset and a length rather
/// than by a pointer.
pub const IndexSlice = struct { start: usize, len: usize };

/// Record describing where a problem was found and what it was.
///
/// NOTE(review): nothing in the visible part of this file fills these fields
/// in, so the precise meaning of each offset should be confirmed against the
/// code that eventually reports errors.
pub const Diagnostics = struct {
    row: usize,
    span: struct { absolute: usize, line_offset: usize, length: usize },
    message: []const u8,
};

/// Growable byte buffer that accepts input in arbitrary chunks through `feed`
/// and hands it back one LF-terminated line at a time through `nextLine`.
///
/// `buffer[0..used]` holds every byte currently stored. `window` is the part
/// of that region which `nextLine` has not consumed yet, so between calls
/// `window.start + window.len == used`.
///
/// The buffer is owned by this struct; release it with `deinit`.
pub const LineBuffer = struct {
    allocator: std.mem.Allocator,
    buffer: []u8,
    used: usize,
    window: IndexSlice,

    pub const default_capacity: usize = 4096;
    pub const Error = std.mem.Allocator.Error;

    /// Create a buffer with `default_capacity` bytes of storage.
    pub fn init(allocator: std.mem.Allocator) Error!LineBuffer {
        return initCapacity(allocator, default_capacity);
    }

    /// Create a buffer with `capacity` bytes of storage.
    pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer {
        return .{
            .allocator = allocator,
            .buffer = try allocator.alloc(u8, capacity),
            .used = 0,
            .window = .{ .start = 0, .len = 0 },
        };
    }

    /// Free the backing storage. The buffer must not be used afterwards.
    pub fn deinit(self: *LineBuffer) void {
        self.allocator.free(self.buffer);
        self.* = undefined;
    }

    /// Append `data` to the unconsumed input, growing or compacting the
    /// storage as needed. Any slice previously returned by `nextLine` is
    /// invalidated.
    pub fn feed(self: *LineBuffer, data: []const u8) Error!void {
        if (data.len == 0) return;
        // TODO: check for usize overflow here if we want Maximum Robustness
        const new_window_len = self.window.len + data.len;

        if (new_window_len > self.buffer.len) {
            // The unconsumed bytes plus `data` cannot fit at all: grow.
            // TODO: adopt an overallocation strategy? Will potentially avoid
            // allocating on every invocation but will cause the buffer to
            // oversize
            self.buffer = try self.allocator.realloc(self.buffer, new_window_len);
            self.rehome();
        } else if (self.window.start + new_window_len > self.buffer.len) {
            // Everything fits, but only once the consumed prefix is dropped.
            self.rehome();
        }

        // In every case there is now room for `data` directly after the
        // stored bytes.
        @memcpy(self.buffer[self.used..][0..data.len], data);
        self.used += data.len;
        self.window.len = new_window_len;
    }

    /// The memory returned by this function is valid until the next call to `feed`.
    /// The resulting slice does not include the newline character.
    ///
    /// Returns null when the unconsumed input contains no newline; a trailing
    /// partial line stays buffered until more data is fed.
    pub fn nextLine(self: *LineBuffer) ?[]const u8 {
        if (self.window.start >= self.buffer.len or self.window.len == 0)
            return null;

        const window = self.buffer[self.window.start..][0..self.window.len];
        const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null;

        self.window.start += split + 1;
        self.window.len -= split + 1;

        return window[0..split];
    }

    /// Move the unconsumed window to the front of the buffer so that the
    /// space taken by already-consumed lines can be reused.
    fn rehome(self: *LineBuffer) void {
        if (self.window.start == 0) return;

        const window = self.buffer[self.window.start..][0..self.window.len];

        // Source and destination overlap when the window is longer than the
        // gap in front of it; @memcpy does not permit that.
        if (self.window.len > self.window.start)
            std.mem.copyForwards(u8, self.buffer, window)
        else
            @memcpy(self.buffer[0..window.len], window);

        self.window.start = 0;
        self.used = window.len;
    }
};
/// Line source over a byte slice that is already fully in memory.
///
/// Nothing is copied: lines handed out by `nextLine` point into the slice
/// given to `init`.
pub const FixedLineBuffer = struct {
    buffer: []const u8,
    window: IndexSlice,

    /// Wrap `data`; the whole slice starts out unconsumed.
    pub fn init(data: []const u8) FixedLineBuffer {
        return FixedLineBuffer{
            .buffer = data,
            .window = IndexSlice{ .start = 0, .len = data.len },
        };
    }

    /// Return the next line without its terminating '\n', or null when the
    /// unconsumed remainder contains no newline (this includes a final line
    /// that is not newline-terminated).
    pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 {
        if (self.window.len == 0 or self.window.start >= self.buffer.len)
            return null;

        const remaining = self.buffer[self.window.start..][0..self.window.len];
        if (std.mem.indexOfScalar(u8, remaining, '\n')) |newline| {
            const consumed = newline + 1;
            self.window.start += consumed;
            self.window.len -= consumed;
            return remaining[0..newline];
        }
        return null;
    }
};
/// Indentation style of a document, as detected by the tokenizer.
const IndentationType = union(enum) {
    /// No indented line has been seen yet.
    immaterial,
    /// Space indentation; the payload is the number of spaces per level. The
    /// tokenizer stores 0 until it has measured the first indented line.
    spaces: usize,
    /// Tab indentation.
    tabs,
};
/// The value part of a line, classified by its leading character. Every
/// payload is a slice of the line it was found on.
const InlineItem = union(enum) {
    empty,
    scalar: []const u8,
    line_string: []const u8,
    space_string: []const u8,
    flow_list: []const u8,
    flow_map: []const u8,

    /// Character associated with a string item: '\n' for `line_string` and
    /// ' ' for `space_string`. Calling this on any other variant is illegal
    /// (`unreachable`).
    fn lineEnding(self: InlineItem) u8 {
        if (self == .line_string) return '\n';
        if (self == .space_string) return ' ';
        unreachable;
    }
};
/// What a single non-blank line holds once its indentation has been removed.
const LineContents = union(enum) {
    /// Text following the leading '#'.
    comment: []const u8,
    /// A value standing on its own line.
    in_line: InlineItem,
    /// A '-' entry together with the value that follows it, if any.
    list_item: InlineItem,
    /// A `key:` entry together with the value that follows it, if any.
    map_item: struct {
        key: []const u8,
        val: InlineItem,
    },
};
// we can dedent multiple levels at once. Example:
//
// foo:
// bar:
// > a
// > string
// baz: [qux]
//
// capturing this is conceptually simple, but implementing it without complex
// indentation tracking requires quantizing the indentation. This means our
// IndentationType will also need to track the number of spaces used for
// indentation, as detected. Then every line we have to check indent rem the
// quantization level == 0 (otherwise we broke quantization) and compute indent
// div the quantization level to give us our effective indentation level.
/// Direction in which a line's indentation moved relative to the line
/// before it.
const ShiftDirection = enum {
    indent,
    dedent,
    none,
};
/// Indentation change of a line relative to the previous one.
const RelativeIndent = union(ShiftDirection) {
    /// One level deeper (the tokenizer rejects deeper jumps).
    indent,
    /// Shallower by the given number of levels.
    dedent: usize,
    /// Same level.
    none,
};
/// One tokenized line as produced by `LineTokenizer.next`.
const Line = struct {
    /// Indentation change relative to the previously returned line.
    indent: RelativeIndent,
    /// Classified contents of the line.
    contents: LineContents,
    /// The line's text with its leading indentation removed.
    raw: []const u8,
};
/// Returns a tokenizer type that pulls lines from a `Buffer` and classifies
/// each one as a comment, a stand-alone value, a list item or a map item,
/// together with its indentation change relative to the previous line.
///
/// `Buffer` must have a `nextLine` method yielding `?[]const u8`, as
/// `LineBuffer` and `FixedLineBuffer` do.
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        /// Indentation style detected so far.
        indentation: IndentationType = .immaterial,
        /// Indentation level of the previously returned line.
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        /// Incremented once for every non-blank line that gets past the
        /// indentation checks.
        row: usize = 0,

        const Error = error{
            BadToken,
            MixedIndentation,
            UnquantizedIndentation,
            TooMuchIndentation,
            MissingNewline,
            TrailingWhitespace,
            Impossible,
        };

        /// Return the next non-blank line, or null once the buffer has no
        /// complete line left.
        ///
        /// Errors:
        /// - `MixedIndentation`: tabs and spaces are both used to indent.
        /// - `UnquantizedIndentation`: space indentation is not a multiple of
        ///   the first indentation width seen.
        /// - `TooMuchIndentation`: the line is more than one level deeper than
        ///   the previous one.
        /// - `TrailingWhitespace`: a whitespace-only line, or a value ending
        ///   in a space or tab.
        /// - `BadToken`: a carriage return anywhere in the line, or a
        ///   malformed list/map/string/flow introducer.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        // There's a weird coupling here because we can't set
                        // the space quantum until all spaces have been
                        // consumed. Spaces on comment lines are counted too:
                        // ignoring them would let comments break the indent
                        // quantum, which makes for ugly documents.
                        ' ' => switch (self.indentation) {
                            .immaterial => self.indentation = .{ .spaces = 0 },
                            .spaces => {},
                            .tabs => return error.MixedIndentation,
                        },
                        '\t' => switch (self.indentation) {
                            .immaterial => self.indentation = .tabs,
                            .spaces => return error.MixedIndentation,
                            .tabs => {},
                        },
                        '\r' => return error.BadToken,
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // The loop ran off the end: the line is empty or holds
                    // nothing but indentation characters.
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // Newlines are strictly LF. A CR inside the indentation was
                // rejected above; reject it in the rest of the line as well so
                // CRLF input cannot leak a '\r' into keys or values.
                if (std.mem.indexOfScalar(u8, raw_line[indent..], '\r') != null)
                    return error.BadToken;

                var quantized: usize = if (self.indentation == .spaces) quant: {
                    // The first indented line defines the quantum.
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;
                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                const relative: RelativeIndent = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // Runs on every exit below, error or not, and reads
                // `quantized` at that time (the comment branch changes it).
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return Line{
                            .indent = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        return Line{
                            .indent = relative,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        const item: InlineItem = if (line.len == 1)
                            .empty
                        else
                            try detectInlineItem(line[2..]);

                        return Line{
                            .indent = relative,
                            .contents = .{ .list_item = item },
                            .raw = line,
                        };
                    },
                    else => {
                        // The first ':' splits a map key from its value.
                        if (std.mem.indexOfScalar(u8, line, ':')) |colon| {
                            const key = line[0..colon];

                            if (colon + 1 == line.len) return Line{
                                .indent = relative,
                                .contents = .{ .map_item = .{ .key = key, .val = .empty } },
                                .raw = line,
                            };

                            if (line[colon + 1] != ' ') return error.BadToken;

                            return Line{
                                .indent = relative,
                                .contents = .{ .map_item = .{
                                    .key = key,
                                    .val = try detectInlineItem(line[colon + 2 ..]),
                                } },
                                .raw = line,
                            };
                        }

                        // No ':' at all: the whole line is a scalar.
                        return Line{
                            .indent = relative,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line by its first character.
        ///
        /// `>` and `|` introduce strings and must be followed by a space or
        /// the end of the line; a final `|` acts as a terminator and is
        /// removed. `[` and `{` introduce flow lists/maps and must be closed
        /// on the same line. Anything else is a scalar.
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    // Skip the indicator and the space after it, when present.
                    const start: usize = @min(2, buf.len);
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        // A lone indicator has no separate terminator to drop.
                        '|' => buf[start .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[start..],
                    };

                    return if (char == '>')
                        InlineItem{ .line_string = slice }
                    else
                        InlineItem{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return InlineItem{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return InlineItem{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return InlineItem{ .scalar = buf };
                },
            }
        }
    };
}
2023-09-13 00:11:45 -07:00
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
pub const Value = union(enum) {
pub const String = std.ArrayList(u8);
pub const Map = std.StringHashMap(Value);
pub const List = std.ArrayList(Value);
pub const TagType = @typeInfo(Value).Union.tag_type.?;
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
scalar: String,
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
string: String,
list: List,
flow_list: List,
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
map: Map,
flow_map: Map,
/// Create a `.scalar` value holding a copy of `input`, allocated with `alloc`.
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
    return _fromScalarOrString(alloc, .scalar, input);
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Create a `.string` value holding a copy of `input`, allocated with `alloc`.
pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
    return _fromScalarOrString(alloc, .string, input);
}
/// Shared implementation of `fromScalar` and `fromString`: copy `input` into
/// a freshly allocated `String` and wrap it in the variant named by
/// `classification`.
inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
    // Capacity is reserved up front, so the append itself cannot fail.
    var storage = try String.initCapacity(alloc, input.len);
    storage.appendSliceAssumeCapacity(input);
    return @unionInit(Value, @tagName(classification), storage);
}
/// Create an empty `.scalar` value whose storage will use `alloc`.
pub inline fn newScalar(alloc: std.mem.Allocator) Value {
    return Value{ .scalar = String.init(alloc) };
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Create an empty `.string` value whose storage will use `alloc`.
pub inline fn newString(alloc: std.mem.Allocator) Value {
    return Value{ .string = String.init(alloc) };
}
/// Create an empty `.list` value whose storage will use `alloc`.
pub inline fn newList(alloc: std.mem.Allocator) Value {
    return Value{ .list = List.init(alloc) };
}
/// Create an empty `.flow_list` value whose storage will use `alloc`.
pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
    return Value{ .flow_list = List.init(alloc) };
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Create an empty `.map` value whose storage will use `alloc`.
pub inline fn newMap(alloc: std.mem.Allocator) Value {
    return Value{ .map = Map.init(alloc) };
}
/// Create an empty `.flow_map` value whose storage will use `alloc`.
pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
    return Value{ .flow_map = Map.init(alloc) };
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Print this value through `std.debug.print`, starting at indentation 0 and
/// finishing with a newline.
pub fn printDebug(self: Value) void {
    Value.printRecursive(self, 0);
    std.debug.print("\n", .{});
}
/// Recursive worker behind `printDebug`.
///
/// `indent` is the number of columns of padding used for continuation
/// lines and closing brackets of this value. Nothing is printed after the
/// value itself; the caller is responsible for separators and newlines.
///
/// - scalars and strings without a newline are printed inline
/// - strings containing a newline start on a fresh line, with every line
///   padded to `indent`
/// - lists print as `[idx] = value,` entries between `[` and `]`
/// - maps print as `key: value,` entries between `{` and `}`
fn printRecursive(self: Value, indent: usize) void {
    switch (self) {
        .scalar, .string => |str| {
            if (std.mem.indexOfScalar(u8, str.items, '\n') == null) {
                // single-line text can stay on the current line
                std.debug.print("{s}", .{str.items});
                return;
            }

            // multi-line text: break the current line, then emit each line
            // at the requested indentation. The final line gets no trailing
            // newline so that the caller can append its own separator.
            std.debug.print("\n", .{});
            var lines = std.mem.splitScalar(u8, str.items, '\n');
            while (lines.next()) |line| {
                std.debug.print(
                    "{[empty]s: >[indent]}{[line]s}{[nl]s}",
                    .{
                        .empty = "",
                        .indent = indent,
                        .line = line,
                        .nl = if (lines.peek() == null) "" else "\n",
                    },
                );
            }
        },
        .list, .flow_list => |list| {
            if (list.items.len == 0) {
                std.debug.print("[]", .{});
                return;
            }

            std.debug.print("[\n", .{});
            for (list.items, 0..) |value, idx| {
                std.debug.print(
                    "{[empty]s: >[indent]}[{[idx]d}] = ",
                    .{ .empty = "", .indent = indent, .idx = idx },
                );
                value.printRecursive(indent + 2);
                std.debug.print(",\n", .{});
            }
            std.debug.print(
                "{[empty]s: >[indent]}]",
                .{ .empty = "", .indent = indent },
            );
        },
        .map, .flow_map => |map| {
            if (map.count() == 0) {
                std.debug.print("{{}}", .{});
                return;
            }

            std.debug.print("{{\n", .{});
            var iter = map.iterator();
            while (iter.next()) |entry| {
                // keys sit one level (2 columns) inside the braces
                std.debug.print(
                    "{[empty]s: >[indent]}{[key]s}: ",
                    .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
                );
                entry.value_ptr.printRecursive(indent + 4);
                std.debug.print(",\n", .{});
            }
            std.debug.print(
                "{[empty]s: >[indent]}}}",
                .{ .empty = "", .indent = indent },
            );
        },
    }
}
};
2023-09-13 00:11:45 -07:00
pub const Parser = struct {
/// Backing allocator. `parseBuffer` wraps it in the arena owned by the
/// returned `Document`.
allocator: std.mem.Allocator,
/// Policy for a map key that appears more than once. Forwarded to the map
/// insertion helpers and to the flow parser.
dupe_behavior: DuplicateKeyBehavior = .fail,
/// What `parseBuffer` produces when the input contains no value at all.
default_object: DefaultObject = .fail,
/// Diagnostic record handed to the line tokenizer by pointer; starts out
/// with a placeholder message.
diagnostics: Diagnostics = .{
.row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well",
},
/// Every error `Parser` functions can return: the parser's own structural
/// errors merged with those of the line tokenizer, the flow parser and the
/// allocator.
pub const Error = error{
    /// a line is indented where no nested value can start
    UnexpectedIndent,
    /// a line's contents are not valid at the current position in the document
    UnexpectedValue,
    /// more content follows a document that was already complete
    ExtraContent,
    /// the input holds no value and `default_object` is `.fail`
    EmptyDocument,
    // NOTE(review): DuplicateKey and BadMapEntry are not returned directly by
    // parseBuffer; presumably they come from the map insertion helpers and
    // the flow parser. Confirm against those implementations.
    DuplicateKey,
    BadMapEntry,
    /// internal invariant violation (e.g. a missing dangling key)
    Fail,
} || LineTokenizer(FixedLineBuffer).Error || FlowParser.Error || std.mem.Allocator.Error;
2023-09-13 00:11:45 -07:00
/// Policy applied when a map key is encountered more than once. The policy
/// itself is implemented by the map insertion helpers and the flow parser.
pub const DuplicateKeyBehavior = enum {
// presumably: keep the value of the first occurrence (verify in putMap)
use_first,
// presumably: keep the value of the last occurrence (verify in putMap)
use_last,
// presumably: reject the document with an error (verify in putMap)
fail,
};
/// Selects the root value `parseBuffer` produces for an input that contains
/// no value at all.
pub const DefaultObject = enum {
// the root becomes an empty string
string,
// the root becomes an empty list
list,
// the root becomes an empty map
map,
// parsing fails with error.EmptyDocument
fail,
};
/// Top-level mode of the line-oriented state machine in `parseBuffer`.
pub const ParseState = enum {
// no value line has been consumed yet
initial,
// the root is a container or multi-line string that further lines extend
value,
// the root was completed by a single line; any further content is an error
done,
};
/// A parsed document: the root `Value` together with the arena that
/// `parseBuffer` allocates the value tree from.
pub const Document = struct {
    /// Arena backing the document; released as a whole by `deinit`.
    arena: std.heap.ArenaAllocator,
    /// Root of the parsed value tree. Left `undefined` by `init` and must
    /// be assigned before it is read.
    root: Value,

    /// Create a document whose arena is backed by `alloc`. `root` is NOT
    /// initialized.
    pub fn init(alloc: std.mem.Allocator) Document {
        return .{
            .arena = std.heap.ArenaAllocator.init(alloc),
            .root = undefined,
        };
    }

    /// Dump the root value through `std.debug.print`.
    pub fn printDebug(self: Document) void {
        self.root.printDebug();
    }

    /// Release the arena. `root` and everything allocated from the arena
    /// is invalid afterwards.
    pub fn deinit(self: Document) void {
        self.arena.deinit();
    }
};
/// Bundle of everything the line-oriented parser needs to carry from one
/// line to the next: the document under construction, the stack of values
/// currently open, and the bookkeeping for pending indentation and keys.
pub const State = struct {
    pub const Stack = std.ArrayList(*Value);

    document: Document,
    value_stack: Stack,
    state: ParseState = .initial,
    expect_shift: ShiftDirection = .none,
    dangling_key: ?[]const u8 = null,

    /// Create a fresh state. Both the document arena and the value stack
    /// are backed by `alloc`.
    pub fn init(alloc: std.mem.Allocator) State {
        const document = Document.init(alloc);
        const value_stack = Stack.init(alloc);
        return .{ .document = document, .value_stack = value_stack };
    }

    /// Release the value stack. The document is left untouched.
    pub fn deinit(self: State) void {
        self.value_stack.deinit();
    }
};
/// Parse the complete document held in `buffer` and return it as a
/// `Document`.
///
/// All values are allocated from the returned document's arena, which is
/// backed by `self.allocator`; the caller releases everything with
/// `Document.deinit`. On error the arena is released before returning.
/// Map keys are stored as the slices produced by the tokenizer.
/// NOTE(review): those slices presumably alias `buffer`, in which case
/// `buffer` must outlive the document - confirm against the tokenizer.
///
/// The parser is a line-driven state machine. `stack` holds the containers
/// (and multi-line strings) that are currently open, innermost last.
/// `expect_shift` records the indentation change the previous line obliges
/// the next one to make, and `dangling_key` holds a map key that appeared
/// on its own line and is still waiting for its value.
///
/// Errors (besides tokenizer, flow parser and allocation errors):
/// - `UnexpectedIndent`: a line is indented where nothing can be nested
/// - `UnexpectedValue`: a line's contents do not fit the open container
/// - `ExtraContent`: content follows a single-line root value
/// - `EmptyDocument`: no value found and `default_object` is `.fail`
/// - `Fail`: an empty map value had to be attached but no key was pending
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
    var document = Document.init(self.allocator);
    errdefer document.deinit();
    const arena_alloc = document.arena.allocator();

    var state: ParseState = .initial;
    var expect_shift: ShiftDirection = .none;
    var dangling_key: ?[]const u8 = null;
    var stack = std.ArrayList(*Value).init(arena_alloc);
    defer stack.deinit();

    var tok: LineTokenizer(FixedLineBuffer) = .{
        .buffer = FixedLineBuffer.init(buffer),
        .diagnostics = &self.diagnostics,
    };

    while (try tok.next()) |line| {
        if (line.contents == .comment) continue;

        var flip = true;
        var flop = false;
        // this is needed to give us a second go round when the line is dedented:
        // the first pass pops the stack, the second pass (flop == true) consumes
        // the line against the container that is now on top.
        flipflop: while (flip) : (flop = true) {
            switch (state) {
                .initial => {
                    if (line.indent == .indent) return error.UnexpectedIndent;

                    switch (line.contents) {
                        // we filter out comments above
                        .comment => unreachable,
                        .in_line => |in_line| switch (in_line) {
                            // empty scalars are only emitted for a list_item or a map_item
                            .empty => unreachable,
                            .scalar => |str| {
                                // A bare scalar is the whole document. It is never
                                // pushed on the stack and the `.done` state is not
                                // trimmed by the finalizer below, so it must be
                                // stored exactly as tokenized, without any padding.
                                document.root = try Value.fromScalar(arena_alloc, str);
                                state = .done;
                            },
                            .line_string, .space_string => |str| {
                                // a multi-line string stays open: following lines
                                // may extend it, so it goes on the stack
                                document.root = try Value.fromString(arena_alloc, str);
                                try document.root.string.append(in_line.lineEnding());
                                try stack.append(&document.root);
                                state = .value;
                            },
                            .flow_list => |str| {
                                document.root = try parseFlowList(arena_alloc, str, self.dupe_behavior);
                                state = .done;
                            },
                            .flow_map => |str| {
                                document.root = try parseFlowMap(arena_alloc, str, self.dupe_behavior);
                                state = .done;
                            },
                        },
                        .list_item => |value| {
                            document.root = Value.newList(arena_alloc);
                            try stack.append(&document.root);
                            state = .value;

                            switch (value) {
                                .empty => expect_shift = .indent,
                                .scalar => |str| try document.root.list.append(try Value.fromScalar(arena_alloc, str)),
                                .line_string, .space_string => |str| try document.root.list.append(try Value.fromString(arena_alloc, str)),
                                .flow_list => |str| try document.root.list.append(try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                .flow_map => |str| try document.root.list.append(try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                            }
                        },
                        .map_item => |pair| {
                            document.root = Value.newMap(arena_alloc);
                            try stack.append(&document.root);
                            state = .value;

                            // we can do direct puts here because this is the very
                            // first line of the document: no key can be a duplicate
                            switch (pair.val) {
                                .empty => {
                                    expect_shift = .indent;
                                    // If the key is on its own line, we don't have
                                    // an associated value until we parse the next
                                    // line. We need to store a reference to this
                                    // key somewhere until we can consume the
                                    // value. More parser state to lug along.
                                    dangling_key = pair.key;
                                },
                                .scalar => |str| try document.root.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
                                .line_string, .space_string => |str| try document.root.map.put(pair.key, try Value.fromString(arena_alloc, str)),
                                .flow_list => |str| try document.root.map.put(pair.key, try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                .flow_map => |str| try document.root.map.put(pair.key, try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                            }
                        },
                    }
                },
                .value => switch (stack.getLast().*) {
                    // these three states are never reachable here. flow_list and
                    // flow_map are parsed with a separate state machine. These
                    // value types can only be present by themselves as the first
                    // line of the document, in which case the document consists
                    // only of that single line: this parser jumps immediately into
                    // the .done state, bypassing the .value state in which this
                    // switch is embedded.
                    .scalar, .flow_list, .flow_map => unreachable,
                    .string => |*string| {
                        if (line.indent == .indent)
                            return error.UnexpectedIndent;

                        if (!flop and line.indent == .dedent) {
                            // kick off the last trailing space or newline
                            _ = string.pop();

                            var dedent_depth = line.indent.dedent;
                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| switch (in_line) {
                                .empty => unreachable,
                                .line_string, .space_string => |str| {
                                    try string.appendSlice(str);
                                    try string.append(in_line.lineEnding());
                                },
                                else => return error.UnexpectedValue,
                            },
                            else => return error.UnexpectedValue,
                        }
                    },
                    .list => |*list| {
                        // detect that the previous item was actually empty
                        //
                        //    -
                        //    - something
                        //
                        // the first line here creates the expect_shift, but the second line
                        // is a valid continuation of the list despite not being indented
                        if (expect_shift == .indent and line.indent != .indent) {
                            try list.append(Value.newScalar(arena_alloc));
                            // The expectation has been satisfied. Clearing it keeps the
                            // second flipflop pass (taken after a dedent) from attaching
                            // another empty value to the parent container.
                            expect_shift = .none;
                        }

                        // Consider:
                        //
                        //    -
                        //      own-line scalar
                        //    - inline scalar
                        //
                        // the own-line scalar will not push the stack but the next list item will be a dedent
                        if (!flop and line.indent == .dedent) {
                            // if line.indent.dedent is 1 and we're expecting it, the stack will not be popped,
                            // but we will continue loop flipflop. However, flop will be set to true on the next
                            // trip, so this if prong will not be run again.
                            var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| {
                                // assert that this line has been indented. this is required for an inline value when
                                // the stack is in list mode.
                                if (expect_shift != .indent or line.indent != .indent)
                                    return error.UnexpectedValue;

                                expect_shift = .dedent;
                                switch (in_line) {
                                    .empty => unreachable,
                                    .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
                                    .flow_list => |str| try list.append(try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                    .flow_map => |str| try list.append(try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                    .line_string, .space_string => |str| {
                                        // string pushes the stack
                                        const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
                                        try new_string.string.append(in_line.lineEnding());
                                        try stack.append(new_string);
                                        expect_shift = .none;
                                    },
                                }
                            },
                            .list_item => |value| {
                                switch (line.indent) {
                                    // for dedent, the stack has already been popped, so this should be fine
                                    .none, .dedent => {
                                        expect_shift = .none;
                                        switch (value) {
                                            .empty => expect_shift = .indent,
                                            .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
                                            .line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
                                            .flow_list => |str| try list.append(try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                            .flow_map => |str| try list.append(try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                        }
                                    },
                                    // a new list is being created
                                    .indent => {
                                        if (expect_shift != .indent)
                                            return error.UnexpectedIndent;

                                        const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
                                        try stack.append(new_list);

                                        expect_shift = .none;
                                        switch (value) {
                                            .empty => expect_shift = .indent,
                                            .scalar => |str| try new_list.list.append(try Value.fromScalar(arena_alloc, str)),
                                            .line_string, .space_string => |str| try new_list.list.append(try Value.fromString(arena_alloc, str)),
                                            .flow_list => |str| try new_list.list.append(try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                            .flow_map => |str| try new_list.list.append(try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                        }
                                    },
                                }
                            },
                            .map_item => |pair| {
                                // this prong cannot be hit on dedent in a valid way.
                                //
                                //    -
                                //      map: value
                                //      second: value
                                //    third: value
                                //
                                // dedenting back to the list stack level requires list_item
                                if (line.indent != .indent)
                                    return error.UnexpectedValue;

                                const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
                                try stack.append(new_map);

                                expect_shift = .none;
                                // direct puts are fine: the map was created on this line
                                switch (pair.val) {
                                    .empty => {
                                        dangling_key = pair.key;
                                        expect_shift = .indent;
                                    },
                                    .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
                                    .line_string, .space_string => |str| try new_map.map.put(pair.key, try Value.fromString(arena_alloc, str)),
                                    .flow_list => |str| try new_map.map.put(pair.key, try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                    .flow_map => |str| try new_map.map.put(pair.key, try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                }
                            },
                        }
                    },
                    .map => |*map| {
                        // detect that the previous item was actually empty
                        //
                        //    foo:
                        //    bar: baz
                        //
                        // the first line here creates the expect_shift, but the second line
                        // is a valid continuation of the map despite not being indented
                        if (expect_shift == .indent and line.indent != .indent) {
                            try putMap(
                                map,
                                dangling_key orelse return error.Fail,
                                Value.newScalar(arena_alloc),
                                self.dupe_behavior,
                            );
                            dangling_key = null;
                            // The expectation has been satisfied. Without clearing it, the
                            // second flipflop pass (taken after a dedent) would run this
                            // block again against the parent, with no key left to consume.
                            expect_shift = .none;
                        }

                        if (!flop and line.indent == .dedent) {
                            // an expected dedent (after an own-line value) accounts for one level
                            var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent);
                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| {
                                // assert that this line has been indented. this is required for an inline value when
                                // the stack is in map mode.
                                if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
                                    return error.UnexpectedValue;

                                expect_shift = .dedent;
                                switch (in_line) {
                                    .empty => unreachable,
                                    .scalar => |str| try putMap(map, dangling_key.?, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
                                    .flow_list => |str| try putMap(map, dangling_key.?, try parseFlowList(arena_alloc, str, self.dupe_behavior), self.dupe_behavior),
                                    .flow_map => |str| try putMap(map, dangling_key.?, try parseFlowMap(arena_alloc, str, self.dupe_behavior), self.dupe_behavior),
                                    .line_string, .space_string => |str| {
                                        // string pushes the stack
                                        const new_string = try putMapGetValue(map, dangling_key.?, try Value.fromString(arena_alloc, str), self.dupe_behavior);
                                        try new_string.string.append(in_line.lineEnding());
                                        try stack.append(new_string);
                                        expect_shift = .none;
                                    },
                                }

                                dangling_key = null;
                            },
                            .list_item => |value| {
                                // this prong cannot be hit on dedent in a valid way.
                                //
                                //    map:
                                //      - value
                                //    - invalid
                                //
                                // dedenting back to the map stack level requires map_item
                                if (expect_shift != .indent or line.indent != .indent or dangling_key == null)
                                    return error.UnexpectedValue;

                                const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
                                try stack.append(new_list);
                                dangling_key = null;

                                expect_shift = .none;
                                switch (value) {
                                    .empty => expect_shift = .indent,
                                    .scalar => |str| try new_list.list.append(try Value.fromScalar(arena_alloc, str)),
                                    .line_string, .space_string => |str| try new_list.list.append(try Value.fromString(arena_alloc, str)),
                                    .flow_list => |str| try new_list.list.append(try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                    .flow_map => |str| try new_list.list.append(try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                }
                            },
                            .map_item => |pair| {
                                switch (line.indent) {
                                    // for dedent, the stack has already been popped, so this should be fine
                                    .none, .dedent => {
                                        expect_shift = .none;
                                        switch (pair.val) {
                                            .empty => {
                                                expect_shift = .indent;
                                                dangling_key = pair.key;
                                            },
                                            .scalar => |str| try putMap(map, pair.key, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
                                            .line_string, .space_string => |str| try putMap(map, pair.key, try Value.fromString(arena_alloc, str), self.dupe_behavior),
                                            .flow_list => |str| try putMap(map, pair.key, try parseFlowList(arena_alloc, str, self.dupe_behavior), self.dupe_behavior),
                                            .flow_map => |str| try putMap(map, pair.key, try parseFlowMap(arena_alloc, str, self.dupe_behavior), self.dupe_behavior),
                                        }
                                    },
                                    // a new map is being created
                                    .indent => {
                                        // The indent is only legal directly after an own-line
                                        // key. expect_shift must be inspected BEFORE it is
                                        // reset, otherwise every nested map is rejected.
                                        if (expect_shift != .indent or dangling_key == null)
                                            return error.UnexpectedValue;

                                        const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
                                        try stack.append(new_map);
                                        dangling_key = null;

                                        expect_shift = .none;
                                        // direct puts are fine: the map was created on this line
                                        switch (pair.val) {
                                            .empty => {
                                                expect_shift = .indent;
                                                dangling_key = pair.key;
                                            },
                                            .scalar => |str| try new_map.map.put(pair.key, try Value.fromScalar(arena_alloc, str)),
                                            .line_string, .space_string => |str| try new_map.map.put(pair.key, try Value.fromString(arena_alloc, str)),
                                            .flow_list => |str| try new_map.map.put(pair.key, try parseFlowList(arena_alloc, str, self.dupe_behavior)),
                                            .flow_map => |str| try new_map.map.put(pair.key, try parseFlowMap(arena_alloc, str, self.dupe_behavior)),
                                        }
                                    },
                                }
                            },
                        }
                    },
                },
                .done => return error.ExtraContent,
            }

            // this is specifically performed at the end of the loop body so that
            // `continue :flipflop` skips setting it.
            flip = false;
        }
    }

    // finalizer: tie up whatever the last line left open
    switch (state) {
        .initial => switch (self.default_object) {
            .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
            .list => document.root = Value.newList(arena_alloc),
            .map => document.root = Value.newMap(arena_alloc),
            .fail => return error.EmptyDocument,
        },
        .value => switch (stack.getLast().*) {
            // remove the final trailing newline or space
            .scalar, .string => |*string| _ = string.popOrNull(),
            // if we have a dangling -, attach an empty string to it
            .list => |*list| if (expect_shift == .indent) try list.append(Value.newScalar(arena_alloc)),
            // if we have a dangling "key:", attach an empty string to it
            .map => |*map| if (dangling_key) |dk| try putMap(map, dk, Value.newScalar(arena_alloc), self.dupe_behavior),
            .flow_list, .flow_map => {},
        },
        .done => {},
    }

    return document;
}
/// Parse `contents` as a flow-style list using a temporary `FlowParser`
/// created from `alloc`, applying `dupe_behavior` during the parse. The
/// parser itself is released before returning.
fn parseFlowList(alloc: std.mem.Allocator, contents: []const u8, dupe_behavior: DuplicateKeyBehavior) Error!Value {
    var parser = try FlowParser.initList(alloc, contents);
    defer parser.deinit();

    return try parser.parse(dupe_behavior);
}
/// Parse `contents` as a flow-style map using a temporary `FlowParser`
/// created from `alloc`, applying `dupe_behavior` during the parse. The
/// parser itself is released before returning.
fn parseFlowMap(alloc: std.mem.Allocator, contents: []const u8, dupe_behavior: DuplicateKeyBehavior) Error!Value {
    var flow = try FlowParser.initMap(alloc, contents);
    defer flow.deinit();

    const parsed = try flow.parse(dupe_behavior);
    return parsed;
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Appends `value` to `list` and returns a pointer to the element that was
/// just stored.
///
/// The pointer refers into `list.items`, so it should not be held across a
/// later append to the same list.
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
    // Remember where the new element will land before growing the list.
    const slot = list.items.len;
    try list.append(value);
    return &list.items[slot];
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Stores `value` under `key` in `map`, resolving an already-present key
/// according to `dupe_behavior`.
///
/// This is `putMapGetValue` with the returned pointer discarded; any error it
/// reports is propagated.
inline fn putMap(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!void {
_ = try putMapGetValue(map, key, value, dupe_behavior);
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Stores `value` under `key` in `map` and returns a pointer to the slot
/// that now belongs to `key`.
///
/// When `key` is already present, `dupe_behavior` decides the outcome:
///   - `.fail`      returns `error.DuplicateKey`
///   - `.use_first` keeps the existing value (the pointer refers to it)
///   - `.use_last`  overwrites the existing value with `value`
inline fn putMapGetValue(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!*Value {
    const entry = try map.getOrPut(key);

    // Fresh key: the slot is uninitialized until we fill it.
    if (!entry.found_existing) {
        entry.value_ptr.* = value;
        return entry.value_ptr;
    }

    switch (dupe_behavior) {
        .fail => return error.DuplicateKey,
        .use_first => {},
        .use_last => entry.value_ptr.* = value,
    }
    return entry.value_ptr;
}
/// Debugging aid: tokenizes `buf` line by line and prints a description of
/// every line via `dumpLine`.
///
/// Tokenizer errors are propagated; the tokenizer is handed a pointer to this
/// parser's `diagnostics`.
pub fn dumpBufLines(self: *Parser, buf: []const u8) Error!void {
    var lines = LineTokenizer{ .buffer = buf, .diagnostics = &self.diagnostics };
    while (true) {
        const line = (try lines.next()) orelse break;
        dumpLine(line);
    }
}
/// Debugging aid: prints a one-line description of a tokenized line using
/// `std.debug.print`.
///
/// Output shape:
///   `<indent tag>[ (<dedent count>)]: <contents tag> => <payload>`
/// where the payload is the comment text, a rendered value, or
/// `<key> : <rendered value>` for map items.
///
/// The pieces are printed directly instead of being staged in fixed-size
/// scratch buffers, so arbitrarily long keys and values cannot overflow a
/// buffer.
fn dumpLine(line: LineTokenizer.Line) void {
    std.debug.print("{s}", .{@tagName(line.indent)});
    if (line.indent == .dedent)
        std.debug.print(" ({d})", .{line.indent.dedent});
    std.debug.print(": {s} => ", .{@tagName(line.contents)});

    switch (line.contents) {
        .comment => |str| std.debug.print("{s}", .{str}),
        .in_line, .list_item => |scalar| dumpLineValue(scalar),
        .map_item => |map| {
            std.debug.print("{s} : ", .{map.key});
            dumpLineValue(map.val);
        },
    }
    std.debug.print("\n", .{});
}

/// Prints a single tokenized value: `[empty]` for an empty value, otherwise
/// `<tag> => <text>`.
///
/// `value` is taken as `anytype` only so this helper does not need to spell
/// out the tokenizer's value type; the tag list mirrors the one the parser
/// switches on (`line_string` / `space_string` rather than a single `string`).
fn dumpLineValue(value: anytype) void {
    switch (value) {
        .empty => std.debug.print("[empty]", .{}),
        .scalar,
        .line_string,
        .space_string,
        .flow_list,
        .flow_map,
        => |str| std.debug.print("{s} => {s}", .{ @tagName(value), str }),
    }
}
};
pub const FlowParser = struct {
// One level of nesting in the flow parser: the container currently being
// filled plus the position where its current item began.
const FlowStackItem = struct {
// the flow list or flow map that items parsed at this level are added to.
value: *Value,
// index into the parser's buffer where the item currently being consumed
// starts. lists need this. maps do also for keys and values.
item_start: usize = 0,
};
// Stack of open containers; the last element is the innermost one.
const FlowStack: type = std.ArrayList(FlowStackItem);
// the text being parsed; `parse` walks it one byte at a time.
buffer: []const u8,
// the outermost container. left undefined by the init functions and
// assigned at the start of `parse`.
root: Value,
// allocator used for the stack and for the values created while parsing.
alloc: std.mem.Allocator,
// open containers, innermost last. (re)assigned at the start of `parse`.
stack: FlowStack,
// current state of the state machine; its initial value selects whether
// `parse` builds a list or a map root.
state: ParseState,
// make this an ugly state machine parser
//
// States of the character-by-character flow parser. `initList` starts in
// `want_list_item` and `initMap` starts in `want_map_key`; `parse` rejects
// any other starting state.
const ParseState = enum {
// looking for the start of the next list item (or the closing bracket).
want_list_item,
// inside a scalar list item; ends at ',' or ']'.
consuming_list_item,
// entered after a nested container closes inside a list.
want_list_separator,
// NOTE(review): the handling of the map states is outside this view; the
// names suggest they mirror the list states for keys and values — confirm.
want_map_key,
consuming_map_key,
want_map_value,
consuming_map_value,
// entered after a nested container closes inside a map.
want_map_separator,
// the outermost container has been closed and the stack is empty.
done,
};
// Errors produced by the flow parser's own helpers, combined with allocator
// failure. `BadState` is what the stack helpers report when the stack is
// unexpectedly empty.
// NOTE(review): where `BadToken` is raised is not visible from here.
const Error = error{
BadState,
BadToken,
} || std.mem.Allocator.Error;
/// Creates a flow parser over `buffer` whose state machine starts out
/// expecting a list item.
///
/// Nothing is allocated here. `root` is left undefined and is assigned by
/// `parse`. The return type is an error union for the benefit of existing
/// callers, but this function currently never fails.
pub fn initList(alloc: std.mem.Allocator, buffer: []const u8) Error!FlowParser {
    return .{
        .buffer = buffer,
        .root = undefined,
        .alloc = alloc,
        // Start with an empty stack rather than `undefined`: callers pair
        // this with `defer parser.deinit()`, and `deinit` must be safe even
        // if `parse` fails before it assigns the stack (or is never called).
        // An empty list owns no memory, so `parse` replacing it leaks nothing.
        .stack = FlowStack.init(alloc),
        .state = .want_list_item,
    };
}
/// Creates a flow parser over `buffer` whose state machine starts out
/// expecting a map key.
///
/// Nothing is allocated here. `root` is left undefined and is assigned by
/// `parse`. The return type is an error union for the benefit of existing
/// callers, but this function currently never fails.
pub fn initMap(alloc: std.mem.Allocator, buffer: []const u8) Error!FlowParser {
    return .{
        .buffer = buffer,
        .root = undefined,
        .alloc = alloc,
        // Start with an empty stack rather than `undefined`: callers pair
        // this with `defer parser.deinit()`, and `deinit` must be safe even
        // if `parse` fails before it assigns the stack (or is never called).
        // An empty list owns no memory, so `parse` replacing it leaks nothing.
        .stack = FlowStack.init(alloc),
        .state = .want_map_key,
    };
}
/// Releases the parser's container stack.
///
/// `self.stack` must hold an initialized list when this runs. The parsed
/// values themselves are not touched here.
pub fn deinit(self: *FlowParser) void {
self.stack.deinit();
}
/// Returns a pointer to the innermost (last) stack entry, or
/// `error.BadState` when the stack is empty.
///
/// The pointer refers into `stack.items`, so it should not be held across a
/// later append to the stack.
inline fn getStackTip(stack: FlowStack) Error!*FlowStackItem {
    const depth = stack.items.len;
    return if (depth > 0) &stack.items[depth - 1] else error.BadState;
}
/// Records `start` as the beginning of the current item on the innermost
/// stack entry. Returns `error.BadState` when the stack is empty.
inline fn setStackItemStart(stack: FlowStack, start: usize) Error!void {
    const tip = try getStackTip(stack);
    tip.item_start = start;
}
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
/// Closes the innermost container and reports which state the parser should
/// continue in.
///
/// Returns:
///   - `.done` when the popped container was the outermost one
///   - `.want_list_separator` when the new innermost container is a flow list
///   - `.want_map_separator` when the new innermost container is a flow map
///
/// Fails with `error.BadState` when the stack was already empty, or when the
/// new innermost value is neither a flow list nor a flow map.
inline fn popStack(self: *FlowParser) Parser.Error!ParseState {
    _ = self.stack.popOrNull() orelse return error.BadState;

    const remaining = self.stack.items;
    if (remaining.len == 0) return .done;

    const parent = remaining[remaining.len - 1];
    switch (parent.value.*) {
        .flow_list => return .want_list_separator,
        .flow_map => return .want_map_separator,
        else => return error.BadState,
    }
}
pub fn parse(self: *FlowParser, dupe_behavior: Parser.DuplicateKeyBehavior) Parser.Error!Value {
// prime the stack:
switch (self.state) {
.want_list_item => {
self.root = Value.newFlowList(self.alloc);
self.stack = try FlowStack.initCapacity(self.alloc, 1);
self.stack.appendAssumeCapacity(.{ .value = &self.root });
},
.want_map_key => {
self.root = Value.newFlowMap(self.alloc);
self.stack = try FlowStack.initCapacity(self.alloc, 1);
self.stack.appendAssumeCapacity(.{ .value = &self.root });
},
else => {
return error.BadState;
},
}
2023-09-17 23:09:26 -07:00
var dangling_key: ?[]const u8 = null;
charloop: for (self.buffer, 0..) |char, idx| {
// std.debug.print("{s} => {c}\n", .{ @tagName(self.state), char });
switch (self.state) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
',' => {
// empty value
const tip = try getStackTip(self.stack);
try tip.value.flow_list.append(Value.newScalar(self.alloc));
tip.item_start = idx + 1;
},
'{' => {
const tip = try getStackTip(self.stack);
const new_map = try Parser.appendListGetValue(
&tip.value.flow_list,
Value.newFlowMap(self.alloc),
);
tip.item_start = idx;
try self.stack.append(.{ .value = new_map });
self.state = .want_map_key;
},
'[' => {
const tip = try getStackTip(self.stack);
const new_list = try Parser.appendListGetValue(
&tip.value.flow_list,
Value.newFlowList(self.alloc),
);
tip.item_start = idx;
try self.stack.append(.{ .value = new_list, .item_start = idx + 1 });
self.state = .want_list_item;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
']' => {
const finished = self.stack.getLastOrNull() orelse return error.BadState;
if (finished.value.flow_list.items.len > 0 or idx > finished.item_start)
try finished.value.flow_list.append(Value.newScalar(self.alloc));
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
self.state = try self.popStack();
},
else => {
try setStackItemStart(self.stack, idx);
self.state = .consuming_list_item;
},
},
.consuming_list_item => switch (char) {
',' => {
const tip = try getStackTip(self.stack);
try tip.value.flow_list.append(
try Value.fromScalar(self.alloc, self.buffer[tip.item_start..idx]),
);
tip.item_start = idx + 1;
self.state = .want_list_item;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
']' => {
const finished = self.stack.getLastOrNull() orelse return error.BadState;
try finished.value.flow_list.append(
try Value.fromScalar(self.alloc, self.buffer[finished.item_start..idx]),
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
);
self.state = try self.popStack();
},
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => {
try setStackItemStart(self.stack, idx);
self.state = .want_list_item;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
']' => self.state = try self.popStack(),
else => return error.BadToken,
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
'{', '[', '#', '>', '|', ',' => return error.BadToken,
':' => {
// we have an empty map key
2023-09-17 23:09:26 -07:00
dangling_key = "";
self.state = .want_map_value;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
'}' => self.state = try self.popStack(),
else => {
try setStackItemStart(self.stack, idx);
self.state = .consuming_map_key;
},
},
.consuming_map_key => switch (char) {
':' => {
const tip = try getStackTip(self.stack);
2023-09-17 23:09:26 -07:00
dangling_key = self.buffer[tip.item_start..idx];
self.state = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
',' => {
const tip = try getStackTip(self.stack);
try Parser.putMap(
&tip.value.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newScalar(self.alloc),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
self.state = .want_map_key;
},
'[' => {
const tip = try getStackTip(self.stack);
const new_list = try Parser.putMapGetValue(
&tip.value.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newFlowList(self.alloc),
dupe_behavior,
);
try self.stack.append(.{ .value = new_list, .item_start = idx + 1 });
2023-09-17 23:09:26 -07:00
dangling_key = null;
self.state = .want_list_item;
},
'{' => {
const tip = try getStackTip(self.stack);
const new_map = try Parser.putMapGetValue(
&tip.value.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newFlowMap(self.alloc),
dupe_behavior,
);
try self.stack.append(.{ .value = new_map });
2023-09-17 23:09:26 -07:00
dangling_key = null;
self.state = .want_map_key;
},
'}' => {
// the value is an empty string and this map is closed
const tip = try getStackTip(self.stack);
try Parser.putMap(
&tip.value.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newScalar(self.alloc),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
self.state = try self.popStack();
},
else => {
try setStackItemStart(self.stack, idx);
self.state = .consuming_map_value;
},
},
.consuming_map_value => switch (char) {
',', '}' => |term| {
const tip = try getStackTip(self.stack);
try Parser.putMap(
&tip.value.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
try Value.fromScalar(self.alloc, self.buffer[tip.item_start..idx]),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
self.state = .want_map_key;
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
if (term == '}') self.state = try self.popStack();
},
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => self.state = .want_map_key,
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
'}' => self.state = try self.popStack(),
else => return error.BadToken,
},
// the root value was closed but there are characters remaining
// in the buffer
.done => return error.BadState,
}
}
// we ran out of characters while still in the middle of an object
if (self.state != .done) return error.BadState;
return self.root;
}
};