nice-data/src/parser.zig

719 lines
34 KiB
Zig
Raw Normal View History

2023-09-13 00:11:45 -07:00
const std = @import("std");
const buffers = @import("./linebuffer.zig");
const tokenizer = @import("./tokenizer.zig");
const Value = @import("./parser/value.zig").Value;
2023-09-13 00:11:45 -07:00
/// Out-of-band description of a problem found in the input.
///
/// NOTE(review): the field meanings below are inferred from their names only;
/// the code that fills them in is not in this part of the file — confirm
/// against the tokenizer, which is handed a pointer to this struct.
pub const Diagnostics = struct {
    /// presumably the line (row) of the input the message refers to
    row: usize,
    /// presumably the region of the input the message refers to
    span: struct {
        absolute: usize,
        line_offset: usize,
        length: usize,
    },
    /// human-readable description
    message: []const u8,
};
/// Every error the parser can return: the parser's own failures merged with
/// the tokenizer's error set and allocation failure.
pub const Error = error{
    /// a line was indented where no indentation is permitted
    UnexpectedIndent,
    /// a line's content is not valid for the container currently being filled
    UnexpectedValue,
    /// more content followed a document that was already complete
    ExtraContent,
    /// the input held no values and the default object is `.fail`
    EmptyDocument,
    /// NOTE(review): not raised in the visible code; presumably produced when
    /// a repeated map key meets `DuplicateKeyBehavior.fail` — confirm
    DuplicateKey,
    /// NOTE(review): not raised in the visible code — confirm usage
    BadMapEntry,
    /// the flow parser's stack or root type was not what was expected
    BadState,
    /// NOTE(review): not raised in the visible code — confirm usage
    BadToken,
    /// an internal expectation did not hold (e.g. a map value arrived while
    /// no key was pending)
    Fail,
} || tokenizer.Error || std.mem.Allocator.Error;
/// Policy handed to the map-insertion helpers and to `parseFlow` for keys that
/// occur more than once in the same map.
///
/// NOTE(review): the insertion helper is not visible here; the per-variant
/// descriptions are taken from the variant names — confirm.
pub const DuplicateKeyBehavior = enum {
    /// presumably: keep the value from the first occurrence
    use_first,
    /// presumably: keep the value from the last occurrence
    use_last,
    /// presumably: treat the duplicate as an error
    fail,
};
/// What the document root becomes when the input contains no values at all
/// (the parser is still in its initial state once the input is exhausted).
pub const DefaultObject = enum {
    /// root is an empty scalar
    scalar,
    /// root is an empty string
    string,
    /// root is an empty list
    list,
    /// root is an empty map
    map,
    /// parsing fails with `error.EmptyDocument`
    fail,
};
/// Top-level state of the line-oriented document parser.
const ParseState = enum {
    /// no non-comment line has been consumed yet
    initial,
    /// the root value exists and further lines are added to it
    value,
    /// the root was completed by a single line; any further content is an error
    done,
};
/// A parsed document: the root `Value` plus the arena that backs it.
///
/// `parseBuffer` allocates every value it builds from `arena.allocator()`, so
/// calling `deinit` releases the whole tree at once.
pub const Document = struct {
    arena: std.heap.ArenaAllocator,
    root: Value,

    /// Create a document whose arena draws from `alloc`.
    ///
    /// `root` is left undefined and must be assigned before it is read.
    pub fn init(alloc: std.mem.Allocator) Document {
        return .{
            .arena = std.heap.ArenaAllocator.init(alloc),
            .root = undefined,
        };
    }

    /// Forward to the root value's `printDebug`.
    pub fn printDebug(self: Document) void {
        return self.root.printDebug();
    }

    /// Release the arena and, with it, everything allocated from it.
    pub fn deinit(self: Document) void {
        self.arena.deinit();
    }
};
2023-09-13 00:11:45 -07:00
pub const Parser = struct {
/// Backing allocator for the arena of every Document this parser creates.
allocator: std.mem.Allocator,
/// Policy for repeated map keys; forwarded to the map-insertion helpers and to
/// the flow parser.
dupe_behavior: DuplicateKeyBehavior = .fail,
/// What the root becomes when the input contains no values at all.
default_object: DefaultObject = .fail,
/// Diagnostic record; a pointer to it is handed to the line tokenizer.
/// NOTE(review): the visible parser code never writes to it itself — confirm
/// that the tokenizer is what fills it in.
diagnostics: Diagnostics = .{
.row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well",
},
/// Bundle of the mutable state a parse carries from line to line.
///
/// NOTE(review): `parseBuffer` in this file keeps the same pieces in local
/// variables and does not use this struct; presumably groundwork for a
/// streaming parser — confirm.
pub const State = struct {
    /// Stack of pointers to the values currently being filled in.
    pub const Stack = std.ArrayList(*Value);

    document: Document,
    value_stack: Stack,
    state: enum { initial, value, done } = .initial,
    expect_shift: tokenizer.ShiftDirection = .none,
    /// A map key seen on its own line, waiting for its value.
    dangling_key: ?[]const u8 = null,

    /// Build a fresh state; both the document and the stack use `alloc`.
    pub fn init(alloc: std.mem.Allocator) State {
        const fresh_document = Document.init(alloc);
        const empty_stack = Stack.init(alloc);
        return State{
            .document = fresh_document,
            .value_stack = empty_stack,
        };
    }

    /// Free the value stack. The document is not deinitialized here.
    pub fn deinit(self: State) void {
        self.value_stack.deinit();
    }
};
/// Parse `buffer` in one pass and return the resulting Document.
///
/// The input is tokenized line by line. A stack of pointers tracks the value
/// currently being filled in (string, list or map); indentation changes
/// reported by the tokenizer push onto and pop from that stack.
///
/// Ownership: every value is allocated from the returned Document's arena,
/// which is backed by `self.allocator`. On success the caller is responsible
/// for calling `deinit` on the Document; on error the arena is released
/// before returning.
///
/// Errors (besides tokenizer and allocation errors):
/// - `UnexpectedIndent`: a line is indented where that is not allowed.
/// - `UnexpectedValue`: a line's content does not fit the current container.
/// - `ExtraContent`: content follows a document completed by its first line.
/// - `EmptyDocument`: no values were found and `default_object` is `.fail`.
/// - `Fail`: an empty map entry had to be closed but no key was pending.
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
    var document = Document.init(self.allocator);
    errdefer document.deinit();
    const arena_alloc = document.arena.allocator();

    var state: ParseState = .initial;
    var expect_shift: tokenizer.ShiftDirection = .none;
    var dangling_key: ?[]const u8 = null;
    var stack = std.ArrayList(*Value).init(arena_alloc);
    defer stack.deinit();

    var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
        .buffer = buffers.FixedLineBuffer.init(buffer),
        .diagnostics = &self.diagnostics,
    };

    while (try tok.next()) |line| {
        if (line.contents == .comment) continue;

        var flip = true;
        var flop = false;
        // this is needed to give us a second go round when the line is dedented
        flipflop: while (flip) : (flop = true) {
            switch (state) {
                .initial => {
                    if (line.shift == .indent) return error.UnexpectedIndent;

                    switch (line.contents) {
                        // we filter out comments above
                        .comment => unreachable,
                        .in_line => |in_line| switch (in_line) {
                            // empty scalars are only emitted for a list_item or a map_item
                            .empty => unreachable,
                            .scalar => |str| {
                                document.root = try Value.fromScalar(arena_alloc, str);
                                // this is a cheesy hack. If the document consists
                                // solely of a scalar, the finalizer will try to
                                // chop a line ending off of it, so we need to add
                                // a sacrificial padding character to avoid
                                // chopping off something that matters.
                                //
                                // NOTE(review): the root was just built by
                                // Value.fromScalar, yet this appends through the
                                // `.string` field — confirm which tag fromScalar
                                // produces. Also, the state moves straight to
                                // .done and the final switch below does nothing
                                // for .done, so confirm where this padding
                                // character is actually removed.
                                try document.root.string.append(' ');
                                state = .done;
                            },
                            .line_string, .space_string => |str| {
                                document.root = try Value.fromString(arena_alloc, str);
                                try document.root.string.append(in_line.lineEnding());
                                try stack.append(&document.root);
                                state = .value;
                            },
                            .flow_list => |str| {
                                document.root = try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior);
                                state = .done;
                            },
                            .flow_map => |str| {
                                document.root = try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior);
                                state = .done;
                            },
                        },
                        .list_item => |value| {
                            document.root = Value.newList(arena_alloc);
                            try stack.append(&document.root);
                            state = .value;

                            switch (value) {
                                .empty => expect_shift = .indent,
                                .scalar => |str| try document.root.list.append(try Value.fromScalar(arena_alloc, str)),
                                .line_string, .space_string => |str| try document.root.list.append(try Value.fromString(arena_alloc, str)),
                                .flow_list => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
                                .flow_map => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
                            }
                        },
                        .map_item => |pair| {
                            document.root = Value.newMap(arena_alloc);
                            try stack.append(&document.root);
                            state = .value;

                            const dupekey = try arena_alloc.dupe(u8, pair.key);
                            switch (pair.val) {
                                .empty => {
                                    expect_shift = .indent;
                                    // If the key is on its own line, we don't have
                                    // an associated value until we parse the next
                                    // line. We need to store a reference to this
                                    // key somewhere until we can consume the
                                    // value. More parser state to lug along.
                                    dangling_key = dupekey;
                                },
                                .scalar => |str| try document.root.map.put(dupekey, try Value.fromScalar(arena_alloc, str)),
                                .line_string, .space_string => |str| try document.root.map.put(dupekey, try Value.fromString(arena_alloc, str)),
                                .flow_list => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
                                .flow_map => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
                            }
                        },
                    }
                },
                .value => switch (stack.getLast().*) {
                    // these three states are never reachable here. flow_list and
                    // flow_map are parsed with a separate state machine. These
                    // value types can only be present by themselves as the first
                    // line of the document, in which case the document consists
                    // only of that single line: this parser jumps immediately into
                    // the .done state, bypassing the .value state in which this
                    // switch is embedded.
                    .scalar, .flow_list, .flow_map => unreachable,
                    .string => |*string| {
                        if (line.shift == .indent)
                            return error.UnexpectedIndent;

                        if (!flop and line.shift == .dedent) {
                            // kick off the last trailing space or newline
                            _ = string.pop();

                            var dedent_depth = line.shift.dedent;
                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| switch (in_line) {
                                .empty => unreachable,
                                .line_string, .space_string => |str| {
                                    try string.appendSlice(str);
                                    try string.append(in_line.lineEnding());
                                },
                                else => return error.UnexpectedValue,
                            },
                            else => return error.UnexpectedValue,
                        }
                    },
                    .list => |*list| {
                        // detect that the previous item was actually empty
                        //
                        //    -
                        //    - something
                        //
                        // the first line here creates the expect_shift, but the second line
                        // is a valid continuation of the list despite not being indented
                        if (!flop and (expect_shift == .indent and line.shift != .indent))
                            try list.append(Value.newScalar(arena_alloc));

                        // Consider:
                        //
                        //    -
                        //      own-line scalar
                        //    - inline scalar
                        //
                        // the own-line scalar will not push the stack but the next list item will be a dedent
                        if (!flop and line.shift == .dedent) {
                            // if line.shift.dedent is 1 and we're expecting it, the stack will not be popped,
                            // but we will continue loop flipflop. However, flop will be set to true on the
                            // next trip (by the loop's continue expression), so this if prong will not be
                            // run again.
                            var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);

                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| {
                                // assert that this line has been indented. this is required for an inline value when
                                // the stack is in list mode.
                                if (expect_shift != .indent or line.shift != .indent)
                                    return error.UnexpectedValue;

                                expect_shift = .dedent;
                                switch (in_line) {
                                    .empty => unreachable,
                                    .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
                                    .flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
                                    .flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
                                    .line_string, .space_string => |str| {
                                        // string pushes the stack
                                        const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
                                        try stack.append(new_string);

                                        try new_string.string.append(in_line.lineEnding());
                                        expect_shift = .none;
                                    },
                                }
                            },
                            .list_item => |value| {
                                if (flop or (line.shift == .none or line.shift == .dedent)) {
                                    expect_shift = .none;
                                    switch (value) {
                                        .empty => expect_shift = .indent,
                                        .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
                                        .line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
                                        .flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
                                        .flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
                                    }
                                } else if (line.shift == .indent) {
                                    if (expect_shift != .indent) return error.UnexpectedIndent;

                                    // an indented list item opens a nested list; the item
                                    // itself is added on the second pass of flipflop
                                    const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
                                    try stack.append(new_list);
                                    expect_shift = .none;
                                    continue :flipflop;
                                } else unreachable;
                            },
                            .map_item => {
                                // this prong cannot be hit on dedent in a valid way.
                                //
                                //    -
                                //      map: value
                                //      second: value
                                //    third: value
                                //
                                // dedenting back to the list stack level requires list_item

                                if (line.shift != .indent)
                                    return error.UnexpectedValue;

                                const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
                                try stack.append(new_map);
                                expect_shift = .none;
                                continue :flipflop;
                            },
                        }
                    },
                    .map => |*map| {
                        // detect that the previous item was actually empty
                        //
                        //    foo:
                        //    bar: baz
                        //
                        // the first line here creates the expect_shift, but the second line
                        // is a valid continuation of the map despite not being indented
                        if (!flop and (expect_shift == .indent and line.shift != .indent)) {
                            try putMap(
                                map,
                                dangling_key orelse return error.Fail,
                                Value.newScalar(arena_alloc),
                                self.dupe_behavior,
                            );
                            dangling_key = null;
                        }

                        if (!flop and line.shift == .dedent) {
                            var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);

                            while (dedent_depth > 0) : (dedent_depth -= 1)
                                _ = stack.pop();

                            continue :flipflop;
                        }

                        switch (line.contents) {
                            .comment => unreachable,
                            .in_line => |in_line| {
                                // assert that this line has been indented. this is required for an inline value when
                                // the stack is in map mode.
                                if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
                                    return error.UnexpectedValue;

                                expect_shift = .dedent;

                                switch (in_line) {
                                    .empty => unreachable,
                                    .scalar => |str| try putMap(map, dangling_key.?, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
                                    .flow_list => |str| try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
                                    .flow_map => |str| {
                                        try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior);
                                    },
                                    .line_string, .space_string => |str| {
                                        // string pushes the stack
                                        const new_string = try putMapGetValue(map, dangling_key.?, try Value.fromString(arena_alloc, str), self.dupe_behavior);
                                        try new_string.string.append(in_line.lineEnding());
                                        try stack.append(new_string);
                                        expect_shift = .none;
                                    },
                                }

                                dangling_key = null;
                            },
                            .list_item => {
                                // this prong cannot be hit on dedent in a valid way.
                                //
                                //    map:
                                //      - value
                                //    - invalid
                                //
                                // dedenting back to the map stack level requires map_item

                                if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
                                    return error.UnexpectedValue;

                                const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
                                try stack.append(new_list);
                                dangling_key = null;
                                expect_shift = .none;
                                continue :flipflop;
                            },
                            .map_item => |pair| {
                                if (flop or (line.shift == .none or line.shift == .dedent)) {
                                    expect_shift = .none;
                                    const dupekey = try arena_alloc.dupe(u8, pair.key);
                                    switch (pair.val) {
                                        .empty => {
                                            expect_shift = .indent;
                                            dangling_key = dupekey;
                                        },
                                        .scalar => |str| try putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
                                        .line_string, .space_string => |str| try putMap(map, dupekey, try Value.fromString(arena_alloc, str), self.dupe_behavior),
                                        .flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
                                        .flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior),
                                    }
                                } else if (line.shift == .indent) {
                                    if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue;

                                    // an indented map item opens a nested map under the
                                    // pending key; the item itself is added on the second
                                    // pass of flipflop
                                    const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
                                    try stack.append(new_map);
                                    dangling_key = null;
                                    continue :flipflop;
                                } else unreachable;
                            },
                        }
                    },
                },
                .done => return error.ExtraContent,
            }

            // this is specifically performed at the end of the loop body so that
            // `continue :flipflop` skips setting it.
            flip = false;
        }
    }

    // input exhausted: finish off whatever was still open
    switch (state) {
        .initial => switch (self.default_object) {
            .scalar => document.root = .{ .scalar = std.ArrayList(u8).init(arena_alloc) },
            .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
            .list => document.root = Value.newList(arena_alloc),
            .map => document.root = Value.newMap(arena_alloc),
            .fail => return error.EmptyDocument,
        },
        .value => switch (stack.getLast().*) {
            // remove the final trailing newline or space
            .scalar, .string => |*string| _ = string.popOrNull(),
            // if we have a dangling -, attach an empty string to it
            .list => |*list| if (expect_shift == .indent) try list.append(Value.newScalar(arena_alloc)),
            // if we have a dangling "key:", attach an empty string to it
            .map => |*map| if (dangling_key) |dk| try putMap(map, dk, Value.newScalar(arena_alloc), self.dupe_behavior),
            .flow_list, .flow_map => {},
        },
        .done => {},
    }

    return document;
}
/// Stack of pointers to the flow containers currently open in the flow parser.
const FlowStack = std.ArrayList(*Value);
/// Return the value on top of `stack` without removing it.
///
/// Fails with `error.BadState` when the stack is empty.
inline fn getStackTip(stack: FlowStack) Error!*Value {
    return stack.getLastOrNull() orelse return error.BadState;
}
/// Close the innermost open flow container and report the state the flow
/// parser should continue in.
///
/// Pops the top of `stack`, then inspects the new top (the parent container):
/// - no parent left: the outermost container was closed, so the result is `.done`;
/// - parent is a flow list: `.want_list_separator`;
/// - parent is a flow map: `.want_map_separator`.
///
/// Fails with `error.BadState` when the stack was already empty or the parent
/// is not a flow container.
inline fn popStack(stack: *FlowStack) Error!FlowParseState {
    if (stack.popOrNull() == null)
        return error.BadState;

    const parent = stack.getLastOrNull() orelse return .done;

    return switch (parent.*) {
        .flow_list => .want_list_separator,
        .flow_map => .want_map_separator,
        else => return error.BadState,
    };
}
/// States of the character-level flow parser (`parseFlow`).
const FlowParseState = enum {
    /// inside a flow list, before the start of the next item; this is the
    /// starting state when the root is a flow list
    want_list_item,
    /// in the middle of a scalar list item
    consuming_list_item,
    /// a nested container inside a flow list was just closed
    want_list_separator,
    /// starting state when the root is a flow map, and the state entered when
    /// a nested flow map is opened
    want_map_key,
    consuming_map_key,
    want_map_value,
    consuming_map_value,
    /// a nested container inside a flow map was just closed
    want_map_separator,
    /// the outermost container has been closed
    done,
};
pub fn parseFlow(
alloc: std.mem.Allocator,
contents: []const u8,
root_type: Value.TagType,
dupe_behavior: DuplicateKeyBehavior,
) Error!Value {
var root: Value = switch (root_type) {
.flow_list => Value.newFlowList(alloc),
.flow_map => Value.newFlowMap(alloc),
else => return error.BadState,
};
var state: FlowParseState = switch (root_type) {
.flow_list => .want_list_item,
.flow_map => .want_map_key,
else => unreachable,
};
var stack = try FlowStack.initCapacity(alloc, 1);
stack.appendAssumeCapacity(&root);
// used to distinguish between [] and [ ], and it also tracks
// a continuous value between different states
var item_start: usize = 0;
2023-09-17 23:09:26 -07:00
var dangling_key: ?[]const u8 = null;
charloop: for (contents, 0..) |char, idx| {
switch (state) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
',' => {
// empty value
const tip = try getStackTip(stack);
try tip.flow_list.append(Value.newScalar(alloc));
item_start = idx + 1;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowMap(alloc),
);
item_start = idx;
try stack.append(new_map);
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowList(alloc),
);
item_start = idx + 1;
try stack.append(new_list);
state = .want_list_item;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
if (finished.flow_list.items.len > 0 or idx > item_start)
try finished.flow_list.append(Value.newScalar(alloc));
state = try popStack(&stack);
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
},
else => {
item_start = idx;
state = .consuming_list_item;
},
},
.consuming_list_item => switch (char) {
',' => {
const tip = try getStackTip(stack);
try tip.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
);
item_start = idx + 1;
state = .want_list_item;
},
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
try finished.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
);
state = try popStack(&stack);
config: start doing some code cleanup I was pretty sloppy with the code organization while writing out the state machines because my focus was on thinking through the parsing process and logic there. However, The code was not in good shape to continue implementing code features (not document features). This is the first of probably several commits that will work on cleaning up some things. Value has been promoted to the top level namespace, and Document has an initializer function. Referencing Value.List and Value.Map are much cleaner now. Type aliases are good. For the flow parser, `popStack` does not have to access anything except the current stack. This can be passed in as a parameter. This means that `parse` is ready to be refactored to take a buffer and an allocator. The main next steps for code improvement are: 1. reentrant/streaming parser. I am planning to leave it as line-buffered, though I could go further. Line-buffered has two main benefits: the tokenizer doesn't need to be refactored significantly, and the flow parser doesn't need to be made reentrant. I may reevaluate this as I am implementing it, however, as those changes may be simpler than I think. 2. Actually implement the error diagnostics info. I have some skeleton structure in place for this, so it should just be doing the work of getting it hooked up. 3. Parse into object. Metaprogramming, let's go. It will be interesting to try to do this non-recursively, as well (curious to see if it results in code bloat). 4. Object to Document. This is probably going to be annoying, since there are a variety of edge cases that will have to be handled. And lots of objects that cannot be represented as documents. 5. Serialize Document. One thing the parser does not preserve is whether a Value was flow-style or not, so it will be impossible to do round-trip formatting preservation. 
That's currently a non-goal, and I haven't decided yet if flow-style output should be based on some heuristic (number/length of values in container) or just never emitted. Lack of round-trip preservation does make using this as a general purpose config format a lot more dubious, so I will have to think about this some more. 6. Document to JSON. Why not? I will hand roll this and it will suck. And then everything will be perfect and never need to be touched again.
2023-09-18 00:01:36 -07:00
},
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => {
item_start = idx;
state = .want_list_item;
},
']' => state = try popStack(&stack),
else => return error.BadToken,
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return error.BadToken,
':' => {
// we have an empty map key
2023-09-17 23:09:26 -07:00
dangling_key = "";
state = .want_map_value;
},
'}' => state = try popStack(&stack),
else => {
item_start = idx;
state = .consuming_map_key;
},
},
.consuming_map_key => switch (char) {
':' => {
dangling_key = try alloc.dupe(u8, contents[item_start..idx]);
state = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
',' => {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.putMapGetValue(
&tip.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newFlowList(alloc),
dupe_behavior,
);
try stack.append(new_list);
2023-09-17 23:09:26 -07:00
dangling_key = null;
item_start = idx + 1;
state = .want_list_item;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.putMapGetValue(
&tip.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newFlowMap(alloc),
dupe_behavior,
);
try stack.append(new_map);
2023-09-17 23:09:26 -07:00
dangling_key = null;
state = .want_map_key;
},
'}' => {
// the value is an empty string and this map is closed
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
state = try popStack(&stack);
},
else => {
item_start = idx;
state = .consuming_map_value;
},
},
.consuming_map_value => switch (char) {
',', '}' => |term| {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
2023-09-17 23:09:26 -07:00
dangling_key.?,
try Value.fromScalar(alloc, contents[item_start..idx]),
dupe_behavior,
);
2023-09-17 23:09:26 -07:00
dangling_key = null;
state = .want_map_key;
if (term == '}') state = try popStack(&stack);
},
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => state = .want_map_key,
'}' => state = try popStack(&stack),
else => return error.BadToken,
},
// the root value was closed but there are characters remaining
// in the buffer
.done => return error.BadState,
}
}
// we ran out of characters while still in the middle of an object
if (state != .done) return error.BadState;
return root;
}
/// Append `value` to `list` and return a pointer to the element as stored
/// inside the list.
///
/// Propagates any error raised while growing the list.
///
/// NOTE(review): presumably `Value.List` is a `std.ArrayList(Value)`; if so,
/// the returned pointer is only valid until the list next reallocates, so it
/// should not be held across further appends to the same list — confirm.
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
    // Reserve the new trailing slot first, then fill it in place.
    const slot = try list.addOne();
    slot.* = value;
    return slot;
}
/// Store `value` under `key` in `map`, resolving a key collision according to
/// `dupe_behavior`.
///
/// This is the fire-and-forget form of `putMapGetValue`: it performs the same
/// insertion and propagates the same errors, but drops the pointer to the
/// stored value.
inline fn putMap(
    map: *Value.Map,
    key: []const u8,
    value: Value,
    dupe_behavior: DuplicateKeyBehavior,
) Error!void {
    // Only the side effect on `map` matters to callers of this variant.
    _ = try putMapGetValue(map, key, value, dupe_behavior);
}
/// Store `value` under `key` in `map` and return a pointer to the value slot
/// that `key` maps to afterwards.
///
/// When `key` is not yet present, `value` is written into the new slot.
/// When `key` is already present, `dupe_behavior` decides the outcome:
///   - `.fail`: returns `error.DuplicateKey`; the existing value is untouched.
///   - `.use_first`: the existing value is kept and `value` is not stored.
///   - `.use_last`: the existing value is overwritten with `value`.
///
/// Note that under `.use_first` the returned pointer refers to the value that
/// was already in the map, not to the `value` argument.
///
/// Errors from `map.getOrPut` are propagated unchanged.
inline fn putMapGetValue(
    map: *Value.Map,
    key: []const u8,
    value: Value,
    dupe_behavior: DuplicateKeyBehavior,
) Error!*Value {
    const entry = try map.getOrPut(key);

    // Fresh key: the slot is new, so fill it unconditionally.
    if (!entry.found_existing) {
        entry.value_ptr.* = value;
        return entry.value_ptr;
    }

    // Key collision: apply the caller's duplicate-key policy.
    switch (dupe_behavior) {
        .fail => return error.DuplicateKey,
        .use_first => {},
        .use_last => entry.value_ptr.* = value,
    }
    return entry.value_ptr;
}
};