parser: stateful reentrancy

Finally, the flow parser has been "integrated" with the main parser, in
that they now share a stack. The bigger thing is that the parsing has
been decoupled from the tokenization, which will allow parsing
documents without loading them fully into memory first.

I've been calling this the streaming parser, but it's worth noting that
I am referring to streaming input, not streaming output. It would
certainly be possible to do streaming output, but I am not interested
in that at the moment (it would be the lowest-memory-overhead
approach, but it's a lot of work for little gain, and it is less
flexible for converting input to objects).
This commit is contained in:
torque 2023-09-24 22:24:33 -07:00
parent 38e47b39dc
commit 1d65b072ee
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
2 changed files with 669 additions and 667 deletions

View File

@ -3,6 +3,8 @@ const std = @import("std");
const buffers = @import("./linebuffer.zig"); const buffers = @import("./linebuffer.zig");
const tokenizer = @import("./tokenizer.zig"); const tokenizer = @import("./tokenizer.zig");
const Value = @import("./parser/value.zig").Value; const Value = @import("./parser/value.zig").Value;
const State = @import("./parser/state.zig").State;
const Document = @import("./parser/state.zig").Document;
pub const Diagnostics = struct { pub const Diagnostics = struct {
row: usize, row: usize,
@ -28,691 +30,40 @@ pub const DuplicateKeyBehavior = enum {
fail, fail,
}; };
pub const DefaultObject = enum { pub const Options = struct {
scalar, // If a mapping has multiple entries with the same key, this option defines how the
string, // parser should behave. The default behavior is to emit an error if a repeated key
list, // is encountered.
map, duplicate_key_behavior: DuplicateKeyBehavior = .fail,
fail,
};
const ParseState = enum { initial, value, done }; // If an empty document is parsed, this defines what value type should be the
// resulting document root object. The default behavior is to emit an error if the
pub const Document = struct { // document is empty.
arena: std.heap.ArenaAllocator, default_object: enum { string, list, map, fail } = .fail,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{
.arena = std.heap.ArenaAllocator.init(alloc),
.root = undefined,
};
}
pub fn printDebug(self: Document) void {
return self.root.printDebug();
}
pub fn deinit(self: Document) void {
self.arena.deinit();
}
}; };
pub const Parser = struct { pub const Parser = struct {
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
dupe_behavior: DuplicateKeyBehavior = .fail, options: Options = .{},
default_object: DefaultObject = .fail,
diagnostics: Diagnostics = .{ diagnostics: Diagnostics = .{
.row = 0, .row = 0,
.span = .{ .absolute = 0, .line_offset = 0, .length = 0 }, .span = .{ .absolute = 0, .line_offset = 0, .length = 0 },
.message = "all is well", .message = "all is well",
}, },
pub const State = struct {
pub const Stack = std.ArrayList(*Value);
document: Document,
value_stack: Stack,
state: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null,
pub fn init(alloc: std.mem.Allocator) State {
return .{
.document = Document.init(alloc),
.value_stack = Stack.init(alloc),
};
}
pub fn deinit(self: State) void {
self.value_stack.deinit();
}
};
pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document { pub fn parseBuffer(self: *Parser, buffer: []const u8) Error!Document {
var document = Document.init(self.allocator);
errdefer document.deinit();
const arena_alloc = document.arena.allocator();
var state: ParseState = .initial;
var expect_shift: tokenizer.ShiftDirection = .none;
var dangling_key: ?[]const u8 = null;
var stack = std.ArrayList(*Value).init(arena_alloc);
defer stack.deinit();
var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{ var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{
.buffer = buffers.FixedLineBuffer.init(buffer), .buffer = buffers.FixedLineBuffer.init(buffer),
.diagnostics = &self.diagnostics, .diagnostics = &self.diagnostics,
}; };
while (try tok.next()) |line| { var state = State.init(self.allocator);
if (line.contents == .comment) continue; defer state.deinit();
errdefer state.document.deinit();
var flip = true; // TODO: pass the diagnostics pointer as well
var flop = false; while (try tok.next()) |line| try state.parseLine(line, self.options.duplicate_key_behavior);
// this is needed to give us a second go round when the line is dedented
flipflop: while (flip) : (flop = true) {
switch (state) {
.initial => {
if (line.shift == .indent) return error.UnexpectedIndent;
switch (line.contents) { return try state.finish(self.options);
// we filter out comments above
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
// empty scalars are only emitted for a list_item or a map_item
.empty => unreachable,
.scalar => |str| {
document.root = try Value.fromScalar(arena_alloc, str);
// this is a cheesy hack. If the document consists
// solely of a scalar, the finalizer will try to
// chop a line ending off of it, so we need to add
// a sacrificial padding character to avoid
// chopping off something that matters.
try document.root.string.append(' ');
state = .done;
},
.line_string, .space_string => |str| {
document.root = try Value.fromString(arena_alloc, str);
try document.root.string.append(in_line.lineEnding());
try stack.append(&document.root);
state = .value;
},
.flow_list => |str| {
document.root = try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior);
state = .done;
},
.flow_map => |str| {
document.root = try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior);
state = .done;
},
},
.list_item => |value| {
document.root = Value.newList(arena_alloc);
try stack.append(&document.root);
state = .value;
switch (value) {
.empty => expect_shift = .indent,
.scalar => |str| try document.root.list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try document.root.list.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try document.root.list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
},
.map_item => |pair| {
document.root = Value.newMap(arena_alloc);
try stack.append(&document.root);
state = .value;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
expect_shift = .indent;
// If the key is on its own line, we don't have
// an associated value until we parse the next
// line. We need to store a reference to this
// key somewhere until we can consume the
// value. More parser state to lug along.
dangling_key = dupekey;
},
.scalar => |str| try document.root.map.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try document.root.map.put(dupekey, try Value.fromString(arena_alloc, str)),
.flow_list => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try document.root.map.put(dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
},
}
},
.value => switch (stack.getLast().*) {
// these three states are never reachable here. flow_list and
// flow_map are parsed with a separate state machine. These
// value types can only be present by themselves as the first
// line of the document, in which case the document consists
// only of that single line: this parser jumps immediately into
// the .done state, bypassing the .value state in which this
// switch is embedded.
.scalar, .flow_list, .flow_map => unreachable,
.string => |*string| {
if (line.shift == .indent)
return error.UnexpectedIndent;
if (!flop and line.shift == .dedent) {
// kick off the last trailing space or newline
_ = string.pop();
var dedent_depth = line.shift.dedent;
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
.empty => unreachable,
.line_string, .space_string => |str| {
try string.appendSlice(str);
try string.append(in_line.lineEnding());
},
else => return error.UnexpectedValue,
},
else => return error.UnexpectedValue,
}
},
.list => |*list| {
// detect that the previous item was actually empty
//
// -
// - something
//
// the first line here creates the expect_shift, but the second line
// is a valid continuation of the list despite not being indented
if (!flop and (expect_shift == .indent and line.shift != .indent))
try list.append(Value.newScalar(arena_alloc));
// Consider:
//
// -
// own-line scalar
// - inline scalar
//
// the own-line scalar will not push the stack but the next list item will be a dedent
if (!flop and line.shift == .dedent) {
// if line.shift.dedent is 1 and we're expecting it, the stack will not be popped,
// but we will continue loop flipflop. However, flop will be set to false on the next
// trip, so this if prong will not be run again.
var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in list mode.
if (expect_shift != .indent or line.shift != .indent)
return error.UnexpectedValue;
expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
.line_string, .space_string => |str| {
// string pushes the stack
const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
try stack.append(new_string);
try new_string.string.append(in_line.lineEnding());
expect_shift = .none;
},
}
},
.list_item => |value| {
if (flop or (line.shift == .none or line.shift == .dedent)) {
expect_shift = .none;
switch (value) {
.empty => expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)),
.flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)),
}
} else if (line.shift == .indent) {
if (expect_shift != .indent) return error.UnexpectedIndent;
const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
try stack.append(new_list);
expect_shift = .none;
continue :flipflop;
} else unreachable;
},
.map_item => {
// this prong cannot be hit on dedent in a valid way.
//
// -
// map: value
// second: value
// third: value
//
// dedenting back to the list stack level requires list_item
if (line.shift != .indent)
return error.UnexpectedValue;
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
try stack.append(new_map);
expect_shift = .none;
continue :flipflop;
},
}
},
.map => |*map| {
// detect that the previous item was actually empty
//
// foo:
// bar: baz
//
// the first line here creates the expect_shift, but the second line
// is a valid continuation of the map despite not being indented
if (!flop and (expect_shift == .indent and line.shift != .indent)) {
try putMap(
map,
dangling_key orelse return error.Fail,
Value.newScalar(arena_alloc),
self.dupe_behavior,
);
dangling_key = null;
}
if (!flop and line.shift == .dedent) {
var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = stack.pop();
continue :flipflop;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in map mode.
if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
return error.UnexpectedValue;
expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try putMap(map, dangling_key.?, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
.flow_list => |str| try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
.flow_map => |str| {
try putMap(map, dangling_key.?, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior);
},
.line_string, .space_string => |str| {
// string pushes the stack
const new_string = try putMapGetValue(map, dangling_key.?, try Value.fromString(arena_alloc, str), self.dupe_behavior);
try new_string.string.append(in_line.lineEnding());
try stack.append(new_string);
expect_shift = .none;
},
}
dangling_key = null;
},
.list_item => {
// this prong cannot be hit on dedent in a valid way.
//
// map:
// - value
// - invalid
//
// dedenting back to the map stack level requires map_item
if (expect_shift != .indent or line.shift != .indent or dangling_key == null)
return error.UnexpectedValue;
const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior);
try stack.append(new_list);
dangling_key = null;
expect_shift = .none;
continue :flipflop;
},
.map_item => |pair| {
if (flop or (line.shift == .none or line.shift == .dedent)) {
expect_shift = .none;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
expect_shift = .indent;
dangling_key = dupekey;
},
.scalar => |str| try putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), self.dupe_behavior),
.line_string, .space_string => |str| try putMap(map, dupekey, try Value.fromString(arena_alloc, str), self.dupe_behavior),
.flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior),
.flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior),
}
} else if (line.shift == .indent) {
if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue;
const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior);
try stack.append(new_map);
dangling_key = null;
continue :flipflop;
} else unreachable;
},
}
},
},
.done => return error.ExtraContent,
}
// this is specifically performed at the end of the loop body so that
// `continue :flipflop` skips setting it.
flip = false;
}
}
switch (state) {
.initial => switch (self.default_object) {
.scalar => document.root = .{ .scalar = std.ArrayList(u8).init(arena_alloc) },
.string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) },
.list => document.root = Value.newList(arena_alloc),
.map => document.root = Value.newMap(arena_alloc),
.fail => return error.EmptyDocument,
},
.value => switch (stack.getLast().*) {
// remove the final trailing newline or space
.scalar, .string => |*string| _ = string.popOrNull(),
// if we have a dangling -, attach an empty string to it
.list => |*list| if (expect_shift == .indent) try list.append(Value.newScalar(arena_alloc)),
// if we have a dangling "key:", attach an empty string to it
.map => |*map| if (dangling_key) |dk| try putMap(map, dk, Value.newScalar(arena_alloc), self.dupe_behavior),
.flow_list, .flow_map => {},
},
.done => {},
}
return document;
}
const FlowStack: type = std.ArrayList(*Value);
inline fn getStackTip(stack: FlowStack) Error!*Value {
if (stack.items.len == 0) return error.BadState;
return stack.items[stack.items.len - 1];
}
inline fn popStack(stack: *FlowStack) Error!FlowParseState {
if (stack.popOrNull() == null)
return error.BadState;
const parent = stack.getLastOrNull() orelse return .done;
return switch (parent.*) {
.flow_list => .want_list_separator,
.flow_map => .want_map_separator,
else => return error.BadState,
};
}
const FlowParseState = enum {
want_list_item,
consuming_list_item,
want_list_separator,
want_map_key,
consuming_map_key,
want_map_value,
consuming_map_value,
want_map_separator,
done,
};
pub fn parseFlow(
alloc: std.mem.Allocator,
contents: []const u8,
root_type: Value.TagType,
dupe_behavior: DuplicateKeyBehavior,
) Error!Value {
var root: Value = switch (root_type) {
.flow_list => Value.newFlowList(alloc),
.flow_map => Value.newFlowMap(alloc),
else => return error.BadState,
};
var state: FlowParseState = switch (root_type) {
.flow_list => .want_list_item,
.flow_map => .want_map_key,
else => unreachable,
};
var stack = try FlowStack.initCapacity(alloc, 1);
stack.appendAssumeCapacity(&root);
// used to distinguish betwen [] and [ ], and it also tracks
// a continuous value between different states
var item_start: usize = 0;
var dangling_key: ?[]const u8 = null;
charloop: for (contents, 0..) |char, idx| {
switch (state) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
',' => {
// empty value
const tip = try getStackTip(stack);
try tip.flow_list.append(Value.newScalar(alloc));
item_start = idx + 1;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowMap(alloc),
);
item_start = idx;
try stack.append(new_map);
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.appendListGetValue(
&tip.flow_list,
Value.newFlowList(alloc),
);
item_start = idx + 1;
try stack.append(new_list);
state = .want_list_item;
},
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
if (finished.flow_list.items.len > 0 or idx > item_start)
try finished.flow_list.append(Value.newScalar(alloc));
state = try popStack(&stack);
},
else => {
item_start = idx;
state = .consuming_list_item;
},
},
.consuming_list_item => switch (char) {
',' => {
const tip = try getStackTip(stack);
try tip.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
);
item_start = idx + 1;
state = .want_list_item;
},
']' => {
const finished = stack.getLastOrNull() orelse return error.BadState;
try finished.flow_list.append(
try Value.fromScalar(alloc, contents[item_start..idx]),
);
state = try popStack(&stack);
},
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => {
item_start = idx;
state = .want_list_item;
},
']' => state = try popStack(&stack),
else => return error.BadToken,
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return error.BadToken,
':' => {
// we have an empty map key
dangling_key = "";
state = .want_map_value;
},
'}' => state = try popStack(&stack),
else => {
item_start = idx;
state = .consuming_map_key;
},
},
.consuming_map_key => switch (char) {
':' => {
dangling_key = try alloc.dupe(u8, contents[item_start..idx]);
state = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
',' => {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
dangling_key = null;
state = .want_map_key;
},
'[' => {
const tip = try getStackTip(stack);
const new_list = try Parser.putMapGetValue(
&tip.flow_map,
dangling_key.?,
Value.newFlowList(alloc),
dupe_behavior,
);
try stack.append(new_list);
dangling_key = null;
item_start = idx + 1;
state = .want_list_item;
},
'{' => {
const tip = try getStackTip(stack);
const new_map = try Parser.putMapGetValue(
&tip.flow_map,
dangling_key.?,
Value.newFlowMap(alloc),
dupe_behavior,
);
try stack.append(new_map);
dangling_key = null;
state = .want_map_key;
},
'}' => {
// the value is an empty string and this map is closed
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
Value.newScalar(alloc),
dupe_behavior,
);
dangling_key = null;
state = try popStack(&stack);
},
else => {
item_start = idx;
state = .consuming_map_value;
},
},
.consuming_map_value => switch (char) {
',', '}' => |term| {
const tip = try getStackTip(stack);
try Parser.putMap(
&tip.flow_map,
dangling_key.?,
try Value.fromScalar(alloc, contents[item_start..idx]),
dupe_behavior,
);
dangling_key = null;
state = .want_map_key;
if (term == '}') state = try popStack(&stack);
},
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
',' => state = .want_map_key,
'}' => state = try popStack(&stack),
else => return error.BadToken,
},
// the root value was closed but there are characters remaining
// in the buffer
.done => return error.BadState,
}
}
// we ran out of characters while still in the middle of an object
if (state != .done) return error.BadState;
return root;
}
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
try list.append(value);
return &list.items[list.items.len - 1];
}
inline fn putMap(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!void {
_ = try putMapGetValue(map, key, value, dupe_behavior);
}
inline fn putMapGetValue(map: *Value.Map, key: []const u8, value: Value, dupe_behavior: DuplicateKeyBehavior) Error!*Value {
const gop = try map.getOrPut(key);
if (gop.found_existing)
switch (dupe_behavior) {
.fail => return error.DuplicateKey,
.use_first => {},
.use_last => gop.value_ptr.* = value,
}
else
gop.value_ptr.* = value;
return gop.value_ptr;
} }
}; };

651
src/parser/state.zig Normal file
View File

@ -0,0 +1,651 @@
const std = @import("std");
const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options;
const Value = @import("./value.zig").Value;
/// A parsed document: the root `Value` plus the arena that its contents are
/// allocated from.
pub const Document = struct {
    /// Arena wrapping the allocator handed to `init`; released by `deinit`.
    arena: std.heap.ArenaAllocator,
    /// Root value of the document. Left `undefined` by `init`; it must be
    /// assigned before it is read.
    root: Value,

    /// Create an empty document whose arena is backed by `alloc`.
    pub fn init(alloc: std.mem.Allocator) Document {
        const backing_arena = std.heap.ArenaAllocator.init(alloc);
        return .{ .arena = backing_arena, .root = undefined };
    }

    /// Forward to the root value's `printDebug`.
    pub fn printDebug(self: Document) void {
        const root_value = self.root;
        return root_value.printDebug();
    }

    /// Release the arena.
    pub fn deinit(self: Document) void {
        const owned_arena = self.arena;
        owned_arena.deinit();
    }
};
/// States of the character-level state machine that parses flow values
/// (inline `[...]` lists and `{...}` maps).
/// NOTE(review): the per-state notes below describe how the flow parser's
/// character loop is expected to use each state; confirm against `parseFlow`.
const FlowParseState = enum {
// inside a flow list, waiting for the first character of the next item
// (or for the closing `]`)
want_list_item,
// in the middle of a scalar list item; ends at `,` or `]`
consuming_list_item,
// a nested value inside a flow list was just closed; expects `,` or `]`
want_list_separator,
// inside a flow map, waiting for the first character of the next key
// (or for the closing `}`)
want_map_key,
// in the middle of a map key; ends at `:`
consuming_map_key,
// a key has been read; waiting for the first character of its value
want_map_value,
// in the middle of a scalar map value; ends at `,` or `}`
consuming_map_value,
// a nested value inside a flow map was just closed; expects `,` or `}`
want_map_separator,
// the root flow value has been closed; no further characters are expected
done,
};
pub const State = struct {
pub const Stack = std.ArrayList(*Value);
document: Document,
value_stack: Stack,
mode: enum { initial, value, done } = .initial,
expect_shift: tokenizer.ShiftDirection = .none,
dangling_key: ?[]const u8 = null,
/// Build a fresh parser state. Both the document and the value stack are
/// initialized from `allocator`; every other field keeps its declared default.
pub fn init(allocator: std.mem.Allocator) State {
    const document = Document.init(allocator);
    const value_stack = Stack.init(allocator);
    return .{ .document = document, .value_stack = value_stack };
}
/// Free the value stack. `self.document` is not touched by this function;
/// releasing it is left to `Document.deinit`.
pub fn deinit(self: State) void {
    const stack = self.value_stack;
    stack.deinit();
}
/// Finalize parsing after the last line has been fed in and return the
/// document (a by-value copy of `state.document`).
///
/// - In `.initial` mode (no content line was consumed) the root is created
///   according to `options.default_object`, or `error.EmptyDocument` is
///   returned when that option is `.fail`.
/// - In `.value` mode the value on top of the stack is tidied up: a string
///   loses its final trailing newline or space, a list with a dangling `-`
///   gets an empty scalar, and a map with a dangling `key:` gets an empty
///   scalar stored under that key (subject to
///   `options.duplicate_key_behavior`).
/// - In `.done` mode nothing is changed.
pub fn finish(state: *State, options: Options) Error!Document {
    const arena = state.document.arena.allocator();

    switch (state.mode) {
        .initial => {
            // nothing was parsed: synthesize the configured default root
            state.document.root = switch (options.default_object) {
                .string => Value.newString(arena),
                .list => Value.newList(arena),
                .map => Value.newMap(arena),
                .fail => return error.EmptyDocument,
            };
        },
        .value => {
            const tip: *Value = state.value_stack.getLast();
            switch (tip.*) {
                // drop the line ending appended after the last string line
                .string => |*text| {
                    _ = text.popOrNull();
                },
                // a trailing "-" with no value becomes an empty scalar
                .list => |*items| {
                    if (state.expect_shift == .indent)
                        try items.append(Value.newScalar(arena));
                },
                // a trailing "key:" with no value becomes an empty scalar
                .map => |*entries| {
                    if (state.dangling_key) |key|
                        try putMap(
                            entries,
                            key,
                            Value.newScalar(arena),
                            options.duplicate_key_behavior,
                        );
                },
                .scalar, .flow_list, .flow_map => {},
            }
        },
        .done => {},
    }

    return state.document;
}
/// Advance the parser by one tokenized line.
///
/// Comment lines are ignored. Any other line is dispatched on `state.mode` and,
/// while in `.value` mode, on the kind of value at the top of `value_stack`.
/// `dkb` is forwarded to `putMap`/`putMapGetValue`/`parseFlow` wherever a map
/// entry or flow value is created.
///
/// Errors returned directly by this function: `error.UnexpectedIndent`,
/// `error.UnexpectedValue`, `error.ExtraContent` (a line arrives after the
/// document reached `.done`) and `error.Fail` (state the code treats as
/// impossible). Errors from allocation, the `Value` constructors, `putMap*`
/// and `parseFlow` are propagated with `try`.
pub fn parseLine(state: *State, line: tokenizer.Line, dkb: DuplicateKeyBehavior) Error!void {
if (line.contents == .comment) return;
// this gives us a second loop when the stack tip changes (i.e. during dedent or
// some indents (not all indents push the stack))
const arena_alloc = state.document.arena.allocator();
// true only on the first trip through `restack`; the dedent/empty-item
// bookkeeping below must run once per line, not once per trip.
var firstpass = true;
restack: while (true) : (firstpass = false) {
switch (state.mode) {
// the first non-comment line decides what kind of value the root is
.initial => {
if (line.shift == .indent) return error.UnexpectedIndent;
switch (line.contents) {
// we filter out comments above
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
// empty scalars are only emitted for a list_item or a map_item
.empty => unreachable,
// a bare scalar is the whole document: go straight to .done
.scalar => |str| {
state.document.root = try Value.fromScalar(arena_alloc, str);
state.mode = .done;
},
// a string root may continue on following lines, so it is pushed
// on the stack and the parser stays in .value mode
.line_string, .space_string => |str| {
state.document.root = try Value.fromString(arena_alloc, str);
try state.document.root.string.append(in_line.lineEnding());
try state.value_stack.append(&state.document.root);
state.mode = .value;
},
.flow_list => |str| {
state.document.root = try state.parseFlow(str, .flow_list, dkb);
state.mode = .done;
},
.flow_map => |str| {
state.document.root = try state.parseFlow(str, .flow_map, dkb);
state.mode = .done;
},
},
.list_item => |value| {
state.document.root = Value.newList(arena_alloc);
try state.value_stack.append(&state.document.root);
state.mode = .value;
const rootlist = &state.document.root.list;
switch (value) {
// "-" alone on a line: its value is expected on the next, indented line
.empty => state.expect_shift = .indent,
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try rootlist.append(try state.parseFlow(str, .flow_list, dkb)),
.flow_map => |str| try rootlist.append(try state.parseFlow(str, .flow_map, dkb)),
}
},
.map_item => |pair| {
state.document.root = Value.newMap(arena_alloc);
try state.value_stack.append(&state.document.root);
state.mode = .value;
const rootmap = &state.document.root.map;
// the key is copied into the document arena before it is stored
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
// "key:" alone on a line: remember the key until its value arrives
.empty => {
state.expect_shift = .indent;
state.dangling_key = dupekey;
},
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.flow_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .flow_list, dkb)),
.flow_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .flow_map, dkb)),
}
},
}
},
.value => switch (state.value_stack.getLast().*) {
// these three states are never reachable here. flow_list and
// flow_map are parsed with a separate state machine. These
// value types can only be present by themselves as the first
// line of the document, in which case the document consists
// only of that single line: this parser jumps immediately into
// the .done state, bypassing the .value state in which this
// switch is embedded.
.scalar, .flow_list, .flow_map => return error.Fail,
.string => |*string| {
if (line.shift == .indent)
return error.UnexpectedIndent;
if (firstpass and line.shift == .dedent) {
// kick off the last trailing space or newline
_ = string.pop();
// pop one stack entry per dedent level, then re-dispatch this
// line against the new stack tip
var dedent_depth = line.shift.dedent;
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
.empty => unreachable,
// only further string lines may continue a string
.line_string, .space_string => |str| {
try string.appendSlice(str);
try string.append(in_line.lineEnding());
},
else => return error.UnexpectedValue,
},
else => return error.UnexpectedValue,
}
},
.list => |*list| {
// detect that the previous item was actually empty
//
// -
// - something
//
// the first line here creates the state.expect_shift, but the second line
// is a valid continuation of the list despite not being indented
if (firstpass and (state.expect_shift == .indent and line.shift != .indent))
try list.append(Value.newScalar(arena_alloc));
// Consider:
//
// -
// own-line scalar
// - inline scalar
//
// the own-line scalar will not push the stack but the next list item will be a dedent
if (firstpass and line.shift == .dedent) {
// if line.shift.dedent is 1 and we're expecting it, the stack will not be popped,
// but we will continue restack. However, firstpass will be set to false on the next
// trip, so this if prong will not be run again.
var dedent_depth = line.shift.dedent - @intFromBool(state.expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented and that indentation is expected.
if (state.expect_shift != .indent or line.shift != .indent)
return error.UnexpectedValue;
// the own-line value did not push the stack, so the next list item
// will arrive as a dedent that must not pop anything
state.expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.flow_list => |str| try list.append(try state.parseFlow(str, .flow_list, dkb)),
.flow_map => |str| try list.append(try state.parseFlow(str, .flow_map, dkb)),
// string pushes the stack, so the following dedent is a real pop
.line_string, .space_string => |str| {
const new_string = try appendListGetValue(list, try Value.fromString(arena_alloc, str));
try new_string.string.append(in_line.lineEnding());
try state.value_stack.append(new_string);
state.expect_shift = .none;
},
}
},
.list_item => |value| {
// on a second trip the stack tip is already the list this item belongs to
if (!firstpass or (line.shift == .none or line.shift == .dedent)) {
state.expect_shift = .none;
switch (value) {
.empty => state.expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .space_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.flow_list => |str| try list.append(try state.parseFlow(str, .flow_list, dkb)),
.flow_map => |str| try list.append(try state.parseFlow(str, .flow_map, dkb)),
}
} else if (line.shift == .indent) {
// an indented list item opens a nested list; push it and re-dispatch
// this line so the item is appended to the new list
if (state.expect_shift != .indent) return error.UnexpectedIndent;
const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
try state.value_stack.append(new_list);
state.expect_shift = .none;
continue :restack;
} else unreachable;
},
.map_item => {
// this prong cannot be hit on dedent in a valid way.
//
// -
// map: value
// second: value
// third: value
//
// dedenting back to the list stack level requires list_item
if (state.expect_shift != .indent or line.shift != .indent)
return error.UnexpectedValue;
// push a nested map and re-dispatch this line against it
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
try state.value_stack.append(new_map);
state.expect_shift = .none;
continue :restack;
},
}
},
.map => |*map| {
// detect that the previous item was actually empty
//
// foo:
// bar: baz
//
// the first line here creates the state.expect_shift, but the second line
// is a valid continuation of the map despite not being indented
if (firstpass and (state.expect_shift == .indent and line.shift != .indent)) {
try putMap(
map,
state.dangling_key orelse return error.Fail,
Value.newScalar(arena_alloc),
dkb,
);
state.dangling_key = null;
}
// same dedent handling as the list case: an expected dedent (after an
// own-line value that did not push the stack) pops one level fewer
if (firstpass and line.shift == .dedent) {
var dedent_depth = line.shift.dedent - @intFromBool(state.expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in map mode.
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null)
return error.UnexpectedValue;
state.expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb),
.flow_list => |str| try putMap(map, state.dangling_key.?, try state.parseFlow(str, .flow_list, dkb), dkb),
.flow_map => |str| {
try putMap(map, state.dangling_key.?, try state.parseFlow(str, .flow_map, dkb), dkb);
},
.line_string, .space_string => |str| {
// string pushes the stack
const new_string = try putMapGetValue(map, state.dangling_key.?, try Value.fromString(arena_alloc, str), dkb);
try new_string.string.append(in_line.lineEnding());
try state.value_stack.append(new_string);
state.expect_shift = .none;
},
}
// the key now has its value
state.dangling_key = null;
},
.list_item => {
// this prong cannot be hit on dedent in a valid way.
//
// map:
// - value
// - invalid
//
// dedenting back to the map stack level requires map_item
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null)
return error.UnexpectedValue;
// the dangling key's value is a new list; push it and re-dispatch this line
const new_list = try putMapGetValue(map, state.dangling_key.?, Value.newList(arena_alloc), dkb);
try state.value_stack.append(new_list);
state.dangling_key = null;
state.expect_shift = .none;
continue :restack;
},
.map_item => |pair| {
// on a second trip the stack tip is already the map this entry belongs to
if (!firstpass or (line.shift == .none or line.shift == .dedent)) {
state.expect_shift = .none;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
state.expect_shift = .indent;
state.dangling_key = dupekey;
},
.scalar => |str| try putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
.line_string, .space_string => |str| try putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.flow_list => |str| try putMap(map, dupekey, try state.parseFlow(str, .flow_list, dkb), dkb),
.flow_map => |str| try putMap(map, dupekey, try state.parseFlow(str, .flow_map, dkb), dkb),
}
} else if (line.shift == .indent) {
// the dangling key's value is a nested map; push it and re-dispatch
// this line (expect_shift is reset on that second trip)
if (state.expect_shift != .indent or state.dangling_key == null) return error.UnexpectedValue;
const new_map = try putMapGetValue(map, state.dangling_key.?, Value.newMap(arena_alloc), dkb);
try state.value_stack.append(new_map);
state.dangling_key = null;
continue :restack;
} else unreachable;
},
}
},
},
// the document was already completed by a single-line root value
.done => return error.ExtraContent,
}
// the stack has not changed, so break the loop
break :restack;
}
}
/// Parse a single-line flow collection (`[...]` or `{...}`) into a `Value`.
///
/// `root_type` selects the kind of the root container and must be either
/// `.flow_list` or `.flow_map`; any other tag yields `error.BadState`.
/// `dkb` controls what happens when a flow map repeats a key.
///
/// NOTE(review): the root container is created up front and the first
/// character of `contents` is already interpreted as being *inside* it, so
/// `contents` presumably starts just after the opening bracket and includes
/// the closing one - confirm against the tokenizer.
///
/// The parser is a character-driven state machine that uses the shared
/// `state.value_stack` to track the container currently being filled. On
/// success every entry pushed here has been popped again. On failure the
/// stack is truncated back to the depth it had on entry, so that no pointer
/// to the local `root` (or to its children) outlives this call.
///
/// Errors:
///   - `error.BadState`: unsupported `root_type`, input ended before the
///     root container was closed, or characters follow the closing bracket.
///   - `error.BadToken`: an unexpected character where a separator or a map
///     key was required.
///   - `error.DuplicateKey`: repeated map key while `dkb == .fail`.
///   - allocation failures from the arena / stack are propagated.
pub fn parseFlow(
    state: *State,
    contents: []const u8,
    root_type: Value.TagType,
    dkb: DuplicateKeyBehavior,
) Error!Value {
    const arena_alloc = state.document.arena.allocator();
    var root: Value = switch (root_type) {
        .flow_list => Value.newFlowList(arena_alloc),
        .flow_map => Value.newFlowMap(arena_alloc),
        else => return error.BadState,
    };
    var pstate: FlowParseState = switch (root_type) {
        .flow_list => .want_list_item,
        .flow_map => .want_map_key,
        // already rejected by the switch above
        else => unreachable,
    };
    // used to distinguish between [] and [ ], and it also tracks
    // a continuous value between different states
    var item_start: usize = 0;
    var dangling_key: ?[]const u8 = null;

    // `root` lives in this stack frame, and the value stack is shared with
    // the caller. If we bail out with an error, drop everything we pushed so
    // the shared stack never holds a pointer into a dead frame.
    const base_depth = state.value_stack.items.len;
    errdefer {
        if (state.value_stack.items.len > base_depth)
            state.value_stack.shrinkRetainingCapacity(base_depth);
    }
    try state.value_stack.append(&root);

    charloop: for (contents, 0..) |char, idx| {
        switch (pstate) {
            // at the start of a list element (after `[` or `,`)
            .want_list_item => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    // empty value
                    const tip = try state.getStackTip();
                    try tip.flow_list.append(Value.newScalar(arena_alloc));
                    item_start = idx + 1;
                },
                '{' => {
                    const tip = try state.getStackTip();
                    const new_map = try appendListGetValue(
                        &tip.flow_list,
                        Value.newFlowMap(arena_alloc),
                    );
                    item_start = idx;
                    try state.value_stack.append(new_map);
                    pstate = .want_map_key;
                },
                '[' => {
                    const tip = try state.getStackTip();
                    const new_list = try appendListGetValue(
                        &tip.flow_list,
                        Value.newFlowList(arena_alloc),
                    );
                    item_start = idx + 1;
                    try state.value_stack.append(new_list);
                    pstate = .want_list_item;
                },
                ']' => {
                    const finished = state.value_stack.getLastOrNull() orelse return error.BadState;
                    // `[]` is an empty list, but `[ ]` and a trailing `,`
                    // both produce a final empty scalar element.
                    if (finished.flow_list.items.len > 0 or idx > item_start)
                        try finished.flow_list.append(Value.newScalar(arena_alloc));
                    pstate = try state.popFlowStack();
                },
                else => {
                    item_start = idx;
                    pstate = .consuming_list_item;
                },
            },
            // inside a scalar list element; runs until `,` or `]`
            .consuming_list_item => switch (char) {
                ',' => {
                    const tip = try state.getStackTip();
                    try tip.flow_list.append(
                        try Value.fromScalar(arena_alloc, contents[item_start..idx]),
                    );
                    item_start = idx + 1;
                    pstate = .want_list_item;
                },
                ']' => {
                    const finished = state.value_stack.getLastOrNull() orelse return error.BadState;
                    try finished.flow_list.append(
                        try Value.fromScalar(arena_alloc, contents[item_start..idx]),
                    );
                    pstate = try state.popFlowStack();
                },
                else => continue :charloop,
            },
            // a nested container inside a list was just closed
            .want_list_separator => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    item_start = idx;
                    pstate = .want_list_item;
                },
                ']' => pstate = try state.popFlowStack(),
                else => return error.BadToken,
            },
            // at the start of a map key (after `{` or `,`)
            .want_map_key => switch (char) {
                ' ', '\t' => continue :charloop,
                // forbid these characters so that flow dictionary keys cannot start
                // with characters that regular dictionary keys cannot start with
                // (even though they're unambiguous in this specific context).
                '{', '[', '#', '-', '>', '|', ',' => return error.BadToken,
                ':' => {
                    // we have an empty map key
                    dangling_key = "";
                    pstate = .want_map_value;
                },
                '}' => pstate = try state.popFlowStack(),
                else => {
                    item_start = idx;
                    pstate = .consuming_map_key;
                },
            },
            // inside a map key; runs until `:`
            .consuming_map_key => switch (char) {
                ':' => {
                    dangling_key = try arena_alloc.dupe(u8, contents[item_start..idx]);
                    pstate = .want_map_value;
                },
                else => continue :charloop,
            },
            // a key has been read (dangling_key is set); waiting for its value
            .want_map_value => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => {
                    const tip = try state.getStackTip();
                    try putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newScalar(arena_alloc),
                        dkb,
                    );
                    dangling_key = null;
                    pstate = .want_map_key;
                },
                '[' => {
                    const tip = try state.getStackTip();
                    const new_list = try putMapGetValue(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newFlowList(arena_alloc),
                        dkb,
                    );
                    try state.value_stack.append(new_list);
                    dangling_key = null;
                    item_start = idx + 1;
                    pstate = .want_list_item;
                },
                '{' => {
                    const tip = try state.getStackTip();
                    const new_map = try putMapGetValue(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newFlowMap(arena_alloc),
                        dkb,
                    );
                    try state.value_stack.append(new_map);
                    dangling_key = null;
                    pstate = .want_map_key;
                },
                '}' => {
                    // the value is an empty string and this map is closed
                    const tip = try state.getStackTip();
                    try putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        Value.newScalar(arena_alloc),
                        dkb,
                    );
                    dangling_key = null;
                    pstate = try state.popFlowStack();
                },
                else => {
                    item_start = idx;
                    pstate = .consuming_map_value;
                },
            },
            // inside a scalar map value; runs until `,` or `}`
            .consuming_map_value => switch (char) {
                ',', '}' => |term| {
                    const tip = try state.getStackTip();
                    try putMap(
                        &tip.flow_map,
                        dangling_key.?,
                        try Value.fromScalar(arena_alloc, contents[item_start..idx]),
                        dkb,
                    );
                    dangling_key = null;
                    pstate = .want_map_key;
                    if (term == '}') pstate = try state.popFlowStack();
                },
                else => continue :charloop,
            },
            // a nested container inside a map was just closed
            .want_map_separator => switch (char) {
                ' ', '\t' => continue :charloop,
                ',' => pstate = .want_map_key,
                '}' => pstate = try state.popFlowStack(),
                else => return error.BadToken,
            },
            // the root value was closed but there are characters remaining
            // in the buffer
            .done => return error.BadState,
        }
    }
    // we ran out of characters while still in the middle of an object
    if (pstate != .done) return error.BadState;
    return root;
}
/// Return the value currently on top of the value stack without removing it.
/// Fails with `error.BadState` when the stack is empty.
inline fn getStackTip(state: State) Error!*Value {
    const stack = state.value_stack.items;
    return if (stack.len > 0) stack[stack.len - 1] else error.BadState;
}
/// Remove the top entry of the value stack and report which flow-parser
/// state applies to whatever is now on top.
///
/// Returns `error.BadState` if there was nothing to pop. If the stack is
/// empty afterwards, or the new top is neither a flow list nor a flow map,
/// the result is `.done`.
inline fn popFlowStack(state: *State) Error!FlowParseState {
    _ = state.value_stack.popOrNull() orelse return error.BadState;
    if (state.value_stack.getLastOrNull()) |enclosing| {
        return switch (enclosing.*) {
            .flow_map => .want_map_separator,
            .flow_list => .want_list_separator,
            else => .done,
        };
    }
    return .done;
}
/// Append `value` to `list` and return a pointer to the newly stored element.
/// The pointer refers to the list's own storage.
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
    const slot = try list.addOne();
    slot.* = value;
    return slot;
}
/// Store `value` under `key` in `map`, applying the duplicate-key policy
/// `dkb`. Identical to `putMapGetValue` except that the pointer to the
/// stored value is discarded.
inline fn putMap(
    map: *Value.Map,
    key: []const u8,
    value: Value,
    dkb: DuplicateKeyBehavior,
) Error!void {
    _ = try putMapGetValue(map, key, value, dkb);
}
/// Store `value` under `key` in `map` and return a pointer to the map's
/// value slot for that key.
///
/// When `key` is already present, `dkb` decides the outcome:
///   - `.fail`: return `error.DuplicateKey`.
///   - `.use_first`: keep the existing value; the returned pointer refers to
///     that existing value, not to `value`.
///   - `.use_last`: overwrite the existing value with `value`.
inline fn putMapGetValue(
    map: *Value.Map,
    key: []const u8,
    value: Value,
    dkb: DuplicateKeyBehavior,
) Error!*Value {
    const entry = try map.getOrPut(key);

    // fresh key: nothing to arbitrate, just fill the new slot
    if (!entry.found_existing) {
        entry.value_ptr.* = value;
        return entry.value_ptr;
    }

    switch (dkb) {
        .use_last => entry.value_ptr.* = value,
        .use_first => {},
        .fail => return error.DuplicateKey,
    }
    return entry.value_ptr;
}
};