parser: reintroduce space strings and change token parsing strategy

Once again I have entangled two conceptually distinct changes into a
single commit because demuxing them from the diff is too much work.
Alas. Let's break it down.

The simpler part of this change is to reintroduce "space strings" with
a slightly fresh coat of paint. There are now 3 different types of
string leaders, which can be used together:

    | directly concatenates this line with the previous line
    > prepends an LF character before concatenation
    + (NEW) prepends a single space character before concatenation

The `+` leader enables more æsthetic soft line wrapping than `|`
because it doesn't require the use of leading or trailing
whitespace to separate words, as long as lines are broken at word
boundaries. Perhaps this is not as common a use case as I am making it,
but I do like to hard wrap paragraphs in documents, so if anything,
it's a feature for me.

As I was considering what character to use for this leader, I realized
that I wanted to be able to support numeric map keys, a la:

    -1: negative one
    0:  zero
    +1: positive one

But previously this would not parse correctly, as the tokenizer would
find `-` and expect it to be followed by a space to indicate a list
item (and the additional string leader would cause the same problem
with `+`). I wanted to support this use case, so the parser was
changed to take a second pass on lines starting with the string
leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the
leader has a non-space character following it. Note that this does not
apply to the comment leader (`#` not followed by a space or a newline
is a tokenization error) or to the inline list/map leaders (since those
do not respect internal whitespace, there is no way to treat them
unambiguously).

To reduce the likelihood of confusing documents, scalars are no longer
allowed to occupy their own line (the exception to this is if the
document consists only of a scalar value). Inline lists and maps can
still occupy their own line, though I am considering changing this as
well to force them to truly be inline. I think this change makes
sense, as scalars are generally intended to represent an unbroken
single-item serialization of some non-string value. In other words,

    # these two lines used to parse the same way
    key: 9001
    # but now the following line is a parse error due to the scalar
    # occupying its own line
    key:
        9001
    # also, this still works, but it may be changed to be an error in
    # the future
    key:
        [ 9, 0, 0, 1 ]

Inline maps have also been changed so that their keys can start with the
now-unforbidden string leaders and list item leader characters.
This commit is contained in:
torque 2023-10-18 00:20:19 -07:00
parent 25386ac87a
commit 4c966ca9d0
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
2 changed files with 145 additions and 114 deletions

View File

@ -59,7 +59,7 @@ pub const State = struct {
}, },
}, },
.value => switch (state.value_stack.getLast().*) { .value => switch (state.value_stack.getLast().*) {
// remove the final trailing newline or space // we have an in-progress string, finish it.
.string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc), .string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc),
// if we have a dangling -, attach an empty scalar to it // if we have a dangling -, attach an empty scalar to it
.list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()), .list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()),
@ -104,7 +104,7 @@ pub const State = struct {
state.document.root = try Value.fromScalar(arena_alloc, str); state.document.root = try Value.fromScalar(arena_alloc, str);
state.mode = .done; state.mode = .done;
}, },
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
state.document.root = Value.emptyString(); state.document.root = Value.emptyString();
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(&state.document.root); try state.value_stack.append(&state.document.root);
@ -128,7 +128,7 @@ pub const State = struct {
switch (value) { switch (value) {
.empty => state.expect_shift = .indent, .empty => state.expect_shift = .indent,
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)), .scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)),
} }
@ -146,7 +146,7 @@ pub const State = struct {
state.dangling_key = dupekey; state.dangling_key = dupekey;
}, },
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)), .scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)),
} }
@ -188,9 +188,11 @@ pub const State = struct {
.comment => unreachable, .comment => unreachable,
.in_line => |in_line| switch (in_line) { .in_line => |in_line| switch (in_line) {
.empty => unreachable, .empty => unreachable,
inline .line_string, .concat_string => |str, tag| { inline .line_string, .space_string, .concat_string => |str, tag| {
if (tag == .line_string) if (tag == .line_string)
try state.string_builder.append(arena_alloc, '\n'); try state.string_builder.append(arena_alloc, '\n');
if (tag == .space_string)
try state.string_builder.append(arena_alloc, ' ');
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
}, },
else => { else => {
@ -249,10 +251,14 @@ pub const State = struct {
state.expect_shift = .dedent; state.expect_shift = .dedent;
switch (in_line) { switch (in_line) {
.empty => unreachable, .empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), .scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
const new_string = try appendListGetValue(list, Value.emptyString()); const new_string = try appendListGetValue(list, Value.emptyString());
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(new_string); try state.value_stack.append(new_string);
@ -266,7 +272,7 @@ pub const State = struct {
switch (value) { switch (value) {
.empty => state.expect_shift = .indent, .empty => state.expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
} }
@ -291,7 +297,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent) { if (state.expect_shift != .indent or line.shift != .indent) {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid map key in a list"; state.diagnostics.message = "the document contains a map item where a list item is expected";
return error.UnexpectedValue; return error.UnexpectedValue;
} }
@ -348,12 +354,16 @@ pub const State = struct {
switch (in_line) { switch (in_line) {
.empty => unreachable, .empty => unreachable,
.scalar => |str| try state.putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb), .scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| { .inline_map => |str| {
try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb); try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb);
}, },
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
// string pushes the stack // string pushes the stack
const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb); const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb);
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
@ -375,7 +385,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) { if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid list item in a map"; state.diagnostics.message = "the document contains a list item where a map item is expected";
return error.UnexpectedValue; return error.UnexpectedValue;
} }
@ -395,7 +405,7 @@ pub const State = struct {
state.dangling_key = dupekey; state.dangling_key = dupekey;
}, },
.scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb), .scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
.line_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb), .line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb), .inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb),
} }
@ -567,7 +577,12 @@ pub const State = struct {
// forbid these characters so that inline dictionary keys cannot start // forbid these characters so that inline dictionary keys cannot start
// with characters that regular dictionary keys cannot start with // with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context). // (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return { '{', '[', '#', ',' => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid character";
return error.BadToken;
},
'-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence"; state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence";
return error.BadToken; return error.BadToken;

View File

@ -23,6 +23,7 @@ pub const InlineItem = union(enum) {
empty: void, empty: void,
scalar: []const u8, scalar: []const u8,
line_string: []const u8, line_string: []const u8,
space_string: []const u8,
concat_string: []const u8, concat_string: []const u8,
inline_list: []const u8, inline_list: []const u8,
@ -162,10 +163,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible; if (line.len == 0) return error.Impossible;
sigil: {
switch (line[0]) { switch (line[0]) {
'#' => { '#' => {
// force comments to be followed by a space. This makes them // Force comments to be followed by a space. We could
// behave the same way as strings, actually. // allow #: to be interpreted as a map key, but I'm going
// to specifically forbid it instead.
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1; self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1; self.buffer.diag().length = 1;
@ -181,7 +184,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
}; };
}, },
'|', '>', '[', '{' => { '|', '>', '+' => {
if (line.len > 1 and line[1] != ' ') {
// we want to try parsing this as a map key
break :sigil;
}
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'[', '{' => {
// these don't require being followed by a space, so they
// cannot be interpreted as starting a map key in any way.
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) }, .contents = .{ .in_line = try self.detectInlineItem(line) },
@ -190,10 +207,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}, },
'-' => { '-' => {
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1; // we want to try parsing this as a map key
self.buffer.diag().length = 1; break :sigil;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
} }
// blindly add 2 here because an empty item cannot fail in // blindly add 2 here because an empty item cannot fail in
@ -210,7 +225,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
}; };
}, },
else => { else => break :sigil,
}
}
for (line, 0..) |char, idx| { for (line, 0..) |char, idx| {
if (char == ':') { if (char == ':') {
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) { if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
@ -252,14 +270,6 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.contents = .{ .in_line = .{ .scalar = line } }, .contents = .{ .in_line = .{ .scalar = line } },
.raw = line, .raw = line,
}; };
},
}
// somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible;
} }
return null; return null;
} }
@ -281,8 +291,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}; };
switch (buf[start]) { switch (buf[start]) {
'>', '|' => |char| { '>', '|', '+' => |char| {
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken; if (buf.len - start > 1 and buf[start + 1] != ' ') {
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the string start character";
return error.BadToken;
}
const slice: []const u8 = switch (buf[buf.len - 1]) { const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => { ' ', '\t' => {
@ -295,10 +309,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
else => buf[start + @min(2, buf.len - start) .. buf.len], else => buf[start + @min(2, buf.len - start) .. buf.len],
}; };
return if (char == '>') return switch (char) {
.{ .line_string = slice } '>' => .{ .line_string = slice },
else '+' => .{ .space_string = slice },
.{ .concat_string = slice }; '|' => .{ .concat_string = slice },
else => unreachable,
};
}, },
'[' => { '[' => {
if (buf.len - start < 2 or buf[buf.len - 1] != ']') { if (buf.len - start < 2 or buf[buf.len - 1] != ']') {