parser: reintroduce space strings and change token parsing strategy

I don't think I have the wherewithal to write this full commit message
right now, since it should be a long one.

Basically, `+ ` is now the string space-concatenation operator, because
that is a very common use case: the line is joined to the string in
progress with a single space. It's essentially the soft-wrap character.

Also, lines starting with -, +, >, and | will now try to tokenize as
map keys if that leading character is not immediately followed by a
space. The motivation here is numeric map keys. Specifically, +/- are
numeric leaders.

To facilitate this change, own-line scalars are now prohibited. So, for
example:

    key: -1000

is still fine, but

    key:
        -1000

is no longer accepted.
This commit is contained in:
torque 2023-10-18 00:20:19 -07:00
parent 25386ac87a
commit 258cf2ae83
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
2 changed files with 145 additions and 114 deletions

View File

@ -59,7 +59,7 @@ pub const State = struct {
}, },
}, },
.value => switch (state.value_stack.getLast().*) { .value => switch (state.value_stack.getLast().*) {
// remove the final trailing newline or space // we have an in-progress string, finish it.
.string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc), .string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc),
// if we have a dangling -, attach an empty scalar to it // if we have a dangling -, attach an empty scalar to it
.list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()), .list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()),
@ -104,7 +104,7 @@ pub const State = struct {
state.document.root = try Value.fromScalar(arena_alloc, str); state.document.root = try Value.fromScalar(arena_alloc, str);
state.mode = .done; state.mode = .done;
}, },
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
state.document.root = Value.emptyString(); state.document.root = Value.emptyString();
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(&state.document.root); try state.value_stack.append(&state.document.root);
@ -128,7 +128,7 @@ pub const State = struct {
switch (value) { switch (value) {
.empty => state.expect_shift = .indent, .empty => state.expect_shift = .indent,
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)), .scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)),
} }
@ -146,7 +146,7 @@ pub const State = struct {
state.dangling_key = dupekey; state.dangling_key = dupekey;
}, },
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)), .scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)),
} }
@ -188,9 +188,11 @@ pub const State = struct {
.comment => unreachable, .comment => unreachable,
.in_line => |in_line| switch (in_line) { .in_line => |in_line| switch (in_line) {
.empty => unreachable, .empty => unreachable,
inline .line_string, .concat_string => |str, tag| { inline .line_string, .space_string, .concat_string => |str, tag| {
if (tag == .line_string) if (tag == .line_string)
try state.string_builder.append(arena_alloc, '\n'); try state.string_builder.append(arena_alloc, '\n');
if (tag == .space_string)
try state.string_builder.append(arena_alloc, ' ');
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
}, },
else => { else => {
@ -249,10 +251,14 @@ pub const State = struct {
state.expect_shift = .dedent; state.expect_shift = .dedent;
switch (in_line) { switch (in_line) {
.empty => unreachable, .empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), .scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
const new_string = try appendListGetValue(list, Value.emptyString()); const new_string = try appendListGetValue(list, Value.emptyString());
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(new_string); try state.value_stack.append(new_string);
@ -266,7 +272,7 @@ pub const State = struct {
switch (value) { switch (value) {
.empty => state.expect_shift = .indent, .empty => state.expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)), .line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
} }
@ -291,7 +297,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent) { if (state.expect_shift != .indent or line.shift != .indent) {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid map key in a list"; state.diagnostics.message = "the document contains a map item where a list item is expected";
return error.UnexpectedValue; return error.UnexpectedValue;
} }
@ -348,12 +354,16 @@ pub const State = struct {
switch (in_line) { switch (in_line) {
.empty => unreachable, .empty => unreachable,
.scalar => |str| try state.putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb), .scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| { .inline_map => |str| {
try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb); try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb);
}, },
.line_string, .concat_string => |str| { .line_string, .space_string, .concat_string => |str| {
// string pushes the stack // string pushes the stack
const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb); const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb);
try state.string_builder.appendSlice(arena_alloc, str); try state.string_builder.appendSlice(arena_alloc, str);
@ -375,7 +385,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) { if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid list item in a map"; state.diagnostics.message = "the document contains a list item where a map item is expected";
return error.UnexpectedValue; return error.UnexpectedValue;
} }
@ -395,7 +405,7 @@ pub const State = struct {
state.dangling_key = dupekey; state.dangling_key = dupekey;
}, },
.scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb), .scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
.line_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb), .line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb), .inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb),
} }
@ -567,7 +577,12 @@ pub const State = struct {
// forbid these characters so that inline dictionary keys cannot start // forbid these characters so that inline dictionary keys cannot start
// with characters that regular dictionary keys cannot start with // with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context). // (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return { '{', '[', '#', ',' => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid character";
return error.BadToken;
},
'-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') {
state.diagnostics.length = 1; state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence"; state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence";
return error.BadToken; return error.BadToken;

View File

@ -23,6 +23,7 @@ pub const InlineItem = union(enum) {
empty: void, empty: void,
scalar: []const u8, scalar: []const u8,
line_string: []const u8, line_string: []const u8,
space_string: []const u8,
concat_string: []const u8, concat_string: []const u8,
inline_list: []const u8, inline_list: []const u8,
@ -162,10 +163,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// this should not be possible, as empty lines are caught earlier. // this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible; if (line.len == 0) return error.Impossible;
sigil: {
switch (line[0]) { switch (line[0]) {
'#' => { '#' => {
// force comments to be followed by a space. This makes them // Force comments to be followed by a space. We could
// behave the same way as strings, actually. // allow #: to be interpreted as a map key, but I'm going
// to specifically forbid it instead.
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1; self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1; self.buffer.diag().length = 1;
@ -181,7 +184,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
}; };
}, },
'|', '>', '[', '{' => { '|', '>', '+' => {
if (line.len > 1 and line[1] != ' ') {
// we want to try parsing this as a map key
break :sigil;
}
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'[', '{' => {
// these don't require being followed by a space, so they
// cannot be interpreted as starting a map key in any way.
return .{ return .{
.shift = shift, .shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) }, .contents = .{ .in_line = try self.detectInlineItem(line) },
@ -190,10 +207,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}, },
'-' => { '-' => {
if (line.len > 1 and line[1] != ' ') { if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1; // we want to try parsing this as a map key
self.buffer.diag().length = 1; break :sigil;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
} }
// blindly add 2 here because an empty item cannot fail in // blindly add 2 here because an empty item cannot fail in
@ -210,7 +225,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line, .raw = line,
}; };
}, },
else => { else => break :sigil,
}
}
for (line, 0..) |char, idx| { for (line, 0..) |char, idx| {
if (char == ':') { if (char == ':') {
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) { if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
@ -252,14 +270,6 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.contents = .{ .in_line = .{ .scalar = line } }, .contents = .{ .in_line = .{ .scalar = line } },
.raw = line, .raw = line,
}; };
},
}
// somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible;
} }
return null; return null;
} }
@ -281,8 +291,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}; };
switch (buf[start]) { switch (buf[start]) {
'>', '|' => |char| { '>', '|', '+' => |char| {
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken; if (buf.len - start > 1 and buf[start + 1] != ' ') {
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the string start character";
return error.BadToken;
}
const slice: []const u8 = switch (buf[buf.len - 1]) { const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => { ' ', '\t' => {
@ -295,10 +309,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
else => buf[start + @min(2, buf.len - start) .. buf.len], else => buf[start + @min(2, buf.len - start) .. buf.len],
}; };
return if (char == '>') return switch (char) {
.{ .line_string = slice } '>' => .{ .line_string = slice },
else '+' => .{ .space_string = slice },
.{ .concat_string = slice }; '|' => .{ .concat_string = slice },
else => unreachable,
};
}, },
'[' => { '[' => {
if (buf.len - start < 2 or buf[buf.len - 1] != ']') { if (buf.len - start < 2 or buf[buf.len - 1] != ']') {