parser: reintroduce space strings and change token parsing strategy

I don't think I have the wherewithal to write this full commit message
right now, since it should be a long one.

Basically, `+ ` is now the string space-concatenation operator, because
that is a very common use case. It's essentially the soft-wrap
character.

Also, for lines starting with -, +, >, or |, the tokenizer will now try
to parse the line as a map key if that leading character is not
immediately followed by a space. The motivation here is numeric map
keys: specifically, + and - are numeric leaders.

To facilitate this change, own-line scalars are now prohibited. So, for
example:

    key: -1000

is still fine, but

    key:
        -1000

is no longer accepted.
This commit is contained in:
torque 2023-10-18 00:20:19 -07:00
parent 25386ac87a
commit 258cf2ae83
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
2 changed files with 145 additions and 114 deletions

View File

@ -59,7 +59,7 @@ pub const State = struct {
},
},
.value => switch (state.value_stack.getLast().*) {
// remove the final trailing newline or space
// we have an in-progress string, finish it.
.string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc),
// if we have a dangling -, attach an empty scalar to it
.list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()),
@ -104,7 +104,7 @@ pub const State = struct {
state.document.root = try Value.fromScalar(arena_alloc, str);
state.mode = .done;
},
.line_string, .concat_string => |str| {
.line_string, .space_string, .concat_string => |str| {
state.document.root = Value.emptyString();
try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(&state.document.root);
@ -128,7 +128,7 @@ pub const State = struct {
switch (value) {
.empty => state.expect_shift = .indent,
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)),
}
@ -146,7 +146,7 @@ pub const State = struct {
state.dangling_key = dupekey;
},
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)),
}
@ -188,9 +188,11 @@ pub const State = struct {
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
.empty => unreachable,
inline .line_string, .concat_string => |str, tag| {
inline .line_string, .space_string, .concat_string => |str, tag| {
if (tag == .line_string)
try state.string_builder.append(arena_alloc, '\n');
if (tag == .space_string)
try state.string_builder.append(arena_alloc, ' ');
try state.string_builder.appendSlice(arena_alloc, str);
},
else => {
@ -249,10 +251,14 @@ pub const State = struct {
state.expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
.line_string, .concat_string => |str| {
.line_string, .space_string, .concat_string => |str| {
const new_string = try appendListGetValue(list, Value.emptyString());
try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(new_string);
@ -266,7 +272,7 @@ pub const State = struct {
switch (value) {
.empty => state.expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
.line_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
}
@ -291,7 +297,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent) {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid map key in a list";
state.diagnostics.message = "the document contains a map item where a list item is expected";
return error.UnexpectedValue;
}
@ -348,12 +354,16 @@ pub const State = struct {
switch (in_line) {
.empty => unreachable,
.scalar => |str| try state.putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb),
.scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| {
try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb);
},
.line_string, .concat_string => |str| {
.line_string, .space_string, .concat_string => |str| {
// string pushes the stack
const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb);
try state.string_builder.appendSlice(arena_alloc, str);
@ -375,7 +385,7 @@ pub const State = struct {
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid list item in a map";
state.diagnostics.message = "the document contains a list item where a map item is expected";
return error.UnexpectedValue;
}
@ -395,7 +405,7 @@ pub const State = struct {
state.dangling_key = dupekey;
},
.scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
.line_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb),
}
@ -567,7 +577,12 @@ pub const State = struct {
// forbid these characters so that inline dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
'{', '[', '#', '-', '>', '|', ',' => return {
'{', '[', '#', ',' => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid character";
return error.BadToken;
},
'-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence";
return error.BadToken;

View File

@ -23,6 +23,7 @@ pub const InlineItem = union(enum) {
empty: void,
scalar: []const u8,
line_string: []const u8,
space_string: []const u8,
concat_string: []const u8,
inline_list: []const u8,
@ -162,10 +163,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
// this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible;
sigil: {
switch (line[0]) {
'#' => {
// force comments to be followed by a space. This makes them
// behave the same way as strings, actually.
// Force comments to be followed by a space. We could
// allow #: to be interpreted as a map key, but I'm going
// to specifically forbid it instead.
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
@ -181,7 +184,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line,
};
},
'|', '>', '[', '{' => {
'|', '>', '+' => {
if (line.len > 1 and line[1] != ' ') {
// we want to try parsing this as a map key
break :sigil;
}
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'[', '{' => {
// these don't require being followed by a space, so they
// cannot be interpreted as starting a map key in any way.
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
@ -190,10 +207,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
},
'-' => {
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
return error.BadToken;
// we want to try parsing this as a map key
break :sigil;
}
// blindly add 2 here because an empty item cannot fail in
@ -210,7 +225,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.raw = line,
};
},
else => {
else => break :sigil,
}
}
for (line, 0..) |char, idx| {
if (char == ':') {
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
@ -252,14 +270,6 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.contents = .{ .in_line = .{ .scalar = line } },
.raw = line,
};
},
}
// somehow everything else has failed
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = raw_line.len;
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
return error.Impossible;
}
return null;
}
@ -281,8 +291,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
};
switch (buf[start]) {
'>', '|' => |char| {
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken;
'>', '|', '+' => |char| {
if (buf.len - start > 1 and buf[start + 1] != ' ') {
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the string start character";
return error.BadToken;
}
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => {
@ -295,10 +309,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
else => buf[start + @min(2, buf.len - start) .. buf.len],
};
return if (char == '>')
.{ .line_string = slice }
else
.{ .concat_string = slice };
return switch (char) {
'>' => .{ .line_string = slice },
'+' => .{ .space_string = slice },
'|' => .{ .concat_string = slice },
else => unreachable,
};
},
'[' => {
if (buf.len - start < 2 or buf[buf.len - 1] != ']') {