parser: reintroduce space strings and change token parsing strategy
I don't think I have the wherewithal to write the full commit message right now, since it should be a long one. Basically, `+ ` is now the space-concatenation string operator, because that is a very common use case; it's essentially the soft-wrap character. Also, lines starting with -, +, >, and | will now try to tokenize as map keys if the leading character is not followed by a space. The motivation here is numeric map keys: specifically, + and - are numeric leaders. To facilitate this change, own-line scalars are now prohibited. So, for example, `key: -1000` (the scalar on the same line as its key) is still fine, but `key:` followed by `-1000` on its own line is no longer accepted.
This commit is contained in:
parent
25386ac87a
commit
258cf2ae83
@ -59,7 +59,7 @@ pub const State = struct {
|
||||
},
|
||||
},
|
||||
.value => switch (state.value_stack.getLast().*) {
|
||||
// remove the final trailing newline or space
|
||||
// we have an in-progress string, finish it.
|
||||
.string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc),
|
||||
// if we have a dangling -, attach an empty scalar to it
|
||||
.list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()),
|
||||
@ -104,7 +104,7 @@ pub const State = struct {
|
||||
state.document.root = try Value.fromScalar(arena_alloc, str);
|
||||
state.mode = .done;
|
||||
},
|
||||
.line_string, .concat_string => |str| {
|
||||
.line_string, .space_string, .concat_string => |str| {
|
||||
state.document.root = Value.emptyString();
|
||||
try state.string_builder.appendSlice(arena_alloc, str);
|
||||
try state.value_stack.append(&state.document.root);
|
||||
@ -128,7 +128,7 @@ pub const State = struct {
|
||||
switch (value) {
|
||||
.empty => state.expect_shift = .indent,
|
||||
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
|
||||
.line_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
|
||||
.line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
|
||||
.inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)),
|
||||
.inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)),
|
||||
}
|
||||
@ -146,7 +146,7 @@ pub const State = struct {
|
||||
state.dangling_key = dupekey;
|
||||
},
|
||||
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
|
||||
.line_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
|
||||
.line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
|
||||
.inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)),
|
||||
.inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)),
|
||||
}
|
||||
@ -188,9 +188,11 @@ pub const State = struct {
|
||||
.comment => unreachable,
|
||||
.in_line => |in_line| switch (in_line) {
|
||||
.empty => unreachable,
|
||||
inline .line_string, .concat_string => |str, tag| {
|
||||
inline .line_string, .space_string, .concat_string => |str, tag| {
|
||||
if (tag == .line_string)
|
||||
try state.string_builder.append(arena_alloc, '\n');
|
||||
if (tag == .space_string)
|
||||
try state.string_builder.append(arena_alloc, ' ');
|
||||
try state.string_builder.appendSlice(arena_alloc, str);
|
||||
},
|
||||
else => {
|
||||
@ -249,10 +251,14 @@ pub const State = struct {
|
||||
state.expect_shift = .dedent;
|
||||
switch (in_line) {
|
||||
.empty => unreachable,
|
||||
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
|
||||
.scalar => {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "the document may not contain a scalar value on its own line";
|
||||
return error.UnexpectedValue;
|
||||
},
|
||||
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
|
||||
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
|
||||
.line_string, .concat_string => |str| {
|
||||
.line_string, .space_string, .concat_string => |str| {
|
||||
const new_string = try appendListGetValue(list, Value.emptyString());
|
||||
try state.string_builder.appendSlice(arena_alloc, str);
|
||||
try state.value_stack.append(new_string);
|
||||
@ -266,7 +272,7 @@ pub const State = struct {
|
||||
switch (value) {
|
||||
.empty => state.expect_shift = .indent,
|
||||
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
|
||||
.line_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
|
||||
.line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
|
||||
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
|
||||
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
|
||||
}
|
||||
@ -291,7 +297,7 @@ pub const State = struct {
|
||||
|
||||
if (state.expect_shift != .indent or line.shift != .indent) {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "the document contains an invalid map key in a list";
|
||||
state.diagnostics.message = "the document contains a map item where a list item is expected";
|
||||
return error.UnexpectedValue;
|
||||
}
|
||||
|
||||
@ -348,12 +354,16 @@ pub const State = struct {
|
||||
|
||||
switch (in_line) {
|
||||
.empty => unreachable,
|
||||
.scalar => |str| try state.putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb),
|
||||
.scalar => {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "the document may not contain a scalar value on its own line";
|
||||
return error.UnexpectedValue;
|
||||
},
|
||||
.inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb),
|
||||
.inline_map => |str| {
|
||||
try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb);
|
||||
},
|
||||
.line_string, .concat_string => |str| {
|
||||
.line_string, .space_string, .concat_string => |str| {
|
||||
// string pushes the stack
|
||||
const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb);
|
||||
try state.string_builder.appendSlice(arena_alloc, str);
|
||||
@ -375,7 +385,7 @@ pub const State = struct {
|
||||
|
||||
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "the document contains an invalid list item in a map";
|
||||
state.diagnostics.message = "the document contains a list item where a map item is expected";
|
||||
return error.UnexpectedValue;
|
||||
}
|
||||
|
||||
@ -395,7 +405,7 @@ pub const State = struct {
|
||||
state.dangling_key = dupekey;
|
||||
},
|
||||
.scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
|
||||
.line_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
|
||||
.line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
|
||||
.inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb),
|
||||
.inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb),
|
||||
}
|
||||
@ -567,7 +577,12 @@ pub const State = struct {
|
||||
// forbid these characters so that inline dictionary keys cannot start
|
||||
// with characters that regular dictionary keys cannot start with
|
||||
// (even though they're unambiguous in this specific context).
|
||||
'{', '[', '#', '-', '>', '|', ',' => return {
|
||||
'{', '[', '#', ',' => return {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "this document contains a inline map key that starts with an invalid character";
|
||||
return error.BadToken;
|
||||
},
|
||||
'-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') {
|
||||
state.diagnostics.length = 1;
|
||||
state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence";
|
||||
return error.BadToken;
|
||||
|
@ -23,6 +23,7 @@ pub const InlineItem = union(enum) {
|
||||
empty: void,
|
||||
scalar: []const u8,
|
||||
line_string: []const u8,
|
||||
space_string: []const u8,
|
||||
concat_string: []const u8,
|
||||
|
||||
inline_list: []const u8,
|
||||
@ -162,104 +163,113 @@ pub fn LineTokenizer(comptime Buffer: type) type {
|
||||
// this should not be possible, as empty lines are caught earlier.
|
||||
if (line.len == 0) return error.Impossible;
|
||||
|
||||
switch (line[0]) {
|
||||
'#' => {
|
||||
// force comments to be followed by a space. This makes them
|
||||
// behave the same way as strings, actually.
|
||||
if (line.len > 1 and line[1] != ' ') {
|
||||
self.buffer.diag().line_offset += 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
|
||||
return error.BadToken;
|
||||
}
|
||||
|
||||
// simply lie about indentation when the line is a comment.
|
||||
quantized = self.last_indent;
|
||||
return .{
|
||||
.shift = .none,
|
||||
.contents = .{ .comment = line[1..] },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
'|', '>', '[', '{' => {
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .in_line = try self.detectInlineItem(line) },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
'-' => {
|
||||
if (line.len > 1 and line[1] != ' ') {
|
||||
self.buffer.diag().line_offset += 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
|
||||
return error.BadToken;
|
||||
}
|
||||
|
||||
// blindly add 2 here because an empty item cannot fail in
|
||||
// the value, only if a bogus dedent has occurred
|
||||
self.buffer.diag().line_offset += 2;
|
||||
|
||||
return if (line.len == 1) .{
|
||||
.shift = shift,
|
||||
.contents = .{ .list_item = .empty },
|
||||
.raw = line,
|
||||
} else .{
|
||||
.shift = shift,
|
||||
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
else => {
|
||||
for (line, 0..) |char, idx| {
|
||||
if (char == ':') {
|
||||
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
|
||||
self.buffer.diag().line_offset += idx - 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
|
||||
return error.TrailingWhitespace;
|
||||
}
|
||||
|
||||
if (idx + 1 == line.len) {
|
||||
self.buffer.diag().line_offset += idx + 1;
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
|
||||
.raw = line,
|
||||
};
|
||||
}
|
||||
|
||||
if (line[idx + 1] != ' ') {
|
||||
self.buffer.diag().line_offset += idx + 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
|
||||
return error.BadToken;
|
||||
}
|
||||
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .map_item = .{
|
||||
.key = line[0..idx],
|
||||
.val = try self.detectInlineItem(line[idx + 2 ..]),
|
||||
} },
|
||||
.raw = line,
|
||||
};
|
||||
sigil: {
|
||||
switch (line[0]) {
|
||||
'#' => {
|
||||
// Force comments to be followed by a space. We could
|
||||
// allow #: to be interpreted as a map key, but I'm going
|
||||
// to specifically forbid it instead.
|
||||
if (line.len > 1 and line[1] != ' ') {
|
||||
self.buffer.diag().line_offset += 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
|
||||
return error.BadToken;
|
||||
}
|
||||
}
|
||||
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .in_line = .{ .scalar = line } },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
// simply lie about indentation when the line is a comment.
|
||||
quantized = self.last_indent;
|
||||
return .{
|
||||
.shift = .none,
|
||||
.contents = .{ .comment = line[1..] },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
'|', '>', '+' => {
|
||||
if (line.len > 1 and line[1] != ' ') {
|
||||
// we want to try parsing this as a map key
|
||||
break :sigil;
|
||||
}
|
||||
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .in_line = try self.detectInlineItem(line) },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
'[', '{' => {
|
||||
// these don't require being followed by a space, so they
|
||||
// cannot be interpreted as starting a map key in any way.
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .in_line = try self.detectInlineItem(line) },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
'-' => {
|
||||
if (line.len > 1 and line[1] != ' ') {
|
||||
// we want to try parsing this as a map key
|
||||
break :sigil;
|
||||
}
|
||||
|
||||
// blindly add 2 here because an empty item cannot fail in
|
||||
// the value, only if a bogus dedent has occurred
|
||||
self.buffer.diag().line_offset += 2;
|
||||
|
||||
return if (line.len == 1) .{
|
||||
.shift = shift,
|
||||
.contents = .{ .list_item = .empty },
|
||||
.raw = line,
|
||||
} else .{
|
||||
.shift = shift,
|
||||
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
|
||||
.raw = line,
|
||||
};
|
||||
},
|
||||
else => break :sigil,
|
||||
}
|
||||
}
|
||||
|
||||
// somehow everything else has failed
|
||||
self.buffer.diag().line_offset = 0;
|
||||
self.buffer.diag().length = raw_line.len;
|
||||
self.buffer.diag().message = "this document contains an unknown error. Please report this.";
|
||||
return error.Impossible;
|
||||
for (line, 0..) |char, idx| {
|
||||
if (char == ':') {
|
||||
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
|
||||
self.buffer.diag().line_offset += idx - 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
|
||||
return error.TrailingWhitespace;
|
||||
}
|
||||
|
||||
if (idx + 1 == line.len) {
|
||||
self.buffer.diag().line_offset += idx + 1;
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
|
||||
.raw = line,
|
||||
};
|
||||
}
|
||||
|
||||
if (line[idx + 1] != ' ') {
|
||||
self.buffer.diag().line_offset += idx + 1;
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
|
||||
return error.BadToken;
|
||||
}
|
||||
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .map_item = .{
|
||||
.key = line[0..idx],
|
||||
.val = try self.detectInlineItem(line[idx + 2 ..]),
|
||||
} },
|
||||
.raw = line,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return .{
|
||||
.shift = shift,
|
||||
.contents = .{ .in_line = .{ .scalar = line } },
|
||||
.raw = line,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@ -281,8 +291,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
|
||||
};
|
||||
|
||||
switch (buf[start]) {
|
||||
'>', '|' => |char| {
|
||||
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken;
|
||||
'>', '|', '+' => |char| {
|
||||
if (buf.len - start > 1 and buf[start + 1] != ' ') {
|
||||
self.buffer.diag().length = 1;
|
||||
self.buffer.diag().message = "this line is missing a space after the string start character";
|
||||
return error.BadToken;
|
||||
}
|
||||
|
||||
const slice: []const u8 = switch (buf[buf.len - 1]) {
|
||||
' ', '\t' => {
|
||||
@ -295,10 +309,12 @@ pub fn LineTokenizer(comptime Buffer: type) type {
|
||||
else => buf[start + @min(2, buf.len - start) .. buf.len],
|
||||
};
|
||||
|
||||
return if (char == '>')
|
||||
.{ .line_string = slice }
|
||||
else
|
||||
.{ .concat_string = slice };
|
||||
return switch (char) {
|
||||
'>' => .{ .line_string = slice },
|
||||
'+' => .{ .space_string = slice },
|
||||
'|' => .{ .concat_string = slice },
|
||||
else => unreachable,
|
||||
};
|
||||
},
|
||||
'[' => {
|
||||
if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
|
||||
|
Loading…
x
Reference in New Issue
Block a user