state/tokenizer: go completely the opposite direction re: whitespace

This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.

The primary motivation here is to apply the principle of least
astonishment. For example, the following

  -  [hello, there]

would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so

  >  hello

will produce the string " hello" due to the additional space after the
string designator. For flow lists,

  [ a, b ]

would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example

  key:   [  1,  2,  3 ]
  other: [ 10, 20, 30 ]

is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:

  foo:  {  bar:  baz }
  fooq: { barq: bazq }

is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before). This also means
that keys consisting only of whitespace cannot be represented at all.
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
This commit is contained in:
torque 2023-10-04 22:54:53 -07:00
parent 1683197bc0
commit 7db6094dd5
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
2 changed files with 88 additions and 59 deletions

View File

@ -452,10 +452,10 @@ pub const State = struct {
charloop: for (contents, 0..) |char, idx| {
switch (pstate) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
// empty value
// don't check for whitespace here: [ , ] is okay, as is [ , , ]
const tip = try state.getStackTip();
try tip.flow_list.append(Value.newScalar(arena_alloc));
item_start = idx + 1;
@ -500,35 +500,33 @@ pub const State = struct {
},
},
.consuming_list_item => switch (char) {
// consider: detecting trailing whitespace. "[ 1 ]" should
// produce "1" and not "1 " as it currently does, which breaks
// the principle of least astonishment. design: no trailing
// whitespace before "," and only a single space is allowed before "]"
',' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains whitespace before ,";
return error.TrailingWhitespace;
}
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try tip.flow_list.append(
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
try Value.fromScalar(arena_alloc, contents[item_start..end]),
);
item_start = idx + 1;
pstate = .want_list_item;
},
']' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains extra whitespace before ]";
return error.TrailingWhitespace;
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
end = idx - 1;
}
break :end countup;
};
const finished = state.value_stack.getLastOrNull() orelse {
state.diagnostics.length = 1;
@ -543,7 +541,8 @@ pub const State = struct {
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
item_start = idx;
pstate = .want_list_item;
@ -556,7 +555,8 @@ pub const State = struct {
},
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
@ -578,18 +578,22 @@ pub const State = struct {
},
.consuming_map_key => switch (char) {
':' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before :";
return error.TrailingWhitespace;
}
dangling_key = try arena_alloc.dupe(u8, contents[item_start..idx]);
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
dangling_key = try arena_alloc.dupe(u8, contents[item_start..end]);
pstate = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
const tip = try state.getStackTip();
try state.putMap(
@ -651,31 +655,34 @@ pub const State = struct {
},
.consuming_map_value => switch (char) {
',' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before ,";
return error.TrailingWhitespace;
}
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
&tip.flow_map,
dangling_key.?,
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
try Value.fromScalar(arena_alloc, contents[item_start..end]),
dkb,
);
dangling_key = null;
pstate = .want_map_key;
},
'}' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains extra whitespace before }";
return error.TrailingWhitespace;
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
end = idx - 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
@ -690,7 +697,8 @@ pub const State = struct {
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => pstate = .want_map_key,
'}' => pstate = try state.popFlowStack(),
else => return {

View File

@ -9,6 +9,7 @@ pub const Error = error{
TooMuchIndentation,
UnquantizedIndentation,
TrailingWhitespace,
IllegalTabWhitespaceInLine,
Impossible,
};
@ -220,13 +221,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
else => {
for (line, 0..) |char, idx| {
if (char == ':') {
self.buffer.diag().line_offset += idx + 2;
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
self.buffer.diag().line_offset += idx - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
return error.TrailingWhitespace;
}
if (idx + 1 == line.len) return .{
.shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
if (idx + 1 == line.len) {
self.buffer.diag().line_offset += idx + 1;
return .{
.shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
}
if (line[idx + 1] != ' ') {
self.buffer.diag().line_offset += idx + 1;
@ -267,9 +276,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty;
switch (buf[0]) {
const start = start: {
for (buf, 0..) |chr, idx|
if (chr == ' ')
continue
else if (chr == '\t')
return error.IllegalTabWhitespaceInLine
else
break :start idx;
return error.TrailingWhitespace;
};
switch (buf[start]) {
'>', '|' => |char| {
if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken;
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => {
@ -278,8 +299,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
'|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
else => buf[@min(2, buf.len)..buf.len],
'|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
else => buf[start + @min(2, buf.len - start) .. buf.len],
};
return if (char == '>')
@ -288,7 +309,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
.{ .space_string = slice };
},
'[' => {
if (buf.len < 2 or buf[buf.len - 1] != ']') {
if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
@ -296,10 +317,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}
// keep the closing ] for the flow parser
return .{ .flow_list = buf[1..] };
return .{ .flow_list = buf[start + 1 ..] };
},
'{' => {
if (buf.len < 2 or buf[buf.len - 1] != '}') {
if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
@ -307,7 +328,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
}
// keep the closing } fpr the flow parser
return .{ .flow_map = buf[1..] };
return .{ .flow_map = buf[start + 1 ..] };
},
else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
@ -317,7 +338,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
return error.TrailingWhitespace;
}
return .{ .scalar = buf };
return .{ .scalar = buf[start..] };
},
}
}