state/tokenizer: go completely the opposite direction re: whitespace

This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.

The primary motivation here is to apply the principle of least
astonishment. For example, the following

  -  [hello, there]

would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so

  >  hello

will produce the string " hello" due to the additional space after the
string designator. For flow lists,

  [ a, b ]

would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier for people to read, an explicit design goal. For example:

  key:   [  1,  2,  3 ]
  other: [ 10, 20, 30 ]

is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:

  foo:  {  bar:  baz }
  fooq: { barq: bazq }

is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before; this also means
that keys consisting only of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
This commit is contained in:
2023-10-04 22:54:53 -07:00
parent 1683197bc0
commit 7db6094dd5
2 changed files with 88 additions and 59 deletions

View File

@@ -452,10 +452,10 @@ pub const State = struct {
charloop: for (contents, 0..) |char, idx| {
switch (pstate) {
.want_list_item => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
// empty value
// don't check for whitespace here: [ , ] is okay, as is [ , , ]
const tip = try state.getStackTip();
try tip.flow_list.append(Value.newScalar(arena_alloc));
item_start = idx + 1;
@@ -500,35 +500,33 @@ pub const State = struct {
},
},
.consuming_list_item => switch (char) {
// consider: detecting trailing whitespace. "[ 1 ]" should
// produce "1" and not "1 " as it currently does, which breaks
// the principle of least astonishment. design: no trailing
// whitespace before "," and only a single space is allowed before "]"
',' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains whitespace before ,";
return error.TrailingWhitespace;
}
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try tip.flow_list.append(
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
try Value.fromScalar(arena_alloc, contents[item_start..end]),
);
item_start = idx + 1;
pstate = .want_list_item;
},
']' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains extra whitespace before ]";
return error.TrailingWhitespace;
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
end = idx - 1;
}
break :end countup;
};
const finished = state.value_stack.getLastOrNull() orelse {
state.diagnostics.length = 1;
@@ -543,7 +541,8 @@ pub const State = struct {
else => continue :charloop,
},
.want_list_separator => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
item_start = idx;
pstate = .want_list_item;
@@ -556,7 +555,8 @@ pub const State = struct {
},
},
.want_map_key => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
// forbid these characters so that flow dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
@@ -578,18 +578,22 @@ pub const State = struct {
},
.consuming_map_key => switch (char) {
':' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before :";
return error.TrailingWhitespace;
}
dangling_key = try arena_alloc.dupe(u8, contents[item_start..idx]);
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
dangling_key = try arena_alloc.dupe(u8, contents[item_start..end]);
pstate = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
const tip = try state.getStackTip();
try state.putMap(
@@ -651,31 +655,34 @@ pub const State = struct {
},
.consuming_map_value => switch (char) {
',' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before ,";
return error.TrailingWhitespace;
}
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
&tip.flow_map,
dangling_key.?,
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
try Value.fromScalar(arena_alloc, contents[item_start..end]),
dkb,
);
dangling_key = null;
pstate = .want_map_key;
},
'}' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains extra whitespace before }";
return error.TrailingWhitespace;
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
end = idx - 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
@@ -690,7 +697,8 @@ pub const State = struct {
else => continue :charloop,
},
.want_map_separator => switch (char) {
' ', '\t' => continue :charloop,
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => pstate = .want_map_key,
'}' => pstate = try state.popFlowStack(),
else => return {