2023-09-24 18:22:12 -07:00
|
|
|
const std = @import("std");
|
|
|
|
|
|
|
|
const Diagnostics = @import("./parser.zig").Diagnostics;
|
|
|
|
|
|
|
|
/// The complete set of failures the line tokenizer reports.
pub const Error = error{
    /// A structural character is malformed, e.g. `#`, `-` or `:` is not
    /// followed by a space, or a flow list lacks its closing `]`.
    BadToken,

    /// `finish` found that the buffer still holds unconsumed content.
    ExtraContent,

    /// A line's leading whitespace uses the opposite character (tab vs.
    /// space) from the style already detected for the document.
    MixedIndentation,

    /// A line is indented more than one level deeper than the previous line.
    TooMuchIndentation,

    /// A space-indented line's indent is not a multiple of the detected
    /// number of spaces per level.
    UnquantizedIndentation,

    /// Whitespace appears where none is permitted: a whitespace-only line,
    /// whitespace before a map key's `:`, or at the end of a value.
    TrailingWhitespace,

    /// A tab was found in the whitespace that precedes an in-line value.
    IllegalTabWhitespaceInLine,

    /// An internal invariant was violated; this indicates a tokenizer bug.
    Impossible,
};
|
|
|
|
|
|
|
|
/// The indentation style observed in the document so far.
pub const DetectedIndentation = union(enum) {
    /// No leading whitespace has been seen yet, so the style is undecided.
    unknown,

    /// The document indents with spaces. The payload is the number of
    /// spaces per indentation level; it is 0 from the moment a leading
    /// space is first seen until the first indented line fixes the value.
    spaces: usize,

    /// The document indents with tabs.
    tabs,
};
|
|
|
|
|
|
|
|
/// A value that appears on a single line, either standing alone or as the
/// value part of a list item or map item.
pub const InlineItem = union(enum) {
    /// No value is present on the line.
    empty,

    /// A plain scalar; the payload is its text.
    scalar: []const u8,

    /// One line of a string introduced by `>`; the payload is the line's text.
    line_string: []const u8,

    /// One line of a string introduced by `|`; the payload is the line's text.
    space_string: []const u8,

    /// The raw text of a flow-style list (text starting with `[`).
    flow_list: []const u8,

    /// The raw text of a flow-style map.
    flow_map: []const u8,

    /// Return the byte associated with this kind of string line: '\n' for
    /// `line_string`, ' ' for `space_string`.
    ///
    /// Must only be called on one of the two string variants; calling it
    /// on any other variant reaches `unreachable`.
    pub fn lineEnding(self: InlineItem) u8 {
        switch (self) {
            .line_string => return '\n',
            .space_string => return ' ',
            .empty, .scalar, .flow_list, .flow_map => unreachable,
        }
    }
};
|
|
|
|
|
|
|
|
/// What a tokenized line holds once its indentation has been removed.
pub const LineContents = union(enum) {
    /// A `#` comment; the payload is everything after the `#`.
    comment: []const u8,

    /// A value standing on its own line.
    in_line: InlineItem,

    /// A `-` list entry; the payload is the entry's value.
    list_item: InlineItem,

    /// A `key: value` map entry. `key` is the text before the first `:`
    /// on the line and `val` is the entry's value.
    map_item: struct { key: []const u8, val: InlineItem },
};
|
|
|
|
|
|
|
|
/// How a line's indentation level relates to that of the previous line.
pub const ShiftDirection = enum {
    /// The line is indented deeper than the previous line.
    indent,
    /// The line is indented shallower than the previous line.
    dedent,
    /// The line is at the same level as the previous line.
    none,
};
|
|
|
|
|
|
|
|
/// A line's indentation change relative to the previous line, tagged by
/// `ShiftDirection`.
pub const LineShift = union(ShiftDirection) {
    /// The line is one level deeper than the previous line.
    indent,

    /// The line is shallower than the previous line. Several levels can be
    /// closed at once, so the payload is the number of levels dedented.
    dedent: usize,

    /// The indentation level did not change.
    none,
};
|
|
|
|
|
|
|
|
/// One tokenized line of the document.
pub const Line = struct {
    /// Indentation change relative to the previous line.
    shift: LineShift,

    /// The classified contents of the line.
    contents: LineContents,

    /// The text of the line with its leading indentation removed.
    raw: []const u8,
};
|
|
|
|
|
|
|
|
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
|
2023-09-27 23:44:06 -07:00
|
|
|
// technically be anything with a conformant interface.
|
2023-09-24 18:22:12 -07:00
|
|
|
pub fn LineTokenizer(comptime Buffer: type) type {
|
|
|
|
return struct {
|
|
|
|
buffer: Buffer,
|
|
|
|
index: usize = 0,
|
|
|
|
indentation: DetectedIndentation = .unknown,
|
|
|
|
last_indent: usize = 0,
|
|
|
|
|
2023-09-27 23:35:24 -07:00
|
|
|
/// Verify that the whole document has been consumed.
///
/// Succeeds when `self.buffer.empty()` reports true. Otherwise the
/// buffer's diagnostics are filled in and `error.ExtraContent` is returned.
pub fn finish(self: @This()) !void {
    // A fully drained buffer is the success case; there is nothing to report.
    if (self.buffer.empty()) return;

    self.buffer.diag().line_offset = 0;
    self.buffer.diag().length = 1;
    self.buffer.diag().message = "the document has extra content or is missing the final LF character";
    return error.ExtraContent;
}
|
|
|
|
|
2023-09-26 00:06:39 -07:00
|
|
|
/// Tokenize and return the next line that has content, or `null` once
/// `self.buffer.nextLine()` yields no more lines. Completely empty lines
/// are skipped.
///
/// For each line this function:
///   1. scans the leading whitespace, latching the document's indentation
///      style (tabs or spaces) the first time one is seen and rejecting
///      any later use of the other;
///   2. converts the raw indent into a number of indentation levels (for
///      spaces, the first indented line fixes the spaces-per-level value
///      and every later indent must be a multiple of it);
///   3. compares that level with the previous line's to produce a
///      `LineShift`, rejecting a jump of more than one level;
///   4. classifies the remaining text as a comment, a list item, a map
///      item, or an in-line value.
///
/// Comment lines always report `.none` as their shift and do not change
/// the remembered indentation level.
///
/// Errors from `nextLine` and `detectInlineItem` are propagated. Error
/// returns made directly by this function fill in the buffer's diagnostics
/// (`line_offset`, `length`, `message`) first, with the exception of the
/// defensive empty-line check. On success `line_offset` has been set to
/// the width of the indentation (and advanced further for list items and
/// empty map values) for use by later stages.
pub fn next(self: *@This()) !?Line {
    lineloop: while (try self.buffer.nextLine()) |raw_line| {
        var indent: usize = 0;
        for (raw_line, 0..) |char, idx| {
            switch (char) {
                ' ' => {
                    switch (self.indentation) {
                        // There's a weird coupling here because we can't set this until
                        // all spaces have been consumed. I also thought about ignoring
                        // spaces on comment lines since those don't affect the
                        // relative indent/dedent, but then we would allow comments
                        // to ignore our indent quantum, which I dislike due to it making
                        // ugly documents.
                        .unknown => self.indentation = .{ .spaces = 0 },
                        .spaces => {},
                        .tabs => {
                            self.buffer.diag().line_offset = idx;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "the document contains mixed tab/space indentation";
                            return error.MixedIndentation;
                        },
                    }
                },
                '\t' => {
                    switch (self.indentation) {
                        .unknown => self.indentation = .tabs,
                        .spaces => {
                            self.buffer.diag().line_offset = idx;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "the document contains mixed tab/space indentation";
                            return error.MixedIndentation;
                        },
                        .tabs => {},
                    }
                },
                '\r' => {
                    // Populate the diagnostics like every other failure in
                    // this scan so the caller can point at the offending byte.
                    self.buffer.diag().line_offset = idx;
                    self.buffer.diag().length = 1;
                    self.buffer.diag().message = "this line contains a carriage return character where indentation or content was expected";
                    return error.BadToken;
                },
                else => {
                    indent = idx;
                    break;
                },
            }
        } else {
            // The loop ran to completion, so the line is either empty or
            // consists solely of whitespace.
            if (raw_line.len > 0) {
                self.buffer.diag().line_offset = raw_line.len - 1;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "this line contains trailing whitespace";
                return error.TrailingWhitespace;
            }
            continue :lineloop;
        }

        var quantized: usize = if (self.indentation == .spaces) quant: {
            // An unindented line is always level 0. Answering here also
            // avoids dividing by a zero quantum: an earlier call that
            // failed part-way through its whitespace scan can leave
            // `.spaces = 0` behind.
            if (indent == 0) break :quant 0;

            // The first indented line fixes the number of spaces per level.
            if (self.indentation.spaces == 0) {
                self.indentation.spaces = indent;
            }
            if (@rem(indent, self.indentation.spaces) != 0) {
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = indent;
                self.buffer.diag().message = "this line contains incorrectly quantized indentation";
                return error.UnquantizedIndentation;
            }

            break :quant @divExact(indent, self.indentation.spaces);
        } else indent;

        const shift: LineShift = if (quantized > self.last_indent) rel: {
            if ((quantized - self.last_indent) > 1) {
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = indent;
                self.buffer.diag().message = "this line contains too much indentation";
                return error.TooMuchIndentation;
            }
            break :rel .indent;
        } else if (quantized < self.last_indent)
            .{ .dedent = self.last_indent - quantized }
        else
            .none;

        // Runs when this iteration's scope exits and reads `quantized` at
        // that moment, so the comment branch below can override the value
        // that gets remembered.
        defer {
            self.last_indent = quantized;
        }

        // update the diagnostics so that the parser can use them without
        // knowing about the whitespace.
        self.buffer.diag().line_offset = indent;

        const line = raw_line[indent..];

        // this should not be possible, as empty lines are caught earlier.
        if (line.len == 0) return error.Impossible;

        switch (line[0]) {
            '#' => {
                // force comments to be followed by a space. This makes them
                // behave the same way as strings, actually.
                if (line.len > 1 and line[1] != ' ') {
                    self.buffer.diag().line_offset += 1;
                    self.buffer.diag().length = 1;
                    self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
                    return error.BadToken;
                }

                // simply lie about indentation when the line is a comment.
                quantized = self.last_indent;
                return .{
                    .shift = .none,
                    .contents = .{ .comment = line[1..] },
                    .raw = line,
                };
            },
            '|', '>', '[', '{' => {
                return .{
                    .shift = shift,
                    .contents = .{ .in_line = try self.detectInlineItem(line) },
                    .raw = line,
                };
            },
            '-' => {
                if (line.len > 1 and line[1] != ' ') {
                    self.buffer.diag().line_offset += 1;
                    self.buffer.diag().length = 1;
                    self.buffer.diag().message = "this line is missing a space after the list entry character '-'";
                    return error.BadToken;
                }

                // blindly add 2 here because an empty item cannot fail in
                // the value, only if a bogus dedent has occurred
                self.buffer.diag().line_offset += 2;

                return if (line.len == 1) .{
                    .shift = shift,
                    .contents = .{ .list_item = .empty },
                    .raw = line,
                } else .{
                    .shift = shift,
                    .contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
                    .raw = line,
                };
            },
            else => {
                // The first ':' on the line, if any, separates a map key
                // from its value.
                for (line, 0..) |char, idx| {
                    if (char == ':') {
                        if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
                            self.buffer.diag().line_offset += idx - 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
                            return error.TrailingWhitespace;
                        }

                        // The ':' is the last character: a key with no
                        // value on this line.
                        if (idx + 1 == line.len) {
                            self.buffer.diag().line_offset += idx + 1;
                            return .{
                                .shift = shift,
                                .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                .raw = line,
                            };
                        }

                        if (line[idx + 1] != ' ') {
                            self.buffer.diag().line_offset += idx + 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
                            return error.BadToken;
                        }

                        return .{
                            .shift = shift,
                            .contents = .{ .map_item = .{
                                .key = line[0..idx],
                                .val = try self.detectInlineItem(line[idx + 2 ..]),
                            } },
                            .raw = line,
                        };
                    }
                }

                // No ':' anywhere on the line: the whole line is a scalar.
                return .{
                    .shift = shift,
                    .contents = .{ .in_line = .{ .scalar = line } },
                    .raw = line,
                };
            },
        }

        // somehow everything else has failed
        self.buffer.diag().line_offset = 0;
        self.buffer.diag().length = raw_line.len;
        self.buffer.diag().message = "this document contains an unknown error. Please report this.";
        return error.Impossible;
    }
    return null;
}
|
|
|
|
|
2023-09-27 23:44:06 -07:00
|
|
|
// TODO: it's impossible to get the right diagnostic offset in this function at the moment
|
|
|
|
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
|
2023-09-24 18:22:12 -07:00
|
|
|
if (buf.len == 0) return .empty;
|
|
|
|
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
- [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
> hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before. This also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
const start = start: {
|
|
|
|
for (buf, 0..) |chr, idx|
|
|
|
|
if (chr == ' ')
|
|
|
|
continue
|
|
|
|
else if (chr == '\t')
|
|
|
|
return error.IllegalTabWhitespaceInLine
|
|
|
|
else
|
|
|
|
break :start idx;
|
|
|
|
|
|
|
|
return error.TrailingWhitespace;
|
|
|
|
};
|
|
|
|
|
|
|
|
switch (buf[start]) {
|
2023-09-24 18:22:12 -07:00
|
|
|
'>', '|' => |char| {
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
- [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
> hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before. This also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken;
|
2023-09-24 18:22:12 -07:00
|
|
|
|
|
|
|
const slice: []const u8 = switch (buf[buf.len - 1]) {
|
2023-09-27 23:44:06 -07:00
|
|
|
' ', '\t' => {
|
|
|
|
self.buffer.diag().line_offset = 0;
|
|
|
|
self.buffer.diag().length = 1;
|
|
|
|
self.buffer.diag().message = "this line contains trailing whitespace";
|
|
|
|
return error.TrailingWhitespace;
|
|
|
|
},
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
- [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
> hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before. This also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
'|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
|
|
|
|
else => buf[start + @min(2, buf.len - start) .. buf.len],
|
2023-09-24 18:22:12 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
return if (char == '>')
|
|
|
|
.{ .line_string = slice }
|
|
|
|
else
|
|
|
|
.{ .space_string = slice };
|
|
|
|
},
|
|
|
|
'[' => {
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
- [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
> hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before. This also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
|
2023-09-27 23:44:06 -07:00
|
|
|
self.buffer.diag().line_offset = 0;
|
|
|
|
self.buffer.diag().length = 1;
|
|
|
|
self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
|
2023-09-24 18:22:12 -07:00
|
|
|
return error.BadToken;
|
2023-09-27 23:44:06 -07:00
|
|
|
}
|
2023-09-24 18:22:12 -07:00
|
|
|
|
|
|
|
// keep the closing ] for the flow parser
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
-  [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
>  hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before; this also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
return .{ .flow_list = buf[start + 1 ..] };
|
2023-09-24 18:22:12 -07:00
|
|
|
},
|
|
|
|
'{' => {
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
-  [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
>  hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before; this also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
|
2023-09-27 23:44:06 -07:00
|
|
|
self.buffer.diag().line_offset = 0;
|
|
|
|
self.buffer.diag().length = 1;
|
|
|
|
self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
|
2023-09-24 18:22:12 -07:00
|
|
|
return error.BadToken;
|
2023-09-27 23:44:06 -07:00
|
|
|
}
|
2023-09-24 18:22:12 -07:00
|
|
|
|
|
|
|
// keep the closing } for the flow parser
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
-  [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
>  hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before; this also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
return .{ .flow_map = buf[start + 1 ..] };
|
2023-09-24 18:22:12 -07:00
|
|
|
},
|
|
|
|
else => {
|
2023-09-27 23:44:06 -07:00
|
|
|
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
|
|
|
|
self.buffer.diag().line_offset = 0;
|
|
|
|
self.buffer.diag().length = 1;
|
|
|
|
self.buffer.diag().message = "this line contains trailing whitespace";
|
2023-09-24 18:22:12 -07:00
|
|
|
return error.TrailingWhitespace;
|
2023-09-27 23:44:06 -07:00
|
|
|
}
|
2023-09-24 18:22:12 -07:00
|
|
|
|
state/tokenizer: go completely the opposite direction re: whitespace
This commit makes both the parser and tokenizer a lot more willing to
accept whitespace in places where it would previously cause strange
behavior. Also, whitespace is ignored preceding and following all
values and keys in flow-style objects now (in regular objects,
trailing whitespace is an error, and it is also an error for non-flow
map keys to have whitespace before the colon). Tabs are no longer
allowed as whitespace in the line. They can be inside scalar values,
though, including map keys. Also strings allow tabs inside of them.
The primary motivation here is to apply the principle of least
astonishment. For example, the following
-  [hello, there]
would previously have been parsed as the scalar " [hello, there]" due
to the presence of an additional space after the "-" list item
indicator. This obviously looks like a flow list, and the way it was
previously parsed was very visually confusing (this change does mean
that scalars cannot start with [, but strings can, so this is not a
real limitation). Note that strings still allow leading whitespace, so
>  hello
will produce the string " hello" due to the additional space after the
string designator. For flow lists,
[ a, b ]
would have been parsed as ["a", "b "], which was obviously confusing.
The previous commit fixed this by making whitespace rules more strict.
This commit fixes this by making whitespace rules more relaxed. In
particular, all whitespace preceding and following flow items is now
stripped. The main motivation for going in this direction is to allow
aligning list items over multiple lines, visually, which can make data
much easier to read for people, an explicit design goal. For example
key: [ 1, 2, 3 ]
other: [ 10, 20, 30 ]
is now allowed. The indentation rules do not allow right-aligning
"key" to "other", but I think that is acceptable (if we forced using
tabs for indentation, we could actually allow this, which I think is
worth consideration, at least). Flow maps are more generous:
foo: { bar: baz }
fooq: { barq: bazq }
is allowed because flow maps do not use whitespace as a structural
designator. These changes do affect how some things can be
represented. Scalar values can no longer contain leading or trailing
whitespace (previously they could contain leading whitespace). Map keys
cannot contain trailing whitespace (they could before; this also means
that keys consisting of whitespace cannot be represented at all).
Ultimately, given the other restrictions the format imposes on keys
and values, I find these to be acceptable and consistent with the goal
of the format.
2023-10-04 22:54:53 -07:00
|
|
|
return .{ .scalar = buf[start..] };
|
2023-09-24 18:22:12 -07:00
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|