state/tokenizer: go completely the opposite direction re: whitespace

This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace preceding and following all values and keys in flow-style objects is now ignored (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can appear inside scalar values, though, including map keys. Strings also allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following

    -  [hello, there]

would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation). Note that strings still allow leading whitespace, so

    >  hello

will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes it by making whitespace rules more relaxed instead. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow visually aligning list items over multiple lines, which can make data much easier for people to read, an explicit design goal. For example

    key:   [  1,  2,  3 ]
    other: [ 10, 20, 30 ]

is now allowed. The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least).
Flow maps are more generous:

    foo:  {  bar:  baz }
    fooq: { barq: bazq }

is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting only of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00 · 2023-10-04 22:54:53 -07:00 · 7db6094dd5
commit 7db6094dd5
parent 1683197bc0
2 changed files with 88 additions and 59 deletions
--- a/src/parser/state.zig
+++ b/src/parser/state.zig
@ -452,10 +452,10 @@ pub const State = struct {
        charloop: for (contents, 0..) |char, idx| {
            switch (pstate) {
                .want_list_item => switch (char) {
-                    ' ', '\t' => continue :charloop,
+                    ' ' => continue :charloop,
+                    '\t' => return error.IllegalTabWhitespaceInLine,
                    ',' => {
                        // empty value
-                        // don't check for whitespace here: [ , ] is okay, as is [ , , ]
                        const tip = try state.getStackTip();
                        try tip.flow_list.append(Value.newScalar(arena_alloc));
                        item_start = idx + 1;
@ -500,35 +500,33 @@ pub const State = struct {
                    },
                },
                .consuming_list_item => switch (char) {
-                    // consider: detecting trailing whitespace. "[ 1 ]" should
-                    // produce "1" and not "1 " as it currently does, which breaks
-                    // the principle of least astonishment. design: no trailing
-                    // whitespace before "," and only a single space is allowed before "]"
                    ',' => {
-                        if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
-                            state.diagnostics.length = 1;
-                            state.diagnostics.message = "the flow list contains whitespace before ,";
-                            return error.TrailingWhitespace;
-                        }
+                        const end = end: {
+                            var countup = @max(idx, 1) - 1;
+                            while (countup > 0) : (countup -= 1) {
+                                if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
+                                if (contents[countup] != ' ') break :end countup + 1;
+                            }
+                            break :end countup;
+                        };

                        const tip = try state.getStackTip();
                        try tip.flow_list.append(
-                            try Value.fromScalar(arena_alloc, contents[item_start..idx]),
+                            try Value.fromScalar(arena_alloc, contents[item_start..end]),
                        );
                        item_start = idx + 1;

                        pstate = .want_list_item;
                    },
                    ']' => {
-                        var end = idx;
-                        if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
-                            if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
-                                state.diagnostics.length = 1;
-                                state.diagnostics.message = "the flow list contains extra whitespace before ]";
-                                return error.TrailingWhitespace;
+                        const end = end: {
+                            var countup = @max(idx, 1) - 1;
+                            while (countup > 0) : (countup -= 1) {
+                                if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
+                                if (contents[countup] != ' ') break :end countup + 1;
                            }
-                            end = idx - 1;
-                        }
+                            break :end countup;
+                        };

                        const finished = state.value_stack.getLastOrNull() orelse {
                            state.diagnostics.length = 1;
@ -543,7 +541,8 @@ pub const State = struct {
                    else => continue :charloop,
                },
                .want_list_separator => switch (char) {
-                    ' ', '\t' => continue :charloop,
+                    ' ' => continue :charloop,
+                    '\t' => return error.IllegalTabWhitespaceInLine,
                    ',' => {
                        item_start = idx;
                        pstate = .want_list_item;
@ -556,7 +555,8 @@ pub const State = struct {
                    },
                },
                .want_map_key => switch (char) {
-                    ' ', '\t' => continue :charloop,
+                    ' ' => continue :charloop,
+                    '\t' => return error.IllegalTabWhitespaceInLine,
                    // forbid these characters so that flow dictionary keys cannot start
                    // with characters that regular dictionary keys cannot start with
                    // (even though they're unambiguous in this specific context).
@ -578,18 +578,22 @@ pub const State = struct {
                },
                .consuming_map_key => switch (char) {
                    ':' => {
-                        if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
-                            state.diagnostics.length = 1;
-                            state.diagnostics.message = "the flow map contains whitespace before :";
-                            return error.TrailingWhitespace;
-                        }
-                        dangling_key = try arena_alloc.dupe(u8, contents[item_start..idx]);
+                        const end = end: {
+                            var countup = @max(idx, 1) - 1;
+                            while (countup > 0) : (countup -= 1) {
+                                if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
+                                if (contents[countup] != ' ') break :end countup + 1;
+                            }
+                            break :end countup;
+                        };
+                        dangling_key = try arena_alloc.dupe(u8, contents[item_start..end]);
                        pstate = .want_map_value;
                    },
                    else => continue :charloop,
                },
                .want_map_value => switch (char) {
-                    ' ', '\t' => continue :charloop,
+                    ' ' => continue :charloop,
+                    '\t' => return error.IllegalTabWhitespaceInLine,
                    ',' => {
                        const tip = try state.getStackTip();
                        try state.putMap(
@ -651,31 +655,34 @@ pub const State = struct {
                },
                .consuming_map_value => switch (char) {
                    ',' => {
-                        if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
-                            state.diagnostics.length = 1;
-                            state.diagnostics.message = "the flow map contains whitespace before ,";
-                            return error.TrailingWhitespace;
-                        }
+                        const end = end: {
+                            var countup = @max(idx, 1) - 1;
+                            while (countup > 0) : (countup -= 1) {
+                                if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
+                                if (contents[countup] != ' ') break :end countup + 1;
+                            }
+                            break :end countup;
+                        };
+
                        const tip = try state.getStackTip();
                        try state.putMap(
                            &tip.flow_map,
                            dangling_key.?,
-                            try Value.fromScalar(arena_alloc, contents[item_start..idx]),
+                            try Value.fromScalar(arena_alloc, contents[item_start..end]),
                            dkb,
                        );
                        dangling_key = null;
                        pstate = .want_map_key;
                    },
                    '}' => {
-                        var end = idx;
-                        if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
-                            if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
-                                state.diagnostics.length = 1;
-                                state.diagnostics.message = "the flow map contains extra whitespace before }";
-                                return error.TrailingWhitespace;
+                        const end = end: {
+                            var countup = @max(idx, 1) - 1;
+                            while (countup > 0) : (countup -= 1) {
+                                if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
+                                if (contents[countup] != ' ') break :end countup + 1;
                            }
-                            end = idx - 1;
-                        }
+                            break :end countup;
+                        };

                        const tip = try state.getStackTip();
                        try state.putMap(
@ -690,7 +697,8 @@ pub const State = struct {
                    else => continue :charloop,
                },
                .want_map_separator => switch (char) {
-                    ' ', '\t' => continue :charloop,
+                    ' ' => continue :charloop,
+                    '\t' => return error.IllegalTabWhitespaceInLine,
                    ',' => pstate = .want_map_key,
                    '}' => pstate = try state.popFlowStack(),
                    else => return {
--- a/src/tokenizer.zig
+++ b/src/tokenizer.zig
@ -9,6 +9,7 @@ pub const Error = error{
    TooMuchIndentation,
    UnquantizedIndentation,
    TrailingWhitespace,
+    IllegalTabWhitespaceInLine,
    Impossible,
 };

@ -220,13 +221,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    else => {
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
-                                self.buffer.diag().line_offset += idx + 2;
+                                if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
+                                    self.buffer.diag().line_offset += idx - 1;
+                                    self.buffer.diag().length = 1;
+                                    self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
+                                    return error.TrailingWhitespace;
+                                }

-                                if (idx + 1 == line.len) return .{
-                                    .shift = shift,
-                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
-                                    .raw = line,
-                                };
+                                if (idx + 1 == line.len) {
+                                    self.buffer.diag().line_offset += idx + 1;
+                                    return .{
+                                        .shift = shift,
+                                        .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
+                                        .raw = line,
+                                    };
+                                }

                                if (line[idx + 1] != ' ') {
                                    self.buffer.diag().line_offset += idx + 1;
@ -267,9 +276,21 @@ pub fn LineTokenizer(comptime Buffer: type) type {
        fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

-            switch (buf[0]) {
+            const start = start: {
+                for (buf, 0..) |chr, idx|
+                    if (chr == ' ')
+                        continue
+                    else if (chr == '\t')
+                        return error.IllegalTabWhitespaceInLine
+                    else
+                        break :start idx;
+
+                return error.TrailingWhitespace;
+            };
+
+            switch (buf[start]) {
                '>', '|' => |char| {
-                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;
+                    if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken;

                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => {
@ -278,8 +299,8 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                            self.buffer.diag().message = "this line contains trailing whitespace";
                            return error.TrailingWhitespace;
                        },
-                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
-                        else => buf[@min(2, buf.len)..buf.len],
+                        '|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
+                        else => buf[start + @min(2, buf.len - start) .. buf.len],
                    };

                    return if (char == '>')
@ -288,7 +309,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        .{ .space_string = slice };
                },
                '[' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != ']') {
+                    if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a flow-style list but does not end with the closing character ']'";
@ -296,10 +317,10 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    }

                    // keep the closing ] for the flow parser
-                    return .{ .flow_list = buf[1..] };
+                    return .{ .flow_list = buf[start + 1 ..] };
                },
                '{' => {
-                    if (buf.len < 2 or buf[buf.len - 1] != '}') {
+                    if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a flow-style map but does not end with the closing character '}'";
@ -307,7 +328,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                    }

                    // keep the closing } fpr the flow parser
-                    return .{ .flow_map = buf[1..] };
+                    return .{ .flow_map = buf[start + 1 ..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
@ -317,7 +338,7 @@ pub fn LineTokenizer(comptime Buffer: type) type {
                        return error.TrailingWhitespace;
                    }

-                    return .{ .scalar = buf };
+                    return .{ .scalar = buf[start..] };
                },
            }
        }