diff --git a/src/parser/state.zig b/src/parser/state.zig index 1532027..eb40e5a 100644 --- a/src/parser/state.zig +++ b/src/parser/state.zig @@ -59,7 +59,7 @@ pub const State = struct { }, }, .value => switch (state.value_stack.getLast().*) { - // remove the final trailing newline or space + // we have an in-progress string, finish it. .string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc), // if we have a dangling -, attach an empty scalar to it .list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()), @@ -104,7 +104,7 @@ pub const State = struct { state.document.root = try Value.fromScalar(arena_alloc, str); state.mode = .done; }, - .line_string, .concat_string => |str| { + .line_string, .space_string, .concat_string => |str| { state.document.root = Value.emptyString(); try state.string_builder.appendSlice(arena_alloc, str); try state.value_stack.append(&state.document.root); @@ -128,7 +128,7 @@ pub const State = struct { switch (value) { .empty => state.expect_shift = .indent, .scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)), - .line_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)), + .line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)), .inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)), .inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)), } @@ -146,7 +146,7 @@ pub const State = struct { state.dangling_key = dupekey; }, .scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)), - .line_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)), + .line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)), .inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)), .inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)), } @@ -188,9 +188,11 @@ pub const State = struct { .comment => unreachable, .in_line => |in_line| switch (in_line) { .empty => unreachable, - inline .line_string, .concat_string => |str, tag| { + inline .line_string, .space_string, .concat_string => |str, tag| { if (tag == .line_string) try state.string_builder.append(arena_alloc, '\n'); + if (tag == .space_string) + try state.string_builder.append(arena_alloc, ' '); try state.string_builder.appendSlice(arena_alloc, str); }, else => { @@ -249,10 +251,14 @@ pub const State = struct { state.expect_shift = .dedent; switch (in_line) { .empty => unreachable, - .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), + .scalar => { + state.diagnostics.length = 1; + state.diagnostics.message = "the document may not contain a scalar value on its own line"; + return error.UnexpectedValue; + }, .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), - .line_string, .concat_string => |str| { + .line_string, .space_string, .concat_string => |str| { const new_string = try appendListGetValue(list, Value.emptyString()); try state.string_builder.appendSlice(arena_alloc, str); try state.value_stack.append(new_string); @@ -266,7 +272,7 @@ pub const State = struct { switch (value) { .empty => state.expect_shift = .indent, .scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)), - .line_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)), + .line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)), .inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)), .inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)), } @@ -291,7 +297,7 @@ pub const State = struct { if (state.expect_shift != .indent or line.shift != .indent) { state.diagnostics.length = 1; - state.diagnostics.message = "the document contains an invalid map key in a list"; + state.diagnostics.message = "the document contains a map item where a list item is expected"; return error.UnexpectedValue; } @@ -348,12 +354,16 @@ pub const State = struct { switch (in_line) { .empty => unreachable, - .scalar => |str| try state.putMap(map, state.dangling_key.?, try Value.fromScalar(arena_alloc, str), dkb), + .scalar => { + state.diagnostics.length = 1; + state.diagnostics.message = "the document may not contain a scalar value on its own line"; + return error.UnexpectedValue; + }, .inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_map => |str| { try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb); }, - .line_string, .concat_string => |str| { + .line_string, .space_string, .concat_string => |str| { // string pushes the stack const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb); try state.string_builder.appendSlice(arena_alloc, str); @@ -375,7 +385,7 @@ pub const State = struct { if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) { state.diagnostics.length = 1; - state.diagnostics.message = "the document contains an invalid list item in a map"; + state.diagnostics.message = "the document contains a list item where a map item is expected"; return error.UnexpectedValue; } @@ -395,7 +405,7 @@ pub const State = struct { state.dangling_key = dupekey; }, .scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb), - .line_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb), + .line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb), .inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb), .inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb), } @@ -567,7 +577,12 @@ pub const State = struct { // forbid these characters so that inline dictionary keys cannot start // with characters that regular dictionary keys cannot start with // (even though they're unambiguous in this specific context). - '{', '[', '#', '-', '>', '|', ',' => return { + '{', '[', '#', ',' => return { + state.diagnostics.length = 1; + state.diagnostics.message = "this document contains a inline map key that starts with an invalid character"; + return error.BadToken; + }, + '-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') { state.diagnostics.length = 1; state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence"; return error.BadToken; diff --git a/src/tokenizer.zig b/src/tokenizer.zig index 7b42cf2..6c82f57 100644 --- a/src/tokenizer.zig +++ b/src/tokenizer.zig @@ -23,6 +23,7 @@ pub const InlineItem = union(enum) { empty: void, scalar: []const u8, line_string: []const u8, + space_string: []const u8, concat_string: []const u8, inline_list: []const u8, @@ -162,104 +163,113 @@ pub fn LineTokenizer(comptime Buffer: type) type { // this should not be possible, as empty lines are caught earlier. if (line.len == 0) return error.Impossible; - switch (line[0]) { - '#' => { - // force comments to be followed by a space. This makes them - // behave the same way as strings, actually. - if (line.len > 1 and line[1] != ' ') { - self.buffer.diag().line_offset += 1; - self.buffer.diag().length = 1; - self.buffer.diag().message = "this line is missing a space after the start of comment character '#'"; - return error.BadToken; - } - - // simply lie about indentation when the line is a comment. - quantized = self.last_indent; - return .{ - .shift = .none, - .contents = .{ .comment = line[1..] }, - .raw = line, - }; - }, - '|', '>', '[', '{' => { - return .{ - .shift = shift, - .contents = .{ .in_line = try self.detectInlineItem(line) }, - .raw = line, - }; - }, - '-' => { - if (line.len > 1 and line[1] != ' ') { - self.buffer.diag().line_offset += 1; - self.buffer.diag().length = 1; - self.buffer.diag().message = "this line is missing a space after the list entry character '-'"; - return error.BadToken; - } - - // blindly add 2 here because an empty item cannot fail in - // the value, only if a bogus dedent has occurred - self.buffer.diag().line_offset += 2; - - return if (line.len == 1) .{ - .shift = shift, - .contents = .{ .list_item = .empty }, - .raw = line, - } else .{ - .shift = shift, - .contents = .{ .list_item = try self.detectInlineItem(line[2..]) }, - .raw = line, - }; - }, - else => { - for (line, 0..) |char, idx| { - if (char == ':') { - if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) { - self.buffer.diag().line_offset += idx - 1; - self.buffer.diag().length = 1; - self.buffer.diag().message = "this line contains space before the map key-value separator character ':'"; - return error.TrailingWhitespace; - } - - if (idx + 1 == line.len) { - self.buffer.diag().line_offset += idx + 1; - return .{ - .shift = shift, - .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, - .raw = line, - }; - } - - if (line[idx + 1] != ' ') { - self.buffer.diag().line_offset += idx + 1; - self.buffer.diag().length = 1; - self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'"; - return error.BadToken; - } - - return .{ - .shift = shift, - .contents = .{ .map_item = .{ - .key = line[0..idx], - .val = try self.detectInlineItem(line[idx + 2 ..]), - } }, - .raw = line, - }; + sigil: { + switch (line[0]) { + '#' => { + // Force comments to be followed by a space. We could + // allow #: to be interpreted as a map key, but I'm going + // to specifically forbid it instead. + if (line.len > 1 and line[1] != ' ') { + self.buffer.diag().line_offset += 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the start of comment character '#'"; + return error.BadToken; } - } - return .{ - .shift = shift, - .contents = .{ .in_line = .{ .scalar = line } }, - .raw = line, - }; - }, + // simply lie about indentation when the line is a comment. + quantized = self.last_indent; + return .{ + .shift = .none, + .contents = .{ .comment = line[1..] }, + .raw = line, + }; + }, + '|', '>', '+' => { + if (line.len > 1 and line[1] != ' ') { + // we want to try parsing this as a map key + break :sigil; + } + + return .{ + .shift = shift, + .contents = .{ .in_line = try self.detectInlineItem(line) }, + .raw = line, + }; + }, + '[', '{' => { + // these don't require being followed by a space, so they + // cannot be interpreted as starting a map key in any way. + return .{ + .shift = shift, + .contents = .{ .in_line = try self.detectInlineItem(line) }, + .raw = line, + }; + }, + '-' => { + if (line.len > 1 and line[1] != ' ') { + // we want to try parsing this as a map key + break :sigil; + } + + // blindly add 2 here because an empty item cannot fail in + // the value, only if a bogus dedent has occurred + self.buffer.diag().line_offset += 2; + + return if (line.len == 1) .{ + .shift = shift, + .contents = .{ .list_item = .empty }, + .raw = line, + } else .{ + .shift = shift, + .contents = .{ .list_item = try self.detectInlineItem(line[2..]) }, + .raw = line, + }; + }, + else => break :sigil, + } } - // somehow everything else has failed - self.buffer.diag().line_offset = 0; - self.buffer.diag().length = raw_line.len; - self.buffer.diag().message = "this document contains an unknown error. Please report this."; - return error.Impossible; + for (line, 0..) |char, idx| { + if (char == ':') { + if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) { + self.buffer.diag().line_offset += idx - 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line contains space before the map key-value separator character ':'"; + return error.TrailingWhitespace; + } + + if (idx + 1 == line.len) { + self.buffer.diag().line_offset += idx + 1; + return .{ + .shift = shift, + .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, + .raw = line, + }; + } + + if (line[idx + 1] != ' ') { + self.buffer.diag().line_offset += idx + 1; + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'"; + return error.BadToken; + } + + return .{ + .shift = shift, + .contents = .{ .map_item = .{ + .key = line[0..idx], + .val = try self.detectInlineItem(line[idx + 2 ..]), + } }, + .raw = line, + }; + } + } + + return .{ + .shift = shift, + .contents = .{ .in_line = .{ .scalar = line } }, + .raw = line, + }; } return null; } @@ -281,8 +291,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { }; switch (buf[start]) { - '>', '|' => |char| { - if (buf.len - start > 1 and buf[start + 1] != ' ') return error.BadToken; + '>', '|', '+' => |char| { + if (buf.len - start > 1 and buf[start + 1] != ' ') { + self.buffer.diag().length = 1; + self.buffer.diag().message = "this line is missing a space after the string start character"; + return error.BadToken; + } const slice: []const u8 = switch (buf[buf.len - 1]) { ' ', '\t' => { @@ -295,10 +309,12 @@ pub fn LineTokenizer(comptime Buffer: type) type { else => buf[start + @min(2, buf.len - start) .. buf.len], }; - return if (char == '>') - .{ .line_string = slice } - else - .{ .concat_string = slice }; + return switch (char) { + '>' => .{ .line_string = slice }, + '+' => .{ .space_string = slice }, + '|' => .{ .concat_string = slice }, + else => unreachable, + }; }, '[' => { if (buf.len - start < 2 or buf[buf.len - 1] != ']') {