nice-data/src/parser/state.zig

816 lines
40 KiB
Zig
Raw Normal View History

// Copyright 2023 torque@epicyclic.dev
//
// Licensed under the MIT/Expat license. You may not use this file except in
// compliance with the license. You may obtain a copy of the license at
//
// https://spdx.org/licenses/MIT.html
//
// This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied.
const std = @import("std");
const tokenizer = @import("../tokenizer.zig");
const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Document = @import("./value.zig").Document;
const Value = @import("./value.zig").Value;
// States of a state machine for parsing flow-style (inline) lists and maps.
// The variants fall into a list group (item / separator), a map group
// (key / value / separator), and a terminal `done` state.
// NOTE(review): presumably driven by `State.parseFlow`, which is outside the
// visible part of this file — confirm the exact transitions there.
const FlowParseState = enum {
want_list_item,
consuming_list_item,
want_list_separator,
want_map_key,
consuming_map_key,
want_map_value,
consuming_map_value,
want_map_separator,
done,
};
pub const State = struct {
/// Stack of pointers to the values that are currently being built.
pub const Stack = std.ArrayList(*Value);

/// The document under construction. Its arena allocator backs the values
/// created while parsing.
document: Document,
/// Receives a `length` and `message` describing the problem when a parse
/// error is reported.
diagnostics: *Diagnostics,
/// The last entry is the value that the current line is applied to.
value_stack: Stack,
/// Accumulates the bytes of an in-progress multi-line string; it is turned
/// into an owned slice when the string is finished.
string_builder: std.ArrayListUnmanaged(u8),
/// `.initial`: no value line has been consumed yet. `.value`: a root value
/// is in progress. `.done`: the root value is complete.
mode: enum { initial, value, done } = .initial,
/// Set to `.indent` when a list item or map key had no inline value, i.e.
/// its value is expected on a following, indented line.
expect_shift: tokenizer.ShiftDirection = .none,
/// A duplicated map key that has been seen but has not yet received a value.
dangling_key: ?[]const u8 = null,
/// Create a parser state with an empty document and an empty value stack.
///
/// `allocator` is handed to both `Document.init` and the value stack.
/// `diagnostics` is stored by pointer and written to when an error is
/// reported, so it must outlive the returned state. Release the value stack
/// with `deinit`.
pub fn init(allocator: std.mem.Allocator, diagnostics: *Diagnostics) State {
    return .{
        .document = Document.init(allocator),
        .diagnostics = diagnostics,
        .value_stack = Stack.init(allocator),
        .string_builder = std.ArrayListUnmanaged(u8){},
    };
}
/// Release the value stack. Nothing else is freed here; in particular the
/// document is left untouched.
pub fn deinit(self: State) void {
    const stack = self.value_stack;
    stack.deinit();
}
/// Complete any in-progress value and return the parsed document.
///
/// - If no value line was consumed (`mode == .initial`), the root is chosen
///   by `options.default_object`: an empty string, list, or map. With
///   `.fail`, the diagnostics are filled in and `error.EmptyDocument` is
///   returned.
/// - If a value is in progress (`mode == .value`), the value on top of the
///   stack is finalized: a string takes ownership of the accumulated
///   `string_builder` bytes, and a dangling list item or map key receives an
///   empty scalar.
/// - If the root is already complete (`mode == .done`), nothing is changed.
///
/// Allocation failures, and any error from `putMap`, are propagated.
pub fn finish(state: *State, options: Options) !Document {
    const arena_alloc = state.document.arena.allocator();

    switch (state.mode) {
        .initial => switch (options.default_object) {
            .string => state.document.root = Value.emptyString(),
            .list => state.document.root = Value.newList(arena_alloc),
            .map => state.document.root = Value.newMap(arena_alloc),
            .fail => {
                state.diagnostics.length = 0;
                state.diagnostics.message = "the document is empty";
                return error.EmptyDocument;
            },
        },
        .value => switch (state.value_stack.getLast().*) {
            // we have an in-progress string, finish it.
            .string => |*string| string.* = try state.string_builder.toOwnedSlice(arena_alloc),
            // if we have a dangling -, attach an empty scalar to it
            .list => |*list| if (state.expect_shift == .indent) try list.append(Value.emptyScalar()),
            // if we have a dangling "key:", attach an empty scalar to it
            .map => |*map| if (state.dangling_key) |dk| try state.putMap(
                map,
                dk,
                Value.emptyScalar(),
                options.duplicate_key_behavior,
            ),
            .scalar, .inline_list, .inline_map => {},
        },
        .done => {},
    }

    return state.document;
}
pub fn parseLine(state: *State, line: tokenizer.Line, dkb: DuplicateKeyBehavior) !void {
if (line.contents == .comment) return;
// this gives us a second loop when the stack tip changes (i.e. during dedent or
// some indents (not all indents push the stack))
const arena_alloc = state.document.arena.allocator();
var firstpass = true;
restack: while (true) : (firstpass = false) {
switch (state.mode) {
.initial => {
if (line.shift == .indent) {
state.diagnostics.length = 1;
state.diagnostics.message = "the first object in the document cannot be indented";
return error.UnexpectedIndent;
}
switch (line.contents) {
// we filter out comments above
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
// empty scalars are only emitted for a list_item or a map_item
.empty => unreachable,
.scalar => |str| {
state.document.root = try Value.fromScalar(arena_alloc, str);
state.mode = .done;
},
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing the whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a usecase as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders(since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to be represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
state.document.root = Value.emptyString();
try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(&state.document.root);
state.mode = .value;
},
.inline_list => |str| {
state.document.root = try state.parseFlow(str, .inline_list, dkb);
state.mode = .done;
},
.inline_map => |str| {
state.document.root = try state.parseFlow(str, .inline_map, dkb);
state.mode = .done;
},
},
.list_item => |value| {
state.document.root = Value.newList(arena_alloc);
try state.value_stack.append(&state.document.root);
state.mode = .value;
const rootlist = &state.document.root.list;
switch (value) {
.empty => state.expect_shift = .indent,
.scalar => |str| try rootlist.append(try Value.fromScalar(arena_alloc, str)),
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing the whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a usecase as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders(since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to be represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| try rootlist.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootlist.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootlist.append(try state.parseFlow(str, .inline_map, dkb)),
}
},
.map_item => |pair| {
state.document.root = Value.newMap(arena_alloc);
try state.value_stack.append(&state.document.root);
state.mode = .value;
const rootmap = &state.document.root.map;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
state.expect_shift = .indent;
state.dangling_key = dupekey;
},
.scalar => |str| try rootmap.put(dupekey, try Value.fromScalar(arena_alloc, str)),
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| try rootmap.put(dupekey, try Value.fromString(arena_alloc, str)),
.inline_list => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try rootmap.put(dupekey, try state.parseFlow(str, .inline_map, dkb)),
}
},
}
},
.value => switch (state.value_stack.getLast().*) {
// these three states are never reachable here. inline_list and
// inline_map are parsed with a separate state machine. These
// value types can only be present by themselves as the first
// line of the document, in which case the document consists
// only of that single line: this parser jumps immediately into
// the .done state, bypassing the .value state in which this
// switch is embedded.
.scalar, .inline_list, .inline_map => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains invalid data following a single-line value";
return error.Fail;
},
.string => |*string| {
if (line.shift == .indent) {
state.diagnostics.length = 1;
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
state.diagnostics.message = "the document contains invalid indentation in a multiline string";
return error.UnexpectedIndent;
}
if (firstpass and line.shift == .dedent) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
// copy the string into the document proper
string.* = try state.string_builder.toOwnedSlice(arena_alloc);
var dedent_depth = line.shift.dedent;
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| switch (in_line) {
.empty => unreachable,
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
inline .line_string, .space_string, .concat_string => |str, tag| {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
if (tag == .line_string)
try state.string_builder.append(arena_alloc, '\n');
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
if (tag == .space_string)
try state.string_builder.append(arena_alloc, ' ');
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
try state.string_builder.appendSlice(arena_alloc, str);
},
else => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid object in a multiline string";
return error.UnexpectedValue;
},
},
else => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid object in a multiline string";
return error.UnexpectedValue;
},
}
},
.list => |*list| {
// detect that the previous item was actually empty
//
// -
// - something
//
// the first line here creates the state.expect_shift, but the second line
// is a valid continuation of the list despite not being indented
if (firstpass and (state.expect_shift == .indent and line.shift != .indent))
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
try list.append(Value.emptyScalar());
// Consider:
//
// -
// own-line scalar
// - inline scalar
//
// the own-line scalar will not push the stack but the next list item will be a dedent
if (firstpass and line.shift == .dedent) {
// if line.shift.dedent is 1 and we're expecting it, the stack will not be popped,
// but we will continue restack. However, firstpass will be set to false on the next
// trip, so this if prong will not be run again.
var dedent_depth = line.shift.dedent - @intFromBool(state.expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented and that indentation is expected.
if (state.expect_shift != .indent or line.shift != .indent) {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid inline object in a list";
return error.UnexpectedValue;
}
state.expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
const new_string = try appendListGetValue(list, Value.emptyString());
try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(new_string);
state.expect_shift = .none;
},
}
},
.list_item => |value| {
if (!firstpass or (line.shift == .none or line.shift == .dedent)) {
state.expect_shift = .none;
switch (value) {
.empty => state.expect_shift = .indent,
.scalar => |str| try list.append(try Value.fromScalar(arena_alloc, str)),
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| try list.append(try Value.fromString(arena_alloc, str)),
.inline_list => |str| try list.append(try state.parseFlow(str, .inline_list, dkb)),
.inline_map => |str| try list.append(try state.parseFlow(str, .inline_map, dkb)),
}
} else if (line.shift == .indent) {
if (state.expect_shift != .indent) return error.UnexpectedIndent;
const new_list = try appendListGetValue(list, Value.newList(arena_alloc));
try state.value_stack.append(new_list);
state.expect_shift = .none;
continue :restack;
} else unreachable;
},
.map_item => {
// this prong cannot be hit on dedent in a valid way.
//
// -
// map: value
// second: value
// third: value
//
// dedenting back to the list stack level requires list_item
if (state.expect_shift != .indent or line.shift != .indent) {
state.diagnostics.length = 1;
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
state.diagnostics.message = "the document contains a map item where a list item is expected";
return error.UnexpectedValue;
}
const new_map = try appendListGetValue(list, Value.newMap(arena_alloc));
try state.value_stack.append(new_map);
state.expect_shift = .none;
continue :restack;
},
}
},
.map => |*map| {
// detect that the previous item was actually empty
//
// foo:
// bar: baz
//
// the first line here creates the state.expect_shift, but the second line
// is a valid continuation of the map despite not being indented
if (firstpass and (state.expect_shift == .indent and line.shift != .indent)) {
try state.putMap(
map,
state.dangling_key orelse {
state.diagnostics.length = 1;
state.diagnostics.message = "the document is somehow missing a key (this shouldn't be possible)";
return error.Fail;
},
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
Value.emptyScalar(),
dkb,
);
state.dangling_key = null;
}
if (firstpass and line.shift == .dedent) {
var dedent_depth = line.shift.dedent - @intFromBool(state.expect_shift == .dedent);
while (dedent_depth > 0) : (dedent_depth -= 1)
_ = state.value_stack.pop();
continue :restack;
}
switch (line.contents) {
.comment => unreachable,
.in_line => |in_line| {
// assert that this line has been indented. this is required for an inline value when
// the stack is in map mode.
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid inline object in a map";
return error.UnexpectedValue;
}
state.expect_shift = .dedent;
switch (in_line) {
.empty => unreachable,
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.scalar => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document may not contain a scalar value on its own line";
return error.UnexpectedValue;
},
.inline_list => |str| try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| {
try state.putMap(map, state.dangling_key.?, try state.parseFlow(str, .inline_map, dkb), dkb);
},
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| {
// string pushes the stack
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
const new_string = try state.putMapGetValue(map, state.dangling_key.?, Value.emptyString(), dkb);
try state.string_builder.appendSlice(arena_alloc, str);
try state.value_stack.append(new_string);
state.expect_shift = .none;
},
}
state.dangling_key = null;
},
.list_item => {
// this prong cannot be hit on dedent in a valid way.
//
// map:
// - value
// - invalid
//
// dedenting back to the map stack level requires map_item
if (state.expect_shift != .indent or line.shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1;
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
state.diagnostics.message = "the document contains a list item where a map item is expected";
return error.UnexpectedValue;
}
const new_list = try state.putMapGetValue(map, state.dangling_key.?, Value.newList(arena_alloc), dkb);
try state.value_stack.append(new_list);
state.dangling_key = null;
state.expect_shift = .none;
continue :restack;
},
.map_item => |pair| {
if (!firstpass or (line.shift == .none or line.shift == .dedent)) {
state.expect_shift = .none;
const dupekey = try arena_alloc.dupe(u8, pair.key);
switch (pair.val) {
.empty => {
state.expect_shift = .indent;
state.dangling_key = dupekey;
},
.scalar => |str| try state.putMap(map, dupekey, try Value.fromScalar(arena_alloc, str), dkb),
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value). 
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
.line_string, .space_string, .concat_string => |str| try state.putMap(map, dupekey, try Value.fromString(arena_alloc, str), dkb),
.inline_list => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_list, dkb), dkb),
.inline_map => |str| try state.putMap(map, dupekey, try state.parseFlow(str, .inline_map, dkb), dkb),
}
} else if (line.shift == .indent) {
if (state.expect_shift != .indent or state.dangling_key == null) {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains indented map item in a map";
return error.UnexpectedValue;
}
const new_map = try state.putMapGetValue(map, state.dangling_key.?, Value.newMap(arena_alloc), dkb);
try state.value_stack.append(new_map);
state.dangling_key = null;
continue :restack;
} else unreachable;
},
}
},
},
.done => {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains extra data after the top level structure";
return error.ExtraContent;
},
}
// the stack has not changed, so break the loop
break :restack;
}
}
pub fn parseFlow(
state: *State,
contents: []const u8,
root_type: Value.TagType,
dkb: DuplicateKeyBehavior,
) !Value {
const arena_alloc = state.document.arena.allocator();
var root: Value = switch (root_type) {
.inline_list => Value.newFlowList(arena_alloc),
.inline_map => Value.newFlowMap(arena_alloc),
else => {
state.diagnostics.length = 1;
state.diagnostics.message = "the inline map or list was closed too many times";
return error.BadState;
},
};
var pstate: FlowParseState = switch (root_type) {
.inline_list => .want_list_item,
.inline_map => .want_map_key,
else => unreachable,
};
// used to distinguish between [] and [ ], and it also tracks
// a continuous value between different states
var item_start: usize = 0;
var dangling_key: ?[]const u8 = null;
try state.value_stack.append(&root);
charloop: for (contents, 0..) |char, idx| {
switch (pstate) {
.want_list_item => switch (char) {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation). Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
// empty value
const tip = try state.getStackTip();
try tip.inline_list.append(Value.emptyScalar());
item_start = idx + 1;
},
'{' => {
const tip = try state.getStackTip();
const new_map = try appendListGetValue(
&tip.inline_list,
Value.newFlowMap(arena_alloc),
);
item_start = idx;
try state.value_stack.append(new_map);
pstate = .want_map_key;
},
'[' => {
const tip = try state.getStackTip();
const new_list = try appendListGetValue(
&tip.inline_list,
Value.newFlowList(arena_alloc),
);
item_start = idx + 1;
try state.value_stack.append(new_list);
pstate = .want_list_item;
},
']' => {
const finished = state.value_stack.getLastOrNull() orelse {
state.diagnostics.length = 1;
state.diagnostics.message = "the inline list was closed too many times";
return error.BadState;
};
if (finished.inline_list.items.len > 0 or idx > item_start)
try finished.inline_list.append(Value.emptyScalar());
pstate = try state.popFlowStack();
},
else => {
item_start = idx;
pstate = .consuming_list_item;
},
},
.consuming_list_item => switch (char) {
',' => {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try tip.inline_list.append(
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
try Value.fromScalar(arena_alloc, contents[item_start..end]),
);
item_start = idx + 1;
pstate = .want_list_item;
},
']' => {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
break :end countup;
};
const finished = state.value_stack.getLastOrNull() orelse {
state.diagnostics.length = 1;
state.diagnostics.message = "the inline list was closed too many times";
return error.BadState;
};
try finished.inline_list.append(
try Value.fromScalar(arena_alloc, contents[item_start..end]),
);
pstate = try state.popFlowStack();
},
else => continue :charloop,
},
.want_list_separator => switch (char) {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
item_start = idx;
pstate = .want_list_item;
},
']' => pstate = try state.popFlowStack(),
else => return {
state.diagnostics.length = 1;
state.diagnostics.message = "the document contains an invalid inline list separator";
return error.BadToken;
},
},
.want_map_key => switch (char) {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
// forbid these characters so that inline dictionary keys cannot start
// with characters that regular dictionary keys cannot start with
// (even though they're unambiguous in this specific context).
parser: reintroduce space strings and change token parsing strategy Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: | directly concatenates this line with the previous line > prepends an LF character before concatenation + (NEW) prepends a single space character before concatenation The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a usecase as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: -1: negative one 0: zero +1: positive one But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value).
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
2023-10-18 00:20:19 -07:00
'{', '[', '#', ',' => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid character";
return error.BadToken;
},
'-', '>', '+', '|' => if ((idx + 1) < contents.len and contents[idx + 1] == ' ') {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains a inline map key that starts with an invalid sequence";
return error.BadToken;
},
':' => {
// we have an empty map key
dangling_key = "";
pstate = .want_map_value;
},
'}' => pstate = try state.popFlowStack(),
else => {
item_start = idx;
pstate = .consuming_map_key;
},
},
.consuming_map_key => switch (char) {
':' => {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
dangling_key = try arena_alloc.dupe(u8, contents[item_start..end]);
pstate = .want_map_value;
},
else => continue :charloop,
},
.want_map_value => switch (char) {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before. This also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => {
const tip = try state.getStackTip();
try state.putMap(
&tip.inline_map,
dangling_key.?,
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
Value.emptyScalar(),
dkb,
);
dangling_key = null;
pstate = .want_map_key;
},
'[' => {
const tip = try state.getStackTip();
const new_list = try state.putMapGetValue(
&tip.inline_map,
dangling_key.?,
Value.newFlowList(arena_alloc),
dkb,
);
try state.value_stack.append(new_list);
dangling_key = null;
item_start = idx + 1;
pstate = .want_list_item;
},
'{' => {
const tip = try state.getStackTip();
const new_map = try state.putMapGetValue(
&tip.inline_map,
dangling_key.?,
Value.newFlowMap(arena_alloc),
dkb,
);
try state.value_stack.append(new_map);
dangling_key = null;
pstate = .want_map_key;
},
'}' => {
// the value is an empty string and this map is closed
const tip = try state.getStackTip();
try state.putMap(
&tip.inline_map,
dangling_key.?,
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. 
Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
Value.emptyScalar(),
dkb,
);
dangling_key = null;
pstate = try state.popFlowStack();
},
else => {
item_start = idx;
pstate = .consuming_map_value;
},
},
.consuming_map_value => switch (char) {
',' => {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
&tip.inline_map,
dangling_key.?,
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
try Value.fromScalar(arena_alloc, contents[item_start..end]),
dkb,
);
dangling_key = null;
pstate = .want_map_key;
},
'}' => {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
const end = end: {
var countup = @max(idx, 1) - 1;
while (countup > 0) : (countup -= 1) {
if (contents[countup] == '\t') return error.IllegalTabWhitespaceInLine;
if (contents[countup] != ' ') break :end countup + 1;
}
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
break :end countup;
};
const tip = try state.getStackTip();
try state.putMap(
&tip.inline_map,
dangling_key.?,
try Value.fromScalar(arena_alloc, contents[item_start..end]),
dkb,
);
dangling_key = null;
pstate = try state.popFlowStack();
},
else => continue :charloop,
},
.want_map_separator => switch (char) {
state/tokenizer: go completely the opposite direction re: whitespace This commit makes both the parser and tokenizer a lot more willing to accept whitespace in places where it would previously cause strange behavior. Also, whitespace is ignored preceding and following all values and keys in flow-style objects now (in regular objects, trailing whitespace is an error, and it is also an error for non-flow map keys to have whitespace before the colon). Tabs are no longer allowed as whitespace in the line. They can be inside scalar values, though, including map keys. Also strings allow tabs inside of them. The primary motivation here is to apply the principle of least astonishment. For example, the following - [hello, there] would previously have been parsed as the scalar " [hello, there]" due to the presence of an additional space after the "-" list item indicator. This obviously looks like a flow list, and the way it was previously parsed was very visually confusing (this change does mean that scalars cannot start with [, but strings can, so this is not a real limitation. Note that strings still allow leading whitespace, so > hello will produce the string " hello" due to the additional space after the string designator. For flow lists, [ a, b ] would have been parsed as ["a", "b "], which was obviously confusing. The previous commit fixed this by making whitespace rules more strict. This commit fixes this by making whitespace rules more relaxed. In particular, all whitespace preceding and following flow items is now stripped. The main motivation for going in this direction is to allow aligning list items over multiple lines, visually, which can make data much easier to read for people, an explicit design goal. For example key: [ 1, 2, 3 ] other: [ 10, 20, 30 ] is now allowed. 
The indentation rules do not allow right-aligning "key" to "other", but I think that is acceptable (if we forced using tabs for indentation, we could actually allow this, which I think is worth consideration, at least). Flow maps are more generous: foo: { bar: baz } fooq: { barq: bazq } is allowed because flow maps do not use whitespace as a structural designator. These changes do affect how some things can be represented. Scalar values can no longer contain leading or trailing whitespace (previously they could contain leading whitespace). Map keys cannot contain trailing whitespace (they could before; this also means that keys consisting of whitespace cannot be represented at all). Ultimately, given the other restrictions the format imposes on keys and values, I find these to be acceptable and consistent with the goal of the format.
2023-10-04 22:54:53 -07:00
' ' => continue :charloop,
'\t' => return error.IllegalTabWhitespaceInLine,
',' => pstate = .want_map_key,
'}' => pstate = try state.popFlowStack(),
else => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains an invalid character instead of a inline map separator";
return error.BadToken;
},
},
// the root value was closed but there are characters remaining
// in the buffer
.done => return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document extra data after single item";
return error.BadState;
},
}
}
// we ran out of characters while still in the middle of an object
if (pstate != .done) return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains an unterminated inline map or list";
return error.BadState;
};
return root;
}
/// Return the value currently on top of the value stack without removing it.
///
/// If the stack is empty, a diagnostic message is recorded on
/// `state.diagnostics` and `error.BadState` is returned.
inline fn getStackTip(state: State) !*Value {
    return state.value_stack.getLastOrNull() orelse {
        state.diagnostics.length = 1;
        state.diagnostics.message = "this document contains an unexpected bottom of the stack";
        return error.BadState;
    };
}
/// Pop the top entry off the value stack and report which state the flow
/// parser should continue in.
///
/// The next state is chosen from the entry that is on top of the stack after
/// the pop: an inline list expects a list separator, an inline map expects a
/// map separator, and anything else (including an empty stack) means the flow
/// value is finished.
///
/// If the stack is already empty before popping, a diagnostic message is
/// recorded and `error.BadState` is returned.
inline fn popFlowStack(state: *State) !FlowParseState {
    _ = state.value_stack.popOrNull() orelse {
        state.diagnostics.length = 1;
        state.diagnostics.message = "this document contains an unexpected bottom of the stack";
        return error.BadState;
    };

    if (state.value_stack.getLastOrNull()) |enclosing| {
        switch (enclosing.*) {
            .inline_list => return .want_list_separator,
            .inline_map => return .want_map_separator,
            else => return .done,
        }
    }

    // Nothing is left on the stack, so there is no enclosing flow container.
    return .done;
}
/// Append `value` to `list` and return a pointer to the element that was
/// just stored (the last item of `list.items`).
///
/// NOTE(review): the returned pointer refers into `list.items`; presumably it
/// can be invalidated if the list reallocates on a later append — confirm
/// against the definition of `Value.List`.
inline fn appendListGetValue(list: *Value.List, value: Value) !*Value {
    try list.append(value);
    const stored = list.items;
    return &stored[stored.len - 1];
}
/// Store `value` under `key` in `map`, applying the duplicate key behavior
/// `dkb`. This is `putMapGetValue` with the resulting pointer discarded; any
/// error from it is propagated unchanged.
inline fn putMap(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) !void {
    const stored = try state.putMapGetValue(map, key, value, dkb);
    _ = stored;
}
/// Store `value` under `key` in `map` and return a pointer to the value that
/// is associated with `key` afterwards.
///
/// When `key` is already present, `dkb` selects the behavior:
///   - `.fail`: record a diagnostic message and return `error.DuplicateKey`.
///   - `.use_first`: leave the existing entry untouched; `value` is dropped.
///   - `.use_last`: remove the existing entry and insert `key` again with
///     `value`.
///
/// Errors from `map.getOrPut` are propagated.
inline fn putMapGetValue(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) !*Value {
    const gop = try map.getOrPut(key);
    if (!gop.found_existing) {
        gop.value_ptr.* = value;
        return gop.value_ptr;
    }

    switch (dkb) {
        .fail => {
            state.diagnostics.length = 1;
            state.diagnostics.message = "this document contains a duplicate key";
            return error.DuplicateKey;
        },
        // NOTE(review): the pointer returned here refers to the previously
        // stored value, not to `value`. Callers that go on to treat the
        // result as the container they just passed in should confirm this is
        // what they want.
        .use_first => return gop.value_ptr,
        .use_last => {
            _ = map.orderedRemove(key);
            map.putAssumeCapacityNoClobber(key, value);
            // `gop.value_ptr` was obtained before the entry was removed and
            // re-inserted, so it may no longer refer to the slot that holds
            // `key`. Look the entry up again instead of returning the stale
            // pointer.
            return map.getPtr(key).?;
        },
    }
}
};