nice-data/src/parser/value.zig

413 lines
18 KiB
Zig
Raw Normal View History

const std = @import("std");
const Options = @import("../parser.zig").Options;
pub const Document = struct {
arena: std.heap.ArenaAllocator,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{
.arena = std.heap.ArenaAllocator.init(alloc),
.root = undefined,
};
}
pub fn convertTo(self: *Document, comptime T: type, options: Options) !Parsed(T) {
return .{
.value = try self.root.convertTo(T, self.arena.allocator(), options),
.arena = self.arena,
};
}
pub fn printDebug(self: Document) void {
return self.root.printDebug();
}
pub fn deinit(self: Document) void {
self.arena.deinit();
}
};
pub fn Parsed(comptime T: type) type {
return struct {
value: T,
arena: std.heap.ArenaAllocator,
pub fn deinit(self: @This()) void {
self.arena.deinit();
}
};
}
pub const Value = union(enum) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
pub const String = []const u8;
pub const Map = std.StringArrayHashMap(Value);
pub const List = std.ArrayList(Value);
pub const TagType = @typeInfo(Value).Union.tag_type.?;
scalar: String,
string: String,
list: List,
inline_list: List,
map: Map,
inline_map: Map,
pub fn convertTo(self: Value, comptime T: type, allocator: std.mem.Allocator, options: Options) !T {
switch (@typeInfo(T)) {
.Void => {
switch (self) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
.scalar => |str| return if (str.len == 0) void{} else error.BadValue,
.string => |str| return if (options.coerce_strings and str.len == 0) void{} else error.BadValue,
else => return error.BadValue,
}
},
.Bool => {
switch (self) {
inline .scalar, .string => |str, tag| {
if (tag == .string and !options.coerce_strings) return error.BadValue;
if (options.case_insensitive_scalar_coersion) {
for (options.boolean_strings.truthy) |check|
if (std.ascii.eqlIgnoreCase(str, check)) return true;
for (options.boolean_strings.falsy) |check|
if (std.ascii.eqlIgnoreCase(str, check)) return false;
} else {
for (options.boolean_strings.truthy) |check|
if (std.mem.eql(u8, str, check)) return true;
for (options.boolean_strings.falsy) |check|
if (std.mem.eql(u8, str, check)) return false;
}
return error.BadValue;
},
else => return error.BadValue,
}
},
.Int, .ComptimeInt => {
switch (self) {
inline .scalar, .string => |str, tag| {
if (tag == .string and !options.coerce_strings) return error.BadValue;
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
return try std.fmt.parseInt(T, str, 0);
},
else => return error.BadValue,
}
},
.Float, .ComptimeFloat => {
switch (self) {
inline .scalar, .string => |str, tag| {
if (tag == .string and !options.coerce_strings) return error.BadValue;
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
return try std.fmt.parseFloat(T, str, 0);
},
else => return error.BadValue,
}
},
.Pointer => |ptr| switch (ptr.size) {
.Slice => {
// TODO: There is ambiguity here because a document expecting a list
// of u8 could parse a string instead. Introduce a special
// type to use for this? the problem is that it becomes
// invasive into downstream code. Ultimately this should
// probably be solved in the zig stdlib or similar.
// TODO: This also doesn't handle sentinels properly.
switch (self) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
.scalar, .string => |str| return if (ptr.child == u8) str else error.BadValue,
.list, .inline_list => |lst| {
var result = try std.ArrayList(ptr.child).initCapacity(allocator, lst.items.len);
errdefer result.deinit();
for (lst.items) |item| {
result.appendAssumeCapacity(try item.convertTo(ptr.child, allocator, options));
}
return result.toOwnedSlice();
},
else => return error.BadValue,
}
},
.One => {
const result = try allocator.create(ptr.child);
errdefer allocator.destroy(result);
result.* = try self.convertTo(ptr.child, allocator, options);
return result;
},
else => @compileError("Cannot deserialize into many-pointer or c-pointer " ++ @typeName(T)), // do not support many or C item pointers.
},
.Array => |arr| {
// TODO: There is ambiguity here because a document expecting a list
// of u8 could parse a string instead. Introduce a special
// type to use for this? the problem is that it becomes
// invasive into downstream code. Ultimately this should
// probably be solved in the zig stdlib or similar.
// TODO: This also doesn't handle sentinels properly.
switch (self) {
.scalar, .string => |str| {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
if (arr.child == u8 and str.len == arr.len) {
var result: T = undefined;
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
@memcpy(&result, str);
return result;
} else return error.BadValue;
},
.list, .inline_list => |lst| {
var storage = try std.ArrayList(arr.child).initCapacity(allocator, arr.len);
defer storage.deinit();
for (lst.items) |item| {
storage.appendAssumeCapacity(try item.convertTo(arr.child, allocator, options));
}
// this may result in a big stack allocation, which is not ideal
var result: T = undefined;
@memcpy(&result, storage.items);
return result;
},
else => return error.BadValue,
}
},
.Struct => |stt| {
if (comptime std.meta.trait.hasFn("deserializeNice")(T))
return T.deserializeNice(self, allocator, options);
if (stt.is_tuple) {
switch (self) {
.list, .inline_list => |list| {
if (list.items.len != stt.fields.len) return error.BadValue;
var result: T = undefined;
inline for (stt.fields, 0..) |field, idx| {
result[idx] = try list.items[idx].convertTo(field.type, allocator, options);
}
return result;
},
else => return error.BadValue,
}
}
switch (self) {
.map, .inline_map => |map| {
var result: T = undefined;
if (options.ignore_extra_fields) {
inline for (stt.fields) |field| {
if (map.get(field.name)) |value| {
@field(result, field.name) = try value.convertTo(field.type, allocator, options);
} else if (options.treat_omitted_as_null and @typeInfo(field.type) == .Optional) {
@field(result, field.name) = null;
} else {
return error.BadValue;
}
}
} else {
// we could iterate over each map key and do an exhaustive
// comparison with each struct field name. This would save
// memory and it would probably be a fair amount faster for
// small structs.
var clone = try map.clone();
defer clone.deinit();
inline for (stt.fields) |field| {
if (clone.fetchSwapRemove(field.name)) |kv| {
@field(result, field.name) = try kv.value.convertTo(field.type, allocator, options);
} else if (options.treat_omitted_as_null and @typeInfo(field.type) == .Optional) {
@field(result, field.name) = null;
} else return error.BadValue;
}
// there were extra fields in the data
if (clone.count() > 0) return error.BadValue;
}
return result;
},
else => return error.BadValue,
}
},
.Enum => {
if (comptime std.meta.trait.hasFn("deserializeNice")(T))
return T.deserializeNice(self, allocator, options);
switch (self) {
inline .scalar, .string => |str, tag| {
if (tag == .string and !options.coerce_strings) return error.BadValue;
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
if (std.meta.stringToEnum(T, str)) |value| return value;
if (options.allow_numeric_enums) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
const parsed = std.fmt.parseInt(@typeInfo(T).Enum.tag_type, str, 10) catch
return error.BadValue;
return std.meta.intToEnum(T, parsed) catch error.BadValue;
}
return error.BadValue;
},
else => return error.BadValue,
}
},
.Union => |unn| {
if (comptime std.meta.trait.hasFn("deserializeNice")(T))
return T.deserializeNice(self, allocator, options);
if (unn.tag_type == null) @compileError("Cannot deserialize into untagged union " ++ @typeName(T));
switch (self) {
.map, .inline_map => |map| {
// a union may not ever be deserialized from a map with more than one value
if (map.count() != 1) return error.BadValue;
const key = map.keys()[0];
inline for (unn.fields) |field| {
if (std.mem.eql(u8, key, field.name))
return @unionInit(T, field.name, try map.get(key).?.convertTo(field.type, allocator, options));
}
return error.BadValue;
},
// TODO: if the field is a 0 width type like void, we could parse it
// directly from a scalar/string value (i.e. a name with no
// corresponding value)
else => return error.BadValue,
}
},
.Optional => |opt| {
switch (self) {
inline .scalar, .string => |str, tag| {
if (tag == .string and !options.coerce_strings) return error.BadValue;
if (options.case_insensitive_scalar_coersion) {
for (options.null_strings) |check|
if (std.ascii.eqlIgnoreCase(str, check)) return null;
} else {
for (options.null_strings) |check|
if (std.mem.eql(u8, str, check)) return null;
}
return try self.convertTo(opt.child, allocator, options);
},
else => return error.BadValue,
}
},
else => @compileError("Cannot deserialize into unsupported type " ++ @typeName(T)),
}
}
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .scalar, input);
}
pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .string, input);
}
inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
return @unionInit(Value, @tagName(classification), try alloc.dupe(u8, input));
}
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
pub inline fn emptyScalar() Value {
return .{ .scalar = "" };
}
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
pub inline fn emptyString() Value {
return .{ .string = "" };
}
pub inline fn newList(alloc: std.mem.Allocator) Value {
return .{ .list = List.init(alloc) };
}
pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
return .{ .inline_list = List.init(alloc) };
}
pub inline fn newMap(alloc: std.mem.Allocator) Value {
return .{ .map = Map.init(alloc) };
}
pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
return .{ .inline_map = Map.init(alloc) };
}
pub fn recursiveEqualsExact(self: Value, other: Value) bool {
if (@as(TagType, self) != other) return false;
switch (self) {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
inline .scalar, .string => |str, tag| return std.mem.eql(u8, str, @field(other, @tagName(tag))),
inline .list, .inline_list => |lst, tag| {
const olst = @field(other, @tagName(tag));
if (lst.items.len != olst.items.len) return false;
for (lst.items, olst.items) |this, that| if (!this.recursiveEqualsExact(that)) return false;
return true;
},
inline .map, .inline_map => |map, tag| {
const omap = @field(other, @tagName(tag));
if (map.count() != omap.count()) return false;
var iter = map.iterator();
var oiter = omap.iterator();
// this loop structure enforces that the maps are in the same order
while (iter.next()) |this| {
const that = oiter.next() orelse return false;
if (!std.mem.eql(u8, this.key_ptr.*, that.key_ptr.*) or !this.value_ptr.recursiveEqualsExact(that.value_ptr.*)) return false;
}
// the maps are equal if we have also consumed all of the values from
// other.
return oiter.next() == null;
},
}
}
pub fn printDebug(self: Value) void {
self.printRecursive(0);
std.debug.print("\n", .{});
}
fn printRecursive(self: Value, indent: usize) void {
switch (self) {
.scalar, .string => |str| {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
if (std.mem.indexOfScalar(u8, str, '\n')) |_| {
var lines = std.mem.splitScalar(u8, str, '\n');
std.debug.print("\n", .{});
while (lines.next()) |line| {
std.debug.print(
"{[empty]s: >[indent]}{[line]s}{[nl]s}",
.{
.empty = "",
.indent = indent,
.line = line,
.nl = if (lines.peek() == null) "" else "\n",
},
);
}
} else {
parser: change string and | semantics and expose slices in Value The way I implemented these changes ended up being directly coupled and I am not interested in trying to decouple them, so instead here's a single commit that makes changes to both the API and the format. Let's go over these. | now acts as a direct concatenation operator, rather than concatenating with a space. This is because the format allows the specification of a trailing space (by using | to fence the string just before the newline). So it's now possible to spread a long string without spaces over multiple lines, which couldn't be done before. This does have the downside that the common pattern of concatenating strings with a space now requires some extra trailing line noise. I may introduce a THIRD type of concatenating string (thinking of using + as the prefix) because I am a jerk. We will see. The way multi-line strings are concatenated has changed. Partially this has to do with increasing the simplicity of the aforementioned implementation change (the parser forgets the string type from the tokenizer. This worked before because there would always be a trailing character that could be popped off. But since one type now appends no character, this would have to be tracked through the parsing to determine if a character would need to be popped at the end). But I was also not terribly satisfied with the semantics of multiline strings before. I wrote several words about this in 429734e6e813b225654aa71c283f4a8b4444609f, where I reached the opposite conclusion from what is implemented in this commit. Basically, when different types of string concatenation are mixed, the results may be surprising. The previous approach would append the line terminator at the end of the line specified. The new approach prepends the line terminator at the beginning of the line specified. Since the specifier character is at the beginning of the line, I feel like this reads a little better simply due to the colocation of information. As an example: > first | second > third Would previously have resulted in "first\nsecondthird" but it will now result in "firstsecond\nthird". The only mildly baffling part about this is that the string signifier on the first line has absolutely no impact on the string. In the old design, it was the last line that had no impact. Finally, this commit also changes Value so that it uses []const u8 slices directly to store strings instead of ArrayLists. This is because everything downstream of the value was just reaching into string.items to access the slice directly, so cut out the middleman. It was unintuitive to access a field named .string and get an arraylist rather than a slice, anyway.
2023-10-08 16:57:52 -07:00
std.debug.print("{s}", .{str});
}
},
.list, .inline_list => |list| {
if (list.items.len == 0) {
std.debug.print("[]", .{});
return;
}
std.debug.print("[\n", .{});
for (list.items, 0..) |value, idx| {
std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx });
value.printRecursive(indent + 2);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}]",
.{ .empty = "", .indent = indent },
);
},
.map, .inline_map => |map| {
if (map.count() == 0) {
std.debug.print("{{}}", .{});
return;
}
std.debug.print("{{\n", .{});
var iter = map.iterator();
while (iter.next()) |entry| {
std.debug.print(
"{[empty]s: >[indent]}{[key]s}: ",
.{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
);
entry.value_ptr.printRecursive(indent + 4);
std.debug.print(",\n", .{});
}
std.debug.print(
"{[empty]s: >[indent]}}}",
.{ .empty = "", .indent = indent },
);
},
}
}
};