diff --git a/src/parser.zig b/src/parser.zig index 16bab9d..1ab5028 100644 --- a/src/parser.zig +++ b/src/parser.zig @@ -3,7 +3,8 @@ const std = @import("std"); const buffers = @import("./linebuffer.zig"); const tokenizer = @import("./tokenizer.zig"); const State = @import("./parser/state.zig").State; -pub const Document = @import("./parser/state.zig").Document; +pub const Document = @import("./parser/value.zig").Document; +pub const Parsed = @import("./parser/value.zig").Parsed; pub const Value = @import("./parser/value.zig").Value; pub const Diagnostics = struct { @@ -40,9 +41,62 @@ pub const Options = struct { // resulting document root object. The default behavior is to emit an error if the // document is empty. default_object: enum { string, list, map, fail } = .fail, + + // Only used by the parseTo family of functions. + // If false, and a mapping contains additional keys that do not map to the fields of + // the corresponding object, an error will be raised. By default, additional keys + // will be skipped and no error will be raised. Note that tagged unions must be + // represented by a map with a single key, and having more than one key will always + // be an error, even if this option is set to true. + ignore_extra_fields: bool = true, + + // Only used by the parseTo family of functions. + // If true, if a struct field is an optional type and the corresponding mapping key + // does not exist, the object field will be set to `null`. By default, if the + // parsed document is missing a mapping key for a given field, an error will be + // raised instead. + treat_omitted_as_null: bool = false, + + // Only used by the parseTo family of functions. + // If true, strings may be coerced into other scalar types, like booleans or + // numbers. By default, only document scalar fields will attempt to coerce to + // non-string values. + coerce_strings: bool = false, + + // Only used by the parseTo family of functions. + // Two lists of strings. 
Truthy strings will be parsed to boolean true. Falsy + // strings will be parsed to boolean false. All other strings will raise an + // error. + boolean_strings: struct { truthy: []const []const u8, falsy: []const []const u8 } = .{ + .truthy = &.{ "true", "True", "yes", "on" }, + .falsy = &.{ "false", "False", "no", "off" }, + }, + + null_strings: []const []const u8 = &.{ "null", "nil", "None" }, + + // Only used by the parseTo family of functions. + // If true, document scalars that appear to be numbers will attempt to convert into + // enum values as an integer. By default, all enums in the document must be + // specified by name, not by numeric value. Note that conversion by name will always + // be tried first, even if this option is enabled, so if you're stupid enough to do: + // + // const Horrible = enum { + // @"1" = 0, + // @"0" = 1, + // }; + // + // then you deserve what you get. And what you'll get is confusing results. + // Also note that this option does not apply to tagged unions, despite those being + // backed by possibly ordered enums. 
/// Parse `buffer` as a document and convert its root value into a `T`.
///
/// This is `parseBuffer` followed by `Document.convertTo`. The options in the
/// "parseTo family" group of `Options` control how document values are mapped
/// onto `T`.
///
/// On success the returned `Parsed(T)` carries the arena that backed the
/// intermediate document; the caller releases everything with a single call to
/// `Parsed(T).deinit`. On failure nothing is left allocated.
///
/// Errors: anything `parseBuffer` can return, plus any conversion error raised
/// by `Document.convertTo`.
pub fn parseBufferTo(
    comptime T: type,
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Parsed(T) {
    var doc = try parseBuffer(allocator, buffer, diagnostics, options);
    // The arena is only handed over to the Parsed(T) result when conversion
    // succeeds. If conversion fails, the document still owns it and must be
    // released here, otherwise the whole parsed tree leaks.
    errdefer doc.deinit();
    return try doc.convertTo(T, options);
}
/// Convert this document value into a value of type `T`.
///
/// The conversion is selected at compile time from `@typeInfo(T)`:
///   - void: an empty `.scalar` (or an empty `.string` when
///     `options.coerce_strings` is set).
///   - bool: a scalar found in `options.boolean_strings.truthy` / `.falsy`.
///   - integers: parsed with `std.fmt.parseInt` using base 0.
///   - floats: parsed with `std.fmt.parseFloat`.
///   - slices: a list is converted element by element into a slice allocated
///     from `allocator`. When the child type is `u8`, a scalar or string is
///     returned as a view of this value's own bytes; it is not copied.
///   - single-item pointers: the pointee is converted into a fresh allocation.
///   - arrays: a list with exactly `len` items, or (for `u8` children) a
///     scalar/string of exactly `len` bytes.
///   - tuples: a list with one item per tuple field.
///   - structs: a map. Every field must have a key unless the field is
///     optional and `options.treat_omitted_as_null` is set. Keys that match no
///     field are an error only when `options.ignore_extra_fields` is false.
///   - enums: a scalar naming a tag, or, when `options.allow_numeric_enums` is
///     set, a base-10 integer tag value. Lookup by name is tried first.
///   - tagged unions: a map with exactly one key naming the active field.
///   - optionals: a scalar found in `options.null_strings` becomes `null`;
///     every other value is converted as the child type.
///
/// `.string` values are only accepted for void, bool, number and enum targets
/// when `options.coerce_strings` is set.
///
/// Structs, enums and unions that declare a `deserializeNice` function are
/// converted by calling `T.deserializeNice(self, allocator, options)` instead.
///
/// Returns `error.BadValue` when the value's shape or contents do not fit `T`.
/// Errors from `std.fmt.parseInt`, `std.fmt.parseFloat` and from `allocator`
/// are propagated unchanged. Unsupported target types are a compile error.
pub fn convertTo(self: Value, comptime T: type, allocator: std.mem.Allocator, options: Options) !T {
    switch (@typeInfo(T)) {
        .Void => {
            const is_empty = switch (self) {
                .scalar => |str| str.items.len == 0,
                .string => |str| options.coerce_strings and str.items.len == 0,
                else => false,
            };
            if (!is_empty) return error.BadValue;
            return {};
        },
        .Bool => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    for (options.boolean_strings.truthy) |check|
                        if (std.mem.eql(u8, str.items, check)) return true;
                    for (options.boolean_strings.falsy) |check|
                        if (std.mem.eql(u8, str.items, check)) return false;

                    return error.BadValue;
                },
                else => return error.BadValue,
            }
        },
        .Int, .ComptimeInt => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // base 0: parseInt picks the radix from a 0x/0o/0b prefix
                    return try std.fmt.parseInt(T, str.items, 0);
                },
                else => return error.BadValue,
            }
        },
        .Float, .ComptimeFloat => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // parseFloat takes only the type and the text; it has no
                    // radix parameter.
                    return try std.fmt.parseFloat(T, str.items);
                },
                else => return error.BadValue,
            }
        },
        .Pointer => |ptr| switch (ptr.size) {
            .Slice => {
                // TODO: There is ambiguity here because a document expecting a list
                //       of u8 could parse a string instead. Introduce a special
                //       type to use for this? the problem is that it becomes
                //       invasive into downstream code. Ultimately this should
                //       probably be solved in the zig stdlib or similar.
                // TODO: This also doesn't handle sentinels properly.
                switch (self) {
                    // not copied: the result aliases this value's own buffer
                    .scalar, .string => |str| return if (ptr.child == u8) str.items else error.BadValue,
                    .list, .flow_list => |lst| {
                        const result = try allocator.alloc(ptr.child, lst.items.len);
                        errdefer allocator.free(result);
                        for (result, lst.items) |*slot, item| {
                            slot.* = try item.convertTo(ptr.child, allocator, options);
                        }
                        return result;
                    },
                    else => return error.BadValue,
                }
            },
            .One => {
                const result = try allocator.create(ptr.child);
                errdefer allocator.destroy(result);
                result.* = try self.convertTo(ptr.child, allocator, options);
                return result;
            },
            // do not support many or C item pointers.
            else => @compileError("Cannot deserialize into many-pointer or c-pointer " ++ @typeName(T)),
        },
        .Array => |arr| {
            // TODO: There is ambiguity here because a document expecting a list
            //       of u8 could parse a string instead. Introduce a special
            //       type to use for this? the problem is that it becomes
            //       invasive into downstream code. Ultimately this should
            //       probably be solved in the zig stdlib or similar.
            // TODO: This also doesn't handle sentinels properly.
            switch (self) {
                .scalar, .string => |str| {
                    if (arr.child == u8 and str.items.len == arr.len) {
                        var result: T = undefined;
                        @memcpy(&result, str.items);
                        return result;
                    } else return error.BadValue;
                },
                .list, .flow_list => |lst| {
                    // A fixed-size array needs exactly `len` items: more would
                    // write past the end, fewer would leave undefined elements.
                    if (lst.items.len != arr.len) return error.BadValue;

                    // this may result in a big stack allocation, which is not ideal
                    var result: T = undefined;
                    for (lst.items, 0..) |item, idx| {
                        result[idx] = try item.convertTo(arr.child, allocator, options);
                    }
                    return result;
                },
                else => return error.BadValue,
            }
        },
        .Struct => |stt| {
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);

            if (stt.is_tuple) {
                switch (self) {
                    .list, .flow_list => |list| {
                        if (list.items.len != stt.fields.len) return error.BadValue;
                        var result: T = undefined;
                        inline for (stt.fields, 0..) |field, idx| {
                            result[idx] = try list.items[idx].convertTo(field.type, allocator, options);
                        }
                        return result;
                    },
                    else => return error.BadValue,
                }
            }

            switch (self) {
                .map, .flow_map => |map| {
                    var result: T = undefined;
                    // Number of struct fields that were found in the map. Map
                    // keys and field names are each unique, so any difference
                    // from map.count() is exactly the number of keys that
                    // correspond to no field.
                    var matched: usize = 0;

                    inline for (stt.fields) |field| {
                        if (map.get(field.name)) |value| {
                            @field(result, field.name) = try value.convertTo(field.type, allocator, options);
                            matched += 1;
                        } else if (options.treat_omitted_as_null and @typeInfo(field.type) == .Optional) {
                            @field(result, field.name) = null;
                        } else return error.BadValue;
                    }

                    // there were extra fields in the data
                    if (!options.ignore_extra_fields and matched != map.count()) return error.BadValue;

                    return result;
                },
                else => return error.BadValue,
            }
        },
        .Enum => {
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);

            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // by-name lookup always wins over numeric conversion
                    if (std.meta.stringToEnum(T, str.items)) |value| return value;
                    if (options.allow_numeric_enums) {
                        const parsed = std.fmt.parseInt(@typeInfo(T).Enum.tag_type, str.items, 10) catch
                            return error.BadValue;
                        return std.meta.intToEnum(T, parsed) catch error.BadValue;
                    }
                    return error.BadValue;
                },
                else => return error.BadValue,
            }
        },
        .Union => |unn| {
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);

            if (unn.tag_type == null) @compileError("Cannot deserialize into untagged union " ++ @typeName(T));

            switch (self) {
                .map, .flow_map => |map| {
                    // a union may not ever be deserialized from a map with more than one value
                    if (map.count() != 1) return error.BadValue;
                    const key = map.keys()[0];
                    const payload = map.values()[0];
                    inline for (unn.fields) |field| {
                        if (std.mem.eql(u8, key, field.name))
                            return @unionInit(T, field.name, try payload.convertTo(field.type, allocator, options));
                    }
                    return error.BadValue;
                },
                // TODO: if the field is a 0 width type like void, we could parse it
                //       directly from a scalar/string value (i.e. a name with no
                //       corresponding value)
                else => return error.BadValue,
            }
        },
        .Optional => |opt| {
            // Only text that is allowed to coerce can spell "null": scalars
            // always, `.string` values only when coerce_strings is enabled.
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .scalar or options.coerce_strings) {
                        for (options.null_strings) |check|
                            if (std.mem.eql(u8, str.items, check)) return null;
                    }
                },
                else => {},
            }
            // Everything else (including lists and maps) is converted as the
            // child type, which applies its own shape and coercion rules.
            return try self.convertTo(opt.child, allocator, options);
        },
        else => @compileError("Cannot deserialize into unsupported type " ++ @typeName(T)),
    }
}