Compare commits

...

3 Commits

Author SHA1 Message Date
1683197bc0
state: parse whitespace in flow objects a bit differently
There were (and probably still are) some weird and ugly edge cases
here. For example, `[ 1 ]` would parse to a list of `1 `. This
implementation allows a single space to precede the closing ] and
errors out if there is more than one. Additionally, it rejects any
spaces before the item separator comma. This also applies to flow
maps, with the addition that they do not permit whitespace before `:`
now, either.

Leading spaces are still consumed with reckless abandon, so, for
example, `[   lopsided]` is valid. There is also some state sloppiness
flying around so `[   val,    ]` probably currently works as well.
Tightening up the handling of leading whitespace will be a bigger
restructuring that may involve state machine changes. I'll have to
think about it.
2023-10-03 23:25:58 -07:00
c5e8921eb2
state: use inferred error sets
As far as I can tell, the only reason ever not to use an inferred error
set is when you would get a dependency loop otherwise.
2023-10-03 23:19:01 -07:00
34ec58e0d2
value: implement parsing to objects
There are still some untested codepaths here, but this does seem to
work for nontrivial objects, so, woohoo. It's worth noting that this
is a recursive implementation (which seems silly after I hand-rolled
the non-recursive main parser). The thinking is that if you have a
deeply-enough nested object that you run out of stack space here, you
probably shouldn't be converting it directly to an object.

I may revisit this, though I am still not 100% certain how
straightforward it would be to make this nonrecursive with all the
weird comptime objects. Basically the "parse stack" would have to be
created at comptime.
2023-10-03 23:17:37 -07:00
3 changed files with 384 additions and 35 deletions

View File

@ -3,7 +3,8 @@ const std = @import("std");
const buffers = @import("./linebuffer.zig");
const tokenizer = @import("./tokenizer.zig");
const State = @import("./parser/state.zig").State;
pub const Document = @import("./parser/state.zig").Document;
pub const Document = @import("./parser/value.zig").Document;
pub const Parsed = @import("./parser/value.zig").Parsed;
pub const Value = @import("./parser/value.zig").Value;
pub const Diagnostics = struct {
@ -40,9 +41,62 @@ pub const Options = struct {
// resulting document root object. The default behavior is to emit an error if the
// document is empty.
default_object: enum { string, list, map, fail } = .fail,
// Only used by the parseTo family of functions.
// If false, and a mapping contains additional keys that do not map to the fields of
// the corresponding object, an error will be raised. By default, additional keys
// will be skipped and no error will be raised. Note that tagged unions must be
// represented by a map with a single key, and having more than one key will always
// be an error, even if this option is set to true.
ignore_extra_fields: bool = true,
// Only used by the parseTo family of functions.
// If true, if a struct field is an optional type and the corresponding mapping key
// does not exist, the object field will be set to `null`. By default, if the
// parsed document is missing a mapping key for a given field, an error will be
// raised instead.
treat_omitted_as_null: bool = false,
// Only used by the parseTo family of functions.
// If true, strings may be coerced into other scalar types, like booleans or
// numbers. By default, only document scalar fields will attempt to coerce to
// non-string values.
coerce_strings: bool = false,
// Only used by the parseTo family of functions.
// Two lists of strings. Truthy strings will be parsed to boolean true. Falsy
// strings will be parsed to boolean false. All other strings will raise an
// error.
boolean_strings: struct { truthy: []const []const u8, falsy: []const []const u8 } = .{
.truthy = &.{ "true", "True", "yes", "on" },
.falsy = &.{ "false", "False", "no", "off" },
},
null_strings: []const []const u8 = &.{ "null", "nil", "None" },
// Only used by the parseTo family of functions.
// If true, document scalars that appear to be numbers will attempt to convert into
// enum values as an integer. By default, all enums in the document must be
// specified by name, not by numeric value. Note that conversion by name will always
// be tried first, even if this option is enabled, so if you're stupid enough to do:
//
// const Horrible = enum {
// @"1" = 0,
// @"0" = 1,
// };
//
// then you deserve what you get. And what you'll get is confusing results.
// Also note that this option does not apply to tagged unions, despite those being
// backed by possibly ordered enums.
allow_numeric_enums: bool = false,
};
pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics: *Diagnostics, options: Options) !Document {
pub fn parseBuffer(
allocator: std.mem.Allocator,
buffer: []const u8,
diagnostics: *Diagnostics,
options: Options,
) !Document {
var state = State.init(allocator, diagnostics);
defer state.deinit();
errdefer state.document.deinit();
@ -59,6 +113,17 @@ pub fn parseBuffer(allocator: std.mem.Allocator, buffer: []const u8, diagnostics
return try state.finish(options);
}
/// Parse `buffer` into a document and convert its root value into a `T`.
/// On success the returned `Parsed(T)` owns the backing arena; free it with
/// `Parsed(T).deinit`. On conversion failure the intermediate document's
/// arena is released here so it does not leak.
pub fn parseBufferTo(
    comptime T: type,
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Parsed(T) {
    var doc = try parseBuffer(allocator, buffer, diagnostics, options);
    // convertTo moves the arena into the Parsed result only on success;
    // without this errdefer, a conversion error would leak the arena.
    errdefer doc.deinit();
    return try doc.convertTo(T, options);
}
pub const StreamParser = struct {
linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
parse_state: State,

View File

@ -5,28 +5,9 @@ const Error = @import("../parser.zig").Error;
const DuplicateKeyBehavior = @import("../parser.zig").DuplicateKeyBehavior;
const Options = @import("../parser.zig").Options;
const Diagnostics = @import("../parser.zig").Diagnostics;
const Document = @import("./value.zig").Document;
const Value = @import("./value.zig").Value;
pub const Document = struct {
arena: std.heap.ArenaAllocator,
root: Value,
pub fn init(alloc: std.mem.Allocator) Document {
return .{
.arena = std.heap.ArenaAllocator.init(alloc),
.root = undefined,
};
}
pub fn printDebug(self: Document) void {
return self.root.printDebug();
}
pub fn deinit(self: Document) void {
self.arena.deinit();
}
};
const FlowParseState = enum {
want_list_item,
consuming_list_item,
@ -61,7 +42,7 @@ pub const State = struct {
self.value_stack.deinit();
}
pub fn finish(state: *State, options: Options) Error!Document {
pub fn finish(state: *State, options: Options) !Document {
const arena_alloc = state.document.arena.allocator();
switch (state.mode) {
@ -95,7 +76,7 @@ pub const State = struct {
return state.document;
}
pub fn parseLine(state: *State, line: tokenizer.Line, dkb: DuplicateKeyBehavior) Error!void {
pub fn parseLine(state: *State, line: tokenizer.Line, dkb: DuplicateKeyBehavior) !void {
if (line.contents == .comment) return;
// this gives us a second loop when the stack tip changes (i.e. during dedent or
@ -444,7 +425,7 @@ pub const State = struct {
contents: []const u8,
root_type: Value.TagType,
dkb: DuplicateKeyBehavior,
) Error!Value {
) !Value {
const arena_alloc = state.document.arena.allocator();
var root: Value = switch (root_type) {
@ -462,7 +443,7 @@ pub const State = struct {
else => unreachable,
};
// used to distinguish betwen [] and [ ], and it also tracks
// used to distinguish between [] and [ ], and it also tracks
// a continuous value between different states
var item_start: usize = 0;
var dangling_key: ?[]const u8 = null;
@ -474,6 +455,7 @@ pub const State = struct {
' ', '\t' => continue :charloop,
',' => {
// empty value
// don't check for whitespace here: [ , ] is okay, as is [ , , ]
const tip = try state.getStackTip();
try tip.flow_list.append(Value.newScalar(arena_alloc));
item_start = idx + 1;
@ -518,9 +500,18 @@ pub const State = struct {
},
},
.consuming_list_item => switch (char) {
// consider: detecting trailing whitespace. "[ 1 ]" should
// produce "1" and not "1 " as it currently does, which breaks
// the principle of least astonishment. design: no trailing
// whitespace before "," and only a single space is allowed before "]"
',' => {
const tip = try state.getStackTip();
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains whitespace before ,";
return error.TrailingWhitespace;
}
const tip = try state.getStackTip();
try tip.flow_list.append(
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
);
@ -529,13 +520,23 @@ pub const State = struct {
pstate = .want_list_item;
},
']' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list contains extra whitespace before ]";
return error.TrailingWhitespace;
}
end = idx - 1;
}
const finished = state.value_stack.getLastOrNull() orelse {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow list was closed too many times";
return error.BadState;
};
try finished.flow_list.append(
try Value.fromScalar(arena_alloc, contents[item_start..idx]),
try Value.fromScalar(arena_alloc, contents[item_start..end]),
);
pstate = try state.popFlowStack();
},
@ -577,6 +578,11 @@ pub const State = struct {
},
.consuming_map_key => switch (char) {
':' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before :";
return error.TrailingWhitespace;
}
dangling_key = try arena_alloc.dupe(u8, contents[item_start..idx]);
pstate = .want_map_value;
},
@ -644,7 +650,12 @@ pub const State = struct {
},
},
.consuming_map_value => switch (char) {
',', '}' => |term| {
',' => {
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains whitespace before ,";
return error.TrailingWhitespace;
}
const tip = try state.getStackTip();
try state.putMap(
&tip.flow_map,
@ -654,7 +665,27 @@ pub const State = struct {
);
dangling_key = null;
pstate = .want_map_key;
if (term == '}') pstate = try state.popFlowStack();
},
'}' => {
var end = idx;
if (contents[idx - 1] == ' ' or contents[idx - 1] == '\t') {
if (idx > 1 and (contents[idx - 2] == ' ' or contents[idx - 2] == '\t')) {
state.diagnostics.length = 1;
state.diagnostics.message = "the flow map contains extra whitespace before }";
return error.TrailingWhitespace;
}
end = idx - 1;
}
const tip = try state.getStackTip();
try state.putMap(
&tip.flow_map,
dangling_key.?,
try Value.fromScalar(arena_alloc, contents[item_start..end]),
dkb,
);
dangling_key = null;
pstate = try state.popFlowStack();
},
else => continue :charloop,
},
@ -687,7 +718,7 @@ pub const State = struct {
return root;
}
inline fn getStackTip(state: State) Error!*Value {
inline fn getStackTip(state: State) !*Value {
if (state.value_stack.items.len == 0) return {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains an unexpected bottom of the stack";
@ -696,7 +727,7 @@ pub const State = struct {
return state.value_stack.items[state.value_stack.items.len - 1];
}
inline fn popFlowStack(state: *State) Error!FlowParseState {
inline fn popFlowStack(state: *State) !FlowParseState {
if (state.value_stack.popOrNull() == null) {
state.diagnostics.length = 1;
state.diagnostics.message = "this document contains an unexpected bottom of the stack";
@ -711,16 +742,16 @@ pub const State = struct {
};
}
inline fn appendListGetValue(list: *Value.List, value: Value) Error!*Value {
inline fn appendListGetValue(list: *Value.List, value: Value) !*Value {
try list.append(value);
return &list.items[list.items.len - 1];
}
inline fn putMap(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) Error!void {
inline fn putMap(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) !void {
_ = try state.putMapGetValue(map, key, value, dkb);
}
inline fn putMapGetValue(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) Error!*Value {
inline fn putMapGetValue(state: *State, map: *Value.Map, key: []const u8, value: Value, dkb: DuplicateKeyBehavior) !*Value {
const gop = try map.getOrPut(key);
if (gop.found_existing)

View File

@ -1,5 +1,45 @@
const std = @import("std");
const Options = @import("../parser.zig").Options;
/// A parsed document: an arena holding all parsed memory plus the root value.
pub const Document = struct {
    arena: std.heap.ArenaAllocator,
    root: Value,

    /// Create a document backed by a fresh arena. `root` is left undefined
    /// and is expected to be filled in by the parser before use.
    pub fn init(allocator: std.mem.Allocator) Document {
        return .{
            .arena = std.heap.ArenaAllocator.init(allocator),
            .root = undefined,
        };
    }

    /// Convert the root value into a `T`. Ownership of the arena is copied
    /// into the returned `Parsed(T)`; after a successful call, free via the
    /// returned value's `deinit` rather than this document's.
    pub fn convertTo(self: *Document, comptime T: type, options: Options) !Parsed(T) {
        const converted = try self.root.convertTo(T, self.arena.allocator(), options);
        return .{
            .value = converted,
            .arena = self.arena,
        };
    }

    /// Dump the document tree to stderr for debugging.
    pub fn printDebug(self: Document) void {
        self.root.printDebug();
    }

    /// Release all memory owned by this document.
    pub fn deinit(self: Document) void {
        self.arena.deinit();
    }
};
/// The result of converting a document into a `T`: the converted value plus
/// the arena that owns any memory the value still references. Call `deinit`
/// to free everything at once.
pub fn Parsed(comptime T: type) type {
    return struct {
        const Self = @This();

        value: T,
        arena: std.heap.ArenaAllocator,

        /// Free the arena backing `value`. `value` must not be used afterward.
        pub fn deinit(self: Self) void {
            self.arena.deinit();
        }
    };
}
pub const Value = union(enum) {
pub const String = std.ArrayList(u8);
pub const Map = std.StringArrayHashMap(Value);
@ -13,6 +53,219 @@ pub const Value = union(enum) {
map: Map,
flow_map: Map,
/// Convert this parsed value into a concrete `T`, recursively descending
/// into lists, maps, structs, unions, etc. All allocations (slices, single
/// pointers) are made with `allocator`, which is expected to be the
/// document's arena so everything is freed together.
///
/// Returns `error.BadValue` when the document shape does not match `T`.
/// Note: this is a recursive implementation; deeply nested documents consume
/// stack proportionally to their depth.
pub fn convertTo(self: Value, comptime T: type, allocator: std.mem.Allocator, options: Options) !T {
    switch (@typeInfo(T)) {
        .Void => {
            switch (self) {
                // void is represented by an empty scalar
                .scalar => |str| return if (str.items.len == 0) void{} else error.BadValue,
                .string => |str| return if (options.coerce_strings and str.items.len == 0) void{} else error.BadValue,
                else => return error.BadValue,
            }
        },
        .Bool => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    for (options.boolean_strings.truthy) |check|
                        if (std.mem.eql(u8, str.items, check)) return true;
                    for (options.boolean_strings.falsy) |check|
                        if (std.mem.eql(u8, str.items, check)) return false;
                    return error.BadValue;
                },
                else => return error.BadValue,
            }
        },
        .Int, .ComptimeInt => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // base 0 lets parseInt accept 0x/0o/0b prefixes as well
                    // as plain decimal
                    return try std.fmt.parseInt(T, str.items, 0);
                },
                else => return error.BadValue,
            }
        },
        .Float, .ComptimeFloat => {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // parseFloat takes no radix argument, unlike parseInt
                    return try std.fmt.parseFloat(T, str.items);
                },
                else => return error.BadValue,
            }
        },
        .Pointer => |ptr| switch (ptr.size) {
            .Slice => {
                // TODO: There is ambiguity here because a document expecting a list
                //       of u8 could parse a string instead. Introduce a special
                //       type to use for this? the problem is that it becomes
                //       invasive into downstream code. Ultimately this should
                //       probably be solved in the zig stdlib or similar.
                // TODO: This also doesn't handle sentinels properly.
                switch (self) {
                    .scalar, .string => |str| return if (ptr.child == u8) str.items else error.BadValue,
                    .list, .flow_list => |lst| {
                        var result = try std.ArrayList(ptr.child).initCapacity(allocator, lst.items.len);
                        errdefer result.deinit();
                        for (lst.items) |item| {
                            result.appendAssumeCapacity(try item.convertTo(ptr.child, allocator, options));
                        }
                        return result.toOwnedSlice();
                    },
                    else => return error.BadValue,
                }
            },
            .One => {
                const result = try allocator.create(ptr.child);
                errdefer allocator.destroy(result);
                result.* = try self.convertTo(ptr.child, allocator, options);
                return result;
            },
            else => @compileError("Cannot deserialize into many-pointer or c-pointer " ++ @typeName(T)), // do not support many or C item pointers.
        },
        .Array => |arr| {
            // TODO: There is ambiguity here because a document expecting a list
            //       of u8 could parse a string instead. Introduce a special
            //       type to use for this? the problem is that it becomes
            //       invasive into downstream code. Ultimately this should
            //       probably be solved in the zig stdlib or similar.
            // TODO: This also doesn't handle sentinels properly.
            switch (self) {
                .scalar, .string => |str| {
                    // a string may fill a u8 array only if the lengths match
                    // exactly (no truncation, no padding)
                    if (arr.child == u8 and str.items.len == arr.len) {
                        var result: T = undefined;
                        @memcpy(&result, str.items);
                        return result;
                    } else return error.BadValue;
                },
                .list, .flow_list => |lst| {
                    var storage = try std.ArrayList(arr.child).initCapacity(allocator, arr.len);
                    defer storage.deinit();
                    for (lst.items) |item| {
                        storage.appendAssumeCapacity(try item.convertTo(arr.child, allocator, options));
                    }
                    // this may result in a big stack allocation, which is not ideal
                    var result: T = undefined;
                    @memcpy(&result, storage.items);
                    return result;
                },
                else => return error.BadValue,
            }
        },
        .Struct => |stt| {
            // a type may opt out of the default conversion entirely
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);

            if (stt.is_tuple) {
                switch (self) {
                    .list, .flow_list => |list| {
                        if (list.items.len != stt.fields.len) return error.BadValue;
                        var result: T = undefined;
                        inline for (stt.fields, 0..) |field, idx| {
                            result[idx] = try list.items[idx].convertTo(field.type, allocator, options);
                        }
                        return result;
                    },
                    else => return error.BadValue,
                }
            }

            switch (self) {
                .map, .flow_map => |map| {
                    var result: T = undefined;

                    if (options.ignore_extra_fields) {
                        inline for (stt.fields) |field| {
                            if (map.get(field.name)) |value| {
                                @field(result, field.name) = try value.convertTo(field.type, allocator, options);
                            } else if (options.treat_omitted_as_null and @typeInfo(field.type) == .Optional) {
                                @field(result, field.name) = null;
                            } else {
                                return error.BadValue;
                            }
                        }
                    } else {
                        // we could iterate over each map key and do an exhaustive
                        // comparison with each struct field name. This would save
                        // memory and it would probably be a fair amount faster for
                        // small structs.
                        var clone = try map.clone();
                        defer clone.deinit();
                        inline for (stt.fields) |field| {
                            if (clone.fetchSwapRemove(field.name)) |kv| {
                                @field(result, field.name) = try kv.value.convertTo(field.type, allocator, options);
                            } else if (options.treat_omitted_as_null and @typeInfo(field.type) == .Optional) {
                                @field(result, field.name) = null;
                            } else return error.BadValue;
                        }
                        // there were extra fields in the data
                        if (clone.count() > 0) return error.BadValue;
                    }

                    return result;
                },
                else => return error.BadValue,
            }
        },
        .Enum => {
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);

            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    // conversion by name is always tried first; numeric
                    // conversion only applies when enabled in options
                    if (std.meta.stringToEnum(T, str.items)) |value| return value;
                    if (options.allow_numeric_enums) {
                        const parsed = std.fmt.parseInt(@typeInfo(T).Enum.tag_type, str.items, 10) catch
                            return error.BadValue;
                        return std.meta.intToEnum(T, parsed) catch error.BadValue;
                    }
                    return error.BadValue;
                },
                else => return error.BadValue,
            }
        },
        .Union => |unn| {
            if (comptime std.meta.trait.hasFn("deserializeNice")(T))
                return T.deserializeNice(self, allocator, options);
            if (unn.tag_type == null) @compileError("Cannot deserialize into untagged union " ++ @typeName(T));

            switch (self) {
                .map, .flow_map => |map| {
                    // a union may not ever be deserialized from a map with more than one value
                    if (map.count() != 1) return error.BadValue;
                    const key = map.keys()[0];
                    inline for (unn.fields) |field| {
                        if (std.mem.eql(u8, key, field.name))
                            return @unionInit(T, field.name, try map.get(key).?.convertTo(field.type, allocator, options));
                    }
                    return error.BadValue;
                },
                // TODO: if the field is a 0 width type like void, we could parse it
                //       directly from a scalar/string value (i.e. a name with no
                //       corresponding value)
                else => return error.BadValue,
            }
        },
        .Optional => |opt| {
            switch (self) {
                inline .scalar, .string => |str, tag| {
                    if (tag == .string and !options.coerce_strings) return error.BadValue;
                    for (options.null_strings) |check|
                        if (std.mem.eql(u8, str.items, check)) return null;
                    return try self.convertTo(opt.child, allocator, options);
                },
                else => return error.BadValue,
            }
        },
        else => @compileError("Cannot deserialize into unsupported type " ++ @typeName(T)),
    }
}
pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
return try _fromScalarOrString(alloc, .scalar, input);
}