nice-data/src/parser.zig
torque 21a9753d46
parser: change omitted value behavior to work with all default values
Special casing optional values was a little odd before. Now, the user
can supply a default value for any field that may be omitted from the
serialized data. This behaves the same way as the stdlib JSON parser
as well.
2023-11-23 17:47:21 -08:00

197 lines
7.9 KiB
Zig

// Copyright 2023 torque@epicyclic.dev
//
// Licensed under the MIT/Expat license. You may not use this file except in
// compliance with the license. You may obtain a copy of the license at
//
// https://spdx.org/licenses/MIT.html
//
// This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied.
const std = @import("std");
const buffers = @import("./linebuffer.zig");
const tokenizer = @import("./tokenizer.zig");
const State = @import("./parser/state.zig").State;
pub const Document = @import("./parser/value.zig").Document;
pub const Parsed = @import("./parser/value.zig").Parsed;
pub const Value = @import("./parser/value.zig").Value;
// Describes a problem encountered while parsing. A pointer to one of these is handed
// to the line buffer and to the parser state, which are the components that fill it
// in; nothing in this file writes to the fields directly. Every field has a default,
// so `Diagnostics{}` is a valid "nothing has gone wrong" value.
pub const Diagnostics = struct {
// presumably the row (line) of the input the message refers to — TODO confirm
// whether this is zero- or one-based against the line buffer implementation.
row: usize = 0,
// presumably the offset within that row at which the problem starts — TODO confirm.
line_offset: usize = 0,
// presumably the length of the offending span — TODO confirm.
length: usize = 0,
// Human-readable description of the problem. Defaults to "no problems".
message: []const u8 = "no problems",
};
// The parser's named error set: the parser-specific errors listed here, merged with
// every error the tokenizer can produce and with allocator failure
// (`std.mem.Allocator.Error`). Note that the parse functions in this file declare
// inferred error sets (`!Document`, `!Parsed(T)`) rather than naming this set.
pub const Error = error{
UnexpectedIndent,
UnexpectedValue,
EmptyDocument,
DuplicateKey,
BadMapEntry,
BadState,
BadToken,
Fail,
} || tokenizer.Error || std.mem.Allocator.Error;
// Policy applied when a mapping contains more than one entry with the same key.
// Selected through `Options.duplicate_key_behavior` and passed to the parser state
// with every parsed line.
pub const DuplicateKeyBehavior = enum {
// presumably the first value seen for a key is kept — confirm in the parser state.
use_first,
// presumably the last value seen for a key is kept — confirm in the parser state.
use_last,
// The default in `Options`, documented there as emitting an error on a repeated key.
fail,
};
// Settings that control parsing and, for the parseTo family of functions, the
// conversion of a parsed document into a user-supplied type. Every field has a
// default, so `.{}` selects the default behavior throughout.
pub const Options = struct {
// If a mapping has multiple entries with the same key, this option defines how the
// parser should behave. The default behavior is to emit an error if a repeated key
// is encountered.
duplicate_key_behavior: DuplicateKeyBehavior = .fail,
// If an empty document is parsed, this defines what value type should be the
// resulting document root object. The default behavior is to emit an error if the
// document is empty.
default_object: enum { string, list, map, fail } = .fail,
// Only used by the parseTo family of functions.
// If false, and a mapping contains additional keys that do not map to the fields of
// the corresponding object, an error will be raised. By default, additional keys
// will be skipped and no error will be raised. Note that tagged unions must be
// represented by a map with a single key, and having more than one key will always
// be an error, even if this option is set to true.
ignore_extra_fields: bool = true,
// Only used by the parseTo family of functions.
// If true, and a struct field has a default value associated with it and the
// corresponding mapping key does not exist, the object field will be set to the
// default value. By default, this behavior is enabled, allowing succinct
// representation of objects that have default fields.
allow_omitting_default_values: bool = true,
// Only used by the parseTo family of functions.
// If true, strings may be coerced into other scalar types, like booleans or
// numbers. By default, only document scalar fields will attempt to coerce to
// non-string values.
coerce_strings: bool = false,
// Only used by the parseTo family of functions.
// Two lists of strings. Scalars in a document that match any of the truthy values
// will be parsed to boolean true. Scalars in the document that match any of the
// falsy values will be parsed to boolean false. All other scalar values will raise
// an error if the destination is a boolean type. By default, these comparisons are
// case-sensitive. See the `case_insensitive_scalar_coersion` option to change
// this.
boolean_scalars: struct { truthy: []const []const u8, falsy: []const []const u8 } = .{
.truthy = &.{ "true", "True", "yes", "on" },
.falsy = &.{ "false", "False", "no", "off" },
},
// Only used by the parseTo family of functions.
// A list of strings. Scalars in the document that match any of the values listed
// will be parsed to optional `null`. Any other scalar value will be parsed as the
// optional child type if the destination type is an optional. By default, these
// comparisons are case-sensitive. See the `case_insensitive_scalar_coersion`
// option to change this.
null_scalars: []const []const u8 = &.{ "null", "nil", "None" },
// Only used by the parseTo family of functions.
// Choose whether to strip the leading `.` off of expected enum values. By default,
// `.enum_field` will be parsed into the enum field `enum_field`, which makes them
// look like source code enum literals. Any enum value missing the leading `.` will
// result in a conversion error. If set to false, no preprocessing will be done
// and enum values will be converted from the literal scalar/string. These two styles
// cannot be mixed in a single document.
expect_enum_dot: bool = true,
// Only used by the parseTo family of functions.
// Perform ASCII-case-insensitive comparisons for scalars (e.g. `TRUE` in a document
// will match `true` in the boolean scalars). Unicode case folding is not currently
// supported. (The field name keeps its historical spelling, "coersion".)
case_insensitive_scalar_coersion: bool = false,
// Only used by the parseTo family of functions.
// If true, document scalars that appear to be numbers will attempt to convert into
// enum values as an integer. By default, all enums in the document must be
// specified by name, not by numeric value. Note that conversion by name will always
// be tried first, even if this option is enabled, so if you declare an enum whose
// field names look like numbers that differ from their values, such as:
//
// const Horrible = enum {
// @"1" = 0,
// @"0" = 1,
// };
//
// then the results will be confusing: a scalar `1` matches the field named `1`
// (value 0) rather than the field whose value is 1.
// Also note that this option does not apply to tagged unions, despite those being
// backed by possibly ordered enums.
allow_numeric_enums: bool = false,
};
// Parse an entire in-memory buffer into a `Document`.
//
// `buffer` is tokenized line by line and each line is handed to the parser state
// along with `options.duplicate_key_behavior`. Problems are reported through
// `diagnostics`, which is shared by the line buffer and the parser state.
//
// The parser state is always deinitialized before returning. If any step fails, the
// partially built document held by the state is deinitialized as well; on success
// the document produced by `State.finish` is returned to the caller.
pub fn parseBuffer(
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Document {
    var parse_state = State.init(allocator, diagnostics);
    defer parse_state.deinit();
    errdefer parse_state.document.deinit();

    const LineBuffer = buffers.ValidatingFixedLineBuffer;
    var lines: tokenizer.LineTokenizer(LineBuffer) = .{
        .buffer = LineBuffer.init(buffer, diagnostics),
    };

    while (try lines.next()) |line| {
        try parse_state.parseLine(line, options.duplicate_key_behavior);
    }

    // The parser state has no access to the tokenizer, and the tokenizer is the only
    // component able to report that unparsed lines are still sitting in the buffer,
    // so it has to be finished separately before the state is finalized.
    try lines.finish();

    return try parse_state.finish(options);
}
// Parse an in-memory buffer and convert the resulting document into a `Parsed(T)`.
//
// This is `parseBuffer` followed by `Document.convertTo(T, options)`. If the
// conversion fails, the intermediate document is deinitialized before the error is
// returned. On success the document is not deinitialized here.
// NOTE(review): presumably the returned `Parsed(T)` takes over the document's
// memory — confirm against `Parsed` in parser/value.zig.
pub fn parseBufferTo(
    comptime T: type,
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Parsed(T) {
    var document = try parseBuffer(allocator, buffer, diagnostics, options);
    errdefer document.deinit();

    const converted = try document.convertTo(T, options);
    return converted;
}
// Incremental parser for input that arrives in pieces.
//
// Typical use: `init`, any number of `feed` calls, then `finish` to obtain the
// `Document`, and `deinit` to release the parser's own resources. The parser
// heap-allocates its own `Diagnostics`, reachable through `parse_state.diagnostics`.
pub const StreamParser = struct {
    linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
    parse_state: State,
    parse_options: Options = .{},

    // Create a parser that allocates from `allocator` and behaves according to
    // `options`. Fails if the diagnostics or the line buffer cannot be set up; the
    // diagnostics allocation is released again if the line buffer fails.
    pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
        const diag = try allocator.create(Diagnostics);
        errdefer allocator.destroy(diag);
        diag.* = .{};

        const line_buffer = try buffers.ValidatingLineBuffer.init(allocator, diag);

        return .{
            .linetok = .{ .buffer = line_buffer },
            .parse_state = State.init(allocator, diag),
            .parse_options = options,
        };
    }

    // Release the diagnostics allocated by `init`, then the line buffer and the
    // parser state.
    // NOTE(review): the diagnostics are freed before the two members that hold a
    // pointer to them are deinitialized — confirm neither `deinit` touches them.
    pub fn deinit(self: StreamParser) void {
        const allocator = self.linetok.buffer.allocator;
        allocator.destroy(self.parse_state.diagnostics);

        self.linetok.buffer.deinit();
        self.parse_state.deinit();
    }

    // Hand another chunk of input to the line buffer, then parse every line the
    // tokenizer is able to yield from what has been buffered so far.
    pub fn feed(self: *StreamParser, data: []const u8) !void {
        try self.linetok.buffer.feed(data);

        const on_duplicate = self.parse_options.duplicate_key_behavior;
        while (try self.linetok.next()) |line| {
            try self.parse_state.parseLine(line, on_duplicate);
        }
    }

    // Signal the end of input: finish the tokenizer first, then finalize the parser
    // state with the stored options and return the resulting document.
    pub fn finish(self: *StreamParser) !Document {
        try self.linetok.finish();
        return try self.parse_state.finish(self.parse_options);
    }
};