2023-10-22 16:16:57 -07:00
|
|
|
// Copyright 2023 torque@epicyclic.dev
|
|
|
|
//
|
|
|
|
// Licensed under the MIT/Expat license. You may not use this file except in
|
|
|
|
// compliance with the license. You may obtain a copy of the license at
|
|
|
|
//
|
|
|
|
// https://spdx.org/licenses/MIT.html
|
|
|
|
//
|
|
|
|
// This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
|
|
|
|
// CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
const std = @import("std");
|
|
|
|
|
2023-09-24 18:22:12 -07:00
|
|
|
const buffers = @import("./linebuffer.zig");
|
|
|
|
const tokenizer = @import("./tokenizer.zig");
|
2023-09-24 22:24:33 -07:00
|
|
|
const State = @import("./parser/state.zig").State;
|
2023-10-03 23:17:37 -07:00
|
|
|
pub const Document = @import("./parser/value.zig").Document;
|
|
|
|
pub const Parsed = @import("./parser/value.zig").Parsed;
|
2023-09-25 01:18:09 -07:00
|
|
|
pub const Value = @import("./parser/value.zig").Value;
|
2023-09-21 23:34:17 -07:00
|
|
|
|
2023-09-13 00:11:45 -07:00
|
|
|
// Out-of-band error reporting record. `parseBuffer` and `parseBufferTo` take a
// pointer to one of these and hand it to both the parser state and the line
// buffer; `StreamParser.init` allocates its own. The field defaults describe the
// "nothing has gone wrong" state.
pub const Diagnostics = struct {
    // NOTE(review): presumably the input row (line) the message refers to; whether
    // it is 0- or 1-based is not visible from this file -- confirm against the
    // line buffer.
    row: usize = 0,
    // NOTE(review): presumably the offset within that row where the problem
    // starts -- confirm.
    line_offset: usize = 0,
    // NOTE(review): presumably the length of the offending span -- confirm.
    length: usize = 0,
    // Human-readable description. Remains "no problems" until something
    // overwrites it.
    message: []const u8 = "no problems",
};
|
|
|
|
|
2023-09-24 18:22:12 -07:00
|
|
|
// The parser's own error values, merged with everything the tokenizer can
// return (`tokenizer.Error`) and with allocator failure
// (`std.mem.Allocator.Error`).
//
// Note that the public functions in this file declare inferred error sets
// (`!Document`, `!Parsed(T)`) rather than naming this set explicitly.
pub const Error = error{
    UnexpectedIndent,
    UnexpectedValue,
    EmptyDocument,
    DuplicateKey,
    BadMapEntry,
    BadState,
    BadToken,
    Fail,
} || tokenizer.Error || std.mem.Allocator.Error;
|
|
|
|
|
|
|
|
// Selects what the parser does when a mapping contains the same key more than
// once. This value is forwarded to `State.parseLine` on every parsed line.
pub const DuplicateKeyBehavior = enum {
    // NOTE(review): presumably keeps the value from the first occurrence of the
    // key and ignores later ones -- confirm in parser/state.zig.
    use_first,
    // NOTE(review): presumably lets a later occurrence replace the earlier
    // value -- confirm in parser/state.zig.
    use_last,
    // Treat a repeated key as an error. This is the default used by `Options`.
    fail,
};
|
|
|
|
|
2023-09-24 22:24:33 -07:00
|
|
|
// Knobs controlling both document parsing and (for the parseTo family of
// functions) conversion of a parsed document into a user type. Every field has
// a default, so `.{}` is a valid value.
pub const Options = struct {
    // If a mapping has multiple entries with the same key, this option defines how the
    // parser should behave. The default behavior is to emit an error if a repeated key
    // is encountered.
    duplicate_key_behavior: DuplicateKeyBehavior = .fail,

    // If an empty document is parsed, this defines what value type should be the
    // resulting document root object. The default behavior is to emit an error if the
    // document is empty.
    default_object: enum { string, list, map, fail } = .fail,

    // Only used by the parseTo family of functions.
    // If false, and a mapping contains additional keys that do not map to the fields of
    // the corresponding object, an error will be raised. By default, additional keys
    // will be skipped and no error will be raised. Note that tagged unions must be
    // represented by a map with a single key, and having more than one key will always
    // be an error, even if this option is set to true.
    ignore_extra_fields: bool = true,

    // Only used by the parseTo family of functions.
    // If true, if a struct field has a default value associated with it and the
    // corresponding mapping key does not exist, the object field will be set to the
    // default value. By default, this behavior is enabled, allowing succinct
    // representation of objects that have default fields.
    allow_omitting_default_values: bool = true,

    // Only used by the parseTo family of functions.
    // If true, strings may be coerced into other scalar types, like booleans or
    // numbers. By default, only document scalar fields will attempt to coerce to
    // non-string values.
    coerce_strings: bool = false,

    // Only used by the parseTo family of functions.
    // Two lists of strings. Scalars in a document that match any of the truthy values
    // will be parsed to boolean true. Scalars in the document that match any of the
    // falsy values will be parsed to boolean false. All other scalar values will raise
    // an error if the destination is a boolean type. By default, these comparisons are
    // case-sensitive. See the `case_insensitive_scalar_coersion` option to change
    // this.
    boolean_scalars: struct { truthy: []const []const u8, falsy: []const []const u8 } = .{
        .truthy = &.{ "true", "True", "yes", "on" },
        .falsy = &.{ "false", "False", "no", "off" },
    },

    // Only used by the parseTo family of functions.
    // A list of strings. Scalars in the document that match any of the values listed
    // will be parsed to optional `null`. Any other scalar value will be parsed as the
    // optional child type if the destination type is an optional. By default, these
    // comparisons are case-sensitive. See the `case_insensitive_scalar_coersion`
    // option to change this.
    null_scalars: []const []const u8 = &.{ "null", "nil", "None" },

    // Only used by the parseTo family of functions.
    // Choose whether to strip the leading `.` off of expected enum values. By default,
    // `.enum_field` will be parsed into the enum field `enum_field`, which makes them
    // look like source code enum literals. Any enum value missing the leading `.` will
    // result in a conversion error. If set to false, no preprocessing will be done
    // and enum values will be converted from the literal scalar/string. These two styles
    // cannot be mixed in a single document.
    expect_enum_dot: bool = true,

    // Only used by the parseTo family of functions.
    // Perform ASCII-case-insensitive comparisons for scalars (i.e. `TRUE` in a document
    // will match `true` in the boolean scalars). Unicode case folding is not currently
    // supported.
    // NOTE: the field name spells "coercion" as "coersion"; the spelling is part of
    // the public field name and is left unchanged here.
    case_insensitive_scalar_coersion: bool = false,

    // Only used by the parseTo family of functions.
    // If true, document scalars that appear to be numbers will attempt to convert into
    // enum values as an integer. By default, all enums in the document must be
    // specified by name, not by numeric value. Note that conversion by name will always
    // be tried first, even if this option is enabled, so if you're stupid enough to do:
    //
    //     const Horrible = enum {
    //         @"1" = 0,
    //         @"0" = 1,
    //     };
    //
    // then you deserve what you get. And what you'll get is confusing results.
    // Also note that this option does not apply to tagged unions, despite those being
    // backed by possibly ordered enums.
    allow_numeric_enums: bool = false,
};
|
|
|
|
|
2023-10-03 23:17:37 -07:00
|
|
|
// Parse an entire in-memory document in one call.
//
// `buffer` is wrapped in a fixed line buffer and tokenized line by line; each
// line is handed to the parser state together with
// `options.duplicate_key_behavior`. `diagnostics` is shared by the line buffer
// and the parser state.
//
// On success the document produced by `State.finish` is returned. On any error
// the partially built document is deinitialized before the error propagates.
pub fn parseBuffer(
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Document {
    var state = State.init(allocator, diagnostics);
    // The state's own bookkeeping is always released; the document it has been
    // building is only released when we bail out with an error.
    defer state.deinit();
    errdefer state.document.deinit();

    var line_tokenizer: tokenizer.LineTokenizer(buffers.ValidatingFixedLineBuffer) = .{
        .buffer = buffers.ValidatingFixedLineBuffer.init(buffer, diagnostics),
    };

    while (try line_tokenizer.next()) |line| {
        try state.parseLine(line, options.duplicate_key_behavior);
    }

    // The tokenizer has to be finished separately from the state: the state
    // cannot see the tokenizer, and the tokenizer is the only component able to
    // report that unparsed lines are still sitting in the buffer.
    try line_tokenizer.finish();

    const document = try state.finish(options);
    return document;
}
|
2023-09-25 01:18:09 -07:00
|
|
|
|
2023-10-03 23:17:37 -07:00
|
|
|
// Parse an in-memory document with `parseBuffer` and then convert it to `T`
// via `Document.convertTo`, using the same `options` for both steps.
//
// If the conversion fails, the intermediate document is deinitialized before
// the error is returned.
pub fn parseBufferTo(
    comptime T: type,
    allocator: std.mem.Allocator,
    buffer: []const u8,
    diagnostics: *Diagnostics,
    options: Options,
) !Parsed(T) {
    var document = try parseBuffer(allocator, buffer, diagnostics, options);
    errdefer document.deinit();

    const converted = try document.convertTo(T, options);
    return converted;
}
|
|
|
|
|
2023-09-25 01:18:09 -07:00
|
|
|
// Incremental front end to the parser: input is supplied in arbitrary chunks
// through `feed`, and the document is produced by `finish`.
//
// The parser owns a heap-allocated `Diagnostics` that is shared by the line
// buffer and the parser state; it is reachable through
// `parse_state.diagnostics` and is released by `deinit`.
pub const StreamParser = struct {
    linetok: tokenizer.LineTokenizer(buffers.ValidatingLineBuffer),
    parse_state: State,
    parse_options: Options = .{},

    // Create a stream parser that allocates with `allocator` and parses
    // according to `options`.
    //
    // Fails if the `Diagnostics` allocation or the line buffer initialization
    // fails; in the latter case the `Diagnostics` allocation is released again.
    pub fn init(allocator: std.mem.Allocator, options: Options) !StreamParser {
        // Diagnostics lives on the heap so the pointer given to the line buffer
        // and the parser state stays valid after this struct is returned by value.
        const diagnostics = try allocator.create(Diagnostics);
        errdefer allocator.destroy(diagnostics);
        diagnostics.* = Diagnostics{};

        return .{
            .linetok = .{
                .buffer = try buffers.ValidatingLineBuffer.init(allocator, diagnostics),
            },
            .parse_state = State.init(allocator, diagnostics),
            .parse_options = options,
        };
    }

    // Release the line buffer, the parser state, and the shared `Diagnostics`.
    pub fn deinit(self: StreamParser) void {
        // Grab what is needed for the final free up front, then tear down the
        // two components that were handed the diagnostics pointer *before*
        // freeing the diagnostics itself, so neither teardown can observe a
        // dangling pointer.
        const allocator = self.linetok.buffer.allocator;
        const diagnostics = self.parse_state.diagnostics;

        self.linetok.buffer.deinit();
        self.parse_state.deinit();
        allocator.destroy(diagnostics);
    }

    // Append `data` to the line buffer and parse every line the tokenizer can
    // produce from what has been buffered so far. Tokenizer, buffer, and parse
    // errors are propagated to the caller.
    pub fn feed(self: *StreamParser, data: []const u8) !void {
        try self.linetok.buffer.feed(data);
        while (try self.linetok.next()) |line| {
            try self.parse_state.parseLine(line, self.parse_options.duplicate_key_behavior);
        }
    }

    // Signal end of input: finish the tokenizer first (it is the component
    // that can report a problem with whatever is left in the buffer), then
    // finish the parser state and return the resulting document.
    pub fn finish(self: *StreamParser) !Document {
        try self.linetok.finish();
        return try self.parse_state.finish(self.parse_options);
    }
};
|