From 38e47b39dc4d3c0c8c2fcb50c2f458021cb86407 Mon Sep 17 00:00:00 2001 From: torque Date: Sun, 24 Sep 2023 18:22:12 -0700 Subject: [PATCH] all: do some restructuring I don't like big monolithic source files, so let's restructure a bit. parser.zig is still bigger than I would like it to be, but there isn't a good way to break up the two state machine parsers, which take up most of the space. This is the last junk commit before I am seriously going to implement the "streaming" parser. Which is the last change before implementing deserialization to object. I am definitely not just spinning my wheels here. --- build.zig | 2 +- src/linebuffer.zig | 105 +++++ src/nice.zig | 67 ++++ src/{config.zig => parser.zig} | 712 +++------------------------------ src/parser/value.zig | 151 +++++++ src/tokenizer.zig | 251 ++++++++++++ 6 files changed, 639 insertions(+), 649 deletions(-) create mode 100644 src/linebuffer.zig create mode 100644 src/nice.zig rename src/{config.zig => parser.zig} (57%) create mode 100644 src/parser/value.zig create mode 100644 src/tokenizer.zig diff --git a/build.zig b/build.zig index 2fcea8d..caf97f7 100644 --- a/build.zig +++ b/build.zig @@ -4,7 +4,7 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const nice = b.addModule("nice", .{ - .source_file = .{ .path = "src/config.zig" }, + .source_file = .{ .path = "src/nice.zig" }, }); add_examples(b, .{ diff --git a/src/linebuffer.zig b/src/linebuffer.zig new file mode 100644 index 0000000..1f50e13 --- /dev/null +++ b/src/linebuffer.zig @@ -0,0 +1,105 @@ +const std = @import("std"); + +pub const IndexSlice = struct { start: usize, len: usize }; + +pub const LineBuffer = struct { + allocator: std.mem.Allocator, + internal: FixedLineBuffer, + used: usize, + + pub const default_capacity: usize = 4096; + pub const Error = std.mem.Allocator.Error; + + pub fn init(allocator: std.mem.Allocator) Error!LineBuffer { + return initCapacity(allocator, default_capacity); + } + + pub fn 
initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer { + return .{ + .allocator = allocator, + .internal = .{ + .buffer = try allocator.alloc(u8, capacity), + .window = .{ .start = 0, .len = 0 }, + }, + .used = 0, + }; + } + + pub fn feed(self: *LineBuffer, data: []const u8) Error!void { + if (data.len == 0) return; + // TODO: check for usize overflow here if we want Maximum Robustness + const new_window_len = self.internal.window.len + data.len; + + // data cannot fit in the buffer with our scan window, so we have to realloc + if (new_window_len > self.internal.buffer.len) { + // TODO: adopt an overallocation strategy? Will potentially avoid allocating + // on every invocation but will cause the buffer to oversize + try self.allocator.realloc(self.internal.buffer, new_window_len); + self.rehome(); + @memcpy(self.internal.buffer[self.used..].ptr, data); + self.used = new_window_len; + self.internal.window.len = new_window_len; + } + // data will fit, but needs to be moved in the buffer + else if (self.internal.window.start + new_window_len > self.internal.buffer.len) { + self.rehome(); + @memcpy(self.internal.buffer[self.used..].ptr, data); + self.used = new_window_len; + self.internal.window.len = new_window_len; + } + // data can simply be appended + else { + @memcpy(self.internal.buffer[self.used..].ptr, data); + } + } + + /// The memory returned by this function is valid until the next call to `feed`. + /// The resulting slice does not include the newline character. 
+ pub fn nextLine(self: *LineBuffer) ?[]const u8 { + return self.internal.nextLine(); + } + + fn rehome(self: *LineBuffer) void { + self.internal.rehome(); + self.used = self.internal.window.len; + } +}; + +pub const FixedLineBuffer = struct { + buffer: []const u8, + window: IndexSlice, + + pub fn init(data: []const u8) FixedLineBuffer { + return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; + } + + pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 { + if (self.window.start >= self.buffer.len or self.window.len == 0) + return null; + + const window = self.buffer[self.window.start..][0..self.window.len]; + const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; + + self.window.start += split + 1; + self.window.len -= split + 1; + + return window[0..split]; + } + + // move the current scan window to the beginning of the buffer. This internal + // method is used by LineBuffer. + fn rehome(self: *LineBuffer) usize { + if (self.window.start == 0) return; + + const window = self.buffer[self.window.start..][0..self.window.len]; + + // if the window is longer than its starting index, the memory move will be + // overlapping, so we can't use memcpy + if (self.window.len > self.window.start) + std.mem.copyForwards(u8, self.buffer, window) + else + @memcpy(self.buffer.ptr, window); + + self.window.start = 0; + } +}; diff --git a/src/nice.zig b/src/nice.zig new file mode 100644 index 0000000..3920756 --- /dev/null +++ b/src/nice.zig @@ -0,0 +1,67 @@ +// Heavily inspired by, but not quite compatible with, NestedText. Key differences: +// +// - Doesn't support multiline keys (this means map keys cannot start with +// ' ', \t, #, {, [, |, or >, and they cannot contain :) +// - Allows using tabs for indentation (but not mixed tabs/spaces) +// - Indentation must be quantized consistently throughout the document. e.g. +// every nested layer being exactly 2 spaces past its parent. Tabs may +// only use one tab per indentation level. 
+// - Allows flow-style lists, maps, and strings on the same line as map keys or +// list items (i.e. the following are legal): +// +// key: {inline: map} +// key: [inline, list] +// key: > inline string +// - {map: item} +// - [list, item] +// - > inline string +// +// The string case retains the possibility of having an inline map value starting +// with {, [, or > +// - a map keys and list item dashes must be followed by a value or an indented +// section to reduce parser quantum state. This means that +// +// foo: +// bar: baz +// +// or +// +// - +// - qux +// +// are not valid. This can be represented with an inline empty string after foo: +// +// foo: > +// bar: baz +// +// or +// +// - > +// - qux +// +// - newlines are strictly LF, if the parser finds CR, it is an error +// - blank lines may not contain any whitespace characters except the single LF +// - Additional string indicator `|` for soft-wrapped strings, i.e. +// +// key: | this is not special +// key: +// | these lines are +// | soft-wrapped +// +// soft-wrapped lines are joined with a ' ' instead of a newline character. +// Like multiline strings, the final space is stripped (I guess this is a very +// janky way to add trailing whitespace to a string). +// +// - terminated strings to allow trailing whitespace: +// | this string has trailing whitespace | +// > and so does this one | +// - The parser is both strict and probably sloppy and may have weird edge +// cases since I'm slinging code, not writing a spec. 
For example, tabs are +// not trimmed from the values of inline lists/maps + +const std = @import("std"); + +pub const buffers = @import("./linebuffer.zig"); +pub const tokenizer = @import("./tokenizer.zig"); +pub const parser = @import("./parser.zig"); +pub const Parser = parser.Parser; diff --git a/src/config.zig b/src/parser.zig similarity index 57% rename from src/config.zig rename to src/parser.zig index 3d297ec..34ec496 100644 --- a/src/config.zig +++ b/src/parser.zig @@ -1,69 +1,8 @@ -// Heavily inspired by, but not quite compatible with, NestedText. Key differences: -// -// - Doesn't support multiline keys (this means map keys cannot start with -// ' ', \t, #, {, [, |, or >, and they cannot contain :) -// - Allows using tabs for indentation (but not mixed tabs/spaces) -// - Indentation must be quantized consistently throughout the document. e.g. -// every nested layer being exactly 2 spaces past its parent. Tabs may -// only use one tab per indentation level. -// - Allows flow-style lists, maps, and strings on the same line as map keys or -// list items (i.e. the following are legal): -// -// key: {inline: map} -// key: [inline, list] -// key: > inline string -// - {map: item} -// - [list, item] -// - > inline string -// -// The string case retains the possibility of having an inline map value starting -// with {, [, or > -// - inline lists and maps cannot contain other inline structures. This may -// change, as writing {:[{:[{:[{:[{:[{:[]}]}]}]}]}]} seems tremendously useful -// - a map keys and list item dashes must be followed by a value or an indented -// section to reduce parser quantum state. This means that -// -// foo: -// bar: baz -// -// or -// -// - -// - qux -// -// are not valid. 
This can be represented with an inline empty string after foo: -// -// foo: > -// bar: baz -// -// or -// -// - > -// - qux -// -// - newlines are strictly LF, if the parser finds CR, it is an error -// - blank lines may not contain any whitespace characters except the single LF -// - Additional string indicator `|` for soft-wrapped strings, i.e. -// -// key: | this is not special -// key: -// | these lines are -// | soft-wrapped -// -// soft-wrapped lines are joined with a ' ' instead of a newline character. -// Like multiline strings, the final space is stripped (I guess this is a very -// janky way to add trailing whitespace to a string). -// -// - terminated strings to allow trailing whitespace: -// | this string has trailing whitespace | -// > and so does this one | -// - The parser is both strict and probably sloppy and may have weird edge -// cases since I'm slinging code, not writing a spec. For example, tabs are -// not trimmed from the values of inline lists/maps - const std = @import("std"); -pub const IndexSlice = struct { start: usize, len: usize }; +const buffers = @import("./linebuffer.zig"); +const tokenizer = @import("./tokenizer.zig"); +const Value = @import("./parser/value.zig").Value; pub const Diagnostics = struct { row: usize, @@ -71,481 +10,51 @@ pub const Diagnostics = struct { message: []const u8, }; -pub const LineBuffer = struct { - allocator: std.mem.Allocator, - buffer: []u8, - used: usize, - window: IndexSlice, +pub const Error = error{ + UnexpectedIndent, + UnexpectedValue, + ExtraContent, + EmptyDocument, + DuplicateKey, + BadMapEntry, + BadState, + BadToken, + Fail, +} || tokenizer.Error || std.mem.Allocator.Error; - pub const default_capacity: usize = 4096; - pub const Error = std.mem.Allocator.Error; +pub const DuplicateKeyBehavior = enum { + use_first, + use_last, + fail, +}; - pub fn init(allocator: std.mem.Allocator) Error!LineBuffer { - return initCapacity(allocator, default_capacity); - } +pub const DefaultObject = enum { + 
scalar, + string, + list, + map, + fail, +}; - pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) Error!LineBuffer { +const ParseState = enum { initial, value, done }; + +pub const Document = struct { + arena: std.heap.ArenaAllocator, + root: Value, + + pub fn init(alloc: std.mem.Allocator) Document { return .{ - .allocator = allocator, - .buffer = try allocator.alloc(u8, capacity), - .used = 0, - .window = .{ .start = 0, .len = 0 }, + .arena = std.heap.ArenaAllocator.init(alloc), + .root = undefined, }; } - pub fn feed(self: *LineBuffer, data: []const u8) Error!void { - if (data.len == 0) return; - // TODO: check for usize overflow here if we want Maximum Robustness - const new_window_len = self.window.len + data.len; - - // data cannot fit in the buffer with our scan window, so we have to realloc - if (new_window_len > self.buffer.len) { - // TODO: adopt an overallocation strategy? Will potentially avoid allocating - // on every invocation but will cause the buffer to oversize - try self.allocator.realloc(self.buffer, new_window_len); - self.rehome(); - @memcpy(self.buffer[self.used..].ptr, data); - self.used = new_window_len; - self.window.len = new_window_len; - } - // data will fit, but needs to be moved in the buffer - else if (self.window.start + new_window_len > self.buffer.len) { - self.rehome(); - @memcpy(self.buffer[self.used..].ptr, data); - self.used = new_window_len; - self.window.len = new_window_len; - } - // data can simply be appended - else { - @memcpy(self.buffer[self.used..].ptr, data); - } + pub fn printDebug(self: Document) void { + return self.root.printDebug(); } - /// The memory returned by this function is valid until the next call to `feed`. - /// The resulting slice does not include the newline character. 
- pub fn nextLine(self: *LineBuffer) ?[]const u8 { - if (self.window.start >= self.buffer.len or self.window.len == 0) - return null; - - const window = self.buffer[self.window.start..][0..self.window.len]; - const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; - - self.window.start += split + 1; - self.window.len -= split + 1; - - return window[0..split]; - } - - fn rehome(self: *LineBuffer) void { - if (self.window.start == 0) return; - - const window = self.buffer[self.window.start..][0..self.window.len]; - - if (self.window.len > self.window.start) - std.mem.copyForwards(u8, self.buffer, window) - else - @memcpy(self.buffer.ptr, window); - - self.window.start = 0; - self.used = window.len; - } -}; - -pub const FixedLineBuffer = struct { - buffer: []const u8, - window: IndexSlice, - - pub fn init(data: []const u8) FixedLineBuffer { - return .{ .buffer = data, .window = .{ .start = 0, .len = data.len } }; - } - - pub fn nextLine(self: *FixedLineBuffer) ?[]const u8 { - if (self.window.start >= self.buffer.len or self.window.len == 0) - return null; - - const window = self.buffer[self.window.start..][0..self.window.len]; - const split = std.mem.indexOfScalar(u8, window, '\n') orelse return null; - - self.window.start += split + 1; - self.window.len -= split + 1; - - return window[0..split]; - } -}; - -const IndentationType = union(enum) { - immaterial: void, - spaces: usize, - tabs: void, -}; - -const InlineItem = union(enum) { - empty: void, - scalar: []const u8, - line_string: []const u8, - space_string: []const u8, - - flow_list: []const u8, - flow_map: []const u8, - - fn lineEnding(self: InlineItem) u8 { - return switch (self) { - .line_string => '\n', - .space_string => ' ', - else => unreachable, - }; - } -}; - -const LineContents = union(enum) { - comment: []const u8, - - in_line: InlineItem, - list_item: InlineItem, - map_item: struct { key: []const u8, val: InlineItem }, -}; - -// we can dedent multiple levels at once. 
Example: -// -// foo: -// bar: -// > a -// > string -// baz: [qux] -// -// capturing this is conceptually simple, but implementing it without complex -// indentation tracking requires quantizing the indentation. This means our -// IndentationType will also need to track the number of spaces used for -// indentation, as detected. Then every line we have to check indent rem the -// quantization level == 0 (otherwise we broke quantization) and compute indent -// div the quantization level to give us our effective indentation level. - -const ShiftDirection = enum { indent, dedent, none }; -const RelativeIndent = union(ShiftDirection) { - indent: void, - dedent: usize, - none: void, -}; - -const Line = struct { - indent: RelativeIndent, - contents: LineContents, - raw: []const u8, -}; - -pub fn LineTokenizer(comptime Buffer: type) type { - return struct { - buffer: Buffer, - index: usize = 0, - indentation: IndentationType = .immaterial, - last_indent: usize = 0, - diagnostics: *Diagnostics, - row: usize = 0, - - const Error = error{ - BadToken, - MixedIndentation, - UnquantizedIndentation, - TooMuchIndentation, - MissingNewline, - TrailingWhitespace, - Impossible, - }; - - pub fn next(self: *@This()) Error!?Line { - lineloop: while (self.buffer.nextLine()) |raw_line| { - var indent: usize = 0; - for (raw_line, 0..) |char, idx| { - switch (char) { - ' ' => { - switch (self.indentation) { - // There's a weird coupling here because we can't set this until - // all spaces have been consumed. I also thought about ignoring - // spaces on comment lines since those don't affect the - // relative indent/dedent, but then we would allow comments - // to ignore our indent quantum, which I dislike due to it making - // ugly documents. 
- .immaterial => self.indentation = .{ .spaces = 0 }, - .spaces => {}, - .tabs => return error.MixedIndentation, - } - }, - '\t' => { - switch (self.indentation) { - .immaterial => self.indentation = .tabs, - .spaces => return error.MixedIndentation, - .tabs => {}, - } - }, - '\r' => { - return error.BadToken; - }, - else => { - indent = idx; - break; - }, - } - } else { - if (raw_line.len > 0) return error.TrailingWhitespace; - continue :lineloop; - } - - var quantized: usize = if (self.indentation == .spaces) quant: { - if (self.indentation.spaces == 0) { - self.indentation.spaces = indent; - } - if (@rem(indent, self.indentation.spaces) != 0) - return error.UnquantizedIndentation; - - break :quant @divExact(indent, self.indentation.spaces); - } else indent; - - const relative: RelativeIndent = if (quantized > self.last_indent) rel: { - if ((quantized - self.last_indent) > 1) - return error.TooMuchIndentation; - break :rel .indent; - } else if (quantized < self.last_indent) - .{ .dedent = self.last_indent - quantized } - else - .none; - - defer { - self.row += 1; - self.last_indent = quantized; - } - - const line = raw_line[indent..]; - - // this should not be possible, as empty lines are caught earlier. - if (line.len == 0) return error.Impossible; - - switch (line[0]) { - '#' => { - // simply lie about indentation when the line is a comment. - quantized = self.last_indent; - return .{ - .indent = .none, - .contents = .{ .comment = line[1..] }, - .raw = line, - }; - }, - '|', '>', '[', '{' => { - return .{ - .indent = relative, - .contents = .{ .in_line = try detectInlineItem(line) }, - .raw = line, - }; - }, - '-' => { - if (line.len > 1 and line[1] != ' ') return error.BadToken; - - return if (line.len == 1) .{ - .indent = relative, - .contents = .{ .list_item = .empty }, - .raw = line, - } else .{ - .indent = relative, - .contents = .{ .list_item = try detectInlineItem(line[2..]) }, - .raw = line, - }; - }, - else => { - for (line, 0..) 
|char, idx| { - if (char == ':') { - if (idx + 1 == line.len) return .{ - .indent = relative, - .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, - .raw = line, - }; - - if (line[idx + 1] != ' ') return error.BadToken; - - return .{ - .indent = relative, - .contents = .{ .map_item = .{ - .key = line[0..idx], - .val = try detectInlineItem(line[idx + 2 ..]), - } }, - .raw = line, - }; - } - } - - return .{ - .indent = relative, - .contents = .{ .in_line = .{ .scalar = line } }, - .raw = line, - }; - }, - } - - // somehow everything else has failed - return error.Impossible; - } - return null; - } - - fn detectInlineItem(buf: []const u8) Error!InlineItem { - if (buf.len == 0) return .empty; - - switch (buf[0]) { - '>', '|' => |char| { - if (buf.len > 1 and buf[1] != ' ') return error.BadToken; - - const slice: []const u8 = switch (buf[buf.len - 1]) { - ' ', '\t' => return error.TrailingWhitespace, - '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)], - else => buf[@min(2, buf.len)..buf.len], - }; - - return if (char == '>') - .{ .line_string = slice } - else - .{ .space_string = slice }; - }, - '[' => { - if (buf.len < 2 or buf[buf.len - 1] != ']') - return error.BadToken; - - // keep the closing ] for the flow parser - return .{ .flow_list = buf[1..] }; - }, - '{' => { - if (buf.len < 2 or buf[buf.len - 1] != '}') - return error.BadToken; - - // keep the closing } fpr the flow parser - return .{ .flow_map = buf[1..] 
}; - }, - else => { - if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') - return error.TrailingWhitespace; - - return .{ .scalar = buf }; - }, - } - } - }; -} - -pub const Value = union(enum) { - pub const String = std.ArrayList(u8); - pub const Map = std.StringArrayHashMap(Value); - pub const List = std.ArrayList(Value); - pub const TagType = @typeInfo(Value).Union.tag_type.?; - - scalar: String, - string: String, - list: List, - flow_list: List, - map: Map, - flow_map: Map, - - pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value { - return try _fromScalarOrString(alloc, .scalar, input); - } - - pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value { - return try _fromScalarOrString(alloc, .string, input); - } - - inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value { - var res = @unionInit(Value, @tagName(classification), try String.initCapacity(alloc, input.len)); - @field(res, @tagName(classification)).appendSliceAssumeCapacity(input); - return res; - } - - pub inline fn newScalar(alloc: std.mem.Allocator) Value { - return .{ .scalar = String.init(alloc) }; - } - - pub inline fn newString(alloc: std.mem.Allocator) Value { - return .{ .string = String.init(alloc) }; - } - - pub inline fn newList(alloc: std.mem.Allocator) Value { - return .{ .list = List.init(alloc) }; - } - - pub inline fn newFlowList(alloc: std.mem.Allocator) Value { - return .{ .flow_list = List.init(alloc) }; - } - - pub inline fn newMap(alloc: std.mem.Allocator) Value { - return .{ .map = Map.init(alloc) }; - } - - pub inline fn newFlowMap(alloc: std.mem.Allocator) Value { - return .{ .flow_map = Map.init(alloc) }; - } - - pub fn printDebug(self: Value) void { - self.printRecursive(0); - std.debug.print("\n", .{}); - } - - fn printRecursive(self: Value, indent: usize) void { - switch (self) { - .scalar, .string => |str| { - if (std.mem.indexOfScalar(u8, str.items, '\n')) |_| 
{ - var lines = std.mem.splitScalar(u8, str.items, '\n'); - std.debug.print("\n", .{}); - while (lines.next()) |line| { - std.debug.print( - "{[empty]s: >[indent]}{[line]s}{[nl]s}", - .{ - .empty = "", - .indent = indent, - .line = line, - .nl = if (lines.peek() == null) "" else "\n", - }, - ); - } - } else { - std.debug.print("{s}", .{str.items}); - } - }, - .list, .flow_list => |list| { - if (list.items.len == 0) { - std.debug.print("[]", .{}); - return; - } - - std.debug.print("[\n", .{}); - for (list.items, 0..) |value, idx| { - std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = idx }); - value.printRecursive(indent + 2); - std.debug.print(",\n", .{}); - } - std.debug.print( - "{[empty]s: >[indent]}]", - .{ .empty = "", .indent = indent }, - ); - }, - .map, .flow_map => |map| { - if (map.count() == 0) { - std.debug.print("{{}}", .{}); - return; - } - - std.debug.print("{{\n", .{}); - - var iter = map.iterator(); - - while (iter.next()) |entry| { - std.debug.print( - "{[empty]s: >[indent]}{[key]s}: ", - .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* }, - ); - entry.value_ptr.printRecursive(indent + 4); - std.debug.print(",\n", .{}); - } - std.debug.print( - "{[empty]s: >[indent]}}}", - .{ .empty = "", .indent = indent }, - ); - }, - } + pub fn deinit(self: Document) void { + self.arena.deinit(); } }; @@ -559,64 +68,13 @@ pub const Parser = struct { .message = "all is well", }, - pub const Error = error{ - UnexpectedIndent, - UnexpectedValue, - ExtraContent, - EmptyDocument, - DuplicateKey, - BadMapEntry, - BadState, - BadToken, - Fail, - } || LineTokenizer(FixedLineBuffer).Error || std.mem.Allocator.Error; - - pub const DuplicateKeyBehavior = enum { - use_first, - use_last, - fail, - }; - - pub const DefaultObject = enum { - string, - list, - map, - fail, - }; - - pub const ParseState = enum { - initial, - value, - done, - }; - - pub const Document = struct { - arena: std.heap.ArenaAllocator, - root: 
Value, - - pub fn init(alloc: std.mem.Allocator) Document { - return .{ - .arena = std.heap.ArenaAllocator.init(alloc), - .root = undefined, - }; - } - - pub fn printDebug(self: Document) void { - return self.root.printDebug(); - } - - pub fn deinit(self: Document) void { - self.arena.deinit(); - } - }; - pub const State = struct { pub const Stack = std.ArrayList(*Value); document: Document, value_stack: Stack, - state: ParseState = .initial, - expect_shift: ShiftDirection = .none, + state: enum { initial, value, done } = .initial, + expect_shift: tokenizer.ShiftDirection = .none, dangling_key: ?[]const u8 = null, pub fn init(alloc: std.mem.Allocator) State { @@ -637,13 +95,13 @@ pub const Parser = struct { const arena_alloc = document.arena.allocator(); var state: ParseState = .initial; - var expect_shift: ShiftDirection = .none; + var expect_shift: tokenizer.ShiftDirection = .none; var dangling_key: ?[]const u8 = null; var stack = std.ArrayList(*Value).init(arena_alloc); defer stack.deinit(); - var tok: LineTokenizer(FixedLineBuffer) = .{ - .buffer = FixedLineBuffer.init(buffer), + var tok: tokenizer.LineTokenizer(buffers.FixedLineBuffer) = .{ + .buffer = buffers.FixedLineBuffer.init(buffer), .diagnostics = &self.diagnostics, }; @@ -656,7 +114,7 @@ pub const Parser = struct { flipflop: while (flip) : (flop = true) { switch (state) { .initial => { - if (line.indent == .indent) return error.UnexpectedIndent; + if (line.shift == .indent) return error.UnexpectedIndent; switch (line.contents) { // we filter out comments above @@ -737,14 +195,14 @@ pub const Parser = struct { // switch is embedded. 
.scalar, .flow_list, .flow_map => unreachable, .string => |*string| { - if (line.indent == .indent) + if (line.shift == .indent) return error.UnexpectedIndent; - if (!flop and line.indent == .dedent) { + if (!flop and line.shift == .dedent) { // kick off the last trailing space or newline _ = string.pop(); - var dedent_depth = line.indent.dedent; + var dedent_depth = line.shift.dedent; while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -772,7 +230,7 @@ pub const Parser = struct { // // the first line here creates the expect_shift, but the second line // is a valid continuation of the list despite not being indented - if (!flop and (expect_shift == .indent and line.indent != .indent)) + if (!flop and (expect_shift == .indent and line.shift != .indent)) try list.append(Value.newScalar(arena_alloc)); // Consider: @@ -782,11 +240,11 @@ pub const Parser = struct { // - inline scalar // // the own-line scalar will not push the stack but the next list item will be a dedent - if (!flop and line.indent == .dedent) { - // if line.indent.dedent is 1 and we're expecting it, the stack will not be popped, + if (!flop and line.shift == .dedent) { + // if line.shift.dedent is 1 and we're expecting it, the stack will not be popped, // but we will continue loop flipflop. However, flop will be set to false on the next // trip, so this if prong will not be run again. - var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); + var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent); while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -799,7 +257,7 @@ pub const Parser = struct { .in_line => |in_line| { // assert that this line has been indented. this is required for an inline value when // the stack is in list mode. 
- if (expect_shift != .indent or line.indent != .indent) + if (expect_shift != .indent or line.shift != .indent) return error.UnexpectedValue; expect_shift = .dedent; @@ -819,7 +277,7 @@ pub const Parser = struct { } }, .list_item => |value| { - if (flop or (line.indent == .none or line.indent == .dedent)) { + if (flop or (line.shift == .none or line.shift == .dedent)) { expect_shift = .none; switch (value) { .empty => expect_shift = .indent, @@ -828,7 +286,7 @@ pub const Parser = struct { .flow_list => |str| try list.append(try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior)), .flow_map => |str| try list.append(try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior)), } - } else if (line.indent == .indent) { + } else if (line.shift == .indent) { if (expect_shift != .indent) return error.UnexpectedIndent; const new_list = try appendListGetValue(list, Value.newList(arena_alloc)); @@ -847,7 +305,7 @@ pub const Parser = struct { // // dedenting back to the list stack level requires list_item - if (line.indent != .indent) + if (line.shift != .indent) return error.UnexpectedValue; const new_map = try appendListGetValue(list, Value.newMap(arena_alloc)); @@ -865,7 +323,7 @@ pub const Parser = struct { // // the first line here creates the expect_shift, but the second line // is a valid continuation of the map despite not being indented - if (!flop and (expect_shift == .indent and line.indent != .indent)) { + if (!flop and (expect_shift == .indent and line.shift != .indent)) { try putMap( map, dangling_key orelse return error.Fail, @@ -875,8 +333,8 @@ pub const Parser = struct { dangling_key = null; } - if (!flop and line.indent == .dedent) { - var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); + if (!flop and line.shift == .dedent) { + var dedent_depth = line.shift.dedent - @intFromBool(expect_shift == .dedent); while (dedent_depth > 0) : (dedent_depth -= 1) _ = stack.pop(); @@ -889,7 +347,7 @@ pub const Parser = struct { 
.in_line => |in_line| { // assert that this line has been indented. this is required for an inline value when // the stack is in map mode. - if (expect_shift != .indent or line.indent != .indent or dangling_key == null) + if (expect_shift != .indent or line.shift != .indent or dangling_key == null) return error.UnexpectedValue; expect_shift = .dedent; @@ -921,7 +379,7 @@ pub const Parser = struct { // // dedenting back to the map stack level requires map_item - if (expect_shift != .indent or line.indent != .indent or dangling_key == null) + if (expect_shift != .indent or line.shift != .indent or dangling_key == null) return error.UnexpectedValue; const new_list = try putMapGetValue(map, dangling_key.?, Value.newList(arena_alloc), self.dupe_behavior); @@ -931,7 +389,7 @@ pub const Parser = struct { continue :flipflop; }, .map_item => |pair| { - if (flop or (line.indent == .none or line.indent == .dedent)) { + if (flop or (line.shift == .none or line.shift == .dedent)) { expect_shift = .none; const dupekey = try arena_alloc.dupe(u8, pair.key); switch (pair.val) { @@ -944,7 +402,7 @@ pub const Parser = struct { .flow_list => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_list, self.dupe_behavior), self.dupe_behavior), .flow_map => |str| try putMap(map, dupekey, try parseFlow(arena_alloc, str, .flow_map, self.dupe_behavior), self.dupe_behavior), } - } else if (line.indent == .indent) { + } else if (line.shift == .indent) { if (expect_shift != .indent or dangling_key == null) return error.UnexpectedValue; const new_map = try putMapGetValue(map, dangling_key.?, Value.newMap(arena_alloc), self.dupe_behavior); @@ -967,6 +425,7 @@ pub const Parser = struct { switch (state) { .initial => switch (self.default_object) { + .scalar => document.root = .{ .scalar = std.ArrayList(u8).init(arena_alloc) }, .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) }, .list => document.root = Value.newList(arena_alloc), .map => document.root = 
/// Intermediate representation of a parsed document. A `Value` is a tagged
/// union distinguishing scalars from strings and block-style collections from
/// flow-style collections, since the parser treats those forms differently.
pub const Value = union(enum) {
    pub const String = std.ArrayList(u8);
    pub const Map = std.StringArrayHashMap(Value);
    pub const List = std.ArrayList(Value);
    pub const TagType = @typeInfo(Value).Union.tag_type.?;

    scalar: String,
    string: String,
    list: List,
    flow_list: List,
    map: Map,
    flow_map: Map,

    /// Allocate a `.scalar` value holding a copy of `input`.
    pub inline fn fromScalar(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .scalar, input);
    }

    /// Allocate a `.string` value holding a copy of `input`.
    pub inline fn fromString(alloc: std.mem.Allocator, input: []const u8) !Value {
        return try _fromScalarOrString(alloc, .string, input);
    }

    /// Shared implementation for `fromScalar` and `fromString`: copy `input`
    /// into a String sized exactly to fit, then wrap it in the requested tag.
    inline fn _fromScalarOrString(alloc: std.mem.Allocator, comptime classification: TagType, input: []const u8) !Value {
        var storage = try String.initCapacity(alloc, input.len);
        storage.appendSliceAssumeCapacity(input);
        return @unionInit(Value, @tagName(classification), storage);
    }

    /// Create an empty `.scalar` value.
    pub inline fn newScalar(alloc: std.mem.Allocator) Value {
        return Value{ .scalar = String.init(alloc) };
    }

    /// Create an empty `.string` value.
    pub inline fn newString(alloc: std.mem.Allocator) Value {
        return Value{ .string = String.init(alloc) };
    }

    /// Create an empty block-style `.list` value.
    pub inline fn newList(alloc: std.mem.Allocator) Value {
        return Value{ .list = List.init(alloc) };
    }

    /// Create an empty flow-style `.flow_list` value.
    pub inline fn newFlowList(alloc: std.mem.Allocator) Value {
        return Value{ .flow_list = List.init(alloc) };
    }

    /// Create an empty block-style `.map` value.
    pub inline fn newMap(alloc: std.mem.Allocator) Value {
        return Value{ .map = Map.init(alloc) };
    }

    /// Create an empty flow-style `.flow_map` value.
    pub inline fn newFlowMap(alloc: std.mem.Allocator) Value {
        return Value{ .flow_map = Map.init(alloc) };
    }

    /// Deep structural equality. "Exact" means tags must match precisely:
    /// `.scalar` never equals `.string`, and `.list` never equals
    /// `.flow_list`, even when the contents are identical. Maps are compared
    /// order-sensitively.
    pub fn recursiveEqualsExact(self: Value, other: Value) bool {
        if (@as(TagType, self) != @as(TagType, other)) return false;
        switch (self) {
            inline .scalar, .string => |self_str, tag| {
                return std.mem.eql(u8, self_str.items, @field(other, @tagName(tag)).items);
            },
            inline .list, .flow_list => |self_list, tag| {
                const other_list = @field(other, @tagName(tag));

                if (self_list.items.len != other_list.items.len) return false;
                for (self_list.items, other_list.items) |left, right| {
                    if (!left.recursiveEqualsExact(right)) return false;
                }
                return true;
            },
            inline .map, .flow_map => |self_map, tag| {
                const other_map = @field(other, @tagName(tag));

                if (self_map.count() != other_map.count()) return false;
                var self_entries = self_map.iterator();
                var other_entries = other_map.iterator();
                // iterating both maps in lockstep enforces that entries
                // appear in the same order, not merely the same key set.
                while (self_entries.next()) |left| {
                    const right = other_entries.next() orelse return false;
                    if (!std.mem.eql(u8, left.key_ptr.*, right.key_ptr.*)) return false;
                    if (!left.value_ptr.recursiveEqualsExact(right.value_ptr.*)) return false;
                }
                // equal only if `other` has also been fully consumed.
                return other_entries.next() == null;
            },
        }
    }

    /// Pretty-print this value to stderr, followed by a newline.
    pub fn printDebug(self: Value) void {
        self.printRecursive(0);
        std.debug.print("\n", .{});
    }

    /// Recursive worker for `printDebug`. `indent` is the current leading
    /// column used to align nested collection entries.
    fn printRecursive(self: Value, indent: usize) void {
        switch (self) {
            .scalar, .string => |text| {
                // single-line strings print inline; multi-line strings are
                // re-indented line by line so they align with the structure.
                if (std.mem.indexOfScalar(u8, text.items, '\n') == null) {
                    std.debug.print("{s}", .{text.items});
                    return;
                }
                var line_iter = std.mem.splitScalar(u8, text.items, '\n');
                std.debug.print("\n", .{});
                while (line_iter.next()) |segment| {
                    std.debug.print(
                        "{[empty]s: >[indent]}{[line]s}{[nl]s}",
                        .{
                            .empty = "",
                            .indent = indent,
                            .line = segment,
                            .nl = if (line_iter.peek() == null) "" else "\n",
                        },
                    );
                }
            },
            .list, .flow_list => |list_value| {
                if (list_value.items.len == 0) {
                    std.debug.print("[]", .{});
                    return;
                }

                std.debug.print("[\n", .{});
                for (list_value.items, 0..) |element, index| {
                    std.debug.print("{[empty]s: >[indent]}[{[idx]d}] = ", .{ .empty = "", .indent = indent, .idx = index });
                    element.printRecursive(indent + 2);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}]",
                    .{ .empty = "", .indent = indent },
                );
            },
            .map, .flow_map => |map_value| {
                if (map_value.count() == 0) {
                    std.debug.print("{{}}", .{});
                    return;
                }

                std.debug.print("{{\n", .{});

                var entries = map_value.iterator();

                while (entries.next()) |entry| {
                    std.debug.print(
                        "{[empty]s: >[indent]}{[key]s}: ",
                        .{ .empty = "", .indent = indent + 2, .key = entry.key_ptr.* },
                    );
                    entry.value_ptr.printRecursive(indent + 4);
                    std.debug.print(",\n", .{});
                }
                std.debug.print(
                    "{[empty]s: >[indent]}}}",
                    .{ .empty = "", .indent = indent },
                );
            },
        }
    }
};
const Diagnostics = @import("./parser.zig").Diagnostics;

/// Errors produced while splitting the input into classified lines.
/// NOTE(review): `MissingNewline` is declared but not raised anywhere in this
/// file — presumably reserved for the buffer layer; confirm before removing.
pub const Error = error{
    BadToken,
    MixedIndentation,
    UnquantizedIndentation,
    TooMuchIndentation,
    MissingNewline,
    TrailingWhitespace,
    Impossible,
};

/// Indentation style inferred from the first indented line. Once detected,
/// mixing the other style is an error (`MixedIndentation`).
pub const DetectedIndentation = union(enum) {
    unknown: void,
    spaces: usize,
    tabs: void,
};

/// The value portion of a line: either empty, a raw scalar, one of the two
/// string continuation forms, or the body of a flow collection.
pub const InlineItem = union(enum) {
    empty: void,
    scalar: []const u8,
    line_string: []const u8,
    space_string: []const u8,

    flow_list: []const u8,
    flow_map: []const u8,

    /// Character used to join consecutive string lines: '\n' for line
    /// strings, ' ' for space strings. Calling this on any other variant is
    /// a programming error (`unreachable`).
    pub fn lineEnding(self: InlineItem) u8 {
        return switch (self) {
            .line_string => '\n',
            .space_string => ' ',
            else => unreachable,
        };
    }
};

/// Structural classification of a line's contents.
pub const LineContents = union(enum) {
    comment: []const u8,

    in_line: InlineItem,
    list_item: InlineItem,
    map_item: struct { key: []const u8, val: InlineItem },
};

/// Relative indentation change between consecutive lines.
pub const ShiftDirection = enum { indent, dedent, none };

/// Like `ShiftDirection`, but a dedent carries how many levels were popped.
pub const LineShift = union(ShiftDirection) {
    indent: void,
    // we can dedent multiple levels at once.
    dedent: usize,
    none: void,
};

/// One tokenized line: its indentation shift, classified contents, and the
/// underlying text (with leading indentation stripped).
pub const Line = struct {
    shift: LineShift,
    contents: LineContents,
    raw: []const u8,
};

// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a `nextLine` method
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        buffer: Buffer,
        index: usize = 0,
        // indentation style is detected lazily from the first indented line.
        indentation: DetectedIndentation = .unknown,
        // indentation level (in quanta, not characters) of the previous line.
        last_indent: usize = 0,
        diagnostics: *Diagnostics,
        row: usize = 0,

        /// Produce the next classified line, or null when the buffer is
        /// exhausted. Whitespace-only lines are skipped; lines with trailing
        /// whitespace, carriage returns, mixed or unquantized indentation,
        /// or multi-level indents are rejected with an error.
        pub fn next(self: *@This()) Error!?Line {
            lineloop: while (self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                // scan leading whitespace to measure the indent and detect
                // the indentation style.
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => return error.MixedIndentation,
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
                                .spaces => return error.MixedIndentation,
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // the loop ran off the end: the line is all whitespace.
                    // it is skipped, unless it is non-empty, which would be
                    // trailing whitespace.
                    if (raw_line.len > 0) return error.TrailingWhitespace;
                    continue :lineloop;
                }

                // convert the raw character indent into a level count. for
                // spaces, the first indented line fixes the quantum and all
                // later indents must be exact multiples of it; for tabs, one
                // tab is one level.
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0)
                        return error.UnquantizedIndentation;

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                // only single-level indents are allowed, but dedents may pop
                // several levels at once.
                const shift: LineShift = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1)
                        return error.TooMuchIndentation;
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // commit row/indent bookkeeping on every exit path below.
                // note: `quantized` may still be reassigned (comment lines).
                defer {
                    self.row += 1;
                    self.last_indent = quantized;
                }

                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                switch (line[0]) {
                    '#' => {
                        // force comments to be followed by a space. This makes them
                        // behave the same way as strings, actually.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        // simply lie about indentation when the line is a comment.
                        quantized = self.last_indent;
                        return .{
                            .shift = .none,
                            .contents = .{ .comment = line[1..] },
                            .raw = line,
                        };
                    },
                    '|', '>', '[', '{' => {
                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = try detectInlineItem(line) },
                            .raw = line,
                        };
                    },
                    '-' => {
                        // a list item marker must be bare or followed by a space.
                        if (line.len > 1 and line[1] != ' ') return error.BadToken;

                        return if (line.len == 1) .{
                            .shift = shift,
                            .contents = .{ .list_item = .empty },
                            .raw = line,
                        } else .{
                            .shift = shift,
                            .contents = .{ .list_item = try detectInlineItem(line[2..]) },
                            .raw = line,
                        };
                    },
                    else => {
                        // scan for a key separator; a ':' at end-of-line means an
                        // empty value, otherwise it must be followed by a space.
                        for (line, 0..) |char, idx| {
                            if (char == ':') {
                                if (idx + 1 == line.len) return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                    .raw = line,
                                };

                                if (line[idx + 1] != ' ') return error.BadToken;

                                return .{
                                    .shift = shift,
                                    .contents = .{ .map_item = .{
                                        .key = line[0..idx],
                                        .val = try detectInlineItem(line[idx + 2 ..]),
                                    } },
                                    .raw = line,
                                };
                            }
                        }

                        // no ':' anywhere: the whole line is a bare scalar.
                        return .{
                            .shift = shift,
                            .contents = .{ .in_line = .{ .scalar = line } },
                            .raw = line,
                        };
                    },
                }

                // somehow everything else has failed
                return error.Impossible;
            }
            return null;
        }

        /// Classify the value portion of a line. `buf` is the text after a
        /// `- ` list marker or `: ` key separator (or the whole line for
        /// in-line values).
        fn detectInlineItem(buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            switch (buf[0]) {
                '>', '|' => |char| {
                    // the string marker must be bare or followed by a space.
                    if (buf.len > 1 and buf[1] != ' ') return error.BadToken;

                    // a trailing '|' is stripped from the content; trailing
                    // blanks are rejected.
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => return error.TrailingWhitespace,
                        '|' => buf[@min(2, buf.len) .. buf.len - @intFromBool(buf.len > 1)],
                        else => buf[@min(2, buf.len)..buf.len],
                    };

                    return if (char == '>')
                        .{ .line_string = slice }
                    else
                        .{ .space_string = slice };
                },
                '[' => {
                    if (buf.len < 2 or buf[buf.len - 1] != ']')
                        return error.BadToken;

                    // keep the closing ] for the flow parser
                    return .{ .flow_list = buf[1..] };
                },
                '{' => {
                    if (buf.len < 2 or buf[buf.len - 1] != '}')
                        return error.BadToken;

                    // keep the closing } for the flow parser
                    return .{ .flow_map = buf[1..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t')
                        return error.TrailingWhitespace;

                    return .{ .scalar = buf };
                },
            }
        }
    };
}