From bff1fa95c946c16bd04c40aec0e33500548d23dd Mon Sep 17 00:00:00 2001 From: torque Date: Wed, 13 Sep 2023 00:11:45 -0700 Subject: [PATCH] ignominious dump of creation it's beautiful. and broken --- build.zig | 22 ++ build.zig.zon | 14 ++ src/cli.zig | 75 ++++++ src/config.zig | 667 +++++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 14 ++ 5 files changed, 792 insertions(+) create mode 100644 build.zig create mode 100644 build.zig.zon create mode 100644 src/cli.zig create mode 100644 src/config.zig create mode 100644 src/main.zig diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..5cb73ff --- /dev/null +++ b/build.zig @@ -0,0 +1,22 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const lore = b.addExecutable(.{ + .name = "lore", + .root_source_file = .{ .path = "src/main.zig" }, + .target = target, + .optimize = optimize, + }); + + const noclip_dep = b.dependency("noclip", .{}); + const cmark_dep = b.dependency("cmark", .{}); + + lore.addModule("noclip", noclip_dep.module("noclip")); + lore.addModule("cmark", cmark_dep.module("cmark")); + lore.linkLibrary(cmark_dep.artifact("cmark-c")); + + b.installArtifact(lore); +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..258d51a --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,14 @@ +.{ + .name = "lore", + .version = "0.1.0", + .dependencies = .{ + .noclip = .{ + .url = "https://github.com/torque/NOCLIP/archive/35915191fba37d0bc161cc586e2a4aeb70e6a14c.tar.gz", + .hash = "1220e73de78e25f44aca18d3c3008d280efb8fbd0a4b4e90a6f679f3d823a27d2495", + }, + .cmark = .{ + .url = "https://github.com/epicyclic-dev/cmark-zig/archive/3a73fbe31216561eb1bbf9b2324fc2e948f37de2.tar.gz", + .hash = "122076efef52a9952684341b8a6f9a31d8b83666ecc7bd496d7b1bcef76ed6dcea45", + }, + }, +} diff --git a/src/cli.zig b/src/cli.zig new file mode 100644 index 0000000..31f6dbd --- /dev/null +++ b/src/cli.zig @@ -0,0 +1,75 @@ +const std = @import("std"); + +const noclip = @import("noclip"); + +const config = @import("./config.zig"); + +const scribe_cmd = cmd: { + var builder = noclip.CommandBuilder(*const std.mem.Allocator){ + .description = + \\consume lore files and produce rendered output + \\ + \\ + , + }; + + break :cmd builder; +}; + +pub fn scribe(alloc: *const std.mem.Allocator, _: scribe_cmd.Output()) !void { + var parser = config.Parser{ .allocator = alloc.* }; + + _ = try parser.dumpBufLines( + \\# this is a comment + \\key: value + \\key2: + \\ - electric + \\ - boogaloo + \\#comment + \\ + \\ + \\ + \\#commentary + \\ + \\keyzo: > fake string + \\ + \\beezo: + \\ > this string + \\ # comment + \\ | goes on + \\ + \\ > and on + \\ + \\qux: + \\ {my: flow, is: unstoppable} + \\qux: {my: flow, is: unstoppable} + \\qux: [my, flow, is, unstoppable] + \\qux: > {my: flow, is: unstoppable} + \\quux: + \\ [one, two, three] + \\ + \\ + \\foo: + \\ - + \\ bar: + \\ - + \\ + \\ bazinga + \\baz: 1 + \\ + ); +} + +pub fn run(alloc: std.mem.Allocator) !void { + const base = try noclip.commandGroup(alloc, .{ + .description = + \\produce and manipulate collections of lore + \\ + , + }); + defer base.deinitTree(); + + try base.addSubcommand("scribe", try scribe_cmd.createInterface(alloc, scribe, &alloc)); + + try base.execute(); +} diff --git a/src/config.zig b/src/config.zig new file mode 100644 index 0000000..7000e3d --- /dev/null +++ b/src/config.zig @@ -0,0 +1,667 @@ +// Heavily inspired by, but not quite compatible with, NestedText. Key differences: +// +// - Doesn't support multiline keys (this means map keys cannot start with +// ' ', \t, #, {, [, or >, and they cannot contain :) +// - Allows using tabs for indentation (but not mixed tabs/spaces) +// - Indentation must be quantized consistently throughout the document. e.g. +// every nested layer being exactly 2 spaces past its parent. Tabs may +// only use one tab per indentation level. +// - Allows flow-style lists, maps, and strings on the same line as map keys or +// list items (i.e. the following are legal): +// +// key: {inline: map} +// key: [inline, list] +// key: > inline string +// - {map: item} +// - [list, item] +// - > inline string +// +// The string case retains the possibility of having an inline map value starting +// with {, [, or > +// - a map keys and list item dashes must be followed by a value or an indented +// section to reduce parser quantum state. This means that +// +// foo: +// bar: baz +// +// or +// +// - +// - qux +// +// are not valid. This can be represented with an inline empty string after foo: +// +// foo: > +// bar: baz +// +// or +// +// - > +// - qux +// +// - newlines are strictly LF, if the parser finds CR, it is an error +// - trailing spaces are not trimmed (and may cause errors in the future) +// - blank lines may not contain any whitespace characters except the single LF +// - Additional string indicator `|` for soft-wrapped strings, i.e. +// +// key: | this is not special +// key: +// | these lines are +// | soft-wrapped +// +// soft-wrapped lines are joined with a ' ' instead of a newline character. +// Like multiline strings, the final space is stripped (I guess this is a very +// janky way to add trailing whitespace to a string). +// +// - The parser is both strict and probably sloppy and may have weird edge +// cases since I'm slinging code, not writing a spec. + +const std = @import("std"); + +pub const Diagnostics = struct { + row: usize, + span: struct { absolute: usize, line_offset: usize, length: usize }, + message: []const u8, +}; + +pub const LineTokenizer = struct { + buffer: []const u8, + index: usize = 0, + indentation: IndentationType = .immaterial, + last_indent: usize = 0, + diagnostics: *Diagnostics, + + row: usize = 0, + + const Error = error{ + BadToken, + MixedIndentation, + UnquantizedIndentation, + MissingNewline, + TrailingWhitespace, + Impossible, + }; + + const IndentationType = union(enum) { + immaterial: void, + spaces: usize, + tabs: void, + }; + + const InlineItem = union(enum) { + empty: void, + scalar: []const u8, + string: []const u8, + + flow_list: []const u8, + flow_map: []const u8, + }; + + const LineContents = union(enum) { + comment: []const u8, + + in_line: InlineItem, + list_item: InlineItem, + map_item: struct { key: []const u8, val: InlineItem }, + }; + + // we can dedent multiple levels at once. Example: + // + // foo: + // bar: + // > a + // > string + // baz: [qux] + // + // capturing this is conceptually simple, but implementing it without complex + // indentation tracking requires quantizing the indentation. This means our + // IndentationType will also need to track the number of spaces used for + // indentation, as detected. Then every line we have to check indent rem the + // quantization level == 0 (otherwise we broke quantization) and compute indent + // div the quantization level to give us our effective indentation level. + + const ShiftDirection = enum { indent, dedent, none }; + const RelativeIndent = union(ShiftDirection) { + indent: void, + dedent: usize, + none: void, + }; + + const Line = struct { + indent: RelativeIndent, + contents: LineContents, + }; + + pub fn next(self: *LineTokenizer) Error!?Line { + if (self.index == self.buffer.len) return null; + + var indent: usize = 0; + var offset: usize = 0; + + for (self.buffer[self.index..], 0..) |char, idx| { + switch (char) { + ' ' => { + switch (self.indentation) { + // There's a weird coupling here because we can't set this until + // all spaces have been consumed. I also thought about ignoring + // spaces on comment lines since those don't affect the + // relative indent/dedent, but then we would allow comments + // to ignore our indent quantum, which I dislike due to it making + // ugly documents. + .immaterial => self.indentation = .{ .spaces = 0 }, + .spaces => {}, + .tabs => return error.MixedIndentation, + } + indent += 1; + }, + '\t' => { + switch (self.indentation) { + .immaterial => self.indentation = .tabs, + .spaces => return error.MixedIndentation, + .tabs => {}, + } + indent += 1; + }, + '\r' => { + return error.BadToken; + }, + '\n' => { + // don't even emit anything for empty rows. + self.row += 1; + offset = idx + 1; + // if it's too hard to deal with, Just Make It An Error!!! + // an empty line with whitespace on it is garbage. It can mess with + // the indentation detection grossly in a way that is annoying to + // deal with. Besides, having whitespace-only lines in a document + // is essentially terrorism, with which negotiations are famously + // not permitted. + if (indent > 0) return error.TrailingWhitespace; + }, + else => break, + } + } else { + std.debug.assert(self.buffer.len == self.index + indent + offset + 1); + self.index = self.buffer.len; + // this prong will get hit when the document only consists of whitespace + return null; + } + + var quantized: usize = if (self.indentation == .spaces) blk: { + if (self.indentation.spaces == 0) { + self.indentation.spaces = indent; + } + if (@rem(indent, self.indentation.spaces) != 0) + return error.UnquantizedIndentation; + + break :blk @divExact(indent, self.indentation.spaces); + } else indent; + + const relative: RelativeIndent = if (indent > self.last_indent) + .indent + else if (indent < self.last_indent) + .{ .dedent = self.last_indent - quantized } + else + .none; + + offset += indent; + + defer { + self.row += 1; + self.last_indent = quantized; + self.index += offset; + } + + const line = try consumeLine(self.buffer[self.index + offset ..]); + offset += line.len + 1; + + // this should not be possible, as empty lines are caught earlier. + if (line.len == 0) return error.Impossible; + + switch (line[0]) { + '#' => { + // simply lie about indentation when the line is a comment. + quantized = self.last_indent; + return .{ + .indent = .none, + .contents = .{ .comment = line[1..] }, + }; + }, + '|', '>', '[', '{' => { + return .{ + .indent = relative, + .contents = .{ .in_line = try detectInlineItem(line) }, + }; + }, + '-' => { + if (line.len > 1 and line[1] != ' ') return error.BadToken; + + return if (line.len == 1) .{ + .indent = relative, + .contents = .{ .list_item = .empty }, + } else .{ + .indent = relative, + .contents = .{ .list_item = try detectInlineItem(line[2..]) }, + }; + }, + else => { + for (line, 0..) |char, idx| { + if (char == ':') { + if (idx + 1 == line.len) return .{ + .indent = relative, + .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } }, + }; + + if (line[idx + 1] != ' ') return error.BadToken; + + return .{ + .indent = relative, + .contents = .{ .map_item = .{ + .key = line[0..idx], + .val = try detectInlineItem(line[idx + 2 ..]), + } }, + }; + } + } + + return .{ + .indent = relative, + .contents = .{ .in_line = .{ .scalar = line } }, + }; + }, + } + } + + fn detectInlineItem(buf: []const u8) Error!InlineItem { + if (buf.len == 0) return .empty; + + switch (buf[0]) { + '|', '>' => |char| { + if (buf.len > 1 and buf[1] != ' ') return error.BadToken; + + return if (buf.len == 1) .{ + .string = if (char == '|') buf[1..] else buf.ptr[1 .. buf.len + 1], + } else .{ + .string = if (char == '|') buf[2..] else buf.ptr[2 .. buf.len + 1], + }; + }, + '[' => { + if (buf.len < 2 or buf[buf.len - 1] != ']') return error.BadToken; + + return .{ .flow_list = buf[1 .. buf.len - 1] }; + }, + '{' => { + if (buf.len < 2 or buf[buf.len - 1] != '}') return error.BadToken; + + return .{ .flow_map = buf[1 .. buf.len - 1] }; + }, + else => { + return .{ .scalar = buf }; + }, + } + } + + fn consumeLine(buf: []const u8) ![]const u8 { + for (buf, 0..) |char, idx| { + switch (char) { + '\n' => return buf[0..idx], + '\r' => return error.BadToken, + else => {}, + } + } + + return error.MissingNewline; + } +}; + +pub const Parser = struct { + allocator: std.mem.Allocator, + dupe_behavior: DuplicateKeyBehavior = .fail, + default_object: DefaultObject = .fail, + diagnostics: Diagnostics = .{ + .row = 0, + .span = .{ .absolute = 0, .line_offset = 0, .length = 0 }, + .message = "all is well", + }, + + pub const Error = error{ + UnexpectedIndent, + UnexpectedValue, + ExtraContent, + EmptyDocument, + DuplicateKey, + Fail, + } || LineTokenizer.Error || std.mem.Allocator.Error; + + pub const DuplicateKeyBehavior = enum { + use_first, + use_last, + fail, + }; + + pub const DefaultObject = enum { + string, + list, + map, + fail, + }; + + pub const Map = std.StringHashMap; + pub const List = std.ArrayList; + + pub const Value = union(enum) { + string: std.ArrayList(u8), + list: List(Value), + map: Map(Value), + }; + + pub const ParseState = enum { + initial, + value, + done, + }; + + pub const Document = struct { + arena: std.heap.ArenaAllocator, + root: Value, + + fn deinit(self: *Document) void { + self.arena.deinit(); + } + }; + + pub fn parseBuffer(self: *Parser, buffer: []const u8, allocator: std.mem.Allocator) Error!Document { + var document: Document = .{ + .arena = std.heap.ArenaAllocator.init(allocator), + .root = undefined, + }; + errdefer document.deinit(); + const arena_alloc = document.arena.allocator(); + + var state: ParseState = .initial; + var expect_shift: LineTokenizer.ShiftDirection = .none; + var empty_key: ?[]const u8 = null; + var stack = std.ArrayList(*Value).init(arena_alloc); + defer stack.deinit(); + + var tok: LineTokenizer = .{ .buffer = buffer, .diagnostics = &self.diagnostics }; + while (try tok.next()) |line| { + if (line.contents == .comment) continue; + + var flip = true; + var flop = false; + // this is needed to give us a second go round when the line is dedented + flipflop: while (flip) : (flop = true) { + switch (state) { + .initial => { + if (line.indent == .indent) return error.UnexpectedIndent; + + switch (line.contents) { + // we filter out comments above + .comment => unreachable, + .in_line => |in_line| switch (in_line) { + // empty scalars are only emitted for a list_item or a map_item + .empty => unreachable, + .scalar => |str| { + document.root = try valueFromString(arena_alloc, str); + state = .done; + }, + .flow_list => |str| { + document.root = try parseFlowList(arena_alloc, str); + state = .done; + }, + .flow_map => |str| { + document.root = try parseFlowMap(arena_alloc, str); + state = .done; + }, + .string => |str| { + document.root = valueFromString(arena_alloc, str); + // cheesy technique for differentiating the different string types + // if (str[str.len-1] != '\n') try document.root.string.append(' '); + try stack.append(&document.root); + state = .value; + }, + }, + .list_item => |value| { + document.root = .{ .list = List(Value).init(arena_alloc) }; + try stack.append(&document.root); + + switch (value) { + .empty => { + expect_shift = .indent; + state = .value; + }, + .string, .scalar => |str| { + document.root.list.append(try valueFromString(arena_alloc, str)); + state = .value; + }, + .flow_list => |str| { + document.root.list.append(try parseFlowList(arena_alloc, str)); + state = .value; + }, + .flow_map => |str| { + document.root.list.append(try parseFlowMap(arena_alloc, str)); + state = .value; + }, + } + }, + .map_item => |pair| { + document.root = .{ .map = Map(Value).init(arena_alloc) }; + try stack.append(&document.root); + + switch (pair.val) { + .empty => { + expect_shift = .indent; + // If the key is on its own line, we don't have + // an associated value until we parse the next + // line. We need to store a reference to this + // key somewhere until we can consume the + // value. More parser state to lug along. + + empty_key = pair.key; + state = .value; + }, + .string, .scalar => |str| { + // we can do direct puts here because this is + // the very first line of the document + document.root.map.put(pair.key, try valueFromString(arena_alloc, str)); + state = .value; + }, + .flow_list => |str| { + document.root.map.put(pair.key, try parseFlowList(arena_alloc, str)); + state = .value; + }, + .flow_map => |str| { + document.root.map.put(pair.key, try parseFlowMap(arena_alloc, str)); + state = .value; + }, + } + }, + } + }, + .value => switch (stack.getLast()) { + .string => |*string| { + if (line.indent == .indent) return error.UnexpectedIndent; + if (!flop and line.indent == .dedent) { + // TODO: remove final newline or trailing space here + + var dedent_depth = line.indent.dedent; + while (dedent_depth > 0) : (dedent_depth -= 1) + _ = stack.pop(); + + continue :flipflop; + } + + switch (line.contents) { + .comment => unreachable, + .in_line => |in_line| switch (in_line) { + .empty => unreachable, + .string => |str| try string.appendSlice(str), + else => return error.UnexpectedValue, + }, + else => return error.UnexpectedValue, + } + }, + .list => |*list| { + if (expect_shift == .indent and line.indent != .indent) return error.ExpectedIndent; + // Consider: + // + // - + // own-line scalar + // - inline scalar + // + // the own-line scalar will not push the stack but the next list item will be a dedent + if (!flop and line.indent == .dedent) { + // if line.indent.dedent is 1 and we're expecting it, the stack will not be popped, + // but we will continue loop flipflop. However, flop will be set to false on the next + // trip, so this if prong will not be run again. + var dedent_depth = line.indent.dedent - @intFromBool(expect_shift == .dedent); + + while (dedent_depth > 0) : (dedent_depth -= 1) + _ = stack.pop(); + + continue :flipflop; + } + + switch (line.contents) { + .comment => unreachable, + .in_line => |in_line| { + // assert that this line has been indented. this is required for an inline value when + // the stack is in list mode. + if (line.indent != .indent) return error.UnexpectedValue; + + switch (in_line) { + .empty => unreachable, + .scalar => |str| { + expect_shift = .dedent; + try list.append(try valueFromString(arena_alloc, str)); + }, + .string => |str| { + // string pushes the stack + expect_shift = .none; + list.append(try valueFromString(arena_alloc, str)); + try stack.append(&list.items[list.items.len - 1]); + }, + .flow_list => |str| { + expect_shift = .dedent; + try list.append(try parseFlowList(arena_alloc, str)); + }, + .flow_map => |str| { + expect_shift = .dedent; + try list.append(try parseFlowList(arena_alloc, str)); + }, + } + }, + .list_item => |value| switch (value) { + .empty => expect_shift = .indent, + .scalar, .string => |str| list.append(try valueFromString(arena_alloc, str)), + .flow_list => |str| try list.append(try parseFlowList(arena_alloc, str)), + .flow_map => |str| try list.append(try parseFlowList(arena_alloc, str)), + }, + .map_item => return error.UnexpectedValue, + } + }, + .map => { + if (expect_shift and line.indent != .indent) return error.ExpectedIndent; + if (!flop and line.indent == .dedent) { + _ = stack.pop(); + continue :flipflop; + } + }, + }, + .done => return error.ExtraContent, + } + } + + // this is specifically performed at the end of the loop body so that + // `continue :flipflop` skips setting it. + flip = false; + } + + switch (state) { + .initial => switch (self.default_object) { + .string => document.root = .{ .string = std.ArrayList(u8).init(arena_alloc) }, + .list => document.root = .{ .list = List(Value).init(arena_alloc) }, + .map => document.root = .{ .map = Map(Value).init(arena_alloc) }, + .fail => return error.EmptyDocument, + }, + .value => switch (stack.getLast()) { + .string => { + // remove final character (newline or trailing space) + }, + .list => {}, + .map => {}, + }, + .done => {}, + } + + return document; + } + + fn valueFromString(alloc: std.mem.Allocator, buffer: []const u8) Error!Value { + var result: Value = .{ .string = try std.ArrayList(u8).initCapacity(alloc, buffer.len) }; + result.string.appendSliceAssumeCapacity(buffer); + return result; + } + + fn parseFlowList(alloc: std.mem.Allocator, contents: []const u8) Error!Value { + var result: Value = .{ .list = List(Value).init(alloc) }; + + var splitter = std.mem.splitScalar(u8, contents, ','); + while (splitter.next()) |chunk| { + try result.list.append( + try valueFromString(alloc, std.mem.trim(u8, chunk, " ")), + ); + } + + return result; + } + + fn parseFlowMap(alloc: std.mem.Allocator, contents: []const u8) Error!Value { + _ = contents; + var result: Value = .{ .map = Map(Value).init(alloc) }; + var splitter = std.mem.splitScalar() + } + + pub fn dumpBufLines(self: *Parser, buf: []const u8) Error!void { + var tok: LineTokenizer = .{ .buffer = buf, .diagnostics = &self.diagnostics }; + while (try tok.next()) |line| { + dumpLine(line); + } + } + + fn dumpLine(line: LineTokenizer.Line) void { + var dedbuf: [64]u8 = .{0} ** 64; + var keybuf: [2048]u8 = .{0} ** 2048; + var valbuf: [2048]u8 = .{0} ** 2048; + + const shiftstr = if (line.indent == .dedent) + std.fmt.bufPrint(&dedbuf, " ({d})", .{line.indent.dedent}) catch unreachable + else + ""; + + std.debug.print("{s}{s}: {s} => {s}\n", .{ + @tagName(line.indent), shiftstr, @tagName(line.contents), switch (line.contents) { + .comment => |str| str, + .in_line, .list_item => |scalar| switch (scalar) { + .empty => "[empty]", + .scalar, + .string, + .flow_list, + .flow_map, + => |str| std.fmt.bufPrint(&keybuf, "{s} => {s}", .{ @tagName(scalar), str }) catch unreachable, + }, + .map_item => |map| std.fmt.bufPrint(&keybuf, "{s} : {s}", .{ + map.key, + switch (map.val) { + .empty => "[empty]", + .scalar, + .string, + .flow_list, + .flow_map, + => |str| std.fmt.bufPrint(&valbuf, "{s} => {s}", .{ @tagName(map.val), str }) catch unreachable, + }, + }) catch unreachable, + }, + }); + } +}; diff --git a/src/main.zig b/src/main.zig new file mode 100644 index 0000000..8800cb1 --- /dev/null +++ b/src/main.zig @@ -0,0 +1,14 @@ +const std = @import("std"); + +const cli = @import("./cli.zig"); + +pub fn main() !void { + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer { + if (gpa.deinit() != .ok) { + // do something maybe? idk + } + } + + try cli.run(gpa.allocator()); +}