consider a tokenizer
This commit is contained in:
parent
11b7d3e06b
commit
5746fbbd5e
41
source/mem.zig
Normal file
41
source/mem.zig
Normal file
@ -0,0 +1,41 @@
|
||||
/// Result of splitting a string at a separator: `lhs` holds everything
/// before the separator and `rhs` everything after it, or null when the
/// separator was not found. Both halves alias the input string.
pub const Partition = struct { lhs: []const u8, rhs: ?[]const u8 };

/// A single Unicode codepoint (U+0000..U+10FFFF fits in 21 bits).
pub const Codepoint = u21;

/// Split `str` at the first occurrence of `char`. The separator byte is
/// excluded from both halves. When `char` never occurs, the entire input
/// becomes `lhs` and `rhs` is null.
pub fn partition(str: []const u8, char: u8) Partition {
    const split = std.mem.indexOfScalar(u8, str, char) orelse
        return .{ .lhs = str, .rhs = null };
    return .{ .lhs = str[0..split], .rhs = str[split + 1 ..] };
}
|
||||
|
||||
/// A minimal forward-only iterator over an immutable slice of `T`.
pub fn SliceIter(comptime T: type) type {
    return struct {
        slice: []const T,
        index: usize = 0,

        const Self = @This();

        /// Return the current element and advance the cursor, or null when
        /// the iterator is exhausted. The cursor advances (saturating) even
        /// on an exhausted iterator, so repeated calls keep returning null
        /// without any risk of overflow.
        pub fn pop(self: *Self) ?T {
            const current = self.peek();
            self.index +|= 1;
            return current;
        }

        /// Return the current element without advancing, or null when the
        /// cursor has moved past the end of the slice.
        pub fn peek(self: *Self) ?T {
            return if (self.index < self.slice.len) self.slice[self.index] else null;
        }
    };
}
|
||||
|
||||
/// Encode the comptime-known codepoint `short` as UTF-8 and return the
/// byte sequence as a slice. Because the whole body runs at comptime, the
/// returned slice references comptime-interned constant data; the caller
/// does not own or free it.
///
/// Invalid codepoints (surrogates) trigger the explicit @compileError;
/// out-of-range `u21` values (> 0x10FFFF) hit the `catch unreachable` on
/// the length lookup, which at comptime also fails compilation.
pub fn encodeShort(comptime short: Codepoint) []const u8 {
    comptime {
        const encoded = enc: {
            // Exact sequence length (1-4 bytes) so the buffer is sized tightly.
            const len = std.unicode.utf8CodepointSequenceLength(short) catch unreachable;
            var buf: [len]u8 = undefined;
            _ = std.unicode.utf8Encode(short, &buf) catch @compileError("invalid unicode character");
            break :enc buf;
        };
        // `encoded` is comptime-known, so slicing it yields a pointer into
        // constant data that is safe to return.
        return encoded[0..];
    }
}
|
||||
|
||||
const std = @import("std");
|
345
source/tokenizer.zig
Normal file
345
source/tokenizer.zig
Normal file
@ -0,0 +1,345 @@
|
||||
/// Static description of a (sub)command's accepted parameters, used to
/// drive tokenization. The maps are comptime string maps, so a
/// TokenContext tree is normally constructed at comptime.
pub const TokenContext = struct {
    /// When true, the tokenizer does NOT treat a bare "--" as the
    /// end-of-options marker; it falls through to regular option handling.
    forward_ddash: bool = false,
    /// Short (single-codepoint) options, keyed by the codepoint's UTF-8 bytes.
    short: Options,
    /// Long options, keyed by name (without the leading "--").
    long: Options,
    /// Names of the positional arguments this command accepts.
    positional: []const []const u8,
    /// Child commands, keyed by subcommand name.
    subcommands: Subcommands,

    pub const Options = std.StaticStringMap(OptionContext);
    pub const Subcommands = std.StaticStringMap(*const TokenContext);

    pub const OptionContext = struct {
        /// Depth of the ancestor command that actually declared this option;
        /// `.none` means the option is local to the command whose map
        /// contains this entry.
        global: NestLevel = .none,
        /// True when the option consumes a value argument.
        value: bool,
    };

    /// Depth of a command in the subcommand chain. `root` is depth 0 and
    /// `none` is a maxInt sentinel meaning "not a global option".
    pub const NestLevel = enum(usize) {
        root = 0,
        none = std.math.maxInt(usize),
        _,

        /// Wrap a raw depth as a NestLevel.
        pub fn wrap(lv: usize) NestLevel {
            return @enumFromInt(lv);
        }
        /// One level deeper. Note that incrementing `.none` overflows in
        /// safe builds; callers are expected not to increment the sentinel.
        pub fn incr(self: NestLevel) NestLevel {
            return wrap(self.unwrap() + 1);
        }
        /// Recover the raw depth from the enum.
        pub fn unwrap(self: NestLevel) usize {
            return @intFromEnum(self);
        }
    };
};
|
||||
|
||||
/// One lexical element of a parsed command line. Slice payloads reference
/// the original argument vector and are not owned by the token, so the
/// argv memory must outlive the token list.
pub const Token = union(enum) {
    doubledash,
    short: u21,
    long: []const u8,
    shortvalue: struct { name: u21, value: []const u8 },
    longvalue: struct { name: []const u8, value: []const u8 },
    value: []const u8,
    subcommand: []const u8,

    /// Print a human-readable description of the token to stderr
    /// (debugging aid only; no allocation).
    pub fn dump(self: Token) void {
        switch (self) {
            .doubledash => std.debug.print("'--'\n", .{}),
            .short => |val| std.debug.print(".short => '{u}'\n", .{val}),
            .long => |val| std.debug.print(".long => \"{s}\"\n", .{val}),
            .shortvalue => |val| std.debug.print(".shortvalue => '{u}': \"{s}\"\n", .{ val.name, val.value }),
            // Fixed: this arm previously printed the wrong tag label
            // (".shortvalue") for .longvalue tokens.
            .longvalue => |val| std.debug.print(".longvalue => {s}: \"{s}\"\n", .{ val.name, val.value }),
            .value => |val| std.debug.print(".value => \"{s}\"\n", .{val}),
            .subcommand => |val| std.debug.print(".subcommand => \"{s}\"\n", .{val}),
        }
    }
};
|
||||
|
||||
/// Accumulates tokens while tracking, per nesting level, where each
/// (sub)command's token run ends, so that "global" options can be inserted
/// into the region of the ancestor command that declared them.
const Assembler = struct {
    // this underallocates if fused short args are used and overallocates when
    // values are stored in a separate arg. it probably overallocates on
    // average, but we correct by growing it when fused arguments are
    // encountered, so it always overallocates
    tokens: std.ArrayListUnmanaged(Token) = .empty,
    // this overallocates in every case except the case where every argument is
    // a subcommand. There is no reason to change this after the initial
    // allocation.
    indices: [*]usize,
    /// Number of commands pushed so far (used entries of `indices`).
    len: usize,
    /// Allocated capacity of `indices`, remembered for deinit.
    cap: usize,

    /// Allocate room for `cap` tokens and `cap` command indices (one index
    /// per argv entry covers the worst all-subcommands case).
    fn init(alloc: std.mem.Allocator, cap: usize) !Assembler {
        const idx = try alloc.alloc(usize, cap);
        // free the index buffer if the token-list allocation below fails
        errdefer alloc.free(idx);
        return .{
            .tokens = try .initCapacity(alloc, cap),
            .indices = idx.ptr,
            .len = 0,
            .cap = cap,
        };
    }

    /// Grow only the token buffer; the index buffer never needs to grow
    /// (see the field comments above).
    fn addCapacity(self: *Assembler, alloc: std.mem.Allocator, extra: usize) !void {
        try self.tokens.ensureTotalCapacity(alloc, self.tokens.capacity + extra);
    }

    fn deinit(self: *Assembler, alloc: std.mem.Allocator) void {
        alloc.free(self.indices[0..self.cap]);
        self.tokens.deinit(alloc);
    }

    /// Hand ownership of the token list to the caller. `toOwnedSlice`
    /// resets the list, so a later `deinit` remains safe.
    fn finish(self: *Assembler, alloc: std.mem.Allocator) ![]const Token {
        return try self.tokens.toOwnedSlice(alloc);
    }

    /// Open a new command region whose insertion point starts at the
    /// current end of the token list.
    fn pushCommand(self: *Assembler) void {
        self.indices[self.len] = self.tokens.items.len;
        self.len += 1;
    }

    /// Append `tok` at the insertion point of the innermost command and
    /// advance that insertion point.
    fn append(self: *Assembler, tok: Token) void {
        self.tokens.insertAssumeCapacity(self.indices[self.len - 1], tok);
        self.indices[self.len - 1] += 1;
    }

    /// Insert `tok` into the region of the command at depth `level` (used
    /// for global options), shifting the insertion points of that command
    /// and every deeper one. `.none` means "not global": append to the
    /// innermost command instead.
    fn insert(self: *Assembler, level: TokenContext.NestLevel, tok: Token) void {
        if (level == .none) {
            self.append(tok);
            return;
        }

        // a global option can only reference a command we have entered
        std.debug.assert(level.unwrap() < self.len);
        self.tokens.insertAssumeCapacity(self.indices[level.unwrap()], tok);
        for (level.unwrap()..self.len) |idx| {
            self.indices[idx] += 1;
        }
    }
};
|
||||
|
||||
// This tokenizer is very sloppy; it will happily create tokens that
|
||||
// mismatch the details of the TokenContext it has (e.g. it may produce a .short
|
||||
// token without a value even if the context indicates that flag must produce a
|
||||
// .shortvalue token). There are two reasons for this approach: the first is
|
||||
// that tokenization is the wrong place to get persnickety about these details;
|
||||
// the parser has a lot more context that it can use to produce useful errors
|
||||
// when the token type mismatches its expectation. The second reason is that it
|
||||
// allows us to use the tokenizer in situations where incomplete or incorrect
|
||||
// input is expected and we want to get partial results, e.g. for an incomplete
|
||||
// command line asking for completion options. Theoretically, the only true
|
||||
// failure modes that the tokenizer can experience are allocation failures (OOM)
|
||||
// and utf-8 decode failures.
|
||||
//
|
||||
// This is also the piece of code responsible for associating global parameters
|
||||
// with the command that declares them. It is possible to do that here because
|
||||
// the Parser guarantees that global parameters cannot be shadowed. This does
|
||||
// generally make the true original order of the command line impossible to
|
||||
// recover, although this could be rectified by keeping an index of when the
|
||||
// token was actually encountered. Rearranging the globals here saves the need
|
||||
// for a two-pass parsing strategy (though creating the tokens and then actually
|
||||
// iterating the tokens is two passes, no parsing has to be done on the tokens,
|
||||
// only value conversion).
|
||||
//
|
||||
// The produced list of tokens store references to the data contained in the
|
||||
// provided argument vector. That is, the tokens do not own all of their memory,
|
||||
// so the argument vector must be kept allocated until the end of the lifetime
|
||||
// of the list of tokens.
|
||||
/// Tokenize `argv` against the command tree rooted at `tokctx`. The caller
/// owns the returned slice (free with `alloc`); the tokens reference the
/// argv memory, which must stay alive as long as the tokens do.
///
/// Internally this is a small state machine over three modes:
///   .any     - normal scanning: "--", long options, short/fused options,
///              or fall through to .ordered for anything else
///   .fused   - expand one "-xyz" argument codepoint by codepoint
///   .ordered - subcommand/positional handling (also everything after "--")
/// `continue :mod .fused` / `.ordered` re-dispatch the CURRENT argument
/// under another case without changing `mode` itself, so the one-shot
/// handling returns to `.any` for the next argument automatically.
pub fn tokenize(alloc: std.mem.Allocator, tokctx: *const TokenContext, argv: []const []const u8) ![]const Token {
    var assembler: Assembler = try .init(alloc, argv.len);
    defer assembler.deinit(alloc);
    // open the root command's token region
    assembler.pushCommand();

    var cmdctx: *const TokenContext = tokctx;
    var mode: enum { any, fused, ordered } = .any;
    var argit: mem.SliceIter([]const u8) = .{ .slice = argv };

    while (argit.pop()) |arg| {
        mod: switch (mode) {
            .any => if (std.mem.eql(u8, arg, "--") and !cmdctx.forward_ddash) {
                // NOTE(review): no .doubledash token is emitted here, so
                // that Token variant is currently never produced — confirm
                // intended. Also, with forward_ddash set, a bare "--" falls
                // into the long-option branch below as an empty .long.
                mode = .ordered;
            } else if (std.mem.startsWith(u8, arg, "--")) {
                // split "--name=value" into name and optional value
                const part = mem.partition(arg[2..], '=');
                if (part.rhs) |val| rhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        assembler.insert(optctx.global, .{
                            .longvalue = .{ .name = part.lhs, .value = val },
                        });
                        break :rhs;
                    }
                    // unknown option: keep the token anyway (sloppy by design,
                    // see the comment above this function)
                    assembler.append(
                        .{ .longvalue = .{ .name = part.lhs, .value = val } },
                    );
                } else norhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        if (optctx.value) {
                            // value-taking option without "=": consume the
                            // next argument verbatim, if there is one
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .longvalue = .{ .name = part.lhs, .value = val },
                                });
                                break :norhs;
                            }
                        }
                        assembler.insert(optctx.global, .{ .long = part.lhs });
                        break :norhs;
                    }
                    assembler.append(.{ .long = part.lhs });
                }
            } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) {
                // fused shorts may expand to more tokens than the one argv
                // slot budgeted in init; grow the token buffer up front
                const cpcount = try std.unicode.utf8CountCodepoints(arg[1..]);
                if (cpcount > 1)
                    try assembler.addCapacity(alloc, cpcount);
                continue :mod .fused;
            } else {
                // bare "-" or any non-option argument
                continue :mod .ordered;
            },
            .fused => {
                var iter: std.unicode.Utf8Iterator = .{ .bytes = arg[1..], .i = 0 };
                u8i: while (iter.nextCodepointSlice()) |cps| {
                    // cps is valid UTF-8 (produced by the iterator), so
                    // decoding cannot fail
                    const codepoint = std.unicode.utf8Decode(cps) catch unreachable;
                    if (cmdctx.short.get(cps)) |optctx| {
                        // only the LAST flag of a fuse may capture the next
                        // argument as its value
                        if (optctx.value and iter.peek(1).len == 0) {
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .shortvalue = .{ .name = codepoint, .value = val },
                                });
                                continue :u8i;
                            }
                        }
                        assembler.insert(optctx.global, .{
                            .short = codepoint,
                        });
                        continue :u8i;
                    }
                    // unknown short flag: keep it as a local token
                    assembler.append(.{ .short = codepoint });
                }
            },
            .ordered => if (cmdctx.subcommands.get(arg)) |scmd| {
                // descend into the subcommand: resume option parsing and
                // open a new token region for it
                mode = .any;
                cmdctx = scmd;
                assembler.pushCommand();
                assembler.append(.{ .subcommand = arg });
            } else {
                assembler.append(.{ .value = arg });
            },
        }
    }
    return try assembler.finish(alloc);
}
|
||||
|
||||
const std = @import("std");
|
||||
const mem = @import("./mem.zig");
|
||||
|
||||
/// Build the test fixture: a root command with shorts a/b/c/d and one
/// subcommand ("subcommand-a") that re-exports c/d and "global-a" as
/// globals of the root (NestLevel 0). Intended for comptime evaluation
/// (the test below calls it via `comptime makeContext()`), so the returned
/// pointer references comptime-interned constant data.
fn makeContext() *const TokenContext {
    const ToC = TokenContext.OptionContext;
    const Nl = TokenContext.NestLevel;

    // child command: local flags z/y/x/w plus the parent's globals
    const childa: TokenContext = .{
        .short = .initComptime(&.{
            .{ "z", ToC{ .global = .none, .value = false } },
            .{ "y", ToC{ .global = .none, .value = true } },
            .{ "x", ToC{ .global = .none, .value = false } },
            .{ "w", ToC{ .global = .none, .value = true } },
            // these are provided by the parent
            .{ "c", ToC{ .global = Nl.wrap(0), .value = false } },
            .{ "d", ToC{ .global = Nl.wrap(0), .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-z", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = Nl.wrap(0), .value = false } },
        }),
        .positional = &.{ "argument-z", "argument-y" },
        .subcommands = .initComptime(&.{}),
    };

    // root command
    const ctx: TokenContext = .{
        .short = .initComptime(&.{
            .{ "a", ToC{ .global = .none, .value = false } },
            .{ "b", ToC{ .global = .none, .value = true } },
            // global arguments are not global on the command that defines them
            .{ "c", ToC{ .global = .none, .value = false } },
            .{ "d", ToC{ .global = .none, .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-a", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = .none, .value = false } },
        }),
        .positional = &.{},
        .subcommands = .initComptime(&.{
            .{ "subcommand-a", &childa },
        }),
    };

    return &ctx;
}
|
||||
|
||||
test "tokenize" {
    const alloc = std.testing.allocator;
    // built at comptime so the returned pointer references interned constants
    const context = comptime makeContext();

    {
        // fused shorts expand into one .short token per flag
        const tokens = try tokenize(alloc, context, &.{"-abc"});
        defer alloc.free(tokens);

        try std.testing.expectEqual(3, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[2]);
    }

    {
        // a value-taking flag at the END of a fuse consumes the next argument
        const tokens = try tokenize(alloc, context, &.{ "-abd", "dee" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(3, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
    }

    {
        // a value-taking flag NOT in final position ('b') stays a bare
        // .short; the following argument becomes a positional .value instead
        const tokens = try tokenize(alloc, context, &.{ "-cba", "dee" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(4, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[2]);
        try std.testing.expectEqual(Token{ .value = "dee" }, tokens[3]);
    }

    {
        // value capture is verbatim: "-zyx" is consumed as -d's value even
        // though it looks like more flags
        const tokens = try tokenize(alloc, context, &.{ "-acb", "dee", "-d", "-zyx" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(4, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'b', .value = "dee" } }, tokens[2]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "-zyx" } }, tokens[3]);
    }

    {
        // global options (-d and --global-a, declared global by the child)
        // encountered AFTER the subcommand are hoisted into the root
        // command's token region, i.e. before the .subcommand token
        const tokens = try tokenize(alloc, context, &.{ "-a", "-c", "subcommand-a", "-d", "dee", "-zyx", "--global-a" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(8, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
        // expectEqualDeep compares the slice contents rather than pointers
        try std.testing.expectEqualDeep(Token{ .long = "global-a" }, tokens[3]);
        try std.testing.expectEqual(Token{ .subcommand = "subcommand-a" }, tokens[4]);
        try std.testing.expectEqual(Token{ .short = 'z' }, tokens[5]);
        try std.testing.expectEqual(Token{ .short = 'y' }, tokens[6]);
        try std.testing.expectEqual(Token{ .short = 'x' }, tokens[7]);
    }
}
|
||||
|
||||
// parameter styles to accept:
|
||||
// --name value
|
||||
// --name=value
|
||||
// -n value
|
||||
// -fused (parsed as -f -u -s -e -d)
|
||||
// -fused value (parsed as -f -u -s -e -d value)
|
||||
// ordered
|
||||
// a named parameter can only take zero or one values. Establish a convention for compound values:
|
||||
// --name val1,val2
|
||||
// --name=val1,val2
|
||||
// --name="val1, val2" (probably should not consume whitespace, since the user has to go out of their way to do quoting for it)
|
||||
// --name key1=val1,key2=val2
|
||||
// --name=key1=val1,key2=val2 (should be familiar from docker)
|
Loading…
x
Reference in New Issue
Block a user