/// Lookup tables describing a single command, consulted by `tokenize` while it
/// scans an argument vector.
pub const TokenContext = struct {
    /// When false, a bare `--` argument switches the tokenizer into ordered
    /// mode. When true, `--` is not treated specially by that check.
    forward_ddash: bool = false,
    /// Short options, keyed by the UTF-8 encoding of a single codepoint.
    short: Options,
    /// Long options, keyed by name without the leading `--`.
    long: Options,
    /// Names of the positional arguments of this command.
    positional: []const []const u8,
    /// Contexts of the subcommands of this command, keyed by subcommand name.
    subcommands: Subcommands,

    pub const Options = std.StaticStringMap(OptionContext);
    pub const Subcommands = std.StaticStringMap(*const TokenContext);

    pub const OptionContext = struct {
        /// Index of the command whose token region receives this option, or
        /// `.none` when the option stays with the command being tokenized.
        global: NestLevel = .none,
        /// Whether the option consumes a value.
        value: bool,
    };

    /// Index into the stack of commands seen so far. `.root` is the outermost
    /// command and `.none` is a sentinel meaning "no level".
    pub const NestLevel = enum(usize) {
        root = 0,
        none = std.math.maxInt(usize),
        _,

        /// Convert a raw index into a `NestLevel`.
        pub fn wrap(lv: usize) NestLevel {
            return @enumFromInt(lv);
        }

        /// Return the next deeper level. Must not be called on `.none`, whose
        /// raw value is already the largest `usize`.
        pub fn incr(self: NestLevel) NestLevel {
            return wrap(self.unwrap() + 1);
        }

        /// Convert a `NestLevel` back into its raw index.
        pub fn unwrap(self: NestLevel) usize {
            return @intFromEnum(self);
        }
    };
};

/// One lexical element of a command line. String payloads are slices; the
/// tokenizer stores slices of the argument vector in them without copying.
pub const Token = union(enum) {
    doubledash,
    short: u21,
    long: []const u8,
    shortvalue: struct { name: u21, value: []const u8 },
    longvalue: struct { name: []const u8, value: []const u8 },
    value: []const u8,
    subcommand: []const u8,

    /// Print a one-line, human-readable description of the token using
    /// `std.debug.print`.
    pub fn dump(self: Token) void {
        switch (self) {
            .doubledash => std.debug.print("'--'\n", .{}),
            .short => |val| std.debug.print(".short => '{u}'\n", .{val}),
            .long => |val| std.debug.print(".long => \"{s}\"\n", .{val}),
            .shortvalue => |val| std.debug.print(".shortvalue => '{u}': \"{s}\"\n", .{ val.name, val.value }),
            // This previously printed the ".shortvalue" label for long options.
            .longvalue => |val| std.debug.print(".longvalue => {s}: \"{s}\"\n", .{ val.name, val.value }),
            .value => |val| std.debug.print(".value => \"{s}\"\n", .{val}),
            .subcommand => |val| std.debug.print(".subcommand => \"{s}\"\n", .{val}),
        }
    }
};

/// Builds the token list while keeping, for every command seen so far, the
/// position at which the next token belonging to that command is inserted.
const Assembler = struct {
    // this underallocates if fused short args are used and overallocates when
    // values are stored in a separate arg. it probably overallocates on
    // average, but we correct by growing it when fused arguments are
    // encountered, so it always overallocates
    tokens: std.ArrayListUnmanaged(Token) = .empty,
    // One insertion index per command. `init` reserves one slot for the root
    // command plus one per argument, which covers the worst case where every
    // argument is a subcommand. There is no reason to change this after the
    // initial allocation.
    indices: [*]usize,
    /// Number of commands pushed so far.
    len: usize,
    /// Number of slots allocated behind `indices`.
    cap: usize,

    /// Reserve room for `cap` tokens and `cap + 1` command slots.
    /// Fails only if an allocation fails.
    fn init(alloc: std.mem.Allocator, cap: usize) !Assembler {
        // The root command needs a slot even when there are no arguments, and
        // each argument can open at most one further command.
        const slots = cap + 1;
        const idx = try alloc.alloc(usize, slots);
        errdefer alloc.free(idx);

        return .{
            .tokens = try .initCapacity(alloc, cap),
            .indices = idx.ptr,
            .len = 0,
            .cap = slots,
        };
    }

    /// Grow the token capacity by at least `extra` beyond the current capacity.
    fn addCapacity(self: *Assembler, alloc: std.mem.Allocator, extra: usize) !void {
        try self.tokens.ensureTotalCapacity(alloc, self.tokens.capacity + extra);
    }

    /// Release the index slots and whatever token storage is still owned.
    fn deinit(self: *Assembler, alloc: std.mem.Allocator) void {
        alloc.free(self.indices[0..self.cap]);
        self.tokens.deinit(alloc);
    }

    /// Hand the assembled tokens to the caller, who frees them with `alloc`.
    /// `deinit` must still be called afterwards to release the index slots.
    fn finish(self: *Assembler, alloc: std.mem.Allocator) ![]const Token {
        return try self.tokens.toOwnedSlice(alloc);
    }

    /// Open a new command whose tokens start at the current end of the list.
    fn pushCommand(self: *Assembler) void {
        // `indices` is a many-item pointer, so this is the only bounds check.
        std.debug.assert(self.len < self.cap);
        self.indices[self.len] = self.tokens.items.len;
        self.len += 1;
    }

    /// Add `tok` to the most recently pushed command. Requires that a command
    /// has been pushed and that spare token capacity exists.
    fn append(self: *Assembler, tok: Token) void {
        self.tokens.insertAssumeCapacity(self.indices[self.len - 1], tok);
        self.indices[self.len - 1] += 1;
    }

    /// Add `tok` to the command at `level`, or to the most recently pushed
    /// command when `level` is `.none`.
    fn insert(self: *Assembler, level: TokenContext.NestLevel, tok: Token) void {
        if (level == .none) {
            self.append(tok);
            return;
        }

        std.debug.assert(level.unwrap() < self.len);
        self.tokens.insertAssumeCapacity(self.indices[level.unwrap()], tok);
        // Every command at or below `level` now starts one token later.
        for (level.unwrap()..self.len) |idx| {
            self.indices[idx] += 1;
        }
    }
};

// This tokenizer is very sloppy; it will happily create tokens that mismatch
// the details of the TokenContext it has (e.g. it may produce a .short token
// without a value even if the context indicates that flag must produce a
// .shortvalue token). There are two reasons for this approach: the first is
// that tokenization is the wrong place to get persnickety about these details;
// the parser has a lot more context that it can use to produce useful errors
// when the token type mismatches its expectation. The second reason is that it
// allows us to use the tokenizer in situations where incomplete or incorrect
// input is expected and we want to get partial results, e.g. for an incomplete
// command line asking for completion options. Theoretically, the only true
// failure modes that the tokenizer can experience are allocation failures (OOM)
// and utf-8 decode failures.
//
// This is also the piece of code responsible for associating global parameters
// with the command that declares them. It is possible to do that here because
// the Parser guarantees that global parameters cannot be shadowed. This does
// generally make the true original order of the command line impossible to
// recover, although this could be rectified by keeping an index of when the
// token was actually encountered. Rearranging the globals here saves the need
// for a two-pass parsing strategy (though creating the tokens and then actually
// iterating the tokens is two passes, no parsing has to be done on the tokens,
// only value conversion).
//
// The produced list of tokens stores references to the data contained in the
// provided argument vector. That is, the tokens do not own all of their memory,
// so the argument vector must be kept allocated until the end of the lifetime
// of the list of tokens.
/// Split `argv` into a flat list of `Token`s using the option and subcommand
/// tables reachable from `tokctx`.
///
/// Options whose context names a command level are placed in that command's
/// token region rather than at the position where they were written. Token
/// payloads are slices of `argv`, so `argv` must outlive the result. The
/// caller owns the returned slice and frees it with `alloc`.
///
/// Fails on allocation failure or when a short-option cluster is not valid
/// UTF-8.
pub fn tokenize(alloc: std.mem.Allocator, tokctx: *const TokenContext, argv: []const []const u8) ![]const Token {
    var assembler = try Assembler.init(alloc, argv.len);
    defer assembler.deinit(alloc);
    // The root command owns the first token region.
    assembler.pushCommand();

    var active: *const TokenContext = tokctx;
    var mode: enum { any, fused, ordered } = .any;
    var args: mem.SliceIter([]const u8) = .{ .slice = argv };

    while (args.pop()) |arg| {
        dispatch: switch (mode) {
            .any => {
                // NOTE(review): when forward_ddash is set, "--" falls through
                // to the long-option branch below and is presumably tokenized
                // as a long option with an empty name; `Token.doubledash` is
                // never produced here. Confirm this is intended.
                if (!active.forward_ddash and std.mem.eql(u8, arg, "--")) {
                    mode = .ordered;
                } else if (std.mem.startsWith(u8, arg, "--")) {
                    const part = mem.partition(arg[2..], '=');
                    const known = active.long.get(part.lhs);

                    // An inline `=value` always wins. Otherwise a known option
                    // that takes a value consumes the next argument, if any.
                    const token: Token = tok: {
                        if (part.rhs) |val| {
                            break :tok .{ .longvalue = .{ .name = part.lhs, .value = val } };
                        }
                        if (known) |optctx| {
                            if (optctx.value) {
                                if (args.pop()) |val| {
                                    break :tok .{ .longvalue = .{ .name = part.lhs, .value = val } };
                                }
                            }
                        }
                        break :tok .{ .long = part.lhs };
                    };

                    if (known) |optctx| {
                        assembler.insert(optctx.global, token);
                    } else {
                        assembler.append(token);
                    }
                } else if (arg.len > 1 and std.mem.startsWith(u8, arg, "-")) {
                    // A cluster of N short flags yields up to N tokens from a
                    // single argument, so reserve room before emitting them.
                    const cpcount = try std.unicode.utf8CountCodepoints(arg[1..]);
                    if (cpcount > 1) try assembler.addCapacity(alloc, cpcount);
                    continue :dispatch .fused;
                } else {
                    continue :dispatch .ordered;
                }
            },
            .fused => {
                var cluster: std.unicode.Utf8Iterator = .{ .bytes = arg[1..], .i = 0 };
                chars: while (cluster.nextCodepointSlice()) |encoded| {
                    // The cluster was validated by utf8CountCodepoints before
                    // this prong was entered.
                    const name = std.unicode.utf8Decode(encoded) catch unreachable;

                    const optctx = active.short.get(encoded) orelse {
                        assembler.append(.{ .short = name });
                        continue :chars;
                    };

                    // Only the final flag of a cluster may take a value, and
                    // only from the following argument.
                    if (optctx.value and cluster.peek(1).len == 0) {
                        if (args.pop()) |val| {
                            assembler.insert(optctx.global, .{
                                .shortvalue = .{ .name = name, .value = val },
                            });
                            continue :chars;
                        }
                    }

                    assembler.insert(optctx.global, .{ .short = name });
                }
            },
            .ordered => {
                if (active.subcommands.get(arg)) |child| {
                    // Entering a subcommand re-enables option parsing and
                    // opens a new token region that starts with its name.
                    mode = .any;
                    active = child;
                    assembler.pushCommand();
                    assembler.append(.{ .subcommand = arg });
                } else {
                    assembler.append(.{ .value = arg });
                }
            },
        }
    }

    return try assembler.finish(alloc);
}

const std = @import("std");
const mem = @import("./mem.zig");

/// Build the two-level context used by the tests: a root command with one
/// subcommand, "subcommand-a", that can also see the root's global options.
fn makeContext() *const TokenContext {
    const Opt = TokenContext.OptionContext;

    const flag: Opt = .{ .global = .none, .value = false };
    const valued: Opt = .{ .global = .none, .value = true };
    // Options declared by the root command, as seen from the subcommand.
    const root_flag: Opt = .{ .global = .root, .value = false };
    const root_valued: Opt = .{ .global = .root, .value = true };

    const child: TokenContext = .{
        .short = .initComptime(&.{
            .{ "z", flag },
            .{ "y", valued },
            .{ "x", flag },
            .{ "w", valued },
            // these are provided by the parent
            .{ "c", root_flag },
            .{ "d", root_valued },
        }),
        .long = .initComptime(&.{
            .{ "long-z", flag },
            .{ "global-a", root_flag },
        }),
        .positional = &.{ "argument-z", "argument-y" },
        .subcommands = .initComptime(&.{}),
    };

    const root: TokenContext = .{
        .short = .initComptime(&.{
            .{ "a", flag },
            .{ "b", valued },
            // global arguments are not global on the command that defines them
            .{ "c", flag },
            .{ "d", valued },
        }),
        .long = .initComptime(&.{
            .{ "long-a", flag },
            .{ "global-a", flag },
        }),
        .positional = &.{},
        .subcommands = .initComptime(&.{
            .{ "subcommand-a", &child },
        }),
    };

    return &root;
}

test "tokenize" {
    const t = std.testing;
    const alloc = t.allocator;
    const context = comptime makeContext();

    // A cluster of flags; "b" takes a value but is not last, so it stays bare.
    {
        const expected: []const Token = &.{
            .{ .short = 'a' },
            .{ .short = 'b' },
            .{ .short = 'c' },
        };
        const tokens = try tokenize(alloc, context, &.{"-abc"});
        defer alloc.free(tokens);

        try t.expectEqual(expected.len, tokens.len);
        for (expected, tokens) |want, got| try t.expectEqual(want, got);
    }

    // The last flag of a cluster consumes the following argument.
    {
        const expected: []const Token = &.{
            .{ .short = 'a' },
            .{ .short = 'b' },
            .{ .shortvalue = .{ .name = 'd', .value = "dee" } },
        };
        const tokens = try tokenize(alloc, context, &.{ "-abd", "dee" });
        defer alloc.free(tokens);

        try t.expectEqual(expected.len, tokens.len);
        for (expected, tokens) |want, got| try t.expectEqual(want, got);
    }

    // A trailing flag without a value leaves the next argument positional.
    {
        const expected: []const Token = &.{
            .{ .short = 'c' },
            .{ .short = 'b' },
            .{ .short = 'a' },
            .{ .value = "dee" },
        };
        const tokens = try tokenize(alloc, context, &.{ "-cba", "dee" });
        defer alloc.free(tokens);

        try t.expectEqual(expected.len, tokens.len);
        for (expected, tokens) |want, got| try t.expectEqual(want, got);
    }

    // A value may itself look like an option cluster.
    {
        const expected: []const Token = &.{
            .{ .short = 'a' },
            .{ .short = 'c' },
            .{ .shortvalue = .{ .name = 'b', .value = "dee" } },
            .{ .shortvalue = .{ .name = 'd', .value = "-zyx" } },
        };
        const tokens = try tokenize(alloc, context, &.{ "-acb", "dee", "-d", "-zyx" });
        defer alloc.free(tokens);

        try t.expectEqual(expected.len, tokens.len);
        for (expected, tokens) |want, got| try t.expectEqual(want, got);
    }

    // Root-level options given after the subcommand are moved ahead of it.
    {
        const expected: []const Token = &.{
            .{ .short = 'a' },
            .{ .short = 'c' },
            .{ .shortvalue = .{ .name = 'd', .value = "dee" } },
            .{ .long = "global-a" },
            .{ .subcommand = "subcommand-a" },
            .{ .short = 'z' },
            .{ .short = 'y' },
            .{ .short = 'x' },
        };
        const tokens = try tokenize(alloc, context, &.{ "-a", "-c", "subcommand-a", "-d", "dee", "-zyx", "--global-a" });
        defer alloc.free(tokens);

        try t.expectEqual(expected.len, tokens.len);
        for (expected, tokens, 0..) |want, got, i| {
            // The long option's name is a sub-slice of its argument, so it is
            // compared by content rather than by pointer.
            if (i == 3) {
                try t.expectEqualDeep(want, got);
            } else {
                try t.expectEqual(want, got);
            }
        }
    }
}

// parameter styles to accept:
// --name value
// --name=value
// -n value
// -fused (parsed as -f -u -s -e -d)
// -fused value (parsed as -f -u -s -e -d value)
// ordered
// a named parameter can only take zero or one values. Establish a convention for compound values:
// --name val1,val2
// --name=val1,val2
// --name="val1, val2" (probably should not consume whitespace, since the user has to go out of their way to do quoting for it)
// --name key1=val1,key2=val2
// --name=key1=val1,key2=val2 (should be familiar from docker)