diff --git a/source/mem.zig b/source/mem.zig
new file mode 100644
index 0000000..913f066
--- /dev/null
+++ b/source/mem.zig
@@ -0,0 +1,41 @@
+pub const Partition = struct { lhs: []const u8, rhs: ?[]const u8 };
+
+pub const Codepoint = u21;
+
+pub fn partition(str: []const u8, char: u8) Partition {
+    return if (std.mem.indexOfScalar(u8, str, char)) |idx|
+        .{ .lhs = str[0..idx], .rhs = str[idx + 1 ..] }
+    else
+        .{ .lhs = str, .rhs = null };
+}
+
+pub fn SliceIter(comptime T: type) type {
+    return struct {
+        slice: []const T,
+        index: usize = 0,
+
+        pub fn pop(self: *@This()) ?T {
+            defer self.index +|= 1;
+            return self.peek();
+        }
+
+        pub fn peek(self: *@This()) ?T {
+            if (self.index >= self.slice.len) return null;
+            return self.slice[self.index];
+        }
+    };
+}
+
+pub fn encodeShort(comptime short: Codepoint) []const u8 {
+    comptime {
+        const encoded = enc: {
+            const len = std.unicode.utf8CodepointSequenceLength(short) catch @compileError("invalid unicode codepoint");
+            var buf: [len]u8 = undefined;
+            _ = std.unicode.utf8Encode(short, &buf) catch @compileError("invalid unicode character");
+            break :enc buf;
+        };
+        return encoded[0..];
+    }
+}
+
+const std = @import("std");
diff --git a/source/tokenizer.zig b/source/tokenizer.zig
new file mode 100644
index 0000000..29ed9f6
--- /dev/null
+++ b/source/tokenizer.zig
@@ -0,0 +1,345 @@
+pub const TokenContext = struct {
+    forward_ddash: bool = false,
+    short: Options,
+    long: Options,
+    positional: []const []const u8,
+    subcommands: Subcommands,
+
+    pub const Options = std.StaticStringMap(OptionContext);
+    pub const Subcommands = std.StaticStringMap(*const TokenContext);
+
+    pub const OptionContext = struct {
+        global: NestLevel = .none,
+        value: bool,
+    };
+
+    pub const NestLevel = enum(usize) {
+        root = 0,
+        none = std.math.maxInt(usize),
+        _,
+
+        pub fn wrap(lv: usize) NestLevel {
+            return @enumFromInt(lv);
+        }
+        pub fn incr(self: NestLevel) NestLevel {
+            return wrap(self.unwrap() + 1);
+        }
+        pub fn unwrap(self: NestLevel) usize {
+            return @intFromEnum(self);
+        }
+    };
+};
+
+pub const Token = union(enum) {
+    doubledash,
+    short: u21,
+    long: []const u8,
+    shortvalue: struct { name: u21, value: []const u8 },
+    longvalue: struct { name: []const u8, value: []const u8 },
+    value: []const u8,
+    subcommand: []const u8,
+
+    pub fn dump(self: Token) void {
+        switch (self) {
+            .doubledash => std.debug.print("'--'\n", .{}),
+            .short => |val| std.debug.print(".short => '{u}'\n", .{val}),
+            .long => |val| std.debug.print(".long => \"{s}\"\n", .{val}),
+            .shortvalue => |val| std.debug.print(".shortvalue => '{u}': \"{s}\"\n", .{ val.name, val.value }),
+            .longvalue => |val| std.debug.print(".longvalue => \"{s}\": \"{s}\"\n", .{ val.name, val.value }),
+            .value => |val| std.debug.print(".value => \"{s}\"\n", .{val}),
+            .subcommand => |val| std.debug.print(".subcommand => \"{s}\"\n", .{val}),
+        }
+    }
+};
+
+const Assembler = struct {
+    // This estimate underallocates when fused short args are used and
+    // overallocates when values are stored in a separate arg. It probably
+    // overallocates on average, and we grow it whenever a fused argument is
+    // encountered, so after correction it never underallocates.
+    tokens: std.ArrayListUnmanaged(Token) = .empty,
+    // This overallocates in every case except when every argument is a
+    // subcommand. There is no reason to resize it after the initial
+    // allocation.
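+    // indices[0..len] holds one insertion cursor per (sub)command seen so
+    // far: append() adds at the newest command's cursor, and insert()
+    // splices global tokens in at the cursor of the declaring command.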
+    indices: [*]usize,
+    len: usize,
+    cap: usize,
+
+    fn init(alloc: std.mem.Allocator, cap: usize) !Assembler {
+        const idx = try alloc.alloc(usize, cap);
+        errdefer alloc.free(idx);
+        return .{
+            .tokens = try .initCapacity(alloc, cap),
+            .indices = idx.ptr,
+            .len = 0,
+            .cap = cap,
+        };
+    }
+
+    fn addCapacity(self: *Assembler, alloc: std.mem.Allocator, extra: usize) !void {
+        try self.tokens.ensureTotalCapacity(alloc, self.tokens.capacity + extra);
+    }
+
+    fn deinit(self: *Assembler, alloc: std.mem.Allocator) void {
+        alloc.free(self.indices[0..self.cap]);
+        self.tokens.deinit(alloc);
+    }
+
+    fn finish(self: *Assembler, alloc: std.mem.Allocator) ![]const Token {
+        return try self.tokens.toOwnedSlice(alloc);
+    }
+
+    fn pushCommand(self: *Assembler) void {
+        self.indices[self.len] = self.tokens.items.len;
+        self.len += 1;
+    }
+
+    fn append(self: *Assembler, tok: Token) void {
+        // the newest command's cursor always points at the end of the list,
+        // so this is effectively an appendAssumeCapacity
+        self.tokens.insertAssumeCapacity(self.indices[self.len - 1], tok);
+        self.indices[self.len - 1] += 1;
+    }
+
+    fn insert(self: *Assembler, level: TokenContext.NestLevel, tok: Token) void {
+        if (level == .none) {
+            self.append(tok);
+            return;
+        }
+
+        std.debug.assert(level.unwrap() < self.len);
+        self.tokens.insertAssumeCapacity(self.indices[level.unwrap()], tok);
+        for (level.unwrap()..self.len) |idx| {
+            self.indices[idx] += 1;
+        }
+    }
+};
+
+// This tokenizer is very sloppy: it will happily create tokens that mismatch
+// the details of the TokenContext it has (e.g. it may produce a .short token
+// without a value even if the context indicates that flag must produce a
+// .shortvalue token). There are two reasons for this approach. The first is
+// that tokenization is the wrong place to get persnickety about these
+// details; the parser has a lot more context that it can use to produce
+// useful errors when the token type mismatches its expectation. The second
+// reason is that it allows us to use the tokenizer in situations where
+// incomplete or incorrect input is expected and we want to get partial
+// results, e.g. for an incomplete command line asking for completion options.
+// Theoretically, the only true failure modes that the tokenizer can
+// experience are allocation failure (OOM) and UTF-8 decode failure.
+//
+// This is also the piece of code responsible for associating global
+// parameters with the command that declares them. It is possible to do that
+// here because the Parser guarantees that global parameters cannot be
+// shadowed. This does generally make the true original order of the command
+// line impossible to recover, although that could be rectified by keeping an
+// index of when each token was actually encountered. Rearranging the globals
+// here saves the need for a two-pass parsing strategy (though creating the
+// tokens and then iterating the tokens is two passes, no parsing has to be
+// done on the tokens, only value conversion).
+//
+// The produced list of tokens stores references to the data contained in the
+// provided argument vector. That is, the tokens do not own all of their
+// memory, so the argument vector must stay allocated for the entire lifetime
+// of the token list.
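+//
+// As a sketch of the output shape: given a context where 'b' is declared as
+// a value-taking short flag and "sub" as a subcommand (illustrative names,
+// not the ones used in the tests below), the argv
+//
+//     { "-ab", "deedee", "sub", "positional" }
+//
+// tokenizes to
+//
+//     .short 'a', .shortvalue 'b' = "deedee", .subcommand "sub",
+//     .value "positional"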
+pub fn tokenize(alloc: std.mem.Allocator, tokctx: *const TokenContext, argv: []const []const u8) ![]const Token {
+    var assembler: Assembler = try .init(alloc, argv.len);
+    defer assembler.deinit(alloc);
+    assembler.pushCommand();
+
+    var cmdctx: *const TokenContext = tokctx;
+    var mode: enum { any, fused, ordered } = .any;
+    var argit: mem.SliceIter([]const u8) = .{ .slice = argv };
+
+    while (argit.pop()) |arg| {
+        mod: switch (mode) {
+            .any => if (std.mem.eql(u8, arg, "--")) {
+                // "--" either terminates option parsing or, when the context
+                // asks for it to be forwarded, becomes a token for a later
+                // consumer to handle
+                if (cmdctx.forward_ddash) {
+                    assembler.append(.doubledash);
+                } else {
+                    mode = .ordered;
+                }
+            } else if (std.mem.startsWith(u8, arg, "--")) {
+                const part = mem.partition(arg[2..], '=');
+                if (part.rhs) |val| rhs: {
+                    if (cmdctx.long.get(part.lhs)) |optctx| {
+                        assembler.insert(optctx.global, .{
+                            .longvalue = .{ .name = part.lhs, .value = val },
+                        });
+                        break :rhs;
+                    }
+                    assembler.append(
+                        .{ .longvalue = .{ .name = part.lhs, .value = val } },
+                    );
+                } else norhs: {
+                    if (cmdctx.long.get(part.lhs)) |optctx| {
+                        if (optctx.value) {
+                            if (argit.pop()) |val| {
+                                assembler.insert(optctx.global, .{
+                                    .longvalue = .{ .name = part.lhs, .value = val },
+                                });
+                                break :norhs;
+                            }
+                        }
+                        assembler.insert(optctx.global, .{ .long = part.lhs });
+                        break :norhs;
+                    }
+                    assembler.append(.{ .long = part.lhs });
+                }
+            } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) {
+                const cpcount = try std.unicode.utf8CountCodepoints(arg[1..]);
+                if (cpcount > 1)
+                    try assembler.addCapacity(alloc, cpcount);
+                continue :mod .fused;
+            } else {
+                continue :mod .ordered;
+            },
+            .fused => {
+                var iter: std.unicode.Utf8Iterator = .{ .bytes = arg[1..], .i = 0 };
+                u8i: while (iter.nextCodepointSlice()) |cps| {
+                    const codepoint = std.unicode.utf8Decode(cps) catch unreachable;
+                    if (cmdctx.short.get(cps)) |optctx| {
+                        if (optctx.value and iter.peek(1).len == 0) {
+                            if (argit.pop()) |val| {
+                                assembler.insert(optctx.global, .{
+                                    .shortvalue = .{ .name = codepoint, .value = val },
+                                });
+                                continue :u8i;
+                            }
+                        }
+                        assembler.insert(optctx.global, .{
+                            .short = codepoint,
+                        });
+                        continue :u8i;
+                    }
+                    assembler.append(.{ .short = codepoint });
+                }
+            },
+            .ordered => if (cmdctx.subcommands.get(arg)) |scmd| {
+                mode = .any;
+                cmdctx = scmd;
+                assembler.pushCommand();
+                assembler.append(.{ .subcommand = arg });
+            } else {
+                assembler.append(.{ .value = arg });
+            },
+        }
+    }
+    return try assembler.finish(alloc);
+}
+
+const std = @import("std");
+const mem = @import("./mem.zig");
+
+// NOTE: must be evaluated at comptime (the tests call it via
+// `comptime makeContext()`); at runtime the returned pointer would refer to
+// function-local memory.
+fn makeContext() *const TokenContext {
+    const ToC = TokenContext.OptionContext;
+    const Nl = TokenContext.NestLevel;
+
+    const childa: TokenContext = .{
+        .short = .initComptime(&.{
+            .{ "z", ToC{ .global = .none, .value = false } },
+            .{ "y", ToC{ .global = .none, .value = true } },
+            .{ "x", ToC{ .global = .none, .value = false } },
+            .{ "w", ToC{ .global = .none, .value = true } },
+            // these are provided by the parent
+            .{ "c", ToC{ .global = Nl.wrap(0), .value = false } },
+            .{ "d", ToC{ .global = Nl.wrap(0), .value = true } },
+        }),
+        .long = .initComptime(&.{
+            .{ "long-z", ToC{ .global = .none, .value = false } },
+            .{ "global-a", ToC{ .global = Nl.wrap(0), .value = false } },
+        }),
+        .positional = &.{ "argument-z", "argument-y" },
+        .subcommands = .initComptime(&.{}),
+    };
+
+    const ctx: TokenContext = .{
+        .short = .initComptime(&.{
+            .{ "a", ToC{ .global = .none, .value = false } },
+            .{ "b", ToC{ .global = .none, .value = true } },
+            // global arguments are not global on the command that defines them
+            .{ "c", ToC{ .global = .none, .value = false } },
+            .{ "d", ToC{ .global = .none, .value = true } },
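+            // (childa above re-declares "c" and "d" with .global = Nl.wrap(0),
+            // pointing them back at this root command)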
+        }),
+        .long = .initComptime(&.{
+            .{ "long-a", ToC{ .global = .none, .value = false } },
+            .{ "global-a", ToC{ .global = .none, .value = false } },
+        }),
+        .positional = &.{},
+        .subcommands = .initComptime(&.{
+            .{ "subcommand-a", &childa },
+        }),
+    };
+
+    return &ctx;
+}
+
+test "tokenize" {
+    const alloc = std.testing.allocator;
+    const context = comptime makeContext();
+
+    {
+        const tokens = try tokenize(alloc, context, &.{"-abc"});
+        defer alloc.free(tokens);
+
+        try std.testing.expectEqual(3, tokens.len);
+        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
+        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
+        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[2]);
+    }
+
+    {
+        const tokens = try tokenize(alloc, context, &.{ "-abd", "dee" });
+        defer alloc.free(tokens);
+
+        try std.testing.expectEqual(3, tokens.len);
+        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
+        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
+        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
+    }
+
+    {
+        const tokens = try tokenize(alloc, context, &.{ "-cba", "dee" });
+        defer alloc.free(tokens);
+
+        try std.testing.expectEqual(4, tokens.len);
+        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[0]);
+        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
+        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[2]);
+        try std.testing.expectEqual(Token{ .value = "dee" }, tokens[3]);
+    }
+
+    {
+        const tokens = try tokenize(alloc, context, &.{ "-acb", "dee", "-d", "-zyx" });
+        defer alloc.free(tokens);
+
+        try std.testing.expectEqual(4, tokens.len);
+        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
+        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
+        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'b', .value = "dee" } }, tokens[2]);
+        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "-zyx" } }, tokens[3]);
+    }
+
+    {
+        const tokens = try tokenize(alloc, context, &.{ "-a", "-c", "subcommand-a", "-d", "dee", "-zyx", "--global-a" });
+        defer alloc.free(tokens);
+
+        try std.testing.expectEqual(8, tokens.len);
+        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
+        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
+        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
+        try std.testing.expectEqualDeep(Token{ .long = "global-a" }, tokens[3]);
+        try std.testing.expectEqual(Token{ .subcommand = "subcommand-a" }, tokens[4]);
+        try std.testing.expectEqual(Token{ .short = 'z' }, tokens[5]);
+        try std.testing.expectEqual(Token{ .short = 'y' }, tokens[6]);
+        try std.testing.expectEqual(Token{ .short = 'x' }, tokens[7]);
+    }
+}
+
+// parameter styles to accept:
+//   --name value
+//   --name=value
+//   -n value
+//   -fused (parsed as -f -u -s -e -d)
+//   -fused value (parsed as -f -u -s -e -d value)
+//   ordered
+// a named parameter can only take zero or one value. Establish a convention
+// for compound values:
+//   --name val1,val2
+//   --name=val1,val2
+//   --name="val1, val2" (probably should not consume whitespace, since the
+//     user has to go out of their way to quote it)
+//   --name key1=val1,key2=val2
+//   --name=key1=val1,key2=val2 (should be familiar from docker)
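+
+// A small additional check of the long-option styles and the "--" terminator
+// described above, against the same test context. A sketch: it only exercises
+// declarations that makeContext() actually contains.
+test "tokenize long options and double dash" {
+    const alloc = std.testing.allocator;
+    const context = comptime makeContext();
+
+    const tokens = try tokenize(alloc, context, &.{ "--long-a", "--global-a=yes", "--", "-abc" });
+    defer alloc.free(tokens);
+
+    // "--" is consumed here (forward_ddash is false), and everything after it
+    // is treated as an ordered value even though it looks like fused shorts
+    try std.testing.expectEqual(3, tokens.len);
+    try std.testing.expectEqualDeep(Token{ .long = "long-a" }, tokens[0]);
+    try std.testing.expectEqualDeep(Token{ .longvalue = .{ .name = "global-a", .value = "yes" } }, tokens[1]);
+    try std.testing.expectEqualDeep(Token{ .value = "-abc" }, tokens[2]);
+}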