// NOCLIP/source/tokenizer.zig

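/// Static description of a command's grammar as the tokenizer sees it: which
/// short and long options the command accepts, whether each option consumes a
/// value, which ancestor command (if any) declared it as a global, and which
/// subcommands may follow.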
pub const TokenContext = struct {
    forward_ddash: bool = false,
    short: Options,
    long: Options,
    positional: []const []const u8,
    subcommands: Subcommands,

    pub const Options = std.StaticStringMap(OptionContext);
    pub const Subcommands = std.StaticStringMap(*const TokenContext);

    pub const OptionContext = struct {
        global: NestLevel = .none,
        value: bool,
    };

    pub const NestLevel = enum(usize) {
        root = 0,
        none = std.math.maxInt(usize),
        _,

        pub fn wrap(lv: usize) NestLevel {
            return @enumFromInt(lv);
        }

        pub fn incr(self: NestLevel) NestLevel {
            return wrap(self.unwrap() + 1);
        }

        pub fn unwrap(self: NestLevel) usize {
            return @intFromEnum(self);
        }
    };
};
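
/// A single element of the tokenized argument vector. Slice payloads are
/// borrowed from the original argv, so they are only valid while argv remains
/// allocated.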
pub const Token = union(enum) {
    doubledash,
    short: u21,
    long: []const u8,
    shortvalue: struct { name: u21, value: []const u8 },
    longvalue: struct { name: []const u8, value: []const u8 },
    value: []const u8,
    subcommand: []const u8,

    pub fn dump(self: Token) void {
        switch (self) {
            .doubledash => std.debug.print("'--'\n", .{}),
            .short => |val| std.debug.print(".short => '{u}'\n", .{val}),
            .long => |val| std.debug.print(".long => \"{s}\"\n", .{val}),
            .shortvalue => |val| std.debug.print(".shortvalue => '{u}': \"{s}\"\n", .{ val.name, val.value }),
            .longvalue => |val| std.debug.print(".longvalue => \"{s}\": \"{s}\"\n", .{ val.name, val.value }),
            .value => |val| std.debug.print(".value => \"{s}\"\n", .{val}),
            .subcommand => |val| std.debug.print(".subcommand => \"{s}\"\n", .{val}),
        }
    }
};
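
// Builds the token list while maintaining one insertion cursor per nested
// command, so that tokens for globals declared by ancestor commands can be
// spliced into the ancestor's token run. indices[i] is the next insertion
// point for the command at nest level i.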
const Assembler = struct {
    // This underallocates when fused short args are used and overallocates
    // when values are stored in a separate arg. It probably overallocates on
    // average, and since we grow it whenever fused arguments are encountered,
    // in practice it always overallocates.
    tokens: std.ArrayListUnmanaged(Token) = .empty,
    // This overallocates in every case except the one where every argument is
    // a subcommand. There is no reason to resize it after the initial
    // allocation.
    indices: [*]usize,
    len: usize,
    cap: usize,

    fn init(alloc: std.mem.Allocator, cap: usize) !Assembler {
        const idx = try alloc.alloc(usize, cap);
        errdefer alloc.free(idx);
        return .{
            .tokens = try .initCapacity(alloc, cap),
            .indices = idx.ptr,
            .len = 0,
            .cap = cap,
        };
    }

    fn addCapacity(self: *Assembler, alloc: std.mem.Allocator, extra: usize) !void {
        try self.tokens.ensureTotalCapacity(alloc, self.tokens.capacity + extra);
    }

    fn deinit(self: *Assembler, alloc: std.mem.Allocator) void {
        alloc.free(self.indices[0..self.cap]);
        self.tokens.deinit(alloc);
    }

    fn finish(self: *Assembler, alloc: std.mem.Allocator) ![]const Token {
        return try self.tokens.toOwnedSlice(alloc);
    }

    fn pushCommand(self: *Assembler) void {
        self.indices[self.len] = self.tokens.items.len;
        self.len += 1;
    }

    fn append(self: *Assembler, tok: Token) void {
        self.tokens.insertAssumeCapacity(self.indices[self.len - 1], tok);
        self.indices[self.len - 1] += 1;
    }
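
    // Insert a token into the token run of the command at the given nest
    // level: globals declared by an ancestor command are routed to that
    // ancestor's run, and every insertion point at or after that level shifts
    // right by one to compensate.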
    fn insert(self: *Assembler, level: TokenContext.NestLevel, tok: Token) void {
        if (level == .none) {
            self.append(tok);
            return;
        }
        std.debug.assert(level.unwrap() < self.len);
        self.tokens.insertAssumeCapacity(self.indices[level.unwrap()], tok);
        for (level.unwrap()..self.len) |idx| {
            self.indices[idx] += 1;
        }
    }
};
// This tokenizer is very sloppy; it will happily create tokens that mismatch
// the details of the TokenContext it has (e.g. it may produce a .short token
// without a value even if the context indicates that flag must produce a
// .shortvalue token). There are two reasons for this approach: the first is
// that tokenization is the wrong place to get persnickety about these
// details; the parser has a lot more context that it can use to produce
// useful errors when the token type mismatches its expectation. The second
// reason is that it allows us to use the tokenizer in situations where
// incomplete or incorrect input is expected and we want partial results,
// e.g. for an incomplete command line asking for completion options.
// Theoretically, the only true failure modes the tokenizer can experience are
// allocation failures (OOM) and UTF-8 decode failures.
//
// This is also the piece of code responsible for associating global
// parameters with the command that declares them. It is possible to do that
// here because the Parser guarantees that global parameters cannot be
// shadowed. This does generally make the true original order of the command
// line impossible to recover, although that could be rectified by keeping an
// index of when each token was actually encountered. Rearranging the globals
// here saves the need for a two-pass parsing strategy (creating the tokens
// and then iterating them is technically two passes, but no parsing has to
// be done on the tokens, only value conversion).
//
// The produced list of tokens stores references to the data contained in the
// provided argument vector. That is, the tokens do not own all of their
// memory, so the argument vector must remain allocated for the entire
// lifetime of the token list.
pub fn tokenize(alloc: std.mem.Allocator, tokctx: *const TokenContext, argv: []const []const u8) ![]const Token {
    var assembler: Assembler = try .init(alloc, argv.len);
    defer assembler.deinit(alloc);
    assembler.pushCommand();

    var cmdctx: *const TokenContext = tokctx;
    var mode: enum { any, fused, ordered } = .any;
    var argit: mem.SliceIter([]const u8) = .{ .slice = argv };

    while (argit.pop()) |arg| {
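        // Dispatch on the tokenizer mode: .any recognizes option syntax,
        // .fused splits a block of short flags, and .ordered treats the
        // argument as a positional value or subcommand name. `continue :mod`
        // re-dispatches the current argument under a different arm without
        // changing the sticky `mode` variable.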
        mod: switch (mode) {
            .any => if (std.mem.eql(u8, arg, "--") and !cmdctx.forward_ddash) {
                mode = .ordered;
            } else if (std.mem.startsWith(u8, arg, "--")) {
                const part = mem.partition(arg[2..], '=');
                if (part.rhs) |val| rhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        assembler.insert(optctx.global, .{
                            .longvalue = .{ .name = part.lhs, .value = val },
                        });
                        break :rhs;
                    }
                    assembler.append(
                        .{ .longvalue = .{ .name = part.lhs, .value = val } },
                    );
                } else norhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        if (optctx.value) {
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .longvalue = .{ .name = part.lhs, .value = val },
                                });
                                break :norhs;
                            }
                        }
                        assembler.insert(optctx.global, .{ .long = part.lhs });
                        break :norhs;
                    }
                    assembler.append(.{ .long = part.lhs });
                }
            } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) {
                const cpcount = try std.unicode.utf8CountCodepoints(arg[1..]);
                if (cpcount > 1)
                    try assembler.addCapacity(alloc, cpcount);
                continue :mod .fused;
            } else {
                continue :mod .ordered;
            },
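            // A fused short-option argument ("-abc") becomes one token per
            // codepoint. Only the final flag in the block is allowed to
            // consume the following argument as its value.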
            .fused => {
                var iter: std.unicode.Utf8Iterator = .{ .bytes = arg[1..], .i = 0 };
                u8i: while (iter.nextCodepointSlice()) |cps| {
                    const codepoint = std.unicode.utf8Decode(cps) catch unreachable;
                    if (cmdctx.short.get(cps)) |optctx| {
                        if (optctx.value and iter.peek(1).len == 0) {
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .shortvalue = .{ .name = codepoint, .value = val },
                                });
                                continue :u8i;
                            }
                        }
                        assembler.insert(optctx.global, .{
                            .short = codepoint,
                        });
                        continue :u8i;
                    }
                    assembler.append(.{ .short = codepoint });
                }
            },
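            // A bare argument either selects a subcommand (switching the
            // context and opening a new token run) or becomes a plain value.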
            .ordered => if (cmdctx.subcommands.get(arg)) |scmd| {
                mode = .any;
                cmdctx = scmd;
                assembler.pushCommand();
                assembler.append(.{ .subcommand = arg });
            } else {
                assembler.append(.{ .value = arg });
            },
        }
    }

    return try assembler.finish(alloc);
}
const std = @import("std");
const mem = @import("./mem.zig");
fn makeContext() *const TokenContext {
    const ToC = TokenContext.OptionContext;
    const Nl = TokenContext.NestLevel;

    const childa: TokenContext = .{
        .short = .initComptime(&.{
            .{ "z", ToC{ .global = .none, .value = false } },
            .{ "y", ToC{ .global = .none, .value = true } },
            .{ "x", ToC{ .global = .none, .value = false } },
            .{ "w", ToC{ .global = .none, .value = true } },
            // these are provided by the parent
            .{ "c", ToC{ .global = Nl.wrap(0), .value = false } },
            .{ "d", ToC{ .global = Nl.wrap(0), .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-z", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = Nl.wrap(0), .value = false } },
        }),
        .positional = &.{ "argument-z", "argument-y" },
        .subcommands = .initComptime(&.{}),
    };

    const ctx: TokenContext = .{
        .short = .initComptime(&.{
            .{ "a", ToC{ .global = .none, .value = false } },
            .{ "b", ToC{ .global = .none, .value = true } },
            // global arguments are not global on the command that defines them
            .{ "c", ToC{ .global = .none, .value = false } },
            .{ "d", ToC{ .global = .none, .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-a", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = .none, .value = false } },
        }),
        .positional = &.{},
        .subcommands = .initComptime(&.{
            .{ "subcommand-a", &childa },
        }),
    };

    return &ctx;
}
test "tokenize" {
const alloc = std.testing.allocator;
const context = comptime makeContext();
{
const tokens = try tokenize(alloc, context, &.{"-abc"});
defer alloc.free(tokens);
try std.testing.expectEqual(3, tokens.len);
try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
try std.testing.expectEqual(Token{ .short = 'c' }, tokens[2]);
}
{
const tokens = try tokenize(alloc, context, &.{ "-abd", "dee" });
defer alloc.free(tokens);
try std.testing.expectEqual(3, tokens.len);
try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
}
{
const tokens = try tokenize(alloc, context, &.{ "-cba", "dee" });
defer alloc.free(tokens);
try std.testing.expectEqual(4, tokens.len);
try std.testing.expectEqual(Token{ .short = 'c' }, tokens[0]);
try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
try std.testing.expectEqual(Token{ .short = 'a' }, tokens[2]);
try std.testing.expectEqual(Token{ .value = "dee" }, tokens[3]);
}
{
const tokens = try tokenize(alloc, context, &.{ "-acb", "dee", "-d", "-zyx" });
defer alloc.free(tokens);
try std.testing.expectEqual(4, tokens.len);
try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'b', .value = "dee" } }, tokens[2]);
try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "-zyx" } }, tokens[3]);
}
{
const tokens = try tokenize(alloc, context, &.{ "-a", "-c", "subcommand-a", "-d", "dee", "-zyx", "--global-a" });
defer alloc.free(tokens);
try std.testing.expectEqual(8, tokens.len);
try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
try std.testing.expectEqualDeep(Token{ .long = "global-a" }, tokens[3]);
try std.testing.expectEqual(Token{ .subcommand = "subcommand-a" }, tokens[4]);
try std.testing.expectEqual(Token{ .short = 'z' }, tokens[5]);
try std.testing.expectEqual(Token{ .short = 'y' }, tokens[6]);
try std.testing.expectEqual(Token{ .short = 'x' }, tokens[7]);
}
}
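
// A sketch of additional coverage for the --name=value form and bare "--"
// handling; these cases follow the code paths above but are hypothetical
// additions rather than part of the original suite.
test "tokenize long values and double dash" {
    const alloc = std.testing.allocator;
    const context = comptime makeContext();
    {
        const tokens = try tokenize(alloc, context, &.{"--long-a=value"});
        defer alloc.free(tokens);
        try std.testing.expectEqual(1, tokens.len);
        try std.testing.expectEqualDeep(
            Token{ .longvalue = .{ .name = "long-a", .value = "value" } },
            tokens[0],
        );
    }
    {
        // after a bare "--" the tokenizer switches to .ordered mode, so a
        // dash-prefixed argument is emitted as a plain value
        const tokens = try tokenize(alloc, context, &.{ "--", "-a" });
        defer alloc.free(tokens);
        try std.testing.expectEqual(1, tokens.len);
        try std.testing.expectEqualDeep(Token{ .value = "-a" }, tokens[0]);
    }
}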

// parameter styles to accept:
//   --name value
//   --name=value
//   -n value
//   -fused (parsed as -f -u -s -e -d)
//   -fused value (parsed as -f -u -s -e -d value)
//   ordered
// A named parameter can only take zero or one value. Establish a convention
// for compound values (a hypothetical case is sketched below):
//   --name val1,val2
//   --name=val1,val2
//   --name="val1, val2" (probably should not consume whitespace, since the
//     user has to go out of their way to quote it)
//   --name key1=val1,key2=val2
//   --name=key1=val1,key2=val2 (should be familiar from docker)
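
// A sketch of how the docker-style compound form would tokenize, assuming
// mem.partition splits on the first '=' so that everything after it survives
// intact in the value; splitting on ',' and the inner '=' would then be the
// converter's job rather than the tokenizer's.
test "tokenize compound value" {
    const alloc = std.testing.allocator;
    const context = comptime makeContext();

    const tokens = try tokenize(alloc, context, &.{"--long-a=key1=val1,key2=val2"});
    defer alloc.free(tokens);
    try std.testing.expectEqual(1, tokens.len);
    try std.testing.expectEqualDeep(
        Token{ .longvalue = .{ .name = "long-a", .value = "key1=val1,key2=val2" } },
        tokens[0],
    );
}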