consider a tokenizer
This commit is contained in:
parent
11b7d3e06b
commit
5746fbbd5e
41
source/mem.zig
Normal file
41
source/mem.zig
Normal file
@ -0,0 +1,41 @@
|
||||
/// Result of splitting a string at a separator: `lhs` holds everything
/// before the separator and `rhs` everything after it, or null when the
/// separator was not found. Both halves alias the input string.
pub const Partition = struct { lhs: []const u8, rhs: ?[]const u8 };

/// A single Unicode codepoint (U+0000..U+10FFFF fits in 21 bits).
pub const Codepoint = u21;

/// Split `str` at the first occurrence of `char`. The separator byte is
/// excluded from both halves. When `char` never occurs, the entire input
/// becomes `lhs` and `rhs` is null.
pub fn partition(str: []const u8, char: u8) Partition {
    const split = std.mem.indexOfScalar(u8, str, char) orelse
        return .{ .lhs = str, .rhs = null };
    return .{ .lhs = str[0..split], .rhs = str[split + 1 ..] };
}
|
||||
|
||||
/// A minimal forward-only iterator over an immutable slice of `T`.
pub fn SliceIter(comptime T: type) type {
    return struct {
        slice: []const T,
        index: usize = 0,

        const Self = @This();

        /// Return the current element and advance the cursor, or null when
        /// the iterator is exhausted. The cursor advances (saturating) even
        /// on an exhausted iterator, so repeated calls keep returning null
        /// without any risk of overflow.
        pub fn pop(self: *Self) ?T {
            const current = self.peek();
            self.index +|= 1;
            return current;
        }

        /// Return the current element without advancing, or null when the
        /// cursor has moved past the end of the slice.
        pub fn peek(self: *Self) ?T {
            return if (self.index < self.slice.len) self.slice[self.index] else null;
        }
    };
}
|
||||
|
||||
/// Encode the comptime-known codepoint `short` as UTF-8 and return the
/// byte sequence as a slice. Because the whole body runs at comptime, the
/// returned slice references comptime-interned constant data; the caller
/// does not own or free it.
///
/// Invalid codepoints (surrogates) trigger the explicit @compileError;
/// out-of-range `u21` values (> 0x10FFFF) hit the `catch unreachable` on
/// the length lookup, which at comptime also fails compilation.
pub fn encodeShort(comptime short: Codepoint) []const u8 {
    comptime {
        const encoded = enc: {
            // Exact sequence length (1-4 bytes) so the buffer is sized tightly.
            const len = std.unicode.utf8CodepointSequenceLength(short) catch unreachable;
            var buf: [len]u8 = undefined;
            _ = std.unicode.utf8Encode(short, &buf) catch @compileError("invalid unicode character");
            break :enc buf;
        };
        // `encoded` is comptime-known, so slicing it yields a pointer into
        // constant data that is safe to return.
        return encoded[0..];
    }
}
|
||||
|
||||
const std = @import("std");
|
345
source/tokenizer.zig
Normal file
345
source/tokenizer.zig
Normal file
@ -0,0 +1,345 @@
|
||||
/// Static description of a (sub)command's accepted parameters, used to
/// drive tokenization. The maps are comptime string maps, so a
/// TokenContext tree is normally constructed at comptime.
pub const TokenContext = struct {
    /// When true, the tokenizer does NOT treat a bare "--" as the
    /// end-of-options marker; it falls through to regular option handling.
    forward_ddash: bool = false,
    /// Short (single-codepoint) options, keyed by the codepoint's UTF-8 bytes.
    short: Options,
    /// Long options, keyed by name (without the leading "--").
    long: Options,
    /// Names of the positional arguments this command accepts.
    positional: []const []const u8,
    /// Child commands, keyed by subcommand name.
    subcommands: Subcommands,

    pub const Options = std.StaticStringMap(OptionContext);
    pub const Subcommands = std.StaticStringMap(*const TokenContext);

    pub const OptionContext = struct {
        /// Depth of the ancestor command that actually declared this option;
        /// `.none` means the option is local to the command whose map
        /// contains this entry.
        global: NestLevel = .none,
        /// True when the option consumes a value argument.
        value: bool,
    };

    /// Depth of a command in the subcommand chain. `root` is depth 0 and
    /// `none` is a maxInt sentinel meaning "not a global option".
    pub const NestLevel = enum(usize) {
        root = 0,
        none = std.math.maxInt(usize),
        _,

        /// Wrap a raw depth as a NestLevel.
        pub fn wrap(lv: usize) NestLevel {
            return @enumFromInt(lv);
        }
        /// One level deeper. Note that incrementing `.none` overflows in
        /// safe builds; callers are expected not to increment the sentinel.
        pub fn incr(self: NestLevel) NestLevel {
            return wrap(self.unwrap() + 1);
        }
        /// Recover the raw depth from the enum.
        pub fn unwrap(self: NestLevel) usize {
            return @intFromEnum(self);
        }
    };
};
|
||||
|
||||
/// One lexical element of a parsed command line. Slice payloads reference
/// the original argument vector and are not owned by the token, so the
/// argv memory must outlive the token list.
pub const Token = union(enum) {
    doubledash,
    short: u21,
    long: []const u8,
    shortvalue: struct { name: u21, value: []const u8 },
    longvalue: struct { name: []const u8, value: []const u8 },
    value: []const u8,
    subcommand: []const u8,

    /// Print a human-readable description of the token to stderr
    /// (debugging aid only; no allocation).
    pub fn dump(self: Token) void {
        switch (self) {
            .doubledash => std.debug.print("'--'\n", .{}),
            .short => |val| std.debug.print(".short => '{u}'\n", .{val}),
            .long => |val| std.debug.print(".long => \"{s}\"\n", .{val}),
            .shortvalue => |val| std.debug.print(".shortvalue => '{u}': \"{s}\"\n", .{ val.name, val.value }),
            // Fixed: this arm previously printed the wrong tag label
            // (".shortvalue") for .longvalue tokens.
            .longvalue => |val| std.debug.print(".longvalue => {s}: \"{s}\"\n", .{ val.name, val.value }),
            .value => |val| std.debug.print(".value => \"{s}\"\n", .{val}),
            .subcommand => |val| std.debug.print(".subcommand => \"{s}\"\n", .{val}),
        }
    }
};
|
||||
|
||||
/// Accumulates tokens while tracking, per nesting level, where each
/// (sub)command's token run ends, so that "global" options can be inserted
/// into the region of the ancestor command that declared them.
const Assembler = struct {
    // this underallocates if fused short args are used and overallocates when
    // values are stored in a separate arg. it probably overallocates on
    // average, but we correct by growing it when fused arguments are
    // encountered, so it always overallocates
    tokens: std.ArrayListUnmanaged(Token) = .empty,
    // this overallocates in every case except the case where every argument is
    // a subcommand. There is no reason to change this after the initial
    // allocation.
    indices: [*]usize,
    /// Number of commands pushed so far (used entries of `indices`).
    len: usize,
    /// Allocated capacity of `indices`, remembered for deinit.
    cap: usize,

    /// Allocate room for `cap` tokens and `cap` command indices (one index
    /// per argv entry covers the worst all-subcommands case).
    fn init(alloc: std.mem.Allocator, cap: usize) !Assembler {
        const idx = try alloc.alloc(usize, cap);
        // free the index buffer if the token-list allocation below fails
        errdefer alloc.free(idx);
        return .{
            .tokens = try .initCapacity(alloc, cap),
            .indices = idx.ptr,
            .len = 0,
            .cap = cap,
        };
    }

    /// Grow only the token buffer; the index buffer never needs to grow
    /// (see the field comments above).
    fn addCapacity(self: *Assembler, alloc: std.mem.Allocator, extra: usize) !void {
        try self.tokens.ensureTotalCapacity(alloc, self.tokens.capacity + extra);
    }

    fn deinit(self: *Assembler, alloc: std.mem.Allocator) void {
        alloc.free(self.indices[0..self.cap]);
        self.tokens.deinit(alloc);
    }

    /// Hand ownership of the token list to the caller. `toOwnedSlice`
    /// resets the list, so a later `deinit` remains safe.
    fn finish(self: *Assembler, alloc: std.mem.Allocator) ![]const Token {
        return try self.tokens.toOwnedSlice(alloc);
    }

    /// Open a new command region whose insertion point starts at the
    /// current end of the token list.
    fn pushCommand(self: *Assembler) void {
        self.indices[self.len] = self.tokens.items.len;
        self.len += 1;
    }

    /// Append `tok` at the insertion point of the innermost command and
    /// advance that insertion point.
    fn append(self: *Assembler, tok: Token) void {
        self.tokens.insertAssumeCapacity(self.indices[self.len - 1], tok);
        self.indices[self.len - 1] += 1;
    }

    /// Insert `tok` into the region of the command at depth `level` (used
    /// for global options), shifting the insertion points of that command
    /// and every deeper one. `.none` means "not global": append to the
    /// innermost command instead.
    fn insert(self: *Assembler, level: TokenContext.NestLevel, tok: Token) void {
        if (level == .none) {
            self.append(tok);
            return;
        }

        // a global option can only reference a command we have entered
        std.debug.assert(level.unwrap() < self.len);
        self.tokens.insertAssumeCapacity(self.indices[level.unwrap()], tok);
        for (level.unwrap()..self.len) |idx| {
            self.indices[idx] += 1;
        }
    }
};
|
||||
|
||||
// This tokenizer is very sloppy; it will happily create tokens that
|
||||
// mismatch the details of the TokenContext it has (e.g. it may produce a .short
|
||||
// token without a value even if the context indicates that flag must produce a
|
||||
// .shortvalue token). There are two reasons for this approach: the first is
|
||||
// that tokenization is the wrong place to get persnickety about these details;
|
||||
// the parser has a lot more context that it can use to produce useful errors
|
||||
// when the token type mismatches its expectation. The second reason is that it
|
||||
// allows us to use the tokenizer in situations where incomplete or incorrect
|
||||
// input is expected and we want to get partial results, e.g. for an incomplete
|
||||
// command line asking for completion options. Theoretically, the only true
|
||||
// failure modes that the tokenizer can experience are allocation failures (OOM)
|
||||
// and utf-8 decode failures.
|
||||
//
|
||||
// This is also the piece of code responsible for associating global parameters
|
||||
// with the command that declares them. It is possible to do that here because
|
||||
// the Parser guarantees that global parameters cannot be shadowed. This does
|
||||
// generally make the true original order of the command line impossible to
|
||||
// recover, although this could be rectified by keeping an index of when the
|
||||
// token was actually encountered. Rearranging the globals here saves the need
|
||||
// for a two-pass parsing strategy (though creating the tokens and then actually
|
||||
// iterating the tokens is two passes, no parsing has to be done on the tokens,
|
||||
// only value conversion).
|
||||
//
|
||||
// The produced list of tokens store references to the data contained in the
|
||||
// provided argument vector. That is, the tokens do not own all of their memory,
|
||||
// so the argument vector must be kept allocated until the end of the lifetime
|
||||
// of the list of tokens.
|
||||
/// Tokenize `argv` against the command tree rooted at `tokctx`. The caller
/// owns the returned slice (free with `alloc`); the tokens reference the
/// argv memory, which must stay alive as long as the tokens do.
///
/// Internally this is a small state machine over three modes:
///   .any     - normal scanning: "--", long options, short/fused options,
///              or fall through to .ordered for anything else
///   .fused   - expand one "-xyz" argument codepoint by codepoint
///   .ordered - subcommand/positional handling (also everything after "--")
/// `continue :mod .fused` / `.ordered` re-dispatch the CURRENT argument
/// under another case without changing `mode` itself, so the one-shot
/// handling returns to `.any` for the next argument automatically.
pub fn tokenize(alloc: std.mem.Allocator, tokctx: *const TokenContext, argv: []const []const u8) ![]const Token {
    var assembler: Assembler = try .init(alloc, argv.len);
    defer assembler.deinit(alloc);
    // open the root command's token region
    assembler.pushCommand();

    var cmdctx: *const TokenContext = tokctx;
    var mode: enum { any, fused, ordered } = .any;
    var argit: mem.SliceIter([]const u8) = .{ .slice = argv };

    while (argit.pop()) |arg| {
        mod: switch (mode) {
            .any => if (std.mem.eql(u8, arg, "--") and !cmdctx.forward_ddash) {
                // NOTE(review): no .doubledash token is emitted here, so
                // that Token variant is currently never produced — confirm
                // intended. Also, with forward_ddash set, a bare "--" falls
                // into the long-option branch below as an empty .long.
                mode = .ordered;
            } else if (std.mem.startsWith(u8, arg, "--")) {
                // split "--name=value" into name and optional value
                const part = mem.partition(arg[2..], '=');
                if (part.rhs) |val| rhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        assembler.insert(optctx.global, .{
                            .longvalue = .{ .name = part.lhs, .value = val },
                        });
                        break :rhs;
                    }
                    // unknown option: keep the token anyway (sloppy by design,
                    // see the comment above this function)
                    assembler.append(
                        .{ .longvalue = .{ .name = part.lhs, .value = val } },
                    );
                } else norhs: {
                    if (cmdctx.long.get(part.lhs)) |optctx| {
                        if (optctx.value) {
                            // value-taking option without "=": consume the
                            // next argument verbatim, if there is one
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .longvalue = .{ .name = part.lhs, .value = val },
                                });
                                break :norhs;
                            }
                        }
                        assembler.insert(optctx.global, .{ .long = part.lhs });
                        break :norhs;
                    }
                    assembler.append(.{ .long = part.lhs });
                }
            } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) {
                // fused shorts may expand to more tokens than the one argv
                // slot budgeted in init; grow the token buffer up front
                const cpcount = try std.unicode.utf8CountCodepoints(arg[1..]);
                if (cpcount > 1)
                    try assembler.addCapacity(alloc, cpcount);
                continue :mod .fused;
            } else {
                // bare "-" or any non-option argument
                continue :mod .ordered;
            },
            .fused => {
                var iter: std.unicode.Utf8Iterator = .{ .bytes = arg[1..], .i = 0 };
                u8i: while (iter.nextCodepointSlice()) |cps| {
                    // cps is valid UTF-8 (produced by the iterator), so
                    // decoding cannot fail
                    const codepoint = std.unicode.utf8Decode(cps) catch unreachable;
                    if (cmdctx.short.get(cps)) |optctx| {
                        // only the LAST flag of a fuse may capture the next
                        // argument as its value
                        if (optctx.value and iter.peek(1).len == 0) {
                            if (argit.pop()) |val| {
                                assembler.insert(optctx.global, .{
                                    .shortvalue = .{ .name = codepoint, .value = val },
                                });
                                continue :u8i;
                            }
                        }
                        assembler.insert(optctx.global, .{
                            .short = codepoint,
                        });
                        continue :u8i;
                    }
                    // unknown short flag: keep it as a local token
                    assembler.append(.{ .short = codepoint });
                }
            },
            .ordered => if (cmdctx.subcommands.get(arg)) |scmd| {
                // descend into the subcommand: resume option parsing and
                // open a new token region for it
                mode = .any;
                cmdctx = scmd;
                assembler.pushCommand();
                assembler.append(.{ .subcommand = arg });
            } else {
                assembler.append(.{ .value = arg });
            },
        }
    }
    return try assembler.finish(alloc);
}
|
||||
|
||||
const std = @import("std");
|
||||
const mem = @import("./mem.zig");
|
||||
|
||||
/// Build the test fixture: a root command with shorts a/b/c/d and one
/// subcommand ("subcommand-a") that re-exports c/d and "global-a" as
/// globals of the root (NestLevel 0). Intended for comptime evaluation
/// (the test below calls it via `comptime makeContext()`), so the returned
/// pointer references comptime-interned constant data.
fn makeContext() *const TokenContext {
    const ToC = TokenContext.OptionContext;
    const Nl = TokenContext.NestLevel;

    // child command: local flags z/y/x/w plus the parent's globals
    const childa: TokenContext = .{
        .short = .initComptime(&.{
            .{ "z", ToC{ .global = .none, .value = false } },
            .{ "y", ToC{ .global = .none, .value = true } },
            .{ "x", ToC{ .global = .none, .value = false } },
            .{ "w", ToC{ .global = .none, .value = true } },
            // these are provided by the parent
            .{ "c", ToC{ .global = Nl.wrap(0), .value = false } },
            .{ "d", ToC{ .global = Nl.wrap(0), .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-z", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = Nl.wrap(0), .value = false } },
        }),
        .positional = &.{ "argument-z", "argument-y" },
        .subcommands = .initComptime(&.{}),
    };

    // root command
    const ctx: TokenContext = .{
        .short = .initComptime(&.{
            .{ "a", ToC{ .global = .none, .value = false } },
            .{ "b", ToC{ .global = .none, .value = true } },
            // global arguments are not global on the command that defines them
            .{ "c", ToC{ .global = .none, .value = false } },
            .{ "d", ToC{ .global = .none, .value = true } },
        }),
        .long = .initComptime(&.{
            .{ "long-a", ToC{ .global = .none, .value = false } },
            .{ "global-a", ToC{ .global = .none, .value = false } },
        }),
        .positional = &.{},
        .subcommands = .initComptime(&.{
            .{ "subcommand-a", &childa },
        }),
    };

    return &ctx;
}
|
||||
|
||||
test "tokenize" {
    const alloc = std.testing.allocator;
    // built at comptime so the returned pointer references interned constants
    const context = comptime makeContext();

    {
        // fused shorts expand into one .short token per flag
        const tokens = try tokenize(alloc, context, &.{"-abc"});
        defer alloc.free(tokens);

        try std.testing.expectEqual(3, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[2]);
    }

    {
        // a value-taking flag at the END of a fuse consumes the next argument
        const tokens = try tokenize(alloc, context, &.{ "-abd", "dee" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(3, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
    }

    {
        // a value-taking flag NOT in final position ('b') stays a bare
        // .short; the following argument becomes a positional .value instead
        const tokens = try tokenize(alloc, context, &.{ "-cba", "dee" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(4, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'b' }, tokens[1]);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[2]);
        try std.testing.expectEqual(Token{ .value = "dee" }, tokens[3]);
    }

    {
        // value capture is verbatim: "-zyx" is consumed as -d's value even
        // though it looks like more flags
        const tokens = try tokenize(alloc, context, &.{ "-acb", "dee", "-d", "-zyx" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(4, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'b', .value = "dee" } }, tokens[2]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "-zyx" } }, tokens[3]);
    }

    {
        // global options (-d and --global-a, declared global by the child)
        // encountered AFTER the subcommand are hoisted into the root
        // command's token region, i.e. before the .subcommand token
        const tokens = try tokenize(alloc, context, &.{ "-a", "-c", "subcommand-a", "-d", "dee", "-zyx", "--global-a" });
        defer alloc.free(tokens);

        try std.testing.expectEqual(8, tokens.len);
        try std.testing.expectEqual(Token{ .short = 'a' }, tokens[0]);
        try std.testing.expectEqual(Token{ .short = 'c' }, tokens[1]);
        try std.testing.expectEqual(Token{ .shortvalue = .{ .name = 'd', .value = "dee" } }, tokens[2]);
        // expectEqualDeep compares the slice contents rather than pointers
        try std.testing.expectEqualDeep(Token{ .long = "global-a" }, tokens[3]);
        try std.testing.expectEqual(Token{ .subcommand = "subcommand-a" }, tokens[4]);
        try std.testing.expectEqual(Token{ .short = 'z' }, tokens[5]);
        try std.testing.expectEqual(Token{ .short = 'y' }, tokens[6]);
        try std.testing.expectEqual(Token{ .short = 'x' }, tokens[7]);
    }
}
|
||||
|
||||
// parameter styles to accept:
|
||||
// --name value
|
||||
// --name=value
|
||||
// -n value
|
||||
// -fused (parsed as -f -u -s -e -d)
|
||||
// -fused value (parsed as -f -u -s -e -d value)
|
||||
// ordered
|
||||
// a named parameter can only take zero or one values. Establish a convention for compound values:
|
||||
// --name val1,val2
|
||||
// --name=val1,val2
|
||||
// --name="val1, val2" (probably should not consume whitespace, since the user has to go out of their way to do quoting for it)
|
||||
// --name key1=val1,key2=val2
|
||||
// --name=key1=val1,key2=val2 (should be familiar from docker)
|
Loading…
x
Reference in New Issue
Block a user