I will never get tired of vendoring dependencies. ha ha. It is possible
I am insane. I had to do a lot of pruning to get these not to be
ridiculous (especially the unicode data, which had nearly 1 million
lines of... stuff).
This commit is contained in:
2024-08-09 17:32:06 -07:00
commit 7692cb4bc7
155 changed files with 206515 additions and 0 deletions

66
deps/zg/src/CanonData.zig vendored Normal file
View File

@@ -0,0 +1,66 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
allocator: mem.Allocator,
nfc: std.AutoHashMap([2]u21, u21),
nfd: [][]u21 = undefined,
const Self = @This();
/// Initialize from the compressed `canon` data embedded in the binary.
/// Caller must call `deinit` to release all allocations.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("canon");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .nfc = std.AutoHashMap([2]u21, u21).init(allocator),
        .nfd = try allocator.alloc([]u21, 0x110000),
    };

    // Initialize every entry to an empty slice so that the errdefer below
    // (and `deinit`) can safely free the entire table. Note: the previous
    // version freed `self.nfd[0..slices]` on error, but decompositions are
    // stored at arbitrary code point indices, so allocated slices outside
    // that prefix leaked on the error path.
    @memset(self.nfd, &.{});

    errdefer {
        self.nfc.deinit();
        for (self.nfd) |slice| self.allocator.free(slice);
        self.allocator.free(self.nfd);
    }

    while (true) {
        // Record format: total length byte (0 terminates the stream), the
        // code point as u24, then `len - 1` code points of its canonical
        // decomposition.
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;

        const cp = try reader.readInt(u24, endian);
        self.nfd[cp] = try allocator.alloc(u21, len - 1);

        for (0..len - 1) |i| {
            self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian));
        }

        // Code points that decompose to exactly a pair also get a reverse
        // (composition) entry for NFC.
        if (len == 3) {
            try self.nfc.put(self.nfd[cp][0..2].*, @intCast(cp));
        }
    }

    return self;
}
/// Release the composition map, every decomposition slice, and the table.
pub fn deinit(self: *Self) void {
    const allocator = self.allocator;
    self.nfc.deinit();
    // Entries with no decomposition are empty slices; freeing them is a no-op.
    for (self.nfd) |decomp| allocator.free(decomp);
    allocator.free(self.nfd);
}
/// Returns canonical decomposition for `cp`.
/// Empty when `cp` has no canonical decomposition. The slice is owned by
/// this struct; callers must not free it.
pub fn toNfd(self: Self, cp: u21) []const u21 {
    return self.nfd[cp];
}

/// Returns the primary composite for the code point pair `cps`, or null
/// when the pair does not compose.
pub fn toNfc(self: Self, cps: [2]u21) ?u21 {
    return self.nfc.get(cps);
}

202
deps/zg/src/CaseData.zig vendored Normal file
View File

@@ -0,0 +1,202 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
const unicode = std.unicode;
const CodePointIterator = @import("code_point").Iterator;
allocator: mem.Allocator,
case_map: [][2]u21,
prop_s1: []u16 = undefined,
prop_s2: []u8 = undefined,
const Self = @This();
/// Initialize from the compressed `upper`, `lower`, and `case_prop` data
/// embedded in the binary. Caller must call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .case_map = try allocator.alloc([2]u21, 0x110000),
    };
    errdefer allocator.free(self.case_map);

    // Identity mapping by default; the data streams below only carry deltas
    // for code points whose case mappings differ from themselves.
    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        self.case_map[cp] = .{ cp, cp };
    }

    // Uppercase: (code point, signed delta) pairs, terminated by cp == 0.
    const upper_bytes = @embedFile("upper");
    var upper_fbs = std.io.fixedBufferStream(upper_bytes);
    var upper_decomp = decompressor(.raw, upper_fbs.reader());
    var upper_reader = upper_decomp.reader();

    while (true) {
        const cp = try upper_reader.readInt(i24, endian);
        if (cp == 0) break;
        const diff = try upper_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
    }

    // Lowercase: same pair encoding as uppercase.
    const lower_bytes = @embedFile("lower");
    var lower_fbs = std.io.fixedBufferStream(lower_bytes);
    var lower_decomp = decompressor(.raw, lower_fbs.reader());
    var lower_reader = lower_decomp.reader();

    while (true) {
        const cp = try lower_reader.readInt(i24, endian);
        if (cp == 0) break;
        const diff = try lower_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
    }

    // Case properties: a two-stage lookup table (u16 stage 1, u8 stage 2).
    const cp_bytes = @embedFile("case_prop");
    var cp_fbs = std.io.fixedBufferStream(cp_bytes);
    var cp_decomp = decompressor(.raw, cp_fbs.reader());
    var cp_reader = cp_decomp.reader();

    const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.prop_s1);
    for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);

    const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.prop_s2);
    // `readAll` returns a short count at end of stream; the result was
    // previously discarded, silently accepting a truncated table. Reject
    // truncated data instead.
    if (try cp_reader.readAll(self.prop_s2) != stage_2_len) return error.EndOfStream;

    return self;
}
/// Free all tables owned by this struct.
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.case_map);
    allocator.free(self.prop_s1);
    allocator.free(self.prop_s2);
}
/// Returns true if `cp` is either upper, lower, or title case.
pub fn isCased(self: Self, cp: u21) bool {
    const props = self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)];
    return props & 4 != 0;
}

/// Returns true if `cp` is uppercase.
pub fn isUpper(self: Self, cp: u21) bool {
    const props = self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)];
    return props & 2 != 0;
}
/// Returns true if `str` is all uppercase.
/// Uncased code points (digits, punctuation, ...) are ignored.
pub fn isUpperStr(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };
    while (iter.next()) |cp| {
        if (self.isCased(cp.code) and !self.isUpper(cp.code)) return false;
    }
    return true;
}

test "isUpperStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
    try testing.expect(!cd.isUpperStr("hello, world 2112!"));
    try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
}
/// Returns uppercase mapping for `cp`.
pub fn toUpper(self: Self, cp: u21) u21 {
    return self.case_map[cp][0];
}

/// Returns a new string with all letters in uppercase.
/// Caller must free returned bytes with `allocator`.
pub fn toUpperStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    defer result.deinit();

    var cp_iter = CodePointIterator{ .bytes = str };
    var encode_buf: [4]u8 = undefined;

    while (cp_iter.next()) |cp| {
        const n = try unicode.utf8Encode(self.toUpper(cp.code), &encode_buf);
        try result.appendSlice(encode_buf[0..n]);
    }

    return try result.toOwnedSlice();
}

test "toUpperStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(uppered);
    try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
}
/// Returns true if `cp` is lowercase.
pub fn isLower(self: Self, cp: u21) bool {
    const props = self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)];
    return props & 1 != 0;
}

/// Returns true if `str` is all lowercase.
/// Uncased code points (digits, punctuation, ...) are ignored.
pub fn isLowerStr(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };
    while (iter.next()) |cp| {
        if (self.isCased(cp.code) and !self.isLower(cp.code)) return false;
    }
    return true;
}

test "isLowerStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    try testing.expect(cd.isLowerStr("hello, world 2112!"));
    try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
    try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
}

/// Returns lowercase mapping for `cp`.
pub fn toLower(self: Self, cp: u21) u21 {
    return self.case_map[cp][1];
}
/// Returns a new string with all letters in lowercase.
/// Caller must free returned bytes with `allocator`.
pub fn toLowerStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    defer result.deinit();

    var cp_iter = CodePointIterator{ .bytes = str };
    var encode_buf: [4]u8 = undefined;

    while (cp_iter.next()) |cp| {
        const n = try unicode.utf8Encode(self.toLower(cp.code), &encode_buf);
        try result.appendSlice(encode_buf[0..n]);
    }

    return try result.toOwnedSlice();
}

test "toLowerStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(lowered);
    try testing.expectEqualStrings("hello, world 2112!", lowered);
}

189
deps/zg/src/CaseFold.zig vendored Normal file
View File

@@ -0,0 +1,189 @@
const std = @import("std");
const mem = std.mem;
const testing = std.testing;
const ascii = @import("ascii");
pub const FoldData = @import("FoldData");
const Normalize = @import("Normalize");
fold_data: *const FoldData,
const Self = @This();
/// Produces the case folded code points for `cps`. Caller must free returned
/// slice with `allocator`.
pub fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var folded = std.ArrayList(u21).init(allocator);
    defer folded.deinit();

    var fold_buf: [3]u21 = undefined;

    for (cps) |cp| {
        const mapping = self.fold_data.caseFold(cp, &fold_buf);
        // An empty mapping means the code point folds to itself.
        if (mapping.len == 0) {
            try folded.append(cp);
        } else {
            try folded.appendSlice(mapping);
        }
    }

    return try folded.toOwnedSlice();
}
/// True if case folding any code point in `cps` would change it.
fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
    for (cps) |cp| {
        if (self.fold_data.changesWhenCaseFolded(cp)) return true;
    }
    return false;
}
/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
/// Implements NFKD(CaseFold(NFKD(CaseFold(NFD(x))))) for each side and
/// compares the results.
pub fn compatCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalize,
    a: []const u8,
    b: []const u8,
) !bool {
    // ASCII fast path: a plain byte-wise caseless compare suffices.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    // Process a
    const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);

    // Case fold only when folding would change something; otherwise reuse
    // `nfd_a` and track ownership via the flag so it isn't double-freed.
    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
    if (self.changesWhenCaseFolded(nfd_a)) {
        cf_nfd_a = try self.caseFold(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);

    const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a);
    defer allocator.free(nfkd_cf_nfd_a);
    const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
    defer allocator.free(cf_nfkd_cf_nfd_a);
    const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);

    // Process b (same pipeline as a).
    const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);

    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
    if (self.changesWhenCaseFolded(nfd_b)) {
        cf_nfd_b = try self.caseFold(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);

    const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b);
    defer allocator.free(nfkd_cf_nfd_b);
    const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);
    defer allocator.free(cf_nfkd_cf_nfd_b);
    const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);

    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
}
test "compatCaselessMatch" {
    const allocator = testing.allocator;

    const norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    const fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));

    // U+03D3 matches the decomposed upsilon forms only under compatibility
    // (NFKD) decomposition, which this function applies.
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c));
}
/// Performs canonical caseless string matching by decomposing to NFD. This is
/// faster than `compatCaselessMatch`, but less comprehensive.
/// Implements NFD(CaseFold(NFD(x))) for each side and compares the results.
pub fn canonCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalize,
    a: []const u8,
    b: []const u8,
) !bool {
    // ASCII fast path: a plain byte-wise caseless compare suffices.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    // Process a
    const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);

    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
    if (self.changesWhenCaseFolded(nfd_a)) {
        cf_nfd_a = try self.caseFold(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);

    // Case folding can produce a denormalized sequence, so re-apply NFD only
    // when folding actually changed something; otherwise `cf_nfd_a` is
    // already NFD. BUG FIX: this condition was previously inverted
    // (`!need_free_cf_nfd_a`), skipping the required re-normalization exactly
    // when folding had occurred and redundantly re-normalizing when it hadn't.
    var need_free_nfd_cf_nfd_a = false;
    var nfd_cf_nfd_a = cf_nfd_a;
    if (need_free_cf_nfd_a) {
        nfd_cf_nfd_a = try normalizer.nfdCodePoints(allocator, cf_nfd_a);
        need_free_nfd_cf_nfd_a = true;
    }
    defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);

    // Process b (same pipeline as a).
    const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);

    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
    if (self.changesWhenCaseFolded(nfd_b)) {
        cf_nfd_b = try self.caseFold(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);

    var need_free_nfd_cf_nfd_b = false;
    var nfd_cf_nfd_b = cf_nfd_b;
    if (need_free_cf_nfd_b) {
        nfd_cf_nfd_b = try normalizer.nfdCodePoints(allocator, cf_nfd_b);
        need_free_nfd_cf_nfd_b = true;
    }
    defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);

    return mem.eql(u21, nfd_cf_nfd_a, nfd_cf_nfd_b);
}
test "canonCaselessMatch" {
    const allocator = testing.allocator;

    const norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    const fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    try testing.expect(try caser.canonCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));

    // U+03D3 only matches the U+03A5-based form under compatibility (NFKD)
    // folding, so canonical matching must report false here...
    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(!try caser.canonCaselessMatch(allocator, &n, a, b));

    // ...but the U+03D2-based form is canonically equivalent.
    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.canonCaselessMatch(allocator, &n, a, c));
}

49
deps/zg/src/CombiningData.zig vendored Normal file
View File

@@ -0,0 +1,49 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
allocator: mem.Allocator,
s1: []u16 = undefined,
s2: []u8 = undefined,
const Self = @This();
/// Initialize from the compressed `ccc` (canonical combining class) data
/// embedded in the binary. Caller must call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("ccc");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Stage 1 of the two-stage lookup table.
    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    // Stage 2 holds the actual combining class bytes.
    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.s2);
    // `readAll` returns a short count at end of stream; the result was
    // previously discarded, leaving the tail of the table undefined on
    // truncated data. Reject truncated data instead.
    if (try reader.readAll(self.s2) != stage_2_len) return error.EndOfStream;

    return self;
}
/// Free both lookup stages.
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.s1);
    allocator.free(self.s2);
}

/// Returns the canonical combining class for a code point.
pub fn ccc(self: Self, cp: u21) u8 {
    const stage_2_index = self.s1[cp >> 8] + (cp & 0xff);
    return self.s2[stage_2_index];
}

/// True if `cp` is a starter code point, not a combining character.
/// Starters are exactly the code points with combining class zero.
pub fn isStarter(self: Self, cp: u21) bool {
    return self.ccc(cp) == 0;
}

50
deps/zg/src/CompatData.zig vendored Normal file
View File

@@ -0,0 +1,50 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
allocator: mem.Allocator,
nfkd: [][]u21 = undefined,
const Self = @This();
/// Initialize from the compressed `compat` data embedded in the binary.
/// Caller must call `deinit` to release all allocations.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("compat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .nfkd = try allocator.alloc([]u21, 0x110000),
    };
    // Safe because the @memset below runs before any fallible statement,
    // so `deinit` only ever sees valid (possibly empty) slices.
    errdefer self.deinit();

    @memset(self.nfkd, &.{});

    while (true) {
        // Record format: total length byte (0 terminates the stream), the
        // code point as u24, then `len - 1` code points of its compatibility
        // decomposition.
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;

        const cp = try reader.readInt(u24, endian);
        self.nfkd[cp] = try allocator.alloc(u21, len - 1);

        for (0..len - 1) |i| {
            self.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian));
        }
    }

    return self;
}
/// Free every decomposition slice and the table itself.
pub fn deinit(self: *const Self) void {
    for (self.nfkd) |decomp| {
        if (decomp.len != 0) self.allocator.free(decomp);
    }
    self.allocator.free(self.nfkd);
}

/// Returns compatibility decomposition for `cp`; empty when there is none.
/// NOTE(review): returns a mutable view of internal data — consider
/// `[]const u21` like `CanonData.toNfd`; changing it would require auditing
/// callers, so it is left as-is here.
pub fn toNfkd(self: Self, cp: u21) []u21 {
    return self.nfkd[cp];
}

355
deps/zg/src/DisplayWidth.zig vendored Normal file
View File

@@ -0,0 +1,355 @@
const std = @import("std");
const builtin = @import("builtin");
const ArrayList = std.ArrayList;
const mem = std.mem;
const simd = std.simd;
const testing = std.testing;
const ascii = @import("ascii");
const CodePointIterator = @import("code_point").Iterator;
const GraphemeIterator = @import("grapheme").Iterator;
pub const DisplayWidthData = @import("DisplayWidthData");
data: *const DisplayWidthData,
const Self = @This();
/// strWidth returns the total display width of `str` as the number of cells
/// required in a fixed-pitch font (i.e. a terminal screen).
pub fn strWidth(self: Self, str: []const u8) usize {
    // Signed running total: per the tests below, some code points (e.g.
    // backspace, DEL) carry negative widths; clamped to >= 0 on return.
    var total: isize = 0;

    // ASCII fast path
    if (ascii.isAsciiOnly(str)) {
        for (str) |b| total += self.data.codePointWidth(b);
        return @intCast(@max(0, total));
    }

    var giter = GraphemeIterator.init(str, &self.data.g_data);

    // Width is computed per grapheme cluster: only the first non-zero-width
    // code point of a cluster contributes.
    while (giter.next()) |gc| {
        var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
        var gc_total: isize = 0;

        while (cp_iter.next()) |cp| {
            var w = self.data.codePointWidth(cp.code);

            if (w != 0) {
                // Handle text emoji sequence. Note this consumes the next
                // code point from the iterator to peek at a variation
                // selector: VS15 (U+FE0E) forces narrow, VS16 (U+FE0F)
                // forces wide presentation.
                if (cp_iter.next()) |ncp| {
                    // emoji text sequence.
                    if (ncp.code == 0xFE0E) w = 1;
                    if (ncp.code == 0xFE0F) w = 2;
                }

                // Only adding width of first non-zero-width code point.
                if (gc_total == 0) {
                    gc_total = w;
                    break;
                }
            }
        }

        total += gc_total;
    }

    return @intCast(@max(0, total));
}
test "strWidth" {
    const data = try DisplayWidthData.init(testing.allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n"));
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}")); // Default text presentation
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector
    // Backspace and DEL have negative width; totals are clamped at zero.
    try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace
    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL
    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than o

    // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
    const empty = "";
    try testing.expectEqual(@as(usize, 0), self.strWidth(empty));
    const with_null = "hello\x00world";
    try testing.expectEqual(@as(usize, 10), self.strWidth(with_null));
    const hello_jp = "コンニチハ, セカイ!";
    try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp));
    const control = "\x1b[0m";
    try testing.expectEqual(@as(usize, 3), self.strWidth(control));
    const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
    try testing.expectEqual(@as(usize, 3), self.strWidth(balinese));

    // These commented out tests require a new specification for complex scripts.
    // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
    // const jamo = "\u{1100}\u{1160}";
    // try testing.expectEqual(@as(usize, 3), strWidth(jamo));
    // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}";
    // try testing.expectEqual(@as(usize, 3), strWidth(devengari));
    // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}";
    // try testing.expectEqual(@as(usize, 5), strWidth(tamal));
    // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}";
    // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));

    // The following passes but as a mere coincidence.
    const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
    try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2));

    // From Rust https://github.com/jameslanska/unicode-display-width
    try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("🦀"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("👨‍👩‍👧‍👧"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("👩‍🔬"));
    try testing.expectEqual(@as(usize, 9), self.strWidth("sane text"));
    try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
    try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나"));
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}"));
}
/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding.
/// If the length of `str` and `total_width` have different parity, the right side of `str` will
/// receive one additional pad. This makes sure the returned string fills the requested width.
/// Returns `error.StrTooLong` when `str` is wider than `total_width` and
/// `error.PadTooLong` when `pad` cannot fill the margin — including a
/// zero-display-width `pad`, which previously triggered a division-by-zero
/// safety panic.
/// Caller must free returned bytes with `allocator`.
pub fn center(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;
    if (str_width == total_width) return try allocator.dupe(u8, str);

    const pad_width = self.strWidth(pad);
    // BUG FIX: a zero-width pad can never fill the margin; report it instead
    // of dividing by zero in `@divFloor` below.
    if (pad_width == 0) return error.PadTooLong;
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    const margin_width = @divFloor((total_width - str_width), 2);
    if (pad_width > margin_width) return error.PadTooLong;

    // On parity mismatch the right margin gets one extra pad.
    const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0;
    const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad;

    const result = try allocator.alloc(u8, pads * pad.len + str.len);
    var bytes_index: usize = 0;
    var pads_index: usize = 0;

    // Left margin.
    while (pads_index < pads / 2) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    @memcpy(result[bytes_index..][0..str.len], str);
    bytes_index += str.len;

    // Right margin (including the extra pad, if any).
    pads_index = 0;
    while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    return result;
}
test "center" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    // Note: each buffer is freed manually before `centered` is reassigned;
    // no defer is used so there is no double free.
    // Input and width both have odd length
    var centered = try self.center(allocator, "abc", 9, "*");
    try testing.expectEqualSlices(u8, "***abc***", centered);

    // Input and width both have even length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "w😊w", 10, "-");
    try testing.expectEqualSlices(u8, "---w😊w---", centered);

    // Input has even length, width has odd length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "1234", 9, "-");
    try testing.expectEqualSlices(u8, "--1234---", centered);

    // Input has odd length, width has even length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "123", 8, "-");
    try testing.expectEqualSlices(u8, "--123---", centered);

    // Input is the same length as the width
    testing.allocator.free(centered);
    centered = try self.center(allocator, "123", 3, "-");
    try testing.expectEqualSlices(u8, "123", centered);

    // Input is empty
    testing.allocator.free(centered);
    centered = try self.center(allocator, "", 3, "-");
    try testing.expectEqualSlices(u8, "---", centered);

    // Input is empty and width is zero
    testing.allocator.free(centered);
    centered = try self.center(allocator, "", 0, "-");
    try testing.expectEqualSlices(u8, "", centered);

    // Input is longer than the width, which is an error
    testing.allocator.free(centered);
    try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-"));
}
/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding
/// on the left side. Caller must free returned bytes with `allocator`.
/// Returns `error.StrTooLong` when `str` is wider than `total_width` and
/// `error.PadTooLong` when `pad` cannot fill the margin — including a
/// zero-display-width `pad`, which previously triggered a division-by-zero
/// safety panic. An exact-fit `str` is returned as a copy (consistent with
/// `center`; it previously returned `error.PadTooLong`).
pub fn padLeft(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;
    if (str_width == total_width) return try allocator.dupe(u8, str);

    const pad_width = self.strWidth(pad);
    // BUG FIX: guard against division by zero in `@divFloor` below.
    if (pad_width == 0) return error.PadTooLong;
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    const margin_width = total_width - str_width;
    if (pad_width > margin_width) return error.PadTooLong;

    const pads = @divFloor(margin_width, pad_width);

    const result = try allocator.alloc(u8, pads * pad.len + str.len);
    var bytes_index: usize = 0;
    var pads_index: usize = 0;

    while (pads_index < pads) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    @memcpy(result[bytes_index..][0..str.len], str);

    return result;
}
test "padLeft" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    var right_aligned = try self.padLeft(allocator, "abc", 9, "*");
    // The defer frees the buffer assigned last; the first buffer is freed
    // manually below before reassignment, so each buffer is freed once.
    defer testing.allocator.free(right_aligned);
    try testing.expectEqualSlices(u8, "******abc", right_aligned);

    testing.allocator.free(right_aligned);
    right_aligned = try self.padLeft(allocator, "w😊w", 10, "-");
    try testing.expectEqualSlices(u8, "------w😊w", right_aligned);
}
/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding
/// on the right side. Caller must free returned bytes with `allocator`.
/// Returns `error.StrTooLong` when `str` is wider than `total_width` and
/// `error.PadTooLong` when `pad` cannot fill the margin — including a
/// zero-display-width `pad`, which previously triggered a division-by-zero
/// safety panic. An exact-fit `str` is returned as a copy (consistent with
/// `center`; it previously returned `error.PadTooLong`).
pub fn padRight(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;
    if (str_width == total_width) return try allocator.dupe(u8, str);

    const pad_width = self.strWidth(pad);
    // BUG FIX: guard against division by zero in `@divFloor` below.
    if (pad_width == 0) return error.PadTooLong;
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    const margin_width = total_width - str_width;
    if (pad_width > margin_width) return error.PadTooLong;

    const pads = @divFloor(margin_width, pad_width);

    const result = try allocator.alloc(u8, pads * pad.len + str.len);
    var bytes_index: usize = 0;
    var pads_index: usize = 0;

    @memcpy(result[bytes_index..][0..str.len], str);
    bytes_index += str.len;

    while (pads_index < pads) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    return result;
}
test "padRight" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    var left_aligned = try self.padRight(allocator, "abc", 9, "*");
    // The defer frees the buffer assigned last; the first buffer is freed
    // manually below before reassignment, so each buffer is freed once.
    defer testing.allocator.free(left_aligned);
    try testing.expectEqualSlices(u8, "abc******", left_aligned);

    testing.allocator.free(left_aligned);
    left_aligned = try self.padRight(allocator, "w😊w", 10, "-");
    try testing.expectEqualSlices(u8, "w😊w------", left_aligned);
}
/// Wraps a string approximately at the given number of columns per line.
/// `threshold` defines how far the last column of the last word can be
/// from the edge. Caller must free returned bytes with `allocator`.
pub fn wrap(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    columns: usize,
    threshold: usize,
) ![]u8 {
    var result = ArrayList(u8).init(allocator);
    defer result.deinit();

    var line_iter = mem.tokenizeAny(u8, str, "\r\n");
    var line_width: usize = 0;

    while (line_iter.next()) |line| {
        var word_iter = mem.tokenizeScalar(u8, line, ' ');

        while (word_iter.next()) |word| {
            try result.appendSlice(word);
            try result.append(' ');
            line_width += self.strWidth(word) + 1;

            if (line_width > columns or columns - line_width <= threshold) {
                try result.append('\n');
                line_width = 0;
            }
        }
    }

    // Trim the trailing separator(s) appended after the last word. BUG FIX:
    // the previous version popped exactly twice unconditionally, which
    // panicked on empty input and ate the last byte of the final word when
    // no trailing newline had been appended (trailing " " instead of " \n").
    while (result.items.len > 0) {
        const last = result.items[result.items.len - 1];
        if (last != ' ' and last != '\n') break;
        _ = result.pop();
    }

    return try result.toOwnedSlice();
}
test "wrap" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    // Interior lines keep their trailing space before the newline; only the
    // very end of the result is trimmed.
    const input = "The quick brown fox\r\njumped over the lazy dog!";
    const got = try self.wrap(allocator, input, 10, 3);
    defer testing.allocator.free(got);
    const want = "The quick \nbrown fox \njumped \nover the \nlazy dog!";
    try testing.expectEqualStrings(want, got);
}

98
deps/zg/src/FoldData.zig vendored Normal file
View File

@@ -0,0 +1,98 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
allocator: mem.Allocator,
cutoff: u21 = undefined,
cwcf_exceptions_min: u21 = undefined,
cwcf_exceptions_max: u21 = undefined,
cwcf_exceptions: []u21 = undefined,
multiple_start: u21 = undefined,
stage1: []u8 = undefined,
stage2: []u8 = undefined,
stage3: []i24 = undefined,
const Self = @This();
/// Initialize from the compressed `fold` data embedded in the binary.
/// Caller must call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    self.cutoff = @intCast(try reader.readInt(u24, endian));
    self.multiple_start = @intCast(try reader.readInt(u24, endian));

    var len = try reader.readInt(u16, endian);
    self.stage1 = try allocator.alloc(u8, len);
    errdefer allocator.free(self.stage1);
    for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    self.stage2 = try allocator.alloc(u8, len);
    errdefer allocator.free(self.stage2);
    for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    self.stage3 = try allocator.alloc(i24, len);
    errdefer allocator.free(self.stage3);
    for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);

    self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
    self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));

    len = try reader.readInt(u16, endian);
    self.cwcf_exceptions = try allocator.alloc(u21, len);
    // BUG FIX: this errdefer was missing, so a read failure in the loop
    // below leaked `cwcf_exceptions`.
    errdefer allocator.free(self.cwcf_exceptions);
    for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));

    return self;
}
/// Free all tables owned by this struct.
pub fn deinit(self: *const Self) void {
    self.allocator.free(self.stage1);
    self.allocator.free(self.stage2);
    self.allocator.free(self.stage3);
    // BUG FIX: `cwcf_exceptions` is allocated in `init` but was never freed
    // here, leaking on every deinit.
    self.allocator.free(self.cwcf_exceptions);
}
/// Returns the case fold for `cp`.
/// `buf` must have room for at least 3 code points (the longest full case
/// fold). Returns an empty slice when `cp` folds to itself; otherwise the
/// returned slice aliases `buf`.
pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
    // Code points at or above the cutoff never change under case folding.
    if (cp >= self.cutoff) return &.{};

    const stage1_val = self.stage1[cp >> 8];
    if (stage1_val == 0) return &.{};

    const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
    const stage3_index = self.stage2[stage2_index];

    // A set high bit marks a multi-code-point mapping stored as a
    // zero-terminated triple in the `multiple_start` region of stage 3;
    // the low 7 bits index those triples.
    if (stage3_index & 0x80 != 0) {
        const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3;
        const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0);
        for (mapping, 0..) |c, i| buf[i] = @intCast(c);
        return buf[0..mapping.len];
    }

    // Otherwise stage 3 stores a signed delta from `cp`; 0 means identity.
    const offset = self.stage3[stage3_index];
    if (offset == 0) return &.{};

    buf[0] = @intCast(@as(i32, cp) + offset);
    return buf[0..1];
}
/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
pub fn changesWhenCaseFolded(self: Self, cp: u21) bool {
    var fold_buf: [3]u21 = undefined;
    // No mapping means the code point folds to itself.
    if (self.caseFold(cp, &fold_buf).len == 0) return false;
    return !self.isCwcfException(cp);
}
/// True if `cp` is listed as an exception to the Changes_When_Casefolded
/// property (range check first, then the exception list).
fn isCwcfException(self: Self, cp: u21) bool {
    if (cp < self.cwcf_exceptions_min or cp > self.cwcf_exceptions_max) return false;
    return std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null;
}

171
deps/zg/src/GenCatData.zig vendored Normal file
View File

@@ -0,0 +1,171 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
/// General Category (Unicode `gc` property).
/// NOTE(review): variant order defines the integer tags stored in the
/// compressed stage-3 table (`gc` below uses `@enumFromInt`); do not
/// reorder without regenerating the data.
pub const Gc = enum {
    Cc, // Other, Control
    Cf, // Other, Format
    Cn, // Other, Unassigned
    Co, // Other, Private Use
    Cs, // Other, Surrogate
    Ll, // Letter, Lowercase
    Lm, // Letter, Modifier
    Lo, // Letter, Other
    Lu, // Letter, Uppercase
    Lt, // Letter, Titlecase
    Mc, // Mark, Spacing Combining
    Me, // Mark, Enclosing
    Mn, // Mark, Non-Spacing
    Nd, // Number, Decimal Digit
    Nl, // Number, Letter
    No, // Number, Other
    Pc, // Punctuation, Connector
    Pd, // Punctuation, Dash
    Pe, // Punctuation, Close
    Pf, // Punctuation, Final quote (may behave like Ps or Pe depending on usage)
    Pi, // Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
    Po, // Punctuation, Other
    Ps, // Punctuation, Open
    Sc, // Symbol, Currency
    Sk, // Symbol, Modifier
    Sm, // Symbol, Math
    So, // Symbol, Other
    Zl, // Separator, Line
    Zp, // Separator, Paragraph
    Zs, // Separator, Space
};
allocator: mem.Allocator,
// Three-stage lookup (see `gc`): s1 is indexed by `cp >> 8`, s2 by
// `s1[...] + (cp & 0xff)`, and s2's value indexes the deduplicated
// category tags in s3.
s1: []u16 = undefined,
s2: []u5 = undefined,
s3: []u5 = undefined,

const Self = @This();
/// Build the three-stage General Category table from the compressed,
/// `@embedFile`'d "gencat" data. Release with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("gencat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();
    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };
    // Stage 1: u16 length, then u16 entries.
    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
    // Stage 2: u16 length, then u8 entries narrowed to u5.
    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u5, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));
    // Stage 3: length is stored as a single byte in the stream (the
    // annotation previously claimed u16 while reading a u8), then u8
    // entries narrowed to u5.
    const s3_len: u8 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u5, s3_len);
    errdefer allocator.free(self.s3);
    for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian));
    return self;
}
/// Free all three stage tables (reverse allocation order).
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.s3);
    allocator.free(self.s2);
    allocator.free(self.s1);
}
/// Lookup the General Category for `cp` via the three-stage table.
pub fn gc(self: Self, cp: u21) Gc {
    const block_base = self.s1[cp >> 8];
    const tag_index = self.s2[block_base + (cp & 0xff)];
    return @enumFromInt(self.s3[tag_index]);
}
/// True if `cp` has a C (Other) general category.
pub fn isControl(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Cc, .Cf, .Cn, .Co, .Cs => true,
        else => false,
    };
}

/// True if `cp` has an L (Letter) general category.
pub fn isLetter(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Ll, .Lm, .Lo, .Lu, .Lt => true,
        else => false,
    };
}

/// True if `cp` has an M (Mark) general category.
pub fn isMark(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Mc, .Me, .Mn => true,
        else => false,
    };
}

/// True if `cp` has an N (Number) general category.
pub fn isNumber(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Nd, .Nl, .No => true,
        else => false,
    };
}

/// True if `cp` has a P (Punctuation) general category.
pub fn isPunctuation(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Pc, .Pd, .Pe, .Pf, .Pi, .Po, .Ps => true,
        else => false,
    };
}

/// True if `cp` has an S (Symbol) general category.
pub fn isSymbol(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Sc, .Sk, .Sm, .So => true,
        else => false,
    };
}

/// True if `cp` has a Z (Separator) general category.
pub fn isSeparator(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Zl, .Zp, .Zs => true,
        else => false,
    };
}

88
deps/zg/src/GraphemeData.zig vendored Normal file
View File

@@ -0,0 +1,88 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
/// Indic syllable type.
/// NOTE(review): variant order matches the 3-bit field decoded by
/// `indic` below via `@enumFromInt`; do not reorder without
/// regenerating the data.
pub const Indic = enum {
    none,
    Consonant,
    Extend,
    Linker,
};

/// Grapheme break property.
/// NOTE(review): variant order matches the 4-bit field decoded by
/// `gbp` below via `@enumFromInt`; do not reorder without
/// regenerating the data.
pub const Gbp = enum {
    none,
    Control,
    CR,
    Extend,
    L,
    LF,
    LV,
    LVT,
    Prepend,
    Regional_Indicator,
    SpacingMark,
    T,
    V,
    ZWJ,
};
allocator: mem.Allocator,
// Three-stage lookup; each s3 byte packs gbp (high 4 bits), indic
// (bits 1-3), and an emoji flag (bit 0) — see the accessors below.
s1: []u16 = undefined,
s2: []u16 = undefined,
s3: []u8 = undefined,

const Self = @This();
/// Build the grapheme-break-property tables from the compressed,
/// `@embedFile`'d "gbp" data. Release with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("gbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();
    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };
    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);
    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian);
    const s3_len: u16 = try reader.readInt(u16, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    // Previously `_ = readAll(...)` discarded the short-read count,
    // which would silently leave the tail of s3 uninitialized on
    // truncated data. Fail loudly instead.
    try reader.readNoEof(self.s3);
    return self;
}
/// Free all three stage tables (reverse allocation order).
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.s3);
    allocator.free(self.s2);
    allocator.free(self.s1);
}
/// Lookup the grapheme break property for a code point
/// (high 4 bits of the packed stage-3 byte).
pub fn gbp(self: Self, cp: u21) Gbp {
    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4);
}

/// Lookup the Indic syllable type for a code point
/// (bits 1-3 of the packed stage-3 byte).
pub fn indic(self: Self, cp: u21) Indic {
    return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
}

/// True if the code point's emoji-property bit (bit 0 of the packed
/// stage-3 byte) is set. (Original comment was copy-pasted from
/// `indic` and described the wrong property.)
pub fn isEmoji(self: Self, cp: u21) bool {
    return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
}

53
deps/zg/src/HangulData.zig vendored Normal file
View File

@@ -0,0 +1,53 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
/// Hangul syllable type.
/// NOTE(review): variant order matches the stored tag values decoded by
/// `syllable` via `@enumFromInt`; do not reorder without regenerating
/// the data.
pub const Syllable = enum {
    none,
    L,
    LV,
    LVT,
    V,
    T,
};

allocator: mem.Allocator,
// Two-stage lookup: s1 indexed by `cp >> 8`, s2 by `s1[...] + (cp & 0xff)`.
s1: []u16 = undefined,
s2: []u3 = undefined,

const Self = @This();
/// Build the two-stage Hangul syllable-type table from the compressed,
/// `@embedFile`'d "hangul" data. Release with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("hangul");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();
    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };

    // Stage 1: u16 count followed by u16 entries.
    const s1_count: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_count);
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    // Stage 2: u16 count followed by u8 entries narrowed to u3.
    const s2_count: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u3, s2_count);
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));

    return self;
}
/// Free both stage tables (reverse allocation order).
pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s2);
    self.allocator.free(self.s1);
}

/// Returns the Hangul syllable type for `cp`.
pub fn syllable(self: Self, cp: u21) Syllable {
    const base = self.s1[cp >> 8];
    return @enumFromInt(self.s2[base + (cp & 0xff)]);
}

37
deps/zg/src/NormData.zig vendored Normal file
View File

@@ -0,0 +1,37 @@
const std = @import("std");
const mem = std.mem;
const CanonData = @import("CanonData");
const CccData = @import("CombiningData");
const CompatData = @import("CompatData");
const FoldData = @import("FoldData");
const HangulData = @import("HangulData");
const NormPropsData = @import("NormPropsData");
// Aggregated normalization tables; all members are built by `init`.
canon_data: CanonData = undefined,
ccc_data: CccData = undefined,
compat_data: CompatData = undefined,
hangul_data: HangulData = undefined,
normp_data: NormPropsData = undefined,

const Self = @This();
/// Initialize all normalization sub-tables in place. On failure the
/// already-initialized members are torn down by the errdefer chain.
pub fn init(self: *Self, allocator: std.mem.Allocator) !void {
    self.canon_data = try CanonData.init(allocator);
    errdefer self.canon_data.deinit();
    self.ccc_data = try CccData.init(allocator);
    errdefer self.ccc_data.deinit();
    self.compat_data = try CompatData.init(allocator);
    errdefer self.compat_data.deinit();
    self.hangul_data = try HangulData.init(allocator);
    errdefer self.hangul_data.deinit();
    // Last member needs no errdefer: if it fails we return immediately
    // and the errdefers above run.
    self.normp_data = try NormPropsData.init(allocator);
}

/// Release all sub-tables.
pub fn deinit(self: *Self) void {
    self.canon_data.deinit();
    self.ccc_data.deinit();
    self.compat_data.deinit();
    self.hangul_data.deinit();
    self.normp_data.deinit();
}

54
deps/zg/src/NormPropsData.zig vendored Normal file
View File

@@ -0,0 +1,54 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
allocator: mem.Allocator,
// Two-stage lookup; each s2 entry packs the NFD/NFKD quick-check bits
// and the composition-exclusion bit (see accessors below).
s1: []u16 = undefined,
s2: []u4 = undefined,

const Self = @This();
/// Build the two-stage normalization-properties table from the
/// compressed, `@embedFile`'d "normp" data. Release with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("normp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();
    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };

    // Stage 1: u16 count followed by u16 entries.
    const s1_count: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_count);
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    // Stage 2: u16 count followed by u8 entries narrowed to u4.
    const s2_count: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u4, s2_count);
    errdefer allocator.free(self.s2);
    for (self.s2) |*entry| entry.* = @intCast(try reader.readInt(u8, endian));

    return self;
}
/// Free both stage tables (reverse allocation order).
pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s2);
    self.allocator.free(self.s1);
}
/// Returns true if `cp` is already in NFD form (quick-check bit 0 clear).
pub fn isNfd(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
}

/// Returns true if `cp` is already in NFKD form (quick-check bit 1 clear).
pub fn isNfkd(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
}

/// Returns true if `cp` is a Full Composition Exclusion (bit 2 set):
/// it is not allowed to appear in any composed normalized form
/// (NFC/NFKC) — see its use in `Normalize.nfxc`.
pub fn isFcx(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
}

622
deps/zg/src/Normalize.zig vendored Normal file
View File

@@ -0,0 +1,622 @@
//! Normalizer contains functions and methods that implement
//! Unicode Normalization. You can normalize strings into NFC,
//! NFKC, NFD, and NFKD normalization forms.
const std = @import("std");
const debug = std.debug;
const assert = debug.assert;
const fmt = std.fmt;
const heap = std.heap;
const mem = std.mem;
const simd = std.simd;
const testing = std.testing;
const unicode = std.unicode;
const ascii = @import("ascii");
const CodePointIterator = @import("code_point").Iterator;
pub const NormData = @import("NormData");
// Shared, read-only normalization tables; not owned by this struct.
norm_data: *const NormData,

const Self = @This();

// Hangul composition/decomposition constants from the Unicode Standard,
// §3.12 "Conjoining Jamo Behavior".
const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const LCount: u21 = 19;
const VCount: u21 = 21;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount
const SCount: u21 = 11172; // LCount * NCount
// Algorithmically decompose a precomposed Hangul syllable (LV or LVT)
// into its jamo, per Unicode §3.12. Returns null when `cp` is not an
// LV/LVT syllable. `buf` must hold at least 3 code points.
fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
    const kind = self.norm_data.hangul_data.syllable(cp);
    if (kind != .LV and kind != .LVT) return null;
    const SIndex: u21 = cp - SBase;
    const LIndex: u21 = SIndex / NCount;
    const VIndex: u21 = (SIndex % NCount) / TCount;
    const TIndex: u21 = SIndex % TCount;
    const LPart: u21 = LBase + LIndex;
    const VPart: u21 = VBase + VIndex;
    var dc = Decomp{ .form = .nfd };
    buf[0] = LPart;
    buf[1] = VPart;
    // TIndex == 0 means no trailing consonant: an LV syllable.
    if (TIndex == 0) {
        dc.cps = buf[0..2];
        return dc;
    }
    // TPart
    buf[2] = TBase + TIndex;
    dc.cps = buf[0..3];
    return dc;
}
// Canonically compose an LV syllable with a trailing consonant jamo `t`
// into the corresponding LVT syllable.
fn composeHangulCanon(lv: u21, t: u21) u21 {
    assert(0x11A8 <= t and t <= 0x11C2);
    return (t - TBase) + lv;
}
// Fully compose leading consonant `l`, vowel `v`, and optional trailing
// consonant `t` (pass 0 for none) into a precomposed Hangul syllable.
fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    assert(0x1100 <= l and l <= 0x1112);
    assert(0x1161 <= v and v <= 0x1175);
    const lv_index = (l - LBase) * NCount + (v - VBase) * TCount;
    if (t == 0) return SBase + lv_index;
    assert(0x11A8 <= t and t <= 0x11C2);
    return SBase + lv_index + (t - TBase);
}
// Normalization form selector; `.same` marks "no decomposition applies".
const Form = enum {
    nfc,
    nfd,
    nfkc,
    nfkd,
    same,
};

// A decomposition result: the form it satisfies plus the code points
// (a view into a caller-supplied buffer).
const Decomp = struct {
    form: Form = .same,
    cps: []const u21 = &.{},
};
// `mapping` retrieves the decomposition mapping for a code point as per
// the UCD. For .nfkd, compatibility mappings take precedence with
// canonical mappings as fallback. Returns `.form == .same` when no
// mapping exists.
fn mapping(self: Self, cp: u21, form: Form) Decomp {
    var dc = Decomp{};
    switch (form) {
        .nfd => {
            dc.cps = self.norm_data.canon_data.toNfd(cp);
            if (dc.cps.len != 0) dc.form = .nfd;
        },
        .nfkd => {
            dc.cps = self.norm_data.compat_data.toNfkd(cp);
            if (dc.cps.len != 0) {
                dc.form = .nfkd;
            } else {
                // No compatibility mapping; fall back to canonical.
                dc.cps = self.norm_data.canon_data.toNfd(cp);
                if (dc.cps.len != 0) dc.form = .nfkd;
            }
        },
        else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
    }
    return dc;
}
// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
fn decompose(
self: Self,
cp: u21,
form: Form,
buf: []u21,
) Decomp {
// ASCII
if (cp < 128) return .{};
// NFD / NFKD quick checks.
switch (form) {
.nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
.nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
}
// Hangul precomposed syllable full decomposition.
if (self.decomposeHangul(cp, buf)) |dc| return dc;
// Full decomposition.
var dc = Decomp{ .form = form };
var result_index: usize = 0;
var work_index: usize = 1;
// Start work with argument code point.
var work = [_]u21{cp} ++ [_]u21{0} ** 17;
while (work_index > 0) {
// Look at previous code point in work queue.
work_index -= 1;
const next = work[work_index];
const m = self.mapping(next, form);
// No more of decompositions for this code point.
if (m.form == .same) {
buf[result_index] = next;
result_index += 1;
continue;
}
// Work backwards through decomposition.
// `i` starts at 1 because m_last is 1 past the last code point.
var i: usize = 1;
while (i <= m.cps.len) : ({
i += 1;
work_index += 1;
}) {
work[work_index] = m.cps[m.cps.len - i];
}
}
dc.cps = buf[0..result_index];
return dc;
}
// Covers canonical vs. compatibility mappings, Hangul, and no-op cases.
test "decompose" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };
    var buf: [18]u21 = undefined;
    var dc = n.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);
    dc = n.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);
    dc = n.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);
    dc = n.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);
    dc = n.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
    dc = n.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);
    dc = n.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);
    dc = n.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}
/// Returned from various functions in this namespace. Remember to call
/// `deinit` to free any allocated memory.
pub const Result = struct {
    // Null when `slice` borrows the caller's input (nothing to free).
    allocator: ?mem.Allocator = null,
    slice: []const u8,

    pub fn deinit(self: *const Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
// Compares code points by Canonical Combining Class order.
fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
}

// Applies the Canonical Ordering Algorithm: sorts each maximal run of
// non-starters (ccc != 0) by combining class, leaving starters fixed.
// NOTE(review): canonical ordering must preserve the relative order of
// marks with equal ccc; std.mem.sort is a stable block sort in current
// Zig — confirm against the targeted std version.
fn canonicalSort(self: Self, cps: []u21) void {
    var i: usize = 0;
    while (i < cps.len) : (i += 1) {
        const start: usize = i;
        // Advance past the run of combining marks beginning at `start`.
        while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
        mem.sort(u21, cps[start..i], self, cccLess);
    }
}
/// Normalize `str` to NFD. Free the result with `Result.deinit`.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfd);
}

/// Normalize `str` to NFKD. Free the result with `Result.deinit`.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfkd);
}
/// Decompose the UTF-8 string `str` to a canonically-ordered code point
/// slice in `form` (.nfd or .nfkd). Caller owns the returned slice.
pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();
    var scratch: [18]u21 = undefined;
    var iter = CodePointIterator{ .bytes = str };
    while (iter.next()) |cp| {
        const dc = self.decompose(cp.code, form, &scratch);
        if (dc.form != .same) {
            try out.appendSlice(dc.cps);
        } else {
            // No decomposition; keep the original code point.
            try out.append(cp.code);
        }
    }
    self.canonicalSort(out.items);
    return try out.toOwnedSlice();
}
// Decompose `str` into UTF-8 of form `form`, with an ASCII fast path.
fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // ASCII is already normalized in every form; borrow the input.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    const cps = try self.nfxdCodePoints(allocator, str, form);
    defer allocator.free(cps);
    var bytes = std.ArrayList(u8).init(allocator);
    defer bytes.deinit();
    var utf8_buf: [4]u8 = undefined;
    for (cps) |cp| {
        // Code points from decomposition are always encodable.
        const n = unicode.utf8Encode(cp, &utf8_buf) catch unreachable;
        try bytes.appendSlice(utf8_buf[0..n]);
    }
    return Result{ .allocator = allocator, .slice = try bytes.toOwnedSlice() };
}
// ASCII input takes the borrow (no-alloc) fast path in nfxd.
test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfd(allocator, "Hello World!");
    defer result.deinit();
    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();
    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
}

test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfkd(allocator, "Hello World!");
    defer result.deinit();
    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();
    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
}
// Shared implementation for nfdCodePoints/nfkdCodePoints (the two were
// previously duplicated bodies differing only in the form passed to
// `decompose`): decompose each code point to `form` and canonically
// sort the result.
fn nfxdCodePointsFromSlice(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
    form: Form,
) mem.Allocator.Error![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();
    var dc_buf: [18]u21 = undefined;
    for (cps) |cp| {
        const dc = self.decompose(cp, form, &dc_buf);
        if (dc.form == .same) {
            try dcp_list.append(cp);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }
    self.canonicalSort(dcp_list.items);
    return try dcp_list.toOwnedSlice();
}

/// NFD-decompose a slice of code points. Caller owns the returned slice.
pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    return self.nfxdCodePointsFromSlice(allocator, cps, .nfd);
}

/// NFKD-decompose a slice of code points. Caller owns the returned slice.
pub fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    return self.nfxdCodePointsFromSlice(allocator, cps, .nfkd);
}
// Composition (NFC, NFKC)

// True when `cp` is at/above the jamo range and has a Hangul syllable
// type in the data.
fn isHangul(self: Self, cp: u21) bool {
    if (cp < 0x1100) return false;
    return self.norm_data.hangul_data.syllable(cp) != .none;
}
/// Normalizes `str` to NFC. Free the result with `Result.deinit`.
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfc);
}

/// Normalizes `str` to NFKC. Free the result with `Result.deinit`.
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfkc);
}
// Compose `str` to `form` (.nfc or .nfkc): decompose first, then apply
// the Canonical Composition Algorithm over the code points, marking
// composed-away positions with a tombstone until a fixpoint is reached.
fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // Quick checks: ASCII and (for NFC) Latin-1 are already composed.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
    // Decompose first.
    var dcps = if (form == .nfc)
        try self.nfxdCodePoints(allocator, str, .nfd)
    else
        try self.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);
    // Compose
    const tombstone = 0xe000; // Start of BMP Private Use Area
    // Loop over all decomposed code points.
    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;
        // For each code point, C, find the preceding
        // starter code point L, if any.
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
            const cc_C = self.norm_data.ccc_data.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;
            // Seek back to find starter L, if any.
            while (true) {
                j -= 1;
                if (dcps[j] == tombstone) continue;
                // Check for starter.
                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
                    // Check for blocking conditions between L and C.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
                        const cc_B = self.norm_data.ccc_data.ccc(B);
                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                        if (cc_B >= cc_C) continue :block_check;
                    }
                    // Found starter at j.
                    starter_index = j;
                    break;
                }
                if (j == 0) break;
            }
            // If we have a starter L, see if there's a primary
            // composite, P, for the sequence L, C. If so, we must
            // replace L with P and delete C.
            if (starter_index) |sidx| {
                const L = dcps[sidx];
                var processed_hangul = false;
                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
                if (self.isHangul(L) and self.isHangul(C)) {
                    // Get Hangul syllable types.
                    const l_stype = self.norm_data.hangul_data.syllable(L);
                    const c_stype = self.norm_data.hangul_data.syllable(C);
                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
                        dcps[sidx] = composeHangulCanon(L, C);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }
                    if (l_stype == .L and c_stype == .V) {
                        // L, V full composition. L, V, T is handled via main loop.
                        dcps[sidx] = composeHangulFull(L, C, 0);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }
                    if (processed_hangul) deleted += 1;
                }
                // If no composition has occurred yet.
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for primary composite
                    // in the Unicode Character Database.
                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C.
                        // We must check if P is not in the Full
                        // Composition Exclusions (FCX) list,
                        // preventing it from appearing in any
                        // composed form (NFC, NFKC).
                        if (!self.norm_data.normp_data.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }
        // If we had no deletions, the code point sequence
        // has been fully composed; encode and return it.
        if (deleted == 0) {
            var cstr_list = std.ArrayList(u8).init(allocator);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;
            for (dcps) |cp| {
                if (cp == tombstone) continue; // "Delete"
                const len = unicode.utf8Encode(cp, &buf) catch unreachable;
                try cstr_list.appendSlice(buf[0..len]);
            }
            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }
    }
}
// U+03D2 + U+0301 composes canonically to U+03D3.
test "nfc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer result.deinit();
    try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
}

test "nfkc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer result.deinit();
    try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
}
/// Tests for equality of `a` and `b` after normalizing both to NFC.
/// Allocates temporarily; both intermediate results are freed here.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    const norm_result_a = try self.nfc(allocator, a);
    defer norm_result_a.deinit();
    const norm_result_b = try self.nfc(allocator, b);
    defer norm_result_b.deinit();
    return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
}

test "eql" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };
    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}
/// Returns true if `str` contains only Latin-1 code points
/// (U+0000...U+00FF). Uses SIMD if possible.
/// Fix: the previous checks used `> 256` / `@splat(256)`, wrongly
/// accepting U+0100, which is outside Latin-1.
pub fn isLatin1Only(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };
    // No SIMD available: plain scalar scan.
    const vec_len = simd.suggestVectorLength(u21) orelse return blk: {
        break :blk while (cp_iter.next()) |cp| {
            if (cp.code > 0xff) break false;
        } else true;
    };
    const Vec = @Vector(vec_len, u21);
    outer: while (true) {
        var v1: Vec = undefined;
        const saved_cp_i = cp_iter.i;
        for (0..vec_len) |i| {
            if (cp_iter.next()) |cp| {
                v1[i] = cp.code;
            } else {
                // Not enough code points left to fill a vector; rewind
                // and finish with the scalar loop below.
                cp_iter.i = saved_cp_i;
                break :outer;
            }
        }
        // Latin-1's last code point is U+00FF.
        const limit: Vec = @splat(0xff);
        if (@reduce(.Or, v1 > limit)) return false;
    }
    return while (cp_iter.next()) |cp| {
        if (cp.code > 0xff) break false;
    } else true;
}
// U+00FF is the last Latin-1 code point; U+03D3 is well outside it.
test "isLatin1Only" {
    const latin1_only = "Hello, World! \u{fe} \u{ff}";
    try testing.expect(isLatin1Only(latin1_only));
    const not_latin1_only = "Héllo, World! \u{3d3}";
    try testing.expect(!isLatin1Only(not_latin1_only));
}

164
deps/zg/src/PropsData.zig vendored Normal file
View File

@@ -0,0 +1,164 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
allocator: mem.Allocator,
// Two-stage bitflag tables, one pair per source UCD file:
// core_* from DerivedCoreProperties, props_* from PropList,
// num_* from DerivedNumericType (see `init`).
core_s1: []u16 = undefined,
core_s2: []u8 = undefined,
props_s1: []u16 = undefined,
props_s2: []u8 = undefined,
num_s1: []u16 = undefined,
num_s2: []u8 = undefined,

const Self = @This();
/// Build the property tables from three compressed, `@embedFile`'d UCD
/// extracts (DerivedCoreProperties, PropList, DerivedNumericType).
/// Release with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const endian = builtin.cpu.arch.endian();
    // Process DerivedCoreProperties.txt
    const core_bytes = @embedFile("core_props");
    var core_fbs = std.io.fixedBufferStream(core_bytes);
    var core_decomp = decompressor(.raw, core_fbs.reader());
    var core_reader = core_decomp.reader();
    var self = Self{ .allocator = allocator };
    const core_stage_1_len: u16 = try core_reader.readInt(u16, endian);
    self.core_s1 = try allocator.alloc(u16, core_stage_1_len);
    errdefer allocator.free(self.core_s1);
    for (0..core_stage_1_len) |i| self.core_s1[i] = try core_reader.readInt(u16, endian);
    const core_stage_2_len: u16 = try core_reader.readInt(u16, endian);
    self.core_s2 = try allocator.alloc(u8, core_stage_2_len);
    errdefer allocator.free(self.core_s2);
    // readNoEof instead of discarding readAll's count: truncated data
    // now errors instead of leaving the table tail uninitialized.
    try core_reader.readNoEof(self.core_s2);
    // Process PropList.txt
    const props_bytes = @embedFile("props");
    var props_fbs = std.io.fixedBufferStream(props_bytes);
    var props_decomp = decompressor(.raw, props_fbs.reader());
    var props_reader = props_decomp.reader();
    const stage_1_len: u16 = try props_reader.readInt(u16, endian);
    self.props_s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.props_s1);
    for (0..stage_1_len) |i| self.props_s1[i] = try props_reader.readInt(u16, endian);
    const stage_2_len: u16 = try props_reader.readInt(u16, endian);
    self.props_s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.props_s2);
    try props_reader.readNoEof(self.props_s2);
    // Process DerivedNumericType.txt
    const num_bytes = @embedFile("numeric");
    var num_fbs = std.io.fixedBufferStream(num_bytes);
    var num_decomp = decompressor(.raw, num_fbs.reader());
    var num_reader = num_decomp.reader();
    const num_stage_1_len: u16 = try num_reader.readInt(u16, endian);
    self.num_s1 = try allocator.alloc(u16, num_stage_1_len);
    errdefer allocator.free(self.num_s1);
    for (0..num_stage_1_len) |i| self.num_s1[i] = try num_reader.readInt(u16, endian);
    const num_stage_2_len: u16 = try num_reader.readInt(u16, endian);
    self.num_s2 = try allocator.alloc(u8, num_stage_2_len);
    errdefer allocator.free(self.num_s2);
    try num_reader.readNoEof(self.num_s2);
    return self;
}
/// Free all six stage tables (reverse allocation order).
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.num_s2);
    allocator.free(self.num_s1);
    allocator.free(self.props_s2);
    allocator.free(self.props_s1);
    allocator.free(self.core_s2);
    allocator.free(self.core_s1);
}
// Stage-2 flag byte for `cp` from the DerivedCoreProperties table.
fn coreFlags(self: Self, cp: u21) u8 {
    return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)];
}

// Stage-2 flag byte for `cp` from the PropList table.
fn propsFlags(self: Self, cp: u21) u8 {
    return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)];
}

// Stage-2 flag byte for `cp` from the DerivedNumericType table.
fn numFlags(self: Self, cp: u21) u8 {
    return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)];
}

/// True if `cp` is a mathematical symbol.
pub fn isMath(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 1 != 0;
}

/// True if `cp` is an alphabetic character.
pub fn isAlphabetic(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 2 != 0;
}

/// True if `cp` is a valid identifier start character.
pub fn isIdStart(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 4 != 0;
}

/// True if `cp` is a valid identifier continuation character.
pub fn isIdContinue(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 8 != 0;
}

/// True if `cp` is a valid extended identifier start character.
pub fn isXidStart(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 16 != 0;
}

/// True if `cp` is a valid extended identifier continuation character.
pub fn isXidContinue(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 32 != 0;
}

/// True if `cp` is a whitespace character.
pub fn isWhitespace(self: Self, cp: u21) bool {
    return self.propsFlags(cp) & 1 != 0;
}

/// True if `cp` is a hexadecimal digit.
pub fn isHexDigit(self: Self, cp: u21) bool {
    return self.propsFlags(cp) & 2 != 0;
}

/// True if `cp` is a diacritic mark.
pub fn isDiacritic(self: Self, cp: u21) bool {
    return self.propsFlags(cp) & 4 != 0;
}

/// True if `cp` is numeric.
pub fn isNumeric(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 1 != 0;
}

/// True if `cp` is a digit.
pub fn isDigit(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 2 != 0;
}

/// True if `cp` is decimal.
pub fn isDecimal(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 4 != 0;
}
// Spot checks across all three tables (core, props, numeric).
test "Props" {
    const self = try init(testing.allocator);
    defer self.deinit();
    try testing.expect(self.isHexDigit('F'));
    try testing.expect(self.isHexDigit('a'));
    try testing.expect(self.isHexDigit('8'));
    try testing.expect(!self.isHexDigit('z'));
    try testing.expect(self.isDiacritic('\u{301}'));
    try testing.expect(self.isAlphabetic('A'));
    try testing.expect(!self.isAlphabetic('3'));
    try testing.expect(self.isMath('+'));
    try testing.expect(self.isNumeric('\u{277f}'));
    try testing.expect(self.isDigit('\u{2070}'));
    try testing.expect(self.isDecimal('3'));
    // ASCII digits are Numeric_Type=Decimal only, not Numeric/Digit.
    try testing.expect(!self.isNumeric('1'));
    try testing.expect(!self.isDigit('2'));
    try testing.expect(!self.isDecimal('g'));
}

228
deps/zg/src/ScriptsData.zig vendored Normal file
View File

@@ -0,0 +1,228 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
/// Unicode Script property values (one tag per script).
/// `none` (tag value 0) is a sentinel for code points with no script
/// assignment; `script()` reports those as `null` instead of `.none`.
pub const Script = enum {
    none,
    Adlam,
    Ahom,
    Anatolian_Hieroglyphs,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Bassa_Vah,
    Batak,
    Bengali,
    Bhaiksuki,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Buhid,
    Canadian_Aboriginal,
    Carian,
    Caucasian_Albanian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Common,
    Coptic,
    Cuneiform,
    Cypriot,
    Cypro_Minoan,
    Cyrillic,
    Deseret,
    Devanagari,
    Dives_Akuru,
    Dogra,
    Duployan,
    Egyptian_Hieroglyphs,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Grantha,
    Greek,
    Gujarati,
    Gunjala_Gondi,
    Gurmukhi,
    Han,
    Hangul,
    Hanifi_Rohingya,
    Hanunoo,
    Hatran,
    Hebrew,
    Hiragana,
    Imperial_Aramaic,
    Inherited,
    Inscriptional_Pahlavi,
    Inscriptional_Parthian,
    Javanese,
    Kaithi,
    Kannada,
    Katakana,
    Kawi,
    Kayah_Li,
    Kharoshthi,
    Khitan_Small_Script,
    Khmer,
    Khojki,
    Khudawadi,
    Lao,
    Latin,
    Lepcha,
    Limbu,
    Linear_A,
    Linear_B,
    Lisu,
    Lycian,
    Lydian,
    Mahajani,
    Makasar,
    Malayalam,
    Mandaic,
    Manichaean,
    Marchen,
    Masaram_Gondi,
    Medefaidrin,
    Meetei_Mayek,
    Mende_Kikakui,
    Meroitic_Cursive,
    Meroitic_Hieroglyphs,
    Miao,
    Modi,
    Mongolian,
    Mro,
    Multani,
    Myanmar,
    Nabataean,
    Nag_Mundari,
    Nandinagari,
    New_Tai_Lue,
    Newa,
    Nko,
    Nushu,
    Nyiakeng_Puachue_Hmong,
    Ogham,
    Ol_Chiki,
    Old_Hungarian,
    Old_Italic,
    Old_North_Arabian,
    Old_Permic,
    Old_Persian,
    Old_Sogdian,
    Old_South_Arabian,
    Old_Turkic,
    Old_Uyghur,
    Oriya,
    Osage,
    Osmanya,
    Pahawh_Hmong,
    Palmyrene,
    Pau_Cin_Hau,
    Phags_Pa,
    Phoenician,
    Psalter_Pahlavi,
    Rejang,
    Runic,
    Samaritan,
    Saurashtra,
    Sharada,
    Shavian,
    Siddham,
    SignWriting,
    Sinhala,
    Sogdian,
    Sora_Sompeng,
    Soyombo,
    Sundanese,
    Syloti_Nagri,
    Syriac,
    Tagalog,
    Tagbanwa,
    Tai_Le,
    Tai_Tham,
    Tai_Viet,
    Takri,
    Tamil,
    Tangsa,
    Tangut,
    Telugu,
    Thaana,
    Thai,
    Tibetan,
    Tifinagh,
    Tirhuta,
    Toto,
    Ugaritic,
    Vai,
    Vithkuqi,
    Wancho,
    Warang_Citi,
    Yezidi,
    Yi,
    Zanabazar_Square,
};
allocator: mem.Allocator,
// Three-stage lookup tables decoded from the embedded `scripts` resource:
// s1 maps the high byte of a code point to a block base, s2 maps the block
// plus low byte to an index into s3, and s3 holds the Script tag values.
// All three are allocated by `init` and freed by `deinit`.
s1: []u16 = undefined,
s2: []u8 = undefined,
s3: []u8 = undefined,

const Self = @This();
/// Decompresses the embedded `scripts` resource and builds the three-stage
/// script lookup tables. Caller must call `deinit` to free them.
/// Returns an error on allocation failure or on a truncated/corrupt resource.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("scripts");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u8, s2_len);
    errdefer allocator.free(self.s2);
    // readAll may legally return fewer bytes than requested; a short read
    // here means the resource is truncated and the table would be left
    // partially undefined, so fail loudly instead.
    if (try reader.readAll(self.s2) != s2_len) return error.EndOfStream;

    // The stage-3 length is stored as a single byte in the resource format.
    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    if (try reader.readAll(self.s3) != s3_len) return error.EndOfStream;

    return self;
}
/// Frees the three lookup tables. The struct must not be used afterwards.
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.s1);
    allocator.free(self.s2);
    allocator.free(self.s3);
}
/// Lookup the Script type for `cp`.
/// Returns null for code points that have no script assignment.
pub fn script(self: Self, cp: u21) ?Script {
    // Three-stage walk: high byte -> block base, plus low byte -> s3 index.
    const idx = self.s2[self.s1[cp >> 8] + (cp & 0xff)];
    const tag = self.s3[idx];
    return if (tag == 0) null else @enumFromInt(tag);
}
test "script" {
    const self = try init(std.testing.allocator);
    defer self.deinit();

    // 'A' (U+0041) belongs to the Latin script.
    try testing.expectEqual(Script.Latin, self.script('A').?);
}

84
deps/zg/src/WidthData.zig vendored Normal file
View File

@@ -0,0 +1,84 @@
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;
const GraphemeData = @import("GraphemeData");
allocator: mem.Allocator,
// Grapheme-break property data; owned by this struct (see init/deinit).
g_data: GraphemeData,
// Two-stage width lookup tables decoded from the embedded `dwp` resource:
// s1 maps the high byte of a code point to a block base; s2 holds the
// signed cell widths. Allocated by `init`, freed by `deinit`.
s1: []u16 = undefined,
s2: []i3 = undefined,

const Self = @This();
/// Decompresses the embedded `dwp` resource into the two-stage width
/// tables and initializes the owned grapheme data.
/// Caller must call `deinit` to release everything.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("dwp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .g_data = try GraphemeData.init(allocator),
    };
    errdefer self.g_data.deinit();

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (self.s1) |*entry| entry.* = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(i3, s2_len);
    errdefer allocator.free(self.s2);
    // Widths are stored as full bytes in the resource; narrow each to i3.
    for (self.s2) |*entry| entry.* = @intCast(try reader.readInt(i8, endian));

    return self;
}
/// Releases the owned grapheme data and the width lookup tables.
pub fn deinit(self: *const Self) void {
    self.g_data.deinit();
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
}
/// codePointWidth returns the number of cells `cp` requires when rendered
/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
/// control codes return 0. Ambiguous-width code points return whatever value
/// was baked into the embedded table (there is no `cjk` parameter on this
/// lookup; the test below expects 1, i.e. the non-CJK interpretation).
pub fn codePointWidth(self: Self, cp: u21) i3 {
    // Two-stage table lookup keyed on high/low byte of the code point.
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
}
test "codePointWidth" {
    // `codePointWidth` is a method on this file-struct: it needs the
    // decoded tables in `self`. The previous version called it as a free
    // function without initializing the data, which cannot compile.
    const self = try init(testing.allocator);
    defer self.deinit();

    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0000)); // null
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x8)); // \b
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x7f)); // DEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0005)); // C0 control (ENQ)
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0007)); // \a BEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000A)); // \n LF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000B)); // \v VT
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000C)); // \f FF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000D)); // \r CR
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000E)); // SO
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000F)); // SI
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x070F)); // Cf
    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x0603)); // Cf Arabic
    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00AD)); // soft-hyphen
    try testing.expectEqual(@as(i3, 2), self.codePointWidth(0x2E3A)); // two-em dash
    try testing.expectEqual(@as(i3, 3), self.codePointWidth(0x2E3B)); // three-em dash
    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00BD)); // ambiguous halfwidth
    try testing.expectEqual(@as(i3, 1), self.codePointWidth('é'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('😊'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('统'));
}

33
deps/zg/src/ascii.zig vendored Normal file
View File

@@ -0,0 +1,33 @@
const std = @import("std");
const simd = std.simd;
const testing = std.testing;
/// Returns true if `str` only contains ASCII bytes (0x00–0x7F).
/// Uses SIMD when the target suggests a vector length; otherwise falls back
/// to a plain scalar scan. The empty string is ASCII-only.
pub fn isAsciiOnly(str: []const u8) bool {
    // No vector support on this target: scalar scan of the whole string.
    const vec_len = simd.suggestVectorLength(u8) orelse return for (str) |b| {
        if (b > 127) break false;
    } else true;

    const Vec = @Vector(vec_len, u8);
    var remaining = str;

    // Vector pass over full-width chunks. The previous version looped
    // `while (true)` with an unreachable trailing `return true;`; this
    // bounded loop plus scalar tail is equivalent and has no dead code.
    while (remaining.len >= vec_len) {
        const chunk: Vec = remaining[0..vec_len].*;
        const limit: Vec = @splat(127);
        if (@reduce(.Or, chunk > limit)) return false;
        remaining = remaining[vec_len..];
    }

    // Scalar pass over the sub-vector-length tail.
    for (remaining) |b| {
        if (b > 127) return false;
    }
    return true;
}
test "isAsciiOnly" {
    // Pure-ASCII input is accepted.
    try testing.expect(isAsciiOnly("Hello, World! 0123456789 !@#$%^&*()_-=+"));
    // A single multi-byte character ('é') makes the scan fail.
    try testing.expect(!isAsciiOnly("Héllo, World! 0123456789 !@#$%^&*()_-=+"));
}

118
deps/zg/src/code_point.zig vendored Normal file
View File

@@ -0,0 +1,118 @@
const std = @import("std");
/// `CodePoint` represents a Unicode code point by its code,
/// length, and offset in the source bytes.
pub const CodePoint = struct {
    // The decoded scalar value.
    code: u21,
    // Number of bytes this code point occupies in the source (1-4).
    len: u3,
    // Byte offset of the first byte within the source string.
    offset: u32,
};
/// Given a small slice of a string, decode the corresponding codepoint.
/// `bytes` must begin at a code point boundary; `offset` is copied into the
/// result unchanged. Returns null only for empty input. An invalid lead byte
/// or a sequence truncated by the end of `bytes` yields U+FFFD (replacement
/// character) with len 1. Continuation bytes are masked, not validated, so
/// the input is assumed to be otherwise valid UTF-8.
pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
    // EOS fast path
    if (bytes.len == 0) {
        return null;
    }

    // ASCII fast path
    if (bytes[0] < 128) {
        return .{
            .code = bytes[0],
            .len = 1,
            .offset = offset,
        };
    }

    // Determine the sequence length from the lead byte.
    var cp = CodePoint{
        .code = undefined,
        .len = switch (bytes[0]) {
            0b1100_0000...0b1101_1111 => 2,
            0b1110_0000...0b1110_1111 => 3,
            0b1111_0000...0b1111_0111 => 4,
            else => {
                // Not a valid UTF-8 lead byte:
                // unicode replacement code point.
                return .{
                    .code = 0xfffd,
                    .len = 1,
                    .offset = offset,
                };
            },
        },
        .offset = offset,
    };

    // Return replacement if we don't have a complete codepoint remaining.
    // Consumes only one byte.
    if (cp.len > bytes.len) {
        // Unicode replacement code point.
        return .{
            .code = 0xfffd,
            .len = 1,
            .offset = offset,
        };
    }

    // Accumulate the payload bits: the lead byte contributes its low bits,
    // each continuation byte contributes its low 6 bits.
    const cp_bytes = bytes[0..cp.len];
    cp.code = switch (cp.len) {
        2 => (@as(u21, (cp_bytes[0] & 0b00011111)) << 6) | (cp_bytes[1] & 0b00111111),
        3 => (((@as(u21, (cp_bytes[0] & 0b00001111)) << 6) |
            (cp_bytes[1] & 0b00111111)) << 6) |
            (cp_bytes[2] & 0b00111111),
        4 => (((((@as(u21, (cp_bytes[0] & 0b00000111)) << 6) |
            (cp_bytes[1] & 0b00111111)) << 6) |
            (cp_bytes[2] & 0b00111111)) << 6) |
            (cp_bytes[3] & 0b00111111),
        else => @panic("CodePointIterator.next invalid code point length."),
    };
    return cp;
}
/// `Iterator` iterates a string one `CodePoint` at-a-time.
pub const Iterator = struct {
    bytes: []const u8,
    i: u32 = 0,

    /// Decodes the code point at the current position and advances past it.
    /// Returns null at end of input.
    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;
        const cp = decode(self.bytes[self.i..], self.i) orelse return null;
        self.i += cp.len;
        return cp;
    }

    /// Returns the next code point without consuming it.
    pub fn peek(self: *Iterator) ?CodePoint {
        // Run `next` on a throwaway copy so our own position is untouched.
        var lookahead = self.*;
        return lookahead.next();
    }
};
test "decode" {
    const bytes = "🌩️";

    // The first code point of the sequence is U+1F329 CLOUD WITH LIGHTNING,
    // a four-byte UTF-8 sequence.
    const res = decode(bytes, 0);
    try std.testing.expect(res != null);
    try std.testing.expectEqual(@as(u21, 0x1F329), res.?.code);
    try std.testing.expectEqual(4, res.?.len);
}
test "peek" {
    var iter = Iterator{ .bytes = "Hi" };

    try std.testing.expectEqual(@as(u21, 'H'), iter.next().?.code);
    // peek returns the upcoming code point without advancing...
    try std.testing.expectEqual(@as(u21, 'i'), iter.peek().?.code);
    // ...so next still yields it.
    try std.testing.expectEqual(@as(u21, 'i'), iter.next().?.code);
    // At end of input both peek and next return null.
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}

258
deps/zg/src/grapheme.zig vendored Normal file
View File

@@ -0,0 +1,258 @@
const std = @import("std");
const mem = std.mem;
const unicode = std.unicode;
const CodePoint = @import("code_point").CodePoint;
const CodePointIterator = @import("code_point").Iterator;
pub const GraphemeData = @import("GraphemeData");
/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
pub const Grapheme = struct {
    len: u8,
    offset: u32,

    /// `bytes` returns the slice of bytes that correspond to
    /// this grapheme cluster in `src`.
    pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
        const start = self.offset;
        return src[start .. start + self.len];
    }
};
/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
pub const Iterator = struct {
    // Two-code-point lookahead window: buf[0] is the current code point,
    // buf[1] is the one after it (null once input is exhausted).
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointIterator,
    data: *const GraphemeData,

    const Self = @This();

    /// Assumes `src` is valid UTF-8.
    pub fn init(str: []const u8, data: *const GraphemeData) Self {
        var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
        // Prime the window so the first `next` call sees the first code
        // point in buf[0] after its own advance.
        self.advance();
        return self;
    }

    // Shifts the lookahead window one code point forward.
    fn advance(self: *Self) void {
        self.buf[0] = self.buf[1];
        self.buf[1] = self.cp_iter.next();
    }

    /// Returns the next grapheme cluster, or null at end of input.
    pub fn next(self: *Self) ?Grapheme {
        self.advance();

        // If no more
        if (self.buf[0] == null) return null;

        // If last one
        if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };

        // If ASCII — fast path; CR is excluded because it may still join
        // with a following LF.
        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
            return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
        }

        const gc_start = self.buf[0].?.offset;
        var gc_len: u8 = self.buf[0].?.len;
        var state = State{};

        // Single-code-point cluster: break required immediately.
        if (graphemeBreak(
            self.buf[0].?.code,
            self.buf[1].?.code,
            self.data,
            &state,
        )) return Grapheme{ .len = gc_len, .offset = gc_start };

        // Accumulate code points until the next required break (or EOS).
        while (true) {
            self.advance();
            if (self.buf[0] == null) break;

            gc_len += self.buf[0].?.len;

            if (graphemeBreak(
                self.buf[0].?.code,
                if (self.buf[1]) |ncp| ncp.code else 0,
                self.data,
                &state,
            )) break;
        }

        return Grapheme{ .len = gc_len, .offset = gc_start };
    }
};
// Predicates
/// True if `cp` always forces a grapheme break: CR, LF, or any code point
/// with the Control grapheme-break property.
fn isBreaker(cp: u21, data: *const GraphemeData) bool {
    // Look up the property once up front, as the original did.
    const gbp_prop = data.gbp(cp);
    return cp == '\r' or cp == '\n' or gbp_prop == .Control;
}
// Grapheme break state: three independent flags packed into one 3-bit field,
// carried across successive `graphemeBreak` calls.
pub const State = struct {
    bits: u3 = 0,

    // Bit assignments for the tracked conditions.
    const xpic_mask: u3 = 1; // Extended Pictographic (emoji)
    const regional_mask: u3 = 2; // Regional Indicator (flags)
    const indic_mask: u3 = 4; // Indic Conjunct Break

    // Extended Pictographic (emoji)
    fn hasXpic(self: State) bool {
        return self.bits & xpic_mask != 0;
    }
    fn setXpic(self: *State) void {
        self.bits |= xpic_mask;
    }
    fn unsetXpic(self: *State) void {
        // XOR toggles the bit; callers only unset a bit they know is set.
        self.bits ^= xpic_mask;
    }

    // Regional Indicator (flags)
    fn hasRegional(self: State) bool {
        return self.bits & regional_mask != 0;
    }
    fn setRegional(self: *State) void {
        self.bits |= regional_mask;
    }
    fn unsetRegional(self: *State) void {
        self.bits ^= regional_mask;
    }

    // Indic Conjunct
    fn hasIndic(self: State) bool {
        return self.bits & indic_mask != 0;
    }
    fn setIndic(self: *State) void {
        self.bits |= indic_mask;
    }
    fn unsetIndic(self: *State) void {
        self.bits ^= indic_mask;
    }
};
/// `graphemeBreak` returns true only if a grapheme break point is required
/// between `cp1` and `cp2`. `state` should start out as 0. If calling
/// iteratively over a sequence of code points, this function must be called
/// IN ORDER on ALL potential breaks in a string.
/// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`.
/// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617
pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    data: *const GraphemeData,
    state: *State,
) bool {
    // Extract relevant properties.
    const cp1_gbp_prop = data.gbp(cp1);
    const cp1_indic_prop = data.indic(cp1);
    const cp1_is_emoji = data.isEmoji(cp1);

    const cp2_gbp_prop = data.gbp(cp2);
    const cp2_indic_prop = data.indic(cp2);
    const cp2_is_emoji = data.isEmoji(cp2);

    // GB11: arm the emoji state when an Extended Pictographic is seen.
    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
    // GB9c: arm the Indic state when a Consonant is seen.
    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();

    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;

    // GB4: break after Control, CR, or LF.
    if (isBreaker(cp1, data)) return true;

    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.hasXpic() and
        cp1_gbp_prop == .ZWJ and
        cp2_is_emoji)
    {
        state.unsetXpic();
        return false;
    }

    // GB9: x (Extend | ZWJ)
    if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false;

    // GB9a: x SpacingMark
    if (cp2_gbp_prop == .SpacingMark) return false;

    // GB9b: Prepend x
    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;

    // GB12, GB13: RI x RI — pair regional indicators, breaking after each
    // completed pair so flags render two-at-a-time.
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
        if (state.hasRegional()) {
            state.unsetRegional();
            return true;
        } else {
            state.setRegional();
            return false;
        }
    }

    // GB6: Hangul L x (L|V|LV|LVT)
    if (cp1_gbp_prop == .L) {
        if (cp2_gbp_prop == .L or
            cp2_gbp_prop == .V or
            cp2_gbp_prop == .LV or
            cp2_gbp_prop == .LVT) return false;
    }

    // GB7: Hangul (LV | V) x (V | T)
    if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
        if (cp2_gbp_prop == .V or
            cp2_gbp_prop == .T) return false;
    }

    // GB8: Hangul (LVT | T) x T
    if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
        if (cp2_gbp_prop == .T) return false;
    }

    // GB9c: Indic Conjunct Break — Consonant x (Extend | Linker)
    if (state.hasIndic() and
        cp1_indic_prop == .Consonant and
        (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
    {
        return false;
    }

    // GB9c: Extend x Linker inside a conjunct sequence.
    if (state.hasIndic() and
        cp1_indic_prop == .Extend and
        cp2_indic_prop == .Linker)
    {
        return false;
    }

    // GB9c: (Linker | ZWJ) x Consonant completes the conjunct; reset state.
    if (state.hasIndic() and
        (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
        cp2_indic_prop == .Consonant)
    {
        state.unsetIndic();
        return false;
    }

    // GB999: otherwise, break everywhere.
    return true;
}
test "Segmentation ZWJ and ZWSP emoji sequences" {
    // Bear + ZWJ + snowflake + variation selector (an emoji ZWJ sequence).
    const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
    const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
    const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2;
    const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
    const no_joiner = seq_1 ++ seq_2;

    const data = try GraphemeData.init(std.testing.allocator);
    defer data.deinit();

    // A ZWJ between the sequences joins everything into one cluster.
    var iter = Iterator.init(with_zwj, &data);
    var i: usize = 0;
    while (iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 1), i);

    // A ZWSP does not join: two emoji clusters plus the space itself.
    iter = Iterator.init(with_zwsp, &data);
    i = 0;
    while (iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 3), i);

    // No joiner at all: the two sequences remain separate clusters.
    iter = Iterator.init(no_joiner, &data);
    i = 0;
    while (iter.next()) |_| : (i += 1) {}
    try std.testing.expectEqual(@as(usize, 2), i);
}

195
deps/zg/src/unicode_tests.zig vendored Normal file
View File

@@ -0,0 +1,195 @@
const std = @import("std");
const fmt = std.fmt;
const fs = std.fs;
const io = std.io;
const heap = std.heap;
const mem = std.mem;
const testing = std.testing;
const unicode = std.unicode;
const Grapheme = @import("grapheme").Grapheme;
const GraphemeData = @import("grapheme").GraphemeData;
const GraphemeIterator = @import("grapheme").Iterator;
const Normalize = @import("Normalize");
/// Decodes one NormalizationTest.txt field of space-separated hex code
/// points (e.g. "0044 0307") into UTF-8 bytes.
/// Caller owns the returned slice.
fn hexFieldToUtf8(allocator: mem.Allocator, field: []const u8) ![]u8 {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();
    var cp_buf: [4]u8 = undefined;

    var cps = mem.split(u8, field, " ");
    while (cps.next()) |s| {
        const cp = try fmt.parseInt(u21, s, 16);
        const len = try unicode.utf8Encode(cp, &cp_buf);
        try buf.appendSlice(cp_buf[0..len]);
    }
    return try buf.toOwnedSlice();
}

test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var norm_data: Normalize.NormData = undefined;
    try Normalize.NormData.init(&norm_data, allocator);
    const n = Normalize{ .norm_data = &norm_data };

    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip comments or empty lines.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // Field layout per line: source;NFC;NFD;NFKC;NFKD;...
        var fields = mem.split(u8, line, ";");
        var field_index: usize = 0;
        var input: []u8 = undefined;
        defer allocator.free(input);

        while (fields.next()) |field| : (field_index += 1) {
            switch (field_index) {
                // Source sequence; normalized forms below are checked
                // against this input.
                0 => input = try hexFieldToUtf8(allocator, field),
                1 => {
                    // NFC
                    const want = try hexFieldToUtf8(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                2 => {
                    // NFD
                    const want = try hexFieldToUtf8(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                3 => {
                    // NFKC
                    const want = try hexFieldToUtf8(allocator, field);
                    defer allocator.free(want);
                    var got = try n.nfkc(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                4 => {
                    // NFKD
                    const want = try hexFieldToUtf8(allocator, field);
                    defer allocator.free(want);
                    const got = try n.nfkd(allocator, input);
                    defer got.deinit();
                    try testing.expectEqualStrings(want, got.slice);
                },
                // Remaining fields are not tested.
                else => {},
            }
        }
    }
}
test "Segmentation GraphemeIterator" {
    const allocator = std.testing.allocator;
    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();

    const data = try GraphemeData.init(allocator);
    defer data.deinit();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip comments or empty lines.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // Clean up: strip the leading break marker and the trailing comment.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }

        // Iterate over fields: each "÷"-separated field is one expected
        // grapheme cluster, itself a list of "×"-joined hex code points.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var graphemes = std.mem.split(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = std.mem.split(u8, field, " ");
            var cp_buf: [4]u8 = undefined;
            var cp_index: u32 = 0;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" markers separate code points inside one cluster.
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                cp_index += len;
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            bytes_index += cp_index;
        }

        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
        var iter = GraphemeIterator.init(all_bytes.items, &data);

        // Check: each produced cluster must match the expected byte span.
        for (want.items) |want_gc| {
            const got_gc = (iter.next()).?;
            try std.testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            );
        }
    }
}