init
I will never get tired of vendoring dependencies. ha ha. It is possible I am insane. I had to do a lot of pruning to get these not to be ridiculous (especially the unicode data, which had nearly 1 million lines of... stuff).
This commit is contained in:
66
deps/zg/src/CanonData.zig
vendored
Normal file
66
deps/zg/src/CanonData.zig
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
|
||||
allocator: mem.Allocator,
|
||||
nfc: std.AutoHashMap([2]u21, u21),
|
||||
nfd: [][]u21 = undefined,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Loads the canonical composition/decomposition tables from the
/// compressed `canon` data file embedded at build time.
/// Caller must release resources with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("canon");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{
        .allocator = allocator,
        .nfc = std.AutoHashMap([2]u21, u21).init(allocator),
        // One slot per possible code point (0x0 through 0x10FFFF).
        .nfd = try allocator.alloc([]u21, 0x110000),
    };

    // Track how many per-code-point slices have been allocated so the
    // errdefer frees only slices that were actually created.
    var slices: usize = 0;
    errdefer {
        self.nfc.deinit();
        for (self.nfd[0..slices]) |slice| self.allocator.free(slice);
        self.allocator.free(self.nfd);
    }

    // Code points without a decomposition map to an empty slice.
    @memset(self.nfd, &.{});

    // Record format: u8 record length (0 terminates the stream), u24 code
    // point, then (len - 1) u24 decomposition code points.
    while (true) {
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;
        const cp = try reader.readInt(u24, endian);
        self.nfd[cp] = try allocator.alloc(u21, len - 1);
        slices += 1;
        for (0..len - 1) |i| {
            self.nfd[cp][i] = @intCast(try reader.readInt(u24, endian));
        }
        // A two-code-point decomposition also defines the primary composite
        // used for composition (the reverse NFC mapping).
        if (len == 3) {
            try self.nfc.put(self.nfd[cp][0..2].*, @intCast(cp));
        }
    }

    return self;
}
|
||||
|
||||
/// Frees the composition map and all decomposition slices.
pub fn deinit(self: *Self) void {
    self.nfc.deinit();
    for (self.nfd) |decomp| {
        self.allocator.free(decomp);
    }
    self.allocator.free(self.nfd);
}
|
||||
|
||||
/// Returns canonical decomposition for `cp`.
/// An empty slice means `cp` has no canonical decomposition.
pub fn toNfd(self: Self, cp: u21) []const u21 {
    return self.nfd[cp];
}

/// Returns the primary composite for the codepoints in `cps`, or null
/// when the pair does not canonically compose.
pub fn toNfc(self: Self, cps: [2]u21) ?u21 {
    return self.nfc.get(cps);
}
|
||||
202
deps/zg/src/CaseData.zig
vendored
Normal file
202
deps/zg/src/CaseData.zig
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
const unicode = std.unicode;
|
||||
|
||||
const CodePointIterator = @import("code_point").Iterator;
|
||||
|
||||
allocator: mem.Allocator,
|
||||
case_map: [][2]u21,
|
||||
prop_s1: []u16 = undefined,
|
||||
prop_s2: []u8 = undefined,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Loads the case mapping and case property tables from three compressed
/// data files embedded at build time (`upper`, `lower`, `case_prop`).
/// Caller must release resources with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        // One [upper, lower] pair per possible code point.
        .case_map = try allocator.alloc([2]u21, 0x110000),
    };
    errdefer allocator.free(self.case_map);

    // Default: every code point maps to itself for both cases.
    for (0..0x110000) |i| {
        const cp: u21 = @intCast(i);
        self.case_map[cp] = .{ cp, cp };
    }

    // Uppercase
    // Record format: i24 code point (0 terminates), i24 signed delta to
    // the uppercase mapping.
    const upper_bytes = @embedFile("upper");
    var upper_fbs = std.io.fixedBufferStream(upper_bytes);
    var upper_decomp = decompressor(.raw, upper_fbs.reader());
    var upper_reader = upper_decomp.reader();

    while (true) {
        const cp = try upper_reader.readInt(i24, endian);
        if (cp == 0) break;
        const diff = try upper_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][0] = @intCast(cp + diff);
    }

    // Lowercase
    // Same record format as the uppercase table.
    const lower_bytes = @embedFile("lower");
    var lower_fbs = std.io.fixedBufferStream(lower_bytes);
    var lower_decomp = decompressor(.raw, lower_fbs.reader());
    var lower_reader = lower_decomp.reader();

    while (true) {
        const cp = try lower_reader.readInt(i24, endian);
        if (cp == 0) break;
        const diff = try lower_reader.readInt(i24, endian);
        self.case_map[@intCast(cp)][1] = @intCast(cp + diff);
    }

    // Case properties
    // Two-stage lookup table: u16 length + u16 entries (stage 1), then
    // u16 length + raw bytes (stage 2).
    const cp_bytes = @embedFile("case_prop");
    var cp_fbs = std.io.fixedBufferStream(cp_bytes);
    var cp_decomp = decompressor(.raw, cp_fbs.reader());
    var cp_reader = cp_decomp.reader();

    const stage_1_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.prop_s1);
    for (0..stage_1_len) |i| self.prop_s1[i] = try cp_reader.readInt(u16, endian);

    const stage_2_len: u16 = try cp_reader.readInt(u16, endian);
    self.prop_s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.prop_s2);
    // NOTE(review): readAll can return fewer bytes than requested on a
    // truncated stream; the short-read count is discarded here — confirm
    // the embedded data is always complete.
    _ = try cp_reader.readAll(self.prop_s2);

    return self;
}
|
||||
|
||||
/// Frees the case map and both property-table stages.
pub fn deinit(self: *const Self) void {
    const allocator = self.allocator;
    allocator.free(self.case_map);
    allocator.free(self.prop_s1);
    allocator.free(self.prop_s2);
}
|
||||
|
||||
/// Returns true if `cp` is either upper, lower, or title case.
pub fn isCased(self: Self, cp: u21) bool {
    // Two-stage table lookup; bit 2 flags "cased".
    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
}

/// Returns true if `cp` is uppercase.
pub fn isUpper(self: Self, cp: u21) bool {
    // Two-stage table lookup; bit 1 flags uppercase.
    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 2 == 2;
}
|
||||
|
||||
/// Returns true if `str` is all uppercase.
/// Uncased code points (digits, punctuation, ...) are ignored.
pub fn isUpperStr(self: Self, str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };

    while (cp_iter.next()) |cp| {
        const code = cp.code;
        if (!self.isCased(code)) continue;
        if (!self.isUpper(code)) return false;
    }

    return true;
}
|
||||
|
||||
// Exercises init/deinit plus the cased/uppercase property checks.
test "isUpperStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    try testing.expect(cd.isUpperStr("HELLO, WORLD 2112!"));
    try testing.expect(!cd.isUpperStr("hello, world 2112!"));
    try testing.expect(!cd.isUpperStr("Hello, World 2112!"));
}
|
||||
|
||||
/// Returns uppercase mapping for `cp`.
/// Code points without an uppercase mapping map to themselves.
pub fn toUpper(self: Self, cp: u21) u21 {
    return self.case_map[cp][0];
}

/// Returns a new string with all letters in uppercase.
/// Caller must free returned bytes with `allocator`.
pub fn toUpperStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    defer bytes.deinit();

    var iter = CodePointIterator{ .bytes = str };
    // UTF-8 encodes to at most 4 bytes per code point.
    var buf: [4]u8 = undefined;

    while (iter.next()) |cp| {
        const len = try unicode.utf8Encode(self.toUpper(cp.code), &buf);
        try bytes.appendSlice(buf[0..len]);
    }

    return try bytes.toOwnedSlice();
}

// Exercises the full upper-casing pipeline including UTF-8 re-encoding.
test "toUpperStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    const uppered = try cd.toUpperStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(uppered);
    try testing.expectEqualStrings("HELLO, WORLD 2112!", uppered);
}
|
||||
|
||||
/// Returns true if `cp` is lowercase.
pub fn isLower(self: Self, cp: u21) bool {
    // Two-stage table lookup; bit 0 flags lowercase.
    return self.prop_s2[self.prop_s1[cp >> 8] + (cp & 0xff)] & 1 == 1;
}

/// Returns true if `str` is all lowercase.
/// Uncased code points (digits, punctuation, ...) are ignored.
pub fn isLowerStr(self: Self, str: []const u8) bool {
    var iter = CodePointIterator{ .bytes = str };

    return while (iter.next()) |cp| {
        if (self.isCased(cp.code) and !self.isLower(cp.code)) break false;
    } else true;
}

// Exercises init/deinit plus the cased/lowercase property checks.
test "isLowerStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    try testing.expect(cd.isLowerStr("hello, world 2112!"));
    try testing.expect(!cd.isLowerStr("HELLO, WORLD 2112!"));
    try testing.expect(!cd.isLowerStr("Hello, World 2112!"));
}
|
||||
|
||||
/// Returns lowercase mapping for `cp`.
/// Code points without a lowercase mapping map to themselves.
pub fn toLower(self: Self, cp: u21) u21 {
    return self.case_map[cp][1];
}

/// Returns a new string with all letters in lowercase.
/// Caller must free returned bytes with `allocator`.
pub fn toLowerStr(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
) ![]u8 {
    var bytes = std.ArrayList(u8).init(allocator);
    defer bytes.deinit();

    var iter = CodePointIterator{ .bytes = str };
    // UTF-8 encodes to at most 4 bytes per code point.
    var buf: [4]u8 = undefined;

    while (iter.next()) |cp| {
        const len = try unicode.utf8Encode(self.toLower(cp.code), &buf);
        try bytes.appendSlice(buf[0..len]);
    }

    return try bytes.toOwnedSlice();
}

// Exercises the full lower-casing pipeline including UTF-8 re-encoding.
test "toLowerStr" {
    const cd = try init(testing.allocator);
    defer cd.deinit();

    const lowered = try cd.toLowerStr(testing.allocator, "Hello, World 2112!");
    defer testing.allocator.free(lowered);
    try testing.expectEqualStrings("hello, world 2112!", lowered);
}
|
||||
189
deps/zg/src/CaseFold.zig
vendored
Normal file
189
deps/zg/src/CaseFold.zig
vendored
Normal file
@@ -0,0 +1,189 @@
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
const ascii = @import("ascii");
|
||||
pub const FoldData = @import("FoldData");
|
||||
const Normalize = @import("Normalize");
|
||||
|
||||
fold_data: *const FoldData,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Produces the case folded code points for `cps`. Caller must free returned
/// slice with `allocator`.
pub fn caseFold(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) ![]const u21 {
    var result = std.ArrayList(u21).init(allocator);
    defer result.deinit();
    // A single code point folds to at most three code points.
    var fold_buf: [3]u21 = undefined;

    for (cps) |cp| {
        const folded = self.fold_data.caseFold(cp, &fold_buf);

        if (folded.len != 0) {
            try result.appendSlice(folded);
        } else {
            // No fold mapping: the code point folds to itself.
            try result.append(cp);
        }
    }

    return try result.toOwnedSlice();
}
|
||||
|
||||
/// True if case folding any code point in `cps` would change it.
fn changesWhenCaseFolded(self: Self, cps: []const u21) bool {
    for (cps) |cp| {
        if (self.fold_data.changesWhenCaseFolded(cp)) return true;
    }
    return false;
}
|
||||
|
||||
/// Caseless compare `a` and `b` by decomposing to NFKD. This is the most
/// comprehensive comparison possible, but slower than `canonCaselessMatch`.
pub fn compatCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalize,
    a: []const u8,
    b: []const u8,
) !bool {
    // Pure-ASCII inputs can be compared directly without normalization.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    // Each string is reduced to NFKD(CF(NFKD(CF(NFD(x))))) and the
    // resulting code point sequences are compared.

    // Process a
    const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);

    // Only allocate a case-folded copy when folding would actually change
    // the string; otherwise alias nfd_a and skip the free.
    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
    if (self.changesWhenCaseFolded(nfd_a)) {
        cf_nfd_a = try self.caseFold(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);

    const nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfd_a);
    defer allocator.free(nfkd_cf_nfd_a);
    const cf_nfkd_cf_nfd_a = try self.caseFold(allocator, nfkd_cf_nfd_a);
    defer allocator.free(cf_nfkd_cf_nfd_a);
    const nfkd_cf_nfkd_cf_nfd_a = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_a);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_a);

    // Process b (same pipeline as a)
    const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);

    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
    if (self.changesWhenCaseFolded(nfd_b)) {
        cf_nfd_b = try self.caseFold(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);

    const nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfd_b);
    defer allocator.free(nfkd_cf_nfd_b);
    const cf_nfkd_cf_nfd_b = try self.caseFold(allocator, nfkd_cf_nfd_b);
    defer allocator.free(cf_nfkd_cf_nfd_b);
    const nfkd_cf_nfkd_cf_nfd_b = try normalizer.nfkdCodePoints(allocator, cf_nfkd_cf_nfd_b);
    defer allocator.free(nfkd_cf_nfkd_cf_nfd_b);

    return mem.eql(u21, nfkd_cf_nfkd_cf_nfd_a, nfkd_cf_nfkd_cf_nfd_b);
}
|
||||
|
||||
// Verifies the compatibility (NFKD) caseless match: ASCII fast path,
// canonically-equivalent strings, and compatibility-equivalent strings.
test "compatCaselessMatch" {
    const allocator = testing.allocator;

    const norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    const fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    try testing.expect(try caser.compatCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));

    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.compatCaselessMatch(allocator, &n, a, c));
}
|
||||
|
||||
/// Performs canonical caseless string matching by decomposing to NFD. This is
/// faster than `compatCaselessMatch`, but less comprehensive.
pub fn canonCaselessMatch(
    self: Self,
    allocator: mem.Allocator,
    normalizer: *const Normalize,
    a: []const u8,
    b: []const u8,
) !bool {
    // Pure-ASCII inputs can be compared directly without normalization.
    if (ascii.isAsciiOnly(a) and ascii.isAsciiOnly(b)) return std.ascii.eqlIgnoreCase(a, b);

    // Each string is reduced to NFD(CF(NFD(x))) and the resulting code
    // point sequences are compared.

    // Process a
    const nfd_a = try normalizer.nfxdCodePoints(allocator, a, .nfd);
    defer allocator.free(nfd_a);

    // Only allocate a case-folded copy when folding would change the
    // string; otherwise alias nfd_a and skip the free.
    var need_free_cf_nfd_a = false;
    var cf_nfd_a: []const u21 = nfd_a;
    if (self.changesWhenCaseFolded(nfd_a)) {
        cf_nfd_a = try self.caseFold(allocator, nfd_a);
        need_free_cf_nfd_a = true;
    }
    defer if (need_free_cf_nfd_a) allocator.free(cf_nfd_a);

    // If folding changed nothing, cf_nfd_a is already in NFD and the
    // second normalization pass can be skipped entirely.
    var need_free_nfd_cf_nfd_a = false;
    var nfd_cf_nfd_a = cf_nfd_a;
    if (!need_free_cf_nfd_a) {
        nfd_cf_nfd_a = try normalizer.nfdCodePoints(allocator, cf_nfd_a);
        need_free_nfd_cf_nfd_a = true;
    }
    defer if (need_free_nfd_cf_nfd_a) allocator.free(nfd_cf_nfd_a);

    // Process b (same pipeline as a)
    const nfd_b = try normalizer.nfxdCodePoints(allocator, b, .nfd);
    defer allocator.free(nfd_b);

    var need_free_cf_nfd_b = false;
    var cf_nfd_b: []const u21 = nfd_b;
    if (self.changesWhenCaseFolded(nfd_b)) {
        cf_nfd_b = try self.caseFold(allocator, nfd_b);
        need_free_cf_nfd_b = true;
    }
    defer if (need_free_cf_nfd_b) allocator.free(cf_nfd_b);

    var need_free_nfd_cf_nfd_b = false;
    var nfd_cf_nfd_b = cf_nfd_b;
    if (!need_free_cf_nfd_b) {
        nfd_cf_nfd_b = try normalizer.nfdCodePoints(allocator, cf_nfd_b);
        need_free_nfd_cf_nfd_b = true;
    }
    defer if (need_free_nfd_cf_nfd_b) allocator.free(nfd_cf_nfd_b);

    return mem.eql(u21, nfd_cf_nfd_a, nfd_cf_nfd_b);
}
|
||||
|
||||
// Verifies the canonical (NFD) caseless match: ASCII fast path, a
// compatibility-only equivalence that must NOT match, and a canonical
// equivalence that must match.
test "canonCaselessMatch" {
    const allocator = testing.allocator;

    const norm_data = try Normalize.NormData.init(allocator);
    defer norm_data.deinit();
    const n = Normalize{ .norm_data = &norm_data };

    const fold_data = try FoldData.init(allocator);
    defer fold_data.deinit();
    const caser = Self{ .fold_data = &fold_data };

    try testing.expect(try caser.canonCaselessMatch(allocator, &n, "ascii only!", "ASCII Only!"));

    const a = "Héllo World! \u{3d3}";
    const b = "He\u{301}llo World! \u{3a5}\u{301}";
    try testing.expect(!try caser.canonCaselessMatch(allocator, &n, a, b));

    const c = "He\u{301}llo World! \u{3d2}\u{301}";
    try testing.expect(try caser.canonCaselessMatch(allocator, &n, a, c));
}
|
||||
49
deps/zg/src/CombiningData.zig
vendored
Normal file
49
deps/zg/src/CombiningData.zig
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
|
||||
allocator: mem.Allocator,
|
||||
s1: []u16 = undefined,
|
||||
s2: []u8 = undefined,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Loads the canonical combining class two-stage lookup table from the
/// compressed `ccc` data file embedded at build time.
/// Caller must release resources with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("ccc");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Stage 1: u16 length followed by that many u16 entries.
    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    // Stage 2: u16 length followed by raw bytes.
    // NOTE(review): readAll may return fewer bytes than requested on a
    // truncated stream; the short-read count is discarded here — confirm
    // the embedded data is always complete.
    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u8, stage_2_len);
    errdefer allocator.free(self.s2);
    _ = try reader.readAll(self.s2);

    return self;
}
|
||||
|
||||
/// Frees both stages of the lookup table.
pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
}

/// Returns the canonical combining class for a code point.
pub fn ccc(self: Self, cp: u21) u8 {
    // Two-stage table: high bits select a stage-1 block, low byte indexes
    // into stage 2.
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
}

/// True if `cp` is a starter code point, not a combining character.
/// (Starters have canonical combining class 0.)
pub fn isStarter(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] == 0;
}
|
||||
50
deps/zg/src/CompatData.zig
vendored
Normal file
50
deps/zg/src/CompatData.zig
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
|
||||
allocator: mem.Allocator,
|
||||
nfkd: [][]u21 = undefined,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Loads the compatibility decomposition table from the compressed `compat`
/// data file embedded at build time.
/// Caller must release resources with `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("compat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{
        .allocator = allocator,
        // One slot per possible code point (0x0 through 0x10FFFF).
        .nfkd = try allocator.alloc([]u21, 0x110000),
    };
    // deinit skips zero-length slices, so it is safe to invoke once the
    // table has been allocated (and memset below runs before any try).
    errdefer self.deinit();

    // Code points without a decomposition map to an empty slice.
    @memset(self.nfkd, &.{});

    // Record format: u8 record length (0 terminates the stream), u24 code
    // point, then (len - 1) u24 decomposition code points.
    while (true) {
        const len: u8 = try reader.readInt(u8, endian);
        if (len == 0) break;
        const cp = try reader.readInt(u24, endian);
        self.nfkd[cp] = try allocator.alloc(u21, len - 1);
        for (0..len - 1) |i| {
            self.nfkd[cp][i] = @intCast(try reader.readInt(u24, endian));
        }
    }

    return self;
}
|
||||
|
||||
/// Frees all decomposition slices and the table itself.
pub fn deinit(self: *const Self) void {
    for (self.nfkd) |slice| {
        // Unfilled entries are the empty-slice sentinel; nothing to free.
        if (slice.len != 0) self.allocator.free(slice);
    }
    self.allocator.free(self.nfkd);
}

/// Returns compatibility decomposition for `cp`.
/// An empty slice means `cp` has no compatibility decomposition.
/// NOTE(review): returns a mutable slice into internal data, unlike
/// CanonData.toNfd which returns []const u21 — confirm callers never mutate.
pub fn toNfkd(self: Self, cp: u21) []u21 {
    return self.nfkd[cp];
}
|
||||
355
deps/zg/src/DisplayWidth.zig
vendored
Normal file
355
deps/zg/src/DisplayWidth.zig
vendored
Normal file
@@ -0,0 +1,355 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const ArrayList = std.ArrayList;
|
||||
const mem = std.mem;
|
||||
const simd = std.simd;
|
||||
const testing = std.testing;
|
||||
|
||||
const ascii = @import("ascii");
|
||||
const CodePointIterator = @import("code_point").Iterator;
|
||||
const GraphemeIterator = @import("grapheme").Iterator;
|
||||
pub const DisplayWidthData = @import("DisplayWidthData");
|
||||
|
||||
data: *const DisplayWidthData,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// strWidth returns the total display width of `str` as the number of cells
/// required in a fixed-pitch font (i.e. a terminal screen).
pub fn strWidth(self: Self, str: []const u8) usize {
    // Signed running total: control characters can have negative width
    // (e.g. backspace); the result is clamped at zero.
    var total: isize = 0;

    // ASCII fast path
    if (ascii.isAsciiOnly(str)) {
        for (str) |b| total += self.data.codePointWidth(b);
        return @intCast(@max(0, total));
    }

    // Non-ASCII: width is computed per grapheme cluster.
    var giter = GraphemeIterator.init(str, &self.data.g_data);

    while (giter.next()) |gc| {
        var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
        var gc_total: isize = 0;

        while (cp_iter.next()) |cp| {
            var w = self.data.codePointWidth(cp.code);

            if (w != 0) {
                // Handle text emoji sequence.
                // NOTE(review): this peek consumes the following code point
                // of the cluster — intentional, since the loop breaks below.
                if (cp_iter.next()) |ncp| {
                    // emoji text sequence (VS15 forces narrow, VS16 wide).
                    if (ncp.code == 0xFE0E) w = 1;
                    if (ncp.code == 0xFE0F) w = 2;
                }

                // Only adding width of first non-zero-width code point.
                if (gc_total == 0) {
                    gc_total = w;
                    break;
                }
            }
        }

        total += gc_total;
    }

    return @intCast(@max(0, total));
}
|
||||
|
||||
// Exercises strWidth against ASCII, combining marks, emoji sequences,
// variation selectors, control characters, and CJK text.
test "strWidth" {
    const data = try DisplayWidthData.init(testing.allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    try testing.expectEqual(@as(usize, 5), self.strWidth("Hello\r\n"));
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{0065}\u{0301}"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{1F476}\u{1F3FF}\u{0308}\u{200D}\u{1F476}\u{1F3FF}"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Hello 😊"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 😊"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo :)"));
    try testing.expectEqual(@as(usize, 8), self.strWidth("Héllo 🇪🇸"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}")); // Lone emoji
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{26A1}\u{FE0E}")); // Text sequence
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{26A1}\u{FE0F}")); // Presentation sequence
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}")); // Default text presentation
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{2764}\u{FE0E}")); // Default text presentation with VS15 selector
    try testing.expectEqual(@as(usize, 2), self.strWidth("\u{2764}\u{FE0F}")); // Default text presentation with VS16 selector
    try testing.expectEqual(@as(usize, 0), self.strWidth("A\x08")); // Backspace
    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA")); // DEL
    try testing.expectEqual(@as(usize, 0), self.strWidth("\x7FA\x08\x08")); // never less than 0

    // wcwidth Python lib tests. See: https://github.com/jquast/wcwidth/blob/master/tests/test_core.py
    const empty = "";
    try testing.expectEqual(@as(usize, 0), self.strWidth(empty));
    const with_null = "hello\x00world";
    try testing.expectEqual(@as(usize, 10), self.strWidth(with_null));
    const hello_jp = "コンニチハ, セカイ!";
    try testing.expectEqual(@as(usize, 19), self.strWidth(hello_jp));
    const control = "\x1b[0m";
    try testing.expectEqual(@as(usize, 3), self.strWidth(control));
    const balinese = "\u{1B13}\u{1B28}\u{1B2E}\u{1B44}";
    try testing.expectEqual(@as(usize, 3), self.strWidth(balinese));

    // These commented out tests require a new specification for complex scripts.
    // See: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
    // const jamo = "\u{1100}\u{1160}";
    // try testing.expectEqual(@as(usize, 3), strWidth(jamo));
    // const devengari = "\u{0915}\u{094D}\u{0937}\u{093F}";
    // try testing.expectEqual(@as(usize, 3), strWidth(devengari));
    // const tamal = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcc}";
    // try testing.expectEqual(@as(usize, 5), strWidth(tamal));
    // const kannada_1 = "\u{0cb0}\u{0ccd}\u{0c9d}\u{0cc8}";
    // try testing.expectEqual(@as(usize, 3), strWidth(kannada_1));
    // The following passes but as a mere coincidence.
    const kannada_2 = "\u{0cb0}\u{0cbc}\u{0ccd}\u{0c9a}";
    try testing.expectEqual(@as(usize, 2), self.strWidth(kannada_2));

    // From Rust https://github.com/jameslanska/unicode-display-width
    try testing.expectEqual(@as(usize, 15), self.strWidth("🔥🗡🍩👩🏻🚀⏰💃🏼🔦👍🏻"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("🦀"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("👨👩👧👧"));
    try testing.expectEqual(@as(usize, 2), self.strWidth("👩🔬"));
    try testing.expectEqual(@as(usize, 9), self.strWidth("sane text"));
    try testing.expectEqual(@as(usize, 9), self.strWidth("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"));
    try testing.expectEqual(@as(usize, 17), self.strWidth("슬라바 우크라이나"));
    try testing.expectEqual(@as(usize, 1), self.strWidth("\u{378}"));
}
|
||||
|
||||
/// centers `str` in a new string of width `total_width` (in display cells) using `pad` as padding.
/// If the length of `str` and `total_width` have different parity, the right side of `str` will
/// receive one additional pad. This makes sure the returned string fills the requested width.
/// Caller must free returned bytes with `allocator`.
pub fn center(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;
    // Exact fit: no padding needed, return a copy.
    if (str_width == total_width) return try allocator.dupe(u8, str);

    const pad_width = self.strWidth(pad);
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    // Width of one side margin (in cells).
    const margin_width = @divFloor((total_width - str_width), 2);
    if (pad_width > margin_width) return error.PadTooLong;
    // Parity mismatch: the right side gets one extra pad.
    const extra_pad: usize = if (total_width % 2 != str_width % 2) 1 else 0;
    const pads = @divFloor(margin_width, pad_width) * 2 + extra_pad;

    var result = try allocator.alloc(u8, pads * pad.len + str.len);
    var bytes_index: usize = 0;
    var pads_index: usize = 0;

    // Left margin.
    while (pads_index < pads / 2) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    @memcpy(result[bytes_index..][0..str.len], str);
    bytes_index += str.len;

    // Right margin (plus the extra pad on parity mismatch).
    pads_index = 0;
    while (pads_index < pads / 2 + extra_pad) : (pads_index += 1) {
        @memcpy(result[bytes_index..][0..pad.len], pad);
        bytes_index += pad.len;
    }

    return result;
}
|
||||
|
||||
// Exercises center across parity combinations, exact fit, empty input,
// zero width, and the too-long error path.
test "center" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    // Input and width both have odd length
    var centered = try self.center(allocator, "abc", 9, "*");
    try testing.expectEqualSlices(u8, "***abc***", centered);

    // Input and width both have even length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "w😊w", 10, "-");
    try testing.expectEqualSlices(u8, "---w😊w---", centered);

    // Input has even length, width has odd length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "1234", 9, "-");
    try testing.expectEqualSlices(u8, "--1234---", centered);

    // Input has odd length, width has even length
    testing.allocator.free(centered);
    centered = try self.center(allocator, "123", 8, "-");
    try testing.expectEqualSlices(u8, "--123---", centered);

    // Input is the same length as the width
    testing.allocator.free(centered);
    centered = try self.center(allocator, "123", 3, "-");
    try testing.expectEqualSlices(u8, "123", centered);

    // Input is empty
    testing.allocator.free(centered);
    centered = try self.center(allocator, "", 3, "-");
    try testing.expectEqualSlices(u8, "---", centered);

    // Input is empty and width is zero
    testing.allocator.free(centered);
    centered = try self.center(allocator, "", 0, "-");
    try testing.expectEqualSlices(u8, "", centered);

    // Input is longer than the width, which is an error
    testing.allocator.free(centered);
    try testing.expectError(error.StrTooLong, self.center(allocator, "123", 2, "-"));
}
|
||||
|
||||
/// padLeft returns a new string of width `total_width` (in display cells) using `pad` as padding
/// on the left side. Caller must free returned bytes with `allocator`.
pub fn padLeft(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;

    const pad_width = self.strWidth(pad);
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    // Cells left over for padding.
    const margin_width = total_width - str_width;
    if (pad_width > margin_width) return error.PadTooLong;

    const pad_count = @divFloor(margin_width, pad_width);

    const result = try allocator.alloc(u8, pad_count * pad.len + str.len);
    var offset: usize = 0;

    // Padding first, then the string.
    for (0..pad_count) |_| {
        @memcpy(result[offset..][0..pad.len], pad);
        offset += pad.len;
    }

    @memcpy(result[offset..][0..str.len], str);

    return result;
}
|
||||
|
||||
// Exercises padLeft with single-cell and double-cell (emoji) content.
test "padLeft" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    var right_aligned = try self.padLeft(allocator, "abc", 9, "*");
    defer testing.allocator.free(right_aligned);
    try testing.expectEqualSlices(u8, "******abc", right_aligned);

    testing.allocator.free(right_aligned);
    right_aligned = try self.padLeft(allocator, "w😊w", 10, "-");
    try testing.expectEqualSlices(u8, "------w😊w", right_aligned);
}
|
||||
|
||||
/// padRight returns a new string of width `total_width` (in display cells) using `pad` as padding
/// on the right side. Caller must free returned bytes with `allocator`.
pub fn padRight(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    total_width: usize,
    pad: []const u8,
) ![]u8 {
    const str_width = self.strWidth(str);
    if (str_width > total_width) return error.StrTooLong;

    const pad_width = self.strWidth(pad);
    if (pad_width > total_width or str_width + pad_width > total_width) return error.PadTooLong;

    // Cells left over for padding.
    const margin_width = total_width - str_width;
    if (pad_width > margin_width) return error.PadTooLong;

    const pad_count = @divFloor(margin_width, pad_width);

    const result = try allocator.alloc(u8, pad_count * pad.len + str.len);

    // String first, then the padding.
    @memcpy(result[0..str.len], str);
    var offset: usize = str.len;

    for (0..pad_count) |_| {
        @memcpy(result[offset..][0..pad.len], pad);
        offset += pad.len;
    }

    return result;
}
|
||||
|
||||
test "padRight" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const self = Self{ .data = &data };

    // One defer per allocation: the original freed the first result manually
    // and re-assigned the variable under a single defer, which would double
    // free the first buffer if the second padRight call returned an error.
    const left_aligned = try self.padRight(allocator, "abc", 9, "*");
    defer allocator.free(left_aligned);
    try testing.expectEqualSlices(u8, "abc******", left_aligned);

    // The emoji occupies two display cells, so only six '-' pads are needed.
    const wide_aligned = try self.padRight(allocator, "w😊w", 10, "-");
    defer allocator.free(wide_aligned);
    try testing.expectEqualSlices(u8, "w😊w------", wide_aligned);
}
|
||||
|
||||
/// Wraps a string approximately at the given number of columns per line.
/// `threshold` defines how far the last column of the last word can be
/// from the edge. Caller must free returned bytes with `allocator`.
pub fn wrap(
    self: Self,
    allocator: mem.Allocator,
    str: []const u8,
    columns: usize,
    threshold: usize,
) ![]u8 {
    var result = ArrayList(u8).init(allocator);
    defer result.deinit();

    var line_iter = mem.tokenizeAny(u8, str, "\r\n");
    var line_width: usize = 0;

    while (line_iter.next()) |line| {
        var word_iter = mem.tokenizeScalar(u8, line, ' ');

        while (word_iter.next()) |word| {
            try result.appendSlice(word);
            try result.append(' ');
            line_width += self.strWidth(word) + 1;

            // Break when the line is full or the remaining slack is within
            // threshold. Short-circuit `or` keeps the subtraction from
            // underflowing when line_width > columns.
            if (line_width > columns or columns - line_width <= threshold) {
                try result.append('\n');
                line_width = 0;
            }
        }
    }

    // Trim the trailing separator: a '\n' if the last word triggered a
    // break, then the ' ' appended after the last word. The original popped
    // twice unconditionally, which ate the final byte of the last word when
    // no break was emitted and panicked on empty input.
    if (result.items.len > 0 and result.items[result.items.len - 1] == '\n') _ = result.pop();
    if (result.items.len > 0 and result.items[result.items.len - 1] == ' ') _ = result.pop();

    return try result.toOwnedSlice();
}
|
||||
|
||||
test "wrap" {
    const allocator = testing.allocator;
    const data = try DisplayWidthData.init(allocator);
    defer data.deinit();
    const dw = Self{ .data = &data };

    // Source newlines are treated as word separators; breaks are re-inserted
    // once a line comes within `threshold` of `columns`.
    const input = "The quick brown fox\r\njumped over the lazy dog!";
    const expected = "The quick \nbrown fox \njumped \nover the \nlazy dog!";

    const actual = try dw.wrap(allocator, input, 10, 3);
    defer allocator.free(actual);

    try testing.expectEqualStrings(expected, actual);
}
|
||||
98
deps/zg/src/FoldData.zig
vendored
Normal file
98
deps/zg/src/FoldData.zig
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;

allocator: mem.Allocator,
// First code point with no case-fold data; anything >= cutoff maps to itself.
cutoff: u21 = undefined,
// Inclusive range + list of code points excluded from changesWhenCaseFolded.
cwcf_exceptions_min: u21 = undefined,
cwcf_exceptions_max: u21 = undefined,
cwcf_exceptions: []u21 = undefined,
// Offset into stage3 where multi-code-point mappings begin.
multiple_start: u21 = undefined,
// Three-stage lookup table: stage1[cp >> 8] selects a stage2 page,
// stage2 yields a stage3 index, stage3 holds signed deltas or multi-cp data.
stage1: []u8 = undefined,
stage2: []u8 = undefined,
stage3: []i24 = undefined,

const Self = @This();

/// Decompresses and deserializes the embedded case-fold tables.
/// NOTE(review): data is read with native endianness, so the embedded blob
/// is presumably generated for the target's byte order — confirm against
/// the generator.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("fold");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };
    self.cutoff = @intCast(try reader.readInt(u24, endian));
    self.multiple_start = @intCast(try reader.readInt(u24, endian));

    var len = try reader.readInt(u16, endian);
    self.stage1 = try allocator.alloc(u8, len);
    errdefer allocator.free(self.stage1);
    for (0..len) |i| self.stage1[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    self.stage2 = try allocator.alloc(u8, len);
    errdefer allocator.free(self.stage2);
    for (0..len) |i| self.stage2[i] = try reader.readInt(u8, endian);

    len = try reader.readInt(u16, endian);
    self.stage3 = try allocator.alloc(i24, len);
    errdefer allocator.free(self.stage3);
    for (0..len) |i| self.stage3[i] = try reader.readInt(i24, endian);

    self.cwcf_exceptions_min = @intCast(try reader.readInt(u24, endian));
    self.cwcf_exceptions_max = @intCast(try reader.readInt(u24, endian));
    len = try reader.readInt(u16, endian);
    self.cwcf_exceptions = try allocator.alloc(u21, len);
    // Fix: without this errdefer, a failed read below leaked the slice.
    errdefer allocator.free(self.cwcf_exceptions);
    for (0..len) |i| self.cwcf_exceptions[i] = @intCast(try reader.readInt(u24, endian));

    return self;
}

pub fn deinit(self: *const Self) void {
    self.allocator.free(self.stage1);
    self.allocator.free(self.stage2);
    self.allocator.free(self.stage3);
    // Fix: cwcf_exceptions is allocated in init and was previously leaked.
    self.allocator.free(self.cwcf_exceptions);
}

/// Returns the case fold for `cp`. Multi-code-point folds are written into
/// `buf` (which must hold at least 3 elements); an empty slice means `cp`
/// folds to itself.
pub fn caseFold(self: Self, cp: u21, buf: []u21) []const u21 {
    if (cp >= self.cutoff) return &.{};

    const stage1_val = self.stage1[cp >> 8];
    if (stage1_val == 0) return &.{};

    const stage2_index = @as(usize, stage1_val) * 256 + (cp & 0xFF);
    const stage3_index = self.stage2[stage2_index];

    // High bit set: the low 7 bits index a zero-terminated group of up to
    // three code points stored past multiple_start.
    if (stage3_index & 0x80 != 0) {
        const real_index = @as(usize, self.multiple_start) + (stage3_index ^ 0x80) * 3;
        const mapping = mem.sliceTo(self.stage3[real_index..][0..3], 0);
        for (mapping, 0..) |c, i| buf[i] = @intCast(c);

        return buf[0..mapping.len];
    }

    // Otherwise stage3 holds a signed delta from cp to its fold.
    const offset = self.stage3[stage3_index];
    if (offset == 0) return &.{};

    buf[0] = @intCast(@as(i32, cp) + offset);

    return buf[0..1];
}

/// Returns true when caseFold(NFD(`cp`)) != NFD(`cp`).
pub fn changesWhenCaseFolded(self: Self, cp: u21) bool {
    var buf: [3]u21 = undefined;
    const has_mapping = self.caseFold(cp, &buf).len != 0;
    return has_mapping and !self.isCwcfException(cp);
}

// True when `cp` is listed as an exception to "changes when case folded".
fn isCwcfException(self: Self, cp: u21) bool {
    return cp >= self.cwcf_exceptions_min and
        cp <= self.cwcf_exceptions_max and
        std.mem.indexOfScalar(u21, self.cwcf_exceptions, cp) != null;
}
|
||||
171
deps/zg/src/GenCatData.zig
vendored
Normal file
171
deps/zg/src/GenCatData.zig
vendored
Normal file
@@ -0,0 +1,171 @@
|
||||
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;

/// General Category
pub const Gc = enum {
    Cc, // Other, Control
    Cf, // Other, Format
    Cn, // Other, Unassigned
    Co, // Other, Private Use
    Cs, // Other, Surrogate
    Ll, // Letter, Lowercase
    Lm, // Letter, Modifier
    Lo, // Letter, Other
    Lu, // Letter, Uppercase
    Lt, // Letter, Titlecase
    Mc, // Mark, Spacing Combining
    Me, // Mark, Enclosing
    Mn, // Mark, Non-Spacing
    Nd, // Number, Decimal Digit
    Nl, // Number, Letter
    No, // Number, Other
    Pc, // Punctuation, Connector
    Pd, // Punctuation, Dash
    Pe, // Punctuation, Close
    Pf, // Punctuation, Final quote (may behave like Ps or Pe depending on usage)
    Pi, // Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
    Po, // Punctuation, Other
    Ps, // Punctuation, Open
    Sc, // Symbol, Currency
    Sk, // Symbol, Modifier
    Sm, // Symbol, Math
    So, // Symbol, Other
    Zl, // Separator, Line
    Zp, // Separator, Paragraph
    Zs, // Separator, Space
};

allocator: mem.Allocator,
// Three-stage lookup table (see `gc`): s1[cp >> 8] selects a 256-entry page
// in s2; s2 yields an index into s3; s3 holds the Gc enum value.
s1: []u16 = undefined,
s2: []u5 = undefined,
s3: []u5 = undefined,

const Self = @This();

/// Decompresses and deserializes the embedded General Category tables.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("gencat");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u5, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));

    // NOTE(review): s3's length is serialized as a single u8, unlike the u16
    // used for s1/s2 (s3 holds at most 31 distinct category values, so it
    // fits) — confirm against the table generator.
    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u5, s3_len);
    errdefer allocator.free(self.s3);
    for (0..s3_len) |i| self.s3[i] = @intCast(try reader.readInt(u8, endian));

    return self;
}

pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
    self.allocator.free(self.s3);
}

/// Lookup the General Category for `cp`.
pub fn gc(self: Self, cp: u21) Gc {
    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]]);
}

/// True if `cp` has an C general category.
pub fn isControl(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Cc,
        .Cf,
        .Cn,
        .Co,
        .Cs,
        => true,
        else => false,
    };
}

/// True if `cp` has an L general category.
pub fn isLetter(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Ll,
        .Lm,
        .Lo,
        .Lu,
        .Lt,
        => true,
        else => false,
    };
}

/// True if `cp` has an M general category.
pub fn isMark(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Mc,
        .Me,
        .Mn,
        => true,
        else => false,
    };
}

/// True if `cp` has an N general category.
pub fn isNumber(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Nd,
        .Nl,
        .No,
        => true,
        else => false,
    };
}

/// True if `cp` has an P general category.
pub fn isPunctuation(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Pc,
        .Pd,
        .Pe,
        .Pf,
        .Pi,
        .Po,
        .Ps,
        => true,
        else => false,
    };
}

/// True if `cp` has an S general category.
pub fn isSymbol(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Sc,
        .Sk,
        .Sm,
        .So,
        => true,
        else => false,
    };
}

/// True if `cp` has an Z general category.
pub fn isSeparator(self: Self, cp: u21) bool {
    return switch (self.gc(cp)) {
        .Zl,
        .Zp,
        .Zs,
        => true,
        else => false,
    };
}
|
||||
88
deps/zg/src/GraphemeData.zig
vendored
Normal file
88
deps/zg/src/GraphemeData.zig
vendored
Normal file
@@ -0,0 +1,88 @@
|
||||
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;

/// Indic syllable type.
pub const Indic = enum {
    none,

    Consonant,
    Extend,
    Linker,
};

/// Grapheme break property.
pub const Gbp = enum {
    none,
    Control,
    CR,
    Extend,
    L,
    LF,
    LV,
    LVT,
    Prepend,
    Regional_Indicator,
    SpacingMark,
    T,
    V,
    ZWJ,
};

allocator: mem.Allocator,
// Three-stage lookup table. Each s3 byte packs three properties:
// bits 4-7 = Gbp, bits 1-3 = Indic, bit 0 = emoji flag (see accessors below).
s1: []u16 = undefined,
s2: []u16 = undefined,
s3: []u8 = undefined,

const Self = @This();

/// Decompresses and deserializes the embedded grapheme-break tables.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("gbp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u16, s2_len);
    errdefer allocator.free(self.s2);
    for (0..s2_len) |i| self.s2[i] = try reader.readInt(u16, endian);

    // s3 is raw bytes, so it can be filled with a single bulk read.
    const s3_len: u16 = try reader.readInt(u16, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    _ = try reader.readAll(self.s3);

    return self;
}

pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
    self.allocator.free(self.s3);
}

/// Lookup the grapheme break property for a code point.
pub fn gbp(self: Self, cp: u21) Gbp {
    return @enumFromInt(self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 4);
}

/// Lookup the indic syllable type for a code point.
pub fn indic(self: Self, cp: u21) Indic {
    return @enumFromInt((self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] >> 1) & 0x7);
}

/// True if the emoji-property bit is set for `cp`.
/// (Original doc comment was a copy-paste of `indic`'s.)
pub fn isEmoji(self: Self, cp: u21) bool {
    return self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]] & 1 == 1;
}
|
||||
53
deps/zg/src/HangulData.zig
vendored
Normal file
53
deps/zg/src/HangulData.zig
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;

/// Hangul syllable type of a code point; `none` for non-Hangul.
pub const Syllable = enum {
    none,
    L,
    LV,
    LVT,
    V,
    T,
};

allocator: mem.Allocator,
// Two-stage lookup table: s1[cp >> 8] selects a page; s2 holds the
// Syllable enum value (fits in 3 bits).
s1: []u16 = undefined,
s2: []u3 = undefined,

const Self = @This();

/// Decompresses and deserializes the embedded Hangul syllable-type tables.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("hangul");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u3, stage_2_len);
    errdefer allocator.free(self.s2);
    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));

    return self;
}

pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
}

/// Returns the Hangul syllable type for `cp`.
pub fn syllable(self: Self, cp: u21) Syllable {
    return @enumFromInt(self.s2[self.s1[cp >> 8] + (cp & 0xff)]);
}
|
||||
37
deps/zg/src/NormData.zig
vendored
Normal file
37
deps/zg/src/NormData.zig
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
const std = @import("std");
const mem = std.mem;

const CanonData = @import("CanonData");
const CccData = @import("CombiningData");
const CompatData = @import("CompatData");
const FoldData = @import("FoldData");
const HangulData = @import("HangulData");
const NormPropsData = @import("NormPropsData");

// Aggregates all Unicode data tables needed for normalization.
canon_data: CanonData = undefined,
ccc_data: CccData = undefined,
compat_data: CompatData = undefined,
hangul_data: HangulData = undefined,
normp_data: NormPropsData = undefined,

const Self = @This();

/// Initializes all member tables in place (out-pointer style, unlike the
/// value-returning inits of the member types). On error, every table
/// initialized so far is torn down via the errdefer chain; the last init
/// needs no errdefer since nothing can fail after it.
pub fn init(self: *Self, allocator: std.mem.Allocator) !void {
    self.canon_data = try CanonData.init(allocator);
    errdefer self.canon_data.deinit();
    self.ccc_data = try CccData.init(allocator);
    errdefer self.ccc_data.deinit();
    self.compat_data = try CompatData.init(allocator);
    errdefer self.compat_data.deinit();
    self.hangul_data = try HangulData.init(allocator);
    errdefer self.hangul_data.deinit();
    self.normp_data = try NormPropsData.init(allocator);
}

/// Frees all member tables.
pub fn deinit(self: *Self) void {
    self.canon_data.deinit();
    self.ccc_data.deinit();
    self.compat_data.deinit();
    self.hangul_data.deinit();
    self.normp_data.deinit();
}
|
||||
54
deps/zg/src/NormPropsData.zig
vendored
Normal file
54
deps/zg/src/NormPropsData.zig
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
const std = @import("std");
const builtin = @import("builtin");
const compress = std.compress;
const mem = std.mem;
const testing = std.testing;

allocator: mem.Allocator,
// Two-stage lookup table: s1[cp >> 8] selects a page; each s2 nibble packs
// per-code-point flags (bit 0 = not-NFD, bit 1 = not-NFKD, bit 2 = FCX).
s1: []u16 = undefined,
s2: []u4 = undefined,

const Self = @This();

/// Decompresses and deserializes the embedded normalization-property tables.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("normp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();
    var self = Self{ .allocator = allocator };

    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u4, stage_2_len);
    errdefer allocator.free(self.s2);
    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(u8, endian));

    return self;
}

pub fn deinit(self: *const Self) void {
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
}

/// Returns true if `cp` is already in NFD form.
pub fn isNfd(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 1 == 0;
}

/// Returns true if `cp` is already in NFKD form.
pub fn isNfkd(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 2 == 0;
}

/// Returns true if `cp` is not allowed in any normalized form.
pub fn isFcx(self: Self, cp: u21) bool {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)] & 4 == 4;
}
|
||||
622
deps/zg/src/Normalize.zig
vendored
Normal file
622
deps/zg/src/Normalize.zig
vendored
Normal file
@@ -0,0 +1,622 @@
|
||||
//! Normalizer contains functions and methods that implement
|
||||
//! Unicode Normalization. You can normalize strings into NFC,
|
||||
//! NFKC, NFD, and NFKD normalization forms.
|
||||
|
||||
const std = @import("std");
|
||||
const debug = std.debug;
|
||||
const assert = debug.assert;
|
||||
const fmt = std.fmt;
|
||||
const heap = std.heap;
|
||||
const mem = std.mem;
|
||||
const simd = std.simd;
|
||||
const testing = std.testing;
|
||||
const unicode = std.unicode;
|
||||
|
||||
const ascii = @import("ascii");
|
||||
const CodePointIterator = @import("code_point").Iterator;
|
||||
pub const NormData = @import("NormData");
|
||||
|
||||
norm_data: *const NormData,
|
||||
|
||||
const Self = @This();
|
||||
|
||||
// Hangul syllable arithmetic constants (syllable base, jamo bases and counts).
const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const LCount: u21 = 19;
const VCount: u21 = 21;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount
const SCount: u21 = 11172; // LCount * NCount

// Arithmetically decomposes a precomposed Hangul syllable (LV or LVT) into
// its leading/vowel(/trailing) jamo, written into `buf` (needs >= 3 slots).
// Returns null when `cp` is not a precomposed syllable.
fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
    const kind = self.norm_data.hangul_data.syllable(cp);
    if (kind != .LV and kind != .LVT) return null;

    const SIndex: u21 = cp - SBase;
    const LIndex: u21 = SIndex / NCount;
    const VIndex: u21 = (SIndex % NCount) / TCount;
    const TIndex: u21 = SIndex % TCount;
    const LPart: u21 = LBase + LIndex;
    const VPart: u21 = VBase + VIndex;

    var dc = Decomp{ .form = .nfd };
    buf[0] = LPart;
    buf[1] = VPart;

    // TIndex == 0 means an LV syllable: no trailing consonant part.
    if (TIndex == 0) {
        dc.cps = buf[0..2];
        return dc;
    }

    // TPart
    buf[2] = TBase + TIndex;
    dc.cps = buf[0..3];
    return dc;
}
|
||||
|
||||
// Composes an LV syllable with a trailing consonant jamo `t` into an LVT
// syllable. Asserts `t` is in the trailing-jamo range.
fn composeHangulCanon(lv: u21, t: u21) u21 {
    assert(0x11A8 <= t and t <= 0x11C2);
    return lv + (t - TBase);
}

// Arithmetically composes leading jamo `l`, vowel jamo `v`, and optional
// trailing jamo `t` (0 for none) into a precomposed syllable. Asserts each
// jamo is in its valid range.
fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    assert(0x1100 <= l and l <= 0x1112);
    assert(0x1161 <= v and v <= 0x1175);
    const LIndex = l - LBase;
    const VIndex = v - VBase;
    const LVIndex = LIndex * NCount + VIndex * TCount;

    if (t == 0) return SBase + LVIndex;

    assert(0x11A8 <= t and t <= 0x11C2);
    const TIndex = t - TBase;

    return SBase + LVIndex + TIndex;
}
|
||||
|
||||
// Normalization form of a (partial) result; `same` means "no change".
const Form = enum {
    nfc,
    nfd,
    nfkc,
    nfkd,
    same,
};

// A decomposition result: the form it satisfies plus its code points.
// `form == .same` with empty `cps` means the input maps to itself.
const Decomp = struct {
    form: Form = .same,
    cps: []const u21 = &.{},
};

// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
fn mapping(self: Self, cp: u21, form: Form) Decomp {
    var dc = Decomp{};

    switch (form) {
        .nfd => {
            dc.cps = self.norm_data.canon_data.toNfd(cp);
            if (dc.cps.len != 0) dc.form = .nfd;
        },

        .nfkd => {
            // Prefer the compatibility mapping; fall back to the canonical
            // one (a canonical decomposition is also a valid NFKD step).
            dc.cps = self.norm_data.compat_data.toNfkd(cp);
            if (dc.cps.len != 0) {
                dc.form = .nfkd;
            } else {
                dc.cps = self.norm_data.canon_data.toNfd(cp);
                if (dc.cps.len != 0) dc.form = .nfkd;
            }
        },

        else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
    }

    return dc;
}
|
||||
|
||||
// `decompose` a code point to the specified normalization form, which should be either `.nfd` or `.nfkd`.
// The full decomposition is written into `buf`; the returned Decomp's `cps`
// slices into it. Returns `.same` (empty cps) when `cp` needs no change.
fn decompose(
    self: Self,
    cp: u21,
    form: Form,
    buf: []u21,
) Decomp {
    // ASCII
    if (cp < 128) return .{};

    // NFD / NFKD quick checks.
    switch (form) {
        .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
        .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
        else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
    }

    // Hangul precomposed syllable full decomposition.
    if (self.decomposeHangul(cp, buf)) |dc| return dc;

    // Full decomposition.
    var dc = Decomp{ .form = form };

    var result_index: usize = 0;
    var work_index: usize = 1;

    // Start work with argument code point.
    // 18 is the work-queue capacity (1 + 17 slots for expansion).
    var work = [_]u21{cp} ++ [_]u21{0} ** 17;

    // Iteratively expand the work queue until every code point is fully
    // decomposed (i.e. maps to itself).
    while (work_index > 0) {
        // Look at previous code point in work queue.
        work_index -= 1;
        const next = work[work_index];
        const m = self.mapping(next, form);

        // No more of decompositions for this code point.
        if (m.form == .same) {
            buf[result_index] = next;
            result_index += 1;
            continue;
        }

        // Work backwards through decomposition.
        // `i` starts at 1 because m_last is 1 past the last code point.
        var i: usize = 1;
        while (i <= m.cps.len) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m.cps[m.cps.len - i];
        }
    }

    dc.cps = buf[0..result_index];

    return dc;
}
|
||||
|
||||
// Exercises canonical, compatibility, Hangul, and no-op decomposition paths.
test "decompose" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    var buf: [18]u21 = undefined;

    var dc = n.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    // No canonical decomposition exists for this compatibility character.
    dc = n.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);

    dc = n.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);

    // Hangul decomposition reports .nfd for both requested forms.
    dc = n.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}
|
||||
|
||||
/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
pub const Result = struct {
    // Null when `slice` borrows the caller's input (no allocation happened);
    // then deinit is a no-op.
    allocator: ?mem.Allocator = null,
    slice: []const u8,

    pub fn deinit(self: *const Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
|
||||
|
||||
// Compares code points by Canonical Combining Class order.
fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
}

// Applies the Canonical Sorting Algorithm.
// Sorts each maximal run of code points with non-zero CCC; starters
// (ccc == 0) act as boundaries and are never reordered.
fn canonicalSort(self: Self, cps: []u21) void {
    var i: usize = 0;
    while (i < cps.len) : (i += 1) {
        const start: usize = i;
        while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
        mem.sort(u21, cps[start..i], self, cccLess);
    }
}
|
||||
|
||||
/// Normalize `str` to NFD.
/// Call `Result.deinit` on the return value to free any allocated memory.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfd);
}

/// Normalize `str` to NFKD.
/// Call `Result.deinit` on the return value to free any allocated memory.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfkd);
}
|
||||
|
||||
/// Decomposes `str` to `form` (.nfd or .nfkd) and returns the resulting
/// code points in canonical order. Caller owns the returned slice.
pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();

    var cp_iter = CodePointIterator{ .bytes = str };
    var dc_buf: [18]u21 = undefined;

    while (cp_iter.next()) |cp| {
        const dc = self.decompose(cp.code, form, &dc_buf);
        if (dc.form == .same) {
            // No decomposition; keep the original code point.
            try dcp_list.append(cp.code);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }

    self.canonicalSort(dcp_list.items);

    return try dcp_list.toOwnedSlice();
}
|
||||
|
||||
// Decomposes `str` to `form` and re-encodes the result as UTF-8.
// ASCII-only input is returned as a borrowed slice (Result.allocator null).
fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // Quick checks.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };

    const dcps = try self.nfxdCodePoints(allocator, str, form);
    defer allocator.free(dcps);

    var dstr_list = std.ArrayList(u8).init(allocator);
    defer dstr_list.deinit();
    var buf: [4]u8 = undefined;

    for (dcps) |dcp| {
        // dcps came from valid decompositions, so encoding cannot fail.
        const len = unicode.utf8Encode(dcp, &buf) catch unreachable;
        try dstr_list.appendSlice(buf[0..len]);
    }

    return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() };
}
|
||||
|
||||
// ASCII input takes the borrowed-slice fast path (no allocation).
test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

// Non-ASCII input allocates; é canonically decomposes, ϓ (U+03D3) keeps
// its canonical base under NFD.
test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
}

test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfkd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

// Under NFKD, U+03D3's base compatibility-decomposes further to U+03A5.
test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
}
|
||||
|
||||
/// Canonically decomposes `cps` (NFD) and returns the resulting code
/// points in canonical order. Caller owns the returned slice.
pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();

    // Scratch buffer for one code point's full decomposition.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const decomp = self.decompose(cp, .nfd, &scratch);
        switch (decomp.form) {
            .same => try out.append(cp),
            else => try out.appendSlice(decomp.cps),
        }
    }

    self.canonicalSort(out.items);
    return try out.toOwnedSlice();
}
|
||||
|
||||
/// Compatibility-decomposes `cps` (NFKD) and returns the resulting code
/// points in canonical order. Caller owns the returned slice.
pub fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    var out = std.ArrayList(u21).init(allocator);
    defer out.deinit();

    // Scratch buffer for one code point's full decomposition.
    var scratch: [18]u21 = undefined;

    for (cps) |cp| {
        const decomp = self.decompose(cp, .nfkd, &scratch);
        switch (decomp.form) {
            .same => try out.append(cp),
            else => try out.appendSlice(decomp.cps),
        }
    }

    self.canonicalSort(out.items);
    return try out.toOwnedSlice();
}
|
||||
|
||||
// Composition (NFC, NFKC)
|
||||
|
||||
/// True if `cp` has a Hangul syllable type (jamo or precomposed syllable).
fn isHangul(self: Self, cp: u21) bool {
    // Nothing below the first jamo (U+1100) can be Hangul; skip the lookup.
    if (cp < 0x1100) return false;
    return self.norm_data.hangul_data.syllable(cp) != .none;
}
|
||||
|
||||
/// Normalizes `str` to NFC (canonical decomposition then composition).
/// Call `deinit` on the returned `Result`. For ASCII-only or Latin-1-only
/// input the quick checks in `nfxc` return `str` itself (no allocation).
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfc);
}
|
||||
|
||||
/// Normalizes `str` to NFKC (compatibility decomposition then composition).
/// Call `deinit` on the returned `Result`. For ASCII-only input the quick
/// check in `nfxc` returns `str` itself (no allocation).
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfkc);
}
|
||||
|
||||
/// Shared implementation of NFC/NFKC: decompose, then repeatedly apply
/// the Unicode canonical composition algorithm until a full pass makes no
/// replacements, then UTF-8 encode the surviving code points.
fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // Quick checks: already-composed subsets where the input can be
    // returned as-is, unallocated.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };

    // Decompose first (canonical for NFC, compatibility for NFKC).
    var dcps = if (form == .nfc)
        try self.nfxdCodePoints(allocator, str, .nfd)
    else
        try self.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);

    // Compose. Deleted code points are overwritten with a sentinel rather
    // than compacted, so indices stay stable within a pass.
    const tombstone = 0xe000; // Start of BMP Private Use Area

    // Loop over all decomposed code points until a pass deletes nothing.
    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;

        // For each code point, C, find the preceding
        // starter code point L, if any.
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
            const cc_C = self.norm_data.ccc_data.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;

            // Seek back to find starter L, if any.
            while (true) {
                j -= 1;
                if (dcps[j] == tombstone) continue;

                // Check for starter (combining class 0).
                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
                    // Check for blocking conditions: any code point B
                    // between L and C with ccc(B) >= ccc(C) blocks the pair.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
                        const cc_B = self.norm_data.ccc_data.ccc(B);
                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                        if (cc_B >= cc_C) continue :block_check;
                    }

                    // Found starter at j.
                    starter_index = j;
                    break;
                }

                if (j == 0) break;
            }

            // If we have a starter L, see if there's a primary
            // composite, P, for the sequence L, C. If so, we must
            // replace L with P and delete C.
            if (starter_index) |sidx| {
                const L = dcps[sidx];
                var processed_hangul = false;

                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
                if (self.isHangul(L) and self.isHangul(C)) {
                    // Get Hangul syllable types.
                    const l_stype = self.norm_data.hangul_data.syllable(L);
                    const c_stype = self.norm_data.hangul_data.syllable(C);

                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
                        dcps[sidx] = composeHangulCanon(L, C);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (l_stype == .L and c_stype == .V) {
                        // L, V full composition. L, V, T is handled via main loop.
                        dcps[sidx] = composeHangulFull(L, C, 0);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (processed_hangul) deleted += 1;
                }

                // If no composition has occurred yet.
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for primary composite
                    // in the Unicode Character Database.
                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C.
                        // We must check if P is not in the Full
                        // Composition Exclusions (FCX) list,
                        // preventing it from appearing in any
                        // composed form (NFC, NFKC).
                        if (!self.norm_data.normp_data.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }

        // If we have no deletions, the code point sequence
        // has been fully composed; encode and return it.
        if (deleted == 0) {
            var cstr_list = std.ArrayList(u8).init(allocator);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;

            for (dcps) |cp| {
                if (cp == tombstone) continue; // "Delete"
                // utf8Encode cannot fail here: every cp came from decoding
                // valid UTF-8 or from the composition tables.
                const len = unicode.utf8Encode(cp, &buf) catch unreachable;
                try cstr_list.appendSlice(buf[0..len]);
            }

            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }
    }
}
|
||||
|
||||
// U+03D2 + combining acute composes to the precomposed U+03D3 under NFC.
test "nfc" {
    var norm_data: NormData = undefined;
    try NormData.init(&norm_data, testing.allocator);
    defer norm_data.deinit();

    const normalizer = Self{ .norm_data = &norm_data };
    const res = try normalizer.nfc(testing.allocator, "Complex char: \u{3D2}\u{301}");
    defer res.deinit();

    try testing.expectEqualStrings("Complex char: \u{3D3}", res.slice);
}
|
||||
|
||||
// U+03A5 + combining acute composes to the precomposed U+038E under NFKC.
test "nfkc" {
    var norm_data: NormData = undefined;
    try NormData.init(&norm_data, testing.allocator);
    defer norm_data.deinit();

    const normalizer = Self{ .norm_data = &norm_data };
    const res = try normalizer.nfkc(testing.allocator, "Complex char: \u{03A5}\u{0301}");
    defer res.deinit();

    try testing.expectEqualStrings("Complex char: \u{038E}", res.slice);
}
|
||||
|
||||
/// Tests for canonical equivalence: normalizes both `a` and `b` to NFC
/// and compares the results byte-for-byte.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    const norm_a = try self.nfc(allocator, a);
    defer norm_a.deinit();

    const norm_b = try self.nfc(allocator, b);
    defer norm_b.deinit();

    return mem.eql(u8, norm_a.slice, norm_b.slice);
}
|
||||
|
||||
// Precomposed and decomposed spellings of the same text must compare equal.
test "eql" {
    var norm_data: NormData = undefined;
    try NormData.init(&norm_data, testing.allocator);
    defer norm_data.deinit();

    const normalizer = Self{ .norm_data = &norm_data };
    try testing.expect(try normalizer.eql(testing.allocator, "foé", "foe\u{0301}"));
    try testing.expect(try normalizer.eql(testing.allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}
|
||||
|
||||
/// Returns true if `str` only contains Latin-1 code points
/// (U+0000..U+00FF). Uses SIMD if possible.
///
/// Bug fix: the comparison was `> 256`, which wrongly admitted U+0100.
/// The bound is now 255, so only true Latin-1 passes. This is strictly
/// more conservative, so the NFC quick-check caller remains correct.
pub fn isLatin1Only(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };

    // No SIMD support for u21 on this target: plain scalar scan.
    const vec_len = simd.suggestVectorLength(u21) orelse return while (cp_iter.next()) |cp| {
        if (cp.code > 255) break false;
    } else true;

    const Vec = @Vector(vec_len, u21);

    // Process the string a full vector of code points at a time.
    outer: while (true) {
        var v1: Vec = undefined;
        const saved_cp_i = cp_iter.i;

        for (0..vec_len) |i| {
            if (cp_iter.next()) |cp| {
                v1[i] = cp.code;
            } else {
                // Not enough code points left to fill a vector; rewind
                // and let the scalar tail below finish the job.
                cp_iter.i = saved_cp_i;
                break :outer;
            }
        }
        const v2: Vec = @splat(255);
        if (@reduce(.Or, v1 > v2)) return false;
    }

    // Scalar tail for the final, partial chunk.
    return while (cp_iter.next()) |cp| {
        if (cp.code > 255) break false;
    } else true;
}
|
||||
|
||||
// U+00FE/U+00FF are the top of Latin-1; U+03D3 is well outside it.
test "isLatin1Only" {
    try testing.expect(isLatin1Only("Hello, World! \u{fe} \u{ff}"));
    try testing.expect(!isLatin1Only("Héllo, World! \u{3d3}"));
}
|
||||
164
deps/zg/src/PropsData.zig
vendored
Normal file
164
deps/zg/src/PropsData.zig
vendored
Normal file
@@ -0,0 +1,164 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
allocator: mem.Allocator,
// Two-stage lookup tables (stage 1: code point high byte -> block offset;
// stage 2: one byte of property flag bits per code point), one pair per
// embedded data file:
// DerivedCoreProperties.txt
core_s1: []u16 = undefined,
core_s2: []u8 = undefined,
// PropList.txt
props_s1: []u16 = undefined,
props_s2: []u8 = undefined,
// DerivedNumericType.txt
num_s1: []u16 = undefined,
num_s2: []u8 = undefined,

const Self = @This();
|
||||
|
||||
/// Initializes the property tables by decompressing the embedded
/// UCD-derived data files. Caller must call `deinit` to free them.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Process DerivedCoreProperties.txt
    const core_bytes = @embedFile("core_props");
    var core_fbs = std.io.fixedBufferStream(core_bytes);
    var core_decomp = decompressor(.raw, core_fbs.reader());

    self.core_s1 = try readStage1(allocator, core_decomp.reader(), endian);
    errdefer allocator.free(self.core_s1);
    self.core_s2 = try readStage2(allocator, core_decomp.reader(), endian);
    errdefer allocator.free(self.core_s2);

    // Process PropList.txt
    const props_bytes = @embedFile("props");
    var props_fbs = std.io.fixedBufferStream(props_bytes);
    var props_decomp = decompressor(.raw, props_fbs.reader());

    self.props_s1 = try readStage1(allocator, props_decomp.reader(), endian);
    errdefer allocator.free(self.props_s1);
    self.props_s2 = try readStage2(allocator, props_decomp.reader(), endian);
    errdefer allocator.free(self.props_s2);

    // Process DerivedNumericType.txt
    const num_bytes = @embedFile("numeric");
    var num_fbs = std.io.fixedBufferStream(num_bytes);
    var num_decomp = decompressor(.raw, num_fbs.reader());

    self.num_s1 = try readStage1(allocator, num_decomp.reader(), endian);
    errdefer allocator.free(self.num_s1);
    self.num_s2 = try readStage2(allocator, num_decomp.reader(), endian);
    errdefer allocator.free(self.num_s2);

    return self;
}

// Reads a u16 length followed by that many u16 entries (a stage-1 index table).
// Caller owns the returned slice.
fn readStage1(allocator: mem.Allocator, reader: anytype, endian: std.builtin.Endian) ![]u16 {
    const len: u16 = try reader.readInt(u16, endian);
    const table = try allocator.alloc(u16, len);
    errdefer allocator.free(table);
    for (table) |*entry| entry.* = try reader.readInt(u16, endian);
    return table;
}

// Reads a u16 length followed by that many raw bytes (a stage-2 data table).
// Bug fix: the original used `_ = try reader.readAll(...)`, silently
// accepting a short read and leaving the tail of the table uninitialized;
// a truncated stream now fails with error.EndOfStream.
fn readStage2(allocator: mem.Allocator, reader: anytype, endian: std.builtin.Endian) ![]u8 {
    const len: u16 = try reader.readInt(u16, endian);
    const table = try allocator.alloc(u8, len);
    errdefer allocator.free(table);
    try reader.readNoEof(table);
    return table;
}
|
||||
|
||||
/// Frees every table allocated by `init`.
pub fn deinit(self: *const Self) void {
    inline for (.{
        self.core_s1,
        self.core_s2,
        self.props_s1,
        self.props_s2,
        self.num_s1,
        self.num_s2,
    }) |table| self.allocator.free(table);
}
|
||||
|
||||
// All property predicates share the same two-stage table walk: stage 1
// maps the code point's high byte to a block offset, stage 2 holds one
// byte of property flag bits per code point within that block. The walk
// is factored into one helper per data file.

// Flag bits from DerivedCoreProperties.txt.
fn coreFlags(self: Self, cp: u21) u8 {
    return self.core_s2[self.core_s1[cp >> 8] + (cp & 0xff)];
}

// Flag bits from PropList.txt.
fn propListFlags(self: Self, cp: u21) u8 {
    return self.props_s2[self.props_s1[cp >> 8] + (cp & 0xff)];
}

// Flag bits from DerivedNumericType.txt.
fn numFlags(self: Self, cp: u21) u8 {
    return self.num_s2[self.num_s1[cp >> 8] + (cp & 0xff)];
}

/// True if `cp` is a mathematical symbol.
pub fn isMath(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 1 == 1;
}

/// True if `cp` is an alphabetic character.
pub fn isAlphabetic(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 2 == 2;
}

/// True if `cp` is a valid identifier start character.
pub fn isIdStart(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 4 == 4;
}

/// True if `cp` is a valid identifier continuation character.
pub fn isIdContinue(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 8 == 8;
}

/// True if `cp` is a valid extended identifier start character.
pub fn isXidStart(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 16 == 16;
}

/// True if `cp` is a valid extended identifier continuation character.
pub fn isXidContinue(self: Self, cp: u21) bool {
    return self.coreFlags(cp) & 32 == 32;
}

/// True if `cp` is a whitespace character.
pub fn isWhitespace(self: Self, cp: u21) bool {
    return self.propListFlags(cp) & 1 == 1;
}

/// True if `cp` is a hexadecimal digit.
pub fn isHexDigit(self: Self, cp: u21) bool {
    return self.propListFlags(cp) & 2 == 2;
}

/// True if `cp` is a diacritic mark.
pub fn isDiacritic(self: Self, cp: u21) bool {
    return self.propListFlags(cp) & 4 == 4;
}

/// True if `cp` is numeric.
pub fn isNumeric(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 1 == 1;
}

/// True if `cp` is a digit.
pub fn isDigit(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 2 == 2;
}

/// True if `cp` is decimal.
pub fn isDecimal(self: Self, cp: u21) bool {
    return self.numFlags(cp) & 4 == 4;
}
|
||||
|
||||
test "Props" {
    const pd = try init(testing.allocator);
    defer pd.deinit();

    // PropList.txt: hex digits.
    try testing.expect(pd.isHexDigit('F'));
    try testing.expect(pd.isHexDigit('a'));
    try testing.expect(pd.isHexDigit('8'));
    try testing.expect(!pd.isHexDigit('z'));

    // Core / PropList properties.
    try testing.expect(pd.isDiacritic('\u{301}'));
    try testing.expect(pd.isAlphabetic('A'));
    try testing.expect(!pd.isAlphabetic('3'));
    try testing.expect(pd.isMath('+'));

    // Numeric types are mutually exclusive per code point: '1'..'3' are
    // Decimal, so they are neither Numeric nor Digit.
    try testing.expect(pd.isNumeric('\u{277f}'));
    try testing.expect(pd.isDigit('\u{2070}'));
    try testing.expect(pd.isDecimal('3'));
    try testing.expect(!pd.isNumeric('1'));
    try testing.expect(!pd.isDigit('2'));
    try testing.expect(!pd.isDecimal('g'));
}
|
||||
228
deps/zg/src/ScriptsData.zig
vendored
Normal file
228
deps/zg/src/ScriptsData.zig
vendored
Normal file
@@ -0,0 +1,228 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
/// Scripts
///
/// NOTE(review): `none` must stay first — ordinal 0 means "no script" in
/// `script()` — and the remaining variants must keep this exact order,
/// since the embedded data file stores scripts as ordinals into this enum
/// (`@enumFromInt` in `script()`).
pub const Script = enum {
    none,
    // zig fmt: off
    Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian,
    Avestan, Balinese, Bamum, Bassa_Vah, Batak,
    Bengali, Bhaiksuki, Bopomofo, Brahmi, Braille,
    Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Albanian,
    Chakma, Cham, Cherokee, Chorasmian, Common,
    Coptic, Cuneiform, Cypriot, Cypro_Minoan, Cyrillic,
    Deseret, Devanagari, Dives_Akuru, Dogra, Duployan,
    Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Georgian,
    Glagolitic, Gothic, Grantha, Greek, Gujarati,
    Gunjala_Gondi, Gurmukhi, Han, Hangul, Hanifi_Rohingya,
    Hanunoo, Hatran, Hebrew, Hiragana, Imperial_Aramaic,
    Inherited, Inscriptional_Pahlavi, Inscriptional_Parthian, Javanese, Kaithi,
    Kannada, Katakana, Kawi, Kayah_Li, Kharoshthi,
    Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao,
    Latin, Lepcha, Limbu, Linear_A, Linear_B,
    Lisu, Lycian, Lydian, Mahajani, Makasar,
    Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi,
    Medefaidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hieroglyphs,
    Miao, Modi, Mongolian, Mro, Multani,
    Myanmar, Nabataean, Nag_Mundari, Nandinagari, New_Tai_Lue,
    Newa, Nko, Nushu, Nyiakeng_Puachue_Hmong, Ogham,
    Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
    Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
    Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene,
    Pau_Cin_Hau, Phags_Pa, Phoenician, Psalter_Pahlavi, Rejang,
    Runic, Samaritan, Saurashtra, Sharada, Shavian,
    Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
    Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog,
    Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri,
    Tamil, Tangsa, Tangut, Telugu, Thaana,
    Thai, Tibetan, Tifinagh, Tirhuta, Toto,
    Ugaritic, Vai, Vithkuqi, Wancho, Warang_Citi,
    Yezidi, Yi, Zanabazar_Square,
    // zig fmt: on
};
|
||||
|
||||
allocator: mem.Allocator,
// Three-stage lookup (see `script()`): s1 maps (cp >> 8) to a block
// offset, s2 maps the code point within the block to an index, and s3
// maps that index to a `Script` ordinal (0 = no script).
s1: []u16 = undefined,
s2: []u8 = undefined,
s3: []u8 = undefined,

const Self = @This();
|
||||
|
||||
/// Initializes the script lookup tables by decompressing the embedded
/// data file. Caller must call `deinit` to free them.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("scripts");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{ .allocator = allocator };

    // Stage 1: u16 length followed by that many u16 block offsets.
    const s1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, s1_len);
    errdefer allocator.free(self.s1);
    for (0..s1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    // Stage 2: u16 length followed by raw index bytes.
    // NOTE(review): the result of readAll is discarded, so a short read
    // would go unnoticed — presumably the embedded data is always
    // complete; confirm against the data generator.
    const s2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(u8, s2_len);
    errdefer allocator.free(self.s2);
    _ = try reader.readAll(self.s2);

    // Stage 3: a single-byte length (the script table has fewer than 256
    // entries), then raw Script ordinals.
    // NOTE(review): readInt(u8, ...) here differs from the u16 lengths
    // above — it looks deliberate, but confirm it matches the on-disk
    // format produced by the generator before changing anything.
    const s3_len: u16 = try reader.readInt(u8, endian);
    self.s3 = try allocator.alloc(u8, s3_len);
    errdefer allocator.free(self.s3);
    _ = try reader.readAll(self.s3);

    return self;
}
|
||||
|
||||
/// Releases all three lookup stages allocated by `init`.
pub fn deinit(self: *const Self) void {
    inline for (.{ self.s1, self.s2, self.s3 }) |stage| self.allocator.free(stage);
}
|
||||
|
||||
/// Lookup the Script type for `cp`. Returns null when the data assigns
/// no script (ordinal 0) to the code point.
pub fn script(self: Self, cp: u21) ?Script {
    const ordinal = self.s3[self.s2[self.s1[cp >> 8] + (cp & 0xff)]];
    return if (ordinal == 0) null else @enumFromInt(ordinal);
}
|
||||
|
||||
test "script" {
    const sd = try init(std.testing.allocator);
    defer sd.deinit();

    try testing.expectEqual(Script.Latin, sd.script('A').?);
}
|
||||
84
deps/zg/src/WidthData.zig
vendored
Normal file
84
deps/zg/src/WidthData.zig
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const compress = std.compress;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
|
||||
const GraphemeData = @import("GraphemeData");
|
||||
|
||||
allocator: mem.Allocator,
// Grapheme cluster break data; owned by this struct (freed in `deinit`).
g_data: GraphemeData,
// Two-stage width lookup: s1 maps (cp >> 8) to a block offset, s2 holds
// the width in cells (-1..3) for each code point within the block.
s1: []u16 = undefined,
s2: []i3 = undefined,

const Self = @This();
|
||||
|
||||
/// Initializes the width tables (and the owned `GraphemeData`) by
/// decompressing the embedded data file. Caller must call `deinit`.
pub fn init(allocator: mem.Allocator) !Self {
    const decompressor = compress.flate.inflate.decompressor;
    const in_bytes = @embedFile("dwp");
    var in_fbs = std.io.fixedBufferStream(in_bytes);
    var in_decomp = decompressor(.raw, in_fbs.reader());
    var reader = in_decomp.reader();

    const endian = builtin.cpu.arch.endian();

    var self = Self{
        .allocator = allocator,
        .g_data = try GraphemeData.init(allocator),
    };
    errdefer self.g_data.deinit();

    // Stage 1: u16 length followed by that many u16 block offsets.
    const stage_1_len: u16 = try reader.readInt(u16, endian);
    self.s1 = try allocator.alloc(u16, stage_1_len);
    errdefer allocator.free(self.s1);
    for (0..stage_1_len) |i| self.s1[i] = try reader.readInt(u16, endian);

    // Stage 2: u16 length followed by i8 widths narrowed to i3.
    // NOTE(review): @intCast is a safety-checked assertion that each stored
    // width fits in i3 (-4..3) — presumably the generator guarantees the
    // -1..3 range documented on `codePointWidth`; confirm.
    const stage_2_len: u16 = try reader.readInt(u16, endian);
    self.s2 = try allocator.alloc(i3, stage_2_len);
    errdefer allocator.free(self.s2);
    for (0..stage_2_len) |i| self.s2[i] = @intCast(try reader.readInt(i8, endian));

    return self;
}
|
||||
|
||||
/// Frees the width tables and the owned grapheme data.
pub fn deinit(self: *const Self) void {
    self.g_data.deinit();
    self.allocator.free(self.s1);
    self.allocator.free(self.s2);
}
|
||||
|
||||
/// codePointWidth returns the number of cells `cp` requires when rendered
/// in a fixed-pitch font (i.e. a terminal screen). This can range from -1 to
/// 3, where BACKSPACE and DELETE return -1 and 3-em-dash returns 3. C0/C1
/// control codes return 0.
/// NOTE(review): an earlier revision of this comment described a `cjk`
/// parameter for ambiguous-width handling; no such parameter exists —
/// ambiguous code points resolve to whatever the embedded table encodes
/// (1, per the "ambiguous halfwidth" case in the test below).
pub fn codePointWidth(self: Self, cp: u21) i3 {
    return self.s2[self.s1[cp >> 8] + (cp & 0xff)];
}
|
||||
|
||||
test "codePointWidth" {
    // Bug fix: `codePointWidth` is a method taking `self`; the previous
    // version called it as a free function and could not compile.
    const self = try init(testing.allocator);
    defer self.deinit();

    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0000)); // null
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x8)); // \b
    try testing.expectEqual(@as(i3, -1), self.codePointWidth(0x7f)); // DEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0005)); // Cf
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x0007)); // \a BEL
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000A)); // \n LF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000B)); // \v VT
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000C)); // \f FF
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000D)); // \r CR
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000E)); // SO
    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x000F)); // SI

    try testing.expectEqual(@as(i3, 0), self.codePointWidth(0x070F)); // Cf
    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x0603)); // Cf Arabic

    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00AD)); // soft-hyphen
    try testing.expectEqual(@as(i3, 2), self.codePointWidth(0x2E3A)); // two-em dash
    try testing.expectEqual(@as(i3, 3), self.codePointWidth(0x2E3B)); // three-em dash

    try testing.expectEqual(@as(i3, 1), self.codePointWidth(0x00BD)); // ambiguous halfwidth

    try testing.expectEqual(@as(i3, 1), self.codePointWidth('é'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('😊'));
    try testing.expectEqual(@as(i3, 2), self.codePointWidth('统'));
}
|
||||
33
deps/zg/src/ascii.zig
vendored
Normal file
33
deps/zg/src/ascii.zig
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
const std = @import("std");
|
||||
const simd = std.simd;
|
||||
const testing = std.testing;
|
||||
|
||||
/// Returns true if `str` only contains ASCII bytes. Uses SIMD if possible.
pub fn isAsciiOnly(str: []const u8) bool {
    // Scalar fallback when the target has no suitable vector size.
    const vec_len = simd.suggestVectorLength(u8) orelse return for (str) |b| {
        if (b > 127) break false;
    } else true;

    const Vec = @Vector(vec_len, u8);
    var rest = str;

    // Scan a full vector's worth of bytes at a time.
    while (rest.len >= vec_len) {
        const chunk: Vec = rest[0..vec_len].*;
        const limit: Vec = @splat(127);
        if (@reduce(.Or, chunk > limit)) return false;
        rest = rest[vec_len..];
    }

    // Scalar tail (fewer than vec_len bytes remain).
    for (rest) |b| {
        if (b > 127) return false;
    }
    return true;
}
|
||||
|
||||
test "isAsciiOnly" {
    try testing.expect(isAsciiOnly("Hello, World! 0123456789 !@#$%^&*()_-=+"));
    // A single non-ASCII byte anywhere must flip the result.
    try testing.expect(!isAsciiOnly("Héllo, World! 0123456789 !@#$%^&*()_-=+"));
}
|
||||
118
deps/zg/src/code_point.zig
vendored
Normal file
118
deps/zg/src/code_point.zig
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
const std = @import("std");
|
||||
|
||||
/// `CodePoint` represents a Unicode code point by its code,
/// length, and offset in the source bytes.
pub const CodePoint = struct {
    code: u21, // decoded scalar value (U+FFFD for malformed input)
    len: u3, // number of source bytes consumed (1-4)
    offset: u32, // byte offset of this code point in the source
};

/// Decodes the code point at the start of `bytes`; `offset` is recorded
/// verbatim in the result. Returns null for empty input. An invalid lead
/// byte or a truncated sequence yields U+FFFD with len 1; continuation
/// bytes are NOT validated (the input is assumed to be UTF-8).
pub fn decode(bytes: []const u8, offset: u32) ?CodePoint {
    // EOS fast path.
    if (bytes.len == 0) return null;

    const lead = bytes[0];

    // ASCII fast path.
    if (lead < 0x80) return .{ .code = lead, .len = 1, .offset = offset };

    // Determine the sequence length from the lead byte's high bits.
    const seq_len: u3 = switch (lead) {
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        // Stray continuation byte or out-of-range lead: replacement char,
        // consuming a single byte.
        else => return .{ .code = 0xfffd, .len = 1, .offset = offset },
    };

    // Truncated sequence at end of input: replacement char, one byte.
    if (seq_len > bytes.len) return .{ .code = 0xfffd, .len = 1, .offset = offset };

    // Accumulate payload bits: the lead byte contributes its low bits,
    // each continuation byte contributes its low six bits.
    var code: u21 = switch (seq_len) {
        2 => lead & 0b0001_1111,
        3 => lead & 0b0000_1111,
        else => lead & 0b0000_0111,
    };
    for (bytes[1..seq_len]) |cont| {
        code = (code << 6) | (cont & 0b0011_1111);
    }

    return .{ .code = code, .len = seq_len, .offset = offset };
}
|
||||
|
||||
/// `Iterator` iterates a string one `CodePoint` at-a-time.
pub const Iterator = struct {
    bytes: []const u8,
    i: u32 = 0,

    /// Returns the next code point and advances the iterator,
    /// or null at end of input.
    pub fn next(self: *Iterator) ?CodePoint {
        if (self.i >= self.bytes.len) return null;

        const cp = decode(self.bytes[self.i..], self.i) orelse return null;
        self.i += cp.len;
        return cp;
    }

    /// Returns the next code point without advancing the iterator.
    pub fn peek(self: *Iterator) ?CodePoint {
        const mark = self.i;
        defer self.i = mark;
        return self.next();
    }
};
|
||||
|
||||
test "decode" {
    // U+1F329 CLOUD WITH LIGHTNING followed by a variation selector;
    // decode only consumes the first code point (4 bytes).
    const res = decode("🌩️", 0) orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(u21, 0x1F329), res.code);
    try std.testing.expectEqual(4, res.len);
}
|
||||
|
||||
test "peek" {
    var iter = Iterator{ .bytes = "Hi" };

    const first = iter.next().?;
    try std.testing.expectEqual(@as(u21, 'H'), first.code);

    // peek must not advance: 'i' is observed twice.
    const peeked = iter.peek().?;
    const consumed = iter.next().?;
    try std.testing.expectEqual(@as(u21, 'i'), peeked.code);
    try std.testing.expectEqual(@as(u21, 'i'), consumed.code);

    // Exhausted: both peek and next return null.
    try std.testing.expectEqual(@as(?CodePoint, null), iter.peek());
    try std.testing.expectEqual(@as(?CodePoint, null), iter.next());
}
|
||||
258
deps/zg/src/grapheme.zig
vendored
Normal file
258
deps/zg/src/grapheme.zig
vendored
Normal file
@@ -0,0 +1,258 @@
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
const unicode = std.unicode;
|
||||
|
||||
const CodePoint = @import("code_point").CodePoint;
|
||||
const CodePointIterator = @import("code_point").Iterator;
|
||||
pub const GraphemeData = @import("GraphemeData");
|
||||
|
||||
/// `Grapheme` represents a Unicode grapheme cluster by its length and offset in the source bytes.
pub const Grapheme = struct {
    len: u8,
    offset: u32,

    /// `bytes` returns the slice of bytes that correspond to
    /// this grapheme cluster in `src`.
    pub fn bytes(self: Grapheme, src: []const u8) []const u8 {
        return src[self.offset .. self.offset + self.len];
    }
};
|
||||
|
||||
/// `Iterator` iterates a string of UTF-8 encoded bytes one grapheme cluster at-a-time.
pub const Iterator = struct {
    // Two-code-point lookahead window: buf[0] is the current code point,
    // buf[1] the next one (null at end of input).
    buf: [2]?CodePoint = .{ null, null },
    cp_iter: CodePointIterator,
    data: *const GraphemeData,

    const Self = @This();

    /// Assumes `src` is valid UTF-8.
    pub fn init(str: []const u8, data: *const GraphemeData) Self {
        var self = Self{ .cp_iter = .{ .bytes = str }, .data = data };
        // Prime the lookahead so the first `next()` sees buf[0]/buf[1].
        self.advance();
        return self;
    }

    // Slides the lookahead window forward by one code point.
    fn advance(self: *Self) void {
        self.buf[0] = self.buf[1];
        self.buf[1] = self.cp_iter.next();
    }

    /// Returns the next grapheme cluster, or null at end of input.
    pub fn next(self: *Self) ?Grapheme {
        self.advance();

        // If no more
        if (self.buf[0] == null) return null;
        // If last one
        if (self.buf[1] == null) return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
        // If ASCII: a break always follows, except after CR (CRLF is one
        // cluster), so the full break algorithm can be skipped.
        if (self.buf[0].?.code != '\r' and self.buf[0].?.code < 128 and self.buf[1].?.code < 128) {
            return Grapheme{ .len = self.buf[0].?.len, .offset = self.buf[0].?.offset };
        }

        const gc_start = self.buf[0].?.offset;
        // NOTE(review): gc_len is u8, so a single cluster longer than 255
        // bytes would overflow — presumably out of scope; confirm upstream.
        var gc_len: u8 = self.buf[0].?.len;
        var state = State{};

        if (graphemeBreak(
            self.buf[0].?.code,
            self.buf[1].?.code,
            self.data,
            &state,
        )) return Grapheme{ .len = gc_len, .offset = gc_start };

        // Keep absorbing code points until the break algorithm says the
        // cluster ends (or input runs out).
        while (true) {
            self.advance();
            if (self.buf[0] == null) break;

            gc_len += self.buf[0].?.len;

            if (graphemeBreak(
                self.buf[0].?.code,
                if (self.buf[1]) |ncp| ncp.code else 0,
                self.data,
                &state,
            )) break;
        }

        return Grapheme{ .len = gc_len, .offset = gc_start };
    }
};
|
||||
|
||||
// Predicates
// True if `cp` unconditionally terminates a grapheme cluster:
// CR, LF, or a Control grapheme-break property.
fn isBreaker(cp: u21, data: *const GraphemeData) bool {
    if (cp == '\x0d' or cp == '\x0a') return true;
    return data.gbp(cp) == .Control;
}
|
||||
|
||||
// Grapheme break state.
pub const State = struct {
    // Bit flags for in-progress multi-code-point sequences:
    // bit 0 = Extended Pictographic (emoji), bit 1 = Regional Indicator,
    // bit 2 = Indic Conjunct.
    bits: u3 = 0,

    const xpic_bit: u3 = 1;
    const regional_bit: u3 = 2;
    const indic_bit: u3 = 4;

    // Extended Pictographic (emoji)
    fn hasXpic(self: State) bool {
        return self.bits & xpic_bit != 0;
    }
    fn setXpic(self: *State) void {
        self.bits |= xpic_bit;
    }
    fn unsetXpic(self: *State) void {
        // AND-NOT clears the bit unconditionally. The previous XOR toggled,
        // which would *set* the bit if called while it was already clear;
        // this makes unset idempotent and safe from any call site.
        self.bits &= ~xpic_bit;
    }

    // Regional Indicator (flags)
    fn hasRegional(self: State) bool {
        return self.bits & regional_bit != 0;
    }
    fn setRegional(self: *State) void {
        self.bits |= regional_bit;
    }
    fn unsetRegional(self: *State) void {
        self.bits &= ~regional_bit;
    }

    // Indic Conjunct
    fn hasIndic(self: State) bool {
        return self.bits & indic_bit != 0;
    }
    fn setIndic(self: *State) void {
        self.bits |= indic_bit;
    }
    fn unsetIndic(self: *State) void {
        self.bits &= ~indic_bit;
    }
};
|
||||
|
||||
/// `graphemeBreak` returns true only if a grapheme break point is required
/// between `cp1` and `cp2`. `state` should start out as 0. If calling
/// iteratively over a sequence of code points, this function must be called
/// IN ORDER on ALL potential breaks in a string.
/// Modeled after the API of utf8proc's `utf8proc_grapheme_break_stateful`.
/// https://github.com/JuliaStrings/utf8proc/blob/2bbb1ba932f727aad1fab14fafdbc89ff9dc4604/utf8proc.h#L599-L617
pub fn graphemeBreak(
    cp1: u21,
    cp2: u21,
    data: *const GraphemeData,
    state: *State,
) bool {
    // Extract relevant properties.
    const cp1_gbp_prop = data.gbp(cp1);
    const cp1_indic_prop = data.indic(cp1);
    const cp1_is_emoji = data.isEmoji(cp1);

    const cp2_gbp_prop = data.gbp(cp2);
    const cp2_indic_prop = data.indic(cp2);
    const cp2_is_emoji = data.isEmoji(cp2);

    // GB11: record that an Emoji (Extended Pictographic) sequence has begun.
    if (!state.hasXpic() and cp1_is_emoji) state.setXpic();
    // GB9c: record that an Indic Conjunct sequence (Consonant) has begun.
    if (!state.hasIndic() and cp1_indic_prop == .Consonant) state.setIndic();

    // GB3: CR x LF
    if (cp1 == '\r' and cp2 == '\n') return false;

    // GB4: always break after CR, LF, or Control.
    if (isBreaker(cp1, data)) return true;

    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.hasXpic() and
        cp1_gbp_prop == .ZWJ and
        cp2_is_emoji)
    {
        state.unsetXpic();
        return false;
    }

    // GB9: x (Extend | ZWJ)
    if (cp2_gbp_prop == .Extend or cp2_gbp_prop == .ZWJ) return false;

    // GB9a: x SpacingMark
    if (cp2_gbp_prop == .SpacingMark) return false;

    // GB9b: Prepend x
    if (cp1_gbp_prop == .Prepend and !isBreaker(cp2, data)) return false;

    // GB12, GB13: RI x RI — join regional indicators pairwise; the state bit
    // alternates so every second RI starts a new cluster.
    if (cp1_gbp_prop == .Regional_Indicator and cp2_gbp_prop == .Regional_Indicator) {
        if (state.hasRegional()) {
            state.unsetRegional();
            return true;
        } else {
            state.setRegional();
            return false;
        }
    }

    // GB6: Hangul L x (L|V|LV|VT)
    if (cp1_gbp_prop == .L) {
        if (cp2_gbp_prop == .L or
            cp2_gbp_prop == .V or
            cp2_gbp_prop == .LV or
            cp2_gbp_prop == .LVT) return false;
    }

    // GB7: Hangul (LV | V) x (V | T)
    if (cp1_gbp_prop == .LV or cp1_gbp_prop == .V) {
        if (cp2_gbp_prop == .V or
            cp2_gbp_prop == .T) return false;
    }

    // GB8: Hangul (LVT | T) x T
    if (cp1_gbp_prop == .LVT or cp1_gbp_prop == .T) {
        if (cp2_gbp_prop == .T) return false;
    }

    // GB9c: Indic Conjunct Break — Consonant x (Extend | Linker)
    if (state.hasIndic() and
        cp1_indic_prop == .Consonant and
        (cp2_indic_prop == .Extend or cp2_indic_prop == .Linker))
    {
        return false;
    }

    // GB9c: Extend x Linker within an Indic sequence.
    if (state.hasIndic() and
        cp1_indic_prop == .Extend and
        cp2_indic_prop == .Linker)
    {
        return false;
    }

    // GB9c: (Linker | ZWJ) x Consonant completes the conjunct; reset state.
    if (state.hasIndic() and
        (cp1_indic_prop == .Linker or cp1_gbp_prop == .ZWJ) and
        cp2_indic_prop == .Consonant)
    {
        state.unsetIndic();
        return false;
    }

    // GB999: break everywhere else.
    return true;
}
|
||||
|
||||
test "Segmentation ZWJ and ZWSP emoji sequences" {
    // Two identical emoji ZWJ sequences, joined in three different ways.
    const seq_1 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
    const seq_2 = "\u{1F43B}\u{200D}\u{2744}\u{FE0F}";
    const with_zwj = seq_1 ++ "\u{200D}" ++ seq_2;
    const with_zwsp = seq_1 ++ "\u{200B}" ++ seq_2;
    const no_joiner = seq_1 ++ seq_2;

    const data = try GraphemeData.init(std.testing.allocator);
    defer data.deinit();

    // A ZWJ between the sequences fuses everything into one cluster.
    var iter = Iterator.init(with_zwj, &data);
    var count: usize = 0;
    while (iter.next()) |_| count += 1;
    try std.testing.expectEqual(@as(usize, 1), count);

    // A ZWSP does not join: each sequence plus the ZWSP itself = 3 clusters.
    iter = Iterator.init(with_zwsp, &data);
    count = 0;
    while (iter.next()) |_| count += 1;
    try std.testing.expectEqual(@as(usize, 3), count);

    // With no joiner, the two sequences are two separate clusters.
    iter = Iterator.init(no_joiner, &data);
    count = 0;
    while (iter.next()) |_| count += 1;
    try std.testing.expectEqual(@as(usize, 2), count);
}
|
||||
195
deps/zg/src/unicode_tests.zig
vendored
Normal file
195
deps/zg/src/unicode_tests.zig
vendored
Normal file
@@ -0,0 +1,195 @@
|
||||
const std = @import("std");
|
||||
const fmt = std.fmt;
|
||||
const fs = std.fs;
|
||||
const io = std.io;
|
||||
const heap = std.heap;
|
||||
const mem = std.mem;
|
||||
const testing = std.testing;
|
||||
const unicode = std.unicode;
|
||||
|
||||
const Grapheme = @import("grapheme").Grapheme;
|
||||
const GraphemeData = @import("grapheme").GraphemeData;
|
||||
const GraphemeIterator = @import("grapheme").Iterator;
|
||||
const Normalize = @import("Normalize");
|
||||
|
||||
test "Unicode normalization tests" {
    var arena = heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Local helper: decode a test-file field of space-separated hex code
    // points (e.g. "0044 0307") into freshly-allocated UTF-8 bytes.
    // Caller owns the returned slice. Replaces five identical copies of
    // this parsing loop in the original.
    const Decode = struct {
        fn utf8FromCodePoints(alloc: mem.Allocator, field: []const u8) ![]u8 {
            var bytes = std.ArrayList(u8).init(alloc);
            defer bytes.deinit();
            var cp_buf: [4]u8 = undefined;
            var cps = mem.split(u8, field, " ");
            while (cps.next()) |s| {
                const cp = try fmt.parseInt(u21, s, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try bytes.appendSlice(cp_buf[0..len]);
            }
            return bytes.toOwnedSlice();
        }
    };

    var norm_data: Normalize.NormData = undefined;
    try Normalize.NormData.init(&norm_data, allocator);
    const n = Normalize{ .norm_data = &norm_data };

    var file = try fs.cwd().openFile("data/unicode/NormalizationTest.txt", .{});
    defer file.close();
    var buf_reader = io.bufferedReader(file.reader());
    const input_stream = buf_reader.reader();

    var line_no: usize = 0;
    var buf: [4096]u8 = undefined;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |line| {
        line_no += 1;
        // Skip comments, empty lines, and "@Part" section headers.
        if (line.len == 0 or line[0] == '#' or line[0] == '@') continue;

        // Field layout per line: source;NFC;NFD;NFKC;NFKD;...
        var fields = mem.split(u8, line, ";");
        var field_index: usize = 0;
        var input: []u8 = undefined;
        defer allocator.free(input);

        while (fields.next()) |field| : (field_index += 1) {
            if (field_index == 0) {
                // Source string for this test case; normalized below.
                input = try Decode.utf8FromCodePoints(allocator, field);
            } else if (field_index == 1) {
                // NFC, time to test.
                const want = try Decode.utf8FromCodePoints(allocator, field);
                defer allocator.free(want);

                var got = try n.nfc(allocator, input);
                defer got.deinit();

                try testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 2) {
                // NFD, time to test.
                const want = try Decode.utf8FromCodePoints(allocator, field);
                defer allocator.free(want);

                var got = try n.nfd(allocator, input);
                defer got.deinit();

                try testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 3) {
                // NFKC, time to test.
                const want = try Decode.utf8FromCodePoints(allocator, field);
                defer allocator.free(want);

                var got = try n.nfkc(allocator, input);
                defer got.deinit();

                try testing.expectEqualStrings(want, got.slice);
            } else if (field_index == 4) {
                // NFKD, time to test.
                const want = try Decode.utf8FromCodePoints(allocator, field);
                defer allocator.free(want);

                const got = try n.nfkd(allocator, input);
                defer got.deinit();

                try testing.expectEqualStrings(want, got.slice);
            } else {
                // Remaining fields (comments etc.) are not checked.
                continue;
            }
        }
    }
}
|
||||
|
||||
test "Segmentation GraphemeIterator" {
    const allocator = std.testing.allocator;
    var file = try std.fs.cwd().openFile("data/unicode/auxiliary/GraphemeBreakTest.txt", .{});
    defer file.close();
    var buf_reader = std.io.bufferedReader(file.reader());
    var input_stream = buf_reader.reader();

    const data = try GraphemeData.init(allocator);
    defer data.deinit();

    var buf: [4096]u8 = undefined;
    var line_no: usize = 1;

    while (try input_stream.readUntilDelimiterOrEof(&buf, '\n')) |raw| : (line_no += 1) {
        // Skip comments or empty lines.
        if (raw.len == 0 or raw[0] == '#' or raw[0] == '@') continue;

        // Clean up: strip the leading "÷ " break marker and everything from
        // the trailing " ÷\t#" (the per-line comment) onward.
        var line = std.mem.trimLeft(u8, raw, "÷ ");
        if (std.mem.indexOf(u8, line, " ÷\t#")) |octo| {
            line = line[0..octo];
        }
        // Iterate over fields: each " ÷ "-separated field is one expected
        // grapheme cluster.
        var want = std.ArrayList(Grapheme).init(allocator);
        defer want.deinit();

        // The full test string, built from all code points on the line.
        var all_bytes = std.ArrayList(u8).init(allocator);
        defer all_bytes.deinit();

        var graphemes = std.mem.split(u8, line, " ÷ ");
        var bytes_index: u32 = 0;

        while (graphemes.next()) |field| {
            var code_points = std.mem.split(u8, field, " ");
            var cp_buf: [4]u8 = undefined;
            var cp_index: u32 = 0;
            var gc_len: u8 = 0;

            while (code_points.next()) |code_point| {
                // "×" marks "no break here" between code points; it carries
                // no code point of its own.
                if (std.mem.eql(u8, code_point, "×")) continue;
                const cp: u21 = try std.fmt.parseInt(u21, code_point, 16);
                const len = try unicode.utf8Encode(cp, &cp_buf);
                try all_bytes.appendSlice(cp_buf[0..len]);
                cp_index += len;
                gc_len += len;
            }

            try want.append(Grapheme{ .len = gc_len, .offset = bytes_index });
            bytes_index += cp_index;
        }

        // std.debug.print("\nline {}: {s}\n", .{ line_no, all_bytes.items });
        var iter = GraphemeIterator.init(all_bytes.items, &data);

        // Check: the iterator must yield exactly the expected clusters,
        // byte-for-byte.
        for (want.items) |want_gc| {
            const got_gc = (iter.next()).?;
            try std.testing.expectEqualStrings(
                want_gc.bytes(all_bytes.items),
                got_gc.bytes(all_bytes.items),
            );
        }
    }
}
|
||||
Reference in New Issue
Block a user