I will never get tired of vendoring dependencies. ha ha. It is possible I am insane. I had to do a lot of pruning to get these not to be ridiculous (especially the Unicode data, which had nearly 1 million lines of... stuff).
623 lines
19 KiB
Zig
//! Normalizer contains functions and methods that implement
//! Unicode Normalization. You can normalize strings into NFC,
//! NFKC, NFD, and NFKD normalization forms.

const std = @import("std");
const debug = std.debug;
const assert = debug.assert;
const fmt = std.fmt;
const heap = std.heap;
const mem = std.mem;
const simd = std.simd;
const testing = std.testing;
const unicode = std.unicode;

const ascii = @import("ascii");
const CodePointIterator = @import("code_point").Iterator;
pub const NormData = @import("NormData");

norm_data: *const NormData,

const Self = @This();

const SBase: u21 = 0xAC00;
const LBase: u21 = 0x1100;
const VBase: u21 = 0x1161;
const TBase: u21 = 0x11A7;
const LCount: u21 = 19;
const VCount: u21 = 21;
const TCount: u21 = 28;
const NCount: u21 = 588; // VCount * TCount
const SCount: u21 = 11172; // LCount * NCount
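// These constants implement the algorithmic Hangul mapping from the Unicode
// core spec: S = SBase + (LIndex * VCount + VIndex) * TCount + TIndex.
// Worked example: U+AC01 has SIndex = 1, which splits into L = U+1100,
// V = U+1161, and T = TBase + 1 = U+11A8.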

fn decomposeHangul(self: Self, cp: u21, buf: []u21) ?Decomp {
    const kind = self.norm_data.hangul_data.syllable(cp);
    if (kind != .LV and kind != .LVT) return null;

    const SIndex: u21 = cp - SBase;
    const LIndex: u21 = SIndex / NCount;
    const VIndex: u21 = (SIndex % NCount) / TCount;
    const TIndex: u21 = SIndex % TCount;
    const LPart: u21 = LBase + LIndex;
    const VPart: u21 = VBase + VIndex;

    var dc = Decomp{ .form = .nfd };
    buf[0] = LPart;
    buf[1] = VPart;

    if (TIndex == 0) {
        dc.cps = buf[0..2];
        return dc;
    }

    // TPart
    buf[2] = TBase + TIndex;
    dc.cps = buf[0..3];
    return dc;
}

fn composeHangulCanon(lv: u21, t: u21) u21 {
    assert(0x11A8 <= t and t <= 0x11C2);
    return lv + (t - TBase);
}

fn composeHangulFull(l: u21, v: u21, t: u21) u21 {
    assert(0x1100 <= l and l <= 0x1112);
    assert(0x1161 <= v and v <= 0x1175);
    const LIndex = l - LBase;
    const VIndex = v - VBase;
    const LVIndex = LIndex * NCount + VIndex * TCount;

    if (t == 0) return SBase + LVIndex;

    assert(0x11A8 <= t and t <= 0x11C2);
    const TIndex = t - TBase;

    return SBase + LVIndex + TIndex;
}
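// Round trip of the example above: composeHangulFull(0x1100, 0x1161, 0x11A8)
// recombines those three Jamo into 0xAC01.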

const Form = enum {
    nfc,
    nfd,
    nfkc,
    nfkd,
    same,
};

const Decomp = struct {
    form: Form = .same,
    cps: []const u21 = &.{},
};
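// The default `Decomp` (form .same, empty cps) means "this code point maps
// to itself"; callers branch on that rather than copying the code point.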

// `mapping` retrieves the decomposition mapping for a code point as per the UCD.
fn mapping(self: Self, cp: u21, form: Form) Decomp {
    var dc = Decomp{};

    switch (form) {
        .nfd => {
            dc.cps = self.norm_data.canon_data.toNfd(cp);
            if (dc.cps.len != 0) dc.form = .nfd;
        },

        .nfkd => {
            dc.cps = self.norm_data.compat_data.toNfkd(cp);
            if (dc.cps.len != 0) {
                dc.form = .nfkd;
            } else {
                dc.cps = self.norm_data.canon_data.toNfd(cp);
                if (dc.cps.len != 0) dc.form = .nfkd;
            }
        },

        else => @panic("Normalizer.mapping only accepts form .nfd or .nfkd."),
    }

    return dc;
}

// `decompose` maps a code point to the given normalization form, which must
// be either `.nfd` or `.nfkd`.
fn decompose(
    self: Self,
    cp: u21,
    form: Form,
    buf: []u21,
) Decomp {
    // ASCII
    if (cp < 128) return .{};

    // NFD / NFKD quick checks.
    switch (form) {
        .nfd => if (self.norm_data.normp_data.isNfd(cp)) return .{},
        .nfkd => if (self.norm_data.normp_data.isNfkd(cp)) return .{},
        else => @panic("Normalizer.decompose only accepts form .nfd or .nfkd."),
    }

    // Hangul precomposed syllable full decomposition.
    if (self.decomposeHangul(cp, buf)) |dc| return dc;

    // Full decomposition.
    var dc = Decomp{ .form = form };

    var result_index: usize = 0;
    var work_index: usize = 1;

    // Start work with argument code point.
    var work = [_]u21{cp} ++ [_]u21{0} ** 17;

    while (work_index > 0) {
        // Look at previous code point in work queue.
        work_index -= 1;
        const next = work[work_index];
        const m = self.mapping(next, form);

        // No more decompositions for this code point.
        if (m.form == .same) {
            buf[result_index] = next;
            result_index += 1;
            continue;
        }

        // Work backwards through the decomposition.
        // `i` starts at 1 so `m.cps.len - i` indexes from the last code point.
        var i: usize = 1;
        while (i <= m.cps.len) : ({
            i += 1;
            work_index += 1;
        }) {
            work[work_index] = m.cps[m.cps.len - i];
        }
    }

    dc.cps = buf[0..result_index];

    return dc;
}

test "decompose" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    var n = Self{ .norm_data = &data };

    var buf: [18]u21 = undefined;

    var dc = n.decompose('é', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'e', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{1e0a}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ 'D', '\u{307}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3189}', .nfd, &buf);
    try testing.expect(dc.form == .same);
    try testing.expect(dc.cps.len == 0);

    dc = n.decompose('\u{3189}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{'\u{1188}'}, dc.cps[0..1]);

    dc = n.decompose('\u{ace1}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{ace1}', .nfkd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{1100}', '\u{1169}', '\u{11a8}' }, dc.cps[0..3]);

    dc = n.decompose('\u{3d3}', .nfd, &buf);
    try testing.expect(dc.form == .nfd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3d2}', '\u{301}' }, dc.cps[0..2]);

    dc = n.decompose('\u{3d3}', .nfkd, &buf);
    try testing.expect(dc.form == .nfkd);
    try testing.expectEqualSlices(u21, &[_]u21{ '\u{3a5}', '\u{301}' }, dc.cps[0..2]);
}

/// Returned from various functions in this namespace. Remember to call `deinit` to free any allocated memory.
pub const Result = struct {
    allocator: ?mem.Allocator = null,
    slice: []const u8,

    pub fn deinit(self: *const Result) void {
        if (self.allocator) |allocator| allocator.free(self.slice);
    }
};
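// Typical use (a sketch; `gpa` stands in for whatever allocator you pass):
//
//   const res = try n.nfc(gpa, input);
//   defer res.deinit();
//
// On the quick-check paths below, `slice` aliases the input and `allocator`
// stays null, so `deinit` is a safe no-op either way.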

// Compares code points by Canonical Combining Class order.
fn cccLess(self: Self, lhs: u21, rhs: u21) bool {
    return self.norm_data.ccc_data.ccc(lhs) < self.norm_data.ccc_data.ccc(rhs);
}

// Applies the Canonical Sorting Algorithm.
fn canonicalSort(self: Self, cps: []u21) void {
    var i: usize = 0;
    while (i < cps.len) : (i += 1) {
        const start: usize = i;
        while (i < cps.len and self.norm_data.ccc_data.ccc(cps[i]) != 0) : (i += 1) {}
        mem.sort(u21, cps[start..i], self, cccLess);
    }
}
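// Each run of non-starters (ccc != 0) is sorted by combining class, the
// Canonical Ordering step of UAX #15. For example, "e" + U+0301 (ccc 230)
// + U+0323 (ccc 220) reorders to e, U+0323, U+0301.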

/// Normalize `str` to NFD.
pub fn nfd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfd);
}

/// Normalize `str` to NFKD.
pub fn nfkd(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxd(allocator, str, .nfkd);
}

pub fn nfxdCodePoints(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();

    var cp_iter = CodePointIterator{ .bytes = str };
    var dc_buf: [18]u21 = undefined;
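    // 18 code points covers the longest decomposition in the UCD
    // (U+FDFA expands to 18 code points under NFKD).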

    while (cp_iter.next()) |cp| {
        const dc = self.decompose(cp.code, form, &dc_buf);
        if (dc.form == .same) {
            try dcp_list.append(cp.code);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }

    self.canonicalSort(dcp_list.items);

    return try dcp_list.toOwnedSlice();
}

fn nfxd(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // Quick checks.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };

    const dcps = try self.nfxdCodePoints(allocator, str, form);
    defer allocator.free(dcps);

    var dstr_list = std.ArrayList(u8).init(allocator);
    defer dstr_list.deinit();
    var buf: [4]u8 = undefined;

    for (dcps) |dcp| {
        const len = unicode.utf8Encode(dcp, &buf) catch unreachable;
        try dstr_list.appendSlice(buf[0..len]);
    }

    return Result{ .allocator = allocator, .slice = try dstr_list.toOwnedSlice() };
}

test "nfd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3d2}\u{301}", result.slice);
}

test "nfkd ASCII / no-alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfkd(allocator, "Hello World!");
    defer result.deinit();

    try testing.expectEqualStrings("Hello World!", result.slice);
}

test "nfkd !ASCII / alloc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfkd(allocator, "Héllo World! \u{3d3}");
    defer result.deinit();

    try testing.expectEqualStrings("He\u{301}llo World! \u{3a5}\u{301}", result.slice);
}

pub fn nfdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();

    var dc_buf: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfd, &dc_buf);

        if (dc.form == .same) {
            try dcp_list.append(cp);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }

    self.canonicalSort(dcp_list.items);

    return try dcp_list.toOwnedSlice();
}

pub fn nfkdCodePoints(
    self: Self,
    allocator: mem.Allocator,
    cps: []const u21,
) mem.Allocator.Error![]u21 {
    var dcp_list = std.ArrayList(u21).init(allocator);
    defer dcp_list.deinit();

    var dc_buf: [18]u21 = undefined;

    for (cps) |cp| {
        const dc = self.decompose(cp, .nfkd, &dc_buf);

        if (dc.form == .same) {
            try dcp_list.append(cp);
        } else {
            try dcp_list.appendSlice(dc.cps);
        }
    }

    self.canonicalSort(dcp_list.items);

    return try dcp_list.toOwnedSlice();
}

// Composition (NFC, NFKC)

fn isHangul(self: Self, cp: u21) bool {
    return cp >= 0x1100 and self.norm_data.hangul_data.syllable(cp) != .none;
}

/// Normalizes `str` to NFC.
pub fn nfc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfc);
}

/// Normalizes `str` to NFKC.
pub fn nfkc(self: Self, allocator: mem.Allocator, str: []const u8) mem.Allocator.Error!Result {
    return self.nfxc(allocator, str, .nfkc);
}

fn nfxc(self: Self, allocator: mem.Allocator, str: []const u8, form: Form) mem.Allocator.Error!Result {
    // Quick checks.
    if (ascii.isAsciiOnly(str)) return Result{ .slice = str };
    if (form == .nfc and isLatin1Only(str)) return Result{ .slice = str };
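    // Latin-1 text is already in composed (NFC) form, so NFC can return the
    // input unchanged. The same is not true for NFKC: e.g. U+00B5 (MICRO
    // SIGN) compatibility-maps to U+03BC.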

    // Decompose first.
    var dcps = if (form == .nfc)
        try self.nfxdCodePoints(allocator, str, .nfd)
    else
        try self.nfxdCodePoints(allocator, str, .nfkd);
    defer allocator.free(dcps);

    // Compose
    const tombstone = 0xe000; // Start of BMP Private Use Area
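    // Composed-away code points are overwritten with this sentinel rather
    // than shifting the array; the encoding loop at the end skips it. This
    // assumes U+E000 itself never appears in the decomposed input.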

    // Loop over all decomposed code points.
    while (true) {
        var i: usize = 1; // start at second code point.
        var deleted: usize = 0;

        // For each code point, C, find the preceding
        // starter code point L, if any.
        block_check: while (i < dcps.len) : (i += 1) {
            const C = dcps[i];
            if (C == tombstone) continue :block_check;
            const cc_C = self.norm_data.ccc_data.ccc(C);
            var starter_index: ?usize = null;
            var j: usize = i;

            // Seek back to find starter L, if any.
            while (true) {
                j -= 1;
                if (dcps[j] == tombstone) continue;

                // Check for starter.
                if (self.norm_data.ccc_data.isStarter(dcps[j])) {
                    // Check for blocking conditions.
                    for (dcps[(j + 1)..i]) |B| {
                        if (B == tombstone) continue;
                        const cc_B = self.norm_data.ccc_data.ccc(B);
                        if (cc_B != 0 and self.isHangul(C)) continue :block_check;
                        if (cc_B >= cc_C) continue :block_check;
                    }

                    // Found starter at j.
                    starter_index = j;
                    break;
                }

                if (j == 0) break;
            }

            // If we have a starter L, see if there's a primary
            // composite, P, for the sequence L, C. If so, we must
            // replace L with P and delete C.
            if (starter_index) |sidx| {
                const L = dcps[sidx];
                var processed_hangul = false;

                // If L and C are Hangul syllables, we can compose
                // them algorithmically if possible.
                if (self.isHangul(L) and self.isHangul(C)) {
                    // Get Hangul syllable types.
                    const l_stype = self.norm_data.hangul_data.syllable(L);
                    const c_stype = self.norm_data.hangul_data.syllable(C);

                    if (l_stype == .LV and c_stype == .T) {
                        // LV, T canonical composition.
                        dcps[sidx] = composeHangulCanon(L, C);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (l_stype == .L and c_stype == .V) {
                        // L, V full composition. L, V, T is handled via main loop.
                        dcps[sidx] = composeHangulFull(L, C, 0);
                        dcps[i] = tombstone; // Mark for deletion.
                        processed_hangul = true;
                    }

                    if (processed_hangul) deleted += 1;
                }

                // If no composition has occurred yet.
                if (!processed_hangul) {
                    // L, C are not Hangul, so check for a primary composite
                    // in the Unicode Character Database.
                    if (self.norm_data.canon_data.toNfc(.{ L, C })) |P| {
                        // We have a primary composite P for L, C. P must not
                        // be in the Full Composition Exclusions (FCX) list,
                        // which bars it from appearing in any composed
                        // form (NFC, NFKC).
                        if (!self.norm_data.normp_data.isFcx(P)) {
                            dcps[sidx] = P;
                            dcps[i] = tombstone; // Mark for deletion.
                            deleted += 1;
                        }
                    }
                }
            }
        }

        // If we have no deletions, the code point sequence
        // has been fully composed.
        if (deleted == 0) {
            var cstr_list = std.ArrayList(u8).init(allocator);
            defer cstr_list.deinit();
            var buf: [4]u8 = undefined;

            for (dcps) |cp| {
                if (cp == tombstone) continue; // "Delete"
                const len = unicode.utf8Encode(cp, &buf) catch unreachable;
                try cstr_list.appendSlice(buf[0..len]);
            }

            return Result{ .allocator = allocator, .slice = try cstr_list.toOwnedSlice() };
        }
    }
}

test "nfc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfc(allocator, "Complex char: \u{3D2}\u{301}");
    defer result.deinit();

    try testing.expectEqualStrings("Complex char: \u{3D3}", result.slice);
}

test "nfkc" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    const result = try n.nfkc(allocator, "Complex char: \u{03A5}\u{0301}");
    defer result.deinit();

    try testing.expectEqualStrings("Complex char: \u{038E}", result.slice);
}

/// Tests for equality of `a` and `b` after normalizing to NFC.
pub fn eql(self: Self, allocator: mem.Allocator, a: []const u8, b: []const u8) !bool {
    const norm_result_a = try self.nfc(allocator, a);
    defer norm_result_a.deinit();
    const norm_result_b = try self.nfc(allocator, b);
    defer norm_result_b.deinit();

    return mem.eql(u8, norm_result_a.slice, norm_result_b.slice);
}

test "eql" {
    const allocator = testing.allocator;
    var data: NormData = undefined;
    try NormData.init(&data, allocator);
    defer data.deinit();
    const n = Self{ .norm_data = &data };

    try testing.expect(try n.eql(allocator, "foé", "foe\u{0301}"));
    try testing.expect(try n.eql(allocator, "foϓ", "fo\u{03D2}\u{0301}"));
}

/// Returns true if `str` contains only Latin-1 code points
/// (ASCII plus the Latin-1 Supplement). Uses SIMD if possible.
pub fn isLatin1Only(str: []const u8) bool {
    var cp_iter = CodePointIterator{ .bytes = str };

    const vec_len = simd.suggestVectorLength(u21) orelse return blk: {
        break :blk while (cp_iter.next()) |cp| {
            if (cp.code > 256) break false;
        } else true;
    };

    const Vec = @Vector(vec_len, u21);

    outer: while (true) {
        var v1: Vec = undefined;
        const saved_cp_i = cp_iter.i;

        for (0..vec_len) |i| {
            if (cp_iter.next()) |cp| {
                v1[i] = cp.code;
            } else {
                // Not enough code points left to fill a vector; rewind and
                // finish with the scalar loop below.
                cp_iter.i = saved_cp_i;
                break :outer;
            }
        }
        const v2: Vec = @splat(256);
        if (@reduce(.Or, v1 > v2)) return false;
    }

    return while (cp_iter.next()) |cp| {
        if (cp.code > 256) break false;
    } else true;
}

test "isLatin1Only" {
    const latin1_only = "Hello, World! \u{fe} \u{ff}";
    try testing.expect(isLatin1Only(latin1_only));
    const not_latin1_only = "Héllo, World! \u{3d3}";
    try testing.expect(!isLatin1Only(not_latin1_only));
}