I will never get tired of vendoring dependencies. ha ha. It is possible I am insane. I had to do a lot of pruning to get these not to be ridiculous (especially the unicode data, which had nearly 1 million lines of... stuff).
253 lines · 9.9 KiB · Zig
const std = @import("std");
|
|
const builtin = @import("builtin");
|
|
const mem = std.mem;
|
|
|
|
/// Build-time generator for compressed Unicode case-folding tables.
///
/// Reads `data/unicode/DerivedCoreProperties.txt` and `data/unicode/CaseFolding.txt`,
/// builds a three-stage lookup table (stage1 -> stage2 -> stage3) mapping codepoints
/// to their case-fold result, and writes the tables — raw-deflate compressed — to the
/// output path given as the first command-line argument.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Assert there are no leaks at exit; GPA reports them via deinit().
    defer std.debug.assert(gpa.deinit() == .ok);
    const allocator = gpa.allocator();

    // Process DerivedCoreProperties.txt
    var props_file = try std.fs.cwd().openFile("data/unicode/DerivedCoreProperties.txt", .{});
    defer props_file.close();
    var props_buf = std.io.bufferedReader(props_file.reader());
    const props_reader = props_buf.reader();

    // Set of codepoints that carry the Changes_When_Casefolded property.
    var props_map = std.AutoHashMap(u21, void).init(allocator);
    defer props_map.deinit();

    // Shared line buffer for both input files.
    var line_buf: [4096]u8 = undefined;

    props_lines: while (try props_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        // Skip blank lines and whole-line comments.
        if (line.len == 0 or line[0] == '#') continue;

        // Strip a trailing end-of-line comment, if any.
        const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;

        // UCD format: "<codepoint(s)> ; <property>" — split on ';' and spaces.
        var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
        // Inclusive [start, end] codepoint range parsed from field 0.
        var current_code: [2]u21 = undefined;

        var i: usize = 0;
        while (field_iter.next()) |field| : (i += 1) {
            switch (i) {
                0 => {
                    // Code point(s): either a "XXXX..YYYY" range or a single "XXXX".
                    if (std.mem.indexOf(u8, field, "..")) |dots| {
                        current_code = .{
                            try std.fmt.parseInt(u21, field[0..dots], 16),
                            try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
                        };
                    } else {
                        const code = try std.fmt.parseInt(u21, field, 16);
                        current_code = .{ code, code };
                    }
                },
                1 => {
                    // Core property name; only Changes_When_Casefolded is of interest —
                    // any other property skips the rest of the line.
                    if (!mem.eql(u8, field, "Changes_When_Casefolded")) continue :props_lines;
                    // Record every codepoint in the (inclusive) range.
                    for (current_code[0]..current_code[1] + 1) |cp| try props_map.put(@intCast(cp), {});
                },
                else => {},
            }
        }
    }

    // codepoint -> up to 3 fold codepoints, zero-padded (0 is the "unused slot" sentinel).
    // ArrayHashMap: insertion order is preserved and is relied on below.
    var codepoint_mapping = std.AutoArrayHashMap(u21, [3]u21).init(allocator);
    defer codepoint_mapping.deinit();

    // Process CaseFolding.txt
    var cp_file = try std.fs.cwd().openFile("data/unicode/CaseFolding.txt", .{});
    defer cp_file.close();
    var cp_buf = std.io.bufferedReader(cp_file.reader());
    const cp_reader = cp_buf.reader();

    while (try cp_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
        if (line.len == 0 or line[0] == '#') continue;

        // Format: "<code>; <status>; <mapping>; # <name>"
        var field_it = std.mem.splitScalar(u8, line, ';');
        const codepoint_str = field_it.first();
        const codepoint = try std.fmt.parseUnsigned(u21, codepoint_str, 16);

        const status = std.mem.trim(u8, field_it.next() orelse continue, " ");
        // Only interested in 'common' and 'full'
        // (statuses 'S' simple and 'T' Turkic are intentionally skipped).
        if (status[0] != 'C' and status[0] != 'F') continue;

        // The mapping field is 1..3 space-separated hex codepoints.
        const mapping = std.mem.trim(u8, field_it.next() orelse continue, " ");
        var mapping_it = std.mem.splitScalar(u8, mapping, ' ');
        var mapping_buf = [_]u21{0} ** 3;
        var mapping_i: u8 = 0;
        while (mapping_it.next()) |mapping_c| {
            mapping_buf[mapping_i] = try std.fmt.parseInt(u21, mapping_c, 16);
            mapping_i += 1;
        }

        // Each codepoint appears at most once among C/F lines, hence putNoClobber.
        try codepoint_mapping.putNoClobber(codepoint, mapping_buf);
    }

    var changes_when_casefolded_exceptions = std.ArrayList(u21).init(allocator);
    defer changes_when_casefolded_exceptions.deinit();

    {
        // Codepoints with a case fold mapping can be missing the Changes_When_Casefolded property,
        // but not vice versa.
        for (codepoint_mapping.keys()) |codepoint| {
            if (props_map.get(codepoint) == null) {
                try changes_when_casefolded_exceptions.append(codepoint);
            }
        }
    }

    // offset (fold - codepoint) -> its final index into stage3.
    var offset_to_index = std.AutoHashMap(i32, u8).init(allocator);
    defer offset_to_index.deinit();
    // offset -> occurrence count, used to give frequent offsets small indices.
    var unique_offsets = std.AutoArrayHashMap(i32, u32).init(allocator);
    defer unique_offsets.deinit();

    // First pass
    {
        var it = codepoint_mapping.iterator();
        while (it.next()) |entry| {
            const codepoint = entry.key_ptr.*;
            // Trim trailing zero-padding to get the actual mapping length.
            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
            if (mappings.len == 1) {
                // Single-codepoint fold: store only the signed delta from the source.
                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
                const result = try unique_offsets.getOrPut(offset);
                if (!result.found_existing) result.value_ptr.* = 0;
                result.value_ptr.* += 1;
            }
        }

        // A codepoint mapping to itself (offset=0) is the most common case
        // — give it an artificially huge count so it sorts to index 0.
        try unique_offsets.put(0, 0x10FFFF);
        const C = struct {
            vals: []u32,

            pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool {
                // '>' sorts by count descending: most frequent offset first.
                return ctx.vals[a_index] > ctx.vals[b_index];
            }
        };
        unique_offsets.sort(C{ .vals = unique_offsets.values() });

        // Assign indices in sorted order. offset_index is u7: the scheme requires
        // fewer than 128 distinct offsets, since bit 7 of a stage2 value flags a
        // multi-codepoint mapping below.
        var offset_it = unique_offsets.iterator();
        var offset_index: u7 = 0;
        while (offset_it.next()) |entry| {
            try offset_to_index.put(entry.key_ptr.*, offset_index);
            offset_index += 1;
        }
    }

    // Deduplicated multi-codepoint mappings -> their 0x80-flagged index.
    var mappings_to_index = std.AutoArrayHashMap([3]u21, u8).init(allocator);
    defer mappings_to_index.deinit();
    // Final per-codepoint stage2 value (offset index, or 0x80 | triple index).
    var codepoint_to_index = std.AutoHashMap(u21, u8).init(allocator);
    defer codepoint_to_index.deinit();

    // Second pass
    {
        var count_multiple_codepoints: u8 = 0;

        var it = codepoint_mapping.iterator();
        while (it.next()) |entry| {
            const codepoint = entry.key_ptr.*;
            const mappings = std.mem.sliceTo(entry.value_ptr, 0);
            if (mappings.len > 1) {
                // Multi-codepoint fold: index a deduplicated [3]u21 triple, with the
                // high bit set to distinguish it from a single-offset index.
                const result = try mappings_to_index.getOrPut(entry.value_ptr.*);
                if (!result.found_existing) {
                    result.value_ptr.* = 0x80 | count_multiple_codepoints;
                    count_multiple_codepoints += 1;
                }
                const index = result.value_ptr.*;
                try codepoint_to_index.put(codepoint, index);
            } else {
                // Single-codepoint fold: look up the index assigned to its offset.
                const offset: i32 = @as(i32, mappings[0]) - @as(i32, codepoint);
                const index = offset_to_index.get(offset).?;
                try codepoint_to_index.put(codepoint, index);
            }
        }
    }

    // Build the stage1/stage2/stage3 arrays and output them
    {
        // stage2 is built from deduplicated 256-entry blocks; stage1 maps
        // (codepoint >> 8) to a block index.
        const Block = [256]u8;
        var stage2_blocks = std.AutoArrayHashMap(Block, void).init(allocator);
        defer stage2_blocks.deinit();

        // Pre-insert the all-zero block so "no mapping" blocks get index 0.
        const empty_block: Block = [_]u8{0} ** 256;
        try stage2_blocks.put(empty_block, {});
        const stage1_len = (0x10FFFF / 256) + 1;
        var stage1: [stage1_len]u8 = undefined;

        var codepoint: u21 = 0;
        var block: Block = undefined;
        while (codepoint <= 0x10FFFF) {
            // 0 = identity (offset 0 sorted to index 0 above).
            const data_index = codepoint_to_index.get(codepoint) orelse 0;
            block[codepoint % 256] = data_index;

            codepoint += 1;
            if (codepoint % 256 == 0) {
                // Block complete: dedupe it; getOrPut's .index is the block's
                // position in insertion order, i.e. its stage2 block index.
                const result = try stage2_blocks.getOrPut(block);
                const index = result.index;
                // codepoint was just incremented past the block, hence -1.
                stage1[(codepoint >> 8) - 1] = @intCast(index);
            }
        }

        // Drop trailing all-zero (empty-block) stage1 entries; codepoints past the
        // cutoff are treated as identity by the consumer.
        const last_meaningful_block = std.mem.lastIndexOfNone(u8, &stage1, "\x00").?;
        const meaningful_stage1 = stage1[0 .. last_meaningful_block + 1];
        const codepoint_cutoff = (last_meaningful_block + 1) << 8;
        // stage3 layout: [unique offsets..., then 3-element triples...]; this is
        // where the triples region begins.
        const multiple_codepoint_start: usize = unique_offsets.count();

        var index: usize = 0;
        const stage3_elems = unique_offsets.count() + mappings_to_index.count() * 3;
        var stage3 = try allocator.alloc(i24, stage3_elems);
        defer allocator.free(stage3);
        // Offsets first, in their sorted (descending-frequency) order.
        for (unique_offsets.keys()) |key| {
            stage3[index] = @intCast(key);
            index += 1;
        }
        // Then the multi-codepoint triples, in first-seen order (matches the
        // 0x80|n indices assigned in the second pass).
        for (mappings_to_index.keys()) |key| {
            stage3[index] = @intCast(key[0]);
            stage3[index + 1] = @intCast(key[1]);
            stage3[index + 2] = @intCast(key[2]);
            index += 3;
        }

        // Flatten the deduplicated blocks into the contiguous stage2 array.
        const stage2_elems = stage2_blocks.count() * 256;
        var stage2 = try allocator.alloc(u8, stage2_elems);
        defer allocator.free(stage2);
        for (stage2_blocks.keys(), 0..) |key, i| {
            @memcpy(stage2[i * 256 ..][0..256], &key);
        }

        // Write out compressed binary data file.
        var args_iter = try std.process.argsWithAllocator(allocator);
        defer args_iter.deinit();
        _ = args_iter.skip();
        const output_path = args_iter.next() orelse @panic("No output file arg!");

        const compressor = std.compress.flate.deflate.compressor;
        var out_file = try std.fs.cwd().createFile(output_path, .{});
        defer out_file.close();
        // .raw = headerless deflate stream.
        var out_comp = try compressor(.raw, out_file.writer(), .{ .level = .best });
        const writer = out_comp.writer();

        // NOTE(review): output uses the build machine's native byte order —
        // presumably the reader decodes with the same native endianness; confirm
        // if the generated file is meant to be shared across architectures.
        const endian = builtin.cpu.arch.endian();
        // Table metadata.
        try writer.writeInt(u24, @intCast(codepoint_cutoff), endian);
        try writer.writeInt(u24, @intCast(multiple_codepoint_start), endian);
        // Stage 1
        try writer.writeInt(u16, @intCast(meaningful_stage1.len), endian);
        try writer.writeAll(meaningful_stage1);
        // Stage 2
        try writer.writeInt(u16, @intCast(stage2.len), endian);
        try writer.writeAll(stage2);
        // Stage 3
        try writer.writeInt(u16, @intCast(stage3.len), endian);
        for (stage3) |offset| try writer.writeInt(i24, offset, endian);
        // Changes when case folded
        // Min and max
        try writer.writeInt(u24, std.mem.min(u21, changes_when_casefolded_exceptions.items), endian);
        try writer.writeInt(u24, std.mem.max(u21, changes_when_casefolded_exceptions.items), endian);
        try writer.writeInt(u16, @intCast(changes_when_casefolded_exceptions.items.len), endian);
        for (changes_when_casefolded_exceptions.items) |cp| try writer.writeInt(u24, cp, endian);

        // Flush the compressor so the final deflate block is written.
        try out_comp.flush();
    }
}
|