2023-09-03 18:24:16 -07:00

320 lines
7.3 KiB
C

/* $OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $ */
/*
* Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* A mostly exhaustive test of the UTF-8 decoder, UTF8_getc(), and the
* UTF-8 encoder, UTF8_putc()
*/
#include <stdio.h>
#include <string.h>
#include <err.h>
#include <openssl/asn1.h>
#include "asn1_local.h" /* peek into the internals */
/*
 * Sentinel stored in the output variable before a decoder call; if the
 * variable still holds this afterwards, the call did not write to it.
 */
#define UNCHANGED 0xfedcba98
/*
 * Test assertion: evaluates x exactly once.  When x is false, errx() is
 * called with exit status 1 and a message giving the source line number
 * and the text of the failed expression.  Expands to a single statement,
 * so it is safe in unbraced if/else bodies.
 */
#define ASSERT(x)							\
	do {								\
		(x) ? (void)0 :						\
		    errx(1, "test failed at line %d: %s",		\
			__LINE__, #x);					\
	} while (0)
/*
 * Decoder check: zero-length input.  UTF8_getc() is expected to return 0
 * and leave the output value untouched.
 */
static void
check_getc_empty(const unsigned char *testbuf)
{
	unsigned long value = UNCHANGED;
	int ret;

	ret = UTF8_getc(testbuf, 0, &value);
	ASSERT(ret == 0);
	ASSERT(value == UNCHANGED);
}

/*
 * Decoder check: every valid single-byte character (0x00 - 0x7F), both
 * with exactly one byte of input and with one extra byte available.
 */
static void
check_getc_single_byte(unsigned char *testbuf)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0; i < 0x80; i++) {
		testbuf[0] = i;

		/*
		 * Reset the sentinel before each call: otherwise the second
		 * call would pass even if it never stored a result, because
		 * value would still hold i from the first call.
		 */
		value = UNCHANGED;
		ret = UTF8_getc(testbuf, 1, &value);
		ASSERT(ret == 1);
		ASSERT(value == i);

		value = UNCHANGED;
		ret = UTF8_getc(testbuf, 2, &value);
		ASSERT(ret == 1);
		ASSERT(value == i);
	}
}

/*
 * Decoder check: failure (-2, value untouched) on all invalid initial bytes:
 *	0x80 - 0xBF	following bytes only
 *	0xC0 - 0xC1	could only start non-shortest two-byte forms
 *	0xF5 - 0xF7	would start four-byte sequences beyond U+10FFFF
 *	0xF8 - 0xFD	used to be initial for 5 and 6 byte sequences
 *	0xFE - 0xFF	have never been valid in UTF-8
 */
static void
check_getc_invalid_initial(unsigned char *testbuf)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0x80; i < 0xC2; i++) {
		value = UNCHANGED;
		testbuf[0] = i;
		ret = UTF8_getc(testbuf, 1, &value);
		ASSERT(ret == -2);
		ASSERT(value == UNCHANGED);
	}
	for (i = 0xF5; i < 0x100; i++) {
		value = UNCHANGED;
		testbuf[0] = i;
		ret = UTF8_getc(testbuf, 1, &value);
		ASSERT(ret == -2);
		ASSERT(value == UNCHANGED);
	}
}

/*
 * Decoder check: all two-byte sequences (initial byte 0xC2 - 0xDF combined
 * with every possible second byte).
 */
static void
check_getc_two_byte(unsigned char *testbuf)
{
	unsigned long value;
	unsigned int i, j;
	int ret;

	for (i = 0xC2; i < 0xE0; i++) {
		testbuf[0] = i;
		for (j = 0; j < 0x100; j++) {
			testbuf[1] = j;

			/* truncated input: one byte short */
			value = UNCHANGED;
			ret = UTF8_getc(testbuf, 1, &value);
			ASSERT(ret == -1);
			ASSERT(value == UNCHANGED);

			ret = UTF8_getc(testbuf, 2, &value);

			/* outside range of trailing bytes */
			if (j < 0x80 || j > 0xBF) {
				ASSERT(ret == -3);
				ASSERT(value == UNCHANGED);
				continue;
			}

			/* valid: 5 payload bits from i, 6 from j */
			ASSERT(ret == 2);
			ASSERT((value & 0x3F) == (j & 0x3F));
			ASSERT(value >> 6 == (i & 0x1F));
		}
	}
}

/*
 * Decoder check: all three-byte sequences (initial byte 0xE0 - 0xEF combined
 * with every possible second and third byte).
 */
static void
check_getc_three_byte(unsigned char *testbuf)
{
	unsigned long value;
	unsigned int i, j, k;
	int ret;

	for (i = 0xE0; i < 0xF0; i++) {
		testbuf[0] = i;
		for (j = 0; j < 0x100; j++) {
			testbuf[1] = j;
			for (k = 0; k < 0x100; k++) {
				testbuf[2] = k;

				/* truncated input: one byte short */
				value = UNCHANGED;
				ret = UTF8_getc(testbuf, 2, &value);
				ASSERT(ret == -1);
				ASSERT(value == UNCHANGED);

				ret = UTF8_getc(testbuf, 3, &value);

				/* outside range of trailing bytes */
				if (j < 0x80 || j > 0xBF ||
				    k < 0x80 || k > 0xBF) {
					ASSERT(ret == -3);
					ASSERT(value == UNCHANGED);
					continue;
				}

				/* non-shortest form */
				if (i == 0xE0 && j < 0xA0) {
					ASSERT(ret == -4);
					ASSERT(value == UNCHANGED);
					continue;
				}

				/* surrogate pair code point */
				if (i == 0xED && j > 0x9F) {
					ASSERT(ret == -2);
					ASSERT(value == UNCHANGED);
					continue;
				}

				/* valid: 4 payload bits from i, 6 each from j, k */
				ASSERT(ret == 3);
				ASSERT((value & 0x3F) == (k & 0x3F));
				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
				ASSERT(value >> 12 == (i & 0x0F));
			}
		}
	}
}

/*
 * Decoder check: all four-byte sequences (initial byte 0xF0 - 0xF4 combined
 * with every possible second, third and fourth byte).
 */
static void
check_getc_four_byte(unsigned char *testbuf)
{
	unsigned long value;
	unsigned int i, j, k, l;
	int ret;

	for (i = 0xF0; i < 0xF5; i++) {
		testbuf[0] = i;
		for (j = 0; j < 0x100; j++) {
			testbuf[1] = j;
			for (k = 0; k < 0x100; k++) {
				testbuf[2] = k;
				for (l = 0; l < 0x100; l++) {
					testbuf[3] = l;

					/* truncated input: one byte short */
					value = UNCHANGED;
					ret = UTF8_getc(testbuf, 3, &value);
					ASSERT(ret == -1);
					ASSERT(value == UNCHANGED);

					ret = UTF8_getc(testbuf, 4, &value);

					/* outside range of trailing bytes */
					if (j < 0x80 || j > 0xBF ||
					    k < 0x80 || k > 0xBF ||
					    l < 0x80 || l > 0xBF) {
						ASSERT(ret == -3);
						ASSERT(value == UNCHANGED);
						continue;
					}

					/* non-shortest form */
					if (i == 0xF0 && j < 0x90) {
						ASSERT(ret == -4);
						ASSERT(value == UNCHANGED);
						continue;
					}

					/* beyond end of UCS range */
					if (i == 0xF4 && j > 0x8F) {
						ASSERT(ret == -2);
						ASSERT(value == UNCHANGED);
						continue;
					}

					/* valid: 3 bits from i, 6 each from j, k, l */
					ASSERT(ret == 4);
					ASSERT((value & 0x3F) == (l & 0x3F));
					ASSERT(((value >> 6) & 0x3F) ==
					    (k & 0x3F));
					ASSERT(((value >> 12) & 0x3F) ==
					    (j & 0x3F));
					ASSERT(value >> 18 == (i & 0x07));
				}
			}
		}
	}
}

/*
 * Encoder check: code points 0x00 - 0x7F.  For each one: a NULL buffer
 * yields the required length (1), a zero-length buffer yields -1 without
 * touching the buffer, and a one-byte buffer receives exactly that byte.
 * testbuf must be all zero on entry; zerobuf is buflen zero bytes.
 */
static void
check_putc_single_byte(unsigned char *testbuf, const unsigned char *zerobuf,
    size_t buflen)
{
	unsigned int i;
	int ret;

	for (i = 0; i < 0x80; i++) {
		ret = UTF8_putc(NULL, 0, i);
		ASSERT(ret == 1);

		/* clear what the previous iteration wrote */
		testbuf[0] = 0;
		ret = UTF8_putc(testbuf, 0, i);
		ASSERT(ret == -1);
		ASSERT(memcmp(testbuf, zerobuf, buflen) == 0);

		ret = UTF8_putc(testbuf, 1, i);
		ASSERT(ret == 1);
		ASSERT(testbuf[0] == i);
		ASSERT(memcmp(testbuf + 1, zerobuf, buflen - 1) == 0);
	}
}

/*
 * Encoder check: code points 0x80 - 0x7FF.  Checks the reported length (2),
 * the -1 result for a buffer that is one byte short, that nothing is written
 * past the second byte, and that the output decodes back to the code point.
 * testbuf must be all zero past its first two bytes on entry.
 */
static void
check_putc_two_byte(unsigned char *testbuf, const unsigned char *zerobuf,
    size_t buflen)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0x80; i < 0x800; i++) {
		ret = UTF8_putc(NULL, 0, i);
		ASSERT(ret == 2);

		testbuf[0] = testbuf[1] = 0;
		ret = UTF8_putc(testbuf, 1, i);
		ASSERT(ret == -1);
		ASSERT(memcmp(testbuf, zerobuf, buflen) == 0);

		ret = UTF8_putc(testbuf, 2, i);
		ASSERT(ret == 2);
		ASSERT(memcmp(testbuf + 2, zerobuf, buflen - 2) == 0);

		/* round trip; sentinel proves the decoder stored a result */
		value = UNCHANGED;
		ret = UTF8_getc(testbuf, 2, &value);
		ASSERT(ret == 2);
		ASSERT(value == i);
	}
}

/*
 * Encoder check: code points 0x800 - 0xFFFF.  Surrogates (0xD800 - 0xDFFF)
 * are expected to be rejected with -2; everything else gets the same
 * length / short-buffer / overrun / round-trip checks as the two-byte case.
 * testbuf must be all zero past its first three bytes on entry.
 */
static void
check_putc_three_byte(unsigned char *testbuf, const unsigned char *zerobuf,
    size_t buflen)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0x800; i < 0x10000; i++) {
		if (i >= 0xD800 && i < 0xE000) {
			/* surrogates aren't valid */
			ret = UTF8_putc(NULL, 0, i);
			ASSERT(ret == -2);
			continue;
		}

		ret = UTF8_putc(NULL, 0, i);
		ASSERT(ret == 3);

		testbuf[0] = testbuf[1] = testbuf[2] = 0;
		ret = UTF8_putc(testbuf, 2, i);
		ASSERT(ret == -1);
		ASSERT(memcmp(testbuf, zerobuf, buflen) == 0);

		ret = UTF8_putc(testbuf, 3, i);
		ASSERT(ret == 3);
		ASSERT(memcmp(testbuf + 3, zerobuf, buflen - 3) == 0);

		/* round trip; sentinel proves the decoder stored a result */
		value = UNCHANGED;
		ret = UTF8_getc(testbuf, 3, &value);
		ASSERT(ret == 3);
		ASSERT(value == i);
	}
}

/*
 * Encoder check: code points 0x10000 - 0x10FFFF, with the same
 * length / short-buffer / overrun / round-trip checks as the shorter forms.
 * testbuf must be all zero past its first four bytes on entry.
 */
static void
check_putc_four_byte(unsigned char *testbuf, const unsigned char *zerobuf,
    size_t buflen)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0x10000; i < 0x110000; i++) {
		ret = UTF8_putc(NULL, 0, i);
		ASSERT(ret == 4);

		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
		ret = UTF8_putc(testbuf, 3, i);
		ASSERT(ret == -1);
		ASSERT(memcmp(testbuf, zerobuf, buflen) == 0);

		ret = UTF8_putc(testbuf, 4, i);
		ASSERT(ret == 4);
		ASSERT(memcmp(testbuf + 4, zerobuf, buflen - 4) == 0);

		/* round trip; sentinel proves the decoder stored a result */
		value = UNCHANGED;
		ret = UTF8_getc(testbuf, 4, &value);
		ASSERT(ret == 4);
		ASSERT(value == i);
	}
}

/*
 * Encoder check: spot check values above 0x10FFFF, just past the limit and
 * at the very top of the unsigned long range; each is expected to yield -2.
 */
static void
check_putc_out_of_range(void)
{
	unsigned long value;
	unsigned int i;
	int ret;

	for (i = 0x110000; i < 0x110100; i++) {
		ret = UTF8_putc(NULL, 0, i);
		ASSERT(ret == -2);
	}
	for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
		ret = UTF8_putc(NULL, 0, value);
		ASSERT(ret == -2);
	}
}

/*
 * Test driver: runs the UTF8_getc() checks, then zeroes the scratch buffer
 * and runs the UTF8_putc() checks.  Returns 0 when every check passed; a
 * failed ASSERT ends the program before that point.
 */
int
main(void)
{
	unsigned char testbuf[] = "012345";
	const unsigned char zerobuf[sizeof testbuf] = { 0 };

	/*
	 * First, verify UTF8_getc()
	 */
	check_getc_empty(testbuf);
	check_getc_single_byte(testbuf);
	check_getc_invalid_initial(testbuf);
	check_getc_two_byte(testbuf);
	check_getc_three_byte(testbuf);
	check_getc_four_byte(testbuf);

	/*
	 * Next, verify UTF8_putc().  The encoder checks compare the buffer
	 * against zerobuf, so it has to start out all zero.
	 */
	memset(testbuf, 0, sizeof testbuf);
	check_putc_single_byte(testbuf, zerobuf, sizeof testbuf);
	check_putc_two_byte(testbuf, zerobuf, sizeof testbuf);
	check_putc_three_byte(testbuf, zerobuf, sizeof testbuf);
	check_putc_four_byte(testbuf, zerobuf, sizeof testbuf);
	check_putc_out_of_range();

	return 0;
}