blob: 501fc9f4808a9f9cce53ba355c080c68822843b8 [file] [log] [blame]
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <unicode/utf.h>
#include <cctype>
#include <cstdlib>
// Parses a compact test notation for UTF-16 text and writes the code units into buf.
//
// src is a sequence of tokens, optionally separated by spaces, e.g. "U+1F431 | 'h' 'i'":
//   U+XXXX  a code point written as 4 to 6 hex digits ("u+" is accepted too); code points
//           that need two UTF-16 code units are stored as a lead/trail surrogate pair
//   'c'     a single ASCII character
//   |       position marker; the output index reached so far is stored in *offset
//
// buf, buf_size: output buffer and its capacity in UTF-16 code units.
// result_size:   receives the number of code units written; must be non-null.
// offset:        receives the position of "|". If null, src must not contain "|"; if non-null,
//                src must contain exactly one "|".
//
// Results are returned through out parameters because gtest's ASSERT_* macros only work in
// functions returning void. When an assertion fails, a fatal gtest failure is recorded and the
// function returns immediately, in most cases before *result_size has been written.
void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
                  size_t* offset) {
    size_t input_ix = 0;
    size_t output_ix = 0;
    bool seen_offset = false;
    while (src[input_ix] != 0) {
        switch (src[input_ix]) {
            case '\'': {
                // single ASCII char
                input_ix++;  // step over the opening quote so the checks see the payload
                ASSERT_NE(src[input_ix], 0);
                // Go through unsigned char: plain char may be signed, in which case bytes
                // >= 0x80 would be negative and slip past a "< 0x80" comparison.
                const unsigned int ch = static_cast<unsigned char>(src[input_ix]);
                ASSERT_LT(ch, 0x80u);
                ASSERT_LT(output_ix, buf_size);
                buf[output_ix++] = static_cast<uint16_t>(ch);
                input_ix++;
                ASSERT_EQ(src[input_ix], '\'');
                input_ix++;
                break;
            }
            case 'u':
            case 'U': {
                // Unicode codepoint in hex syntax
                input_ix++;
                ASSERT_EQ(src[input_ix], '+');
                input_ix++;
                // Count the hex digits explicitly. strtoul on its own also accepts leading
                // whitespace and a sign, which would be miscounted as digits.
                size_t num_hex_digits = 0;
                while (std::isxdigit(
                        static_cast<unsigned char>(src[input_ix + num_hex_digits]))) {
                    num_hex_digits++;
                }
                ASSERT_GE(num_hex_digits, 4u);  // also triggers when there is no number at all
                ASSERT_LE(num_hex_digits, 6u);
                char* endptr = nullptr;
                unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
                // Sanity check: the conversion consumed exactly the digits counted above.
                ASSERT_TRUE(endptr == src + input_ix + num_hex_digits);
                ASSERT_LE(codepoint, 0x10FFFFu);
                input_ix += num_hex_digits;
                if (U16_LENGTH(codepoint) == 1) {
                    ASSERT_LE(output_ix + 1, buf_size);
                    buf[output_ix++] = static_cast<uint16_t>(codepoint);
                } else {
                    // UTF-16 encoding
                    ASSERT_LE(output_ix + 2, buf_size);
                    buf[output_ix++] = U16_LEAD(codepoint);
                    buf[output_ix++] = U16_TRAIL(codepoint);
                }
                break;
            }
            case ' ':
                input_ix++;
                break;
            case '|':
                // At most one marker, and only when the caller asked for the offset.
                ASSERT_FALSE(seen_offset);
                ASSERT_NE(offset, nullptr);
                *offset = output_ix;
                seen_offset = true;
                input_ix++;
                break;
            default:
                FAIL();  // unexpected character
        }
    }
    ASSERT_NE(result_size, nullptr);
    *result_size = output_ix;
    // A caller that passed an offset pointer expects src to contain the "|" marker.
    ASSERT_TRUE(seen_offset || offset == nullptr);
}
// Checks ParseUnicode on an input that mixes a single-unit code point, a code point that is
// encoded as a surrogate pair, the "|" offset marker and a quoted ASCII character.
TEST(UnicodeUtils, parse) {
    const size_t BUF_SIZE = 256;
    uint16_t buf[BUF_SIZE];
    // Initialized so that no indeterminate value is ever read, even if parsing bails out early.
    size_t offset = 0;
    size_t size = 0;
    // ParseUnicode reports problems through fatal gtest assertions, which only return from
    // ParseUnicode itself. Stop here in that case instead of inspecting partial output.
    ASSERT_NO_FATAL_FAILURE(
            ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset));
    EXPECT_EQ(size, 4u);
    EXPECT_EQ(offset, 3u);
    EXPECT_EQ(buf[0], 0x000D);
    // U+1F431 is stored as the surrogate pair D83D DC31.
    EXPECT_EQ(buf[1], 0xD83D);
    EXPECT_EQ(buf[2], 0xDC31);
    EXPECT_EQ(buf[3], 'a');
}