Skip to content
Snippets Groups Projects
Commit bae0fff0 authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Utf8Utils for dicttoolkit."

parents d6e367ff f0c303dd
No related branches found
No related tags found
No related merge requests found
......@@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
makedict_executor.cpp) \
$(addprefix offdevice_intermediate_dict/, \
offdevice_intermediate_dict.cpp) \
utils/command_utils.cpp
$(addprefix utils/, \
command_utils.cpp \
utf8_utils.cpp)
LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
dict_toolkit_defines_test.cpp \
$(addprefix offdevice_intermediate_dict/, \
offdevice_intermediate_dict_test.cpp) \
$(addprefix utils/, \
command_utils_test.cpp)
command_utils_test.cpp \
utf8_utils_test.cpp)
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/utf8_utils.h"
#include "utils/char_utils.h"
namespace latinime {
namespace dicttoolkit {
// Longest UTF-8 sequence, in bytes, handled by this class.
const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;

// The four tables below are indexed by sequence size (1..4); index 0 is a placeholder.

// Bits of the first byte that hold the sequence-length marker.
const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
// Expected value of the marker bits: 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx.
const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
// Bits of the first byte that hold code point payload. The first byte of a 4-byte sequence is
// 11110xxx, i.e. it carries three payload bits, so its mask is 0x07. A narrower mask would drop
// the top bit of code points in the range U+100000..U+10FFFF (first byte 0xF4).
const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
// Largest code point that can be encoded with each sequence size. Index 0 is -1 so that
// "value <= MAX_ENCODED_CODE_POINT_VALUES[size - 1]" is never true for 1-byte sequences.
const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

// Payload bits of a second or later byte (10xxxxxx).
const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
// Marker bits of a second or later byte.
const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
// Number of payload bits carried by each second or later byte.
const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
std::vector<int> codePoints;
int remainingByteCountForCurrentCodePoint = 0;
int currentCodePointSequenceSize = 0;
int codePoint = 0;
for (const char c : utf8Str) {
if (remainingByteCountForCurrentCodePoint == 0) {
currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
if (currentCodePointSequenceSize <= 0) {
AKLOGE("%x is an invalid utf8 first byte value.", c);
return std::vector<int>();
}
remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
} else {
codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
codePoint += maskTrailingByte(c);
}
remainingByteCountForCurrentCodePoint--;
if (remainingByteCountForCurrentCodePoint == 0) {
if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
currentCodePointSequenceSize, codePoint);
return std::vector<int>();
}
codePoints.push_back(codePoint);
}
}
return codePoints;
}
// Determines how many bytes a UTF-8 sequence occupies by inspecting the marker bits of its
// first byte. Returns the sequence size (1..MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT), or -1 when the
// byte does not match any first-byte pattern.
/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
    size_t candidateSize = 1;
    while (candidateSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT) {
        const int markerBits = firstByte & FIRST_BYTE_MARKER_MASKS[candidateSize];
        if (markerBits == FIRST_BYTE_MARKERS[candidateSize]) {
            return candidateSize;
        }
        ++candidateSize;
    }
    // No pattern matched, so this byte cannot start a UTF-8 sequence.
    return -1;
}
// Returns the code point bits stored in the first byte of a sequence of the given size, with
// the sequence-length marker bits removed.
/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
        const int sequenceSize) {
    const uint8_t payloadMask = FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
    return firstByte & payloadMask;
}
// Returns the code point bits stored in a second or later byte of a sequence, with the marker
// bits removed.
/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
    const int payloadBits = secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
    return payloadBits;
}
// Encodes the given code points as a UTF-8 string.
//
// Returns an empty string, after logging the offending value, as soon as a code point that
// getSequenceSizeToEncodeCodePoint() cannot size is encountered.
/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
    std::string utf8String;
    for (const int codePoint : codePoints) {
        const int byteCount = getSequenceSizeToEncodeCodePoint(codePoint);
        if (byteCount <= 0) {
            AKLOGE("Cannot encode code point (%d).", codePoint);
            return std::string();
        }
        // Number of trailing bytes that still have to be written for this code point.
        int pendingTrailingBytes = byteCount - 1;
        // First byte: the most significant payload bits combined with the length marker.
        const int leadBits =
                codePoint >> (pendingTrailingBytes * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
        utf8String.push_back(static_cast<char>(leadBits | FIRST_BYTE_MARKERS[byteCount]));
        // Trailing bytes: six payload bits each, from most to least significant.
        while (pendingTrailingBytes > 0) {
            --pendingTrailingBytes;
            const int shift = pendingTrailingBytes * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
            const int payloadBits = (codePoint >> shift) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
            utf8String.push_back(static_cast<char>(payloadBits | TRAILING_BYTE_MARKER));
        }
    }
    return utf8String;
}
// Returns the number of bytes needed to encode the given code point, i.e. the smallest
// sequence size whose maximum encodable value is not below it. Returns -1 for negative values
// and for values larger than every entry of MAX_ENCODED_CODE_POINT_VALUES.
/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
    if (codePoint >= 0) {
        size_t candidateSize = 1;
        while (candidateSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT) {
            if (MAX_ENCODED_CODE_POINT_VALUES[candidateSize] >= codePoint) {
                return candidateSize;
            }
            ++candidateSize;
        }
    }
    return -1;
}
} // namespace dicttoolkit
} // namespace latinime
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
#include <cstdint>
#include <string>
#include <vector>
#include "dict_toolkit_defines.h"
#include "utils/int_array_view.h"
namespace latinime {
namespace dicttoolkit {
// Static helpers that convert between UTF-8 encoded strings and sequences of code points.
class Utf8Utils {
 public:
    // Decodes utf8Str into code points. Returns an empty vector when decoding fails.
    static std::vector<int> getCodePoints(const std::string &utf8Str);
    // Encodes codePoints as a UTF-8 string. Returns an empty string when a code point cannot be
    // encoded.
    static std::string getUtf8String(const CodePointArrayView codePoints);

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils);

    // Longest sequence, in bytes, used for a single code point.
    static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;
    // Values indexed by sequence size.
    static const uint8_t FIRST_BYTE_MARKER_MASKS[];
    static const uint8_t FIRST_BYTE_MARKERS[];
    static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[];
    static const int MAX_ENCODED_CODE_POINT_VALUES[];
    // Values describing the second and later bytes of a sequence.
    static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK;
    static const uint8_t TRAILING_BYTE_MARKER;
    static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;

    // Returns the sequence size announced by firstByte, or -1 if it is not a valid first byte.
    static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte);
    // Returns the code point bits held by the first byte of a sequence of sequenceSize bytes.
    static int maskFirstByte(const uint8_t firstByte, const int sequenceSize);
    // Returns the code point bits held by a second or later byte.
    static int maskTrailingByte(const uint8_t secondOrLaterByte);
    // Returns the number of bytes needed to encode codePoint, or -1 if it cannot be encoded.
    static int getSequenceSizeToEncodeCodePoint(const int codePoint);
};
} // namespace dicttoolkit
} // namespace latinime
#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/utf8_utils.h"
#include <gtest/gtest.h>
#include <vector>
#include "utils/int_array_view.h"
namespace latinime {
namespace dicttoolkit {
namespace {
// Checks decoding of empty, ASCII, multi-byte and supplementary-plane input, and that redundant
// encodings are rejected.
//
// Whole vectors are compared rather than individual elements: getCodePoints() returns an empty
// vector on failure, and indexing it after a non-fatal size check would read out of range.
TEST(Utf8UtilsTests, TestGetCodePoints) {
    {
        const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
        EXPECT_TRUE(codePoints.empty());
    }
    {
        const std::vector<int> expected = {'t', 'e', 's', 't'};
        EXPECT_EQ(expected, Utf8Utils::getCodePoints("test"));
    }
    {
        const std::vector<int> expected = {
            0x3042 /* HIRAGANA LETTER A */,
            'a',
            0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
            0x0410 /* CYRILLIC CAPITAL LETTER A */
        };
        EXPECT_EQ(expected, Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410"));
    }
    {
        const std::vector<int> expected = {
            0x1F36A /* COOKIE */,
            '?',
            0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
        };
        EXPECT_EQ(expected, Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752"));
    }
    // Redundant UTF-8 sequences must be rejected.
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
}
// Checks encoding of ASCII input and of code points that need 2, 3 and 4 byte sequences.
TEST(Utf8UtilsTests, TestGetUtf8String) {
    {
        const std::vector<int> asciiCodePoints = {'t', 'e', 's', 't'};
        const std::string encoded =
                Utf8Utils::getUtf8String(CodePointArrayView(asciiCodePoints));
        EXPECT_EQ("test", encoded);
    }
    {
        const std::vector<int> mixedCodePoints = {
            0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */,
            0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
            0x0430 /* CYRILLIC SMALL LETTER A */,
            0x3042 /* HIRAGANA LETTER A */,
            0x1F36A /* COOKIE */,
            0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
        };
        const std::string encoded =
                Utf8Utils::getUtf8String(CodePointArrayView(mixedCodePoints));
        EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752", encoded);
    }
}
} // namespace
} // namespace dicttoolkit
} // namespace latinime
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment