Skip to content
Snippets Groups Projects
Commit be76dbfe authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Introduce DynamicPatriciaTrieReadingHelper."

parents acc886ee 4d814bfc
No related branches found
No related tags found
No related merge requests found
......@@ -74,6 +74,7 @@ LATIN_IME_CORE_SRC_FILES := \
dictionary_structure_with_buffer_policy_factory.cpp \
dynamic_patricia_trie_node_reader.cpp \
dynamic_patricia_trie_policy.cpp \
dynamic_patricia_trie_reading_helper.cpp \
dynamic_patricia_trie_reading_utils.cpp \
patricia_trie_policy.cpp \
patricia_trie_reading_utils.cpp) \
......
......@@ -20,95 +20,68 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
namespace latinime {
// To avoid infinite loop caused by invalid or malicious forward links.
const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
// Pushes every child PtNode of dicNode into childDicNodes.
//
// The children are iterated with DynamicPatriciaTrieReadingHelper, starting from the node array
// at dicNode->getChildrenPos(); the helper is responsible for walking siblings and advancing
// until isEnd() becomes true. Nothing is pushed when dicNode reports that it has no children.
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
        DicNodeVector *const childDicNodes) const {
    if (!dicNode->hasChildren()) {
        return;
    }
    DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
            &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos());
    // The reader is owned by the helper and is refreshed each time the helper reads a node.
    const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
    while (!readingHelper.isEnd()) {
        // Deleted nodes are still pushed, but they are reported as non-terminal.
        childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(),
                nodeReader->getChildrenPos(), nodeReader->getProbability(),
                nodeReader->isTerminal() && !nodeReader->isDeleted(),
                nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(),
                nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints());
        readingHelper.readNextSiblingNode();
    }
}
int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int nodePos, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const {
if (nodePos == NOT_A_VALID_WORD_POS) {
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// This method traverses parent nodes from the terminal by following parent pointers; thus,
// node code points are stored in the buffer in the reverse order.
int reverseCodePoints[maxCodePointCount];
int mergedNodeCodePoints[maxCodePointCount];
int codePointCount = 0;
DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
// First, read terminal node and get its probability.
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount,
mergedNodeCodePoints);
// Store terminal node probability.
*outUnigramProbability = nodeReader.getProbability();
// Store terminal node code points to buffer in the reverse order.
for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) {
reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i];
DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
&mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
// First, read the terminal node and get its probability.
readingHelper.initWithNodePos(nodePos);
if (!readingHelper.isValidTerminalNode()) {
// Node at the nodePos is not a valid terminal node.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Then, follow parent pos toward the root node.
while (nodeReader.getParentPos() != NOT_A_DICT_POS) {
// codePointCount must be incremented at least once in each iteration to ensure preventing
// infinite loop.
if (nodeReader.isDeleted() || codePointCount > maxCodePointCount
|| nodeReader.getCodePointCount() <= 0) {
// Store terminal node probability.
*outUnigramProbability = readingHelper.getNodeReader()->getProbability();
// Then, following parent node link to the dictionary root and fetch node code points.
while (!readingHelper.isEnd()) {
if (readingHelper.getTotalCodePointCount() > maxCodePointCount) {
// The nodePos is not a valid terminal node position in the dictionary.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Read parent node.
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(),
maxCodePointCount, mergedNodeCodePoints);
// Store node code points to buffer in the reverse order.
for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) {
reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i];
}
readingHelper.fetchMergedNodeCodePointsInReverseOrder(
readingHelper.getPrevTotalCodePointCount(), reverseCodePoints);
// Follow parent node toward the root node.
readingHelper.readParentNode();
}
if (readingHelper.isError()) {
// The node position or the dictionary is invalid.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Reverse the stored code points to output them.
const int codePointCount = readingHelper.getTotalCodePointCount();
for (int i = 0; i < codePointCount; ++i) {
outCodePoints[i] = reverseCodePoints[codePointCount - i - 1];
}
......@@ -121,73 +94,39 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
for (int i = 0; i < length; ++i) {
searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
}
int mergedNodeCodePoints[MAX_WORD_LENGTH];
int currentLength = 0;
int pos = getRootPosition();
DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
while (currentLength < length) {
// When foundMatchedNode becomes true, currentLength is increased at least once.
bool foundMatchedNode = false;
int totalChildCount = 0;
do {
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &pos);
totalChildCount += childCount;
if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) {
// Invalid dictionary.
AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d",
childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP);
ASSERT(false);
DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
&mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
readingHelper.initWithNodeArrayPos(getRootPosition());
const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
while (!readingHelper.isEnd()) {
const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount();
if (readingHelper.getTotalCodePointCount() > length
|| !readingHelper.isMatchedCodePoint(0 /* index */,
searchCodePoints[matchedCodePointCount])) {
// Current node has too many code points or its first code point is different from
// target code point. Skip this node and read the next sibling node.
readingHelper.readNextSiblingNode();
continue;
}
// Check following merged node code points.
const int nodeCodePointCount = nodeReader->getCodePointCount();
for (int j = 1; j < nodeCodePointCount; ++j) {
if (!readingHelper.isMatchedCodePoint(
j, searchCodePoints[matchedCodePointCount + j])) {
// Different code point is found. The given word is not included in the dictionary.
return NOT_A_VALID_WORD_POS;
}
for (int i = 0; i < childCount; i++) {
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH,
mergedNodeCodePoints);
const int nodeCodePointCount = nodeReader.getCodePointCount();
if (nodeReader.isDeleted() || nodeCodePointCount <= 0
|| currentLength + nodeCodePointCount > length) {
// Skip deleted or empty node.
pos = nodeReader.getSiblingNodePos();
continue;
}
bool matched = true;
for (int j = 0; j < nodeCodePointCount; ++j) {
if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) {
// Different code point is found.
matched = false;
break;
}
}
if (matched) {
currentLength += nodeCodePointCount;
if (length == currentLength) {
// Terminal position is found.
return nodeReader.getNodePos();
}
if (!nodeReader.hasChildren()) {
return NOT_A_VALID_WORD_POS;
}
foundMatchedNode = true;
// Advance to the children nodes.
pos = nodeReader.getChildrenPos();
break;
}
// Try next sibling node.
pos = nodeReader.getSiblingNodePos();
}
if (foundMatchedNode) {
break;
}
// If the matched node is not found in the current PtNode array, try to follow the
// forward link.
pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(
mDictRoot, pos);
} while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos));
if (!foundMatchedNode) {
// Matched node is not found.
}
// All characters are matched.
if (length == readingHelper.getTotalCodePointCount()) {
// Terminal position is found.
return nodeReader->getNodePos();
}
if (!nodeReader->hasChildren()) {
return NOT_A_VALID_WORD_POS;
}
// Advance to the children nodes.
readingHelper.readChildNode();
}
// If we already traversed the tree further than the word is long, that means
// there was no match (or we would have found it).
......
......@@ -87,7 +87,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
const MmappedBuffer *const mBuffer;
const ExtendableBuffer mExtendableBuffer;
......
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
namespace latinime {
// To avoid infinite loop caused by invalid or malicious forward links.
const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
// Reads the size of the node array located at mPos, advances mPos past the size field and
// processes empty node arrays by following their forward link. Nodes and node arrays are counted
// up in this method to avoid an infinite loop.
//
// On an invalid dictionary (negative node count, or either counter exceeding its limit) the
// helper is put into the error state: mIsError is set and mPos becomes NOT_A_DICT_POS.
void DynamicPatriciaTrieReadingHelper::nextNodeArray() {
    // Positions at or beyond mOriginalDictSize are read from the extendable buffer, using a
    // position relative to the head of that buffer.
    const bool usesAdditionalBuffer = mPos >= mOriginalDictSize;
    const uint8_t *const dictBuf = (usesAdditionalBuffer)
            ? mExtendableBuffer->getBuffer() : mDictRoot;
    if (usesAdditionalBuffer) {
        mPos -= mOriginalDictSize;
    }
    mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf,
            &mPos);
    if (usesAdditionalBuffer) {
        // Convert the advanced position back to the combined position space.
        mPos += mOriginalDictSize;
    }
    // Count up nodes and node arrays to avoid infinite loop.
    mTotalNodeCount += mNodeCount;
    mNodeArrayCount++;
    if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
            || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
        // Invalid dictionary.
        // Adjacent string literals are concatenated, so the first one must end with a separator.
        AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d, "
                "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
                mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
                mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
        ASSERT(false);
        mIsError = true;
        mPos = NOT_A_DICT_POS;
        return;
    }
    if (mNodeCount == 0) {
        // Empty node array. Try following forward link.
        followForwardLink();
    }
}
// Follow the forward link and read the next node array if exists.
void DynamicPatriciaTrieReadingHelper::followForwardLink() {
const bool usesAdditionalBuffer = mPos >= mOriginalDictSize;
const uint8_t *const dictBuf = (usesAdditionalBuffer)
? mExtendableBuffer->getBuffer() : mDictRoot;
if (usesAdditionalBuffer) {
mPos -= mOriginalDictSize;
}
const int forwardLinkPosition =
DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos);
if (usesAdditionalBuffer) {
mPos += mOriginalDictSize;
}
if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) {
// Follow the forward link.
mPos = forwardLinkPosition;
nextNodeArray();
} else {
// All node arrays have been read.
mPos = NOT_A_DICT_POS;
}
}
} // namespace latinime
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
#include "defines.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h"
namespace latinime {
/*
 * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and
 * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop.
 *
 * Typical usage: call one of the initWith*() methods, then loop while !isEnd(), inspecting the
 * current node through getNodeReader() and moving with readNextSiblingNode(), readChildNode() or
 * readParentNode(). After the loop, isError() tells whether reading stopped because of an
 * invalid dictionary.
 */
class DynamicPatriciaTrieReadingHelper {
 public:
    DynamicPatriciaTrieReadingHelper(const uint8_t *const dictRoot, const int originalDictSize,
            const ExtendableBuffer *const extendableBuffer,
            const DictionaryBigramsStructurePolicy *const bigramsPolicy,
            const DictionaryShortcutsStructurePolicy *const shortcutsPolicy)
            : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0),
              mTotalNodeCount(0), mNodeArrayCount(0), mDictRoot(dictRoot),
              mOriginalDictSize(originalDictSize), mExtendableBuffer(extendableBuffer),
              mNodeReader(mDictRoot, mOriginalDictSize, mExtendableBuffer, bigramsPolicy,
                      shortcutsPolicy) {}

    ~DynamicPatriciaTrieReadingHelper() {}

    // Returns true when reading was aborted because invalid data was found.
    AK_FORCE_INLINE bool isError() const {
        return mIsError;
    }

    // Returns true when there is no current node, either because all nodes have been read or
    // because an error occurred.
    AK_FORCE_INLINE bool isEnd() const {
        return mPos == NOT_A_DICT_POS;
    }

    // Initialize reading state with the head position of a node array.
    // The state is always reset, so that a reused helper never reports an error or counters left
    // over from a previous traversal. With NOT_A_DICT_POS the helper ends up in the end state.
    AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) {
        resetReadingState(0 /* initialCount */);
        if (nodeArrayPos == NOT_A_DICT_POS) {
            mPos = NOT_A_DICT_POS;
            return;
        }
        mPos = nodeArrayPos;
        nextNodeArray();
        if (!isEnd()) {
            fetchNodeInfo();
        }
    }

    // Initialize reading state with the head position of a node.
    // The state is always reset, so that a reused helper never reports an error or counters left
    // over from a previous traversal. With an invalid position the helper ends up in the end
    // state.
    AK_FORCE_INLINE void initWithNodePos(const int nodePos) {
        // TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS
        if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) {
            resetReadingState(0 /* initialCount */);
            mPos = NOT_A_DICT_POS;
            return;
        }
        // A single node is being read directly; count it as one node in one node array.
        resetReadingState(1 /* initialCount */);
        mPos = nodePos;
        fetchNodeInfo();
    }

    // Returns the reader holding the information of the last read node. The reader is owned by
    // this helper and is updated whenever the helper reads another node.
    AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const {
        return &mNodeReader;
    }

    // Returns true when there is a current node and it is a terminal that is not deleted.
    AK_FORCE_INLINE bool isValidTerminalNode() const {
        return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal();
    }

    // Compares the index-th code point of the last read node with codePoint.
    // The index is not range checked here; the caller has to pass a valid index.
    AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const {
        return mMergedNodeCodePoints[index] == codePoint;
    }

    // Return code point count exclude the last read node's code points.
    AK_FORCE_INLINE int getPrevTotalCodePointCount() const {
        return mPrevTotalCodePointCount;
    }

    // Return code point count include the last read node's code points.
    AK_FORCE_INLINE int getTotalCodePointCount() const {
        return mPrevTotalCodePointCount + mNodeReader.getCodePointCount();
    }

    // Copies the code points of the last read node into outCodePoints in reverse order, starting
    // at outCodePoints[index]. The output range is not checked here; the caller has to ensure
    // that outCodePoints can hold index + (node code point count) elements.
    AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(
            const int index, int *const outCodePoints) const {
        const int nodeCodePointCount = mNodeReader.getCodePointCount();
        for (int i = 0; i < nodeCodePointCount; ++i) {
            outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i];
        }
    }

    // Returns the buffer holding the code points of the last read node.
    AK_FORCE_INLINE const int *getMergedNodeCodePoints() const {
        return mMergedNodeCodePoints;
    }

    // Read the next sibling of the current node. When the current node array is exhausted, the
    // forward link is followed to the next node array if there is one.
    AK_FORCE_INLINE void readNextSiblingNode() {
        mNodeCount -= 1;
        mPos = mNodeReader.getSiblingNodePos();
        if (mNodeCount <= 0) {
            // All nodes in the current node array have been read.
            followForwardLink();
            if (!isEnd()) {
                fetchNodeInfo();
            }
        } else {
            fetchNodeInfo();
        }
    }

    // Read the first child node of the current node.
    // Reading ends (isEnd() becomes true) when the current node has no children.
    AK_FORCE_INLINE void readChildNode() {
        if (mNodeReader.hasChildren()) {
            mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
            // Counters used for the infinite loop guard restart for each children node array.
            mTotalNodeCount = 0;
            mNodeArrayCount = 0;
            mPos = mNodeReader.getChildrenPos();
            // Read children node array.
            nextNodeArray();
            if (!isEnd()) {
                fetchNodeInfo();
            }
        } else {
            mPos = NOT_A_DICT_POS;
        }
    }

    // Read the parent node of the current node.
    // Reading ends (isEnd() becomes true) when the current node has no parent position.
    AK_FORCE_INLINE void readParentNode() {
        if (mNodeReader.getParentPos() != NOT_A_DICT_POS) {
            mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
            mTotalNodeCount = 1;
            mNodeArrayCount = 1;
            mNodeCount = 1;
            mPos = mNodeReader.getParentPos();
            fetchNodeInfo();
        } else {
            mPos = NOT_A_DICT_POS;
        }
    }

 private:
    DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper);

    static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
    static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP;

    // Set when invalid data has been found while reading.
    bool mIsError;
    // Current reading position, or NOT_A_DICT_POS when reading has ended.
    int mPos;
    // Node count of a node array.
    int mNodeCount;
    // Sum of the code point counts of the nodes passed through before the current node.
    int mPrevTotalCodePointCount;
    // Counters used to detect infinite loops.
    int mTotalNodeCount;
    int mNodeArrayCount;
    const uint8_t *const mDictRoot;
    const int mOriginalDictSize;
    const ExtendableBuffer *const mExtendableBuffer;
    DynamicPatriciaTrieNodeReader mNodeReader;
    // Code points of the last read node.
    int mMergedNodeCodePoints[MAX_WORD_LENGTH];

    void nextNodeArray();

    void followForwardLink();

    // Clears the error flag and the accumulated code point count, and sets the node and node
    // array counters to initialCount. mPos is left for the caller to set.
    AK_FORCE_INLINE void resetReadingState(const int initialCount) {
        mIsError = false;
        mNodeCount = initialCount;
        mPrevTotalCodePointCount = 0;
        mTotalNodeCount = initialCount;
        mNodeArrayCount = initialCount;
    }

    // Reads the node at mPos into mNodeReader and mMergedNodeCodePoints. A node without code
    // points is treated as an error and ends reading.
    AK_FORCE_INLINE void fetchNodeInfo() {
        mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH,
                mMergedNodeCodePoints);
        if (mNodeReader.getCodePointCount() <= 0) {
            // Empty node is not allowed.
            mIsError = true;
            mPos = NOT_A_DICT_POS;
        }
    }
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment