Skip to content
Snippets Groups Projects
Commit 0adf7ae2 authored by Jean Chalard's avatar Jean Chalard Committed by Android (Google) Code Review
Browse files

Merge "New dict format, step 7"

parents e0d3922b 1059f273
No related branches found
No related tags found
No related merge requests found
/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_BINARY_FORMAT_H
#define LATINIME_BINARY_FORMAT_H
namespace latinime {
class BinaryFormat {
private:
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
public:
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
static int skipAttributes(const uint8_t* const dict, const int pos);
static int skipChildrenPosition(const uint8_t flags, const int pos);
static int skipFrequency(const uint8_t flags, const int pos);
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos);
static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
static bool hasChildrenInFlags(const uint8_t flags);
static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
int *pos);
};
inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
return dict[(*pos)++];
}
inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
return dict[(*pos)++];
}
inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
const int origin = *pos;
const int32_t character = dict[origin];
if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
if (character == CHARACTER_ARRAY_TERMINATOR) {
*pos = origin + 1;
return NOT_A_CHARACTER;
} else {
*pos = origin + 3;
const int32_t char_1 = character << 16;
const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
return char_2 + dict[origin + 2];
}
} else {
*pos = origin + 1;
return character;
}
}
inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
const int pos) {
return dict[pos];
}
inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
int currentPos = pos;
int32_t character = dict[currentPos++];
while (CHARACTER_ARRAY_TERMINATOR != character) {
if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
}
character = dict[currentPos++];
}
return currentPos;
}
static inline int attributeAddressSize(const uint8_t flags) {
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
/* Note: this is a value-dependant optimization of what may probably be
more readably written this way:
switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
default: return 0;
}
*/
}
inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
int currentPos = pos;
uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
flags = getFlagsAndForwardPointer(dict, &currentPos);
}
currentPos += attributeAddressSize(flags);
return currentPos;
}
static inline int childrenAddressSize(const uint8_t flags) {
static const int CHILDREN_ADDRESS_SHIFT = 6;
return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
/* See the note in attributeAddressSize. The same applies here */
}
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
return pos + childrenAddressSize(flags);
}
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos) {
// This function skips all attributes. The format makes provision for future extension
// with other attributes (notably shortcuts) but for the time being, bigrams are the
// only attributes that may be found in a character group, so we only look at bigrams
// in this version.
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
return skipAttributes(dict, pos);
} else {
return pos;
}
}
inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
const uint8_t flags, const int pos) {
int currentPos = pos;
currentPos = skipChildrenPosition(flags, currentPos);
currentPos = skipAllAttributes(dict, flags, currentPos);
return currentPos;
}
inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
const int pos) {
int offset = 0;
switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
offset = dict[pos];
break;
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
offset = dict[pos] << 8;
offset += dict[pos + 1];
break;
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
offset = dict[pos] << 16;
offset += dict[pos + 1] << 8;
offset += dict[pos + 2];
break;
default:
// If we come here, it means we asked for the children of a word with
// no children.
return -1;
}
return pos + offset;
}
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
!= (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
}
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
const uint8_t flags, int *pos) {
int offset = 0;
const int origin = *pos;
switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
offset = dict[origin];
*pos = origin + 1;
break;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
offset = dict[origin] << 8;
offset += dict[origin + 1];
*pos = origin + 2;
break;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
offset = dict[origin] << 16;
offset += dict[origin + 1] << 8;
offset += dict[origin + 2];
*pos = origin + 3;
break;
}
if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
return origin - offset;
} else {
return origin + offset;
}
}
} // namespace latinime
#endif // LATINIME_BINARY_FORMAT_H
......@@ -126,8 +126,11 @@ static void prof_out(void) {
#define FLAG_BIGRAM_FREQ 0x7F
#define DICTIONARY_VERSION_MIN 200
// TODO: remove this constant when the switch to the new dict format is over
#define DICTIONARY_HEADER_SIZE 2
#define NEW_DICTIONARY_HEADER_SIZE 5
#define NOT_VALID_WORD -99
#define NOT_A_CHARACTER -1
#define KEYCODE_SPACE ' '
......
This diff is collapsed.
......@@ -36,10 +36,51 @@ class UnigramDictionary {
} ProximityType;
public:
#ifdef NEW_DICTIONARY_FORMAT
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
// Flag for single/multiple char group
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
// Flag for terminal groups
static const int FLAG_IS_TERMINAL = 0x10;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
// Flag for sign of offset. If this flag is set, the offset value must be negated.
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
// Mask and flags for attribute address type selection.
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
#endif // NEW_DICTIONARY_FORMAT
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion);
#ifndef NEW_DICTIONARY_FORMAT
bool isValidWord(unsigned short *word, int length);
#else // NEW_DICTIONARY_FORMAT
bool isValidWord(const uint16_t* const inWord, const int length) const;
int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
int maxAlternatives);
#endif // NEW_DICTIONARY_FORMAT
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
......@@ -92,6 +133,7 @@ private:
}
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
unsigned short *word);
#ifndef NEW_DICTIONARY_FORMAT
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
......@@ -104,6 +146,11 @@ private:
bool processCurrentNodeForExactMatch(const int firstChildPos,
const int startInputIndex, const int depth, unsigned short *word,
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
#else // NEW_DICTIONARY_FORMAT
int getFrequency(const uint16_t* const inWord, const int length) const;
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
short unsigned int* outWord);
#endif // NEW_DICTIONARY_FORMAT
const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment