Skip to content
Snippets Groups Projects
Commit e9a86e2c authored by Jean Chalard
Browse files

Search bigrams for the lower case version of the word (A46)

...if there aren't any for the exact case version.

Bug: 6752830
Change-Id: I2737148b01ba04a64febe009ceb2ef53c265d224
parent ac3bd961
No related branches found
No related tags found
No related merge requests found
......@@ -177,19 +177,9 @@ public class Suggest {
if (wordComposer.size() <= 1 && isCorrectionEnabled) {
// At first character typed, search only the bigrams
if (!TextUtils.isEmpty(prevWordForBigram)) {
final CharSequence lowerPrevWord;
if (StringUtils.hasUpperCase(prevWordForBigram)) {
// TODO: Must pay attention to locale when changing case.
lowerPrevWord = prevWordForBigram.toString().toLowerCase();
} else {
lowerPrevWord = null;
}
for (final String key : mDictionaries.keySet()) {
final Dictionary dictionary = mDictionaries.get(key);
suggestionsSet.addAll(dictionary.getBigrams(wordComposer, prevWordForBigram));
if (null != lowerPrevWord) {
suggestionsSet.addAll(dictionary.getBigrams(wordComposer, lowerPrevWord));
}
}
}
} else if (wordComposer.size() > 1) {
......
......@@ -98,11 +98,11 @@ public class UserHistoryDictionaryBigramList {
}
public HashMap<String, Byte> getBigrams(String word1) {
if (!mBigramMap.containsKey(word1)) {
return EMPTY_BIGRAM_MAP;
} else {
return mBigramMap.get(word1);
}
if (mBigramMap.containsKey(word1)) return mBigramMap.get(word1);
// TODO: lower case according to locale
final String lowerWord1 = word1.toLowerCase();
if (mBigramMap.containsKey(lowerWord1)) return mBigramMap.get(lowerWord1);
return EMPTY_BIGRAM_MAP;
}
public boolean removeBigram(String word1, String word2) {
......
......@@ -105,8 +105,15 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
const uint8_t* const root = DICT;
int pos = getBigramListPositionForWord(prevWord, prevWordLength);
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
if (0 == pos) {
// If no bigrams for this exact word, search again in lower case.
pos = getBigramListPositionForWord(prevWord, prevWordLength,
true /* forceLowerCaseSearch */);
}
// If still no bigrams, we really don't have them!
if (0 == pos) return 0;
int bigramFlags;
int bigramCount = 0;
......@@ -141,10 +148,11 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
// Returns a pointer to the start of the bigram list.
// If the word is not found or has no bigrams, this function returns 0.
int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
const int prevWordLength) const {
const int prevWordLength, const bool forceLowerCaseSearch) const {
if (0 >= prevWordLength) return 0;
const uint8_t* const root = DICT;
int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength);
int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
forceLowerCaseSearch);
if (NOT_VALID_WORD == pos) return 0;
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
......@@ -164,7 +172,13 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
const uint8_t* const root = DICT;
int pos = getBigramListPositionForWord(prevWord, prevWordLength);
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
false /* forceLowerCaseSearch */);
if (0 == pos) {
// If no bigrams for this exact string, search again in lower case.
pos = getBigramListPositionForWord(prevWord, prevWordLength,
true /* forceLowerCaseSearch */);
}
if (0 == pos) return;
int bigramFlags;
......@@ -197,10 +211,11 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes
bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
int length2) const {
const uint8_t* const root = DICT;
int pos = getBigramListPositionForWord(word1, length1);
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
if (0 == pos) return false;
int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2);
int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
false /* forceLowerCaseSearch */);
if (NOT_VALID_WORD == nextWordPos) return false;
int bigramFlags;
do {
......
......@@ -30,7 +30,8 @@ class BigramDictionary {
BigramDictionary(const unsigned char *dict, int maxWordLength);
int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) const;
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength) const;
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
const bool forceLowerCaseSearch) const;
void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
std::map<int, int> *map, uint8_t *filter) const;
bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
......
......@@ -19,6 +19,7 @@
#include <limits>
#include "bloom_filter.h"
#include "char_utils.h"
#include "unigram_dictionary.h"
namespace latinime {
......@@ -65,7 +66,7 @@ class BinaryFormat {
static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
int *pos);
static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
const int length);
const int length, const bool forceLowerCaseSearch);
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
uint16_t* outWord, int* outUnigramFrequency);
static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
......@@ -309,7 +310,7 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
// This function gets the byte position of the last chargroup of the exact matching word in the
// dictionary. If no match is found, it returns NOT_VALID_WORD.
inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
const int32_t* const inWord, const int length) {
const int32_t* const inWord, const int length, const bool forceLowerCaseSearch) {
int pos = 0;
int wordPos = 0;
......@@ -318,7 +319,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
// there was no match (or we would have found it).
if (wordPos > length) return NOT_VALID_WORD;
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
const int32_t wChar = inWord[wordPos];
const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
while (true) {
// If there are no more character groups in this node, it means we could not
// find a matching character for this depth, therefore there is no match.
......
......@@ -50,8 +50,7 @@ inline static unsigned short toBaseChar(unsigned short c) {
return c;
}
inline static unsigned short toBaseLowerCase(unsigned short c) {
c = toBaseChar(c);
inline static unsigned short toLowerCase(const unsigned short c) {
if (isAsciiUpper(c)) {
return toAsciiLower(c);
} else if (isAscii(c)) {
......@@ -60,6 +59,10 @@ inline static unsigned short toBaseLowerCase(unsigned short c) {
return latin_tolower(c);
}
inline static unsigned short toBaseLowerCase(const unsigned short c) {
return toLowerCase(toBaseChar(c));
}
} // namespace latinime
#endif // LATINIME_CHAR_UTILS_H
......@@ -817,7 +817,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const {
const uint8_t* const root = DICT_ROOT;
int pos = BinaryFormat::getTerminalPosition(root, inWord, length);
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
false /* forceLowerCaseSearch */);
if (NOT_VALID_WORD == pos) {
return NOT_A_PROBABILITY;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment