From 49ba135fdedb3c6b33ec915e91ecad682b7655b8 Mon Sep 17 00:00:00 2001
From: Jean Chalard <jchalard@google.com>
Date: Mon, 7 May 2012 20:14:00 +0900
Subject: [PATCH] Perform the actual bigram frequency lookup.

This still returns the unigram frequency, because the values stored
for bigrams in the dictionary are not ready to be returned in-place
instead of unigram values. Aside from this, the code is complete.

Bug: 6313806
Change-Id: If7bb7b644730782277f0f6663334c170b7fe13fb
---
 native/jni/src/bigram_dictionary.cpp  | 13 ++-------
 native/jni/src/binary_format.h        | 20 +++++++++-----
 native/jni/src/bloom_filter.h         | 38 +++++++++++++++++++++++++++
 native/jni/src/unigram_dictionary.cpp |  3 ++-
 4 files changed, 55 insertions(+), 19 deletions(-)
 create mode 100644 native/jni/src/bloom_filter.h

diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index 220b340d13..07031086c9 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -20,8 +20,9 @@
 #define LOG_TAG "LatinIME: bigram_dictionary.cpp"
 
 #include "bigram_dictionary.h"
-#include "dictionary.h"
 #include "binary_format.h"
+#include "bloom_filter.h"
+#include "dictionary.h"
 
 namespace latinime {
 
@@ -153,16 +154,6 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
     return pos;
 }
 
-static inline void setInFilter(uint8_t *filter, const int position) {
-    const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
-    filter[bucket >> 3] |= (1 << (bucket & 0x7));
-}
-
-static inline bool isInFilter(uint8_t *filter, const int position) {
-    const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
-    return filter[bucket >> 3] & (1 << (bucket & 0x7));
-}
-
 void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
         const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
     memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 71ade48a35..b87593ca97 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -18,6 +18,7 @@
 #define LATINIME_BINARY_FORMAT_H
 
 #include <limits>
+#include "bloom_filter.h"
 #include "unigram_dictionary.h"
 
 namespace latinime {
@@ -66,8 +67,8 @@ class BinaryFormat {
             const int length);
     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
             uint16_t* outWord);
-    static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
-            const int unigramFreq);
+    static int getProbability(const int position, const std::map<int, int> *bigramMap,
+            const uint8_t *bigramFilter, const int unigramFreq);
 
     // Flags for special processing
     // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -520,13 +521,18 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
 }
 
 // This should probably return a probability in log space.
-inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap,
+inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
         const uint8_t *bigramFilter, const int unigramFreq) {
-    // TODO: use the bigram filter for fast rejection, then the bigram map for lookup
-    // to get the bigram probability. If the bigram is not found, use the unigram frequency.
-    // Don't forget that they can be null.
+    if (!bigramMap || !bigramFilter) return unigramFreq; // Either may be null: use the unigram.
+    if (!isInFilter(bigramFilter, position)) return unigramFreq; // Filter miss: skip the lookup.
+    const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
+    if (bigramFreq != bigramMap->end()) { // The map holds an entry keyed by this position.
+        // TODO: return bigramFreq->second; for now the unigram frequency is returned instead
+        return unigramFreq;
+    } else { // The filter matched but the map has no entry for this position.
+        return unigramFreq;
+    }
     // TODO: if the unigram frequency is used, compute the actual probability
-    return unigramFreq;
 }
 
 } // namespace latinime
diff --git a/native/jni/src/bloom_filter.h b/native/jni/src/bloom_filter.h
new file mode 100644
index 0000000000..7ae6a1fa44
--- /dev/null
+++ b/native/jni/src/bloom_filter.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BLOOM_FILTER_H
+#define LATINIME_BLOOM_FILTER_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+static inline void setInFilter(uint8_t *filter, const int position) {  // Marks position's bit.
+    const unsigned int bitIndex = position % BIGRAM_FILTER_MODULO;  // bit slot for position
+    filter[bitIndex / 8] |= (1 << (bitIndex % 8));  // byte = bitIndex / 8, bit = bitIndex % 8
+}
+
+static inline bool isInFilter(const uint8_t *filter, const int position) {  // Tests the bit.
+    const unsigned int bitIndex = position % BIGRAM_FILTER_MODULO;  // bit slot for position
+    return (filter[bitIndex / 8] & (1 << (bitIndex % 8))) != 0;  // true iff that bit is set
+}
+
+} // namespace latinime
+
+#endif // LATINIME_BLOOM_FILTER_H
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index 2e5468dd7c..9234b1b52d 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -851,7 +851,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
         TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
         // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
         // bigramFilter is a bloom filter of said frequencies for even faster rejection.
-        const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq);
+        const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
+                unigramFreq);
         onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
                 currentWordIndex);
 
-- 
GitLab