From 2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376 Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi <ksk@google.com>
Date: Wed, 9 Apr 2014 16:18:23 +0900
Subject: [PATCH] Use bitset for BloomFilter.

Before:
(0)  660.00 (1.43%)
(1)  45320.00 (98.18%)
(2)  80.00 (0.17%)
Total 46160.00 (sum of others 46060.00)

After:
(0)  620.00 (1.34%)
(1)  45310.00 (98.05%)
(2)  130.00 (0.28%)
Total 46210.00 (sum of others 46060.00)

Change-Id: I936b639c50e15208aee999a929b33983c6caa59d
---
 native/jni/NativeFileList.mk                  |  1 -
 .../suggest/core/dictionary/bloom_filter.cpp  | 25 ------------
 .../suggest/core/dictionary/bloom_filter.h    | 39 ++++++++-----------
 3 files changed, 17 insertions(+), 48 deletions(-)
 delete mode 100644 native/jni/src/suggest/core/dictionary/bloom_filter.cpp

diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk
index 3da9d56328..23b0377019 100644
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@@ -27,7 +27,6 @@ LATIN_IME_CORE_SRC_FILES := \
         dic_nodes_cache.cpp) \
     $(addprefix suggest/core/dictionary/, \
         bigram_dictionary.cpp \
-        bloom_filter.cpp \
         dictionary.cpp \
         digraph_utils.cpp \
         error_type_utils.cpp \
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp b/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
deleted file mode 100644
index 4ae474e0ca..0000000000
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/core/dictionary/bloom_filter.h"
-
-namespace latinime {
-
-// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
-// prime under 128 * 8.
-const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h
index 85b8fc397f..1e60f49ed3 100644
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.h
+++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h
@@ -17,8 +17,7 @@
 #ifndef LATINIME_BLOOM_FILTER_H
 #define LATINIME_BLOOM_FILTER_H
 
-#include <cstdint>
-#include <cstring>
+#include <bitset>
 
 #include "defines.h"
 
@@ -34,41 +33,37 @@ namespace latinime {
 //   Total 148603.14 (sum of others 148579.90)
 class BloomFilter {
  public:
-    BloomFilter() {
-        ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
-        memset(mFilter, 0, sizeof(mFilter));
-    }
+    BloomFilter() : mFilter() {}
 
-    // TODO: uint32_t position
-    AK_FORCE_INLINE void setInFilter(const int32_t position) {
-        const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
-        mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
+    AK_FORCE_INLINE void setInFilter(const int position) {
+        mFilter.set(getIndex(position));
     }
 
-    // TODO: uint32_t position
-    AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
-        const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
-        return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
+    AK_FORCE_INLINE bool isInFilter(const int position) const {
+        return mFilter.test(getIndex(position));
     }
 
  private:
     DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
 
-    // Size, in bytes, of the bloom filter index for bigrams
-    // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
+    AK_FORCE_INLINE size_t getIndex(const int position) const {
+        return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
+    }
+
+    // Size, in bits, of the bloom filter index for bigrams
+    // The probability of false positive is (1 - e ** (-kn/m))**k,
     // where k is the number of hash functions, n the number of bigrams, and m the number of
     // bits we can test.
-    // At the moment 100 is the maximum number of bigrams for a word with the current
+    // At the moment 100 is the maximum number of bigrams for a word with the current main
     // dictionaries, so n = 100. 1024 buckets give us m = 1024.
     // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
     // our uses since we are only using this to increase average performance. For the record,
     // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
     // and m = 4096 gives 2.4%.
-    // This is assigned here because it is used for array size.
-    static const int BIGRAM_FILTER_BYTE_SIZE = 128;
-    static const int BIGRAM_FILTER_MODULO;
-
-    uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
+    // This is assigned here because it is used for bitset size.
+    // 1021 is the largest prime under 1024.
+    static const size_t BIGRAM_FILTER_MODULO = 1021;
+    std::bitset<BIGRAM_FILTER_MODULO> mFilter;
 };
 } // namespace latinime
 #endif // LATINIME_BLOOM_FILTER_H
-- 
GitLab