Skip to content
Snippets Groups Projects
Commit e3d57ae7 authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Use bitset for BloomFilter."

parents bbefa8c8 2fd0bf9a
No related branches found
No related tags found
No related merge requests found
......@@ -27,7 +27,6 @@ LATIN_IME_CORE_SRC_FILES := \
dic_nodes_cache.cpp) \
$(addprefix suggest/core/dictionary/, \
bigram_dictionary.cpp \
bloom_filter.cpp \
dictionary.cpp \
digraph_utils.cpp \
error_type_utils.cpp \
......
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "suggest/core/dictionary/bloom_filter.h"
namespace latinime {
// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
// prime under 128 * 8.
const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;
} // namespace latinime
......@@ -17,8 +17,7 @@
#ifndef LATINIME_BLOOM_FILTER_H
#define LATINIME_BLOOM_FILTER_H
#include <cstdint>
#include <cstring>
#include <bitset>
#include "defines.h"
......@@ -34,41 +33,37 @@ namespace latinime {
// Total 148603.14 (sum of others 148579.90)
class BloomFilter {
public:
BloomFilter() {
ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
memset(mFilter, 0, sizeof(mFilter));
}
BloomFilter() : mFilter() {}
// TODO: uint32_t position
AK_FORCE_INLINE void setInFilter(const int32_t position) {
const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
AK_FORCE_INLINE void setInFilter(const int position) {
mFilter.set(getIndex(position));
}
// TODO: uint32_t position
AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
AK_FORCE_INLINE bool isInFilter(const int position) const {
return mFilter.test(getIndex(position));
}
private:
DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
// Size, in bytes, of the bloom filter index for bigrams
// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
AK_FORCE_INLINE size_t getIndex(const int position) const {
return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
}
// Size, in bits, of the bloom filter index for bigrams
// The probability of false positive is (1 - e ** (-kn/m))**k,
// where k is the number of hash functions, n the number of bigrams, and m the number of
// bits we can test.
// At the moment 100 is the maximum number of bigrams for a word with the current
// At the moment 100 is the maximum number of bigrams for a word with the current main
// dictionaries, so n = 100. 1024 buckets give us m = 1024.
// With 1 hash function, our false positive rate is about 9.3%, which should be enough for
// our uses since we are only using this to increase average performance. For the record,
// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
// and m = 4096 gives 2.4%.
// This is assigned here because it is used for array size.
static const int BIGRAM_FILTER_BYTE_SIZE = 128;
static const int BIGRAM_FILTER_MODULO;
uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
// This is assigned here because it is used for bitset size.
// 1021 is the largest prime under 1024.
static const size_t BIGRAM_FILTER_MODULO = 1021;
std::bitset<BIGRAM_FILTER_MODULO> mFilter;
};
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment