From cfb018ba6db78f2b33b54d4811f0bf166db29792 Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi <ksk@google.com>
Date: Mon, 7 Oct 2013 17:05:24 +0900
Subject: [PATCH] Fix bug and Add large test for decaying dictionary.

- GC gets failure when the dictionary become empty.
- Useless unigrams are sometimes not removed.

Bug: 10197478
Change-Id: I8d1479c01efba61a81f03bc077da6bcb4797a940
---
 .../inputmethod/latin/BinaryDictionary.java   |  4 ++
 ...namic_patricia_trie_gc_event_listeners.cpp |  2 +-
 ...dynamic_patricia_trie_gc_event_listeners.h |  1 +
 .../dynamic_patricia_trie_policy.cpp          | 10 ++++
 .../dictionary/dynamic_patricia_trie_policy.h |  2 +
 .../dynamic_patricia_trie_reading_helper.cpp  |  6 +++
 .../dynamic_patricia_trie_reading_helper.h    |  4 +-
 .../utils/forgetting_curve_utils.cpp          |  3 +-
 .../latin/BinaryDictionaryDecayingTests.java  | 54 +++++++++++++++++++
 9 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
index 541e69788b..fd296988e6 100644
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@@ -52,6 +52,10 @@ public final class BinaryDictionary extends Dictionary {
     public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
     @UsedForTesting
     public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+    @UsedForTesting
+    public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+    @UsedForTesting
+    public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
 
     private long mNativeDict;
     private final Locale mLocale;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
index a17a0acf61..5724c5d88e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -39,7 +39,7 @@ bool DynamicPatriciaTrieGcEventListeners
             return false;
         }
         if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
-            isUselessPtNode = false;
+            isUselessPtNode = true;
         }
     }
     if (mChildrenValue > 0) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
index 3ca2f2a011..9755120b0a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
@@ -60,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners {
 
         bool onDescend(const int ptNodeArrayPos) {
             mValueStack.push_back(0);
+            mChildrenValue = 0;
             return true;
         }
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
index 31e3fb42fb..3d07c9d6c0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
@@ -37,6 +37,8 @@ namespace latinime {
 // BinaryDictionaryDecayingTests.
 const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
 const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
 const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
         "SET_NEEDS_TO_DECAY_FOR_TESTING";
 const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
@@ -355,6 +357,14 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const
         snprintf(outResult, maxResultLength, "%d", mUnigramCount);
     } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
         snprintf(outResult, maxResultLength, "%d", mBigramCount);
+    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d",
+                mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
+                        DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE);
+    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d",
+                mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
+                        DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE);
     } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) {
         mNeedsToDecayForTesting = true;
     }
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
index 903f65e8ec..be97ee1a5e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
@@ -102,6 +102,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
 
     static const char *const UNIGRAM_COUNT_QUERY;
     static const char *const BIGRAM_COUNT_QUERY;
+    static const char *const MAX_UNIGRAM_COUNT_QUERY;
+    static const char *const MAX_BIGRAM_COUNT_QUERY;
     static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
     static const int MAX_DICT_EXTENDED_REGION_SIZE;
     static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
index 601ee663b3..f108c219f8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
@@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor
     if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
         return false;
     }
+    if (isEnd()) {
+        // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
+        if (!listener->onReadingPtNodeArrayTail()) {
+            return false;
+        }
+    }
     pushReadingStateToStack();
     while (!isEnd()) {
         if (alreadyVisitedAllPtNodesInArray) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
index 512a4d8183..a71c069714 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
@@ -279,7 +279,9 @@ class DynamicPatriciaTrieReadingHelper {
         } else {
             mReadingState = mReadingStateStack.back();
             mReadingStateStack.pop_back();
-            fetchPtNodeInfo();
+            if (!isEnd()) {
+                fetchPtNodeInfo();
+            }
         }
     }
 };
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
index 19ca354810..1632fd072a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -93,8 +93,7 @@ void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
     for (int i = 0; i < decayIterationCount; ++i) {
         const float currentRate = static_cast<float>(currentEncodedProbability)
                 / static_cast<float>(MAX_ENCODED_PROBABILITY);
-        const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY
-                + (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
+        const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
         const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
         if (thresholdToDecay < randValue) {
             currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
index ded8eaa973..cecdd2ffb9 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
@@ -19,13 +19,16 @@ package com.android.inputmethod.latin;
 import android.test.AndroidTestCase;
 import android.test.suitebuilder.annotation.LargeTest;
 
+import com.android.inputmethod.latin.makedict.CodePointUtils;
 import com.android.inputmethod.latin.makedict.FormatSpec;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Random;
 
 @LargeTest
 public class BinaryDictionaryDecayingTests extends AndroidTestCase {
@@ -179,4 +182,55 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
         binaryDictionary.close();
         dictFile.delete();
     }
+
+    public void testAddManyUnigramsToDecayingDict() {
+        final int unigramCount = 30000;
+        final int unigramTypedCount = 100000;
+        final int codePointSetSize = 50;
+        final long seed = System.currentTimeMillis();
+        final Random random = new Random(seed);
+
+        File dictFile = null;
+        try {
+            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        }
+        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
+                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
+                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+        final ArrayList<String> words = new ArrayList<String>();
+
+        for (int i = 0; i < unigramCount; i++) {
+            final String word = CodePointUtils.generateWord(random, codePointSet);
+            words.add(word);
+        }
+
+        final int maxUnigramCount = Integer.parseInt(
+                binaryDictionary.getPropertyForTests(BinaryDictionary.MAX_UNIGRAM_COUNT_QUERY));
+        for (int i = 0; i < unigramTypedCount; i++) {
+            final String word = words.get(random.nextInt(words.size()));
+            binaryDictionary.addUnigramWord(word, DUMMY_PROBABILITY);
+
+            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
+                final int unigramCountBeforeGC =
+                        Integer.parseInt(binaryDictionary.getPropertyForTests(
+                                BinaryDictionary.UNIGRAM_COUNT_QUERY));
+                while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
+                    binaryDictionary.flushWithGC();
+                }
+                final int unigramCountAfterGC =
+                        Integer.parseInt(binaryDictionary.getPropertyForTests(
+                                BinaryDictionary.UNIGRAM_COUNT_QUERY));
+                assertTrue(unigramCountBeforeGC > unigramCountAfterGC);
+            }
+        }
+
+        assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
+                BinaryDictionary.UNIGRAM_COUNT_QUERY)) > 0);
+        assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
+                BinaryDictionary.UNIGRAM_COUNT_QUERY)) <= maxUnigramCount);
+    }
 }
-- 
GitLab