From 78b55a31cb158b1e14ccf678133269b0f99c7f9a Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi <ksk@google.com>
Date: Tue, 17 Sep 2013 15:11:24 +0900
Subject: [PATCH] Fix handling of multi-byte characters and add a test.

Bug: 6669677

Change-Id: Id2154db47adea2929559a4187a726f9dfa83363e
---
 .../dictionary/utils/byte_array_utils.cpp     |  3 +-
 .../dictionary/utils/byte_array_utils.h       | 11 ++--
 .../latin/BinaryDictionaryTests.java          | 40 +++++++++++-
 .../BinaryDictDecoderEncoderTests.java        | 51 ++-------------
 .../latin/makedict/CodePointUtils.java        | 65 +++++++++++++++++++
 5 files changed, 119 insertions(+), 51 deletions(-)
 create mode 100644 tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java

diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp
index a84cfb9d58..1833e88326 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp
@@ -18,7 +18,8 @@
 
 namespace latinime {
 
-const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
+const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
 const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
 
 } // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
index 6bafb64eec..0c15768188 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
@@ -135,7 +135,7 @@ class ByteArrayUtils {
     static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
             const uint8_t *const buffer, int *const pos) {
         const uint8_t firstByte = readUint8(buffer, *pos);
-        if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
+        if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
             if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
                 *pos += 1;
                 return NOT_A_CODE_POINT;
@@ -187,7 +187,8 @@ class ByteArrayUtils {
             const int codePoint = codePoints[i];
             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
                 break;
-            } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
                 // three bytes character.
                 writeUint24AndAdvancePosition(buffer, codePoint, pos);
             } else {
@@ -207,7 +208,8 @@ class ByteArrayUtils {
             const int codePoint = codePoints[i];
             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
                 break;
-            } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
                 // three bytes character.
                 byteCount += 3;
             } else {
@@ -225,7 +227,8 @@ class ByteArrayUtils {
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
 
-    static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
     static const uint8_t CHARACTER_ARRAY_TERMINATOR;
 
     static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
index 6f05d428c7..501a035e72 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
@@ -19,6 +19,7 @@ package com.android.inputmethod.latin;
 import android.test.AndroidTestCase;
 import android.test.suitebuilder.annotation.LargeTest;
 
+import com.android.inputmethod.latin.makedict.CodePointUtils;
 import com.android.inputmethod.latin.makedict.DictEncoder;
 import com.android.inputmethod.latin.makedict.FormatSpec;
 import com.android.inputmethod.latin.makedict.FusionDictionary;
@@ -30,6 +31,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Locale;
+import java.util.Random;
 
 @LargeTest
 public class BinaryDictionaryTests extends AndroidTestCase {
@@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase {
 
         assertEquals(probability, binaryDictionary.getFrequency("aab"));
         assertEquals(probability, binaryDictionary.getFrequency("aac"));
-        assertEquals(probability, binaryDictionary.getFrequency("aac"));
+        assertEquals(probability, binaryDictionary.getFrequency("aa"));
         assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
         assertEquals(probability, binaryDictionary.getFrequency("a"));
         assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
+
+        dictFile.delete();
+    }
+
+    public void testRandomlyAddUnigramWord() {
+        final int wordCount = 1000;
+        final int codePointSetSize = 50;
+        final int seed = 123456789;
+
+        File dictFile = null;
+        try {
+            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        } catch (UnsupportedFormatException e) {
+            fail("UnsupportedFormatException while writing an initial dictionary : " + e);
+        }
+        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
+                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
+                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+
+        final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>();
+        // Add randomly generated words, then check that each one has the probability it was given.
+        final Random random = new Random(seed);
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+        for (int i = 0; i < wordCount; ++i) {
+            final String word = CodePointUtils.generateWord(random, codePointSet);
+            probabilityMap.put(word, random.nextInt() & 0xFF);
+        }
+        for (String word : probabilityMap.keySet()) {
+            binaryDictionary.addUnigramWord(word, probabilityMap.get(word));
+        }
+        for (String word : probabilityMap.keySet()) {
+            assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
+        }
+        dictFile.delete();
     }
 
     public void testAddBigramWords() {
diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
index 807c252447..8bc0095a5e 100644
--- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
@@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
         Log.e(TAG, "Testing dictionary: seed is " + seed);
         final Random random = new Random(seed);
         sWords.clear();
-        final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
+                random);
         generateWords(maxUnigrams, random, codePointSet);
 
         for (int i = 0; i < sWords.size(); ++i) {
@@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
         }
     }
 
-    private int[] generateCodePointSet(final int codePointSetSize, final Random random) {
-        final int[] codePointSet = new int[codePointSetSize];
-        for (int i = codePointSet.length - 1; i >= 0; ) {
-            final int r = Math.abs(random.nextInt());
-            if (r < 0) continue;
-            // Don't insert 0~0x20, but insert any other code point.
-            // Code points are in the range 0~0x10FFFF.
-            final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20);
-            // Code points between MIN_ and MAX_SURROGATE are not valid on their own.
-            if (candidateCodePoint >= Character.MIN_SURROGATE
-                    && candidateCodePoint <= Character.MAX_SURROGATE) continue;
-            codePointSet[i] = candidateCodePoint;
-            --i;
-        }
-        return codePointSet;
-    }
-
-    // Utilities for test
-
-    /**
-     * Generates a random word.
-     */
-    private String generateWord(final Random random, final int[] codePointSet) {
-        StringBuilder builder = new StringBuilder();
-        // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward
-        // longer words. This should be closer to natural language, and more importantly, it will
-        // exercise the algorithms in dicttool much more.
-        final int count = 1 + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5)
-                + (Math.abs(random.nextInt()) % 5);
-        while (builder.length() < count) {
-            builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]);
-        }
-        return builder.toString();
-    }
-
     private void generateWords(final int number, final Random random, final int[] codePointSet) {
         final Set<String> wordSet = CollectionUtils.newHashSet();
         while (wordSet.size() < number) {
-            wordSet.add(generateWord(random, codePointSet));
+            wordSet.add(CodePointUtils.generateWord(random, codePointSet));
         }
         sWords.addAll(wordSet);
     }
@@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
 
         // Test a word that isn't contained within the dictionary.
         final Random random = new Random((int)System.currentTimeMillis());
-        final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
+                random);
         for (int i = 0; i < 1000; ++i) {
-            final String word = generateWord(random, codePointSet);
+            final String word = CodePointUtils.generateWord(random, codePointSet);
             if (sWords.indexOf(word) != -1) continue;
             runGetTerminalPosition(dictDecoder, word, i, false);
         }
diff --git a/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java b/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java
new file mode 100644
index 0000000000..36b958af8f
--- /dev/null
+++ b/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin.makedict;
+
+import java.util.Random;
+
+// Utility methods related with code points used for tests.
+public class CodePointUtils {
+    private CodePointUtils() {
+        // This utility class is not publicly instantiable.
+    }
+
+    /** Returns codePointSetSize random non-surrogate code points in [0x20, MAX_CODE_POINT). */
+    public static int[] generateCodePointSet(final int codePointSetSize, final Random random) {
+        final int[] codePointSet = new int[codePointSetSize];
+        for (int i = codePointSet.length - 1; i >= 0; ) {
+            final int r = Math.abs(random.nextInt());
+            // Math.abs(Integer.MIN_VALUE) is still negative; draw again in that case.
+            if (r < 0) continue;
+            // Don't insert code points below 0x20, but insert any other code point.
+            final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20);
+            // Code points between MIN_ and MAX_SURROGATE are not valid on their own.
+            if (candidateCodePoint >= Character.MIN_SURROGATE
+                    && candidateCodePoint <= Character.MAX_SURROGATE) continue;
+            codePointSet[i] = candidateCodePoint;
+            --i;
+        }
+        return codePointSet;
+    }
+
+    /**
+     * Generates a random word made of code points picked from the non-empty codePointSet.
+     * @return the generated word, between 1 and 34 chars long.
+     */
+    public static String generateWord(final Random random, final int[] codePointSet) {
+        final StringBuilder builder = new StringBuilder();
+        // count is at most 1 + 8 * 4 = 33, but we sum eight draws so as to bias the random toward
+        // longer words. This should be closer to natural language, and more importantly, it will
+        // exercise the algorithms in dicttool much more.
+        int count = 1;
+        for (int i = 0; i < 8; ++i) {
+            // Random#nextInt(int) is never negative, unlike Math.abs(random.nextInt()) % 5.
+            count += random.nextInt(5);
+        }
+        // length() counts UTF-16 units, so a supplementary code point may overshoot count by one.
+        while (builder.length() < count) {
+            builder.appendCodePoint(codePointSet[random.nextInt(codePointSet.length)]);
+        }
+        return builder.toString();
+    }
+}
-- 
GitLab