From db2c0919cfd839d7036697b41e986fa897dc78df Mon Sep 17 00:00:00 2001
From: satok <satok@google.com>
Date: Mon, 1 Aug 2011 15:58:26 +0900
Subject: [PATCH] Remove old dictionary format code

Change-Id: Ic4b9e069c9bd5c088769519f44d0a9ea45acb833
---
 native/Android.mk                 |   3 -
 native/src/unigram_dictionary.cpp | 249 ------------------------------
 native/src/unigram_dictionary.h   |  13 --
 3 files changed, 265 deletions(-)

diff --git a/native/Android.mk b/native/Android.mk
index 63cd68cf66..04819e4564 100644
--- a/native/Android.mk
+++ b/native/Android.mk
@@ -8,9 +8,6 @@ LOCAL_CFLAGS += -Werror -Wall
 # To suppress compiler warnings for unused variables/functions used for debug features etc.
 LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function
 
-# Use the new dictionary format
-LOCAL_CFLAGS += -DNEW_DICTIONARY_FORMAT
-
 LOCAL_SRC_FILES := \
     jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
     jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 27d15157ca..f0bb384fbb 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -24,9 +24,7 @@
 #include "dictionary.h"
 #include "unigram_dictionary.h"
 
-#ifdef NEW_DICTIONARY_FORMAT
 #include "binary_format.h"
-#endif // NEW_DICTIONARY_FORMAT
 
 namespace latinime {
 
@@ -39,20 +37,12 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
 UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
         int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
         const bool isLatestDictVersion)
-#ifndef NEW_DICTIONARY_FORMAT
-    : DICT_ROOT(streamStart),
-#else // NEW_DICTIONARY_FORMAT
     : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
-#endif // NEW_DICTIONARY_FORMAT
     MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
     MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
-#ifndef NEW_DICTIONARY_FORMAT
-    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
-#else // NEW_DICTIONARY_FORMAT
       // TODO : remove this variable.
     ROOT_POS(0),
-#endif // NEW_DICTIONARY_FORMAT
     BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
     MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
     if (DEBUG_DICT) {
@@ -656,243 +646,6 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
     return true;
 }
 
-#ifndef NEW_DICTIONARY_FORMAT
-inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
-        const int inputLength, unsigned short *word) {
-    int pos = ROOT_POS;
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    int maxFreq = 0;
-    int depth = 0;
-    unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
-    bool terminal = false;
-
-    mStackChildCount[0] = count;
-    mStackSiblingPos[0] = pos;
-
-    while (depth >= 0) {
-        if (mStackChildCount[depth] > 0) {
-            --mStackChildCount[depth];
-            int firstChildPos;
-            int newFreq;
-            int siblingPos = mStackSiblingPos[depth];
-            const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
-                    startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
-                    &siblingPos);
-            mStackSiblingPos[depth] = siblingPos;
-            if (depth == (inputLength - 1)) {
-                // Traverse sibling node
-                if (terminal) {
-                    if (newFreq > maxFreq) {
-                        for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
-                        if (DEBUG_DICT && DEBUG_NODE) {
-#ifdef FLAG_DBG
-                            char s[inputLength + 1];
-                            for (int i = 0; i < inputLength; ++i) s[i] = word[i];
-                            s[inputLength] = 0;
-                            LOGI("New missing space word found: %d > %d (%s), %d, %d",
-                                    newFreq, maxFreq, s, inputLength, depth);
-#endif
-                        }
-                        maxFreq = newFreq;
-                    }
-                }
-            } else if (needsToTraverseChildrenNodes) {
-                // Traverse children nodes
-                ++depth;
-                mStackChildCount[depth] = count;
-                mStackSiblingPos[depth] = firstChildPos;
-            }
-        } else {
-            // Traverse parent node
-            --depth;
-        }
-    }
-
-    word[inputLength] = 0;
-    return maxFreq;
-}
-
-inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
-        const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
-        int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
-    const int inputIndex = startInputIndex + depth;
-    unsigned short c;
-    *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
-            &c, newChildPosition, newTerminal, newFreq);
-    const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
-    if (DEBUG_DICT) {
-        assert(inputC <= U_SHORT_MAX);
-    }
-    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
-    const bool matched = (inputC == baseLowerC || inputC == c);
-    const bool hasChild = *newChildPosition != 0;
-    if (matched) {
-        word[depth] = c;
-        if (DEBUG_DICT && DEBUG_NODE) {
-            LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
-            if (*newTerminal) {
-                LOGI("Terminal %d", *newFreq);
-            }
-        }
-        if (hasChild) {
-            *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
-            return true;
-        } else {
-            return false;
-        }
-    } else {
-        // If this node is not user typed character, this method treats this word as unmatched.
-        // Thus newTerminal shouldn't be true.
-        *newTerminal = false;
-        return false;
-    }
-}
-
-// TODO: use uint32_t instead of unsigned short
-bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
-    if (IS_LATEST_DICT_VERSION) {
-        return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
-    } else {
-        return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
-    }
-}
-
-
-// Require strict exact match.
-int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
-        int length) const {
-    // returns address of bigram data of that word
-    // return -99 if not found
-
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    unsigned short currentChar = (unsigned short) word[offset];
-    for (int j = 0; j < count; j++) {
-        unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
-        int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
-        int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
-        if (c == currentChar) {
-            if (offset == length - 1) {
-                if (terminal) {
-                    return (pos+1);
-                }
-            } else {
-                if (childPos != 0) {
-                    int t = getBigramPosition(childPos, word, offset + 1, length);
-                    if (t > 0) {
-                        return t;
-                    }
-                }
-            }
-        }
-        if (terminal) {
-            Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
-        }
-        // There could be two instances of each alphabet - upper and lower case. So continue
-        // looking ...
-    }
-    return NOT_VALID_WORD;
-}
-
-// The following functions will be modified.
-inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
-        const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
-        const int initialDiffs, int *nextLetters, const int nextLettersSize,
-        CorrectionState *correctionState, int *newCount, int *newChildPosition,
-        bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
-        int *nextSiblingPosition, int *nextOutputIndex) {
-    const int skipPos = correctionState->getSkipPos();
-    const int excessivePos = correctionState->getExcessivePos();
-    const int transposedPos = correctionState->getTransposedPos();
-    if (DEBUG_DICT) {
-        int inputCount = 0;
-        if (skipPos >= 0) ++inputCount;
-        if (excessivePos >= 0) ++inputCount;
-        if (transposedPos >= 0) ++inputCount;
-        assert(inputCount <= 1);
-    }
-    unsigned short c;
-    int childPosition;
-    bool terminal;
-    int freq;
-    bool isSameAsUserTypedLength = false;
-
-    const int pos = initialPos;
-    const int depth = initialDepth;
-    const int traverseAllNodes = initialTraverseAllNodes;
-    const int diffs = initialDiffs;
-
-    const uint8_t flags = 0; // No flags for now
-
-    if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
-
-    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
-            &c, &childPosition, &terminal, &freq);
-    *nextOutputIndex = depth + 1;
-
-    const bool needsToTraverseChildrenNodes = childPosition != 0;
-
-    // If we are only doing traverseAllNodes, no need to look at the typed characters.
-    if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
-        mWord[depth] = c;
-        if (traverseAllNodes && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
-                    freq, false, nextLetters, nextLettersSize, mCorrectionState);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        *newTraverseAllNodes = traverseAllNodes;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs;
-        *newInputIndex = inputIndex;
-    } else {
-        int inputIndexForProximity = inputIndex;
-
-        if (transposedPos >= 0) {
-            if (inputIndex == transposedPos) ++inputIndexForProximity;
-            if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
-        }
-
-        ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
-                inputIndexForProximity, c, mCorrectionState);
-        if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
-        mWord[depth] = c;
-        // If inputIndex is greater than mInputLength, that means there is no
-        // proximity chars. So, we don't need to check proximity.
-        if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
-            multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
-        }
-        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
-                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
-        if (isSameAsUserTypedLength && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
-                    freq, true, nextLetters, nextLettersSize, mCorrectionState);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        // Start traversing all nodes after the index exceeds the user typed length
-        *newTraverseAllNodes = isSameAsUserTypedLength;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs
-                + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
-        *newInputIndex = inputIndex + 1;
-    }
-    // Optimization: Prune out words that are too long compared to how much was typed.
-    if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
-        return false;
-    }
-
-    // If inputIndex is greater than mInputLength, that means there are no proximity chars.
-    // TODO: Check if this can be isSameAsUserTypedLength only.
-    if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
-        *newTraverseAllNodes = true;
-    }
-    // get the count of nodes and increment childAddress.
-    *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
-    *newChildPosition = childPosition;
-    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
-    return needsToTraverseChildrenNodes;
-}
-
-#else // NEW_DICTIONARY_FORMAT
-
 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
 // interface.
 inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
@@ -1245,6 +998,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
     return true;
 }
 
-#endif // NEW_DICTIONARY_FORMAT
-
 } // namespace latinime
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index e8a0868ac2..41e3818605 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -31,7 +31,6 @@ namespace latinime {
 class UnigramDictionary {
 
 public:
-#ifdef NEW_DICTIONARY_FORMAT
 
     // Mask and flags for children address type selection.
     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
@@ -63,16 +62,11 @@ public:
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
-#endif // NEW_DICTIONARY_FORMAT
 
     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
             int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
             const bool isLatestDictVersion);
-#ifndef NEW_DICTIONARY_FORMAT
-    bool isValidWord(unsigned short *word, int length);
-#else // NEW_DICTIONARY_FORMAT
     bool isValidWord(const uint16_t* const inWord, const int length) const;
-#endif // NEW_DICTIONARY_FORMAT
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
             const int *ycoordinates, const int *codes, const int codesSize, const int flags,
@@ -117,15 +111,8 @@ private:
             int *nextSiblingPosition, int *nextOutputIndex);
     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
             unsigned short *word);
-#ifndef NEW_DICTIONARY_FORMAT
-    // Process a node by considering missing space
-    bool processCurrentNodeForExactMatch(const int firstChildPos,
-            const int startInputIndex, const int depth, unsigned short *word,
-            int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
-#else // NEW_DICTIONARY_FORMAT
     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
             short unsigned int* outWord);
-#endif // NEW_DICTIONARY_FORMAT
 
     const uint8_t* const DICT_ROOT;
     const int MAX_WORD_LENGTH;
-- 
GitLab