From 2c0d91628733ab502a47f6a60411b1f7e3d0b9a2 Mon Sep 17 00:00:00 2001
From: Jean Chalard <jchalard@google.com>
Date: Thu, 13 Sep 2012 20:09:52 +0900
Subject: [PATCH] Fix abbreviations processing for English (A7)

It's still incorrect for non-English languages.

Bug: 4967874
Change-Id: I3d4379265f72e1c0435eba90ab878a80bbcf55ce
---
 .../inputmethod/latin/StringUtils.java        | 107 +++++++++++++-----
 .../inputmethod/latin/StringUtilsTests.java   |  41 +++++++
 2 files changed, 122 insertions(+), 26 deletions(-)

diff --git a/java/src/com/android/inputmethod/latin/StringUtils.java b/java/src/com/android/inputmethod/latin/StringUtils.java
index 8696a5cafb..b4f3b4bc9a 100644
--- a/java/src/com/android/inputmethod/latin/StringUtils.java
+++ b/java/src/com/android/inputmethod/latin/StringUtils.java
@@ -304,34 +304,89 @@ public final class StringUtils {
         }
 
         if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
-        char c = cs.charAt(j - 1);
-        if (c == Keyboard.CODE_PERIOD || c == Keyboard.CODE_QUESTION_MARK
-                || c == Keyboard.CODE_EXCLAMATION_MARK) {
-            // Here we found a marker for sentence end (we consider these to be one of
-            // either . or ? or ! only). So this is probably the end of a sentence, but if we
-            // found a period, we still want to check the case where this is a abbreviation
-            // period rather than a full stop. To do this, we look for a period within a word
-            // before the period we just found; if any, we take that to mean it was an
-            // abbreviation.
-            // A typical example of the above is "In the U.S. ", where the last period is
-            // not a full stop and we should not capitalize.
-            // TODO: the rule below is broken. In particular it fails for runs of periods,
-            // whatever the reason. In the example "in the U.S..", the last period is a full
-            // stop following the abbreviation period, and we should capitalize but we don't.
-            // Likewise, "I don't know... " should capitalize, but fails to do so.
-            if (c == Keyboard.CODE_PERIOD) {
-                for (int k = j - 2; k >= 0; k--) {
-                    c = cs.charAt(k);
-                    if (c == Keyboard.CODE_PERIOD) {
-                        return TextUtils.CAP_MODE_CHARACTERS & reqModes;
-                    }
-                    if (!Character.isLetter(c)) {
-                        break;
-                    }
+        char c = cs.charAt(--j);
+
+        // We found the next interesting chunk of text; next we need to determine if it's the
+        // end of a sentence. If we have a question mark or an exclamation mark, it's the end of
+        // a sentence. Otherwise only a period can still end the sentence, so we first get every
+        // other case (any other character, or a period that starts the text) out of the way.
+        if (c == Keyboard.CODE_QUESTION_MARK || c == Keyboard.CODE_EXCLAMATION_MARK) {
+            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
+        }
+        if (c != Keyboard.CODE_PERIOD || j <= 0) {
+            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
+        }
+
+        // We found out that we have a period. We need to determine if this is a full stop or
+        // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
+        // looks like (\w\.){2,} (the check below also accepts several letters between periods)
+        // To find out, we will have a simple state machine with the following states :
+        // START, WORD, PERIOD, LETTER
+        // On START : (just before the first period)
+        //           letter => WORD
+        //           whitespace => end with no caps (it was a stand-alone period)
+        //           otherwise => end with caps (several periods/symbols in a row)
+        // On WORD : (within the word just before the first period)
+        //           letter => WORD
+        //           period => PERIOD
+        //           otherwise => end with caps (it was a word with a full stop at the end)
+        // On PERIOD : (period within a potential abbreviation)
+        //           letter => LETTER
+        //           otherwise => end with caps (it was not an abbreviation)
+        // On LETTER : (letter within a potential abbreviation)
+        //           letter => LETTER
+        //           period => PERIOD
+        //           otherwise => end with no caps (it was an abbreviation)
+        // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
+        // should capitalize.
+
+        final int START = 0;
+        final int WORD = 1;
+        final int PERIOD = 2;
+        final int LETTER = 3;
+        final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
+                | TextUtils.CAP_MODE_SENTENCES) & reqModes;
+        final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
+        int state = START;
+        while (j > 0) {
+            c = cs.charAt(--j);
+            switch (state) {
+            case START:
+                if (Character.isLetter(c)) {
+                    state = WORD;
+                } else if (Character.isWhitespace(c)) {
+                    return noCaps;
+                } else {
+                    return caps;
+                }
+                break;
+            case WORD:
+                if (Character.isLetter(c)) {
+                    state = WORD;
+                } else if (c == Keyboard.CODE_PERIOD) {
+                    state = PERIOD;
+                } else {
+                    return caps;
+                }
+                break;
+            case PERIOD:
+                if (Character.isLetter(c)) {
+                    state = LETTER;
+                } else {
+                    return caps;
+                }
+                break;
+            case LETTER:
+                if (Character.isLetter(c)) {
+                    state = LETTER;
+                } else if (c == Keyboard.CODE_PERIOD) {
+                    state = PERIOD;
+                } else {
+                    return noCaps;
                 }
             }
-            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
         }
-        return TextUtils.CAP_MODE_CHARACTERS & reqModes;
+        // Here we arrived at the start of the line. This should behave exactly like whitespace.
+        return (START == state || LETTER == state) ? noCaps : caps;
     }
 }
diff --git a/tests/src/com/android/inputmethod/latin/StringUtilsTests.java b/tests/src/com/android/inputmethod/latin/StringUtilsTests.java
index 5db06ef51a..0cef28c74d 100644
--- a/tests/src/com/android/inputmethod/latin/StringUtilsTests.java
+++ b/tests/src/com/android/inputmethod/latin/StringUtilsTests.java
@@ -17,6 +17,7 @@
 package com.android.inputmethod.latin;
 
 import android.test.AndroidTestCase;
+import android.text.TextUtils;
 
 public class StringUtilsTests extends AndroidTestCase {
     public void testContainsInArray() {
@@ -99,4 +100,44 @@ public class StringUtilsTests extends AndroidTestCase {
         assertFalse("lower-case string", StringUtils.hasUpperCase("string"));
         assertFalse("lower-case string with non-letters", StringUtils.hasUpperCase("he's"));
     }
+
+    private void onePathForCaps(final CharSequence cs, final int expectedResult, final int mask) {
+        int oneTimeResult = expectedResult & mask; // keep only the modes requested through mask
+        assertEquals("After >" + cs + "<", oneTimeResult, StringUtils.getCapsMode(cs, mask));
+    }
+
+    private void allPathsForCaps(final CharSequence cs, final int expectedResult) {
+        final int c = TextUtils.CAP_MODE_CHARACTERS;
+        final int w = TextUtils.CAP_MODE_WORDS;
+        final int s = TextUtils.CAP_MODE_SENTENCES;
+        onePathForCaps(cs, expectedResult, c | w | s); // the 7 calls cover every non-empty mask
+        onePathForCaps(cs, expectedResult, w | s);
+        onePathForCaps(cs, expectedResult, c | s);
+        onePathForCaps(cs, expectedResult, c | w);
+        onePathForCaps(cs, expectedResult, c);
+        onePathForCaps(cs, expectedResult, w);
+        onePathForCaps(cs, expectedResult, s);
+    }
+
+    public void testGetCapsMode() {
+        final int c = TextUtils.CAP_MODE_CHARACTERS;
+        final int w = TextUtils.CAP_MODE_WORDS;
+        final int s = TextUtils.CAP_MODE_SENTENCES;
+        allPathsForCaps("", c | w | s); // empty text: every mode is expected
+        allPathsForCaps("Word", c); // text ends with a letter
+        allPathsForCaps("Word.", c); // period with nothing after it yet
+        allPathsForCaps("Word ", c | w); // space after a word, no sentence end
+        allPathsForCaps("Word. ", c | w | s); // full stop followed by a space
+        allPathsForCaps("Word..", c);
+        allPathsForCaps("Word.. ", c | w | s); // a run of periods is expected to end the sentence
+        allPathsForCaps("Word... ", c | w | s);
+        allPathsForCaps("Word ... ", c | w | s);
+        allPathsForCaps("Word . ", c | w); // stand-alone period: no sentence caps expected
+        allPathsForCaps("In the U.S ", c | w);
+        allPathsForCaps("In the U.S. ", c | w); // abbreviation period: no sentence caps expected
+        allPathsForCaps("Some stuff (e.g. ", c | w); // abbreviation preceded by a symbol
+        allPathsForCaps("In the U.S.. ", c | w | s); // full stop right after an abbreviation
+        allPathsForCaps("\"Word.\" ", c | w | s); // closing quote after the full stop
+        allPathsForCaps("\"Word\" ", c | w); // closing quote without a full stop
+    }
 }
-- 
GitLab