From f6b0e32df38da4e2130bdbfc8875ea2d19054caf Mon Sep 17 00:00:00 2001
From: Jean Chalard <jchalard@google.com>
Date: Tue, 21 Oct 2014 17:31:00 +0900
Subject: [PATCH] Add a *FAST* dictionary header reader.

It's still unused as of this change, but the next change will use it.

As a reference point, generating the metadata for Bayo takes
3'02" on my machine with the info command; it's down to 16" if
made to use this instead. The gain increases with the number
of dictionaries, obviously.

Change-Id: I0eeea2d8f81bb74b0d1570af658e91b56f7c2b79
---
 .../makedict/BinaryDictDecoderUtils.java      | 42 +++++++++++++++
 .../dicttool/BinaryDictOffdeviceUtils.java    | 53 +++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
index 120b96bc6b..be75565bb6 100644
--- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
@@ -17,11 +17,16 @@
 package com.android.inputmethod.latin.makedict;
 
 import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
+
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.util.HashMap;
+import java.util.LinkedList;
+
+import javax.annotation.Nonnull;
 
 /**
  * Decodes binary files for a FusionDictionary.
@@ -360,6 +365,55 @@ public final class BinaryDictDecoderUtils {
         return result;
     }
 
+    /**
+     * Helper method that brutally decodes a header from a byte array.
+     *
+     * The buffer is read as a sequence of terminated strings, alternately a key and its
+     * value. Bytes found after the last terminator are ignored.
+     *
+     * @param headerBuffer a buffer containing the bytes of the header.
+     * @return a hashmap of the attributes stored in the header
+     * @throws UnsupportedFormatException if a multi-byte character is cut short by the end of
+     *         the buffer, or if a key has no matching value.
+     */
+    @Nonnull
+    public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
+            throws UnsupportedFormatException {
+        final HashMap<String, String> attributes = new HashMap<>();
+        final StringBuilder sb = new StringBuilder();
+        // Keys and values alternate, so remember the key until its value has been decoded.
+        String pendingKey = null;
+        int index = 0;
+        while (index < headerBuffer.length) {
+            if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
+                if (null == pendingKey) {
+                    pendingKey = sb.toString();
+                } else {
+                    attributes.put(pendingKey, sb.toString());
+                    pendingKey = null;
+                }
+                sb.setLength(0);
+            } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
+                    null /* codePointTable */)) {
+                sb.appendCodePoint(headerBuffer[index] & 0xFF);
+            } else {
+                // Three-byte character: check it is complete rather than overrun the buffer.
+                if (index + 2 >= headerBuffer.length) {
+                    throw new UnsupportedFormatException("Truncated character in header");
+                }
+                sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
+                        + ((headerBuffer[index + 1] & 0xFF) << 8)
+                        + (headerBuffer[index + 2] & 0xFF));
+                index += 2;
+            }
+            index += 1;
+        }
+        if (null != pendingKey) {
+            throw new UnsupportedFormatException("Odd number of attributes");
+        }
+        return attributes;
+    }
+
     /**
      * Helper method to pass a file name instead of a File object to isBinaryDictionary.
      */
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
index 49a6e8e145..7894e17c4d 100644
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
@@ -19,6 +19,10 @@ package com.android.inputmethod.latin.dicttool;
 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
 import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
 import com.android.inputmethod.latin.makedict.DictDecoder;
+import com.android.inputmethod.latin.makedict.DictionaryHeader;
+import com.android.inputmethod.latin.makedict.FormatSpec;
+import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
+import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
 import com.android.inputmethod.latin.makedict.FusionDictionary;
 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
 
@@ -34,6 +38,8 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.HashMap;
 
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
@@ -142,6 +148,99 @@ public final class BinaryDictOffdeviceUtils {
         }
     }
 
+    /**
+     * An input processor that decodes only the header of a binary dictionary.
+     *
+     * The magic number, version and header size are read by hand from the first 12 bytes of
+     * the stream; the rest of the header is handed to
+     * {@link BinaryDictDecoderUtils#decodeHeaderAttributes(byte[])}.
+     */
+    public static class HeaderReaderProcessor implements InputProcessor<DictionaryHeader> {
+        // Arbitrarily limit the header length to 32k. Sounds like it would never be larger
+        // than this. Revisit this if needed later.
+        private static final int MAX_HEADER_LENGTH = 32 * 1024;
+
+        /**
+         * Reads and decodes the dictionary header found at the start of the stream.
+         *
+         * @param input the stream to read from, positioned at the start of the dictionary.
+         * @return the decoded header.
+         * @throws IOException if reading from the stream fails.
+         * @throws UnsupportedFormatException if the stream is too short, or if the magic
+         *         number, the version or the header size is not an acceptable one.
+         */
+        @Override @Nonnull
+        public DictionaryHeader process(final InputStream input) throws IOException,
+                UnsupportedFormatException {
+            // Do everything as curtly and ad-hoc as possible for performance.
+            final byte[] tmpBuffer = new byte[12];
+            if (tmpBuffer.length != readFully(input, tmpBuffer)) {
+                throw new UnsupportedFormatException("File too short, not a dictionary");
+            }
+            // Ad-hoc check for the magic number. See FormatSpec.java as well as
+            // byte_array_utils.h and BinaryDictEncoderUtils#writeDictionaryHeader().
+            final int MAGIC_NUMBER_START_OFFSET = 0;
+            final int VERSION_START_OFFSET = 4;
+            final int HEADER_SIZE_OFFSET = 8;
+            final int magicNumber = ((tmpBuffer[MAGIC_NUMBER_START_OFFSET] & 0xFF) << 24)
+                    + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 1] & 0xFF) << 16)
+                    + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 2] & 0xFF) << 8)
+                    + (tmpBuffer[MAGIC_NUMBER_START_OFFSET + 3] & 0xFF);
+            if (magicNumber != FormatSpec.MAGIC_NUMBER) {
+                throw new UnsupportedFormatException("Wrong magic number");
+            }
+            final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8)
+                    + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF);
+            if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201) {
+                throw new UnsupportedFormatException("Only versions 2 and 201 are supported");
+            }
+            // Big-endian like the magic number: the most significant byte comes first.
+            final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) << 24)
+                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) << 16)
+                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) << 8)
+                    + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF);
+            // The size includes the bytes read above, so it can't be smaller than that. This
+            // also rejects the negative values a corrupt file could yield.
+            if (totalHeaderSize < tmpBuffer.length) {
+                throw new UnsupportedFormatException("Header too small");
+            }
+            if (totalHeaderSize > MAX_HEADER_LENGTH) {
+                throw new UnsupportedFormatException("Header too large");
+            }
+            final byte[] headerBuffer = new byte[totalHeaderSize - tmpBuffer.length];
+            if (headerBuffer.length != readFully(input, headerBuffer)) {
+                throw new UnsupportedFormatException("File shorter than specified in the header");
+            }
+            final HashMap<String, String> attributes =
+                    BinaryDictDecoderUtils.decodeHeaderAttributes(headerBuffer);
+            return new DictionaryHeader(totalHeaderSize, new DictionaryOptions(attributes),
+                    new FormatOptions(version, false /* hasTimestamp */));
+        }
+
+        /**
+         * Reads from the stream until the buffer is full or the stream ends.
+         *
+         * A single InputStream#read call may return fewer bytes than asked for even when
+         * more are coming, so loop until done.
+         *
+         * @param input the stream to read from.
+         * @param buffer the buffer to fill.
+         * @return the number of bytes actually stored in the buffer.
+         */
+        private static int readFully(final InputStream input, final byte[] buffer)
+                throws IOException {
+            int total = 0;
+            while (total < buffer.length) {
+                final int count = input.read(buffer, total, buffer.length - total);
+                if (count < 0) {
+                    break;
+                }
+                total += count;
+            }
+            return total;
+        }
+    }
+
     public static void copy(final InputStream input, final OutputStream output) throws IOException {
         final byte[] buffer = new byte[COPY_BUFFER_SIZE];
         for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) {
-- 
GitLab