Skip to content
Snippets Groups Projects
Commit 25f47828 authored by Ken Wakasa, committed by Android (Google) Code Review
Browse files

Merge "Separate bigram entries from the trie file."

parents d40a931a fd46e87d
No related branches found
No related tags found
No related merge requests found
......@@ -385,12 +385,14 @@ public class BinaryDictEncoderUtils {
nodeSize + size, ptNode.mChildren));
}
nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
if (null != ptNode.mBigrams) {
for (WeightedString bigram : ptNode.mBigrams) {
final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE,
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord));
nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
if (null != ptNode.mBigrams) {
for (WeightedString bigram : ptNode.mBigrams) {
final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE,
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord));
nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
}
}
}
ptNode.mCachedSize = nodeSize;
......
......@@ -265,8 +265,12 @@ public final class FormatSpec {
// Extension appended to the dictionary base name to form the frequency file name.
static final String FREQ_FILE_EXTENSION = ".freq";
// tat = Terminal Address Table
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
// Extension of the file that holds the bigram entries, kept apart from the trie file.
static final String BIGRAM_FILE_EXTENSION = ".bigram";
// Extension of the lookup-table file of the sparse table that maps a terminal id to the
// position of its bigram list.
static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
// Extension of the content file of the same sparse table.
static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index";
// Stride of one per-terminal record in the frequency buffer; the decoder seeks to
// terminalId * FREQUENCY_AND_FLAGS_SIZE + 1 to read a frequency.
// NOTE(review): presumably a size in bytes - confirm.
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
// Stride of one entry in the terminal address table; the decoder reads each entry as an
// unsigned 24-bit integer.
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
// Block size handed to SparseTable when the bigram address table is created or loaded.
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
static final int NO_PARENT_ADDRESS = 0;
......
......@@ -18,6 +18,9 @@ package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
......@@ -147,4 +150,45 @@ public class SparseTable {
BinaryDictEncoderUtils.writeUIntToStream(contentOutStream, index, 4);
}
}
/**
 * Writes this table to a pair of files by delegating to
 * {@code write(lookupTableOutStream, contentOutStream)}.
 *
 * Both streams are always closed, even when writing fails or when closing one of the
 * streams throws.
 *
 * @param lookupTableFile the file that receives the lookup table.
 * @param contentFile the file that receives the content table.
 * @throws IOException if either file cannot be opened, written or closed.
 */
@UsedForTesting
public void writeToFiles(final File lookupTableFile, final File contentFile)
        throws IOException {
    final FileOutputStream lookupTableOutStream = new FileOutputStream(lookupTableFile);
    try {
        final FileOutputStream contentOutStream = new FileOutputStream(contentFile);
        try {
            write(lookupTableOutStream, contentOutStream);
        } finally {
            // Closed in its own finally so that a failure here cannot prevent the lookup
            // table stream from being closed below, and vice versa.
            contentOutStream.close();
        }
    } finally {
        lookupTableOutStream.close();
    }
}
/**
 * Reads the entire content of a file into a newly allocated byte array.
 *
 * A single {@code InputStream.read(byte[])} call may return fewer bytes than requested, so
 * this method keeps reading until the buffer sized from {@code file.length()} is full.
 *
 * @param file the file to read.
 * @return the content of the file.
 * @throws IOException if the file cannot be opened or read, if it is too large to fit in a
 *         byte array, or if the end of the file is reached before the expected length.
 */
private static byte[] readFileToByteArray(final File file) throws IOException {
    final long fileLength = file.length();
    if (fileLength > Integer.MAX_VALUE) {
        // Casting to int would wrap around and yield a bogus (possibly negative) size.
        throw new IOException("File is too large to read into a byte array : " + file);
    }
    final byte[] contents = new byte[(int) fileLength];
    final FileInputStream inStream = new FileInputStream(file);
    try {
        int totalRead = 0;
        while (totalRead < contents.length) {
            final int readCount =
                    inStream.read(contents, totalRead, contents.length - totalRead);
            if (readCount < 0) {
                throw new IOException("Unexpected end of file : " + file + " (read "
                        + totalRead + " of " + contents.length + " bytes)");
            }
            totalRead += readCount;
        }
    } finally {
        inStream.close();
    }
    return contents;
}
/**
 * Builds a SparseTable from a pair of files on disk.
 *
 * @param lookupTableFile the file whose bytes become the lookup table.
 * @param contentFile the file whose bytes become the content table.
 * @param blockSize the block size passed to the SparseTable constructor.
 * @return a SparseTable backed by the bytes read from the two files.
 * @throws IOException if either file cannot be read.
 */
@UsedForTesting
public static SparseTable readFromFiles(final File lookupTableFile, final File contentFile,
        final int blockSize) throws IOException {
    // Arguments are evaluated left to right, so the lookup table is read first.
    return new SparseTable(
            readFileToByteArray(lookupTableFile),
            readFileToByteArray(contentFile),
            blockSize);
}
}
......@@ -42,12 +42,15 @@ public class Ver4DictDecoder extends DictDecoder {
private static final int FILETYPE_TRIE = 1;
private static final int FILETYPE_FREQUENCY = 2;
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
private static final int FILETYPE_BIGRAM = 4;
private final File mDictDirectory;
private final DictionaryBufferFactory mBufferFactory;
private DictBuffer mDictBuffer;
private DictBuffer mFrequencyBuffer;
private DictBuffer mTerminalAddressTableBuffer;
private DictBuffer mBigramBuffer;
private SparseTable mBigramAddressTable;
@UsedForTesting
/* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
......@@ -82,6 +85,9 @@ public class Ver4DictDecoder extends DictDecoder {
} else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
} else if (fileType == FILETYPE_BIGRAM) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION);
} else {
throw new RuntimeException("Unsupported kind of file : " + fileType);
}
......@@ -94,6 +100,8 @@ public class Ver4DictDecoder extends DictDecoder {
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM));
loadBigramAddressSparseTable();
}
@Override
......@@ -118,6 +126,15 @@ public class Ver4DictDecoder extends DictDecoder {
return header;
}
/**
 * Loads the sparse table that maps a terminal id to the position of its bigram list.
 *
 * The table is stored as two files in the dictionary directory, both named after that
 * directory: a lookup table file and a content file.
 *
 * @throws IOException if either file cannot be read.
 */
private void loadBigramAddressSparseTable() throws IOException {
    final String baseName = mDictDirectory.getName();
    final File lookupTable = new File(mDictDirectory,
            baseName + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
    final File addressContent = new File(mDictDirectory,
            baseName + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
    mBigramAddressTable = SparseTable.readFromFiles(lookupTable, addressContent,
            FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
}
protected static class PtNodeReader extends DictDecoder.PtNodeReader {
protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
......@@ -191,8 +208,21 @@ public class Ver4DictDecoder extends DictDecoder {
final ArrayList<PendingAttribute> bigrams;
if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
bigrams = new ArrayList<PendingAttribute>();
addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
addressPointer);
final int posOfBigrams = mBigramAddressTable.get(terminalId);
mBigramBuffer.position(posOfBigrams);
while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
// If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
// remaining bigram entries are ignored.
final int bigramFlags = mBigramBuffer.readUnsignedByte();
final int targetTerminalId = mBigramBuffer.readUnsignedInt24();
mTerminalAddressTableBuffer.position(
targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24();
bigrams.add(new PendingAttribute(
bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
targetAddress));
if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
}
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
MakedictLog.d("too many bigrams in a node.");
}
......
......@@ -26,6 +26,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
......@@ -43,9 +44,13 @@ public class Ver4DictEncoder implements DictEncoder {
private byte[] mTrieBuf;
private int mTriePos;
private int mHeaderSize;
private SparseTable mBigramAddressTable;
private OutputStream mTrieOutStream;
private OutputStream mFreqOutStream;
private OutputStream mTerminalAddressTableOutStream;
private OutputStream mBigramOutStream;
private File mDictDir;
private String mBaseFilename;
@UsedForTesting
public Ver4DictEncoder(final File dictPlacedDir) {
......@@ -55,12 +60,14 @@ public class Ver4DictEncoder implements DictEncoder {
private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
throws FileNotFoundException, IOException {
final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
final String filename = header.getId() + "." + header.getVersion();
final File mDictDir = new File(mDictPlacedDir, filename);
final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION);
final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION);
mBaseFilename = header.getId() + "." + header.getVersion();
mDictDir = new File(mDictPlacedDir, mBaseFilename);
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir,
filename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
final File bigramFile = new File(mDictDir,
mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION);
if (!mDictDir.isDirectory()) {
if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs();
......@@ -71,6 +78,7 @@ public class Ver4DictEncoder implements DictEncoder {
mTrieOutStream = new FileOutputStream(trieFile);
mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
mBigramOutStream = new FileOutputStream(bigramFile);
}
private void close() throws IOException {
......@@ -84,10 +92,14 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTerminalAddressTableOutStream != null) {
mTerminalAddressTableOutStream.close();
}
if (mBigramOutStream != null) {
mBigramOutStream.close();
}
} finally {
mTrieOutStream = null;
mFreqOutStream = null;
mTerminalAddressTableOutStream = null;
mBigramOutStream = null;
}
}
......@@ -123,6 +135,10 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount);
mBigramAddressTable = new SparseTable(terminalCount,
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
writeBigrams(flatNodes, dict);
writeBigramAddressSparseTable();
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
......@@ -230,24 +246,41 @@ public class Ver4DictEncoder implements DictEncoder {
shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
}
private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) {
if (bigrams == null) return;
final Iterator<WeightedString> bigramIterator = bigrams.iterator();
while (bigramIterator.hasNext()) {
final WeightedString bigram = bigramIterator.next();
final PtNode target =
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
final int addressOfBigram = target.mCachedAddressAfterUpdate;
final int unigramFrequencyForThisWord = target.mFrequency;
final int offset = addressOfBigram
- (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
mTrieBuf[mTriePos++] = (byte) bigramFlags;
mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
mTriePos, Math.abs(offset));
/**
 * Serializes the bigram lists of all PtNodes and writes them to the bigram stream.
 *
 * For every PtNode with a non-null bigram list, the position where its list starts is
 * recorded in mBigramAddressTable under the node's terminal id. Each bigram is then
 * written as a flags field followed by the terminal id of the target word.
 *
 * @param flatNodes the flattened PtNode arrays to scan.
 * @param dict the dictionary in which bigram target words are looked up.
 * @throws IOException if writing to the bigram stream fails.
 */
private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
        throws IOException {
    final ByteArrayOutputStream serialized = new ByteArrayOutputStream();
    for (final PtNodeArray array : flatNodes) {
        for (final PtNode source : array.mData) {
            final ArrayList<WeightedString> bigramList = source.mBigrams;
            if (bigramList == null) {
                continue;
            }
            // The list of this node starts at the current end of the buffer.
            mBigramAddressTable.set(source.mTerminalId, serialized.size());
            final int count = bigramList.size();
            for (int i = 0; i < count; ++i) {
                final WeightedString entry = bigramList.get(i);
                final boolean hasMore = i + 1 < count;
                final PtNode targetNode =
                        FusionDictionary.findWordInTree(dict.mRootNodeArray, entry.mWord);
                final int targetUnigramFrequency = targetNode.mFrequency;
                // The offset argument is always 0 here: the target is identified by
                // the terminal id written right after the flags.
                final int flags = BinaryDictEncoderUtils.makeBigramFlags(hasMore, 0,
                        entry.mFrequency, targetUnigramFrequency, entry.mWord);
                BinaryDictEncoderUtils.writeUIntToStream(serialized, flags,
                        FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
                BinaryDictEncoderUtils.writeUIntToStream(serialized, targetNode.mTerminalId,
                        FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
            }
        }
    }
    serialized.writeTo(mBigramOutStream);
}
/**
 * Writes the bigram address sparse table into the dictionary directory.
 *
 * The table goes to two files named after the dictionary base file name: a lookup table
 * file and a content file.
 *
 * @throws IOException if either file cannot be written.
 */
private void writeBigramAddressSparseTable() throws IOException {
    mBigramAddressTable.writeToFiles(
            new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION),
            new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION));
}
@Override
......@@ -267,7 +300,6 @@ public class Ver4DictEncoder implements DictEncoder {
}
writeChildrenPosition(ptNode, formatOptions);
writeShortcuts(ptNode.mShortcutTargets);
writeBigrams(ptNode.mBigrams, dict);
}
private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment