[WIP] Unsafe shuffle writer support in RSS #53

Open. Wants to merge 4 commits into base: master.

Changes from 2 commits.
261 changes: 261 additions & 0 deletions src/main/java/org/apache/spark/shuffle/sort/RadixSort.java
@@ -0,0 +1,261 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.sort;

import com.google.common.primitives.Ints;

import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.array.LongArray;

public class RadixSort {
Collaborator: I thought you had removed this class and were using the Spark class; why is it still showing up here?

Collaborator: Can we just remove this class and use the Spark classes?

/**
* Sorts a given array of longs using least-significant-digit radix sort. This routine assumes
* you have extra space at the end of the array at least equal to the number of records. The
* sort is destructive and may relocate the data positioned within the array.
*
* @param array array of long elements followed by at least that many empty slots.
* @param numRecords number of data records in the array.
* @param startByteIndex the first byte (in range [0, 7]) to sort each long by, counting from the
* least significant byte.
* @param endByteIndex the last byte (in range [0, 7]) to sort each long by, counting from the
* least significant byte. Must be greater than startByteIndex.
* @param desc whether this is a descending (binary-order) sort.
* @param signed whether this is a signed (two's complement) sort.
*
* @return The starting index of the sorted data within the given array. We return this instead
* of always copying the data back to position zero for efficiency.
*/
public static int sort(
LongArray array, long numRecords, int startByteIndex, int endByteIndex,
boolean desc, boolean signed) {
assert startByteIndex >= 0 : "startByteIndex (" + startByteIndex + ") should >= 0";
assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7";
assert endByteIndex > startByteIndex;
assert numRecords * 2 <= array.size();
long inIndex = 0;
long outIndex = numRecords;
if (numRecords > 0) {
long[][] counts = getCounts(array, numRecords, startByteIndex, endByteIndex);
for (int i = startByteIndex; i <= endByteIndex; i++) {
if (counts[i] != null) {
sortAtByte(
array, numRecords, counts[i], i, inIndex, outIndex,
desc, signed && i == endByteIndex);
long tmp = inIndex;
inIndex = outIndex;
outIndex = tmp;
}
}
}
return Ints.checkedCast(inIndex);
}

/**
* Performs a partial sort by copying data into destination offsets for each byte value at the
* specified byte offset.
*
* @param array array to partially sort.
* @param numRecords number of data records in the array.
* @param counts counts for each byte value. This routine destructively modifies this array.
* @param byteIdx the byte in a long to sort at, counting from the least significant byte.
* @param inIndex the starting index in the array where input data is located.
* @param outIndex the starting index where sorted output data should be written.
* @param desc whether this is a descending (binary-order) sort.
* @param signed whether this is a signed (two's complement) sort (only applies to last byte).
*/
private static void sortAtByte(
LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex,
boolean desc, boolean signed) {
assert counts.length == 256;
long[] offsets = transformCountsToOffsets(
counts, numRecords, array.getBaseOffset() + outIndex * 8L, 8, desc, signed);
Object baseObject = array.getBaseObject();
long baseOffset = array.getBaseOffset() + inIndex * 8L;
long maxOffset = baseOffset + numRecords * 8L;
for (long offset = baseOffset; offset < maxOffset; offset += 8) {
long value = Platform.getLong(baseObject, offset);
int bucket = (int)((value >>> (byteIdx * 8)) & 0xff);
Platform.putLong(baseObject, offsets[bucket], value);
offsets[bucket] += 8;
}
}

/**
* Computes a value histogram for each byte in the given array.
*
* @param array array to count records in.
* @param numRecords number of data records in the array.
* @param startByteIndex the first byte to compute counts for (the prior are skipped).
* @param endByteIndex the last byte to compute counts for.
*
* @return an array of eight 256-element count arrays, one for each byte starting from the least
* significant byte. If the byte does not need sorting the array will be null.
*/
private static long[][] getCounts(
LongArray array, long numRecords, int startByteIndex, int endByteIndex) {
long[][] counts = new long[8][];
// Optimization: do a fast pre-pass to determine which byte indices we can skip for sorting.
// If all the byte values at a particular index are the same we don't need to count it.
long bitwiseMax = 0;
long bitwiseMin = -1L;
long maxOffset = array.getBaseOffset() + numRecords * 8L;
Object baseObject = array.getBaseObject();
for (long offset = array.getBaseOffset(); offset < maxOffset; offset += 8) {
long value = Platform.getLong(baseObject, offset);
bitwiseMax |= value;
bitwiseMin &= value;
}
long bitsChanged = bitwiseMin ^ bitwiseMax;
// Compute counts for each byte index.
for (int i = startByteIndex; i <= endByteIndex; i++) {
if (((bitsChanged >>> (i * 8)) & 0xff) != 0) {
counts[i] = new long[256];
// TODO(ekl) consider computing all the counts in one pass.
for (long offset = array.getBaseOffset(); offset < maxOffset; offset += 8) {
counts[i][(int)((Platform.getLong(baseObject, offset) >>> (i * 8)) & 0xff)]++;
}
}
}
return counts;
}

/**
* Transforms counts into the proper unsafe output offsets for the sort type.
*
* @param counts counts for each byte value. This routine destructively modifies this array.
* @param numRecords number of data records in the original data array.
* @param outputOffset output offset in bytes from the base array object.
* @param bytesPerRecord size of each record (8 for plain sort, 16 for key-prefix sort).
* @param desc whether this is a descending (binary-order) sort.
* @param signed whether this is a signed (two's complement) sort.
*
* @return the input counts array.
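*
* <p>For example, in an ascending unsigned pass with bytesPerRecord = 8, counts of
* {byte value 5 -> 2 records, byte value 9 -> 3 records} over five records become
* offsets of {5 -> outputOffset, 9 -> outputOffset + 16}: each bucket's write
* cursor starts where the preceding buckets' records end, as in a counting sort.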
*/
private static long[] transformCountsToOffsets(
long[] counts, long numRecords, long outputOffset, long bytesPerRecord,
boolean desc, boolean signed) {
assert counts.length == 256;
int start = signed ? 128 : 0; // output the negative records first (values 128-255).
if (desc) {
long pos = numRecords;
for (int i = start; i < start + 256; i++) {
pos -= counts[i & 0xff];
counts[i & 0xff] = outputOffset + pos * bytesPerRecord;
}
} else {
long pos = 0;
for (int i = start; i < start + 256; i++) {
long tmp = counts[i & 0xff];
counts[i & 0xff] = outputOffset + pos * bytesPerRecord;
pos += tmp;
}
}
return counts;
}

/**
* Specialization of sort() for key-prefix arrays. In this type of array, each record consists
* of two longs, only the second of which is sorted on.
*
* @param startIndex starting index in the array to sort from. This parameter is not supported
* in the plain sort() implementation.
*/
public static int sortKeyPrefixArray(
LongArray array,
long startIndex,
long numRecords,
int startByteIndex,
int endByteIndex,
boolean desc,
boolean signed) {
assert startByteIndex >= 0 : "startByteIndex (" + startByteIndex + ") should >= 0";
assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7";
assert endByteIndex > startByteIndex;
assert numRecords * 4 <= array.size();
long inIndex = startIndex;
long outIndex = startIndex + numRecords * 2L;
if (numRecords > 0) {
long[][] counts = getKeyPrefixArrayCounts(
array, startIndex, numRecords, startByteIndex, endByteIndex);
for (int i = startByteIndex; i <= endByteIndex; i++) {
if (counts[i] != null) {
sortKeyPrefixArrayAtByte(
array, numRecords, counts[i], i, inIndex, outIndex,
desc, signed && i == endByteIndex);
long tmp = inIndex;
inIndex = outIndex;
outIndex = tmp;
}
}
}
return Ints.checkedCast(inIndex);
}

/**
* Specialization of getCounts() for key-prefix arrays. We could probably combine this with
* getCounts with some added parameters but that seems to hurt in benchmarks.
*/
private static long[][] getKeyPrefixArrayCounts(
LongArray array, long startIndex, long numRecords, int startByteIndex, int endByteIndex) {
long[][] counts = new long[8][];
long bitwiseMax = 0;
long bitwiseMin = -1L;
long baseOffset = array.getBaseOffset() + startIndex * 8L;
long limit = baseOffset + numRecords * 16L;
Object baseObject = array.getBaseObject();
for (long offset = baseOffset; offset < limit; offset += 16) {
long value = Platform.getLong(baseObject, offset + 8);
bitwiseMax |= value;
bitwiseMin &= value;
}
long bitsChanged = bitwiseMin ^ bitwiseMax;
for (int i = startByteIndex; i <= endByteIndex; i++) {
if (((bitsChanged >>> (i * 8)) & 0xff) != 0) {
counts[i] = new long[256];
for (long offset = baseOffset; offset < limit; offset += 16) {
counts[i][(int)((Platform.getLong(baseObject, offset + 8) >>> (i * 8)) & 0xff)]++;
}
}
}
return counts;
}

/**
* Specialization of sortAtByte() for key-prefix arrays.
*/
private static void sortKeyPrefixArrayAtByte(
LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex,
boolean desc, boolean signed) {
assert counts.length == 256;
long[] offsets = transformCountsToOffsets(
counts, numRecords, array.getBaseOffset() + outIndex * 8L, 16, desc, signed);
Object baseObject = array.getBaseObject();
long baseOffset = array.getBaseOffset() + inIndex * 8L;
long maxOffset = baseOffset + numRecords * 16L;
for (long offset = baseOffset; offset < maxOffset; offset += 16) {
long key = Platform.getLong(baseObject, offset);
long prefix = Platform.getLong(baseObject, offset + 8);
int bucket = (int)((prefix >>> (byteIdx * 8)) & 0xff);
long dest = offsets[bucket];
Platform.putLong(baseObject, dest, key);
Platform.putLong(baseObject, dest + 8, prefix);
offsets[bucket] += 16;
}
}
}
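
For reference, a minimal sketch of how the sorter above can be driven, assuming Spark's on-heap LongArray and MemoryBlock.fromLongArray helpers are on the classpath; the class and variable names here are illustrative only:

import org.apache.spark.shuffle.sort.RadixSort;
import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.memory.MemoryBlock;

public class RadixSortSketch {
  public static void main(String[] args) {
    long[] values = {42L, 7L, 255L, 0L, 1L << 40};
    int numRecords = values.length;
    // sort() needs scratch space at the end: back the array with 2x the record count.
    LongArray array = new LongArray(MemoryBlock.fromLongArray(new long[numRecords * 2]));
    for (int i = 0; i < numRecords; i++) {
      array.set(i, values[i]);
    }
    // Ascending unsigned sort over all eight bytes of each long; the sorted run
    // begins at the returned index, which is not necessarily zero.
    int start = RadixSort.sort(array, numRecords, 0, 7, false, false);
    for (int i = start; i < start + numRecords; i++) {
      System.out.println(array.get(i));
    }
  }
}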
102 changes: 102 additions & 0 deletions src/main/java/org/apache/spark/shuffle/sort/RssPackedRecordPointer.java
@@ -0,0 +1,102 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.sort;

/**
* Wrapper around an 8-byte word that holds a 24-bit partition number and 40-bit record pointer.
* <p>
* Within the long, the data is laid out as follows:
* <pre>
* [24 bit partition number][13 bit memory page number][27 bit offset in page]
* </pre>
* This implies that the maximum addressable page size is 2^27 bytes = 128 megabytes, assuming that
* our offsets in pages are not 8-byte-word-aligned. Since we have 2^13 pages (based on the
* 13-bit page numbers assigned by {@link org.apache.spark.memory.TaskMemoryManager}), this
* implies that we can address 2^13 * 128 megabytes = 1 terabyte of RAM per task.
* <p>
* Assuming word-alignment would allow for a 1 gigabyte maximum page size, but we leave this
* optimization to future work as it will require more careful design to ensure that addresses are
* properly aligned (e.g. by padding records).
*/
public final class RssPackedRecordPointer {

static final int MAXIMUM_PAGE_SIZE_BYTES = 1 << 27; // 128 megabytes

/**
* The maximum partition identifier that can be encoded. Note that partition ids start from 0.
*/
static final int MAXIMUM_PARTITION_ID = (1 << 24) - 1; // 16777215

/**
* The index of the first byte of the partition id, counting from the least significant byte.
*/
static final int PARTITION_ID_START_BYTE_INDEX = 5;

/**
* The index of the last byte of the partition id, counting from the least significant byte.
*/
static final int PARTITION_ID_END_BYTE_INDEX = 7;

/** Bit mask for the lower 40 bits of a long. */
private static final long MASK_LONG_LOWER_40_BITS = (1L << 40) - 1;

/** Bit mask for the upper 24 bits of a long */
private static final long MASK_LONG_UPPER_24_BITS = ~MASK_LONG_LOWER_40_BITS;

/** Bit mask for the lower 27 bits of a long. */
private static final long MASK_LONG_LOWER_27_BITS = (1L << 27) - 1;

/** Bit mask for the lower 51 bits of a long. */
private static final long MASK_LONG_LOWER_51_BITS = (1L << 51) - 1;

/** Bit mask for the upper 13 bits of a long */
private static final long MASK_LONG_UPPER_13_BITS = ~MASK_LONG_LOWER_51_BITS;

/**
* Pack a record address and partition id into a single word.
*
* @param recordPointer a record pointer encoded by TaskMemoryManager.
* @param partitionId a shuffle partition id (maximum value of 2^24 - 1).
* @return a packed pointer that can be decoded using the {@link RssPackedRecordPointer} class.
*/
public static long packPointer(long recordPointer, int partitionId) {
assert (partitionId <= MAXIMUM_PARTITION_ID);
// Note that without word alignment we can address 2^27 bytes = 128 megabytes per page.
// Also note that this relies on some internals of how TaskMemoryManager encodes its addresses.
final long pageNumber = (recordPointer & MASK_LONG_UPPER_13_BITS) >>> 24;
final long compressedAddress = pageNumber | (recordPointer & MASK_LONG_LOWER_27_BITS);
return (((long) partitionId) << 40) | compressedAddress;
}

private long packedRecordPointer;

public void set(long packedRecordPointer) {
this.packedRecordPointer = packedRecordPointer;
}

public int getPartitionId() {
return (int) ((packedRecordPointer & MASK_LONG_UPPER_24_BITS) >>> 40);
}

public long getRecordPointer() {
final long pageNumber = (packedRecordPointer << 24) & MASK_LONG_UPPER_13_BITS;
final long offsetInPage = packedRecordPointer & MASK_LONG_LOWER_27_BITS;
return pageNumber | offsetInPage;
}

}
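
For reference, a minimal round-trip sketch of the packed encoding above, using a made-up page number and offset laid out the way TaskMemoryManager encodes addresses (13-bit page number in the upper bits, 51-bit offset in page below it); no real memory manager is involved:

import org.apache.spark.shuffle.sort.RssPackedRecordPointer;

public class PackedPointerSketch {
  public static void main(String[] args) {
    // Hypothetical address: page 5 (fits in 13 bits), offset 1024 (fits in 27 bits).
    long recordPointer = (5L << 51) | 1024L;
    int partitionId = 42;

    long packed = RssPackedRecordPointer.packPointer(recordPointer, partitionId);

    RssPackedRecordPointer pointer = new RssPackedRecordPointer();
    pointer.set(packed);
    System.out.println(pointer.getPartitionId());                     // prints 42
    System.out.println(pointer.getRecordPointer() == recordPointer);  // prints true
  }
}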