From 2fe2adbd875722fbb84b2dce557984b512f000fb Mon Sep 17 00:00:00 2001 From: ijuren Date: Wed, 31 Jan 2024 20:49:28 +0100 Subject: [PATCH 1/6] Common sense implementation --- calculate_average_ijuren.sh | 19 ++ .../onebrc/CalculateAverage_ijuren.java | 212 ++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100755 calculate_average_ijuren.sh create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java diff --git a/calculate_average_ijuren.sh b/calculate_average_ijuren.sh new file mode 100755 index 000000000..1c3bedecd --- /dev/null +++ b/calculate_average_ijuren.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +JAVA_OPTS="--enable-preview" +java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ijuren diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java new file mode 100644 index 000000000..0f2952f92 --- /dev/null +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java @@ -0,0 +1,212 @@ +/* + * Copyright 2023 The original authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dev.morling.onebrc; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Objects; +import java.util.TreeMap; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static java.lang.Math.round; +import static java.nio.channels.FileChannel.MapMode.READ_ONLY; +import static java.nio.file.StandardOpenOption.READ; + +public class CalculateAverage_ijuren { + + private static final String FILE_NAME = "./measurements.txt"; + + public static void main(String[] args) throws IOException { + long[] segments = getSegments(Runtime.getRuntime().availableProcessors()); + + var result = IntStream.range(0, segments.length - 1) + .parallel() + .mapToObj(i -> processSegment(segments[i], segments[i + 1])) + .flatMap(m -> Arrays.stream(m.hashTable).filter(Objects::nonNull)) + .collect(Collectors.toMap(m -> new String(m.city), m -> m, Measurement::merge, TreeMap::new)); + + System.out.println(result); + } + + private static LinearProbingHashMap processSegment(long start, long end) { + var results = new LinearProbingHashMap(1 << 22); + + try (var fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE_NAME), READ)) { + var bb = fileChannel.map(READ_ONLY, start, end - start); + var buffer = new byte[100]; + + int limit = bb.limit(); + for (int startLine = bb.position(); startLine < limit; startLine = bb.position()) { + int currentPosition = startLine; + + byte b; + int hash = 7; + int wordLen = 0; + while (currentPosition < end && (b = bb.get(currentPosition++)) != ';') { + buffer[wordLen++] = b; + hash = hash * 31 + b; + } + + int temp; + int negative = 1; + if (bb.get(currentPosition) == '-') { + negative = -1; + currentPosition++; + } + + if (bb.get(currentPosition + 1) == '.') { + temp = negative * ((bb.get(currentPosition) - '0') * 10 + (bb.get(currentPosition + 2) - '0')); + currentPosition += 3; + } + else { + temp = negative * ((bb.get(currentPosition) - '0') * 100 + ((bb.get(currentPosition + 1) - '0') * 10 + (bb.get(currentPosition + 3) - '0'))); + currentPosition += 4; + } + + currentPosition++; + + results.put(hash, buffer, wordLen, temp); + + bb.position(currentPosition); + } + } + catch (IOException e) { + throw new RuntimeException(e); + } + return results; + } + + private static long[] getSegments(int segmentCount) throws IOException { + try (var raf = new RandomAccessFile(FILE_NAME, "r")) { + long fileSize = raf.length(); + + if (fileSize < 100000) { + long[] chunks = new long[2]; + chunks[1] = fileSize; + return chunks; + } + + long[] chunks = new long[segmentCount + 1]; + + chunks[0] = 0; + long segmentSize = fileSize / segmentCount; + + for (int i = 1; i < segmentCount; i++) { + long chunkOffset = chunks[i - 1] + segmentSize; + raf.seek(chunkOffset); + raf.readLine(); + chunks[i] = raf.getFilePointer(); + } + chunks[segmentCount] = fileSize; + return chunks; + } + } + + public static class LinearProbingHashMap { + final Measurement[] hashTable; + int slots; + + public LinearProbingHashMap(int slots) { + this.slots = slots; + this.hashTable = new Measurement[slots]; + } + + void put(int hash, byte[] key, int len, int temperature) { + hash = Math.abs(hash); + int index = hash % slots; + + int i = index; + while (hashTable[i] != null) { + if (keyIsEqual(key, hashTable[index].city, len)) { + hashTable[i].add(temperature); + return; + } + i++; + if (i == slots) { + i = 0; + } + } + + var cityArr = new byte[len]; + System.arraycopy(key, 0, cityArr, 0, len); + hashTable[index] = new Measurement(cityArr, hash, temperature, temperature, 1, temperature); + } + + private boolean keyIsEqual(byte[] one, byte[] other, int len) { + for (int i = 0; i < len; i++) { + if (one[i] != other[i]) { + return false; + } + } + return true; + } + + } + + static class Measurement { + byte[] city; + int hash; + int min; + int max; + int count; + long sum; + + public Measurement(byte[] city, int hash, int min, int max, int count, long sum) { + this.city = city; + this.hash = hash; + this.min = min; + this.max = max; + this.count = count; + this.sum = sum; + } + + public void add(int temperature) { + min = Math.min(min, temperature); + max = Math.max(max, temperature); + count++; + sum += temperature; + } + + public Measurement merge(Measurement other) { + min = Math.min(min, other.min); + max = Math.max(max, other.max); + count += other.count; + sum += other.sum; + return this; + } + + @Override + public String toString() { + return (min * 1.0) / 10 + "/" + round((sum * 1.0) / count) / 10.0 + "/" + (max * 1.0) / 10; + } + + @Override + public int hashCode() { + return hash; + } + + @Override + public boolean equals(Object obj) { + return Arrays.equals(city, ((Measurement) obj).city); + } + } +} From be4c3bcf00ad87547a7f088dec56a1f0ee72438a Mon Sep 17 00:00:00 2001 From: ijuren Date: Wed, 31 Jan 2024 21:20:52 +0100 Subject: [PATCH 2/6] fix filename --- ...erage_ijuren.sh => calculate_average_JurenIvan.sh | 2 +- ...e_ijuren.java => CalculateAverage_JurenIvan.java} | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) rename calculate_average_ijuren.sh => calculate_average_JurenIvan.sh (93%) rename src/main/java/dev/morling/onebrc/{CalculateAverage_ijuren.java => CalculateAverage_JurenIvan.java} (95%) diff --git a/calculate_average_ijuren.sh b/calculate_average_JurenIvan.sh similarity index 93% rename from calculate_average_ijuren.sh rename to calculate_average_JurenIvan.sh index 1c3bedecd..73d956e90 100755 --- a/calculate_average_ijuren.sh +++ b/calculate_average_JurenIvan.sh @@ -16,4 +16,4 @@ # JAVA_OPTS="--enable-preview" -java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ijuren +java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JurenIvan diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java similarity index 95% rename from src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java rename to src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java index 0f2952f92..1238780ee 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_ijuren.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java @@ -31,11 +31,12 @@ import static java.nio.channels.FileChannel.MapMode.READ_ONLY; import static java.nio.file.StandardOpenOption.READ; -public class CalculateAverage_ijuren { +public class CalculateAverage_JurenIvan { private static final String FILE_NAME = "./measurements.txt"; public static void main(String[] args) throws IOException { + long start = System.currentTimeMillis(); long[] segments = getSegments(Runtime.getRuntime().availableProcessors()); var result = IntStream.range(0, segments.length - 1) @@ -45,10 +46,13 @@ public static void main(String[] args) throws IOException { .collect(Collectors.toMap(m -> new String(m.city), m -> m, Measurement::merge, TreeMap::new)); System.out.println(result); + long end = System.currentTimeMillis(); + + System.out.println(end - start); } private static LinearProbingHashMap processSegment(long start, long end) { - var results = new LinearProbingHashMap(1 << 22); + var results = new LinearProbingHashMap(1 << 19); try (var fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE_NAME), READ)) { var bb = fileChannel.map(READ_ONLY, start, end - start); @@ -132,11 +136,11 @@ public LinearProbingHashMap(int slots) { void put(int hash, byte[] key, int len, int temperature) { hash = Math.abs(hash); - int index = hash % slots; + int index = hash & (slots - 1); int i = index; while (hashTable[i] != null) { - if (keyIsEqual(key, hashTable[index].city, len)) { + if (keyIsEqual(key, hashTable[index].city, len)) { //handling hash collisions hashTable[i].add(temperature); return; } From 5de857fa243fc98bf74c45ed59fa7206f1e1f4dd Mon Sep 17 00:00:00 2001 From: ijuren Date: Wed, 31 Jan 2024 21:31:46 +0100 Subject: [PATCH 3/6] formatting --- .../java/dev/morling/onebrc/CalculateAverage_JurenIvan.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java index 1238780ee..1a8b0cb57 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java @@ -140,7 +140,7 @@ void put(int hash, byte[] key, int len, int temperature) { int i = index; while (hashTable[i] != null) { - if (keyIsEqual(key, hashTable[index].city, len)) { //handling hash collisions + if (keyIsEqual(key, hashTable[index].city, len)) { // handling hash collisions hashTable[i].add(temperature); return; } From 8bd465263192d98e312112e35345c61f808eaf77 Mon Sep 17 00:00:00 2001 From: ijuren Date: Wed, 31 Jan 2024 21:40:12 +0100 Subject: [PATCH 4/6] remove excess system.out.println --- .../java/dev/morling/onebrc/CalculateAverage_JurenIvan.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java index 1a8b0cb57..b597b39dd 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java @@ -36,7 +36,6 @@ public class CalculateAverage_JurenIvan { private static final String FILE_NAME = "./measurements.txt"; public static void main(String[] args) throws IOException { - long start = System.currentTimeMillis(); long[] segments = getSegments(Runtime.getRuntime().availableProcessors()); var result = IntStream.range(0, segments.length - 1) @@ -46,9 +45,6 @@ public static void main(String[] args) throws IOException { .collect(Collectors.toMap(m -> new String(m.city), m -> m, Measurement::merge, TreeMap::new)); System.out.println(result); - long end = System.currentTimeMillis(); - - System.out.println(end - start); } private static LinearProbingHashMap processSegment(long start, long end) { From 9858f479df38fd6b0da2914b71762d3611ec4f39 Mon Sep 17 00:00:00 2001 From: ijuren Date: Thu, 1 Feb 2024 10:48:35 +0100 Subject: [PATCH 5/6] fix hash collisions --- .../dev/morling/onebrc/CalculateAverage_JurenIvan.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java index b597b39dd..3cad510b5 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java @@ -113,7 +113,8 @@ private static long[] getSegments(int segmentCount) throws IOException { for (int i = 1; i < segmentCount; i++) { long chunkOffset = chunks[i - 1] + segmentSize; raf.seek(chunkOffset); - raf.readLine(); + while (raf.readByte() != '\n') { + } chunks[i] = raf.getFilePointer(); } chunks[segmentCount] = fileSize; @@ -136,7 +137,7 @@ void put(int hash, byte[] key, int len, int temperature) { int i = index; while (hashTable[i] != null) { - if (keyIsEqual(key, hashTable[index].city, len)) { // handling hash collisions + if (keyIsEqual(key, hashTable[i].city, len)) { // handling hash collisions hashTable[i].add(temperature); return; } @@ -148,10 +149,12 @@ void put(int hash, byte[] key, int len, int temperature) { var cityArr = new byte[len]; System.arraycopy(key, 0, cityArr, 0, len); - hashTable[index] = new Measurement(cityArr, hash, temperature, temperature, 1, temperature); + hashTable[i] = new Measurement(cityArr, hash, temperature, temperature, 1, temperature); } private boolean keyIsEqual(byte[] one, byte[] other, int len) { + if (len != other.length) + return false; for (int i = 0; i < len; i++) { if (one[i] != other[i]) { return false; From b07c7e09c3aa3d6743063188c6484cb89d477d57 Mon Sep 17 00:00:00 2001 From: ijuren Date: Thu, 1 Feb 2024 12:38:46 +0100 Subject: [PATCH 6/6] ajdust so taht segment size smaller than Integer.MAX_VALUE --- .../java/dev/morling/onebrc/CalculateAverage_JurenIvan.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java index 3cad510b5..3f9306899 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java @@ -105,6 +105,10 @@ private static long[] getSegments(int segmentCount) throws IOException { return chunks; } + while (fileSize / segmentCount >= (Integer.MAX_VALUE - 150)) { + segmentCount *= 2; + } + long[] chunks = new long[segmentCount + 1]; chunks[0] = 0;