In this article we write two equivalent programs in C++ and in Java, in exactly the same way to do exactly the same thing: the (in)famous bubble sort algorithm. Then we proceed to measure the latency. On this experiment, Java was faster than C++ even with the -O3 compiler option.
Note: We used the same isolated cpu core for all tests through thread pinning.
Java Version
java version "17.0.1" 2021-10-19 LTS Java(TM) SE Runtime Environment (build 17.0.1+12-LTS-39) Java HotSpot(TM) 64-Bit Server VM (build 17.0.1+12-LTS-39, mixed mode, sharing)
Java
Iterations: 9,000,000 Avg Time: 825.97 nanos StDev: 156.4 nanos Min Time: 781 nanos Max Time: 107335 nanos 75% (6,750,000) = [avg: 801, stdev: 5.2, max: 876] - 25% (2,250,000) = [avg: 899, stdev: 301.03, min: 876] 90% (8,100,000) = [avg: 816, stdev: 32.78, max: 893] - 10% (900,000) = [avg: 915, stdev: 475.5, min: 893] 99% (8,910,000) = [avg: 823, stdev: 39.0, max: 904] - 1% (90,000) = [avg: 1077, stdev: 1493.9, min: 904] 99.9% (8,991,000) = [avg: 824, stdev: 39.62, max: 915] - 0.1% (9,000) = [avg: 2610, stdev: 4439.05, min: 915] 99.99% (8,999,100) = [avg: 824, stdev: 47.73, max: 13448] - 0.01% (900) = [avg: 15264, stdev: 3653.04, min: 13484] 99.999% (8,999,910) = [avg: 825, stdev: 141.2, max: 16186] - 0.001% (90) = [avg: 19503, stdev: 10169.29, min: 16187] 99.9999% (8,999,991) = [avg: 825, stdev: 149.83, max: 25498] - 0.0001% (9) = [avg: 38328, stdev: 24606.03, min: 25787] 99.99999% (8,999,999) = [avg: 825, stdev: 152.32, max: 34315] - 0.00001% (1) = [avg: 107335, stdev: 0.0, min: 107335]
C++ (-O3)
Iterations: 9,000,000 Avg Time: 1229 nanos Stdev: 157.94 nanos Min Time: 1141 nanos Max Time: 35318 nanos 75% (6,750,000) = [avg: 1183, stdev: 7.59, max: 1197] - 25% (2,250,000) = [avg: 1364, stdev: 273.80, min: 1197] 90% (8,100,000) = [avg: 1203, stdev: 63.25, max: 1431] - 10% (900,000) = [avg: 1458, stdev: 393.68, min: 1431] 99% (8,910,000) = [avg: 1225, stdev: 91.58, max: 1462] - 1% (90,000) = [avg: 1595, stdev: 1236.27, min: 1462] 99.9% (8,991,000) = [avg: 1227, stdev: 94.04, max: 1484] - 0.1% (9,000) = [avg: 2732, stdev: 3721.22, min: 1484] 99.99% (8,999,100) = [avg: 1227, stdev: 95.25, max: 2305] - 0.01% (900) = [avg: 12314, stdev: 5985.26, min: 2305] 99.999% (8,999,910) = [avg: 1228, stdev: 146.17, max: 16629] - 0.001% (90) = [avg: 19763, stdev: 3783.30, min: 16631] 99.9999% (8,999,991) = [avg: 1229, stdev: 155.39, max: 23087] - 0.0001% (9) = [avg: 29166, stdev: 4156.72, min: 24677] 99.99999% (8,999,999) = [avg: 1229, stdev: 157.53, max: 34189] - 0.00001% (1) = [avg: 35318, stdev: 0.00, min: 35318]
Java Source Code
package com.coralblocks.coralthreads.sample;
import java.util.Arrays;
import com.coralblocks.coralbits.bench.Benchmarker;
import com.coralblocks.coralbits.util.OSUtils;
import com.coralblocks.coralbits.util.SystemUtils;
import com.coralblocks.coralthreads.Affinity;
public class TestPerformance {
// java -server -verbose:gc -cp ./target/classes:./target/coralthreads-all.jar:coralthreads-all.jar -DcoralThreadsVerbose=false -DbenchWorstPercs=true -DbenchTotals=true -DbenchStdev=true -DbenchMorePercs=true -DdetailedBenchmarker=true -DprocToBind=1 -DexcludeNanoTimeCost=true com.coralblocks.coralthreads.sample.TestPerformance 10000000 1000000 60
private static int[] HEAP_ARRAY;
public static void main(String[] args) {
int iterations = Integer.parseInt(args[0]);
int warmup = Integer.parseInt(args[1]);
int arraySize = Integer.parseInt(args[2]);
int procToBind = SystemUtils.getInt("procToBind", -1);
if (procToBind != -1 && OSUtils.isLinux()) {
Affinity.set(procToBind);
}
HEAP_ARRAY = new int[arraySize];
Benchmarker bench = Benchmarker.create(warmup);
long x = 0;
for(int i = 0; i < iterations; i++) {
bench.mark();
doSomething(HEAP_ARRAY, HEAP_ARRAY.length);
bench.measure();
for(int j = 0; j < HEAP_ARRAY.length; j++) {
x += HEAP_ARRAY[j];
}
}
System.out.println("Value computed: " + x);
System.out.println("Array: " + Arrays.toString(HEAP_ARRAY));
bench.printResults();
}
private static void swapping(int[] array, int x, int y) {
int temp = array[x];
array[x] = array[y];
array[y] = temp;
}
private static void bubbleSort(int[] array, int size) {
for(int i = 0; i < size; i++) {
int swaps = 0; // flag to detect any swap is there or not
for(int j = 0; j < size - i - 1; j++) {
if (array[j] > array[j + 1]) { // when the current item is bigger than next
swapping(array, j, j + 1);
swaps = 1;
}
}
if (swaps == 0) break; // No swap in this pass, so array is sorted
}
}
/*
* For speed, it is important to extract the hot code (i.e. the code executed in a loop) to its own method so the JIT can inline/optimize/compile.
*
* Note that the main() method above is executed only once.
*/
private final static void doSomething(int[] array, int size) {
for(int z = 0; z < size; z++) {
array[z] = size - z;
}
bubbleSort(array, size);
}
}
C++ Source Code
#include <iostream>
#include <string>
#include <random>
#include <cmath>
#include <algorithm>
#include <limits>
#include <sys/time.h>
#include <map>
#include <sched.h>
#include <sstream>
#include <iomanip>
using namespace std;
// TO COMPILE: g++ TestPerformance.cpp -o TestPerformance -std=c++11 -O3
// TO EXECUTE: ./TestPerformance 10000000 1000000 60 1
static const bool MORE_PERCS = true;
static const bool INCLUDE_WORST_PERCS = true;
static const bool INCLUDE_TOTALS = true;
static const bool INCLUDE_RATIOS = false;
static const bool INCLUDE_STDEV = true;
static const bool EXCLUDE_NANO_TS_COST = true;
long get_nano_ts(timespec* ts) {
clock_gettime(CLOCK_MONOTONIC, ts);
return ts->tv_sec * 1000000000 + ts->tv_nsec;
}
static const long NANO_COST_ITERATIONS = 10000000;
static long calc_nano_ts_cost() {
struct timespec ts;
long start = get_nano_ts(&ts);
long finish = start;
for (long i = 0; i < NANO_COST_ITERATIONS; i++) {
finish = get_nano_ts(&ts);
}
finish = get_nano_ts(&ts);
return (finish - start) / NANO_COST_ITERATIONS;
}
struct mi {
long value;
};
void add_perc(stringstream& ss, int size, double perc, map<int, mi*>* map) {
if (map->empty()) return;
int max = -1;
int minBottom = -1;
long x = round(perc * size);
long i = 0;
long iBottom = 0;
long sum = 0;
long sumBottom = 0;
bool trueForTopFalseForBottom = true;
bool flag = false;
const int arraySize = 1024 * 1024 * 10;
int* tempData = new int[arraySize];
double stdevTop = -1;
for(auto iter = map->begin(); iter != map->end(); iter++) {
if (flag) break;
int time = iter->first;
long count = (iter->second)->value;
for(int a = 0; a < count; a++) {
if (trueForTopFalseForBottom) {
tempData[i] = time;
i++;
sum += time;
if (i == x) {
max = time;
if (INCLUDE_STDEV) {
double avg = (double) sum / (double) i;
double temp = 0;
for(int b = 0; b < i; b++) {
int t = tempData[b];
temp += (avg - t) * (avg - t);
}
stdevTop = sqrt(((double) temp / (double) i));
}
if (INCLUDE_WORST_PERCS) {
trueForTopFalseForBottom = false;
} else {
flag = true;
break;
}
}
} else {
tempData[iBottom] = time;
iBottom++;
sumBottom += time;
if (minBottom == -1) {
minBottom = time;
}
}
}
}
ss << " | " << fixed << setprecision(5) << (perc * 100) << "%";
if (INCLUDE_TOTALS) ss << " (" << i << ")";
ss << " = [avg: " << (sum / i);
if (INCLUDE_STDEV) ss << ", stdev: " << fixed << setprecision(2) << stdevTop;
ss << ", max: " << max << "]";
if (INCLUDE_WORST_PERCS) {
ss << " - " << fixed << setprecision(5) << ((1 - perc) * 100) << "%";
if (INCLUDE_TOTALS) ss << " (" << (iBottom > 0 ? iBottom : 0) << ")";
ss << " = [avg: " << (iBottom > 0 ? (sumBottom / iBottom) : -1);
if (INCLUDE_STDEV) {
ss << ", stdev: ";
if (iBottom <= 0) {
ss << "?";
} else {
double avgBottom = (sumBottom / iBottom);
double temp = 0;
for(int b = 0; b < iBottom; b++) {
long t = tempData[b];
temp += (avgBottom - t) * (avgBottom - t);
}
double stdevBottom = sqrt((double) temp / (double) iBottom);
ss << fixed << setprecision(2) << stdevBottom;
}
}
ss << ", min: " << (minBottom != -1 ? minBottom : -1) << "]";
if (INCLUDE_RATIOS) ss << " R: " << fixed << setprecision(2) << (iBottom > 0 ? ( ((double) (sumBottom / iBottom) / (double) (sum / i) ) - 1) * 100 : -1) << "%";
}
delete[] tempData;
}
void swapping(int &a, int &b) { //swap the content of a and b
int temp;
temp = a;
a = b;
b = temp;
}
void display(int *array, int size) {
for(int i = 0; i<size; i++)
cout << array[i] << " ";
cout << endl;
}
void bubbleSort(int *array, int size) {
for(int i = 0; i<size; i++) {
int swaps = 0; //flag to detect any swap is there or not
for(int j = 0; j<size-i-1; j++) {
if(array[j] > array[j+1]) { //when the current item is bigger than next
swapping(array[j], array[j+1]);
swaps = 1; //set swap flag
}
}
if(!swaps)
break; // No swap in this pass, so array is sorted
}
}
void doSomething(int *array, int size) {
for(int z = 0; z < size; z++) {
array[z] = size - z;
}
bubbleSort(array, size);
}
int main(int argc, char* argv[]) {
int iterations = stoi(argv[1]);
int warmup = stoi(argv[2]);
int arraySize = stoi(argv[3]);
int proc = stoi(argv[4]);
cpu_set_t my_set;
CPU_ZERO(&my_set);
CPU_SET(proc, &my_set);
sched_setaffinity(0, sizeof(cpu_set_t), &my_set);
long nanoTimeCost = EXCLUDE_NANO_TS_COST ? calc_nano_ts_cost() : 0;
struct timespec ts;
long long x = 0;
long long totalTime = 0;
int minTime = numeric_limits<int>::max();
int maxTime = numeric_limits<int>::min();
map<int, mi*>* results = new map<int, mi*>();
int * array = (int*) malloc(arraySize * sizeof(int));
for(int i = 0; i < iterations; i++) {
long start = get_nano_ts(&ts);
doSomething(array, arraySize);
long end = get_nano_ts(&ts);
for(int j = 0; j < arraySize; j++) {
x += array[j];
}
int res = end - start - nanoTimeCost;
if (res <= 0) res = 1;
if (i >= warmup) {
totalTime += res;
minTime = min(minTime, res);
maxTime = max(maxTime, res);
auto iter = results->find(res);
if (iter != results->end()) {
(iter->second)->value = (iter->second)->value + 1;
} else {
mi* elem = new mi();
elem->value = 1;
(*results)[res] = elem;
}
}
}
int count = iterations - warmup;
double avg = totalTime / count;
cout << "Value computed: " << x << endl;
display(array, arraySize);
cout << "Nano timestamp cost: " << nanoTimeCost << endl;
free(array);
stringstream ss;
ss << "Iterations: " << count << " | Avg Time: " << avg;
if (INCLUDE_STDEV) {
long temp = 0;
long x = 0;
for(auto iter = results->begin(); iter != results->end(); iter++) {
int time = iter->first;
long count = (iter->second)->value;
for(int a = 0; a < count; a++) {
temp += (avg - time) * (avg - time);
x++;
}
}
double stdev = sqrt( temp / x );
ss << " | Stdev: " << fixed << setprecision(2) << stdev;
}
if (count > 0) {
ss << " | Min Time: " << minTime << " | Max Time: " << maxTime;
}
add_perc(ss, count, 0.75, results);
add_perc(ss, count, 0.90, results);
add_perc(ss, count, 0.99, results);
add_perc(ss, count, 0.999, results);
add_perc(ss, count, 0.9999, results);
add_perc(ss, count, 0.99999, results);
if (MORE_PERCS) {
add_perc(ss, count, 0.999999, results);
add_perc(ss, count, 0.9999999, results);
}
cout << ss.str() << endl << endl;
delete results;
return 0;
}