Performance Analysis: comparing C++ and Java

In this article we write two equivalent programs in C++ and in Java, in exactly the same way to do exactly the same thing: the (in)famous bubble sort algorithm. Then we proceed to measure the latency. On this experiment, Java was faster than C++ even with the -O3 compiler option.

Note: We used the same isolated cpu core for all tests through thread pinning.

Java Version

java version "17.0.1" 2021-10-19 LTS
Java(TM) SE Runtime Environment (build 17.0.1+12-LTS-39)
Java HotSpot(TM) 64-Bit Server VM (build 17.0.1+12-LTS-39, mixed mode, sharing)

Java

Iterations: 9,000,000
Avg Time: 825.97 nanos
StDev: 156.4 nanos
Min Time: 781 nanos
Max Time: 107335 nanos
75% (6,750,000) = [avg: 801, stdev: 5.2, max: 876] - 25% (2,250,000) = [avg: 899, stdev: 301.03, min: 876]
90% (8,100,000) = [avg: 816, stdev: 32.78, max: 893] - 10% (900,000) = [avg: 915, stdev: 475.5, min: 893]
99% (8,910,000) = [avg: 823, stdev: 39.0, max: 904] - 1% (90,000) = [avg: 1077, stdev: 1493.9, min: 904]
99.9% (8,991,000) = [avg: 824, stdev: 39.62, max: 915] - 0.1% (9,000) = [avg: 2610, stdev: 4439.05, min: 915]
99.99% (8,999,100) = [avg: 824, stdev: 47.73, max: 13448] - 0.01% (900) = [avg: 15264, stdev: 3653.04, min: 13484]
99.999% (8,999,910) = [avg: 825, stdev: 141.2, max: 16186] - 0.001% (90) = [avg: 19503, stdev: 10169.29, min: 16187]
99.9999% (8,999,991) = [avg: 825, stdev: 149.83, max: 25498] - 0.0001% (9) = [avg: 38328, stdev: 24606.03, min: 25787]
99.99999% (8,999,999) = [avg: 825, stdev: 152.32, max: 34315] - 0.00001% (1) = [avg: 107335, stdev: 0.0, min: 107335]

C++ (-O3)

Iterations: 9,000,000 
Avg Time: 1229 nanos
Stdev: 157.94 nanos
Min Time: 1141 nanos
Max Time: 35318 nanos
75% (6,750,000) = [avg: 1183, stdev: 7.59, max: 1197] - 25% (2,250,000) = [avg: 1364, stdev: 273.80, min: 1197]
90% (8,100,000) = [avg: 1203, stdev: 63.25, max: 1431] - 10% (900,000) = [avg: 1458, stdev: 393.68, min: 1431]
99% (8,910,000) = [avg: 1225, stdev: 91.58, max: 1462] - 1% (90,000) = [avg: 1595, stdev: 1236.27, min: 1462]
99.9% (8,991,000) = [avg: 1227, stdev: 94.04, max: 1484] - 0.1% (9,000) = [avg: 2732, stdev: 3721.22, min: 1484]
99.99% (8,999,100) = [avg: 1227, stdev: 95.25, max: 2305] - 0.01% (900) = [avg: 12314, stdev: 5985.26, min: 2305]
99.999% (8,999,910) = [avg: 1228, stdev: 146.17, max: 16629] - 0.001% (90) = [avg: 19763, stdev: 3783.30, min: 16631]
99.9999% (8,999,991) = [avg: 1229, stdev: 155.39, max: 23087] - 0.0001% (9) = [avg: 29166, stdev: 4156.72, min: 24677]
99.99999% (8,999,999) = [avg: 1229, stdev: 157.53, max: 34189] - 0.00001% (1) = [avg: 35318, stdev: 0.00, min: 35318]

Java Source Code

package com.coralblocks.coralthreads.sample;

import java.util.Arrays;

import com.coralblocks.coralbits.bench.Benchmarker;
import com.coralblocks.coralbits.util.OSUtils;
import com.coralblocks.coralbits.util.SystemUtils;
import com.coralblocks.coralthreads.Affinity;

public class TestPerformance {
	
	// java -server -verbose:gc -cp ./target/classes:./target/coralthreads-all.jar:coralthreads-all.jar -DcoralThreadsVerbose=false -DbenchWorstPercs=true -DbenchTotals=true -DbenchStdev=true -DbenchMorePercs=true -DdetailedBenchmarker=true -DprocToBind=1 -DexcludeNanoTimeCost=true com.coralblocks.coralthreads.sample.TestPerformance 10000000 1000000 60
	
	private static int[] HEAP_ARRAY;
	
	public static void main(String[] args) {
		
		int iterations = Integer.parseInt(args[0]);
		int warmup = Integer.parseInt(args[1]);
		int arraySize = Integer.parseInt(args[2]);
		int procToBind = SystemUtils.getInt("procToBind", -1);
		
		if (procToBind != -1 && OSUtils.isLinux()) {
			Affinity.set(procToBind);
		}
		
		HEAP_ARRAY = new int[arraySize];
		
		Benchmarker bench = Benchmarker.create(warmup);
		
		long x = 0;
		
		for(int i = 0; i < iterations; i++) {
			
			bench.mark();
			
			doSomething(HEAP_ARRAY, HEAP_ARRAY.length);
			
			bench.measure();
			
			for(int j = 0; j < HEAP_ARRAY.length; j++) {
				x += HEAP_ARRAY[j];
			}
		}
		
		System.out.println("Value computed: " + x);
		System.out.println("Array: " + Arrays.toString(HEAP_ARRAY));
		bench.printResults();
	}
	
	private static void swapping(int[] array, int x, int y) {
		int temp = array[x];
		array[x] = array[y];
		array[y] = temp;
	}
	
	private static void bubbleSort(int[] array, int size) {
		for(int i = 0; i < size; i++) {
			int swaps = 0; // flag to detect any swap is there or not
			for(int j = 0; j < size - i - 1; j++) {
				if (array[j] > array[j + 1]) { // when the current item is bigger than next
					swapping(array, j, j + 1);
					swaps = 1;
				}
			}
			if (swaps == 0) break; // No swap in this pass, so array is sorted
		}
	}
	
	/*
	 * For speed, it is important to extract the hot code (i.e. the code executed in a loop) to its own method so the JIT can inline/optimize/compile.
	 * 
	 * Note that the main() method above is executed only once.
	 */
	private final static void doSomething(int[] array, int size) {
		
		for(int z = 0; z < size; z++) {
			array[z] = size - z;
		}

		bubbleSort(array, size);
	}
}

C++ Source Code

#include <iostream>
#include <string>
#include <random>
#include <cmath>
#include <algorithm>
#include <limits>
#include <sys/time.h>
#include <map>
#include <sched.h>
#include <sstream>
#include <iomanip>

using namespace std;

// TO COMPILE: g++ TestPerformance.cpp -o TestPerformance -std=c++11 -O3
// TO EXECUTE: ./TestPerformance 10000000 1000000 60 1

static const bool MORE_PERCS = true;
static const bool INCLUDE_WORST_PERCS = true;
static const bool INCLUDE_TOTALS = true;
static const bool INCLUDE_RATIOS = false;
static const bool INCLUDE_STDEV = true;

static const bool EXCLUDE_NANO_TS_COST = true;

long get_nano_ts(timespec* ts) {
	clock_gettime(CLOCK_MONOTONIC, ts);
	return ts->tv_sec * 1000000000 + ts->tv_nsec;
}

static const long NANO_COST_ITERATIONS = 10000000;

static long calc_nano_ts_cost() {

	struct timespec ts;

   	long start = get_nano_ts(&ts);

    long finish = start;

    for (long i = 0; i < NANO_COST_ITERATIONS; i++) {
    	finish = get_nano_ts(&ts);
   	}

    finish = get_nano_ts(&ts);
        
    return (finish - start) / NANO_COST_ITERATIONS;
}

struct mi {
   long value;
};

void add_perc(stringstream& ss, int size, double perc, map<int, mi*>* map) {

	if (map->empty()) return;
	
	int max = -1;
	int minBottom = -1;
	
	long x = round(perc * size);
	long i = 0;
	long iBottom = 0;
	
	long sum = 0;
	long sumBottom = 0;
	
	bool trueForTopFalseForBottom = true;
	bool flag = false;
	
	const int arraySize = 1024 * 1024 * 10;
	int* tempData = new int[arraySize];
	double stdevTop = -1;
	
	for(auto iter = map->begin(); iter != map->end(); iter++) {
	
		if (flag) break;
	
		int time = iter->first;
		long count = (iter->second)->value;
		
		for(int a = 0; a < count; a++) {
		
			if (trueForTopFalseForBottom) {
		
				tempData[i] = time;
		
				i++;
				sum += time;
				
				if (i == x) {
					
					max = time;
					
					if (INCLUDE_STDEV) {
    						
						double avg = (double) sum / (double) i;
						double temp = 0;
						
						for(int b = 0; b < i; b++) {
							int t = tempData[b];
							temp += (avg - t) * (avg - t);
						}
						
						stdevTop = sqrt(((double) temp / (double) i));
					}
				
					if (INCLUDE_WORST_PERCS) {
    					trueForTopFalseForBottom = false;	
    				} else {
    					flag = true;
						break;
    				}
				}
				
			} else {
			
				tempData[iBottom] = time;
			
				iBottom++;
				sumBottom += time;
				if (minBottom == -1) {
					minBottom = time;
				}
			}
		}
	}
	
	ss << " | " << fixed << setprecision(5) << (perc * 100) << "%";
	if (INCLUDE_TOTALS) ss << " (" << i << ")";
	ss << " = [avg: " << (sum / i);
	if (INCLUDE_STDEV) ss << ", stdev: " << fixed << setprecision(2) << stdevTop;
	ss << ", max: " << max << "]";
	if (INCLUDE_WORST_PERCS) {
		ss << " - " << fixed << setprecision(5) << ((1 - perc) * 100) << "%";
		if (INCLUDE_TOTALS) ss << " (" << (iBottom > 0 ? iBottom : 0) << ")";
		ss << " = [avg: " << (iBottom > 0 ? (sumBottom / iBottom) : -1);
		
		if (INCLUDE_STDEV) {
		
			ss << ", stdev: ";
		
			if (iBottom <= 0) {
				ss << "?";
			} else {
			
				double avgBottom = (sumBottom / iBottom);
				
				double temp = 0;
				
				for(int b = 0; b < iBottom; b++) {
					long t = tempData[b];
					temp += (avgBottom - t) * (avgBottom - t);
				}
				
				double stdevBottom = sqrt((double) temp / (double) iBottom);
			
				ss << fixed << setprecision(2) << stdevBottom;
			}
		
		}
		
		ss << ", min: " << (minBottom != -1 ? minBottom : -1) << "]";
		if (INCLUDE_RATIOS) ss << " R: " << fixed << setprecision(2) << (iBottom > 0 ? ( ((double) (sumBottom / iBottom) / (double) (sum / i) ) - 1) * 100 : -1) << "%";
	}
	
	delete[] tempData;
}

void swapping(int &a, int &b) {      //swap the content of a and b
   int temp;
   temp = a;
   a = b;
   b = temp;
}
void display(int *array, int size) {
   for(int i = 0; i<size; i++)
      cout << array[i] << " ";
   cout << endl;
}
void bubbleSort(int *array, int size) {
   for(int i = 0; i<size; i++) {
      int swaps = 0;         //flag to detect any swap is there or not
      for(int j = 0; j<size-i-1; j++) {
         if(array[j] > array[j+1]) {       //when the current item is bigger than next
            swapping(array[j], array[j+1]);
            swaps = 1;    //set swap flag
         }
      }
      if(!swaps)
         break;       // No swap in this pass, so array is sorted
   }
}

void doSomething(int *array, int size) {

	for(int z = 0; z < size; z++) {
		array[z] = size - z;
	}

	bubbleSort(array, size);
}

int main(int argc, char* argv[]) {
	
	int iterations = stoi(argv[1]);
	int warmup = stoi(argv[2]);
	int arraySize = stoi(argv[3]);
	int proc = stoi(argv[4]);
	
	cpu_set_t my_set;
	CPU_ZERO(&my_set);
	CPU_SET(proc, &my_set);
	sched_setaffinity(0, sizeof(cpu_set_t), &my_set);
	
	long nanoTimeCost = EXCLUDE_NANO_TS_COST ? calc_nano_ts_cost() : 0;
	
	struct timespec ts;
	
	long long x = 0;
	long long totalTime = 0;
	int minTime = numeric_limits<int>::max();
	int maxTime = numeric_limits<int>::min();
	
	map<int, mi*>* results = new map<int, mi*>();
	
	int * array = (int*) malloc(arraySize * sizeof(int));
	
	for(int i = 0; i < iterations; i++) {
	
	 	long start = get_nano_ts(&ts);

		doSomething(array, arraySize);	
		
		long end = get_nano_ts(&ts);
		
		for(int j = 0; j < arraySize; j++) {
			x += array[j];
		}

		int res = end - start - nanoTimeCost;
		
		if (res <= 0) res = 1;
		
		if (i >= warmup) {
			totalTime += res;
			minTime = min(minTime, res);
			maxTime = max(maxTime, res);
			
			auto iter = results->find(res);
			
			if (iter != results->end()) {
			
				(iter->second)->value = (iter->second)->value + 1;
				
			} else {
			
				mi* elem = new mi();
				elem->value = 1;
				(*results)[res] = elem;
			}
			
		}
	}
	
	int count = iterations - warmup;
	
	double avg = totalTime / count;
	
	cout << "Value computed: " << x << endl;
	display(array, arraySize);
	cout << "Nano timestamp cost: " << nanoTimeCost << endl;
	free(array);
	
	stringstream ss;
	
	ss << "Iterations: " << count << " | Avg Time: " << avg;


	if (INCLUDE_STDEV) {
	
		long temp = 0;
		long x = 0;
	
		for(auto iter = results->begin(); iter != results->end(); iter++) {
	
			int time = iter->first;
			long count = (iter->second)->value;
			
			for(int a = 0; a < count; a++) {
				temp += (avg - time) * (avg - time);
				x++;
			}
		}
		
		double stdev = sqrt( temp / x );
		
		ss << " | Stdev: " << fixed << setprecision(2) << stdev;
	}
	
	if (count > 0) {
		ss << " | Min Time: " << minTime << " | Max Time: " << maxTime;
	}
	
	add_perc(ss, count, 0.75, results);
	add_perc(ss, count, 0.90, results);
	add_perc(ss, count, 0.99, results);
	add_perc(ss, count, 0.999, results);
	add_perc(ss, count, 0.9999, results);
	add_perc(ss, count, 0.99999, results);
	
	if (MORE_PERCS) {
		add_perc(ss, count, 0.999999, results);
		add_perc(ss, count, 0.9999999, results);
	}

	cout << ss.str() << endl << endl;
		
	delete results;
	
	return 0;
}