Skip to content
Draft
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9d490f5
Disabled some benchmarks and scaled
stanbrub Mar 13, 2026
47f066f
Scaled up basic math combo
stanbrub Mar 13, 2026
dea74d7
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Mar 19, 2026
15cf1f4
Added a Local Parquet Generator as opposed to going through Kafka
stanbrub Mar 20, 2026
8604111
Added local parquet generator and 1st training test
stanbrub Mar 24, 2026
83b1c11
Added more train benchmarks. Improved Local Parquet Generator
stanbrub Mar 25, 2026
c552c01
Revert BasicMathCombo
stanbrub Mar 26, 2026
62aa96a
Revert BasicMathCombo
stanbrub Mar 26, 2026
f78ca22
Reverted scale and disabled for pre-train standard tests used for pre…
stanbrub Mar 26, 2026
e5412e7
Parallelized local parquet. worked around directory link failures
stanbrub Mar 31, 2026
ff4d891
Added 1st pass at benchmark even retrieval with JFR
stanbrub Apr 1, 2026
f35ab4f
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Apr 7, 2026
25629cc
Added jfr events
stanbrub Apr 7, 2026
254cca0
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Apr 7, 2026
528c365
Added UGP events
stanbrub Apr 9, 2026
bd5ff02
Rescaled only static trained for 120 secs
stanbrub Apr 10, 2026
75449bb
Updated adhoc for local parquet env variables
stanbrub Apr 10, 2026
ec2d95e
Open up dh data dir so local parquet can work
stanbrub Apr 10, 2026
a402a54
More logging for benchmark runs
stanbrub Apr 10, 2026
4cf8357
Scaling back AggBy because of system lockup
stanbrub Apr 10, 2026
8507794
Restrict the number of parquet threads and memory for the runner
stanbrub Apr 10, 2026
c0b5e7a
Fixed NaturalJoin OOM
stanbrub Apr 11, 2026
8f1a77f
Added separate scalling for static vs inc
stanbrub Apr 22, 2026
2938992
Better separation for running static and inc. Added ugp deltas
stanbrub Apr 23, 2026
9b326e0
turn on JFR metrics
stanbrub Apr 23, 2026
7fe14cc
Turn off Inc runs
stanbrub Apr 23, 2026
5e1d59c
Added ss_log budget metric
stanbrub May 5, 2026
a1316d4
Added runner setting for auto tune cycle factor
stanbrub May 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/resources/adhoc-scale-benchmark.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ schema.registry.addr=redpanda:8081
kafka.consumer.addr=redpanda:29092

# Default timeout to complete processes (Executing queries, generating records)
default.completion.timeout=10 minutes
default.completion.timeout=20 minutes

# Default data distribution for column data (random, ascending, descending, runlength)
default.data.distribution=${baseDistrib}
Expand Down
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@
<file>${project.basedir}/eclipse-java-google-style.xml</file>
</eclipse>
<licenseHeader>
<content>/* Copyright (c) 2022-$YEAR Deephaven Data Labs and Patent Pending */</content>
<content>/* Copyright (c) $YEAR Deephaven Data Labs and Patent Pending */</content>
</licenseHeader>
</java>
</configuration>
Expand Down Expand Up @@ -276,6 +276,12 @@
<artifactId>deephaven-java-client-barrage-dagger</artifactId>
<version>41.3</version>
</dependency>
<!-- Use the same parquet hadoop version as DHC -->
<dependency>
<groupId>blue.strategic.parquet</groupId>
<artifactId>parquet-floor</artifactId>
<version>1.64</version>
</dependency>
<dependency>
<groupId>io.deephaven</groupId>
<artifactId>deephaven-log-to-slf4j</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
/* Copyright (c) 2022-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.standard;

import static org.junit.jupiter.api.Assertions.assertTrue;
Expand Down Expand Up @@ -32,6 +32,8 @@ final public class StandardTestRunner {
private int staticFactor = 1;
private int incFactor = 1;
private int rowCountFactor = 1;
private boolean useMemorySource = true;
private boolean useLocalParquet = false;

public StandardTestRunner(Object testInst) {
this.testInst = testInst;
Expand Down Expand Up @@ -96,6 +98,25 @@ public void setServices(String... services) {
requiredServices.addAll(Arrays.asList(services));
}

/**
* Set if the generated tables are loaded into memory before running the test queries.
*
* @param useMemorySource true to load the generated tables into memory before the test queries run, otherwise false
*/
public void useMemorySource(boolean useMemorySource) {
this.useMemorySource = useMemorySource;
}

/**
* Set if the generated tables are created through Deephaven (i.e. real client-server) or through the local file
* system (i.e. a local copy). The default of "false" is preferred.
*
* @param useLocalParquet true to generate tables through the local file system, false to generate through Deephaven
*/
public void useLocalParquet(boolean useLocalParquet) {
this.useLocalParquet = useLocalParquet;
}

/**
* Add a query to be run directly after the main table is loaded. It is not measured. This query can transform the
* main table or supporting table, set up aggregations or updateby operations, etc.
Expand Down Expand Up @@ -193,40 +214,42 @@ public void test(String name, long maxExpectedRowCount, String operation, String
}
}

long getWarmupRowCount() {
return (long) (api.propertyAsIntegral("warmup.row.count", "0") * rowCountFactor);
public long getGeneratedRowCount() {
return (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
}

long getGeneratedRowCount() {
return (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
long getWarmupRowCount() {
return (long) (api.propertyAsIntegral("warmup.row.count", "0") * rowCountFactor);
}

long getMaxExpectedRowCount(long expectedRowCount, long scaleFactor) {
return (expectedRowCount < 1) ? Long.MAX_VALUE : expectedRowCount;
}

String getReadOperation(int scaleFactor, long rowCount, String... loadColumns) {
var headRows = (rowCount >= getGeneratedRowCount())?"":".head(${rows})";
var headRows = (rowCount >= getGeneratedRowCount()) ? "" : ".head(${rows})";
var selectStr = useMemorySource ? "select" : "view";
if (scaleFactor > 1 && mainTable.equals("timed") && Arrays.asList(loadColumns).contains("timestamp")) {
var read = """
merge([
read('/data/timed.parquet').view(formulas=[${loadColumns}])${headRows}
] * ${scaleFactor}).update_view([
'timestamp=timestamp.plusMillis((long)(ii / ${rows}) * ${rows})'
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we can't use the timestamp from the file? I have a few worries about doing rowset calculation as part of the benchmark (to come up with ii).

For the actual test benchmarks, without a select we would also just prefer more/bigger parquet files to avoid the overhead of going through the merge data structures. We might even be able to get away with symlinks to have the data just repeate itself.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the "train" benchmarks, since we don't use Scale Factors, that section of code will not be hit. This is only used when we are doing merges to simulate larger data sets. So for the nightly runs, this will happen BEFORE the "select" into memory, which is not included in the measurement. But for the "train" benchmarks, we only read timestamps directly from the parquet file(s), and that only if they are used in the benchmark (like for rollingtime).

]).select()
]).${selectStr}()
""";
read = read.replace("${headRows}",headRows);
read = read.replace("${headRows}", headRows).replace("${selectStr}", selectStr);
return read.replace("${scaleFactor}", "" + scaleFactor).replace("${rows}", "" + rowCount);
}

var read = "read('/data/${mainTable}.parquet')${headRows}.select(formulas=[${loadColumns}])";
var read = "read('/data/${mainTable}.parquet')${headRows}.${selectStr}(formulas=[${loadColumns}])";
read = (loadColumns.length == 0) ? ("empty_table(${rows})") : read;

if (scaleFactor > 1) {
read = "merge([${readTable}] * ${scaleFactor})".replace("${readTable}", read);
read = read.replace("${scaleFactor}", "" + scaleFactor);
}
return read.replace("${headRows}",headRows).replace("${rows}", "" + rowCount);
read = read.replace("${headRows}", headRows).replace("${rows}", "" + rowCount);
return read.replace("${selectStr}", selectStr);
}

String getStaticQuery(String name, String operation, long rowCount, String... loadColumns) {
Expand Down Expand Up @@ -435,7 +458,7 @@ boolean generateNamedTable(String name, String distribution, String[] groups) {
}

boolean generateSourceTable(String distribution, String[] groups) {
return api.table("source")
var t = api.table("source")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
.add("key1", "string", "[1-100]", distribution)
Expand All @@ -444,8 +467,8 @@ boolean generateSourceTable(String distribution, String[] groups) {
.add("key4", "int", "[0-98]", distribution)
.add("key5", "string", "[1-1000000]", distribution)
.withRowCount(getGeneratedRowCount())
.withColumnGrouping(groups)
.generateParquet();
.withColumnGrouping(groups);
return useLocalParquet ? t.generateLocalParquet() : t.generateParquet();
}

boolean generateRightTable(String distribution, String[] groups) {
Expand All @@ -469,7 +492,7 @@ boolean generateRightTable(String distribution, String[] groups) {
boolean generateTimedTable(String distribution, String[] groups) {
long minTime = 1676557157537L;
long maxTime = minTime + getGeneratedRowCount() - 1;
return api.table("timed")
var t = api.table("timed")
.add("timestamp", "timestamp-millis", "[" + minTime + "-" + maxTime + "]", "ascending")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
Expand All @@ -478,8 +501,8 @@ boolean generateTimedTable(String distribution, String[] groups) {
.add("key3", "int", "[0-8]", distribution)
.add("key4", "int", "[0-98]", distribution)
.withFixedRowCount(true)
.withColumnGrouping(groups)
.generateParquet();
.withColumnGrouping(groups);
return useLocalParquet ? t.generateLocalParquet() : t.generateParquet();
}

record Result(long loadedRowCount, Duration elapsedTime, long resultRowCount) {
Expand Down
41 changes: 41 additions & 0 deletions src/it/java/io/deephaven/benchmark/tests/train/AggByTrainTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the aggBy table operations that do aggregations (e.g. sum, std, min/max, var, avg). See
 * <code>TrainTestRunner</code> for more information.
 */
public class AggByTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    /**
     * Generate the "timed" table scaled by the given row factor and register the shared aggregation
     * list used by both tests.
     *
     * @param rowFactor multiplier applied to the base generated row count
     */
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");

        var setupStr = """
        from deephaven import agg

        aggs = [
            agg.sum_('Sum=num1'), agg.std('Std=num2'), agg.min_('Min=num1'), agg.max_('Max=num2'),
            agg.avg('Avg=num1'), agg.var('Var=num2'), agg.count_('num1')
        ]
        """;
        runner.addSetupQuery(setupStr);
    }

    @Test
    void aggBy0Groups() {
        setup(40);
        var q = "timed.agg_by(aggs)";
        runner.test("AggBy- No Groups", 1, q, "num1", "num2");
    }

    @Test
    void aggBy2Groups() {
        setup(20);
        var q = "timed.agg_by(aggs, by=['key1', 'key2'])";
        // Fix: removed the trailing space from the benchmark name so it matches the naming
        // convention used by every other test (e.g. "Ordered- 2 Groups 10K Unique Combos").
        runner.test("AggBy- 2 Groups 10K Unique Combos", 10100, q, "key1", "key2", "num1", "num2");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
 * Training tests for the whereIn table operation. Filters rows of data from the source table where the rows match
 * column values in the filter table. See <code>TrainTestRunner</code> for more information.
 */
@Tag("Iterate")
public class FilterTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" table scaled by the given row factor and builds the small in-memory
    // filter table that the where_in operations below match against.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
        var setup = """
        from deephaven.column import string_col, int_col
        where_filter = new_table([
            string_col("set1", ['1', '2', '3', '4', '5', '6', '7', '8']),
            string_col("set2", ['10', '20', '30', '40', '50', '60', '70', '80']),
            int_col("set3", [-1, -2, -3, -4, 1, 2, 3, 4])
        ])
        """;
        runner.addSetupQuery(setup);
    }

    @Test
    void filter1Col() {
        setup(40);
        // where_in keeps rows whose key1 appears in the filter table's set1 column, then a
        // plain where narrows further.
        var q = "timed.where_in(where_filter, cols=['key1 = set1']).where(['key1 < `4`'])";
        runner.test("Filter- 1 Col", 0, q, "key1", "num1");
    }

    @Test
    void filter3Cols() {
        setup(40);
        var q = """
        timed.where_in(where_filter, cols=['key1 = set1', 'key2 = set2', 'key3 = set3']) \
            .where(filters=["key1 = '1'", "key2 < '100'", "key3 in -2, -1, 0, 1, 2"])
        """;
        runner.test("Filter- 3 Cols", 0, q, "key1", "key2", "key3", "num1");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the formula table operations (e.g. udf, inline). See <code>TrainTestRunner</code> for more
 * information.
 */
public class FormulaTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    /**
     * Generate the "timed" table scaled by the given row factor.
     *
     * @param rowFactor multiplier applied to the base generated row count
     */
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
    }

    @Test
    void formulaUdf() {
        setup(5);
        // Defines one pure-python and one numpy-typed UDF to exercise both call paths.
        var setup = """
        def f_py(num1: float, num2: float) -> float:
            return (num2 + num1) / 2
        def f_np(num1: np.float64, num2: np.float64) -> np.float64:
            return num1 + num2
        """;
        runner.addSetupQuery(setup);
        var q = "timed.view(['New1 = f_py(num1, num2)','New2 = f_np(num1, num2)']).sum_by()";
        runner.test("Formula- UDF 2 Calcs", 1, q, "num1", "num2");
    }

    @Test
    void formulaInline() {
        setup(40);
        var q = "timed.view(['New1 = (float)((num2 + num1) / 2)', 'New2 = (float)(num1 + num2)']).sum_by()";
        runner.test("Formula- Inline 2 Calcs", 1, q, "num1", "num2");
    }

    @Test
    void formulaDate() {
        setup(1.75);
        // Fix: the second formula previously re-used the 'New1' column name, colliding with the
        // first formula; renamed to 'New2' so the view produces two distinct columns, matching
        // the other tests in this class.
        var q = """
        timed.view([
            'New1 = parseDuration(`PT4H52M14S`).toHours()',
            'New2 = parseInstant(`2023-05-31T04:52:14.001 ET`).getEpochSecond()'
        ]).sum_by()
        """;
        runner.test("Formula- Inline 2 Dates", 1, q, "num1", "num2");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the naturalJoin table operation. Joins rows from the "timed" table to matching rows in the
 * "right" table. See <code>TrainTestRunner</code> for more information.
 */
public class NaturalJoinTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" and "right" tables scaled by the given row factor.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed", "right");
    }

    @Test
    void naturalJoinOn1Col() {
        setup(40);
        // Deduplicate the right side first so natural_join has at most one match per key.
        var r = "right = right.select_distinct(['r_wild'])";
        runner.addSetupQuery(r);
        var q = "timed.natural_join(right, on=['key1 = r_wild'])";
        runner.test("NaturalJoin- Join On 1 Col", 0, q, "key1", "num1");
    }

    @Test
    void naturalJoinOn3Cols() {
        setup(20);
        // NOTE(review): 'key1' is matched against both 'r_wild' and 'r_key1' here — confirm the
        // third match column was not meant to use a different left-side key.
        var q = "timed.natural_join(right, on=['key1 = r_wild', 'key2 = r_key2', 'key1 = r_key1'])";
        runner.test("NaturalJoin- Join On 3 Cols", 0, q, "key1", "key2", "num1");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
 * Training tests for the aggBy table operations that do ordering (e.g. median, percentile, sorted_first/last). See
 * <code>TrainTestRunner</code> for more information.
 */
public class OrderedTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" table scaled by the given row factor and registers the ordered
    // aggregations shared by the tests below.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
        runner.addSetupQuery("""
        from deephaven import agg
        aggs = [
            agg.median('Median=num1'), agg.pct(0.50, ['Percentile=num1']),
            agg.unique('Unique=num2'), agg.sorted_first('key4', ['num2']),
            agg.sorted_last('key3', ['num1'])
        ]
        """);
    }

    @Test
    void ordered0Groups() {
        setup(21);
        var query = "timed.agg_by(aggs)";
        runner.test("Ordered- No Groups", 100, query, "key3", "key4", "num1", "num2");
    }

    @Test
    void ordered2Groups() {
        setup(5);
        var query = "timed.agg_by(aggs, by=['key1', 'key2'])";
        runner.test("Ordered- 2 Groups 10K Unique Combos", 10100, query, "key1", "key2", "key3", "key4", "num1",
                "num2");
    }

}
Loading
Loading