Skip to content
Draft
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9d490f5
Disabled some benchmarks and scaled
stanbrub Mar 13, 2026
47f066f
Scaled up basic math combo
stanbrub Mar 13, 2026
dea74d7
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Mar 19, 2026
15cf1f4
Added a Local Parquet Generator as opposed to going through Kafka
stanbrub Mar 20, 2026
8604111
Added local parquet generator and 1st training test
stanbrub Mar 24, 2026
83b1c11
Added more train benchmarks. Improved Local Parquet Generator
stanbrub Mar 25, 2026
c552c01
Revert BasicMathCombo
stanbrub Mar 26, 2026
62aa96a
Revert BasicMathCombo
stanbrub Mar 26, 2026
f78ca22
Reverted scale and disabled for pre-train standard tests used for pre…
stanbrub Mar 26, 2026
e5412e7
Parallelized local parquet. worked around directory link failures
stanbrub Mar 31, 2026
ff4d891
Added 1st pass at benchmark even retrieval with JFR
stanbrub Apr 1, 2026
f35ab4f
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Apr 7, 2026
25629cc
Added jfr events
stanbrub Apr 7, 2026
254cca0
Merge branch 'deephaven:main' into gc-benchmarking
stanbrub Apr 7, 2026
528c365
Added UGP events
stanbrub Apr 9, 2026
bd5ff02
Rescaled only static trained for 120 secs
stanbrub Apr 10, 2026
75449bb
Updated adhoc for local parquet env variables
stanbrub Apr 10, 2026
ec2d95e
Open up dh data dir so local parquet can work
stanbrub Apr 10, 2026
a402a54
More logging for benchmark runs
stanbrub Apr 10, 2026
4cf8357
Scaling back AggBy because of system lockup
stanbrub Apr 10, 2026
8507794
Restrict the number of parquet threads and memory for the runner
stanbrub Apr 10, 2026
c0b5e7a
Fixed NaturalJoin OOM
stanbrub Apr 11, 2026
8f1a77f
Added separate scalling for static vs inc
stanbrub Apr 22, 2026
2938992
Better separation for running static and inc. Added ugp deltas
stanbrub Apr 23, 2026
9b326e0
turn on JFR metrics
stanbrub Apr 23, 2026
7fe14cc
Turn off Inc runs
stanbrub Apr 23, 2026
5e1d59c
Added ss_log budget metric
stanbrub May 5, 2026
a1316d4
Added runner setting for auto tune cycle factor
stanbrub May 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/resources/adhoc-scale-benchmark.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ schema.registry.addr=redpanda:8081
kafka.consumer.addr=redpanda:29092

# Default timeout to complete processes (Executing queries, generating records)
default.completion.timeout=10 minutes
default.completion.timeout=20 minutes

# Default data distribution for column data (random, ascending, descending, runlength)
default.data.distribution=${baseDistrib}
Expand Down
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@
<file>${project.basedir}/eclipse-java-google-style.xml</file>
</eclipse>
<licenseHeader>
<content>/* Copyright (c) 2022-$YEAR Deephaven Data Labs and Patent Pending */</content>
<content>/* Copyright (c) $YEAR Deephaven Data Labs and Patent Pending */</content>
</licenseHeader>
</java>
</configuration>
Expand Down Expand Up @@ -276,6 +276,12 @@
<artifactId>deephaven-java-client-barrage-dagger</artifactId>
<version>41.3</version>
</dependency>
<!-- Use the same parquet hadoop version as DHC -->
<dependency>
<groupId>blue.strategic.parquet</groupId>
<artifactId>parquet-floor</artifactId>
<version>1.64</version>
</dependency>
<dependency>
<groupId>io.deephaven</groupId>
<artifactId>deephaven-log-to-slf4j</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
/* Copyright (c) 2022-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.standard;

import static org.junit.jupiter.api.Assertions.assertTrue;
Expand Down Expand Up @@ -32,6 +32,8 @@ final public class StandardTestRunner {
private int staticFactor = 1;
private int incFactor = 1;
private int rowCountFactor = 1;
private boolean useMemorySource = true;
private boolean useLocalParquet = false;

public StandardTestRunner(Object testInst) {
this.testInst = testInst;
Expand Down Expand Up @@ -96,6 +98,25 @@ public void setServices(String... services) {
requiredServices.addAll(Arrays.asList(services));
}

/**
* Set if the generated tables are loaded into memory before running the test queries.
*
* @param useMemorySource true to load the generated tables into memory before the test queries run, otherwise false
*/
public void useMemorySource(boolean useMemorySource) {
this.useMemorySource = useMemorySource;
}

/**
* Set if the generated tables are created through Deephaven (i.e. real client-server) or through the local file
* system (i.e. a local copy). The default of "false" is preferred.
*
* @param useLocalParquet true to generate tables through the local file system, false to generate through Deephaven
*/
public void useLocalParquet(boolean useLocalParquet) {
this.useLocalParquet = useLocalParquet;
}

/**
* Add a query to be run directly after the main table is loaded. It is not measured. This query can transform the
* main table or supporting table, set up aggregations or updateby operations, etc.
Expand Down Expand Up @@ -193,40 +214,42 @@ public void test(String name, long maxExpectedRowCount, String operation, String
}
}

long getWarmupRowCount() {
return (long) (api.propertyAsIntegral("warmup.row.count", "0") * rowCountFactor);
public long getGeneratedRowCount() {
return (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
}

long getGeneratedRowCount() {
return (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
long getWarmupRowCount() {
return (long) (api.propertyAsIntegral("warmup.row.count", "0") * rowCountFactor);
}

long getMaxExpectedRowCount(long expectedRowCount, long scaleFactor) {
return (expectedRowCount < 1) ? Long.MAX_VALUE : expectedRowCount;
}

String getReadOperation(int scaleFactor, long rowCount, String... loadColumns) {
var headRows = (rowCount >= getGeneratedRowCount())?"":".head(${rows})";
var headRows = (rowCount >= getGeneratedRowCount()) ? "" : ".head(${rows})";
var selectStr = useMemorySource ? "select" : "view";
if (scaleFactor > 1 && mainTable.equals("timed") && Arrays.asList(loadColumns).contains("timestamp")) {
var read = """
merge([
read('/data/timed.parquet').view(formulas=[${loadColumns}])${headRows}
] * ${scaleFactor}).update_view([
'timestamp=timestamp.plusMillis((long)(ii / ${rows}) * ${rows})'
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we can't use the timestamp from the file? I have a few worries about doing rowset calculation as part of the benchmark (to come up with ii).

For the actual test benchmarks, without a select we would also just prefer more/bigger parquet files to avoid the overhead of going through the merge data structures. We might even be able to get away with symlinks to have the data just repeate itself.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the "train" benchmarks, since we don't use Scale Factors, that section of code will not be hit. This is only used when we are doing merges to simulate larger data sets. So for the nightly runs, this will happen BEFORE the "select" into memory, which is not included in the measurement. But for the "train" benchmarks, we only read timestamps directly from the parquet file(s), and that only if they are used in the benchmark (like for rollingtime).

]).select()
]).${selectStr}()
""";
read = read.replace("${headRows}",headRows);
read = read.replace("${headRows}", headRows).replace("${selectStr}", selectStr);
return read.replace("${scaleFactor}", "" + scaleFactor).replace("${rows}", "" + rowCount);
}

var read = "read('/data/${mainTable}.parquet')${headRows}.select(formulas=[${loadColumns}])";
var read = "read('/data/${mainTable}.parquet')${headRows}.${selectStr}(formulas=[${loadColumns}])";
read = (loadColumns.length == 0) ? ("empty_table(${rows})") : read;

if (scaleFactor > 1) {
read = "merge([${readTable}] * ${scaleFactor})".replace("${readTable}", read);
read = read.replace("${scaleFactor}", "" + scaleFactor);
}
return read.replace("${headRows}",headRows).replace("${rows}", "" + rowCount);
read = read.replace("${headRows}", headRows).replace("${rows}", "" + rowCount);
return read.replace("${selectStr}", selectStr);
}

String getStaticQuery(String name, String operation, long rowCount, String... loadColumns) {
Expand Down Expand Up @@ -435,7 +458,7 @@ boolean generateNamedTable(String name, String distribution, String[] groups) {
}

boolean generateSourceTable(String distribution, String[] groups) {
return api.table("source")
var t = api.table("source")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
.add("key1", "string", "[1-100]", distribution)
Expand All @@ -444,8 +467,8 @@ boolean generateSourceTable(String distribution, String[] groups) {
.add("key4", "int", "[0-98]", distribution)
.add("key5", "string", "[1-1000000]", distribution)
.withRowCount(getGeneratedRowCount())
.withColumnGrouping(groups)
.generateParquet();
.withColumnGrouping(groups);
return useLocalParquet ? t.generateLocalParquet() : t.generateParquet();
}

boolean generateRightTable(String distribution, String[] groups) {
Expand All @@ -469,7 +492,7 @@ boolean generateRightTable(String distribution, String[] groups) {
boolean generateTimedTable(String distribution, String[] groups) {
long minTime = 1676557157537L;
long maxTime = minTime + getGeneratedRowCount() - 1;
return api.table("timed")
var t = api.table("timed")
.add("timestamp", "timestamp-millis", "[" + minTime + "-" + maxTime + "]", "ascending")
.add("num1", "double", "[0-4]", distribution)
.add("num2", "double", "[1-10]", distribution)
Expand All @@ -478,8 +501,8 @@ boolean generateTimedTable(String distribution, String[] groups) {
.add("key3", "int", "[0-8]", distribution)
.add("key4", "int", "[0-98]", distribution)
.withFixedRowCount(true)
.withColumnGrouping(groups)
.generateParquet();
.withColumnGrouping(groups);
return useLocalParquet ? t.generateLocalParquet() : t.generateParquet();
}

record Result(long loadedRowCount, Duration elapsedTime, long resultRowCount) {
Expand Down
41 changes: 41 additions & 0 deletions src/it/java/io/deephaven/benchmark/tests/train/AggByTrainTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the aggBy table operations that do aggregations (e.g. sum, std, min/max, var, avg). See
 * <code>TrainTestRunner</code> for more information.
 */
public class AggByTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    /**
     * Generate the "timed" table scaled by the given row factor and register the shared aggregation
     * list used by both tests.
     *
     * @param rowFactor multiplier applied to the base generated row count
     */
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");

        var setupStr = """
        from deephaven import agg

        aggs = [
            agg.sum_('Sum=num1'), agg.std('Std=num2'), agg.min_('Min=num1'), agg.max_('Max=num2'),
            agg.avg('Avg=num1'), agg.var('Var=num2'), agg.count_('num1')
        ]
        """;
        runner.addSetupQuery(setupStr);
    }

    @Test
    void aggBy0Groups() {
        setup(40);
        var q = "timed.agg_by(aggs)";
        runner.test("AggBy- No Groups", 1, q, "num1", "num2");
    }

    @Test
    void aggBy2Groups() {
        setup(20);
        var q = "timed.agg_by(aggs, by=['key1', 'key2'])";
        // Fix: removed the trailing space from the benchmark name so it matches the naming
        // convention used by every other test (e.g. "Ordered- 2 Groups 10K Unique Combos").
        runner.test("AggBy- 2 Groups 10K Unique Combos", 10100, q, "key1", "key2", "num1", "num2");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright (c) 2022-2024 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
 * Training tests for the whereIn table operation. Filters rows of data from the source table where the rows match
 * column values in the filter table. See <code>TrainTestRunner</code> for more information.
 */
@Tag("Iterate")
public class FilterTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" table scaled by the given row factor and builds the small in-memory
    // filter table that the where_in operations below match against.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
        var setup = """
        from deephaven.column import string_col, int_col
        where_filter = new_table([
            string_col("set1", ['1', '2', '3', '4', '5', '6', '7', '8']),
            string_col("set2", ['10', '20', '30', '40', '50', '60', '70', '80']),
            int_col("set3", [-1, -2, -3, -4, 1, 2, 3, 4])
        ])
        """;
        runner.addSetupQuery(setup);
    }

    @Test
    void filter1Col() {
        setup(40);
        // where_in keeps rows whose key1 appears in the filter table's set1 column, then a
        // plain where narrows further.
        var q = "timed.where_in(where_filter, cols=['key1 = set1']).where(['key1 < `4`'])";
        runner.test("Filter- 1 Col", 0, q, "key1", "num1");
    }

    @Test
    void filter3Cols() {
        setup(40);
        var q = """
        timed.where_in(where_filter, cols=['key1 = set1', 'key2 = set2', 'key3 = set3']) \
            .where(filters=["key1 = '1'", "key2 < '100'", "key3 in -2, -1, 0, 1, 2"])
        """;
        runner.test("Filter- 3 Cols", 0, q, "key1", "key2", "key3", "num1");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the formula table operations (e.g. udf, inline). See <code>TrainTestRunner</code> for more
 * information.
 */
public class FormulaTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    /**
     * Generate the "timed" table scaled by the given row factor.
     *
     * @param rowFactor multiplier applied to the base generated row count
     */
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
    }

    @Test
    void formulaUdf() {
        setup(5);
        // Defines one pure-python and one numpy-typed UDF to exercise both call paths.
        var setup = """
        def f_py(num1: float, num2: float) -> float:
            return (num2 + num1) / 2
        def f_np(num1: np.float64, num2: np.float64) -> np.float64:
            return num1 + num2
        """;
        runner.addSetupQuery(setup);
        var q = "timed.view(['New1 = f_py(num1, num2)','New2 = f_np(num1, num2)']).sum_by()";
        runner.test("Formula- UDF 2 Calcs", 1, q, "num1", "num2");
    }

    @Test
    void formulaInline() {
        setup(40);
        var q = "timed.view(['New1 = (float)((num2 + num1) / 2)', 'New2 = (float)(num1 + num2)']).sum_by()";
        runner.test("Formula- Inline 2 Calcs", 1, q, "num1", "num2");
    }

    @Test
    void formulaDate() {
        setup(1.75);
        // Fix: the second formula previously re-used the 'New1' column name, colliding with the
        // first formula; renamed to 'New2' so the view produces two distinct columns, matching
        // the other tests in this class.
        var q = """
        timed.view([
            'New1 = parseDuration(`PT4H52M14S`).toHours()',
            'New2 = parseInstant(`2023-05-31T04:52:14.001 ET`).getEpochSecond()'
        ]).sum_by()
        """;
        runner.test("Formula- Inline 2 Dates", 1, q, "num1", "num2");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;

/**
 * Training tests for the naturalJoin table operation. Joins rows from the "timed" table to matching rows in the
 * "right" table. See <code>TrainTestRunner</code> for more information.
 */
public class NaturalJoinTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" and "right" tables scaled by the given row factor.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed", "right");
    }

    @Test
    void naturalJoinOn1Col() {
        setup(40);
        // Deduplicate the right side first so natural_join has at most one match per key.
        var r = "right = right.select_distinct(['r_wild'])";
        runner.addSetupQuery(r);
        var q = "timed.natural_join(right, on=['key1 = r_wild'])";
        runner.test("NaturalJoin- Join On 1 Col", 0, q, "key1", "num1");
    }

    @Test
    void naturalJoinOn3Cols() {
        setup(20);
        // NOTE(review): 'key1' is matched against both 'r_wild' and 'r_key1' here — confirm the
        // third match column was not meant to use a different left-side key.
        var q = "timed.natural_join(right, on=['key1 = r_wild', 'key2 = r_key2', 'key1 = r_key1'])";
        runner.test("NaturalJoin- Join On 3 Cols", 0, q, "key1", "key2", "num1");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Copyright (c) 2026-2026 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.tests.train;

import org.junit.jupiter.api.*;
import io.deephaven.benchmark.tests.standard.StandardTestRunner;

/**
 * Training tests for the aggBy table operations that do ordering (e.g. median, percentile, sorted_first/last). See
 * <code>TrainTestRunner</code> for more information.
 */
public class OrderedTrainTest {
    final TrainTestRunner runner = new TrainTestRunner(this);

    // Generates the "timed" table scaled by the given row factor and registers the ordered
    // aggregations shared by the tests below.
    void setup(double rowFactor) {
        runner.tables(rowFactor, "timed");
        runner.addSetupQuery("""
        from deephaven import agg
        aggs = [
            agg.median('Median=num1'), agg.pct(0.50, ['Percentile=num1']),
            agg.unique('Unique=num2'), agg.sorted_first('key4', ['num2']),
            agg.sorted_last('key3', ['num1'])
        ]
        """);
    }

    @Test
    void ordered0Groups() {
        setup(21);
        var query = "timed.agg_by(aggs)";
        runner.test("Ordered- No Groups", 100, query, "key3", "key4", "num1", "num2");
    }

    @Test
    void ordered2Groups() {
        setup(5);
        var query = "timed.agg_by(aggs, by=['key1', 'key2'])";
        runner.test("Ordered- 2 Groups 10K Unique Combos", 10100, query, "key1", "key2", "key3", "key4", "num1",
                "num2");
    }

}
Loading
Loading