[SPARK-54938][PYTHON][TEST][FOLLOW-UP] Fix inferred time unit for pandas >= 3

zhengruifeng · HyukjinKwon · commit 842eb7b7c2db · 2026-04-05T10:32:44.000+09:00
### What changes were proposed in this pull request? Fix the expected inferred time unit in the test for pandas >= 3. ### Why are the changes needed? There is a behavior change in pandas 3: as the outputs below show, the same datetime Series has dtype `<M8[us]` (microsecond) instead of `<M8[ns]` (nanosecond), so the Arrow type inferred from it changes from `timestamp[ns]` to `timestamp[us]`. ### Does this PR introduce _any_ user-facing change? No, test-only. ### How was this patch tested? Manually checked with pandas=2.3.3 ``` In [7]: pd.__version__ Out[7]: '2.3.3' In [8]: pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])).dtype Out[8]: dtype('<M8[ns]') In [9]: pa.array(pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"]))).type Out[9]: TimestampType(timestamp[ns]) ``` and with pandas=3.0.1 ``` In [6]: pd.__version__ Out[6]: '3.0.1' In [7]: pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])).dtype Out[7]: dtype('<M8[us]') In [8]: pa.array(pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"]))).type Out[8]: TimestampType(timestamp[us]) ``` ### Was this patch authored or co-authored using generative AI tooling? Co-authored-by: Claude code (Opus 4.6) Closes #55158 from zhengruifeng/fix-pyarrow-ts-inference. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py
@@ -299,6 +299,8 @@ def test_pandas_series_numpy_backed(self):
 
         # pandas >= 3 infers large_string instead of string for object-dtype string Series
         string_type = pa.large_string() if LooseVersion(pd.__version__) >= "3.0.0" else pa.string()
+        # pandas >= 3 defaults to microsecond resolution instead of nanosecond
+        ts_unit = "us" if LooseVersion(pd.__version__) >= "3.0.0" else "ns"
 
         sg = ZoneInfo("Asia/Singapore")
         la = "America/Los_Angeles"
@@ -324,17 +326,17 @@ def test_pandas_series_numpy_backed(self):
             (pd.Series([True, False, True]), pa.bool_()),
             # Temporal
             (pd.Series([date1, date2]), pa.date32()),
-            (pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp("1970-01-01")]), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp.min]), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp.max]), pa.timestamp("ns")),
-            (pd.Series(pd.to_timedelta(["1 day", "2 hours"])), pa.duration("ns")),
-            (pd.Series([pd.Timedelta(0)]), pa.duration("ns")),
-            (pd.Series([pd.Timedelta.min]), pa.duration("ns")),
-            (pd.Series([pd.Timedelta.max]), pa.duration("ns")),
+            (pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp("1970-01-01")]), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp.min]), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp.max]), pa.timestamp(ts_unit)),
+            (pd.Series(pd.to_timedelta(["1 day", "2 hours"])), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta(0)]), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta.min]), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta.max]), pa.duration(ts_unit)),
             # Timezone-aware
-            (pd.Series([dt1_sg, dt2_sg]), pa.timestamp("ns", tz="Asia/Singapore")),
-            (pd.Series([ts1_la, ts2_la]), pa.timestamp("ns", tz=la)),
+            (pd.Series([dt1_sg, dt2_sg]), pa.timestamp(ts_unit, tz="Asia/Singapore")),
+            (pd.Series([ts1_la, ts2_la]), pa.timestamp(ts_unit, tz=la)),
             # Binary
             (pd.Series([b"hello", b"world"]), pa.binary()),
             # Nested