|
1 | 1 | import inspect |
| 2 | +import logging |
2 | 3 | import os |
| 4 | +from types import SimpleNamespace |
3 | 5 |
|
4 | 6 | import numpy as np |
5 | 7 |
|
6 | | -from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio |
| 8 | +from faster_whisper import BatchedInferencePipeline, decode_audio, WhisperModel |
| 9 | + |
| 10 | + |
| 11 | +class _DummyFeatureExtractor: |
| 12 | + sampling_rate = 16000 |
| 13 | + chunk_length = 30 |
| 14 | + |
| 15 | + def __call__(self, audio, chunk_length=None): |
| 16 | + return np.zeros((80, 4), dtype="float32") |
| 17 | + |
| 18 | + |
def _make_dummy_batched_model():
    """Build a minimal object mimicking the WhisperModel attributes that
    BatchedInferencePipeline touches in these tests.

    Returns a SimpleNamespace exposing a dummy feature extractor, frame
    rate, tokenizer placeholder, logger, and an inner model namespace
    marked as non-multilingual.
    """
    inner_model = SimpleNamespace(is_multilingual=False)

    stub = SimpleNamespace(
        feature_extractor=_DummyFeatureExtractor(),
        frames_per_second=50,
        hf_tokenizer=object(),
        logger=logging.getLogger("test.batched_options"),
        model=inner_model,
    )
    return stub
7 | 29 |
|
8 | 30 |
|
9 | 31 | def test_supported_languages(): |
@@ -313,3 +335,71 @@ def test_cliptimestamps_timings(physcisworks_path): |
313 | 335 | assert clip["start"] == segment.start |
314 | 336 | assert clip["end"] == segment.end |
315 | 337 | assert segment.text == transcript |
| 338 | + |
| 339 | + |
def test_batched_transcribe_respects_condition_on_previous_text(monkeypatch):
    """Both True and False must flow through to the decode options.

    The tokenizer and the segment generator are stubbed out so that only
    option plumbing inside ``transcribe`` is exercised.
    """
    pipeline = BatchedInferencePipeline(_make_dummy_batched_model())
    seen_options = []

    # Avoid loading a real tokenizer; any object satisfies the plumbing here.
    monkeypatch.setattr(
        "faster_whisper.transcribe.Tokenizer",
        lambda *args, **kwargs: object(),
    )

    def _capture_generator(*args):
        # The options object is passed as the 5th positional argument;
        # record it and yield no segments.
        seen_options.append(args[4])
        return iter(())

    monkeypatch.setattr(pipeline, "_batched_segments_generator", _capture_generator)

    audio = np.zeros(1600, dtype="float32")
    clips = [{"start": 0.0, "end": 0.1}]

    for flag in (True, False):
        seen_options.clear()
        _, info = pipeline.transcribe(
            audio,
            language="en",
            clip_timestamps=clips,
            condition_on_previous_text=flag,
            suppress_tokens=[],
        )
        # The flag must appear unchanged both on the returned info and on
        # the options actually handed to the segment generator.
        assert info.transcription_options.condition_on_previous_text is flag
        assert seen_options[0].condition_on_previous_text is flag
| 380 | + |
| 381 | + |
def test_batched_transcribe_respects_max_initial_timestamp(monkeypatch):
    """A custom ``max_initial_timestamp`` must reach the decode options
    unchanged, both on the returned info and on the options handed to the
    segment generator."""
    pipeline = BatchedInferencePipeline(_make_dummy_batched_model())
    recorded = []

    # Stub the tokenizer so no model assets are required.
    monkeypatch.setattr(
        "faster_whisper.transcribe.Tokenizer",
        lambda *args, **kwargs: object(),
    )

    def _record_options(*args):
        # 5th positional argument is the options object; emit no segments.
        recorded.append(args[4])
        return iter(())

    monkeypatch.setattr(pipeline, "_batched_segments_generator", _record_options)

    silence = np.zeros(1600, dtype="float32")
    _, info = pipeline.transcribe(
        silence,
        language="en",
        clip_timestamps=[{"start": 0.0, "end": 0.1}],
        max_initial_timestamp=1.7,
        suppress_tokens=[],
    )

    assert info.transcription_options.max_initial_timestamp == 1.7
    assert recorded[0].max_initial_timestamp == 1.7
0 commit comments