Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,7 @@ cython_debug/
#.idea/

# PyPI configuration file
.pypirc
.pypirc

# Models files added for offline diarization
models/*.bin
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ This repository provides fast automatic speech recognition (70x realtime with la

<h2 align="left", id="highlights">New🚨</h2>

- Offline diarization support added! Use local models without Hugging Face API access.
- 1st place at [Ego4d transcription challenge](https://eval.ai/web/challenges/challenge-page/1637/leaderboard/3931/WER) 🏆
- _WhisperX_ accepted at INTERSPEECH 2023
- v3 transcript segment-per-sentence: using nltk sent_tokenize for better subtitling & better diarization
Expand Down Expand Up @@ -110,6 +111,19 @@ To **enable Speaker Diarization**, include your Hugging Face access token (read)
> **Note**<br>
> As of Oct 11, 2023, there is a known issue regarding slow performance with pyannote/Speaker-Diarization-3.0 in whisperX. It is due to dependency conflicts between faster-whisper and pyannote-audio 3.0.0. Please see [this issue](https://github.com/m-bain/whisperX/issues/499) for more details and potential workarounds.

#### Offline Diarization

You can now run diarization fully offline, without a Hugging Face token. To set it up:

1. Download the necessary model files to a local `models` directory
2. Configure the diarization pipeline using the provided `models/pyannote_diarization_config.yaml` file

To use offline diarization with the command line:

```bash
whisperx path/to/audio.wav --model large-v2 --diarize --diarize_offline --diarize_config models/pyannote_diarization_config.yaml
```

<h2 align="left" id="example">Usage 💬 (command line)</h2>

### English
Expand Down Expand Up @@ -137,6 +151,10 @@ To label the transcript with speaker ID's (set number of speakers if known e.g.

whisperx path/to/audio.wav --model large-v2 --diarize --highlight_words True

To use offline diarization mode:

whisperx path/to/audio.wav --model large-v2 --diarize --diarize_offline --diarize_config models/pyannote_diarization_config.yaml

To run on CPU instead of GPU (and for running on Mac OS X):

whisperx path/to/audio.wav --compute_type int8
Expand Down Expand Up @@ -192,8 +210,15 @@ print(result["segments"]) # after alignment
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
# Option 1: HF-token based diarization (online)
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

# Option 2: Local/offline diarization
# diarize_model = whisperx.OfflineDiarizationPipeline(
# config_path="models/pyannote_diarization_config.yaml",
# device=device
# )

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
Expand Down Expand Up @@ -267,6 +292,8 @@ Bug finding and pull requests are also highly appreciated to keep this project g

* [x] Allow silero-vad as alternative VAD option

* [x] Add offline diarization support

* [ ] Improve diarization (word level). *Harder than first thought...*


Expand Down
19 changes: 19 additions & 0 deletions models/pyannote_diarization_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
version: 3.1.0

pipeline:
name: pyannote.audio.pipelines.SpeakerDiarization
params:
clustering: AgglomerativeClustering
embedding: models/pyannote_model_wespeaker-voxceleb-resnet34-LM.bin
embedding_batch_size: 32
embedding_exclude_overlap: true
segmentation: models/pyannote_model_segmentation-3.0.bin
segmentation_batch_size: 32

params:
clustering:
method: centroid
min_cluster_size: 12
threshold: 0.7045654963945799
segmentation:
min_duration_off: 0.0
65 changes: 65 additions & 0 deletions whisperx/offline_diarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from pyannote.audio import Pipeline

from .audio import load_audio, SAMPLE_RATE

class OfflineDiarizationPipeline:
    """Speaker diarization from a locally stored pyannote pipeline.

    Loads a pyannote ``Pipeline`` from a local YAML config — no Hugging Face
    token or network access required — and exposes the same call interface as
    the online ``DiarizationPipeline``: audio in, DataFrame of speaker
    segments out.
    """

    def __init__(
        self,
        config_path,
        device="cpu",
    ):
        """
        Args:
            config_path: Path to a pyannote pipeline config YAML whose model
                paths (e.g. ``models/*.bin``) are relative to the directory
                containing the config's parent folder, as in
                ``models/pyannote_diarization_config.yaml``.
            device: Device to run on, as a string (e.g. ``"cuda"``) or a
                ``torch.device``.

        Raises:
            FileNotFoundError: if ``config_path`` does not exist.
        """
        if isinstance(device, str):
            device = torch.device(device)

        # Load the pipeline with local config and move it to the target device
        self.model = self._load_pipeline_from_pretrained(config_path).to(device)

    def _load_pipeline_from_pretrained(self, path_to_config):
        """Load a pyannote pipeline whose config references relative model paths.

        pyannote resolves the model paths in the config relative to the
        current working directory, so we temporarily chdir to the folder that
        contains the ``models`` directory while loading, then restore the
        original working directory.

        Raises:
            FileNotFoundError: if ``path_to_config`` does not exist.
        """
        # Resolve to an absolute path up-front: after the chdir below, a
        # relative config path would no longer point at the config file.
        path_to_config = Path(path_to_config).resolve()

        if not path_to_config.exists():
            raise FileNotFoundError(f"Config file not found: {path_to_config}")

        print(f"Loading pyannote pipeline from {path_to_config}...")
        # the paths in the config are relative to the current working directory
        # so we need to change the working directory to the model path
        # and then change it back

        cwd = Path.cwd().resolve()  # store current working directory

        # first .parent is the folder of the config, second .parent is the folder containing the 'models' folder
        cd_to = path_to_config.parent.parent

        print(f"Changing working directory to {cd_to}")
        # NOTE(review): os.chdir is process-global and therefore not
        # thread-safe; fine for CLI use, worth confirming for library callers.
        os.chdir(cd_to)
        try:
            pipeline = Pipeline.from_pretrained(path_to_config)
        finally:
            # Always restore the working directory, even if loading fails,
            # so a bad config does not leave the process in the wrong cwd.
            print(f"Changing working directory back to {cwd}")
            os.chdir(cwd)

        return pipeline

    def __call__(
        self,
        audio,
        num_speakers=None,
        min_speakers=None,
        max_speakers=None,
    ):
        """Run diarization on an audio file or waveform.

        Args:
            audio: Path to an audio file, or a 1-D float numpy waveform
                assumed to be sampled at ``SAMPLE_RATE``.
            num_speakers: Exact number of speakers, if known.
            min_speakers: Lower bound on the number of speakers.
            max_speakers: Upper bound on the number of speakers.

        Returns:
            pandas.DataFrame with columns ``segment``, ``label``, ``speaker``,
            plus ``start`` and ``end`` times in seconds.
        """
        if isinstance(audio, str):
            audio = load_audio(audio)
        # pyannote expects a (channels, samples) tensor plus the sample rate.
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df
28 changes: 25 additions & 3 deletions whisperx/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
from whisperx.asr import load_model
from whisperx.audio import load_audio
from whisperx.diarize import DiarizationPipeline, assign_word_speakers
# Import the offline diarization pipeline
try:
from .offline_diarize import OfflineDiarizationPipeline
except ImportError:
OfflineDiarizationPipeline = None
from whisperx.types import AlignedTranscriptionResult, TranscriptionResult
from whisperx.utils import (
LANGUAGES,
Expand Down Expand Up @@ -56,6 +61,9 @@ def cli():
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
# Add offline diarization params
parser.add_argument("--diarize_offline", action="store_true", help="Use offline diarization models instead of downloading from HF")
parser.add_argument("--diarize_config", type=str, default="models/pyannote_diarization_config.yaml", help="Path to the diarization config file for offline mode")

parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
Expand Down Expand Up @@ -120,6 +128,8 @@ def cli():
chunk_size: int = args.pop("chunk_size")

diarize: bool = args.pop("diarize")
diarize_offline: bool = args.pop("diarize_offline")
diarize_config: str = args.pop("diarize_config")
min_speakers: int = args.pop("min_speakers")
max_speakers: int = args.pop("max_speakers")
print_progress: bool = args.pop("print_progress")
Expand Down Expand Up @@ -238,12 +248,24 @@ def cli():

# >> Diarize
if diarize:
if hf_token is None:
print("Warning, no --hf_token used, needs to be saved in environment variable, otherwise will throw error loading diarization model...")
tmp_results = results
print(">>Performing diarization...")
results = []
diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)

if diarize_offline:
if OfflineDiarizationPipeline is None:
raise ImportError("offline_diarize.py must be in the same directory as transcribe.py. Please ensure it's properly installed.")

print(f"Using offline diarization with config: {diarize_config}")
diarize_model = OfflineDiarizationPipeline(
config_path=diarize_config,
device=device
)
else:
if hf_token is None:
print("Warning, no --hf_token used, needs to be saved in environment variable, otherwise will throw error loading diarization model...")
diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)

for result, input_audio_path in tmp_results:
diarize_segments = diarize_model(input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers)
result = assign_word_speakers(diarize_segments, result)
Expand Down