In [None]:
# Install system dependencies, specifically ffmpeg
!apt-get update && apt-get install -y ffmpeg

# Install PyTorch with CUDA support (Colab usually has CUDA 11.8 or 12.1)
# The default PyTorch installed might not be compatible, so this ensures the right version.
# Check your Colab runtime for the CUDA version using !nvidia-smi and change 'cu118' if necessary.
!pip install torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu118

# Install the specific versions of whisperx and related libraries that are compatible with each other
!pip install ctranslate2==4.4.0 faster-whisper==1.1.0 whisperx==3.3.1 pyannote.audio==3.3.2


0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,849 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,539 kB]
Get:12 http://s

In [1]:
import gc
import os
from getpass import getpass

import torch
import whisperx

# --- Configuration ---
audio_file = "VoiceRec 20251211 113506__00_00_00000__00_58_50631.mp3" # <-- CHANGE THIS to your file name

# SECURITY: never hard-code the Hugging Face token in the notebook - anyone who can
# read the file can use it. A token that was previously committed here must be
# revoked at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable, with an interactive
# (non-echoing) prompt as a fallback.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Set device to 'cuda' (GPU) if available, otherwise 'cpu'
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16 # Adjust batch size based on GPU memory (higher is faster, requires more VRAM)
# float16 is only usable on GPU; fall back to int8 when running on CPU.
compute_type = "float16" if device == "cuda" else "int8"

print(f"Using device: {device}")


def _release_gpu_memory():
    """Run the garbage collector, then hand cached CUDA memory back to the driver.

    Only has an effect on objects that are no longer referenced, so callers must
    `del` the model they are done with before calling this.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# Decode the audio once and reuse the waveform for every stage instead of letting
# each stage re-read and re-decode the file from disk.
audio = whisperx.load_audio(audio_file)

# 1. Load the Whisper model
# 'large-v3' is a good choice for high accuracy. Use 'base' or 'small' for faster processing.
model = whisperx.load_model("large-v3", device, compute_type=compute_type)

print("Model loaded. Transcribing audio...")

# 2. Transcribe the audio
# WhisperX automatically chunks audio during processing.
transcription = model.transcribe(audio, batch_size=batch_size)
print("Transcription complete.")

# Clear GPU memory after transcription step. The model has to be dereferenced first,
# otherwise empty_cache() has nothing it is allowed to free.
del model
_release_gpu_memory()

# 3. Align the transcript timestamps (optional but recommended for accuracy)
print("Aligning timestamps...")
model_a, metadata = whisperx.load_align_model(language_code=transcription["language"], device=device)
aligned = whisperx.align(transcription["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print("Alignment complete.")

# Clear GPU memory after alignment step
del model_a
_release_gpu_memory()

# 4. Apply speaker diarization
print("Starting speaker diarization...")
# Diarization models require your Hugging Face access token for authentication
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
diarize_segments = diarize_model(audio)
print("Diarization complete.")

# 5. Assign speakers to the aligned segments
# whisperx exposes this step as `assign_word_speakers`; there is no
# `assign_speakers_to_segments` function in the package.
print("Assigning speakers to segments...")
result = whisperx.assign_word_speakers(diarize_segments, aligned)

# Clear GPU memory after diarization step
del diarize_model
_release_gpu_memory()

# --- Output the results ---
# Format each segment once as "[StartTime - EndTime] [Speaker]: Text" and reuse the
# same lines for both the on-screen transcript and the saved file.
transcript_lines = [
    f"[{segment['start']:.2f}s - {segment['end']:.2f}s] [{segment.get('speaker', 'N/A')}]: {segment['text']}"
    for segment in result["segments"]
]

print("\n--- Diarized Transcript ---")
for line in transcript_lines:
    print(line)

# Optionally, save the result to a text file for easy download.
# UTF-8 is set explicitly so non-ASCII transcript text is written correctly
# regardless of the platform's default encoding.
output_filename = "transcript_with_speakers.txt"
with open(output_filename, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in transcript_lines)

print(f"\nTranscript saved to {output_filename}")




AttributeError: module 'torchaudio' has no attribute 'AudioMetaData'