Add support for GPU (MPS and CUDA)
Migrate to `uv`
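
In short: the transcriber now picks the best available torch device at runtime instead of hard-coding MPS, and the transcription language is configurable. A minimal standalone sketch of the new flow (constant and device order are taken from the diff below; the audio path is a placeholder assumption):

```python
import torch
from transformers import pipeline

WHISPER_MODEL = "openai/whisper-large-v2"  # same constant as in this diff

def get_device() -> str:
    # Same preference order as the get_device() added below: MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

transcriber = pipeline(
    "automatic-speech-recognition",
    model=WHISPER_MODEL,
    device=get_device(),
    chunk_length_s=30,       # process long audio in 30-second chunks
    return_timestamps=True,  # enable timestamps for long-form audio
)
# "tmp/audio.mp3" is a placeholder path; generate_kwargs is forwarded to
# Whisper's generate(), so "language" pins the transcription language.
result = transcriber("tmp/audio.mp3", generate_kwargs={"language": "en"})
print(result["text"])
```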
@@ -3,8 +3,11 @@ import argparse
 from pathlib import Path
 from transformers import pipeline
 import yt_dlp
+import torch
 
 OLLAMA_MODEL = "llama3"
+WHISPER_MODEL = "openai/whisper-large-v2"
+WHISPER_LANGUAGE = "en"  # Set to desired language or None for auto-detection
 
 # Function to download a video from YouTube using yt-dlp
 def download_from_youtube(url: str, path: str):
@@ -20,26 +23,70 @@ def download_from_youtube(url: str, path: str):
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([url])
 
+# Function to get the best available device
+def get_device():
+    if torch.backends.mps.is_available():
+        return "mps"
+    elif torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
+
 # Function to transcribe an audio file using the transformers pipeline
-def transcribe_file(file_path: str, output_file: str) -> str:
-    # Load the pipeline model for automatic speech recognition with MPS
-    transcriber_gpu = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device="mps")
+def transcribe_file(file_path: str, output_file: str, language: str = None) -> str:
+    # Get the best available device
+    device = get_device()
+    print(f"Using device: {device} for transcription")
+
+    # Load the pipeline model for automatic speech recognition
+    transcriber = pipeline(
+        "automatic-speech-recognition",
+        model=WHISPER_MODEL,
+        device=device,
+        chunk_length_s=30,  # Process in 30-second chunks
+        return_timestamps=True  # Enable timestamp generation for longer audio
+    )
 
-    # Transcribe the audio file
-    transcribe = transcriber_gpu(file_path)
+    # For CPU, we might want to use a smaller model or chunk the audio if memory is an issue
+    if device == "cpu":
+        print("Warning: Using CPU for transcription. This may be slow.")
+
+    # Set up generation keyword arguments including language
+    generate_kwargs = {}
+    if language and language.lower() != "auto":
+        generate_kwargs["language"] = language
+        print(f"Transcribing in language: {language}")
+    else:
+        print("Using automatic language detection")
+
+    # Transcribe the audio file
+    print("Starting transcription (this may take a while for longer files)...")
+    transcribe = transcriber(file_path, generate_kwargs=generate_kwargs)
+
+    # Extract the full text from the chunked transcription
+    if isinstance(transcribe, dict) and "text" in transcribe:
+        # Simple case - just one chunk
+        full_text = transcribe["text"]
+    elif isinstance(transcribe, dict) and "chunks" in transcribe:
+        # Multiple chunks with timestamps
+        full_text = " ".join([chunk["text"] for chunk in transcribe["chunks"]])
+    else:
+        # Fallback for other return formats
+        full_text = transcribe["text"] if "text" in transcribe else str(transcribe)
 
     # Save the transcribed text to the specified temporary file
     with open(output_file, 'w') as tmp_file:
-        tmp_file.write(transcribe["text"])
+        tmp_file.write(full_text)
     print(f"Transcription saved to file: {output_file}")
 
     # Return the transcribed text
-    return transcribe["text"]
+    return full_text
 
 # Function to summarize a text using the Ollama model
 def summarize_text(text: str, output_path: str) -> str:
     # Define the system prompt for the Ollama model
-    system_prompt = f"I would like for you to assume the role of a Technical Expert"
+    system_prompt = "I would like for you to assume the role of a Technical Expert"
     # Define the user prompt for the Ollama model
     user_prompt = f"""Generate a concise summary of the text below.
     Text : {text}
@@ -73,9 +120,15 @@ def main():
     group.add_argument("--from-local", type=str, help="Path to the local audio file.")
     parser.add_argument("--output", type=str, default="./summary.md", help="Output markdown file path.")
     parser.add_argument("--transcript-only", action='store_true', help="Only transcribe the file, do not summarize.")
+    parser.add_argument("--language", type=str, help="Language code for transcription (e.g., 'en', 'fr', 'es', or 'auto' for detection)")
 
     args = parser.parse_args()
 
+    # Determine language setting
+    language = args.language if args.language else WHISPER_LANGUAGE
+    if language and language.lower() == "auto":
+        language = None  # None triggers automatic language detection
+
     # Set up data directory
     data_directory = Path("tmp")
     # Check if the directory exists, if not, create it
@@ -94,7 +147,7 @@ def main():
 
     print(f"Transcribing file: {file_path}")
     # Transcribe the audio file
-    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt")
+    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt", language)
 
     if args.transcript_only:
         print("Transcription complete. Skipping summary generation.")
@@ -111,4 +164,4 @@ def main():
     print(f"Summary written to {args.output}")
 
 if __name__ == "__main__":
-    main()
+    main()
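
For reference, a hypothetical driver exercising the updated `transcribe_file` signature (the module name and file paths are assumptions, not part of this diff; the function and its `language` parameter are):

```python
from pathlib import Path

# Module name is an assumption for illustration.
from transcribe import transcribe_file

data_directory = Path("tmp")
data_directory.mkdir(exist_ok=True)

# language=None triggers Whisper's automatic detection, mirroring the
# "auto" handling added in main(); pass e.g. "fr" to pin the language.
text = transcribe_file("tmp/audio.mp3", str(data_directory / "transcript.txt"), language=None)
print(text[:200])
```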