Add support for GPU (MPS and CUDA)
Migrate to `uv`
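
In short: the transcriber now picks the best available torch device at runtime instead of hard-coding MPS, and the transcription language is configurable. A minimal standalone sketch of the new flow (constant and device order are taken from the diff below; the audio path is a placeholder assumption):

```python
import torch
from transformers import pipeline

WHISPER_MODEL = "openai/whisper-large-v2"  # same constant as in this diff

def get_device() -> str:
    # Same preference order as the get_device() added below: MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

transcriber = pipeline(
    "automatic-speech-recognition",
    model=WHISPER_MODEL,
    device=get_device(),
    chunk_length_s=30,       # process long audio in 30-second chunks
    return_timestamps=True,  # enable timestamps for long-form audio
)
# "tmp/audio.mp3" is a placeholder path; generate_kwargs is forwarded to
# Whisper's generate(), so "language" pins the transcription language.
result = transcriber("tmp/audio.mp3", generate_kwargs={"language": "en"})
print(result["text"])
```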
@@ -3,8 +3,11 @@ import argparse
 from pathlib import Path
 from transformers import pipeline
 import yt_dlp
+import torch
 
 OLLAMA_MODEL = "llama3"
+WHISPER_MODEL = "openai/whisper-large-v2"
+WHISPER_LANGUAGE = "en"  # Set to desired language or None for auto-detection
 
 # Function to download a video from YouTube using yt-dlp
 def download_from_youtube(url: str, path: str):
@@ -20,26 +23,70 @@ def download_from_youtube(url: str, path: str):
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([url])
 
+# Function to get the best available device
+def get_device():
+    if torch.backends.mps.is_available():
+        return "mps"
+    elif torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
+
 # Function to transcribe an audio file using the transformers pipeline
-def transcribe_file(file_path: str, output_file: str) -> str:
-    # Load the pipeline model for automatic speech recognition with MPS
-    transcriber_gpu = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device="mps")
+def transcribe_file(file_path: str, output_file: str, language: str = None) -> str:
+    # Get the best available device
+    device = get_device()
+    print(f"Using device: {device} for transcription")
+
+    # Load the pipeline model for automatic speech recognition
+    transcriber = pipeline(
+        "automatic-speech-recognition",
+        model=WHISPER_MODEL,
+        device=device,
+        chunk_length_s=30,  # Process in 30-second chunks
+        return_timestamps=True  # Enable timestamp generation for longer audio
+    )
 
-    # Transcribe the audio file
-    transcribe = transcriber_gpu(file_path)
+    # For CPU, we might want to use a smaller model or chunk the audio if memory is an issue
+    if device == "cpu":
+        print("Warning: Using CPU for transcription. This may be slow.")
+
+    # Set up generation keyword arguments including language
+    generate_kwargs = {}
+    if language and language.lower() != "auto":
+        generate_kwargs["language"] = language
+        print(f"Transcribing in language: {language}")
+    else:
+        print("Using automatic language detection")
+
+    # Transcribe the audio file
+    print("Starting transcription (this may take a while for longer files)...")
+    transcribe = transcriber(file_path, generate_kwargs=generate_kwargs)
+
+    # Extract the full text from the chunked transcription
+    if isinstance(transcribe, dict) and "text" in transcribe:
+        # Simple case - just one chunk
+        full_text = transcribe["text"]
+    elif isinstance(transcribe, dict) and "chunks" in transcribe:
+        # Multiple chunks with timestamps
+        full_text = " ".join([chunk["text"] for chunk in transcribe["chunks"]])
+    else:
+        # Fallback for other return formats
+        full_text = transcribe["text"] if "text" in transcribe else str(transcribe)
 
     # Save the transcribed text to the specified temporary file
     with open(output_file, 'w') as tmp_file:
-        tmp_file.write(transcribe["text"])
+        tmp_file.write(full_text)
     print(f"Transcription saved to file: {output_file}")
 
     # Return the transcribed text
-    return transcribe["text"]
+    return full_text
 
 # Function to summarize a text using the Ollama model
 def summarize_text(text: str, output_path: str) -> str:
     # Define the system prompt for the Ollama model
-    system_prompt = f"I would like for you to assume the role of a Technical Expert"
+    system_prompt = "I would like for you to assume the role of a Technical Expert"
     # Define the user prompt for the Ollama model
     user_prompt = f"""Generate a concise summary of the text below.
     Text : {text}
@@ -73,9 +120,15 @@ def main():
     group.add_argument("--from-local", type=str, help="Path to the local audio file.")
     parser.add_argument("--output", type=str, default="./summary.md", help="Output markdown file path.")
     parser.add_argument("--transcript-only", action='store_true', help="Only transcribe the file, do not summarize.")
+    parser.add_argument("--language", type=str, help="Language code for transcription (e.g., 'en', 'fr', 'es', or 'auto' for detection)")
 
     args = parser.parse_args()
 
+    # Determine language setting
+    language = args.language if args.language else WHISPER_LANGUAGE
+    if language and language.lower() == "auto":
+        language = None  # None triggers automatic language detection
+
     # Set up data directory
     data_directory = Path("tmp")
     # Check if the directory exists, if not, create it
@@ -94,7 +147,7 @@ def main():
 
     print(f"Transcribing file: {file_path}")
     # Transcribe the audio file
-    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt")
+    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt", language)
 
     if args.transcript_only:
         print("Transcription complete. Skipping summary generation.")
@@ -111,4 +164,4 @@ def main():
     print(f"Summary written to {args.output}")
 
 if __name__ == "__main__":
-    main()
+    main()
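
For reference, a hypothetical driver exercising the updated `transcribe_file` signature (the module name and file paths are assumptions, not part of this diff; the function and its `language` parameter are):

```python
from pathlib import Path

# Module name is an assumption for illustration.
from transcribe import transcribe_file

data_directory = Path("tmp")
data_directory.mkdir(exist_ok=True)

# language=None triggers Whisper's automatic detection, mirroring the
# "auto" handling added in main(); pass e.g. "fr" to pin the language.
text = transcribe_file("tmp/audio.mp3", str(data_directory / "transcript.txt"), language=None)
print(text[:200])
```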