Files
audio-summary-with-local-LLM/src/summary.py
darnodo a8005cce50 Add support for GPU (MPS and CUDA)
Migrate to `uv`
2025-03-28 12:58:39 +01:00

167 lines
6.3 KiB
Python

import ollama
import argparse
from pathlib import Path
from transformers import pipeline
import yt_dlp
import torch
# Name of the local Ollama model used for summarization (must be pulled locally).
OLLAMA_MODEL = "llama3"
# Hugging Face model id used by the transformers ASR pipeline.
WHISPER_MODEL = "openai/whisper-large-v2"
WHISPER_LANGUAGE = "en" # Set to desired language or None for auto-detection
# Download the audio track of a YouTube video using yt-dlp.
def download_from_youtube(url: str, path: str):
    """Fetch *url* and extract its audio as a 192 kbps MP3 under *path*.

    The output is always named ``to_transcribe.<ext>`` so downstream steps
    can locate it without knowing the video title.
    """
    options = {
        'format': 'bestaudio/best',
        'outtmpl': str(Path(path) / 'to_transcribe.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])
# Pick the best available torch backend for inference.
def get_device():
    """Return "mps", "cuda", or "cpu", preferring accelerators in that order."""
    candidates = (
        ("mps", torch.backends.mps.is_available),
        ("cuda", torch.cuda.is_available),
    )
    for name, available in candidates:
        if available():
            return name
    return "cpu"
# Function to transcribe an audio file using the transformers pipeline
def _extract_transcript_text(result) -> str:
    """Flatten a transformers ASR pipeline result into plain text."""
    if isinstance(result, dict) and "text" in result:
        # Usual case: the pipeline returns a dict with the full joined text.
        return result["text"]
    if isinstance(result, dict) and "chunks" in result:
        # Timestamped output without a pre-joined "text" field.
        return " ".join(chunk["text"] for chunk in result["chunks"])
    # Defensive fallback for unexpected return formats.
    return result["text"] if "text" in result else str(result)


def transcribe_file(file_path: str, output_file: "str | Path", language: "str | None" = None) -> str:
    """Transcribe an audio file with Whisper and save the text to *output_file*.

    Args:
        file_path: Path to the audio file to transcribe.
        output_file: Where the transcript is written (str or Path — main()
            passes a Path, so the annotation reflects both).
        language: Language code (e.g. "en"); None or "auto" enables
            automatic language detection.

    Returns:
        The transcribed text.
    """
    # Get the best available device
    device = get_device()
    print(f"Using device: {device} for transcription")
    # Load the pipeline model for automatic speech recognition
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=WHISPER_MODEL,
        device=device,
        chunk_length_s=30,  # Process in 30-second chunks
        return_timestamps=True  # Required for audio longer than 30 seconds
    )
    if device == "cpu":
        print("Warning: Using CPU for transcription. This may be slow.")
    # Forward the language to Whisper only when one was explicitly requested.
    generate_kwargs = {}
    if language and language.lower() != "auto":
        generate_kwargs["language"] = language
        print(f"Transcribing in language: {language}")
    else:
        print("Using automatic language detection")
    print("Starting transcription (this may take a while for longer files)...")
    transcribe = transcriber(file_path, generate_kwargs=generate_kwargs)
    full_text = _extract_transcript_text(transcribe)
    # Fix: write with an explicit encoding — the platform default can fail on
    # the non-ASCII characters that transcripts routinely contain.
    with open(output_file, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(full_text)
    print(f"Transcription saved to file: {output_file}")
    return full_text
# Summarize a transcript with the local Ollama model.
def summarize_text(text: str, output_path: str) -> str:
    """Ask the Ollama model for a titled, bullet-point summary of *text*.

    NOTE(review): *output_path* is accepted but never used by this function;
    kept for interface compatibility with existing callers.
    """
    system_prompt = "I would like for you to assume the role of a Technical Expert"
    user_prompt = f"""Generate a concise summary of the text below.
Text : {text}
Add a title to the summary.
Make sure your summary has useful and true information about the main points of the topic.
Begin with a short introduction explaining the topic. If you can, use bullet points to list important details,
and finish your summary with a concluding sentence."""
    # Assemble the chat turns and send them to the local model.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reply = ollama.chat(model=OLLAMA_MODEL, messages=messages)
    # Return only the generated summary text.
    return reply["message"]["content"]
def main():
    """CLI entry point: download or locate audio, transcribe it, and optionally summarize."""
    parser = argparse.ArgumentParser(description="Download, transcribe, and summarize audio or video files.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--from-youtube", type=str, help="YouTube URL to download.")
    group.add_argument("--from-local", type=str, help="Path to the local audio file.")
    parser.add_argument("--output", type=str, default="./summary.md", help="Output markdown file path.")
    parser.add_argument("--transcript-only", action='store_true', help="Only transcribe the file, do not summarize.")
    parser.add_argument("--language", type=str, help="Language code for transcription (e.g., 'en', 'fr', 'es', or 'auto' for detection)")
    args = parser.parse_args()

    # Resolve the transcription language; "auto" (mapped to None) triggers
    # Whisper's automatic language detection.
    language = args.language if args.language else WHISPER_LANGUAGE
    if language and language.lower() == "auto":
        language = None

    # Working directory for downloaded audio and intermediate transcripts.
    data_directory = Path("tmp")
    if not data_directory.exists():
        data_directory.mkdir(parents=True)
        print(f"Created directory: {data_directory}")

    if args.from_youtube:
        print(f"Downloading YouTube video from {args.from_youtube}")
        download_from_youtube(args.from_youtube, str(data_directory))
        # yt-dlp always extracts to this fixed name (see download_from_youtube).
        file_path = data_directory / "to_transcribe.mp3"
    else:
        # The mutually exclusive group is required, so --from-local is set here.
        file_path = Path(args.from_local)

    print(f"Transcribing file: {file_path}")
    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt", language)

    if args.transcript_only:
        print("Transcription complete. Skipping summary generation.")
        return

    print("Generating summary...")
    summary = summarize_text(transcript, "./")
    # Fix: explicit encoding — summaries may contain non-ASCII text and the
    # platform default encoding is not guaranteed to handle it.
    with open(args.output, "w", encoding="utf-8") as md_file:
        md_file.write("# Summary\n\n")
        md_file.write(summary)
    print(f"Summary written to {args.output}")


if __name__ == "__main__":
    main()