Files
audio-summary-with-local-LLM/src/summary.py
darnodo a8005cce50 Add support for GPU (MPS and CUDA)
Migrate to `uv`
2025-03-28 12:58:39 +01:00

167 lines
6.3 KiB
Python

import ollama
import argparse
from pathlib import Path
from transformers import pipeline
import yt_dlp
import torch
# Name of the local Ollama model used for summarization (must be pulled locally).
OLLAMA_MODEL = "llama3"
# Hugging Face model id used by the transformers ASR pipeline.
WHISPER_MODEL = "openai/whisper-large-v2"
WHISPER_LANGUAGE = "en" # Set to desired language or None for auto-detection
# Download the audio track of a YouTube video using yt-dlp.
def download_from_youtube(url: str, path: str):
    """Fetch *url* and extract its audio as a 192 kbps MP3 under *path*.

    The output is always named ``to_transcribe.<ext>`` so downstream steps
    can locate it without knowing the video title.
    """
    options = {
        'format': 'bestaudio/best',
        'outtmpl': str(Path(path) / 'to_transcribe.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])
# Pick the best available torch backend for inference.
def get_device():
    """Return "mps", "cuda", or "cpu", preferring accelerators in that order."""
    candidates = (
        ("mps", torch.backends.mps.is_available),
        ("cuda", torch.cuda.is_available),
    )
    for name, available in candidates:
        if available():
            return name
    return "cpu"
# Function to transcribe an audio file using the transformers pipeline
def _extract_transcript_text(result) -> str:
    """Flatten a transformers ASR pipeline result into plain text."""
    if isinstance(result, dict) and "text" in result:
        # Usual case: the pipeline returns a dict with the full joined text.
        return result["text"]
    if isinstance(result, dict) and "chunks" in result:
        # Timestamped output without a pre-joined "text" field.
        return " ".join(chunk["text"] for chunk in result["chunks"])
    # Defensive fallback for unexpected return formats.
    return result["text"] if "text" in result else str(result)


def transcribe_file(file_path: str, output_file: "str | Path", language: "str | None" = None) -> str:
    """Transcribe an audio file with Whisper and save the text to *output_file*.

    Args:
        file_path: Path to the audio file to transcribe.
        output_file: Where the transcript is written (str or Path — main()
            passes a Path, so the annotation reflects both).
        language: Language code (e.g. "en"); None or "auto" enables
            automatic language detection.

    Returns:
        The transcribed text.
    """
    # Get the best available device
    device = get_device()
    print(f"Using device: {device} for transcription")
    # Load the pipeline model for automatic speech recognition
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=WHISPER_MODEL,
        device=device,
        chunk_length_s=30,  # Process in 30-second chunks
        return_timestamps=True  # Required for audio longer than 30 seconds
    )
    if device == "cpu":
        print("Warning: Using CPU for transcription. This may be slow.")
    # Forward the language to Whisper only when one was explicitly requested.
    generate_kwargs = {}
    if language and language.lower() != "auto":
        generate_kwargs["language"] = language
        print(f"Transcribing in language: {language}")
    else:
        print("Using automatic language detection")
    print("Starting transcription (this may take a while for longer files)...")
    transcribe = transcriber(file_path, generate_kwargs=generate_kwargs)
    full_text = _extract_transcript_text(transcribe)
    # Fix: write with an explicit encoding — the platform default can fail on
    # the non-ASCII characters that transcripts routinely contain.
    with open(output_file, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(full_text)
    print(f"Transcription saved to file: {output_file}")
    return full_text
# Summarize a transcript with the local Ollama model.
def summarize_text(text: str, output_path: str) -> str:
    """Ask the Ollama model for a titled, bullet-point summary of *text*.

    NOTE(review): *output_path* is accepted but never used by this function;
    kept for interface compatibility with existing callers.
    """
    system_prompt = "I would like for you to assume the role of a Technical Expert"
    user_prompt = f"""Generate a concise summary of the text below.
Text : {text}
Add a title to the summary.
Make sure your summary has useful and true information about the main points of the topic.
Begin with a short introduction explaining the topic. If you can, use bullet points to list important details,
and finish your summary with a concluding sentence."""
    # Assemble the chat turns and send them to the local model.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reply = ollama.chat(model=OLLAMA_MODEL, messages=messages)
    # Return only the generated summary text.
    return reply["message"]["content"]
def main():
    """CLI entry point: download or locate audio, transcribe it, and optionally summarize."""
    parser = argparse.ArgumentParser(description="Download, transcribe, and summarize audio or video files.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--from-youtube", type=str, help="YouTube URL to download.")
    group.add_argument("--from-local", type=str, help="Path to the local audio file.")
    parser.add_argument("--output", type=str, default="./summary.md", help="Output markdown file path.")
    parser.add_argument("--transcript-only", action='store_true', help="Only transcribe the file, do not summarize.")
    parser.add_argument("--language", type=str, help="Language code for transcription (e.g., 'en', 'fr', 'es', or 'auto' for detection)")
    args = parser.parse_args()

    # Resolve the transcription language; "auto" (mapped to None) triggers
    # Whisper's automatic language detection.
    language = args.language if args.language else WHISPER_LANGUAGE
    if language and language.lower() == "auto":
        language = None

    # Working directory for downloaded audio and intermediate transcripts.
    data_directory = Path("tmp")
    if not data_directory.exists():
        data_directory.mkdir(parents=True)
        print(f"Created directory: {data_directory}")

    if args.from_youtube:
        print(f"Downloading YouTube video from {args.from_youtube}")
        download_from_youtube(args.from_youtube, str(data_directory))
        # yt-dlp always extracts to this fixed name (see download_from_youtube).
        file_path = data_directory / "to_transcribe.mp3"
    else:
        # The mutually exclusive group is required, so --from-local is set here.
        file_path = Path(args.from_local)

    print(f"Transcribing file: {file_path}")
    transcript = transcribe_file(str(file_path), data_directory / "transcript.txt", language)

    if args.transcript_only:
        print("Transcription complete. Skipping summary generation.")
        return

    print("Generating summary...")
    summary = summarize_text(transcript, "./")
    # Fix: explicit encoding — summaries may contain non-ASCII text and the
    # platform default encoding is not guaranteed to handle it.
    with open(args.output, "w", encoding="utf-8") as md_file:
        md_file.write("# Summary\n\n")
        md_file.write(summary)
    print(f"Summary written to {args.output}")


if __name__ == "__main__":
    main()