From 51d6ea8e18ed196ab2ea12339026cb85ec55559f Mon Sep 17 00:00:00 2001 From: Damien A Date: Fri, 12 Jul 2024 12:15:17 +0200 Subject: [PATCH] Add `transcript-only` feature --- README.md | 29 +++++++++++++++++++++-------- src/summary.py | 15 ++++++++++++--- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index d432b73..2acdf67 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ This tool is designed to provide a quick and concise summary of audio and video - **Local File Support**: Summarize audio files available on your local disk. - **Transcription**: Converts audio content to text using Whisper. - **Summarization**: Generates a concise summary using Mistral AI (Ollama). +- **Transcript Only Option**: Option to only transcribe the audio content without generating a summary. ## Prerequisites @@ -19,7 +20,7 @@ Before you start using this tool, you need to install the following dependencies - Python 3.8 or higher - `pytube` for downloading videos from YouTube. -- `pathlib`for local file +- `pathlib` for local file handling - `openai-whisper` for audio transcription. - [Ollama](https://ollama.com) for LLM model management. - `ffmpeg` (required for whisper) @@ -36,17 +37,16 @@ cd audio-summary-with-local-LLM pip install -r src/requirements.txt ``` -### LLM Requierement +### LLM Requirement -[Download and install](https://ollama.com) Ollama to carry out LLM Management -More details about LLM model supported can be discribe on the Ollama [github](https://github.com/ollama/ollama). +[Download and install](https://ollama.com) Ollama to carry out LLM Management. More details about LLM models supported can be found on the Ollama [GitHub](https://github.com/ollama/ollama). 
-Download and use Mistral model : +Download and use the Mistral model: ```bash ollama pull mistral -## Test the access : +## Test the access: ollama run mistral "tell me a joke" ``` @@ -56,6 +56,7 @@ The tool can be executed with the following command line options: - `--from-youtube`: To download and summarize a video from YouTube. - `--from-local`: To load and summarize an audio or video file from the local disk. +- `--transcript-only`: To only transcribe the audio content without generating a summary. This option must be used with either `--from-youtube` or `--from-local`. ### Examples @@ -71,11 +72,23 @@ The tool can be executed with the following command line options: python src/summary.py --from-local ``` -The output summary will be saved in a markdown file in the specified output directory. +3. **Transcribing a YouTube video without summarizing:** + + ```bash + python src/summary.py --from-youtube --transcript-only + ``` + +4. **Transcribing a local audio file without summarizing:** + + ```bash + python src/summary.py --from-local --transcript-only + ``` + +The output summary will be saved in a markdown file in the specified output directory, while the transcript will be saved in the temporary directory (`tmp/transcript.txt`). ## Output -The summarized content is saved as a markdown file named `summary.md` in the current working directory. This file includes the transcribed text and its corresponding summary. +The summarized content is saved as a markdown file named `summary.md` in the current working directory. This file includes the transcribed text and its corresponding summary. If `--transcript-only` is used, no summary is generated and only the transcription is saved, in the temporary directory (`tmp/transcript.txt`). 
## Sources diff --git a/src/summary.py b/src/summary.py index fb5c252..c2a90b6 100644 --- a/src/summary.py +++ b/src/summary.py @@ -3,10 +3,10 @@ import ollama import argparse from pytube import YouTube from pathlib import Path - +import os WHISPER_MODEL = "base" -OLLAMA_MODEL = "mistral" +OLLAMA_MODEL = "llama3" # Function to download a video from YouTube def download_from_youtube(url: str, path: str): @@ -65,11 +65,16 @@ def main(): group.add_argument("--from-youtube", type=str, help="YouTube URL to download.") group.add_argument("--from-local", type=str, help="Path to the local audio file.") parser.add_argument("--output", type=str, default="./summary.md", help="Output markdown file path.") + parser.add_argument("--transcript-only", action='store_true', help="Only transcribe the file, do not summarize.") args = parser.parse_args() # Set up data directory data_directory = Path("tmp") + # Check if the directory exists, if not, create it + if not data_directory.exists(): + data_directory.mkdir(parents=True) + print(f"Created directory: {data_directory}") if args.from_youtube: # Download from YouTube @@ -78,12 +83,16 @@ def main(): file_path = data_directory / "to_transcribe.mp4" elif args.from_local: # Use local file - file_path = args.from_local + file_path = Path(args.from_local) print(f"Transcribing file: {file_path}") # Transcribe the audio file transcript = transcribe_file(str(file_path), data_directory / "transcript.txt") + if args.transcript_only: + print("Transcription complete. Skipping summary generation.") + return + print("Generating summary...") # Generate summary summary = summarize_text(transcript, "./")