"""Auto Subtitle Generator.

Extracts the audio track from a video, transcribes it with faster-whisper,
and writes timed SRT subtitles. A small Tkinter GUI drives the workflow.
"""

import os
import math  # kept from original; not used directly in this file
import tempfile
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, ttk

import moviepy as mp
from faster_whisper import WhisperModel
from pydub import AudioSegment  # kept from original; currently unused
from pydub.silence import split_on_silence  # kept from original; currently unused


def format_time(seconds):
    """Convert a float second count to an SRT timestamp: HH:MM:SS,mmm."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds - int(seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def wrap_text(text, max_len=40):
    """
    Wraps text to ~max_len characters per line without cutting words.
    """
    words = text.split()
    lines = []
    current_line = ""
    for word in words:
        if len(current_line + " " + word) <= max_len:
            current_line += (" " if current_line else "") + word
        else:
            lines.append(current_line)
            current_line = word
    if current_line:
        lines.append(current_line)
    return "\n".join(lines)


def write_srt(subtitles, output_path):
    """Write `subtitles` (dicts with 'start', 'end', 'text') as an SRT file.

    Entries are numbered from 1; text is word-wrapped via wrap_text().
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_time(sub['start'])} --> {format_time(sub['end'])}\n")
            f.write(f"{wrap_text(sub['text'])}\n\n")


def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None,
                            model_size="base", language="auto"):
    """Transcribe the audio of `video_path` and write subtitles to `srt_output_path`.

    Args:
        video_path: Path to the input video file.
        srt_output_path: Destination path for the generated .srt file.
        progress_callback: Optional callable(current, total) invoked once per
            speech segment (may be called from whatever thread runs this).
        model_size: faster-whisper model name (tiny/base/small/medium/large).
        language: ISO language code, or "auto" for automatic detection.

    Returns:
        True on success, False on any error (details are printed to stdout).
    """
    video = None
    audio = None
    temp_audio_path = None
    try:
        print("📽️ Loading video file...")
        video = mp.VideoFileClip(video_path)
        audio = video.audio

        # Create a named temp file, then close it immediately so moviepy can
        # write to the path on all platforms (Windows locks open files).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
            temp_audio_path = temp_audio_file.name
        print("🔊 Extracting audio...")
        audio.write_audiofile(temp_audio_path, logger=None)

        print(f"🤖 Loading Whisper model ({model_size})...")
        # Initialize Whisper model - much more accurate than Google Speech Recognition
        model = WhisperModel(model_size, device="cpu", compute_type="int8")

        print("🎯 Transcribing with Whisper AI...")
        # Transcribe the entire audio file at once - Whisper handles timing automatically
        segments, info = model.transcribe(
            temp_audio_path,
            language=None if language == "auto" else language,
            word_timestamps=True,
            vad_filter=True,  # Voice Activity Detection for better accuracy
            vad_parameters=dict(min_silence_duration_ms=500),
        )
        print(f"🌍 Detected language: {info.language} (confidence: {info.language_probability:.2f})")

        subtitles = []
        segment_list = list(segments)  # Convert generator to list for progress tracking
        print(f"📝 Processing {len(segment_list)} speech segments...")

        for i, segment in enumerate(segment_list):
            # Whisper provides precise timing and text
            text = segment.text.strip()
            if text:
                subtitles.append({
                    "start": segment.start,
                    "end": segment.end,
                    "text": text,
                })
                print(f"✅ Segment {i+1}: '{text[:50]}...' ({segment.start:.1f}s - {segment.end:.1f}s)")
            # Update progress bar
            if progress_callback:
                progress_callback(i + 1, len(segment_list))

        print(f"🎯 Generated {len(subtitles)} subtitle segments with Whisper AI")
        write_srt(subtitles, srt_output_path)
        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        return False
    finally:
        # Always release resources, even when transcription fails midway —
        # the original only cleaned up on the success path, leaking the temp
        # WAV and the open clips on error.
        if audio is not None:
            audio.close()
        if video is not None:
            video.close()
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)


# -------------------- GUI --------------------

def main():
    """Build the Tkinter UI and run the event loop."""
    root = tk.Tk()
    root.title("🎬 Auto Subtitle Generator - Speech to SRT")
    root.geometry("500x350")

    frame = tk.Frame(root, padx=20, pady=20)
    frame.pack(fill="both", expand=True)

    # Title
    title_label = tk.Label(frame, text="🎬 Auto Subtitle Generator",
                           font=("Arial", 16, "bold"))
    title_label.pack(pady=(0, 10))

    subtitle_label = tk.Label(
        frame,
        text="Extract speech from video and create perfectly timed SRT subtitles",
        font=("Arial", 10),
    )
    subtitle_label.pack(pady=(0, 20))

    # Settings Frame
    settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10)
    settings_frame.pack(fill="x", pady=(0, 15))

    # Model Size Selection
    model_frame = tk.Frame(settings_frame)
    model_frame.pack(fill="x", pady=(0, 10))
    tk.Label(model_frame, text="🧠 Model Size:", font=("Arial", 9)).pack(side="left")
    model_size_var = tk.StringVar(value="base")
    model_dropdown = ttk.Combobox(
        model_frame, textvariable=model_size_var,
        values=["tiny", "base", "small", "medium", "large"],
        state="readonly", width=12,
    )
    model_dropdown.pack(side="right")

    # Language Selection
    language_frame = tk.Frame(settings_frame)
    language_frame.pack(fill="x", pady=(0, 10))
    tk.Label(language_frame, text="🌍 Language:", font=("Arial", 9)).pack(side="left")
    language_var = tk.StringVar(value="auto")
    language_dropdown = ttk.Combobox(
        language_frame, textvariable=language_var,
        values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"],
        state="readonly", width=12,
    )
    language_dropdown.pack(side="right")

    # Help text
    help_label = tk.Label(
        settings_frame,
        text=" 💡 Base model recommended for best speed/accuracy balance\n 🔍 Auto language detection works for most videos",
        font=("Arial", 8),
        fg="gray",
    )
    help_label.pack(anchor="w")

    def select_file_and_generate():
        """Prompt for input/output paths, then transcribe on a worker thread."""
        video_path = filedialog.askopenfilename(
            title="Select a video file",
            filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")],
        )
        if not video_path:
            return

        srt_output_path = filedialog.asksaveasfilename(
            title="Save SRT subtitles as...",
            defaultextension=".srt",
            filetypes=[("Subtitle files", "*.srt")],
        )
        if not srt_output_path:
            return

        # Disable button during processing
        select_button.config(state="disabled", text="Processing...")
        progress_bar["value"] = 0
        progress_label.config(text="Starting speech recognition...")
        status_label.config(text="🔄 Processing video...", fg="blue")
        root.update()

        # Read Tk variables on the main thread; Tkinter objects must not be
        # accessed from the worker thread.
        model_size = model_size_var.get()
        language = language_var.get()

        def update_progress(current, total):
            # Called from the worker thread: marshal widget updates onto the
            # Tk main loop via after(), since Tkinter is not thread-safe.
            def apply_update():
                progress_bar["value"] = (current / total) * 100
                progress_label.config(text=f"Processing: {current}/{total} segments")
            root.after(0, apply_update)

        def finish(success, error=None):
            # Runs on the main thread (scheduled via root.after).
            if success:
                status_label.config(text="✅ Subtitles generated successfully!", fg="green")
                messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!")
            elif error is None:
                status_label.config(text="❌ Error occurred during processing", fg="red")
                messagebox.showerror("Error", "Something went wrong. Check console for details.")
            else:
                status_label.config(text="❌ Unexpected error occurred", fg="red")
                messagebox.showerror("Error", f"Unexpected error: {error}")
            # Re-enable button
            select_button.config(state="normal", text="📂 Select Video and Generate Subtitles")
            progress_label.config(text="Done")

        def process_video():
            # Worker thread body: only transcribe here; all GUI work is
            # scheduled back onto the main loop.
            try:
                success = transcribe_video_to_srt(
                    video_path,
                    srt_output_path,
                    progress_callback=update_progress,
                    model_size=model_size,
                    language=language,
                )
                root.after(0, finish, success)
            except Exception as e:
                root.after(0, finish, False, e)

        # Run in separate thread to prevent GUI freezing
        thread = threading.Thread(target=process_video, daemon=True)
        thread.start()

    # Main Action Button
    select_button = tk.Button(
        frame,
        text="📂 Select Video and Generate Subtitles",
        command=select_file_and_generate,
        font=("Arial", 11, "bold"),
        bg="#4CAF50",
        fg="white",
        pady=8,
    )
    select_button.pack(pady=15, fill="x")

    # Progress Section
    progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10)
    progress_frame.pack(fill="x", pady=(0, 10))

    progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate")
    progress_bar.pack(fill="x", pady=(0, 5))

    progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9))
    progress_label.pack()

    # Status Label
    status_label = tk.Label(
        frame,
        text="💡 Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!",
        font=("Arial", 9),
        fg="blue",
    )
    status_label.pack(pady=(10, 0))

    root.mainloop()


if __name__ == "__main__":
    main()