Refactor subtitle generation to integrate Whisper AI; remove unused files and enhance the GUI for a better user experience

klop51 2025-08-09 10:35:13 +02:00
parent 5ce79f084d
commit 491040b148
10 changed files with 223 additions and 710 deletions
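
The core of the change swaps the old chunked Google Speech Recognition loop for a single faster-whisper pass with word timestamps and voice-activity filtering (see the subtitle generator diff below). A minimal sketch of the new transcription flow, with an illustrative audio path:

from faster_whisper import WhisperModel

# Load a local Whisper model; "base" is the new GUI's default size.
model = WhisperModel("base", device="cpu", compute_type="int8")

# One pass over the whole file -- Whisper computes segment timing itself.
segments, info = model.transcribe(
    "audio.wav",                  # illustrative path
    language=None,                # None = auto-detect
    word_timestamps=True,
    vad_filter=True,              # voice activity detection trims silence
    vad_parameters=dict(min_silence_duration_ms=500),
)
print(f"Detected language: {info.language} ({info.language_probability:.2f})")
subtitles = [
    {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
    for seg in segments
    if seg.text.strip()
]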

175
app.py

@@ -1,175 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
# Global settings with defaults
settings = {
"subtitle_y_px": 1550,
"highlight_offset": -8,
"font_size_subtitle": 65,
"font_size_highlight": 68,
"highlight_x_offset": 0,
"video_path": None,
"selected_font": "Arial" # Default font
}
# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
"Arial",
"Times-Roman",
"Helvetica",
"Courier",
"Comic-Sans-MS",
"Impact",
"Verdana",
"Tahoma",
"Georgia",
"Trebuchet-MS"
]
preset_file = "subtitle_gui_presets.json"
def save_presets():
with open(preset_file, "w") as f:
json.dump(settings, f)
print("💾 Presets saved!")
def load_presets():
global settings
try:
with open(preset_file, "r") as f:
loaded = json.load(f)
settings.update(loaded)
print("✅ Presets loaded!")
sync_gui()
except FileNotFoundError:
print("⚠️ No presets found.")
def sync_gui():
sub_y_slider.set(settings["subtitle_y_px"])
highlight_slider.set(settings["highlight_offset"])
highlight_x_slider.set(settings["highlight_x_offset"])
sub_font_slider.set(settings["font_size_subtitle"])
highlight_font_slider.set(settings["font_size_highlight"])
font_var.set(settings["selected_font"])
def render_preview():
if not settings["video_path"]:
print("⚠️ No video selected.")
return
clip = VideoFileClip(settings["video_path"]).subclipped(0, 3) # Use first 3 seconds
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
subtitle_text = "THIS IS A TEST SUBTITLE"
highlight_word = "SUBTITLE"
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
font=settings["selected_font"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(3).with_position(('center', settings["subtitle_y_px"]))
# Compute highlight word position
full_text = subtitle_text.upper()
words = full_text.split()
highlight_index = words.index(highlight_word.upper())
chars_before = sum(len(w) + 1 for w in words[:highlight_index])
char_width = 35
total_width = len(full_text) * char_width
x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
font=settings["selected_font"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(1.5).with_start(0.75).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
# Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar)
preview_scale = 900 / 1920 # Scale factor to fit height
preview_width = int(1080 * preview_scale)
preview_height = int(1920 * preview_scale)
preview_clip = final.resized((preview_width, preview_height))
preview_clip.preview(fps=24, audio=False)
clip.close()
final.close()
preview_clip.close()
def update_setting(var_name, value):
settings[var_name] = int(value)
def update_font(font_name):
settings["selected_font"] = font_name
def open_video():
file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
if file_path:
settings["video_path"] = file_path
print(f"📂 Loaded video: {file_path}")
def start_preview_thread():
threading.Thread(target=render_preview).start()
# GUI Setup
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("400x600")
load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)
tk.Label(root, text="Font Family").pack()
font_var = tk.StringVar(value=settings["selected_font"])
font_dropdown = tk.OptionMenu(root, font_var, *COMPATIBLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)
tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()
tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()
tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()
tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()
tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()
preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)
save_btn = tk.Button(root, text="💾 Save Preset", command=save_presets)
save_btn.pack(pady=5)
load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)
root.mainloop()
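
Aside: the highlight placement above relies on a fixed-width approximation (35 px per character, centered at x = 540 on the 1080 px canvas). A worked check of that arithmetic for the built-in test line, under the default settings:

# Reproduces the x_offset math in render_preview() above.
text = "THIS IS A TEST SUBTITLE"
words = text.split()                                      # "SUBTITLE" is word index 4
chars_before = sum(len(w) + 1 for w in words[:4])         # 15 chars, spaces included
char_width = 35
total_width = len(text) * char_width                      # 23 * 35 = 805
x_offset = chars_before * char_width - total_width // 2   # 525 - 402 = 123
# Final highlight x position: 540 + 123 = 663 px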

322
app2.py

@@ -1,322 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
import re
import os
import platform
def get_system_fonts():
"""Get list of available system fonts"""
fonts = []
if platform.system() == "Windows":
# Common Windows font paths
font_paths = [
"C:/Windows/Fonts/",
"C:/Windows/System32/Fonts/"
]
common_fonts = []
for font_path in font_paths:
if os.path.exists(font_path):
for file in os.listdir(font_path):
if file.endswith(('.ttf', '.otf')):
# Extract font name without extension
font_name = os.path.splitext(file)[0]
# Clean up common variations
if 'arial' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('arial.ttf')
elif 'times' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('times.ttf')
elif 'courier' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('cour.ttf')
elif 'comic' in font_name.lower():
common_fonts.append('comic.ttf')
elif 'impact' in font_name.lower():
common_fonts.append('impact.ttf')
elif 'verdana' in font_name.lower():
common_fonts.append('verdana.ttf')
elif 'tahoma' in font_name.lower():
common_fonts.append('tahoma.ttf')
# Add found fonts, fallback to common Windows fonts
fonts = list(set(common_fonts)) if common_fonts else [
'arial.ttf', 'times.ttf', 'cour.ttf', 'comic.ttf',
'impact.ttf', 'verdana.ttf', 'tahoma.ttf'
]
# Add option to use no font (system default)
fonts.insert(0, 'System Default')
return fonts
AVAILABLE_FONTS = get_system_fonts()
# Global settings with defaults
settings = {
"subtitle_y_px": 1550,
"highlight_offset": -8,
"font_size_subtitle": 65,
"font_size_highlight": 68,
"highlight_x_offset": 0,
"video_path": None,
"font": "System Default",
"subtitles": [],
"current_index": 0
}
# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
"Arial",
"Times-Roman",
"Helvetica",
"Courier",
"Comic-Sans-MS",
"Impact",
"Verdana",
"Tahoma",
"Georgia",
"Trebuchet-MS"
]
preset_file = "subtitle_gui_presets.json"
# === SRT PARSER ===
def parse_srt(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
contents = f.read()
pattern = r"(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\d+|\Z)"
matches = re.findall(pattern, contents)
subtitles = []
for _, start, end, text in matches:
subtitles.append({
"start": srt_time_to_seconds(start),
"end": srt_time_to_seconds(end),
"text": text.replace('\n', ' ')
})
return subtitles
def srt_time_to_seconds(time_str):
h, m, s_ms = time_str.split(':')
s, ms = s_ms.split(',')
return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000
# === PRESETS ===
def save_presets():
with open(preset_file, "w") as f:
json.dump(settings, f)
print("📂 Presets saved!")
def load_presets():
global settings
try:
with open(preset_file, "r") as f:
loaded = json.load(f)
settings.update(loaded)
print("✅ Presets loaded!")
sync_gui()
except FileNotFoundError:
print("⚠️ No presets found.")
# === SYNC ===
def sync_gui():
sub_y_slider.set(settings["subtitle_y_px"])
highlight_slider.set(settings["highlight_offset"])
highlight_x_slider.set(settings["highlight_x_offset"])
sub_font_slider.set(settings["font_size_subtitle"])
highlight_font_slider.set(settings["font_size_highlight"])
font_dropdown_var.set(settings["font"])
def render_preview():
if not settings["video_path"] or not settings["subtitles"]:
print("⚠️ Video or subtitles not loaded.")
return
sub = settings["subtitles"][settings["current_index"]]
subtitle_text = sub["text"]
start_time = sub["start"]
end_time = sub["end"]
duration = end_time - start_time
clip = VideoFileClip(settings["video_path"]).subclipped(start_time, end_time)
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
highlight_word = subtitle_text.split()[-1] # Highlight last word for now
# Create TextClip with font if specified, otherwise use system default
if settings["font"] == "System Default":
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
else:
try:
base_subtitle = TextClip(
text=subtitle_text,
font=settings["font"],
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
except:
# Fallback to system default if font fails
print(f"⚠️ Font {settings['font']} failed, using system default")
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
full_text = subtitle_text.upper()
words = full_text.split()
try:
highlight_index = words.index(highlight_word.upper())
except ValueError:
highlight_index = len(words) - 1
chars_before = sum(len(w) + 1 for w in words[:highlight_index])
char_width = 35
total_width = len(full_text) * char_width
x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
# Create highlighted word with same font logic
if settings["font"] == "System Default":
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
else:
try:
highlighted_word = TextClip(
text=highlight_word,
font=settings["font"],
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
except:
# Fallback to system default if font fails
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
# Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar)
preview_scale = 900 / 1920 # Scale factor to fit height
preview_width = int(1080 * preview_scale)
preview_height = int(1920 * preview_scale)
preview_clip = final.resized((preview_width, preview_height))
preview_clip.preview(fps=24, audio=False)
clip.close()
final.close()
preview_clip.close()
def update_setting(var_name, value):
settings[var_name] = int(value) if var_name.startswith("font_size") or "offset" in var_name or "y_px" in var_name else value
def update_font(value):
settings["font"] = value
def open_video():
file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
if file_path:
settings["video_path"] = file_path
print(f"📂 Loaded video: {file_path}")
def load_srt():
file_path = filedialog.askopenfilename(filetypes=[("SRT Subtitle", "*.srt")])
if file_path:
settings["subtitles"] = parse_srt(file_path)
settings["current_index"] = 0
print(f"📝 Loaded {len(settings['subtitles'])} subtitles from {file_path}")
def next_sub():
if settings["current_index"] < len(settings["subtitles"]) - 1:
settings["current_index"] += 1
start_preview_thread()
def prev_sub():
if settings["current_index"] > 0:
settings["current_index"] -= 1
start_preview_thread()
def start_preview_thread():
threading.Thread(target=render_preview).start()
# === GUI ===
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("420x700")
load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)
load_srt_btn = tk.Button(root, text="📑 Load SRT Subtitles", command=load_srt)
load_srt_btn.pack(pady=5)
tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()
tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()
tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()
tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()
tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()
tk.Label(root, text="Font").pack()
font_dropdown_var = tk.StringVar(value=settings["font"])
font_dropdown = tk.OptionMenu(root, font_dropdown_var, *AVAILABLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)
preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)
nav_frame = tk.Frame(root)
tk.Button(nav_frame, text="⏮️ Prev", command=prev_sub).pack(side="left", padx=5)
tk.Button(nav_frame, text="⏭️ Next", command=next_sub).pack(side="right", padx=5)
nav_frame.pack(pady=5)
save_btn = tk.Button(root, text="📂 Save Preset", command=save_presets)
save_btn.pack(pady=5)
load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)
root.mainloop()
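
The SRT timing conversion above is easy to verify by hand; for example:

# srt_time_to_seconds("00:02:10,500") from app2.py, step by step
h, m, s_ms = "00:02:10,500".split(":")
s, ms = s_ms.split(",")
seconds = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000   # 130.5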

22
myvideo.srt Normal file

@@ -0,0 +1,22 @@
1
00:00:30,000 --> 00:00:40,000
okay after we will

2
00:02:00,000 --> 00:02:10,000
find it difficult to believe we prepare
to fight arm and arm the corny and royal
family

3
00:02:20,000 --> 00:02:30,000
hello me

4
00:02:30,000 --> 00:02:40,000
as expected

5
00:02:40,000 --> 00:02:50,000
gacha


@@ -112,7 +112,8 @@ def create_short_clip(video_path, start, end, subtitles, output_path):
word_width, _ = highlighted_word.size
word_x = current_x + (word_width / 2)
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125, subtitle_y_px))
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125
, subtitle_y_px))
clips.append(highlighted_word)
current_x += word_width + 20 # Add spacing between words

4
sub2.srt Normal file

@@ -0,0 +1,4 @@
1
00:00:00,000 --> 00:00:08,250
Yeah, yeah, level she's 24.


@@ -1,157 +0,0 @@
import os
import numpy as np
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
from faster_whisper import WhisperModel
def detect_loud_moments(video_path, chunk_duration=5, threshold_db=10):
print("🔍 Analyzing audio...")
clip = VideoFileClip(video_path)
audio = clip.audio.to_soundarray(fps=44100)
volume = np.linalg.norm(audio, axis=1)
chunk_size = int(chunk_duration * 44100)
loud_chunks = []
max_db = -float('inf')
for i in range(0, len(volume), chunk_size):
chunk = volume[i:i+chunk_size]
db = 20 * np.log10(np.mean(chunk) + 1e-10)
max_db = max(max_db, db)
if db > threshold_db:
start = i / 44100
loud_chunks.append((start, min(start + chunk_duration, clip.duration)))
print(f"🔊 Max volume found: {max_db:.2f} dB, threshold: {threshold_db} dB")
print(f"📈 Found {len(loud_chunks)} loud moments")
clip.close()
return loud_chunks
def transcribe_and_extract_subtitles(video_path, start, end):
print(f"🗣️ Transcribing audio from {start:.2f}s to {end:.2f}s...")
model = WhisperModel("base", device="cpu", compute_type="int8")
segments, _ = model.transcribe(video_path, beam_size=5, language="en", vad_filter=True)
subtitles = []
for segment in segments:
if start <= segment.start <= end:
subtitles.append((segment.start - start, segment.end - start, segment.text))
return subtitles
def create_short_clip(video_path, start, end, subtitles, output_path):
print(f"🎬 Creating short: {output_path}")
clip = VideoFileClip(video_path).subclipped(start, end)
video_duration = clip.duration
print(f"📏 Video clip duration: {video_duration:.2f}s")
# Convert to vertical 9:16
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
clips = [vertical_clip]
for (s, e, text) in subtitles:
try:
# Ensure subtitle timing doesn't exceed video duration
subtitle_start = max(0, s)
subtitle_end = min(e, video_duration)
if subtitle_start >= video_duration or subtitle_end <= subtitle_start:
print(f"⚠️ Skipping subtitle outside video duration: {text[:30]}...")
continue
# Opus Clip style professional subtitles
words = text.strip().split()
if not words:
continue
# Break text into smaller chunks for better readability (max 3-4 words per line)
chunks = []
current_chunk = []
for word in words:
current_chunk.append(word)
if len(current_chunk) >= 3 or len(' '.join(current_chunk)) > 25:
chunks.append(' '.join(current_chunk))
current_chunk = []
if current_chunk:
chunks.append(' '.join(current_chunk))
# Position subtitles in the center-bottom area (Opus style)
subtitle_position = 0.75
# Create subtitle for each chunk with smooth transitions
chunk_duration = (subtitle_end - subtitle_start) / len(chunks)
for chunk_idx, chunk_text in enumerate(chunks):
chunk_start = subtitle_start + (chunk_idx * chunk_duration)
chunk_end = min(chunk_start + chunk_duration, subtitle_end)
chunk_words = chunk_text.split()
# Base subtitle with Opus-style design (bold white text with strong outline)
base_subtitle = TextClip(
text=chunk_text.upper(),
font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold - excellent for subtitles
font_size=65, # Larger, chunkier text
color='white',
stroke_color='black',
stroke_width=5 # Thicker outline for better readability
)
base_subtitle = base_subtitle.with_start(chunk_start).with_end(chunk_end).with_position(('center', subtitle_position), relative=True)
clips.append(base_subtitle)
# Opus-style word-by-word highlighting (yellow/orange like Opus)
word_duration = chunk_duration / len(chunk_words)
for i, word in enumerate(chunk_words):
word_start = chunk_start + (i * word_duration)
word_end = min(word_start + word_duration * 0.8, chunk_end)
# Opus-style highlighted word (vibrant yellow/orange)
highlighted_word = TextClip(
text=word.upper(),
font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold for consistency
font_size=68, # Slightly larger for highlight effect
color='#FFD700', # Gold/yellow like Opus Clip
stroke_color='#FF6B35', # Orange outline for pop
stroke_width=5
)
# Calculate precise word positioning within the chunk
words_before = chunk_words[:i]
chars_before = sum(len(w) for w in words_before) + len(words_before)
# More accurate character width calculation for Arial Bold
char_width = 35 # Adjusted for larger, bolder font
total_chunk_width = len(chunk_text) * char_width
word_x_offset = (chars_before * char_width) - (total_chunk_width // 2)
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((540 + word_x_offset, subtitle_position), relative=(False, True))
clips.append(highlighted_word)
print(f"✅ Added Opus-style subtitle ({subtitle_start:.1f}s-{subtitle_end:.1f}s): {text[:30]}...")
except Exception as e:
print(f"⚠️ Subtitle error: {e}, skipping subtitle: {text[:50]}...")
continue
final = CompositeVideoClip(clips, size=(1080, 1920))
final.write_videofile(output_path, codec="libx264", audio_codec="aac", threads=1)
# 💥 Force close to avoid Windows pipe errors
clip.reader.close()
if clip.audio:
clip.audio.reader.close()
final.close()
def generate_shorts(video_path, max_clips=3, output_folder="shorts"):
os.makedirs(output_folder, exist_ok=True)
best_moments = detect_loud_moments(video_path, threshold_db=-30)
selected = best_moments[:max_clips]
for i, (start, end) in enumerate(selected):
subtitles = transcribe_and_extract_subtitles(video_path, start, end)
out_path = os.path.join(output_folder, f"short_{i+1}.mp4")
create_short_clip(video_path, start, end, subtitles, out_path)
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python shorts_generator.py your_video.mp4")
else:
generate_shorts(sys.argv[1])
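
For reference, the loudness scoring above converts each chunk's mean amplitude to decibels; a self-contained check with synthetic data:

import numpy as np

# Two 5-second chunks at 44.1 kHz: one quiet, one loud (constant amplitude).
rate, chunk_duration = 44100, 5
quiet = np.full(rate * chunk_duration, 0.001)
loud = np.full(rate * chunk_duration, 0.1)
for name, chunk in [("quiet", quiet), ("loud", loud)]:
    db = 20 * np.log10(np.mean(chunk) + 1e-10)   # +1e-10 avoids log10(0)
    print(name, round(db, 1))                    # quiet: -60.0, loud: -20.0
# With threshold_db=-30 (as in generate_shorts), only the loud chunk is kept.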


@@ -2,9 +2,12 @@ import os
import math
import tempfile
import moviepy as mp
import speech_recognition as sr
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from pydub import AudioSegment
from pydub.silence import split_on_silence
import threading
from faster_whisper import WhisperModel
def format_time(seconds):
@@ -44,52 +47,71 @@
f.write(f"{wrap_text(sub['text'])}\n\n")
def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, chunk_duration=10):
def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, model_size="base", language="auto"):
try:
print("📽️ Loading video file...")
video = mp.VideoFileClip(video_path)
audio = video.audio
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
temp_audio_path = temp_audio_file.name
print("🔊 Extracting audio...")
audio.write_audiofile(temp_audio_path, logger=None)
recognizer = sr.Recognizer()
print(f"🤖 Loading Whisper model ({model_size})...")
# Initialize Whisper model - much more accurate than Google Speech Recognition
model = WhisperModel(model_size, device="cpu", compute_type="int8")
print("🎯 Transcribing with Whisper AI...")
# Transcribe the entire audio file at once - Whisper handles timing automatically
segments, info = model.transcribe(
temp_audio_path,
language=None if language == "auto" else language,
word_timestamps=True,
vad_filter=True, # Voice Activity Detection for better accuracy
vad_parameters=dict(min_silence_duration_ms=500)
)
print(f"🌍 Detected language: {info.language} (confidence: {info.language_probability:.2f})")
subtitles = []
segment_list = list(segments) # Convert generator to list for progress tracking
print(f"📝 Processing {len(segment_list)} speech segments...")
for i, segment in enumerate(segment_list):
# Whisper provides precise timing and text
start_time = segment.start
end_time = segment.end
text = segment.text.strip()
if text and len(text) > 0:
subtitles.append({
"start": start_time,
"end": end_time,
"text": text
})
print(f"✅ Segment {i+1}: '{text[:50]}...' ({start_time:.1f}s - {end_time:.1f}s)")
# Update progress bar
if progress_callback:
progress_callback(i + 1, len(segment_list))
with sr.AudioFile(temp_audio_path) as source:
audio_duration = source.DURATION
num_chunks = math.ceil(audio_duration / chunk_duration)
for i in range(num_chunks):
start_time = i * chunk_duration
end_time = min((i + 1) * chunk_duration, audio_duration)
source_offset = start_time
duration = end_time - start_time
audio_data = recognizer.record(source, offset=source_offset, duration=duration)
try:
text = recognizer.recognize_google(audio_data)
subtitles.append({
"start": start_time,
"end": end_time,
"text": text
})
except sr.UnknownValueError:
pass
except sr.RequestError as e:
print(f"API error: {e}")
# Update progress bar
if progress_callback:
progress_callback(i + 1, num_chunks)
os.remove(temp_audio_path)
# Clean up
if os.path.exists(temp_audio_path):
os.remove(temp_audio_path)
if video:
video.close()
if audio:
audio.close()
print(f"🎯 Generated {len(subtitles)} subtitle segments with Whisper AI")
write_srt(subtitles, srt_output_path)
return True
except Exception as e:
print(f"Error: {e}")
print(f"❌ Error: {e}")
return False
@@ -98,7 +120,7 @@ def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None,
def select_file_and_generate():
video_path = filedialog.askopenfilename(
title="Select a video file",
filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv")]
filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")]
)
if not video_path:
@@ -113,43 +135,127 @@
if not srt_output_path:
return
# Disable button during processing
select_button.config(state="disabled", text="Processing...")
progress_bar["value"] = 0
progress_label.config(text="Starting...")
progress_label.config(text="Starting speech recognition...")
status_label.config(text="🔄 Processing video...", fg="blue")
root.update()
def update_progress(current, total):
percent = (current / total) * 100
progress_bar["value"] = percent
progress_label.config(text=f"Progress: {current}/{total} chunks")
progress_label.config(text=f"Processing: {current}/{total} segments")
root.update()
success = transcribe_video_to_srt(video_path, srt_output_path, progress_callback=update_progress)
def process_video():
try:
model_size = model_size_var.get()
language = language_var.get()
success = transcribe_video_to_srt(
video_path,
srt_output_path,
progress_callback=update_progress,
model_size=model_size,
language=language
)
if success:
messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}")
else:
messagebox.showerror("Error", "Something went wrong. See console for details.")
if success:
status_label.config(text="✅ Subtitles generated successfully!", fg="green")
messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!")
else:
status_label.config(text="❌ Error occurred during processing", fg="red")
messagebox.showerror("Error", "Something went wrong. Check console for details.")
progress_label.config(text="Done")
except Exception as e:
status_label.config(text="❌ Unexpected error occurred", fg="red")
messagebox.showerror("Error", f"Unexpected error: {e}")
finally:
# Re-enable button
select_button.config(state="normal", text="📂 Select Video and Generate Subtitles")
progress_label.config(text="Done")
# Run in separate thread to prevent GUI freezing
thread = threading.Thread(target=process_video)
thread.daemon = True
thread.start()
# GUI Setup
root = tk.Tk()
root.title("Auto Subtitle Generator (.srt) with Progress")
root.title("🎬 Auto Subtitle Generator - Speech to SRT")
root.geometry("500x350")
frame = tk.Frame(root, padx=20, pady=20)
frame.pack()
frame.pack(fill="both", expand=True)
label = tk.Label(frame, text="Select a video file to auto-generate subtitles (SRT):")
label.pack(pady=(0, 10))
# Title
title_label = tk.Label(frame, text="🎬 Auto Subtitle Generator", font=("Arial", 16, "bold"))
title_label.pack(pady=(0, 10))
select_button = tk.Button(frame, text="Select Video and Generate Subtitles", command=select_file_and_generate)
select_button.pack(pady=5)
subtitle_label = tk.Label(frame, text="Extract speech from video and create perfectly timed SRT subtitles", font=("Arial", 10))
subtitle_label.pack(pady=(0, 20))
progress_bar = ttk.Progressbar(frame, length=300, mode="determinate")
progress_bar.pack(pady=(15, 5))
# Settings Frame
settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10)
settings_frame.pack(fill="x", pady=(0, 15))
progress_label = tk.Label(frame, text="Idle")
# Model Size Selection
model_frame = tk.Frame(settings_frame)
model_frame.pack(fill="x", pady=(0, 10))
tk.Label(model_frame, text="🧠 Model Size:", font=("Arial", 9)).pack(side="left")
model_size_var = tk.StringVar(value="base")
model_dropdown = ttk.Combobox(model_frame, textvariable=model_size_var,
values=["tiny", "base", "small", "medium", "large"],
state="readonly", width=12)
model_dropdown.pack(side="right")
# Language Selection
language_frame = tk.Frame(settings_frame)
language_frame.pack(fill="x", pady=(0, 10))
tk.Label(language_frame, text="🌍 Language:", font=("Arial", 9)).pack(side="left")
language_var = tk.StringVar(value="auto")
language_dropdown = ttk.Combobox(language_frame, textvariable=language_var,
values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"],
state="readonly", width=12)
language_dropdown.pack(side="right")
# Help text
help_label = tk.Label(
settings_frame,
text=" 💡 Base model recommended for best speed/accuracy balance\n 🔍 Auto language detection works for most videos",
font=("Arial", 8),
fg="gray"
)
help_label.pack(anchor="w")
# Main Action Button
select_button = tk.Button(
frame,
text="📂 Select Video and Generate Subtitles",
command=select_file_and_generate,
font=("Arial", 11, "bold"),
bg="#4CAF50",
fg="white",
pady=8
)
select_button.pack(pady=15, fill="x")
# Progress Section
progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10)
progress_frame.pack(fill="x", pady=(0, 10))
progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate")
progress_bar.pack(fill="x", pady=(0, 5))
progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9))
progress_label.pack()
# Status Label
status_label = tk.Label(frame, text="💡 Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!", font=("Arial", 9), fg="blue")
status_label.pack(pady=(10, 0))
root.mainloop()
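
Since the refactored transcribe_video_to_srt exposes model size and language as parameters, it can also be driven without the GUI. A hedged sketch, assuming the script is importable (the module and file names here are placeholders):

# Hypothetical headless use; replace subtitle_srt_gui with the actual module name.
from subtitle_srt_gui import transcribe_video_to_srt

ok = transcribe_video_to_srt(
    "input.mp4",                                   # placeholder video path
    "input.srt",                                   # placeholder SRT output path
    progress_callback=lambda done, total: print(f"{done}/{total} segments"),
    model_size="base",
    language="auto",
)
print("Subtitles written" if ok else "Transcription failed")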


@@ -1,3 +0,0 @@
1
00:00:00,000 --> 00:00:02,500
You're running

37
test_whisper.py Normal file

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Test script to verify faster_whisper integration
"""
import os
from faster_whisper import WhisperModel
def test_whisper_setup():
"""Test if faster_whisper is working correctly"""
print("🧪 Testing faster_whisper setup...")
try:
# Try to initialize the smallest model
print("📥 Loading tiny model (this might take a moment on first run)...")
model = WhisperModel("tiny")
print("✅ Successfully loaded Whisper tiny model!")
# Check available models
available_models = ["tiny", "base", "small", "medium", "large"]
print(f"🎯 Available models: {', '.join(available_models)}")
# Test basic functionality with a short audio
print("🔍 Whisper model ready for transcription!")
return True
except Exception as e:
print(f"❌ Error: {e}")
return False
if __name__ == "__main__":
if test_whisper_setup():
print("\n🎉 faster_whisper is ready to use!")
print("💡 Your subtitle generator now has much better speech recognition!")
else:
print("\n⚠️ There might be an issue with faster_whisper setup")