Refactor subtitle generation to integrate Whisper AI; remove unused files and enhance the GUI for a better user experience

klop51 2025-08-09 10:35:13 +02:00
parent 5ce79f084d
commit 491040b148
10 changed files with 223 additions and 710 deletions
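
The headline change is swapping Google Speech Recognition for faster-whisper in the subtitle generator. As an orientation sketch only (the input path "myvideo.mp4" is illustrative; the model size, device, and compute_type mirror the values used in the diff below), the new transcription flow boils down to:

# Orientation sketch of the faster-whisper flow this commit adopts.
# "myvideo.mp4" is an illustrative path, not taken from the diff.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")
segments, info = model.transcribe("myvideo.mp4", vad_filter=True)
print(f"Detected language: {info.language} ({info.language_probability:.2f})")
for seg in segments:
    print(f"{seg.start:7.2f} --> {seg.end:7.2f}  {seg.text.strip()}")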

app.py (deleted, 175 lines)

@@ -1,175 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json

# Global settings with defaults
settings = {
    "subtitle_y_px": 1550,
    "highlight_offset": -8,
    "font_size_subtitle": 65,
    "font_size_highlight": 68,
    "highlight_x_offset": 0,
    "video_path": None,
    "selected_font": "Arial"  # Default font
}

# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
    "Arial",
    "Times-Roman",
    "Helvetica",
    "Courier",
    "Comic-Sans-MS",
    "Impact",
    "Verdana",
    "Tahoma",
    "Georgia",
    "Trebuchet-MS"
]

preset_file = "subtitle_gui_presets.json"

def save_presets():
    with open(preset_file, "w") as f:
        json.dump(settings, f)
    print("💾 Presets saved!")

def load_presets():
    global settings
    try:
        with open(preset_file, "r") as f:
            loaded = json.load(f)
        settings.update(loaded)
        print("✅ Presets loaded!")
        sync_gui()
    except FileNotFoundError:
        print("⚠️ No presets found.")

def sync_gui():
    sub_y_slider.set(settings["subtitle_y_px"])
    highlight_slider.set(settings["highlight_offset"])
    highlight_x_slider.set(settings["highlight_x_offset"])
    sub_font_slider.set(settings["font_size_subtitle"])
    highlight_font_slider.set(settings["font_size_highlight"])
    font_var.set(settings["selected_font"])

def render_preview():
    if not settings["video_path"]:
        print("⚠️ No video selected.")
        return
    clip = VideoFileClip(settings["video_path"]).subclipped(0, 3)  # Use first 3 seconds
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    subtitle_text = "THIS IS A TEST SUBTITLE"
    highlight_word = "SUBTITLE"
    base_subtitle = TextClip(
        text=subtitle_text,
        font_size=settings["font_size_subtitle"],
        font=settings["selected_font"],
        color='white',
        stroke_color='black',
        stroke_width=5
    ).with_duration(3).with_position(('center', settings["subtitle_y_px"]))
    # Compute highlight word position
    full_text = subtitle_text.upper()
    words = full_text.split()
    highlight_index = words.index(highlight_word.upper())
    chars_before = sum(len(w) + 1 for w in words[:highlight_index])
    char_width = 35
    total_width = len(full_text) * char_width
    x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
    highlighted_word = TextClip(
        text=highlight_word,
        font_size=settings["font_size_highlight"],
        font=settings["selected_font"],
        color='#FFD700',
        stroke_color='#FF6B35',
        stroke_width=5
    ).with_duration(1.5).with_start(0.75).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
    # Scale down the preview to fit a 1080p monitor (max height ~900px to leave room for the taskbar)
    preview_scale = 900 / 1920  # Scale factor to fit height
    preview_width = int(1080 * preview_scale)
    preview_height = int(1920 * preview_scale)
    preview_clip = final.resized((preview_width, preview_height))
    preview_clip.preview(fps=24, audio=False)
    clip.close()
    final.close()
    preview_clip.close()

def update_setting(var_name, value):
    settings[var_name] = int(value)

def update_font(font_name):
    settings["selected_font"] = font_name

def open_video():
    file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
    if file_path:
        settings["video_path"] = file_path
        print(f"📂 Loaded video: {file_path}")

def start_preview_thread():
    threading.Thread(target=render_preview).start()

# GUI Setup
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("400x600")

load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)

tk.Label(root, text="Font Family").pack()
font_var = tk.StringVar(value=settings["selected_font"])
font_dropdown = tk.OptionMenu(root, font_var, *COMPATIBLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)

tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
                        command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()

tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
                            command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()

tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
                              command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()

tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                           command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()

tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                                 command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()

preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)

save_btn = tk.Button(root, text="💾 Save Preset", command=save_presets)
save_btn.pack(pady=5)

load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)

root.mainloop()
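
This GUI positions the highlight with a fixed-character-width estimate rather than measuring the rendered text. A worked instance of the arithmetic above, using the same char_width = 35 assumption the code makes:

# Worked instance of the x-offset estimate used in render_preview above.
# char_width is the same fixed 35 px the code assumes; real glyph widths vary.
subtitle_text = "THIS IS A TEST SUBTITLE"
words = subtitle_text.split()
highlight_index = words.index("SUBTITLE")
chars_before = sum(len(w) + 1 for w in words[:highlight_index])  # "THIS IS A TEST " -> 15 chars
char_width = 35
total_width = len(subtitle_text) * char_width                    # 23 * 35 = 805 px
x_offset = chars_before * char_width - total_width // 2          # 525 - 402 = 123 px right of center
print(x_offset)  # 123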

app2.py (deleted, 322 lines)

@@ -1,322 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
import re
import os
import platform

def get_system_fonts():
    """Get list of available system fonts"""
    fonts = []
    if platform.system() == "Windows":
        # Common Windows font paths
        font_paths = [
            "C:/Windows/Fonts/",
            "C:/Windows/System32/Fonts/"
        ]
        common_fonts = []
        for font_path in font_paths:
            if os.path.exists(font_path):
                for file in os.listdir(font_path):
                    if file.endswith(('.ttf', '.otf')):
                        # Extract font name without extension
                        font_name = os.path.splitext(file)[0]
                        # Clean up common variations
                        if 'arial' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('arial.ttf')
                        elif 'times' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('times.ttf')
                        elif 'courier' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('cour.ttf')
                        elif 'comic' in font_name.lower():
                            common_fonts.append('comic.ttf')
                        elif 'impact' in font_name.lower():
                            common_fonts.append('impact.ttf')
                        elif 'verdana' in font_name.lower():
                            common_fonts.append('verdana.ttf')
                        elif 'tahoma' in font_name.lower():
                            common_fonts.append('tahoma.ttf')
        # Add found fonts, fall back to common Windows fonts
        fonts = list(set(common_fonts)) if common_fonts else [
            'arial.ttf', 'times.ttf', 'cour.ttf', 'comic.ttf',
            'impact.ttf', 'verdana.ttf', 'tahoma.ttf'
        ]
    # Add option to use no font (system default)
    fonts.insert(0, 'System Default')
    return fonts

AVAILABLE_FONTS = get_system_fonts()

# Global settings with defaults
settings = {
    "subtitle_y_px": 1550,
    "highlight_offset": -8,
    "font_size_subtitle": 65,
    "font_size_highlight": 68,
    "highlight_x_offset": 0,
    "video_path": None,
    "font": "System Default",
    "subtitles": [],
    "current_index": 0
}

# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
    "Arial",
    "Times-Roman",
    "Helvetica",
    "Courier",
    "Comic-Sans-MS",
    "Impact",
    "Verdana",
    "Tahoma",
    "Georgia",
    "Trebuchet-MS"
]

preset_file = "subtitle_gui_presets.json"

# === SRT PARSER ===
def parse_srt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = f.read()
    pattern = r"(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\d+|\Z)"
    matches = re.findall(pattern, contents)
    subtitles = []
    for _, start, end, text in matches:
        subtitles.append({
            "start": srt_time_to_seconds(start),
            "end": srt_time_to_seconds(end),
            "text": text.replace('\n', ' ')
        })
    return subtitles

def srt_time_to_seconds(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000

# === PRESETS ===
def save_presets():
    with open(preset_file, "w") as f:
        json.dump(settings, f)
    print("📂 Presets saved!")

def load_presets():
    global settings
    try:
        with open(preset_file, "r") as f:
            loaded = json.load(f)
        settings.update(loaded)
        print("✅ Presets loaded!")
        sync_gui()
    except FileNotFoundError:
        print("⚠️ No presets found.")

# === SYNC ===
def sync_gui():
    sub_y_slider.set(settings["subtitle_y_px"])
    highlight_slider.set(settings["highlight_offset"])
    highlight_x_slider.set(settings["highlight_x_offset"])
    sub_font_slider.set(settings["font_size_subtitle"])
    highlight_font_slider.set(settings["font_size_highlight"])
    font_dropdown_var.set(settings["font"])

def render_preview():
    if not settings["video_path"] or not settings["subtitles"]:
        print("⚠️ Video or subtitles not loaded.")
        return
    sub = settings["subtitles"][settings["current_index"]]
    subtitle_text = sub["text"]
    start_time = sub["start"]
    end_time = sub["end"]
    duration = end_time - start_time
    clip = VideoFileClip(settings["video_path"]).subclipped(start_time, end_time)
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    highlight_word = subtitle_text.split()[-1]  # Highlight last word for now
    # Create TextClip with font if specified, otherwise use system default
    if settings["font"] == "System Default":
        base_subtitle = TextClip(
            text=subtitle_text,
            font_size=settings["font_size_subtitle"],
            color='white',
            stroke_color='black',
            stroke_width=5
        ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
    else:
        try:
            base_subtitle = TextClip(
                text=subtitle_text,
                font=settings["font"],
                font_size=settings["font_size_subtitle"],
                color='white',
                stroke_color='black',
                stroke_width=5
            ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
        except:
            # Fallback to system default if font fails
            print(f"⚠️ Font {settings['font']} failed, using system default")
            base_subtitle = TextClip(
                text=subtitle_text,
                font_size=settings["font_size_subtitle"],
                color='white',
                stroke_color='black',
                stroke_width=5
            ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
    full_text = subtitle_text.upper()
    words = full_text.split()
    try:
        highlight_index = words.index(highlight_word.upper())
    except ValueError:
        highlight_index = len(words) - 1
    chars_before = sum(len(w) + 1 for w in words[:highlight_index])
    char_width = 35
    total_width = len(full_text) * char_width
    x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
    # Create highlighted word with same font logic
    if settings["font"] == "System Default":
        highlighted_word = TextClip(
            text=highlight_word,
            font_size=settings["font_size_highlight"],
            color='#FFD700',
            stroke_color='#FF6B35',
            stroke_width=5
        ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    else:
        try:
            highlighted_word = TextClip(
                text=highlight_word,
                font=settings["font"],
                font_size=settings["font_size_highlight"],
                color='#FFD700',
                stroke_color='#FF6B35',
                stroke_width=5
            ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
        except:
            # Fallback to system default if font fails
            highlighted_word = TextClip(
                text=highlight_word,
                font_size=settings["font_size_highlight"],
                color='#FFD700',
                stroke_color='#FF6B35',
                stroke_width=5
            ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
    # Scale down the preview to fit a 1080p monitor (max height ~900px to leave room for the taskbar)
    preview_scale = 900 / 1920  # Scale factor to fit height
    preview_width = int(1080 * preview_scale)
    preview_height = int(1920 * preview_scale)
    preview_clip = final.resized((preview_width, preview_height))
    preview_clip.preview(fps=24, audio=False)
    clip.close()
    final.close()
    preview_clip.close()

def update_setting(var_name, value):
    settings[var_name] = int(value) if var_name.startswith("font_size") or "offset" in var_name or "y_px" in var_name else value

def update_font(value):
    settings["font"] = value

def open_video():
    file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
    if file_path:
        settings["video_path"] = file_path
        print(f"📂 Loaded video: {file_path}")

def load_srt():
    file_path = filedialog.askopenfilename(filetypes=[("SRT Subtitle", "*.srt")])
    if file_path:
        settings["subtitles"] = parse_srt(file_path)
        settings["current_index"] = 0
        print(f"📝 Loaded {len(settings['subtitles'])} subtitles from {file_path}")

def next_sub():
    if settings["current_index"] < len(settings["subtitles"]) - 1:
        settings["current_index"] += 1
        start_preview_thread()

def prev_sub():
    if settings["current_index"] > 0:
        settings["current_index"] -= 1
        start_preview_thread()

def start_preview_thread():
    threading.Thread(target=render_preview).start()

# === GUI ===
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("420x700")

load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)

load_srt_btn = tk.Button(root, text="📑 Load SRT Subtitles", command=load_srt)
load_srt_btn.pack(pady=5)

tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
                        command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()

tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
                            command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()

tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
                              command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()

tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                           command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()

tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                                 command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()

tk.Label(root, text="Font").pack()
font_dropdown_var = tk.StringVar(value=settings["font"])
font_dropdown = tk.OptionMenu(root, font_dropdown_var, *AVAILABLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)

preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)

nav_frame = tk.Frame(root)
tk.Button(nav_frame, text="⏮️ Prev", command=prev_sub).pack(side="left", padx=5)
tk.Button(nav_frame, text="⏭️ Next", command=next_sub).pack(side="right", padx=5)
nav_frame.pack(pady=5)

save_btn = tk.Button(root, text="📂 Save Preset", command=save_presets)
save_btn.pack(pady=5)

load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)

root.mainloop()
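
The SRT parser removed with app2.py is a self-contained regex plus a timestamp conversion. A standalone check of that conversion (the sample timestamp is illustrative):

# Same HH:MM:SS,mmm -> seconds conversion as srt_time_to_seconds above;
# the sample timestamp is an illustrative value, not taken from the diff.
def srt_time_to_seconds(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

assert srt_time_to_seconds("00:02:10,500") == 130.5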

myvideo.srt (new file, 22 lines)

@@ -0,0 +1,22 @@
1
00:00:30,000 --> 00:00:40,000
okay after we will

2
00:02:00,000 --> 00:02:10,000
find it difficult to believe we prepare
to fight arm and arm the corny and royal
family

3
00:02:20,000 --> 00:02:30,000
hello me

4
00:02:30,000 --> 00:02:40,000
as expected

5
00:02:40,000 --> 00:02:50,000
gacha

(modified file, name not shown in this view)

@@ -112,7 +112,8 @@ def create_short_clip(video_path, start, end, subtitles, output_path):
             word_width, _ = highlighted_word.size
             word_x = current_x + (word_width / 2)
-            highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125, subtitle_y_px))
+            highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125
+            , subtitle_y_px))
             clips.append(highlighted_word)
             current_x += word_width + 20  # Add spacing between words

sub2.srt (new file, 4 lines)

@@ -0,0 +1,4 @@
1
00:00:00,000 --> 00:00:08,250
Yeah, yeah, level she's 24.

(deleted file, name not shown in this view)

@@ -1,157 +0,0 @@
import os
import numpy as np
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
from faster_whisper import WhisperModel

def detect_loud_moments(video_path, chunk_duration=5, threshold_db=10):
    print("🔍 Analyzing audio...")
    clip = VideoFileClip(video_path)
    audio = clip.audio.to_soundarray(fps=44100)
    volume = np.linalg.norm(audio, axis=1)
    chunk_size = int(chunk_duration * 44100)
    loud_chunks = []
    max_db = -float('inf')
    for i in range(0, len(volume), chunk_size):
        chunk = volume[i:i+chunk_size]
        db = 20 * np.log10(np.mean(chunk) + 1e-10)
        max_db = max(max_db, db)
        if db > threshold_db:
            start = i / 44100
            loud_chunks.append((start, min(start + chunk_duration, clip.duration)))
    print(f"🔊 Max volume found: {max_db:.2f} dB, threshold: {threshold_db} dB")
    print(f"📈 Found {len(loud_chunks)} loud moments")
    clip.close()
    return loud_chunks

def transcribe_and_extract_subtitles(video_path, start, end):
    print(f"🗣️ Transcribing audio from {start:.2f}s to {end:.2f}s...")
    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, _ = model.transcribe(video_path, beam_size=5, language="en", vad_filter=True)
    subtitles = []
    for segment in segments:
        if start <= segment.start <= end:
            subtitles.append((segment.start - start, segment.end - start, segment.text))
    return subtitles

def create_short_clip(video_path, start, end, subtitles, output_path):
    print(f"🎬 Creating short: {output_path}")
    clip = VideoFileClip(video_path).subclipped(start, end)
    video_duration = clip.duration
    print(f"📏 Video clip duration: {video_duration:.2f}s")
    # Convert to vertical 9:16
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    clips = [vertical_clip]
    for (s, e, text) in subtitles:
        try:
            # Ensure subtitle timing doesn't exceed video duration
            subtitle_start = max(0, s)
            subtitle_end = min(e, video_duration)
            if subtitle_start >= video_duration or subtitle_end <= subtitle_start:
                print(f"⚠️ Skipping subtitle outside video duration: {text[:30]}...")
                continue
            # Opus Clip style professional subtitles
            words = text.strip().split()
            if not words:
                continue
            # Break text into smaller chunks for better readability (max 3-4 words per line)
            chunks = []
            current_chunk = []
            for word in words:
                current_chunk.append(word)
                if len(current_chunk) >= 3 or len(' '.join(current_chunk)) > 25:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            # Position subtitles in the center-bottom area (Opus style)
            subtitle_position = 0.75
            # Create subtitle for each chunk with smooth transitions
            chunk_duration = (subtitle_end - subtitle_start) / len(chunks)
            for chunk_idx, chunk_text in enumerate(chunks):
                chunk_start = subtitle_start + (chunk_idx * chunk_duration)
                chunk_end = min(chunk_start + chunk_duration, subtitle_end)
                chunk_words = chunk_text.split()
                # Base subtitle with Opus-style design (bold white text with strong outline)
                base_subtitle = TextClip(
                    text=chunk_text.upper(),
                    font='C:/Windows/Fonts/LatoWeb-Bold.ttf',  # Lato Bold - excellent for subtitles
                    font_size=65,  # Larger, chunkier text
                    color='white',
                    stroke_color='black',
                    stroke_width=5  # Thicker outline for better readability
                )
                base_subtitle = base_subtitle.with_start(chunk_start).with_end(chunk_end).with_position(('center', subtitle_position), relative=True)
                clips.append(base_subtitle)
                # Opus-style word-by-word highlighting (yellow/orange like Opus)
                word_duration = chunk_duration / len(chunk_words)
                for i, word in enumerate(chunk_words):
                    word_start = chunk_start + (i * word_duration)
                    word_end = min(word_start + word_duration * 0.8, chunk_end)
                    # Opus-style highlighted word (vibrant yellow/orange)
                    highlighted_word = TextClip(
                        text=word.upper(),
                        font='C:/Windows/Fonts/LatoWeb-Bold.ttf',  # Lato Bold for consistency
                        font_size=68,  # Slightly larger for highlight effect
                        color='#FFD700',  # Gold/yellow like Opus Clip
                        stroke_color='#FF6B35',  # Orange outline for pop
                        stroke_width=5
                    )
                    # Calculate precise word positioning within the chunk
                    words_before = chunk_words[:i]
                    chars_before = sum(len(w) for w in words_before) + len(words_before)
                    # More accurate character width calculation for Arial Bold
                    char_width = 35  # Adjusted for larger, bolder font
                    total_chunk_width = len(chunk_text) * char_width
                    word_x_offset = (chars_before * char_width) - (total_chunk_width // 2)
                    highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((540 + word_x_offset, subtitle_position), relative=(False, True))
                    clips.append(highlighted_word)
            print(f"✅ Added Opus-style subtitle ({subtitle_start:.1f}s-{subtitle_end:.1f}s): {text[:30]}...")
        except Exception as e:
            print(f"⚠️ Subtitle error: {e}, skipping subtitle: {text[:50]}...")
            continue
    final = CompositeVideoClip(clips, size=(1080, 1920))
    final.write_videofile(output_path, codec="libx264", audio_codec="aac", threads=1)
    # 💥 Force close to avoid Windows pipe errors
    clip.reader.close()
    if clip.audio:
        clip.audio.reader.close()
    final.close()

def generate_shorts(video_path, max_clips=3, output_folder="shorts"):
    os.makedirs(output_folder, exist_ok=True)
    best_moments = detect_loud_moments(video_path, threshold_db=-30)
    selected = best_moments[:max_clips]
    for i, (start, end) in enumerate(selected):
        subtitles = transcribe_and_extract_subtitles(video_path, start, end)
        out_path = os.path.join(output_folder, f"short_{i+1}.mp4")
        create_short_clip(video_path, start, end, subtitles, out_path)

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python shorts_generator.py your_video.mp4")
    else:
        generate_shorts(sys.argv[1])
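
detect_loud_moments above scores fixed 5-second chunks with 20 * log10 of the mean channel norm. A small sketch of the same measure on synthetic tones shows the scale the -30 dB threshold operates on (np.abs stands in for the stereo channel norm; the amplitudes are illustrative):

# Sketch of the loudness measure used by detect_loud_moments, on synthetic tones.
import numpy as np

sr = 44100
t = np.linspace(0, 1, sr, endpoint=False)
for name, amp in [("quiet", 0.01), ("loud", 0.5)]:
    chunk = np.abs(amp * np.sin(2 * np.pi * 440 * t))
    db = 20 * np.log10(np.mean(chunk) + 1e-10)
    print(f"{name}: {db:.1f} dB")  # quiet ≈ -43.9 dB, loud ≈ -9.9 dB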

(modified file, name not shown in this view)

@@ -2,9 +2,12 @@ import os
 import math
 import tempfile
 import moviepy as mp
-import speech_recognition as sr
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+import threading
+from faster_whisper import WhisperModel

 def format_time(seconds):

@@ -44,52 +47,71 @@ def write_srt(subtitles, output_path):
         f.write(f"{wrap_text(sub['text'])}\n\n")

-def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, chunk_duration=10):
+def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, model_size="base", language="auto"):
     try:
+        print("📽️ Loading video file...")
         video = mp.VideoFileClip(video_path)
         audio = video.audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
             temp_audio_path = temp_audio_file.name
+        print("🔊 Extracting audio...")
         audio.write_audiofile(temp_audio_path, logger=None)
-        recognizer = sr.Recognizer()
+        print(f"🤖 Loading Whisper model ({model_size})...")
+        # Initialize Whisper model - much more accurate than Google Speech Recognition
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+        print("🎯 Transcribing with Whisper AI...")
+        # Transcribe the entire audio file at once - Whisper handles timing automatically
+        segments, info = model.transcribe(
+            temp_audio_path,
+            language=None if language == "auto" else language,
+            word_timestamps=True,
+            vad_filter=True,  # Voice Activity Detection for better accuracy
+            vad_parameters=dict(min_silence_duration_ms=500)
+        )
+        print(f"🌍 Detected language: {info.language} (confidence: {info.language_probability:.2f})")
         subtitles = []
-        with sr.AudioFile(temp_audio_path) as source:
-            audio_duration = source.DURATION
-            num_chunks = math.ceil(audio_duration / chunk_duration)
-            for i in range(num_chunks):
-                start_time = i * chunk_duration
-                end_time = min((i + 1) * chunk_duration, audio_duration)
-                source_offset = start_time
-                duration = end_time - start_time
-                audio_data = recognizer.record(source, offset=source_offset, duration=duration)
-                try:
-                    text = recognizer.recognize_google(audio_data)
-                    subtitles.append({
-                        "start": start_time,
-                        "end": end_time,
-                        "text": text
-                    })
-                except sr.UnknownValueError:
-                    pass
-                except sr.RequestError as e:
-                    print(f"API error: {e}")
-                # Update progress bar
-                if progress_callback:
-                    progress_callback(i + 1, num_chunks)
-        os.remove(temp_audio_path)
+        segment_list = list(segments)  # Convert generator to list for progress tracking
+        print(f"📝 Processing {len(segment_list)} speech segments...")
+        for i, segment in enumerate(segment_list):
+            # Whisper provides precise timing and text
+            start_time = segment.start
+            end_time = segment.end
+            text = segment.text.strip()
+            if text and len(text) > 0:
+                subtitles.append({
+                    "start": start_time,
+                    "end": end_time,
+                    "text": text
+                })
+                print(f"✅ Segment {i+1}: '{text[:50]}...' ({start_time:.1f}s - {end_time:.1f}s)")
+            # Update progress bar
+            if progress_callback:
+                progress_callback(i + 1, len(segment_list))
+        # Clean up
+        if os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)
+        if video:
+            video.close()
+        if audio:
+            audio.close()
+        print(f"🎯 Generated {len(subtitles)} subtitle segments with Whisper AI")
         write_srt(subtitles, srt_output_path)
         return True
     except Exception as e:
-        print(f"Error: {e}")
+        print(f"❌ Error: {e}")
         return False

@@ -98,7 +120,7 @@ def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None,

 def select_file_and_generate():
     video_path = filedialog.askopenfilename(
         title="Select a video file",
-        filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv")]
+        filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")]
     )
     if not video_path:

@@ -113,43 +135,127 @@ def select_file_and_generate():
     if not srt_output_path:
         return

+    # Disable button during processing
+    select_button.config(state="disabled", text="Processing...")
     progress_bar["value"] = 0
-    progress_label.config(text="Starting...")
+    progress_label.config(text="Starting speech recognition...")
+    status_label.config(text="🔄 Processing video...", fg="blue")
     root.update()

     def update_progress(current, total):
         percent = (current / total) * 100
         progress_bar["value"] = percent
-        progress_label.config(text=f"Progress: {current}/{total} chunks")
+        progress_label.config(text=f"Processing: {current}/{total} segments")
         root.update()

-    success = transcribe_video_to_srt(video_path, srt_output_path, progress_callback=update_progress)
-    if success:
-        messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}")
-    else:
-        messagebox.showerror("Error", "Something went wrong. See console for details.")
-    progress_label.config(text="Done")
+    def process_video():
+        try:
+            model_size = model_size_var.get()
+            language = language_var.get()
+            success = transcribe_video_to_srt(
+                video_path,
+                srt_output_path,
+                progress_callback=update_progress,
+                model_size=model_size,
+                language=language
+            )
+            if success:
+                status_label.config(text="✅ Subtitles generated successfully!", fg="green")
+                messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!")
+            else:
+                status_label.config(text="❌ Error occurred during processing", fg="red")
+                messagebox.showerror("Error", "Something went wrong. Check console for details.")
+        except Exception as e:
+            status_label.config(text="❌ Unexpected error occurred", fg="red")
+            messagebox.showerror("Error", f"Unexpected error: {e}")
+        finally:
+            # Re-enable button
+            select_button.config(state="normal", text="📂 Select Video and Generate Subtitles")
+            progress_label.config(text="Done")
+
+    # Run in separate thread to prevent GUI freezing
+    thread = threading.Thread(target=process_video)
+    thread.daemon = True
+    thread.start()

 # GUI Setup
 root = tk.Tk()
-root.title("Auto Subtitle Generator (.srt) with Progress")
+root.title("🎬 Auto Subtitle Generator - Speech to SRT")
+root.geometry("500x350")

 frame = tk.Frame(root, padx=20, pady=20)
-frame.pack()
+frame.pack(fill="both", expand=True)

-label = tk.Label(frame, text="Select a video file to auto-generate subtitles (SRT):")
-label.pack(pady=(0, 10))
+# Title
+title_label = tk.Label(frame, text="🎬 Auto Subtitle Generator", font=("Arial", 16, "bold"))
+title_label.pack(pady=(0, 10))

-select_button = tk.Button(frame, text="Select Video and Generate Subtitles", command=select_file_and_generate)
-select_button.pack(pady=5)
+subtitle_label = tk.Label(frame, text="Extract speech from video and create perfectly timed SRT subtitles", font=("Arial", 10))
+subtitle_label.pack(pady=(0, 20))

-progress_bar = ttk.Progressbar(frame, length=300, mode="determinate")
-progress_bar.pack(pady=(15, 5))
+# Settings Frame
+settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10)
+settings_frame.pack(fill="x", pady=(0, 15))

-progress_label = tk.Label(frame, text="Idle")
+# Model Size Selection
+model_frame = tk.Frame(settings_frame)
+model_frame.pack(fill="x", pady=(0, 10))
+tk.Label(model_frame, text="🧠 Model Size:", font=("Arial", 9)).pack(side="left")
+model_size_var = tk.StringVar(value="base")
+model_dropdown = ttk.Combobox(model_frame, textvariable=model_size_var,
+                              values=["tiny", "base", "small", "medium", "large"],
+                              state="readonly", width=12)
+model_dropdown.pack(side="right")
+
+# Language Selection
+language_frame = tk.Frame(settings_frame)
+language_frame.pack(fill="x", pady=(0, 10))
+tk.Label(language_frame, text="🌍 Language:", font=("Arial", 9)).pack(side="left")
+language_var = tk.StringVar(value="auto")
+language_dropdown = ttk.Combobox(language_frame, textvariable=language_var,
+                                 values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"],
+                                 state="readonly", width=12)
+language_dropdown.pack(side="right")
+
+# Help text
+help_label = tk.Label(
+    settings_frame,
+    text=" 💡 Base model recommended for best speed/accuracy balance\n 🔍 Auto language detection works for most videos",
+    font=("Arial", 8),
+    fg="gray"
+)
+help_label.pack(anchor="w")
+
+# Main Action Button
+select_button = tk.Button(
+    frame,
+    text="📂 Select Video and Generate Subtitles",
+    command=select_file_and_generate,
+    font=("Arial", 11, "bold"),
+    bg="#4CAF50",
+    fg="white",
+    pady=8
+)
+select_button.pack(pady=15, fill="x")
+
+# Progress Section
+progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10)
+progress_frame.pack(fill="x", pady=(0, 10))
+progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate")
+progress_bar.pack(fill="x", pady=(0, 5))
+progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9))
 progress_label.pack()

+# Status Label
+status_label = tk.Label(frame, text="💡 Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!", font=("Arial", 9), fg="blue")
+status_label.pack(pady=(10, 0))
+
 root.mainloop()
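
The diff calls write_srt and format_time without showing their bodies. A hedged sketch of the SRT shape the new pipeline writes; format_time here is an assumed reconstruction, not the file's actual implementation (the sample cue mirrors sub2.srt above):

# Hedged sketch: format_time below is an assumed implementation, written only
# to show the SRT shape write_srt produces; the real body is not in this diff.
def format_time(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int(round((seconds - int(seconds)) * 1000))
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

subtitles = [{"start": 0.0, "end": 8.25, "text": "Yeah, yeah, level she's 24."}]
with open("out.srt", "w", encoding="utf-8") as f:
    for i, sub in enumerate(subtitles, start=1):
        f.write(f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n")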

(deleted file, name not shown in this view)

@@ -1,3 +0,0 @@
1
00:00:00,000 --> 00:00:02,500
You're running

test_whisper.py (new file, 37 lines)

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Test script to verify faster_whisper integration
"""
import os
from faster_whisper import WhisperModel

def test_whisper_setup():
    """Test if faster_whisper is working correctly"""
    print("🧪 Testing faster_whisper setup...")
    try:
        # Try to initialize the smallest model
        print("📥 Loading tiny model (this might take a moment on first run)...")
        model = WhisperModel("tiny")
        print("✅ Successfully loaded Whisper tiny model!")
        # Check available models
        available_models = ["tiny", "base", "small", "medium", "large"]
        print(f"🎯 Available models: {', '.join(available_models)}")
        # Test basic functionality with a short audio
        print("🔍 Whisper model ready for transcription!")
        return True
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

if __name__ == "__main__":
    if test_whisper_setup():
        print("\n🎉 faster_whisper is ready to use!")
        print("💡 Your subtitle generator now has much better speech recognition!")
    else:
        print("\n⚠️ There might be an issue with faster_whisper setup")