Refactor subtitle generation to integrate Whisper AI; remove unused files and enhance the GUI for a better user experience

klop51 2025-08-09 10:35:13 +02:00
parent 5ce79f084d
commit 491040b148
10 changed files with 223 additions and 710 deletions
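
The headline change is swapping Google Speech Recognition for faster-whisper in the subtitle generator. As an orientation sketch only (the input path "myvideo.mp4" is illustrative; the model size, device, and compute_type mirror the values used in the diff below), the new transcription flow boils down to:

# Orientation sketch of the faster-whisper flow this commit adopts.
# "myvideo.mp4" is an illustrative path, not taken from the diff.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")
segments, info = model.transcribe("myvideo.mp4", vad_filter=True)
print(f"Detected language: {info.language} ({info.language_probability:.2f})")
for seg in segments:
    print(f"{seg.start:7.2f} --> {seg.end:7.2f}  {seg.text.strip()}")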

app.py (deleted, 175 lines)

@@ -1,175 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json

# Global settings with defaults
settings = {
    "subtitle_y_px": 1550,
    "highlight_offset": -8,
    "font_size_subtitle": 65,
    "font_size_highlight": 68,
    "highlight_x_offset": 0,
    "video_path": None,
    "selected_font": "Arial"  # Default font
}

# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
    "Arial",
    "Times-Roman",
    "Helvetica",
    "Courier",
    "Comic-Sans-MS",
    "Impact",
    "Verdana",
    "Tahoma",
    "Georgia",
    "Trebuchet-MS"
]

preset_file = "subtitle_gui_presets.json"

def save_presets():
    with open(preset_file, "w") as f:
        json.dump(settings, f)
    print("💾 Presets saved!")

def load_presets():
    global settings
    try:
        with open(preset_file, "r") as f:
            loaded = json.load(f)
        settings.update(loaded)
        print("✅ Presets loaded!")
        sync_gui()
    except FileNotFoundError:
        print("⚠️ No presets found.")

def sync_gui():
    sub_y_slider.set(settings["subtitle_y_px"])
    highlight_slider.set(settings["highlight_offset"])
    highlight_x_slider.set(settings["highlight_x_offset"])
    sub_font_slider.set(settings["font_size_subtitle"])
    highlight_font_slider.set(settings["font_size_highlight"])
    font_var.set(settings["selected_font"])

def render_preview():
    if not settings["video_path"]:
        print("⚠️ No video selected.")
        return
    clip = VideoFileClip(settings["video_path"]).subclipped(0, 3)  # Use first 3 seconds
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    subtitle_text = "THIS IS A TEST SUBTITLE"
    highlight_word = "SUBTITLE"
    base_subtitle = TextClip(
        text=subtitle_text,
        font_size=settings["font_size_subtitle"],
        font=settings["selected_font"],
        color='white',
        stroke_color='black',
        stroke_width=5
    ).with_duration(3).with_position(('center', settings["subtitle_y_px"]))
    # Compute highlight word position
    full_text = subtitle_text.upper()
    words = full_text.split()
    highlight_index = words.index(highlight_word.upper())
    chars_before = sum(len(w) + 1 for w in words[:highlight_index])
    char_width = 35
    total_width = len(full_text) * char_width
    x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
    highlighted_word = TextClip(
        text=highlight_word,
        font_size=settings["font_size_highlight"],
        font=settings["selected_font"],
        color='#FFD700',
        stroke_color='#FF6B35',
        stroke_width=5
    ).with_duration(1.5).with_start(0.75).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
    # Scale down the preview to fit a 1080p monitor (max height ~900px to leave room for the taskbar)
    preview_scale = 900 / 1920  # Scale factor to fit height
    preview_width = int(1080 * preview_scale)
    preview_height = int(1920 * preview_scale)
    preview_clip = final.resized((preview_width, preview_height))
    preview_clip.preview(fps=24, audio=False)
    clip.close()
    final.close()
    preview_clip.close()

def update_setting(var_name, value):
    settings[var_name] = int(value)

def update_font(font_name):
    settings["selected_font"] = font_name

def open_video():
    file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
    if file_path:
        settings["video_path"] = file_path
        print(f"📂 Loaded video: {file_path}")

def start_preview_thread():
    threading.Thread(target=render_preview).start()

# GUI Setup
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("400x600")

load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)

tk.Label(root, text="Font Family").pack()
font_var = tk.StringVar(value=settings["selected_font"])
font_dropdown = tk.OptionMenu(root, font_var, *COMPATIBLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)

tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
                        command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()

tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
                            command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()

tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
                              command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()

tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                           command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()

tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                                 command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()

preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)

save_btn = tk.Button(root, text="💾 Save Preset", command=save_presets)
save_btn.pack(pady=5)

load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)

root.mainloop()
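
This GUI positions the highlight with a fixed-character-width estimate rather than measuring the rendered text. A worked instance of the arithmetic above, using the same char_width = 35 assumption the code makes:

# Worked instance of the x-offset estimate used in render_preview above.
# char_width is the same fixed 35 px the code assumes; real glyph widths vary.
subtitle_text = "THIS IS A TEST SUBTITLE"
words = subtitle_text.split()
highlight_index = words.index("SUBTITLE")
chars_before = sum(len(w) + 1 for w in words[:highlight_index])  # "THIS IS A TEST " -> 15 chars
char_width = 35
total_width = len(subtitle_text) * char_width                    # 23 * 35 = 805 px
x_offset = chars_before * char_width - total_width // 2          # 525 - 402 = 123 px right of center
print(x_offset)  # 123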

app2.py (deleted, 322 lines)

@@ -1,322 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
import re
import os
import platform

def get_system_fonts():
    """Get list of available system fonts"""
    fonts = []
    if platform.system() == "Windows":
        # Common Windows font paths
        font_paths = [
            "C:/Windows/Fonts/",
            "C:/Windows/System32/Fonts/"
        ]
        common_fonts = []
        for font_path in font_paths:
            if os.path.exists(font_path):
                for file in os.listdir(font_path):
                    if file.endswith(('.ttf', '.otf')):
                        # Extract font name without extension
                        font_name = os.path.splitext(file)[0]
                        # Clean up common variations
                        if 'arial' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('arial.ttf')
                        elif 'times' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('times.ttf')
                        elif 'courier' in font_name.lower() and 'bold' not in font_name.lower():
                            common_fonts.append('cour.ttf')
                        elif 'comic' in font_name.lower():
                            common_fonts.append('comic.ttf')
                        elif 'impact' in font_name.lower():
                            common_fonts.append('impact.ttf')
                        elif 'verdana' in font_name.lower():
                            common_fonts.append('verdana.ttf')
                        elif 'tahoma' in font_name.lower():
                            common_fonts.append('tahoma.ttf')
        # Add found fonts, fall back to common Windows fonts
        fonts = list(set(common_fonts)) if common_fonts else [
            'arial.ttf', 'times.ttf', 'cour.ttf', 'comic.ttf',
            'impact.ttf', 'verdana.ttf', 'tahoma.ttf'
        ]
    # Add option to use no font (system default)
    fonts.insert(0, 'System Default')
    return fonts

AVAILABLE_FONTS = get_system_fonts()

# Global settings with defaults
settings = {
    "subtitle_y_px": 1550,
    "highlight_offset": -8,
    "font_size_subtitle": 65,
    "font_size_highlight": 68,
    "highlight_x_offset": 0,
    "video_path": None,
    "font": "System Default",
    "subtitles": [],
    "current_index": 0
}

# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
    "Arial",
    "Times-Roman",
    "Helvetica",
    "Courier",
    "Comic-Sans-MS",
    "Impact",
    "Verdana",
    "Tahoma",
    "Georgia",
    "Trebuchet-MS"
]

preset_file = "subtitle_gui_presets.json"

# === SRT PARSER ===
def parse_srt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = f.read()
    pattern = r"(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\d+|\Z)"
    matches = re.findall(pattern, contents)
    subtitles = []
    for _, start, end, text in matches:
        subtitles.append({
            "start": srt_time_to_seconds(start),
            "end": srt_time_to_seconds(end),
            "text": text.replace('\n', ' ')
        })
    return subtitles

def srt_time_to_seconds(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000

# === PRESETS ===
def save_presets():
    with open(preset_file, "w") as f:
        json.dump(settings, f)
    print("📂 Presets saved!")

def load_presets():
    global settings
    try:
        with open(preset_file, "r") as f:
            loaded = json.load(f)
        settings.update(loaded)
        print("✅ Presets loaded!")
        sync_gui()
    except FileNotFoundError:
        print("⚠️ No presets found.")

# === SYNC ===
def sync_gui():
    sub_y_slider.set(settings["subtitle_y_px"])
    highlight_slider.set(settings["highlight_offset"])
    highlight_x_slider.set(settings["highlight_x_offset"])
    sub_font_slider.set(settings["font_size_subtitle"])
    highlight_font_slider.set(settings["font_size_highlight"])
    font_dropdown_var.set(settings["font"])

def render_preview():
    if not settings["video_path"] or not settings["subtitles"]:
        print("⚠️ Video or subtitles not loaded.")
        return
    sub = settings["subtitles"][settings["current_index"]]
    subtitle_text = sub["text"]
    start_time = sub["start"]
    end_time = sub["end"]
    duration = end_time - start_time
    clip = VideoFileClip(settings["video_path"]).subclipped(start_time, end_time)
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    highlight_word = subtitle_text.split()[-1]  # Highlight last word for now
    # Create TextClip with font if specified, otherwise use system default
    if settings["font"] == "System Default":
        base_subtitle = TextClip(
            text=subtitle_text,
            font_size=settings["font_size_subtitle"],
            color='white',
            stroke_color='black',
            stroke_width=5
        ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
    else:
        try:
            base_subtitle = TextClip(
                text=subtitle_text,
                font=settings["font"],
                font_size=settings["font_size_subtitle"],
                color='white',
                stroke_color='black',
                stroke_width=5
            ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
        except:
            # Fallback to system default if font fails
            print(f"⚠️ Font {settings['font']} failed, using system default")
            base_subtitle = TextClip(
                text=subtitle_text,
                font_size=settings["font_size_subtitle"],
                color='white',
                stroke_color='black',
                stroke_width=5
            ).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
    full_text = subtitle_text.upper()
    words = full_text.split()
    try:
        highlight_index = words.index(highlight_word.upper())
    except ValueError:
        highlight_index = len(words) - 1
    chars_before = sum(len(w) + 1 for w in words[:highlight_index])
    char_width = 35
    total_width = len(full_text) * char_width
    x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
    # Create highlighted word with same font logic
    if settings["font"] == "System Default":
        highlighted_word = TextClip(
            text=highlight_word,
            font_size=settings["font_size_highlight"],
            color='#FFD700',
            stroke_color='#FF6B35',
            stroke_width=5
        ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    else:
        try:
            highlighted_word = TextClip(
                text=highlight_word,
                font=settings["font"],
                font_size=settings["font_size_highlight"],
                color='#FFD700',
                stroke_color='#FF6B35',
                stroke_width=5
            ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
        except:
            # Fallback to system default if font fails
            highlighted_word = TextClip(
                text=highlight_word,
                font_size=settings["font_size_highlight"],
                color='#FFD700',
                stroke_color='#FF6B35',
                stroke_width=5
            ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
    final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
    # Scale down the preview to fit a 1080p monitor (max height ~900px to leave room for the taskbar)
    preview_scale = 900 / 1920  # Scale factor to fit height
    preview_width = int(1080 * preview_scale)
    preview_height = int(1920 * preview_scale)
    preview_clip = final.resized((preview_width, preview_height))
    preview_clip.preview(fps=24, audio=False)
    clip.close()
    final.close()
    preview_clip.close()

def update_setting(var_name, value):
    settings[var_name] = int(value) if var_name.startswith("font_size") or "offset" in var_name or "y_px" in var_name else value

def update_font(value):
    settings["font"] = value

def open_video():
    file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
    if file_path:
        settings["video_path"] = file_path
        print(f"📂 Loaded video: {file_path}")

def load_srt():
    file_path = filedialog.askopenfilename(filetypes=[("SRT Subtitle", "*.srt")])
    if file_path:
        settings["subtitles"] = parse_srt(file_path)
        settings["current_index"] = 0
        print(f"📝 Loaded {len(settings['subtitles'])} subtitles from {file_path}")

def next_sub():
    if settings["current_index"] < len(settings["subtitles"]) - 1:
        settings["current_index"] += 1
        start_preview_thread()

def prev_sub():
    if settings["current_index"] > 0:
        settings["current_index"] -= 1
        start_preview_thread()

def start_preview_thread():
    threading.Thread(target=render_preview).start()

# === GUI ===
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("420x700")

load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)

load_srt_btn = tk.Button(root, text="📑 Load SRT Subtitles", command=load_srt)
load_srt_btn.pack(pady=5)

tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
                        command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()

tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
                            command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()

tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
                              command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()

tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                           command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()

tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
                                 command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()

tk.Label(root, text="Font").pack()
font_dropdown_var = tk.StringVar(value=settings["font"])
font_dropdown = tk.OptionMenu(root, font_dropdown_var, *AVAILABLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)

preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)

nav_frame = tk.Frame(root)
tk.Button(nav_frame, text="⏮️ Prev", command=prev_sub).pack(side="left", padx=5)
tk.Button(nav_frame, text="⏭️ Next", command=next_sub).pack(side="right", padx=5)
nav_frame.pack(pady=5)

save_btn = tk.Button(root, text="📂 Save Preset", command=save_presets)
save_btn.pack(pady=5)

load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)

root.mainloop()
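
The SRT parser removed with app2.py is a self-contained regex plus a timestamp conversion. A standalone check of that conversion (the sample timestamp is illustrative):

# Same HH:MM:SS,mmm -> seconds conversion as srt_time_to_seconds above;
# the sample timestamp is an illustrative value, not taken from the diff.
def srt_time_to_seconds(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

assert srt_time_to_seconds("00:02:10,500") == 130.5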

myvideo.srt (new file, 22 lines)

@@ -0,0 +1,22 @@
1
00:00:30,000 --> 00:00:40,000
okay after we will

2
00:02:00,000 --> 00:02:10,000
find it difficult to believe we prepare
to fight arm and arm the corny and royal
family

3
00:02:20,000 --> 00:02:30,000
hello me

4
00:02:30,000 --> 00:02:40,000
as expected

5
00:02:40,000 --> 00:02:50,000
gacha

(modified file, name not shown in this view)

@@ -112,7 +112,8 @@ def create_short_clip(video_path, start, end, subtitles, output_path):
             word_width, _ = highlighted_word.size
             word_x = current_x + (word_width / 2)
-            highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125, subtitle_y_px))
+            highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125
+            , subtitle_y_px))
             clips.append(highlighted_word)
             current_x += word_width + 20  # Add spacing between words

sub2.srt (new file, 4 lines)

@@ -0,0 +1,4 @@
1
00:00:00,000 --> 00:00:08,250
Yeah, yeah, level she's 24.

(deleted file, name not shown in this view)

@@ -1,157 +0,0 @@
import os
import numpy as np
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
from faster_whisper import WhisperModel

def detect_loud_moments(video_path, chunk_duration=5, threshold_db=10):
    print("🔍 Analyzing audio...")
    clip = VideoFileClip(video_path)
    audio = clip.audio.to_soundarray(fps=44100)
    volume = np.linalg.norm(audio, axis=1)
    chunk_size = int(chunk_duration * 44100)
    loud_chunks = []
    max_db = -float('inf')
    for i in range(0, len(volume), chunk_size):
        chunk = volume[i:i+chunk_size]
        db = 20 * np.log10(np.mean(chunk) + 1e-10)
        max_db = max(max_db, db)
        if db > threshold_db:
            start = i / 44100
            loud_chunks.append((start, min(start + chunk_duration, clip.duration)))
    print(f"🔊 Max volume found: {max_db:.2f} dB, threshold: {threshold_db} dB")
    print(f"📈 Found {len(loud_chunks)} loud moments")
    clip.close()
    return loud_chunks

def transcribe_and_extract_subtitles(video_path, start, end):
    print(f"🗣️ Transcribing audio from {start:.2f}s to {end:.2f}s...")
    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, _ = model.transcribe(video_path, beam_size=5, language="en", vad_filter=True)
    subtitles = []
    for segment in segments:
        if start <= segment.start <= end:
            subtitles.append((segment.start - start, segment.end - start, segment.text))
    return subtitles

def create_short_clip(video_path, start, end, subtitles, output_path):
    print(f"🎬 Creating short: {output_path}")
    clip = VideoFileClip(video_path).subclipped(start, end)
    video_duration = clip.duration
    print(f"📏 Video clip duration: {video_duration:.2f}s")
    # Convert to vertical 9:16
    vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
    clips = [vertical_clip]
    for (s, e, text) in subtitles:
        try:
            # Ensure subtitle timing doesn't exceed video duration
            subtitle_start = max(0, s)
            subtitle_end = min(e, video_duration)
            if subtitle_start >= video_duration or subtitle_end <= subtitle_start:
                print(f"⚠️ Skipping subtitle outside video duration: {text[:30]}...")
                continue
            # Opus Clip style professional subtitles
            words = text.strip().split()
            if not words:
                continue
            # Break text into smaller chunks for better readability (max 3-4 words per line)
            chunks = []
            current_chunk = []
            for word in words:
                current_chunk.append(word)
                if len(current_chunk) >= 3 or len(' '.join(current_chunk)) > 25:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            # Position subtitles in the center-bottom area (Opus style)
            subtitle_position = 0.75
            # Create subtitle for each chunk with smooth transitions
            chunk_duration = (subtitle_end - subtitle_start) / len(chunks)
            for chunk_idx, chunk_text in enumerate(chunks):
                chunk_start = subtitle_start + (chunk_idx * chunk_duration)
                chunk_end = min(chunk_start + chunk_duration, subtitle_end)
                chunk_words = chunk_text.split()
                # Base subtitle with Opus-style design (bold white text with strong outline)
                base_subtitle = TextClip(
                    text=chunk_text.upper(),
                    font='C:/Windows/Fonts/LatoWeb-Bold.ttf',  # Lato Bold - excellent for subtitles
                    font_size=65,  # Larger, chunkier text
                    color='white',
                    stroke_color='black',
                    stroke_width=5  # Thicker outline for better readability
                )
                base_subtitle = base_subtitle.with_start(chunk_start).with_end(chunk_end).with_position(('center', subtitle_position), relative=True)
                clips.append(base_subtitle)
                # Opus-style word-by-word highlighting (yellow/orange like Opus)
                word_duration = chunk_duration / len(chunk_words)
                for i, word in enumerate(chunk_words):
                    word_start = chunk_start + (i * word_duration)
                    word_end = min(word_start + word_duration * 0.8, chunk_end)
                    # Opus-style highlighted word (vibrant yellow/orange)
                    highlighted_word = TextClip(
                        text=word.upper(),
                        font='C:/Windows/Fonts/LatoWeb-Bold.ttf',  # Lato Bold for consistency
                        font_size=68,  # Slightly larger for highlight effect
                        color='#FFD700',  # Gold/yellow like Opus Clip
                        stroke_color='#FF6B35',  # Orange outline for pop
                        stroke_width=5
                    )
                    # Calculate precise word positioning within the chunk
                    words_before = chunk_words[:i]
                    chars_before = sum(len(w) for w in words_before) + len(words_before)
                    # More accurate character width calculation for Arial Bold
                    char_width = 35  # Adjusted for larger, bolder font
                    total_chunk_width = len(chunk_text) * char_width
                    word_x_offset = (chars_before * char_width) - (total_chunk_width // 2)
                    highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((540 + word_x_offset, subtitle_position), relative=(False, True))
                    clips.append(highlighted_word)
            print(f"✅ Added Opus-style subtitle ({subtitle_start:.1f}s-{subtitle_end:.1f}s): {text[:30]}...")
        except Exception as e:
            print(f"⚠️ Subtitle error: {e}, skipping subtitle: {text[:50]}...")
            continue
    final = CompositeVideoClip(clips, size=(1080, 1920))
    final.write_videofile(output_path, codec="libx264", audio_codec="aac", threads=1)
    # 💥 Force close to avoid Windows pipe errors
    clip.reader.close()
    if clip.audio:
        clip.audio.reader.close()
    final.close()

def generate_shorts(video_path, max_clips=3, output_folder="shorts"):
    os.makedirs(output_folder, exist_ok=True)
    best_moments = detect_loud_moments(video_path, threshold_db=-30)
    selected = best_moments[:max_clips]
    for i, (start, end) in enumerate(selected):
        subtitles = transcribe_and_extract_subtitles(video_path, start, end)
        out_path = os.path.join(output_folder, f"short_{i+1}.mp4")
        create_short_clip(video_path, start, end, subtitles, out_path)

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python shorts_generator.py your_video.mp4")
    else:
        generate_shorts(sys.argv[1])
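
detect_loud_moments above scores fixed 5-second chunks with 20 * log10 of the mean channel norm. A small sketch of the same measure on synthetic tones shows the scale the -30 dB threshold operates on (np.abs stands in for the stereo channel norm; the amplitudes are illustrative):

# Sketch of the loudness measure used by detect_loud_moments, on synthetic tones.
import numpy as np

sr = 44100
t = np.linspace(0, 1, sr, endpoint=False)
for name, amp in [("quiet", 0.01), ("loud", 0.5)]:
    chunk = np.abs(amp * np.sin(2 * np.pi * 440 * t))
    db = 20 * np.log10(np.mean(chunk) + 1e-10)
    print(f"{name}: {db:.1f} dB")  # quiet ≈ -43.9 dB, loud ≈ -9.9 dB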

(modified file, name not shown in this view)

@@ -2,9 +2,12 @@ import os
 import math
 import tempfile
 import moviepy as mp
-import speech_recognition as sr
 import tkinter as tk
 from tkinter import filedialog, messagebox, ttk
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+import threading
+from faster_whisper import WhisperModel

 def format_time(seconds):

@@ -44,52 +47,71 @@ def write_srt(subtitles, output_path):
         f.write(f"{wrap_text(sub['text'])}\n\n")

-def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, chunk_duration=10):
+def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, model_size="base", language="auto"):
     try:
+        print("📽️ Loading video file...")
         video = mp.VideoFileClip(video_path)
         audio = video.audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
             temp_audio_path = temp_audio_file.name
+        print("🔊 Extracting audio...")
         audio.write_audiofile(temp_audio_path, logger=None)
-        recognizer = sr.Recognizer()
+        print(f"🤖 Loading Whisper model ({model_size})...")
+        # Initialize Whisper model - much more accurate than Google Speech Recognition
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+        print("🎯 Transcribing with Whisper AI...")
+        # Transcribe the entire audio file at once - Whisper handles timing automatically
+        segments, info = model.transcribe(
+            temp_audio_path,
+            language=None if language == "auto" else language,
+            word_timestamps=True,
+            vad_filter=True,  # Voice Activity Detection for better accuracy
+            vad_parameters=dict(min_silence_duration_ms=500)
+        )
+        print(f"🌍 Detected language: {info.language} (confidence: {info.language_probability:.2f})")
         subtitles = []
-        with sr.AudioFile(temp_audio_path) as source:
-            audio_duration = source.DURATION
-            num_chunks = math.ceil(audio_duration / chunk_duration)
-            for i in range(num_chunks):
-                start_time = i * chunk_duration
-                end_time = min((i + 1) * chunk_duration, audio_duration)
-                source_offset = start_time
-                duration = end_time - start_time
-                audio_data = recognizer.record(source, offset=source_offset, duration=duration)
-                try:
-                    text = recognizer.recognize_google(audio_data)
-                    subtitles.append({
-                        "start": start_time,
-                        "end": end_time,
-                        "text": text
-                    })
-                except sr.UnknownValueError:
-                    pass
-                except sr.RequestError as e:
-                    print(f"API error: {e}")
-                # Update progress bar
-                if progress_callback:
-                    progress_callback(i + 1, num_chunks)
-        os.remove(temp_audio_path)
+        segment_list = list(segments)  # Convert generator to list for progress tracking
+        print(f"📝 Processing {len(segment_list)} speech segments...")
+        for i, segment in enumerate(segment_list):
+            # Whisper provides precise timing and text
+            start_time = segment.start
+            end_time = segment.end
+            text = segment.text.strip()
+            if text and len(text) > 0:
+                subtitles.append({
+                    "start": start_time,
+                    "end": end_time,
+                    "text": text
+                })
+                print(f"✅ Segment {i+1}: '{text[:50]}...' ({start_time:.1f}s - {end_time:.1f}s)")
+            # Update progress bar
+            if progress_callback:
+                progress_callback(i + 1, len(segment_list))
+        # Clean up
+        if os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)
+        if video:
+            video.close()
+        if audio:
+            audio.close()
+        print(f"🎯 Generated {len(subtitles)} subtitle segments with Whisper AI")
         write_srt(subtitles, srt_output_path)
         return True
     except Exception as e:
-        print(f"Error: {e}")
+        print(f"❌ Error: {e}")
         return False

@@ -98,7 +120,7 @@ def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None,

 def select_file_and_generate():
     video_path = filedialog.askopenfilename(
         title="Select a video file",
-        filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv")]
+        filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")]
     )
     if not video_path:

@@ -113,43 +135,127 @@ def select_file_and_generate():
     if not srt_output_path:
         return

+    # Disable button during processing
+    select_button.config(state="disabled", text="Processing...")
     progress_bar["value"] = 0
-    progress_label.config(text="Starting...")
+    progress_label.config(text="Starting speech recognition...")
+    status_label.config(text="🔄 Processing video...", fg="blue")
     root.update()

     def update_progress(current, total):
         percent = (current / total) * 100
         progress_bar["value"] = percent
-        progress_label.config(text=f"Progress: {current}/{total} chunks")
+        progress_label.config(text=f"Processing: {current}/{total} segments")
         root.update()

-    success = transcribe_video_to_srt(video_path, srt_output_path, progress_callback=update_progress)
-    if success:
-        messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}")
-    else:
-        messagebox.showerror("Error", "Something went wrong. See console for details.")
-    progress_label.config(text="Done")
+    def process_video():
+        try:
+            model_size = model_size_var.get()
+            language = language_var.get()
+            success = transcribe_video_to_srt(
+                video_path,
+                srt_output_path,
+                progress_callback=update_progress,
+                model_size=model_size,
+                language=language
+            )
+            if success:
+                status_label.config(text="✅ Subtitles generated successfully!", fg="green")
+                messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!")
+            else:
+                status_label.config(text="❌ Error occurred during processing", fg="red")
+                messagebox.showerror("Error", "Something went wrong. Check console for details.")
+        except Exception as e:
+            status_label.config(text="❌ Unexpected error occurred", fg="red")
+            messagebox.showerror("Error", f"Unexpected error: {e}")
+        finally:
+            # Re-enable button
+            select_button.config(state="normal", text="📂 Select Video and Generate Subtitles")
+            progress_label.config(text="Done")
+
+    # Run in separate thread to prevent GUI freezing
+    thread = threading.Thread(target=process_video)
+    thread.daemon = True
+    thread.start()

 # GUI Setup
 root = tk.Tk()
-root.title("Auto Subtitle Generator (.srt) with Progress")
+root.title("🎬 Auto Subtitle Generator - Speech to SRT")
+root.geometry("500x350")

 frame = tk.Frame(root, padx=20, pady=20)
-frame.pack()
+frame.pack(fill="both", expand=True)

-label = tk.Label(frame, text="Select a video file to auto-generate subtitles (SRT):")
-label.pack(pady=(0, 10))
+# Title
+title_label = tk.Label(frame, text="🎬 Auto Subtitle Generator", font=("Arial", 16, "bold"))
+title_label.pack(pady=(0, 10))

-select_button = tk.Button(frame, text="Select Video and Generate Subtitles", command=select_file_and_generate)
-select_button.pack(pady=5)
+subtitle_label = tk.Label(frame, text="Extract speech from video and create perfectly timed SRT subtitles", font=("Arial", 10))
+subtitle_label.pack(pady=(0, 20))

-progress_bar = ttk.Progressbar(frame, length=300, mode="determinate")
-progress_bar.pack(pady=(15, 5))
+# Settings Frame
+settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10)
+settings_frame.pack(fill="x", pady=(0, 15))

-progress_label = tk.Label(frame, text="Idle")
+# Model Size Selection
+model_frame = tk.Frame(settings_frame)
+model_frame.pack(fill="x", pady=(0, 10))
+tk.Label(model_frame, text="🧠 Model Size:", font=("Arial", 9)).pack(side="left")
+model_size_var = tk.StringVar(value="base")
+model_dropdown = ttk.Combobox(model_frame, textvariable=model_size_var,
+                              values=["tiny", "base", "small", "medium", "large"],
+                              state="readonly", width=12)
+model_dropdown.pack(side="right")
+
+# Language Selection
+language_frame = tk.Frame(settings_frame)
+language_frame.pack(fill="x", pady=(0, 10))
+tk.Label(language_frame, text="🌍 Language:", font=("Arial", 9)).pack(side="left")
+language_var = tk.StringVar(value="auto")
+language_dropdown = ttk.Combobox(language_frame, textvariable=language_var,
+                                 values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"],
+                                 state="readonly", width=12)
+language_dropdown.pack(side="right")
+
+# Help text
+help_label = tk.Label(
+    settings_frame,
+    text=" 💡 Base model recommended for best speed/accuracy balance\n 🔍 Auto language detection works for most videos",
+    font=("Arial", 8),
+    fg="gray"
+)
+help_label.pack(anchor="w")
+
+# Main Action Button
+select_button = tk.Button(
+    frame,
+    text="📂 Select Video and Generate Subtitles",
+    command=select_file_and_generate,
+    font=("Arial", 11, "bold"),
+    bg="#4CAF50",
+    fg="white",
+    pady=8
+)
+select_button.pack(pady=15, fill="x")
+
+# Progress Section
+progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10)
+progress_frame.pack(fill="x", pady=(0, 10))
+progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate")
+progress_bar.pack(fill="x", pady=(0, 5))
+progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9))
 progress_label.pack()

+# Status Label
+status_label = tk.Label(frame, text="💡 Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!", font=("Arial", 9), fg="blue")
+status_label.pack(pady=(10, 0))
+
 root.mainloop()
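
The diff calls write_srt and format_time without showing their bodies. A hedged sketch of the SRT shape the new pipeline writes; format_time here is an assumed reconstruction, not the file's actual implementation (the sample cue mirrors sub2.srt above):

# Hedged sketch: format_time below is an assumed implementation, written only
# to show the SRT shape write_srt produces; the real body is not in this diff.
def format_time(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int(round((seconds - int(seconds)) * 1000))
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

subtitles = [{"start": 0.0, "end": 8.25, "text": "Yeah, yeah, level she's 24."}]
with open("out.srt", "w", encoding="utf-8") as f:
    for i, sub in enumerate(subtitles, start=1):
        f.write(f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n")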

(deleted file, name not shown in this view)

@@ -1,3 +0,0 @@
1
00:00:00,000 --> 00:00:02,500
You're running

test_whisper.py (new file, 37 lines)

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Test script to verify faster_whisper integration
"""
import os
from faster_whisper import WhisperModel

def test_whisper_setup():
    """Test if faster_whisper is working correctly"""
    print("🧪 Testing faster_whisper setup...")
    try:
        # Try to initialize the smallest model
        print("📥 Loading tiny model (this might take a moment on first run)...")
        model = WhisperModel("tiny")
        print("✅ Successfully loaded Whisper tiny model!")
        # Check available models
        available_models = ["tiny", "base", "small", "medium", "large"]
        print(f"🎯 Available models: {', '.join(available_models)}")
        # Test basic functionality with a short audio
        print("🔍 Whisper model ready for transcription!")
        return True
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

if __name__ == "__main__":
    if test_whisper_setup():
        print("\n🎉 faster_whisper is ready to use!")
        print("💡 Your subtitle generator now has much better speech recognition!")
    else:
        print("\n⚠️ There might be an issue with faster_whisper setup")