Refactor subtitle generation to integrate Whisper AI; remove unused files and enhance the GUI for a better user experience

klop51 2025-08-09 10:35:13 +02:00
parent 5ce79f084d
commit 491040b148
10 changed files with 223 additions and 710 deletions
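
The core of the change swaps the old chunked Google Speech Recognition loop for a single faster-whisper pass with word timestamps and voice-activity filtering (see the subtitle generator diff below). A minimal sketch of the new transcription flow, with an illustrative audio path:

from faster_whisper import WhisperModel

# Load a local Whisper model; "base" is the new GUI's default size.
model = WhisperModel("base", device="cpu", compute_type="int8")

# One pass over the whole file -- Whisper computes segment timing itself.
segments, info = model.transcribe(
    "audio.wav",                  # illustrative path
    language=None,                # None = auto-detect
    word_timestamps=True,
    vad_filter=True,              # voice activity detection trims silence
    vad_parameters=dict(min_silence_duration_ms=500),
)
print(f"Detected language: {info.language} ({info.language_probability:.2f})")
subtitles = [
    {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
    for seg in segments
    if seg.text.strip()
]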

175
app.py

@@ -1,175 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
# Global settings with defaults
settings = {
"subtitle_y_px": 1550,
"highlight_offset": -8,
"font_size_subtitle": 65,
"font_size_highlight": 68,
"highlight_x_offset": 0,
"video_path": None,
"selected_font": "Arial" # Default font
}
# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
"Arial",
"Times-Roman",
"Helvetica",
"Courier",
"Comic-Sans-MS",
"Impact",
"Verdana",
"Tahoma",
"Georgia",
"Trebuchet-MS"
]
preset_file = "subtitle_gui_presets.json"
def save_presets():
with open(preset_file, "w") as f:
json.dump(settings, f)
print("💾 Presets saved!")
def load_presets():
global settings
try:
with open(preset_file, "r") as f:
loaded = json.load(f)
settings.update(loaded)
print("✅ Presets loaded!")
sync_gui()
except FileNotFoundError:
print("⚠️ No presets found.")
def sync_gui():
sub_y_slider.set(settings["subtitle_y_px"])
highlight_slider.set(settings["highlight_offset"])
highlight_x_slider.set(settings["highlight_x_offset"])
sub_font_slider.set(settings["font_size_subtitle"])
highlight_font_slider.set(settings["font_size_highlight"])
font_var.set(settings["selected_font"])
def render_preview():
if not settings["video_path"]:
print("⚠️ No video selected.")
return
clip = VideoFileClip(settings["video_path"]).subclipped(0, 3) # Use first 3 seconds
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
subtitle_text = "THIS IS A TEST SUBTITLE"
highlight_word = "SUBTITLE"
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
font=settings["selected_font"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(3).with_position(('center', settings["subtitle_y_px"]))
# Compute highlight word position
full_text = subtitle_text.upper()
words = full_text.split()
highlight_index = words.index(highlight_word.upper())
chars_before = sum(len(w) + 1 for w in words[:highlight_index])
char_width = 35
total_width = len(full_text) * char_width
x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
font=settings["selected_font"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(1.5).with_start(0.75).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
# Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar)
preview_scale = 900 / 1920 # Scale factor to fit height
preview_width = int(1080 * preview_scale)
preview_height = int(1920 * preview_scale)
preview_clip = final.resized((preview_width, preview_height))
preview_clip.preview(fps=24, audio=False)
clip.close()
final.close()
preview_clip.close()
def update_setting(var_name, value):
settings[var_name] = int(value)
def update_font(font_name):
settings["selected_font"] = font_name
def open_video():
file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
if file_path:
settings["video_path"] = file_path
print(f"📂 Loaded video: {file_path}")
def start_preview_thread():
threading.Thread(target=render_preview).start()
# GUI Setup
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("400x600")
load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)
tk.Label(root, text="Font Family").pack()
font_var = tk.StringVar(value=settings["selected_font"])
font_dropdown = tk.OptionMenu(root, font_var, *COMPATIBLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)
tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()
tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()
tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()
tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()
tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()
preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)
save_btn = tk.Button(root, text="💾 Save Preset", command=save_presets)
save_btn.pack(pady=5)
load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)
root.mainloop()
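
Aside: the highlight placement above relies on a fixed-width approximation (35 px per character, centered at x = 540 on the 1080 px canvas). A worked check of that arithmetic for the built-in test line, under the default settings:

# Reproduces the x_offset math in render_preview() above.
text = "THIS IS A TEST SUBTITLE"
words = text.split()                                      # "SUBTITLE" is word index 4
chars_before = sum(len(w) + 1 for w in words[:4])         # 15 chars, spaces included
char_width = 35
total_width = len(text) * char_width                      # 23 * 35 = 805
x_offset = chars_before * char_width - total_width // 2   # 525 - 402 = 123
# Final highlight x position: 540 + 123 = 663 px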

322
app2.py

@@ -1,322 +0,0 @@
import tkinter as tk
from tkinter import filedialog
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
import threading
import json
import re
import os
import platform
def get_system_fonts():
"""Get list of available system fonts"""
fonts = []
if platform.system() == "Windows":
# Common Windows font paths
font_paths = [
"C:/Windows/Fonts/",
"C:/Windows/System32/Fonts/"
]
common_fonts = []
for font_path in font_paths:
if os.path.exists(font_path):
for file in os.listdir(font_path):
if file.endswith(('.ttf', '.otf')):
# Extract font name without extension
font_name = os.path.splitext(file)[0]
# Clean up common variations
if 'arial' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('arial.ttf')
elif 'times' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('times.ttf')
elif 'courier' in font_name.lower() and 'bold' not in font_name.lower():
common_fonts.append('cour.ttf')
elif 'comic' in font_name.lower():
common_fonts.append('comic.ttf')
elif 'impact' in font_name.lower():
common_fonts.append('impact.ttf')
elif 'verdana' in font_name.lower():
common_fonts.append('verdana.ttf')
elif 'tahoma' in font_name.lower():
common_fonts.append('tahoma.ttf')
# Add found fonts, fallback to common Windows fonts
fonts = list(set(common_fonts)) if common_fonts else [
'arial.ttf', 'times.ttf', 'cour.ttf', 'comic.ttf',
'impact.ttf', 'verdana.ttf', 'tahoma.ttf'
]
# Add option to use no font (system default)
fonts.insert(0, 'System Default')
return fonts
AVAILABLE_FONTS = get_system_fonts()
# Global settings with defaults
settings = {
"subtitle_y_px": 1550,
"highlight_offset": -8,
"font_size_subtitle": 65,
"font_size_highlight": 68,
"highlight_x_offset": 0,
"video_path": None,
"font": "System Default",
"subtitles": [],
"current_index": 0
}
# Compatible fonts that work across different systems
COMPATIBLE_FONTS = [
"Arial",
"Times-Roman",
"Helvetica",
"Courier",
"Comic-Sans-MS",
"Impact",
"Verdana",
"Tahoma",
"Georgia",
"Trebuchet-MS"
]
preset_file = "subtitle_gui_presets.json"
# === SRT PARSER ===
def parse_srt(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
contents = f.read()
pattern = r"(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\d+|\Z)"
matches = re.findall(pattern, contents)
subtitles = []
for _, start, end, text in matches:
subtitles.append({
"start": srt_time_to_seconds(start),
"end": srt_time_to_seconds(end),
"text": text.replace('\n', ' ')
})
return subtitles
def srt_time_to_seconds(time_str):
h, m, s_ms = time_str.split(':')
s, ms = s_ms.split(',')
return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000
# === PRESETS ===
def save_presets():
with open(preset_file, "w") as f:
json.dump(settings, f)
print("📂 Presets saved!")
def load_presets():
global settings
try:
with open(preset_file, "r") as f:
loaded = json.load(f)
settings.update(loaded)
print("✅ Presets loaded!")
sync_gui()
except FileNotFoundError:
print("⚠️ No presets found.")
# === SYNC ===
def sync_gui():
sub_y_slider.set(settings["subtitle_y_px"])
highlight_slider.set(settings["highlight_offset"])
highlight_x_slider.set(settings["highlight_x_offset"])
sub_font_slider.set(settings["font_size_subtitle"])
highlight_font_slider.set(settings["font_size_highlight"])
font_dropdown_var.set(settings["font"])
def render_preview():
if not settings["video_path"] or not settings["subtitles"]:
print("⚠️ Video or subtitles not loaded.")
return
sub = settings["subtitles"][settings["current_index"]]
subtitle_text = sub["text"]
start_time = sub["start"]
end_time = sub["end"]
duration = end_time - start_time
clip = VideoFileClip(settings["video_path"]).subclipped(start_time, end_time)
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
highlight_word = subtitle_text.split()[-1] # Highlight last word for now
# Create TextClip with font if specified, otherwise use system default
if settings["font"] == "System Default":
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
else:
try:
base_subtitle = TextClip(
text=subtitle_text,
font=settings["font"],
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
except:
# Fallback to system default if font fails
print(f"⚠️ Font {settings['font']} failed, using system default")
base_subtitle = TextClip(
text=subtitle_text,
font_size=settings["font_size_subtitle"],
color='white',
stroke_color='black',
stroke_width=5
).with_duration(duration).with_position(('center', settings["subtitle_y_px"]))
full_text = subtitle_text.upper()
words = full_text.split()
try:
highlight_index = words.index(highlight_word.upper())
except ValueError:
highlight_index = len(words) - 1
chars_before = sum(len(w) + 1 for w in words[:highlight_index])
char_width = 35
total_width = len(full_text) * char_width
x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"]
# Create highlighted word with same font logic
if settings["font"] == "System Default":
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
else:
try:
highlighted_word = TextClip(
text=highlight_word,
font=settings["font"],
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
except:
# Fallback to system default if font fails
highlighted_word = TextClip(
text=highlight_word,
font_size=settings["font_size_highlight"],
color='#FFD700',
stroke_color='#FF6B35',
stroke_width=5
).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"]))
final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920))
# Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar)
preview_scale = 900 / 1920 # Scale factor to fit height
preview_width = int(1080 * preview_scale)
preview_height = int(1920 * preview_scale)
preview_clip = final.resized((preview_width, preview_height))
preview_clip.preview(fps=24, audio=False)
clip.close()
final.close()
preview_clip.close()
def update_setting(var_name, value):
settings[var_name] = int(value) if var_name.startswith("font_size") or "offset" in var_name or "y_px" in var_name else value
def update_font(value):
settings["font"] = value
def open_video():
file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")])
if file_path:
settings["video_path"] = file_path
print(f"📂 Loaded video: {file_path}")
def load_srt():
file_path = filedialog.askopenfilename(filetypes=[("SRT Subtitle", "*.srt")])
if file_path:
settings["subtitles"] = parse_srt(file_path)
settings["current_index"] = 0
print(f"📝 Loaded {len(settings['subtitles'])} subtitles from {file_path}")
def next_sub():
if settings["current_index"] < len(settings["subtitles"]) - 1:
settings["current_index"] += 1
start_preview_thread()
def prev_sub():
if settings["current_index"] > 0:
settings["current_index"] -= 1
start_preview_thread()
def start_preview_thread():
threading.Thread(target=render_preview).start()
# === GUI ===
root = tk.Tk()
root.title("Subtitle Positioning Tool")
root.geometry("420x700")
load_btn = tk.Button(root, text="🎥 Load Video", command=open_video)
load_btn.pack(pady=10)
load_srt_btn = tk.Button(root, text="📑 Load SRT Subtitles", command=load_srt)
load_srt_btn.pack(pady=5)
tk.Label(root, text="Subtitle Y Position").pack()
sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal",
command=lambda v: update_setting("subtitle_y_px", v))
sub_y_slider.set(settings["subtitle_y_px"])
sub_y_slider.pack()
tk.Label(root, text="Highlight Y Offset").pack()
highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal",
command=lambda v: update_setting("highlight_offset", v))
highlight_slider.set(settings["highlight_offset"])
highlight_slider.pack()
tk.Label(root, text="Highlight X Offset").pack()
highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal",
command=lambda v: update_setting("highlight_x_offset", v))
highlight_x_slider.set(settings["highlight_x_offset"])
highlight_x_slider.pack()
tk.Label(root, text="Subtitle Font Size").pack()
sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_subtitle", v))
sub_font_slider.set(settings["font_size_subtitle"])
sub_font_slider.pack()
tk.Label(root, text="Highlight Font Size").pack()
highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal",
command=lambda v: update_setting("font_size_highlight", v))
highlight_font_slider.set(settings["font_size_highlight"])
highlight_font_slider.pack()
tk.Label(root, text="Font").pack()
font_dropdown_var = tk.StringVar(value=settings["font"])
font_dropdown = tk.OptionMenu(root, font_dropdown_var, *AVAILABLE_FONTS, command=update_font)
font_dropdown.pack(pady=5)
preview_btn = tk.Button(root, text="▶️ Preview Clip", command=start_preview_thread)
preview_btn.pack(pady=10)
nav_frame = tk.Frame(root)
tk.Button(nav_frame, text="⏮️ Prev", command=prev_sub).pack(side="left", padx=5)
tk.Button(nav_frame, text="⏭️ Next", command=next_sub).pack(side="right", padx=5)
nav_frame.pack(pady=5)
save_btn = tk.Button(root, text="📂 Save Preset", command=save_presets)
save_btn.pack(pady=5)
load_preset_btn = tk.Button(root, text="📂 Load Preset", command=load_presets)
load_preset_btn.pack(pady=5)
root.mainloop()
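
The SRT timing conversion above is easy to verify by hand; for example:

# srt_time_to_seconds("00:02:10,500") from app2.py, step by step
h, m, s_ms = "00:02:10,500".split(":")
s, ms = s_ms.split(",")
seconds = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000   # 130.5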

22
myvideo.srt Normal file

@@ -0,0 +1,22 @@
1
00:00:30,000 --> 00:00:40,000
okay after we will

2
00:02:00,000 --> 00:02:10,000
find it difficult to believe we prepare
to fight arm and arm the corny and royal
family

3
00:02:20,000 --> 00:02:30,000
hello me

4
00:02:30,000 --> 00:02:40,000
as expected

5
00:02:40,000 --> 00:02:50,000
gacha


@@ -112,7 +112,8 @@ def create_short_clip(video_path, start, end, subtitles, output_path):
word_width, _ = highlighted_word.size
word_x = current_x + (word_width / 2)
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125, subtitle_y_px))
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125
, subtitle_y_px))
clips.append(highlighted_word)
current_x += word_width + 20 # Add spacing between words

4
sub2.srt Normal file

@@ -0,0 +1,4 @@
1
00:00:00,000 --> 00:00:08,250
Yeah, yeah, level she's 24.


@@ -1,157 +0,0 @@
import os
import numpy as np
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
from faster_whisper import WhisperModel
def detect_loud_moments(video_path, chunk_duration=5, threshold_db=10):
print("🔍 Analyzing audio...")
clip = VideoFileClip(video_path)
audio = clip.audio.to_soundarray(fps=44100)
volume = np.linalg.norm(audio, axis=1)
chunk_size = int(chunk_duration * 44100)
loud_chunks = []
max_db = -float('inf')
for i in range(0, len(volume), chunk_size):
chunk = volume[i:i+chunk_size]
db = 20 * np.log10(np.mean(chunk) + 1e-10)
max_db = max(max_db, db)
if db > threshold_db:
start = i / 44100
loud_chunks.append((start, min(start + chunk_duration, clip.duration)))
print(f"🔊 Max volume found: {max_db:.2f} dB, threshold: {threshold_db} dB")
print(f"📈 Found {len(loud_chunks)} loud moments")
clip.close()
return loud_chunks
def transcribe_and_extract_subtitles(video_path, start, end):
print(f"🗣️ Transcribing audio from {start:.2f}s to {end:.2f}s...")
model = WhisperModel("base", device="cpu", compute_type="int8")
segments, _ = model.transcribe(video_path, beam_size=5, language="en", vad_filter=True)
subtitles = []
for segment in segments:
if start <= segment.start <= end:
subtitles.append((segment.start - start, segment.end - start, segment.text))
return subtitles
def create_short_clip(video_path, start, end, subtitles, output_path):
print(f"🎬 Creating short: {output_path}")
clip = VideoFileClip(video_path).subclipped(start, end)
video_duration = clip.duration
print(f"📏 Video clip duration: {video_duration:.2f}s")
# Convert to vertical 9:16
vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2)
clips = [vertical_clip]
for (s, e, text) in subtitles:
try:
# Ensure subtitle timing doesn't exceed video duration
subtitle_start = max(0, s)
subtitle_end = min(e, video_duration)
if subtitle_start >= video_duration or subtitle_end <= subtitle_start:
print(f"⚠️ Skipping subtitle outside video duration: {text[:30]}...")
continue
# Opus Clip style professional subtitles
words = text.strip().split()
if not words:
continue
# Break text into smaller chunks for better readability (max 3-4 words per line)
chunks = []
current_chunk = []
for word in words:
current_chunk.append(word)
if len(current_chunk) >= 3 or len(' '.join(current_chunk)) > 25:
chunks.append(' '.join(current_chunk))
current_chunk = []
if current_chunk:
chunks.append(' '.join(current_chunk))
# Position subtitles in the center-bottom area (Opus style)
subtitle_position = 0.75
# Create subtitle for each chunk with smooth transitions
chunk_duration = (subtitle_end - subtitle_start) / len(chunks)
for chunk_idx, chunk_text in enumerate(chunks):
chunk_start = subtitle_start + (chunk_idx * chunk_duration)
chunk_end = min(chunk_start + chunk_duration, subtitle_end)
chunk_words = chunk_text.split()
# Base subtitle with Opus-style design (bold white text with strong outline)
base_subtitle = TextClip(
text=chunk_text.upper(),
font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold - excellent for subtitles
font_size=65, # Larger, chunkier text
color='white',
stroke_color='black',
stroke_width=5 # Thicker outline for better readability
)
base_subtitle = base_subtitle.with_start(chunk_start).with_end(chunk_end).with_position(('center', subtitle_position), relative=True)
clips.append(base_subtitle)
# Opus-style word-by-word highlighting (yellow/orange like Opus)
word_duration = chunk_duration / len(chunk_words)
for i, word in enumerate(chunk_words):
word_start = chunk_start + (i * word_duration)
word_end = min(word_start + word_duration * 0.8, chunk_end)
# Opus-style highlighted word (vibrant yellow/orange)
highlighted_word = TextClip(
text=word.upper(),
font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold for consistency
font_size=68, # Slightly larger for highlight effect
color='#FFD700', # Gold/yellow like Opus Clip
stroke_color='#FF6B35', # Orange outline for pop
stroke_width=5
)
# Calculate precise word positioning within the chunk
words_before = chunk_words[:i]
chars_before = sum(len(w) for w in words_before) + len(words_before)
# More accurate character width calculation for Arial Bold
char_width = 35 # Adjusted for larger, bolder font
total_chunk_width = len(chunk_text) * char_width
word_x_offset = (chars_before * char_width) - (total_chunk_width // 2)
highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((540 + word_x_offset, subtitle_position), relative=(False, True))
clips.append(highlighted_word)
print(f"✅ Added Opus-style subtitle ({subtitle_start:.1f}s-{subtitle_end:.1f}s): {text[:30]}...")
except Exception as e:
print(f"⚠️ Subtitle error: {e}, skipping subtitle: {text[:50]}...")
continue
final = CompositeVideoClip(clips, size=(1080, 1920))
final.write_videofile(output_path, codec="libx264", audio_codec="aac", threads=1)
# 💥 Force close to avoid Windows pipe errors
clip.reader.close()
if clip.audio:
clip.audio.reader.close()
final.close()
def generate_shorts(video_path, max_clips=3, output_folder="shorts"):
os.makedirs(output_folder, exist_ok=True)
best_moments = detect_loud_moments(video_path, threshold_db=-30)
selected = best_moments[:max_clips]
for i, (start, end) in enumerate(selected):
subtitles = transcribe_and_extract_subtitles(video_path, start, end)
out_path = os.path.join(output_folder, f"short_{i+1}.mp4")
create_short_clip(video_path, start, end, subtitles, out_path)
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python shorts_generator.py your_video.mp4")
else:
generate_shorts(sys.argv[1])
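
For reference, the loudness scoring above converts each chunk's mean amplitude to decibels; a self-contained check with synthetic data:

import numpy as np

# Two 5-second chunks at 44.1 kHz: one quiet, one loud (constant amplitude).
rate, chunk_duration = 44100, 5
quiet = np.full(rate * chunk_duration, 0.001)
loud = np.full(rate * chunk_duration, 0.1)
for name, chunk in [("quiet", quiet), ("loud", loud)]:
    db = 20 * np.log10(np.mean(chunk) + 1e-10)   # +1e-10 avoids log10(0)
    print(name, round(db, 1))                    # quiet: -60.0, loud: -20.0
# With threshold_db=-30 (as in generate_shorts), only the loud chunk is kept.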


@@ -2,9 +2,12 @@ import os
import math
import tempfile
import moviepy as mp
import speech_recognition as sr
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from pydub import AudioSegment
from pydub.silence import split_on_silence
import threading
from faster_whisper import WhisperModel
def format_time(seconds):
@@ -44,52 +47,71 @@
f.write(f"{wrap_text(sub['text'])}\n\n")
def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, chunk_duration=10):
def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, model_size="base", language="auto"):
try:
print("📽️ Loading video file...")
video = mp.VideoFileClip(video_path)
audio = video.audio
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
temp_audio_path = temp_audio_file.name
print("🔊 Extracting audio...")
audio.write_audiofile(temp_audio_path, logger=None)
recognizer = sr.Recognizer()
print(f"🤖 Loading Whisper model ({model_size})...")
# Initialize Whisper model - much more accurate than Google Speech Recognition
model = WhisperModel(model_size, device="cpu", compute_type="int8")
print("🎯 Transcribing with Whisper AI...")
# Transcribe the entire audio file at once - Whisper handles timing automatically
segments, info = model.transcribe(
temp_audio_path,
language=None if language == "auto" else language,
word_timestamps=True,
vad_filter=True, # Voice Activity Detection for better accuracy
vad_parameters=dict(min_silence_duration_ms=500)
)
print(f"🌍 Detected language: {info.language} (confidence: {info.language_probability:.2f})")
subtitles = []
segment_list = list(segments) # Convert generator to list for progress tracking
print(f"📝 Processing {len(segment_list)} speech segments...")
for i, segment in enumerate(segment_list):
# Whisper provides precise timing and text
start_time = segment.start
end_time = segment.end
text = segment.text.strip()
if text and len(text) > 0:
subtitles.append({
"start": start_time,
"end": end_time,
"text": text
})
print(f"✅ Segment {i+1}: '{text[:50]}...' ({start_time:.1f}s - {end_time:.1f}s)")
# Update progress bar
if progress_callback:
progress_callback(i + 1, len(segment_list))
with sr.AudioFile(temp_audio_path) as source:
audio_duration = source.DURATION
num_chunks = math.ceil(audio_duration / chunk_duration)
for i in range(num_chunks):
start_time = i * chunk_duration
end_time = min((i + 1) * chunk_duration, audio_duration)
source_offset = start_time
duration = end_time - start_time
audio_data = recognizer.record(source, offset=source_offset, duration=duration)
try:
text = recognizer.recognize_google(audio_data)
subtitles.append({
"start": start_time,
"end": end_time,
"text": text
})
except sr.UnknownValueError:
pass
except sr.RequestError as e:
print(f"API error: {e}")
# Update progress bar
if progress_callback:
progress_callback(i + 1, num_chunks)
os.remove(temp_audio_path)
# Clean up
if os.path.exists(temp_audio_path):
os.remove(temp_audio_path)
if video:
video.close()
if audio:
audio.close()
print(f"🎯 Generated {len(subtitles)} subtitle segments with Whisper AI")
write_srt(subtitles, srt_output_path)
return True
except Exception as e:
print(f"Error: {e}")
print(f"❌ Error: {e}")
return False
@@ -98,7 +120,7 @@ def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None,
def select_file_and_generate():
video_path = filedialog.askopenfilename(
title="Select a video file",
filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv")]
filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")]
)
if not video_path:
@@ -113,43 +135,127 @@
if not srt_output_path:
return
# Disable button during processing
select_button.config(state="disabled", text="Processing...")
progress_bar["value"] = 0
progress_label.config(text="Starting...")
progress_label.config(text="Starting speech recognition...")
status_label.config(text="🔄 Processing video...", fg="blue")
root.update()
def update_progress(current, total):
percent = (current / total) * 100
progress_bar["value"] = percent
progress_label.config(text=f"Progress: {current}/{total} chunks")
progress_label.config(text=f"Processing: {current}/{total} segments")
root.update()
success = transcribe_video_to_srt(video_path, srt_output_path, progress_callback=update_progress)
def process_video():
try:
model_size = model_size_var.get()
language = language_var.get()
success = transcribe_video_to_srt(
video_path,
srt_output_path,
progress_callback=update_progress,
model_size=model_size,
language=language
)
if success:
messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}")
else:
messagebox.showerror("Error", "Something went wrong. See console for details.")
if success:
status_label.config(text="✅ Subtitles generated successfully!", fg="green")
messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!")
else:
status_label.config(text="❌ Error occurred during processing", fg="red")
messagebox.showerror("Error", "Something went wrong. Check console for details.")
progress_label.config(text="Done")
except Exception as e:
status_label.config(text="❌ Unexpected error occurred", fg="red")
messagebox.showerror("Error", f"Unexpected error: {e}")
finally:
# Re-enable button
select_button.config(state="normal", text="📂 Select Video and Generate Subtitles")
progress_label.config(text="Done")
# Run in separate thread to prevent GUI freezing
thread = threading.Thread(target=process_video)
thread.daemon = True
thread.start()
# GUI Setup
root = tk.Tk()
root.title("Auto Subtitle Generator (.srt) with Progress")
root.title("🎬 Auto Subtitle Generator - Speech to SRT")
root.geometry("500x350")
frame = tk.Frame(root, padx=20, pady=20)
frame.pack()
frame.pack(fill="both", expand=True)
label = tk.Label(frame, text="Select a video file to auto-generate subtitles (SRT):")
label.pack(pady=(0, 10))
# Title
title_label = tk.Label(frame, text="🎬 Auto Subtitle Generator", font=("Arial", 16, "bold"))
title_label.pack(pady=(0, 10))
select_button = tk.Button(frame, text="Select Video and Generate Subtitles", command=select_file_and_generate)
select_button.pack(pady=5)
subtitle_label = tk.Label(frame, text="Extract speech from video and create perfectly timed SRT subtitles", font=("Arial", 10))
subtitle_label.pack(pady=(0, 20))
progress_bar = ttk.Progressbar(frame, length=300, mode="determinate")
progress_bar.pack(pady=(15, 5))
# Settings Frame
settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10)
settings_frame.pack(fill="x", pady=(0, 15))
progress_label = tk.Label(frame, text="Idle")
# Model Size Selection
model_frame = tk.Frame(settings_frame)
model_frame.pack(fill="x", pady=(0, 10))
tk.Label(model_frame, text="🧠 Model Size:", font=("Arial", 9)).pack(side="left")
model_size_var = tk.StringVar(value="base")
model_dropdown = ttk.Combobox(model_frame, textvariable=model_size_var,
values=["tiny", "base", "small", "medium", "large"],
state="readonly", width=12)
model_dropdown.pack(side="right")
# Language Selection
language_frame = tk.Frame(settings_frame)
language_frame.pack(fill="x", pady=(0, 10))
tk.Label(language_frame, text="🌍 Language:", font=("Arial", 9)).pack(side="left")
language_var = tk.StringVar(value="auto")
language_dropdown = ttk.Combobox(language_frame, textvariable=language_var,
values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"],
state="readonly", width=12)
language_dropdown.pack(side="right")
# Help text
help_label = tk.Label(
settings_frame,
text=" 💡 Base model recommended for best speed/accuracy balance\n 🔍 Auto language detection works for most videos",
font=("Arial", 8),
fg="gray"
)
help_label.pack(anchor="w")
# Main Action Button
select_button = tk.Button(
frame,
text="📂 Select Video and Generate Subtitles",
command=select_file_and_generate,
font=("Arial", 11, "bold"),
bg="#4CAF50",
fg="white",
pady=8
)
select_button.pack(pady=15, fill="x")
# Progress Section
progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10)
progress_frame.pack(fill="x", pady=(0, 10))
progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate")
progress_bar.pack(fill="x", pady=(0, 5))
progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9))
progress_label.pack()
# Status Label
status_label = tk.Label(frame, text="💡 Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!", font=("Arial", 9), fg="blue")
status_label.pack(pady=(10, 0))
root.mainloop()
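
Since the refactored transcribe_video_to_srt exposes model size and language as parameters, it can also be driven without the GUI. A hedged sketch, assuming the script is importable (the module and file names here are placeholders):

# Hypothetical headless use; replace subtitle_srt_gui with the actual module name.
from subtitle_srt_gui import transcribe_video_to_srt

ok = transcribe_video_to_srt(
    "input.mp4",                                   # placeholder video path
    "input.srt",                                   # placeholder SRT output path
    progress_callback=lambda done, total: print(f"{done}/{total} segments"),
    model_size="base",
    language="auto",
)
print("Subtitles written" if ok else "Transcription failed")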


@@ -1,3 +0,0 @@
1
00:00:00,000 --> 00:00:02,500
You're running

37
test_whisper.py Normal file

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Test script to verify faster_whisper integration
"""
import os
from faster_whisper import WhisperModel
def test_whisper_setup():
"""Test if faster_whisper is working correctly"""
print("🧪 Testing faster_whisper setup...")
try:
# Try to initialize the smallest model
print("📥 Loading tiny model (this might take a moment on first run)...")
model = WhisperModel("tiny")
print("✅ Successfully loaded Whisper tiny model!")
# Check available models
available_models = ["tiny", "base", "small", "medium", "large"]
print(f"🎯 Available models: {', '.join(available_models)}")
# Test basic functionality with a short audio
print("🔍 Whisper model ready for transcription!")
return True
except Exception as e:
print(f"❌ Error: {e}")
return False
if __name__ == "__main__":
if test_whisper_setup():
print("\n🎉 faster_whisper is ready to use!")
print("💡 Your subtitle generator now has much better speech recognition!")
else:
print("\n⚠️ There might be an issue with faster_whisper setup")