diff --git a/app.py b/app.py index 5fe5437..e69de29 100644 --- a/app.py +++ b/app.py @@ -1,175 +0,0 @@ -import tkinter as tk -from tkinter import filedialog -from moviepy import VideoFileClip, TextClip, CompositeVideoClip -import threading -import json - -# Global settings with defaults -settings = { - "subtitle_y_px": 1550, - "highlight_offset": -8, - "font_size_subtitle": 65, - "font_size_highlight": 68, - "highlight_x_offset": 0, - "video_path": None, - "selected_font": "Arial" # Default font -} - -# Compatible fonts that work across different systems -COMPATIBLE_FONTS = [ - "Arial", - "Times-Roman", - "Helvetica", - "Courier", - "Comic-Sans-MS", - "Impact", - "Verdana", - "Tahoma", - "Georgia", - "Trebuchet-MS" -] - -preset_file = "subtitle_gui_presets.json" - -def save_presets(): - with open(preset_file, "w") as f: - json.dump(settings, f) - print("๐Ÿ’พ Presets saved!") - -def load_presets(): - global settings - try: - with open(preset_file, "r") as f: - loaded = json.load(f) - settings.update(loaded) - print("โœ… Presets loaded!") - sync_gui() - except FileNotFoundError: - print("โš ๏ธ No presets found.") - -def sync_gui(): - sub_y_slider.set(settings["subtitle_y_px"]) - highlight_slider.set(settings["highlight_offset"]) - highlight_x_slider.set(settings["highlight_x_offset"]) - sub_font_slider.set(settings["font_size_subtitle"]) - highlight_font_slider.set(settings["font_size_highlight"]) - font_var.set(settings["selected_font"]) - -def render_preview(): - if not settings["video_path"]: - print("โš ๏ธ No video selected.") - return - - clip = VideoFileClip(settings["video_path"]).subclipped(0, 3) # Use first 3 seconds - vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2) - - subtitle_text = "THIS IS A TEST SUBTITLE" - highlight_word = "SUBTITLE" - - base_subtitle = TextClip( - text=subtitle_text, - font_size=settings["font_size_subtitle"], - font=settings["selected_font"], - color='white', - stroke_color='black', - stroke_width=5 - ).with_duration(3).with_position(('center', settings["subtitle_y_px"])) - - # Compute highlight word position - full_text = subtitle_text.upper() - words = full_text.split() - highlight_index = words.index(highlight_word.upper()) - chars_before = sum(len(w) + 1 for w in words[:highlight_index]) - char_width = 35 - total_width = len(full_text) * char_width - x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"] - - highlighted_word = TextClip( - text=highlight_word, - font_size=settings["font_size_highlight"], - font=settings["selected_font"], - color='#FFD700', - stroke_color='#FF6B35', - stroke_width=5 - ).with_duration(1.5).with_start(0.75).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"])) - - final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920)) - # Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar) - preview_scale = 900 / 1920 # Scale factor to fit height - preview_width = int(1080 * preview_scale) - preview_height = int(1920 * preview_scale) - preview_clip = final.resized((preview_width, preview_height)) - preview_clip.preview(fps=24, audio=False) - - clip.close() - final.close() - preview_clip.close() - -def update_setting(var_name, value): - settings[var_name] = int(value) - -def update_font(font_name): - settings["selected_font"] = font_name - -def open_video(): - file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")]) - if file_path: - 
settings["video_path"] = file_path - print(f"๐Ÿ“‚ Loaded video: {file_path}") - -def start_preview_thread(): - threading.Thread(target=render_preview).start() - -# GUI Setup -root = tk.Tk() -root.title("Subtitle Positioning Tool") -root.geometry("400x600") - -load_btn = tk.Button(root, text="๐ŸŽฅ Load Video", command=open_video) -load_btn.pack(pady=10) - -tk.Label(root, text="Font Family").pack() -font_var = tk.StringVar(value=settings["selected_font"]) -font_dropdown = tk.OptionMenu(root, font_var, *COMPATIBLE_FONTS, command=update_font) -font_dropdown.pack(pady=5) - -tk.Label(root, text="Subtitle Y Position").pack() -sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal", - command=lambda v: update_setting("subtitle_y_px", v)) -sub_y_slider.set(settings["subtitle_y_px"]) -sub_y_slider.pack() - -tk.Label(root, text="Highlight Y Offset").pack() -highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal", - command=lambda v: update_setting("highlight_offset", v)) -highlight_slider.set(settings["highlight_offset"]) -highlight_slider.pack() - -tk.Label(root, text="Highlight X Offset").pack() -highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal", - command=lambda v: update_setting("highlight_x_offset", v)) -highlight_x_slider.set(settings["highlight_x_offset"]) -highlight_x_slider.pack() - -tk.Label(root, text="Subtitle Font Size").pack() -sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal", - command=lambda v: update_setting("font_size_subtitle", v)) -sub_font_slider.set(settings["font_size_subtitle"]) -sub_font_slider.pack() - -tk.Label(root, text="Highlight Font Size").pack() -highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal", - command=lambda v: update_setting("font_size_highlight", v)) -highlight_font_slider.set(settings["font_size_highlight"]) -highlight_font_slider.pack() - -preview_btn = tk.Button(root, text="โ–ถ๏ธ Preview Clip", command=start_preview_thread) -preview_btn.pack(pady=10) - -save_btn = tk.Button(root, text="๐Ÿ’พ Save Preset", command=save_presets) -save_btn.pack(pady=5) - -load_preset_btn = tk.Button(root, text="๐Ÿ“‚ Load Preset", command=load_presets) -load_preset_btn.pack(pady=5) - -root.mainloop() diff --git a/app2.py b/app2.py index 939612d..e69de29 100644 --- a/app2.py +++ b/app2.py @@ -1,322 +0,0 @@ -import tkinter as tk -from tkinter import filedialog -from moviepy import VideoFileClip, TextClip, CompositeVideoClip -import threading -import json -import re -import os -import platform - -def get_system_fonts(): - """Get list of available system fonts""" - fonts = [] - - if platform.system() == "Windows": - # Common Windows font paths - font_paths = [ - "C:/Windows/Fonts/", - "C:/Windows/System32/Fonts/" - ] - - common_fonts = [] - for font_path in font_paths: - if os.path.exists(font_path): - for file in os.listdir(font_path): - if file.endswith(('.ttf', '.otf')): - # Extract font name without extension - font_name = os.path.splitext(file)[0] - # Clean up common variations - if 'arial' in font_name.lower() and 'bold' not in font_name.lower(): - common_fonts.append('arial.ttf') - elif 'times' in font_name.lower() and 'bold' not in font_name.lower(): - common_fonts.append('times.ttf') - elif 'courier' in font_name.lower() and 'bold' not in font_name.lower(): - common_fonts.append('cour.ttf') - elif 'comic' in font_name.lower(): - common_fonts.append('comic.ttf') - elif 'impact' in font_name.lower(): - common_fonts.append('impact.ttf') - elif 'verdana' in 
font_name.lower(): - common_fonts.append('verdana.ttf') - elif 'tahoma' in font_name.lower(): - common_fonts.append('tahoma.ttf') - - # Add found fonts, fallback to common Windows fonts - fonts = list(set(common_fonts)) if common_fonts else [ - 'arial.ttf', 'times.ttf', 'cour.ttf', 'comic.ttf', - 'impact.ttf', 'verdana.ttf', 'tahoma.ttf' - ] - - # Add option to use no font (system default) - fonts.insert(0, 'System Default') - return fonts - -AVAILABLE_FONTS = get_system_fonts() - -# Global settings with defaults -settings = { - "subtitle_y_px": 1550, - "highlight_offset": -8, - "font_size_subtitle": 65, - "font_size_highlight": 68, - "highlight_x_offset": 0, - "video_path": None, - "font": "System Default", - "subtitles": [], - "current_index": 0 -} - -# Compatible fonts that work across different systems -COMPATIBLE_FONTS = [ - "Arial", - "Times-Roman", - "Helvetica", - "Courier", - "Comic-Sans-MS", - "Impact", - "Verdana", - "Tahoma", - "Georgia", - "Trebuchet-MS" -] - -preset_file = "subtitle_gui_presets.json" - -# === SRT PARSER === -def parse_srt(file_path): - with open(file_path, 'r', encoding='utf-8') as f: - contents = f.read() - pattern = r"(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\d+|\Z)" - matches = re.findall(pattern, contents) - subtitles = [] - for _, start, end, text in matches: - subtitles.append({ - "start": srt_time_to_seconds(start), - "end": srt_time_to_seconds(end), - "text": text.replace('\n', ' ') - }) - return subtitles - -def srt_time_to_seconds(time_str): - h, m, s_ms = time_str.split(':') - s, ms = s_ms.split(',') - return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000 - -# === PRESETS === -def save_presets(): - with open(preset_file, "w") as f: - json.dump(settings, f) - print("๐Ÿ“‚ Presets saved!") - -def load_presets(): - global settings - try: - with open(preset_file, "r") as f: - loaded = json.load(f) - settings.update(loaded) - print("โœ… Presets loaded!") - sync_gui() - except FileNotFoundError: - print("โš ๏ธ No presets found.") - -# === SYNC === -def sync_gui(): - sub_y_slider.set(settings["subtitle_y_px"]) - highlight_slider.set(settings["highlight_offset"]) - highlight_x_slider.set(settings["highlight_x_offset"]) - sub_font_slider.set(settings["font_size_subtitle"]) - highlight_font_slider.set(settings["font_size_highlight"]) - font_dropdown_var.set(settings["font"]) - -def render_preview(): - if not settings["video_path"] or not settings["subtitles"]: - print("โš ๏ธ Video or subtitles not loaded.") - return - - sub = settings["subtitles"][settings["current_index"]] - subtitle_text = sub["text"] - start_time = sub["start"] - end_time = sub["end"] - duration = end_time - start_time - - clip = VideoFileClip(settings["video_path"]).subclipped(start_time, end_time) - vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2) - - highlight_word = subtitle_text.split()[-1] # Highlight last word for now - - # Create TextClip with font if specified, otherwise use system default - if settings["font"] == "System Default": - base_subtitle = TextClip( - text=subtitle_text, - font_size=settings["font_size_subtitle"], - color='white', - stroke_color='black', - stroke_width=5 - ).with_duration(duration).with_position(('center', settings["subtitle_y_px"])) - else: - try: - base_subtitle = TextClip( - text=subtitle_text, - font=settings["font"], - font_size=settings["font_size_subtitle"], - color='white', - stroke_color='black', - stroke_width=5 - ).with_duration(duration).with_position(('center', 
settings["subtitle_y_px"])) - except: - # Fallback to system default if font fails - print(f"โš ๏ธ Font {settings['font']} failed, using system default") - base_subtitle = TextClip( - text=subtitle_text, - font_size=settings["font_size_subtitle"], - color='white', - stroke_color='black', - stroke_width=5 - ).with_duration(duration).with_position(('center', settings["subtitle_y_px"])) - - full_text = subtitle_text.upper() - words = full_text.split() - try: - highlight_index = words.index(highlight_word.upper()) - except ValueError: - highlight_index = len(words) - 1 - - chars_before = sum(len(w) + 1 for w in words[:highlight_index]) - char_width = 35 - total_width = len(full_text) * char_width - x_offset = (chars_before * char_width) - (total_width // 2) + settings["highlight_x_offset"] - - # Create highlighted word with same font logic - if settings["font"] == "System Default": - highlighted_word = TextClip( - text=highlight_word, - font_size=settings["font_size_highlight"], - color='#FFD700', - stroke_color='#FF6B35', - stroke_width=5 - ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"])) - else: - try: - highlighted_word = TextClip( - text=highlight_word, - font=settings["font"], - font_size=settings["font_size_highlight"], - color='#FFD700', - stroke_color='#FF6B35', - stroke_width=5 - ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"])) - except: - # Fallback to system default if font fails - highlighted_word = TextClip( - text=highlight_word, - font_size=settings["font_size_highlight"], - color='#FFD700', - stroke_color='#FF6B35', - stroke_width=5 - ).with_duration(duration / 2).with_start(duration / 4).with_position((540 + x_offset, settings["subtitle_y_px"] + settings["highlight_offset"])) - - final = CompositeVideoClip([vertical_clip, base_subtitle, highlighted_word], size=(1080, 1920)) - # Scale down the preview to fit 1080p monitor (max height ~900px to leave room for taskbar) - preview_scale = 900 / 1920 # Scale factor to fit height - preview_width = int(1080 * preview_scale) - preview_height = int(1920 * preview_scale) - preview_clip = final.resized((preview_width, preview_height)) - preview_clip.preview(fps=24, audio=False) - - clip.close() - final.close() - preview_clip.close() - -def update_setting(var_name, value): - settings[var_name] = int(value) if var_name.startswith("font_size") or "offset" in var_name or "y_px" in var_name else value - -def update_font(value): - settings["font"] = value - -def open_video(): - file_path = filedialog.askopenfilename(filetypes=[("MP4 files", "*.mp4")]) - if file_path: - settings["video_path"] = file_path - print(f"๐Ÿ“‚ Loaded video: {file_path}") - -def load_srt(): - file_path = filedialog.askopenfilename(filetypes=[("SRT Subtitle", "*.srt")]) - if file_path: - settings["subtitles"] = parse_srt(file_path) - settings["current_index"] = 0 - print(f"๐Ÿ“ Loaded {len(settings['subtitles'])} subtitles from {file_path}") - -def next_sub(): - if settings["current_index"] < len(settings["subtitles"]) - 1: - settings["current_index"] += 1 - start_preview_thread() - -def prev_sub(): - if settings["current_index"] > 0: - settings["current_index"] -= 1 - start_preview_thread() - -def start_preview_thread(): - threading.Thread(target=render_preview).start() - -# === GUI === -root = tk.Tk() -root.title("Subtitle Positioning Tool") -root.geometry("420x700") - -load_btn = 
tk.Button(root, text="๐ŸŽฅ Load Video", command=open_video) -load_btn.pack(pady=10) - -load_srt_btn = tk.Button(root, text="๐Ÿ“‘ Load SRT Subtitles", command=load_srt) -load_srt_btn.pack(pady=5) - -tk.Label(root, text="Subtitle Y Position").pack() -sub_y_slider = tk.Scale(root, from_=1000, to=1800, orient="horizontal", - command=lambda v: update_setting("subtitle_y_px", v)) -sub_y_slider.set(settings["subtitle_y_px"]) -sub_y_slider.pack() - -tk.Label(root, text="Highlight Y Offset").pack() -highlight_slider = tk.Scale(root, from_=-100, to=100, orient="horizontal", - command=lambda v: update_setting("highlight_offset", v)) -highlight_slider.set(settings["highlight_offset"]) -highlight_slider.pack() - -tk.Label(root, text="Highlight X Offset").pack() -highlight_x_slider = tk.Scale(root, from_=-300, to=300, orient="horizontal", - command=lambda v: update_setting("highlight_x_offset", v)) -highlight_x_slider.set(settings["highlight_x_offset"]) -highlight_x_slider.pack() - -tk.Label(root, text="Subtitle Font Size").pack() -sub_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal", - command=lambda v: update_setting("font_size_subtitle", v)) -sub_font_slider.set(settings["font_size_subtitle"]) -sub_font_slider.pack() - -tk.Label(root, text="Highlight Font Size").pack() -highlight_font_slider = tk.Scale(root, from_=30, to=100, orient="horizontal", - command=lambda v: update_setting("font_size_highlight", v)) -highlight_font_slider.set(settings["font_size_highlight"]) -highlight_font_slider.pack() - -tk.Label(root, text="Font").pack() -font_dropdown_var = tk.StringVar(value=settings["font"]) -font_dropdown = tk.OptionMenu(root, font_dropdown_var, *AVAILABLE_FONTS, command=update_font) -font_dropdown.pack(pady=5) - -preview_btn = tk.Button(root, text="โ–ถ๏ธ Preview Clip", command=start_preview_thread) -preview_btn.pack(pady=10) - -nav_frame = tk.Frame(root) -tk.Button(nav_frame, text="โฎ๏ธ Prev", command=prev_sub).pack(side="left", padx=5) -tk.Button(nav_frame, text="โญ๏ธ Next", command=next_sub).pack(side="right", padx=5) -nav_frame.pack(pady=5) - -save_btn = tk.Button(root, text="๐Ÿ“‚ Save Preset", command=save_presets) -save_btn.pack(pady=5) - -load_preset_btn = tk.Button(root, text="๐Ÿ“‚ Load Preset", command=load_presets) -load_preset_btn.pack(pady=5) - -root.mainloop() diff --git a/myvideo.srt b/myvideo.srt new file mode 100644 index 0000000..1167446 --- /dev/null +++ b/myvideo.srt @@ -0,0 +1,22 @@ +1 +00:00:30,000 --> 00:00:40,000 +okay after we will + +2 +00:02:00,000 --> 00:02:10,000 +find it difficult to believe we prepare +to fight arm and arm the corny and royal +family + +3 +00:02:20,000 --> 00:02:30,000 +hello me + +4 +00:02:30,000 --> 00:02:40,000 +as expected + +5 +00:02:40,000 --> 00:02:50,000 +gacha + diff --git a/shorts_generator2.py b/shorts_generator2.py index ec223d6..49d0452 100644 --- a/shorts_generator2.py +++ b/shorts_generator2.py @@ -112,7 +112,8 @@ def create_short_clip(video_path, start, end, subtitles, output_path): word_width, _ = highlighted_word.size word_x = current_x + (word_width / 2) - highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125, subtitle_y_px)) + highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((word_x -125 + , subtitle_y_px)) clips.append(highlighted_word) current_x += word_width + 20 # Add spacing between words diff --git a/sub2.srt b/sub2.srt new file mode 100644 index 0000000..ec5ac54 --- /dev/null +++ b/sub2.srt @@ -0,0 +1,4 @@ +1 
+00:00:00,000 --> 00:00:08,250 +Yeah, yeah, level she's 24. + diff --git a/subtitle_extrator.py b/subtitle_extrator.py deleted file mode 100644 index bee70fe..0000000 --- a/subtitle_extrator.py +++ /dev/null @@ -1,157 +0,0 @@ -import os -import numpy as np -from moviepy import VideoFileClip, TextClip, CompositeVideoClip -from faster_whisper import WhisperModel - -def detect_loud_moments(video_path, chunk_duration=5, threshold_db=10): - print("๐Ÿ” Analyzing audio...") - clip = VideoFileClip(video_path) - audio = clip.audio.to_soundarray(fps=44100) - volume = np.linalg.norm(audio, axis=1) - chunk_size = int(chunk_duration * 44100) - - loud_chunks = [] - max_db = -float('inf') - for i in range(0, len(volume), chunk_size): - chunk = volume[i:i+chunk_size] - db = 20 * np.log10(np.mean(chunk) + 1e-10) - max_db = max(max_db, db) - if db > threshold_db: - start = i / 44100 - loud_chunks.append((start, min(start + chunk_duration, clip.duration))) - - print(f"๐Ÿ”Š Max volume found: {max_db:.2f} dB, threshold: {threshold_db} dB") - print(f"๐Ÿ“ˆ Found {len(loud_chunks)} loud moments") - clip.close() - return loud_chunks - -def transcribe_and_extract_subtitles(video_path, start, end): - print(f"๐Ÿ—ฃ๏ธ Transcribing audio from {start:.2f}s to {end:.2f}s...") - model = WhisperModel("base", device="cpu", compute_type="int8") - segments, _ = model.transcribe(video_path, beam_size=5, language="en", vad_filter=True) - - subtitles = [] - for segment in segments: - if start <= segment.start <= end: - subtitles.append((segment.start - start, segment.end - start, segment.text)) - return subtitles - -def create_short_clip(video_path, start, end, subtitles, output_path): - print(f"๐ŸŽฌ Creating short: {output_path}") - clip = VideoFileClip(video_path).subclipped(start, end) - video_duration = clip.duration - print(f"๐Ÿ“ Video clip duration: {video_duration:.2f}s") - - # Convert to vertical 9:16 - vertical_clip = clip.resized(height=1920).cropped(width=1080, x_center=clip.w / 2) - - clips = [vertical_clip] - for (s, e, text) in subtitles: - try: - # Ensure subtitle timing doesn't exceed video duration - subtitle_start = max(0, s) - subtitle_end = min(e, video_duration) - - if subtitle_start >= video_duration or subtitle_end <= subtitle_start: - print(f"โš ๏ธ Skipping subtitle outside video duration: {text[:30]}...") - continue - - # Opus Clip style professional subtitles - words = text.strip().split() - if not words: - continue - - # Break text into smaller chunks for better readability (max 3-4 words per line) - chunks = [] - current_chunk = [] - for word in words: - current_chunk.append(word) - if len(current_chunk) >= 3 or len(' '.join(current_chunk)) > 25: - chunks.append(' '.join(current_chunk)) - current_chunk = [] - if current_chunk: - chunks.append(' '.join(current_chunk)) - - # Position subtitles in the center-bottom area (Opus style) - subtitle_position = 0.75 - - # Create subtitle for each chunk with smooth transitions - chunk_duration = (subtitle_end - subtitle_start) / len(chunks) - - for chunk_idx, chunk_text in enumerate(chunks): - chunk_start = subtitle_start + (chunk_idx * chunk_duration) - chunk_end = min(chunk_start + chunk_duration, subtitle_end) - - chunk_words = chunk_text.split() - - # Base subtitle with Opus-style design (bold white text with strong outline) - base_subtitle = TextClip( - text=chunk_text.upper(), - font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold - excellent for subtitles - font_size=65, # Larger, chunkier text - color='white', - stroke_color='black', - stroke_width=5 
# Thicker outline for better readability - ) - base_subtitle = base_subtitle.with_start(chunk_start).with_end(chunk_end).with_position(('center', subtitle_position), relative=True) - clips.append(base_subtitle) - - # Opus-style word-by-word highlighting (yellow/orange like Opus) - word_duration = chunk_duration / len(chunk_words) - - for i, word in enumerate(chunk_words): - word_start = chunk_start + (i * word_duration) - word_end = min(word_start + word_duration * 0.8, chunk_end) - - # Opus-style highlighted word (vibrant yellow/orange) - highlighted_word = TextClip( - text=word.upper(), - font='C:/Windows/Fonts/LatoWeb-Bold.ttf', # Lato Bold for consistency - font_size=68, # Slightly larger for highlight effect - color='#FFD700', # Gold/yellow like Opus Clip - stroke_color='#FF6B35', # Orange outline for pop - stroke_width=5 - ) - - # Calculate precise word positioning within the chunk - words_before = chunk_words[:i] - chars_before = sum(len(w) for w in words_before) + len(words_before) - - # More accurate character width calculation for Arial Bold - char_width = 35 # Adjusted for larger, bolder font - total_chunk_width = len(chunk_text) * char_width - word_x_offset = (chars_before * char_width) - (total_chunk_width // 2) - - highlighted_word = highlighted_word.with_start(word_start).with_end(word_end).with_position((540 + word_x_offset, subtitle_position), relative=(False, True)) - clips.append(highlighted_word) - - print(f"โœ… Added Opus-style subtitle ({subtitle_start:.1f}s-{subtitle_end:.1f}s): {text[:30]}...") - except Exception as e: - print(f"โš ๏ธ Subtitle error: {e}, skipping subtitle: {text[:50]}...") - continue - - final = CompositeVideoClip(clips, size=(1080, 1920)) - final.write_videofile(output_path, codec="libx264", audio_codec="aac", threads=1) - - # ๐Ÿ’ฅ Force close to avoid Windows pipe errors - clip.reader.close() - if clip.audio: - clip.audio.reader.close() - final.close() - -def generate_shorts(video_path, max_clips=3, output_folder="shorts"): - os.makedirs(output_folder, exist_ok=True) - best_moments = detect_loud_moments(video_path, threshold_db=-30) - - selected = best_moments[:max_clips] - for i, (start, end) in enumerate(selected): - subtitles = transcribe_and_extract_subtitles(video_path, start, end) - out_path = os.path.join(output_folder, f"short_{i+1}.mp4") - create_short_clip(video_path, start, end, subtitles, out_path) - -if __name__ == "__main__": - import sys - if len(sys.argv) < 2: - print("Usage: python shorts_generator.py your_video.mp4") - else: - generate_shorts(sys.argv[1]) diff --git a/subtitle_generator.py b/subtitle_generator.py index 7412f0f..1ab4e22 100644 --- a/subtitle_generator.py +++ b/subtitle_generator.py @@ -2,9 +2,12 @@ import os import math import tempfile import moviepy as mp -import speech_recognition as sr import tkinter as tk from tkinter import filedialog, messagebox, ttk +from pydub import AudioSegment +from pydub.silence import split_on_silence +import threading +from faster_whisper import WhisperModel def format_time(seconds): @@ -44,52 +47,71 @@ def write_srt(subtitles, output_path): f.write(f"{wrap_text(sub['text'])}\n\n") -def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, chunk_duration=10): +def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, model_size="base", language="auto"): try: + print("๐Ÿ“ฝ๏ธ Loading video file...") video = mp.VideoFileClip(video_path) audio = video.audio with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file: 
temp_audio_path = temp_audio_file.name + print("๐Ÿ”Š Extracting audio...") audio.write_audiofile(temp_audio_path, logger=None) - recognizer = sr.Recognizer() + print(f"๐Ÿค– Loading Whisper model ({model_size})...") + # Initialize Whisper model - much more accurate than Google Speech Recognition + model = WhisperModel(model_size, device="cpu", compute_type="int8") + + print("๐ŸŽฏ Transcribing with Whisper AI...") + # Transcribe the entire audio file at once - Whisper handles timing automatically + segments, info = model.transcribe( + temp_audio_path, + language=None if language == "auto" else language, + word_timestamps=True, + vad_filter=True, # Voice Activity Detection for better accuracy + vad_parameters=dict(min_silence_duration_ms=500) + ) + + print(f"๐ŸŒ Detected language: {info.language} (confidence: {info.language_probability:.2f})") + subtitles = [] + segment_list = list(segments) # Convert generator to list for progress tracking + + print(f"๐Ÿ“ Processing {len(segment_list)} speech segments...") + + for i, segment in enumerate(segment_list): + # Whisper provides precise timing and text + start_time = segment.start + end_time = segment.end + text = segment.text.strip() + + if text and len(text) > 0: + subtitles.append({ + "start": start_time, + "end": end_time, + "text": text + }) + print(f"โœ… Segment {i+1}: '{text[:50]}...' ({start_time:.1f}s - {end_time:.1f}s)") + + # Update progress bar + if progress_callback: + progress_callback(i + 1, len(segment_list)) - with sr.AudioFile(temp_audio_path) as source: - audio_duration = source.DURATION - num_chunks = math.ceil(audio_duration / chunk_duration) - - for i in range(num_chunks): - start_time = i * chunk_duration - end_time = min((i + 1) * chunk_duration, audio_duration) - - source_offset = start_time - duration = end_time - start_time - - audio_data = recognizer.record(source, offset=source_offset, duration=duration) - - try: - text = recognizer.recognize_google(audio_data) - subtitles.append({ - "start": start_time, - "end": end_time, - "text": text - }) - except sr.UnknownValueError: - pass - except sr.RequestError as e: - print(f"API error: {e}") - - # Update progress bar - if progress_callback: - progress_callback(i + 1, num_chunks) - - os.remove(temp_audio_path) + # Clean up + if os.path.exists(temp_audio_path): + os.remove(temp_audio_path) + + if video: + video.close() + if audio: + audio.close() + + print(f"๐ŸŽฏ Generated {len(subtitles)} subtitle segments with Whisper AI") write_srt(subtitles, srt_output_path) return True + except Exception as e: - print(f"Error: {e}") + print(f"โŒ Error: {e}") return False @@ -98,7 +120,7 @@ def transcribe_video_to_srt(video_path, srt_output_path, progress_callback=None, def select_file_and_generate(): video_path = filedialog.askopenfilename( title="Select a video file", - filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv")] + filetypes=[("Video files", "*.mp4 *.mov *.avi *.mkv *.webm *.flv")] ) if not video_path: @@ -113,43 +135,127 @@ def select_file_and_generate(): if not srt_output_path: return + # Disable button during processing + select_button.config(state="disabled", text="Processing...") progress_bar["value"] = 0 - progress_label.config(text="Starting...") + progress_label.config(text="Starting speech recognition...") + status_label.config(text="๐Ÿ”„ Processing video...", fg="blue") root.update() def update_progress(current, total): percent = (current / total) * 100 progress_bar["value"] = percent - progress_label.config(text=f"Progress: {current}/{total} chunks") + 
progress_label.config(text=f"Processing: {current}/{total} segments") root.update() - success = transcribe_video_to_srt(video_path, srt_output_path, progress_callback=update_progress) + def process_video(): + try: + model_size = model_size_var.get() + language = language_var.get() + + success = transcribe_video_to_srt( + video_path, + srt_output_path, + progress_callback=update_progress, + model_size=model_size, + language=language + ) - if success: - messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}") - else: - messagebox.showerror("Error", "Something went wrong. See console for details.") + if success: + status_label.config(text="โœ… Subtitles generated successfully!", fg="green") + messagebox.showinfo("Success", f"Subtitles saved to:\n{srt_output_path}\n\nOpen with your subtitle_generator2.py for editing!") + else: + status_label.config(text="โŒ Error occurred during processing", fg="red") + messagebox.showerror("Error", "Something went wrong. Check console for details.") - progress_label.config(text="Done") + except Exception as e: + status_label.config(text="โŒ Unexpected error occurred", fg="red") + messagebox.showerror("Error", f"Unexpected error: {e}") + finally: + # Re-enable button + select_button.config(state="normal", text="๐Ÿ“‚ Select Video and Generate Subtitles") + progress_label.config(text="Done") + + # Run in separate thread to prevent GUI freezing + thread = threading.Thread(target=process_video) + thread.daemon = True + thread.start() # GUI Setup root = tk.Tk() -root.title("Auto Subtitle Generator (.srt) with Progress") +root.title("๐ŸŽฌ Auto Subtitle Generator - Speech to SRT") +root.geometry("500x350") frame = tk.Frame(root, padx=20, pady=20) -frame.pack() +frame.pack(fill="both", expand=True) -label = tk.Label(frame, text="Select a video file to auto-generate subtitles (SRT):") -label.pack(pady=(0, 10)) +# Title +title_label = tk.Label(frame, text="๐ŸŽฌ Auto Subtitle Generator", font=("Arial", 16, "bold")) +title_label.pack(pady=(0, 10)) -select_button = tk.Button(frame, text="Select Video and Generate Subtitles", command=select_file_and_generate) -select_button.pack(pady=5) +subtitle_label = tk.Label(frame, text="Extract speech from video and create perfectly timed SRT subtitles", font=("Arial", 10)) +subtitle_label.pack(pady=(0, 20)) -progress_bar = ttk.Progressbar(frame, length=300, mode="determinate") -progress_bar.pack(pady=(15, 5)) +# Settings Frame +settings_frame = tk.LabelFrame(frame, text="Whisper AI Settings", padx=10, pady=10) +settings_frame.pack(fill="x", pady=(0, 15)) -progress_label = tk.Label(frame, text="Idle") +# Model Size Selection +model_frame = tk.Frame(settings_frame) +model_frame.pack(fill="x", pady=(0, 10)) + +tk.Label(model_frame, text="๐Ÿง  Model Size:", font=("Arial", 9)).pack(side="left") +model_size_var = tk.StringVar(value="base") +model_dropdown = ttk.Combobox(model_frame, textvariable=model_size_var, + values=["tiny", "base", "small", "medium", "large"], + state="readonly", width=12) +model_dropdown.pack(side="right") + +# Language Selection +language_frame = tk.Frame(settings_frame) +language_frame.pack(fill="x", pady=(0, 10)) + +tk.Label(language_frame, text="๐ŸŒ Language:", font=("Arial", 9)).pack(side="left") +language_var = tk.StringVar(value="auto") +language_dropdown = ttk.Combobox(language_frame, textvariable=language_var, + values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"], + state="readonly", width=12) +language_dropdown.pack(side="right") + +# Help text +help_label = 
tk.Label( + settings_frame, + text=" ๐Ÿ’ก Base model recommended for best speed/accuracy balance\n ๐Ÿ” Auto language detection works for most videos", + font=("Arial", 8), + fg="gray" +) +help_label.pack(anchor="w") + +# Main Action Button +select_button = tk.Button( + frame, + text="๐Ÿ“‚ Select Video and Generate Subtitles", + command=select_file_and_generate, + font=("Arial", 11, "bold"), + bg="#4CAF50", + fg="white", + pady=8 +) +select_button.pack(pady=15, fill="x") + +# Progress Section +progress_frame = tk.LabelFrame(frame, text="Progress", padx=10, pady=10) +progress_frame.pack(fill="x", pady=(0, 10)) + +progress_bar = ttk.Progressbar(progress_frame, length=400, mode="determinate") +progress_bar.pack(fill="x", pady=(0, 5)) + +progress_label = tk.Label(progress_frame, text="Ready to process video", font=("Arial", 9)) progress_label.pack() +# Status Label +status_label = tk.Label(frame, text="๐Ÿ’ก Tip: Use generated SRT files with subtitle_generator2.py for advanced editing!", font=("Arial", 9), fg="blue") +status_label.pack(pady=(10, 0)) + root.mainloop() diff --git a/subtitle_generator2_fixed.py b/subtitle_generator2_fixed.py deleted file mode 100644 index e69de29..0000000 diff --git a/subtitles.srt b/subtitles.srt deleted file mode 100644 index 0ede093..0000000 --- a/subtitles.srt +++ /dev/null @@ -1,3 +0,0 @@ -1 -00:00:00,000 --> 00:00:02,500 -You're running \ No newline at end of file diff --git a/test_whisper.py b/test_whisper.py new file mode 100644 index 0000000..4634011 --- /dev/null +++ b/test_whisper.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +""" +Test script to verify faster_whisper integration +""" + +import os +from faster_whisper import WhisperModel + +def test_whisper_setup(): + """Test if faster_whisper is working correctly""" + print("๐Ÿงช Testing faster_whisper setup...") + + try: + # Try to initialize the smallest model + print("๐Ÿ“ฅ Loading tiny model (this might take a moment on first run)...") + model = WhisperModel("tiny") + print("โœ… Successfully loaded Whisper tiny model!") + + # Check available models + available_models = ["tiny", "base", "small", "medium", "large"] + print(f"๐ŸŽฏ Available models: {', '.join(available_models)}") + + # Test basic functionality with a short audio + print("๐Ÿ” Whisper model ready for transcription!") + + return True + + except Exception as e: + print(f"โŒ Error: {e}") + return False + +if __name__ == "__main__": + if test_whisper_setup(): + print("\n๐ŸŽ‰ faster_whisper is ready to use!") + print("๐Ÿ’ก Your subtitle generator now has much better speech recognition!") + else: + print("\nโš ๏ธ There might be an issue with faster_whisper setup")
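
The new test_whisper.py only verifies that a model can be loaded; a minimal, illustrative extension (not part of the diff above) that actually exercises a transcription with the same faster_whisper calls already used in subtitle_generator.py and subtitle_extrator.py might look like the sketch below. The audio file name sample.wav and the helper name smoke_test_transcription are assumptions for illustration only.

#!/usr/bin/env python3
# Illustrative sketch only -- not part of this patch.
# Assumes a short local audio file "sample.wav" exists; reuses the
# faster_whisper API exactly as it appears elsewhere in this repo.
from faster_whisper import WhisperModel

def smoke_test_transcription(audio_path="sample.wav"):
    # Same lightweight settings the GUI tools default to: small model on CPU, int8
    model = WhisperModel("tiny", device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, vad_filter=True)
    print(f"Detected language: {info.language} (confidence: {info.language_probability:.2f})")
    for segment in segments:  # transcribe() yields segments lazily
        print(f"{segment.start:.1f}s - {segment.end:.1f}s: {segment.text.strip()}")

if __name__ == "__main__":
    smoke_test_transcription()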