Tube 2 Book
You have a YouTube link. Download the YouTube video using code (HINT: you can prompt for yt_dlp) and save the video as an .mp4 file. Read the video and split it into scenes (HINT: you can prompt for scene-detect). Extract the first frame of each scene and caption it. Generate a title page with all of the images, and on the subsequent pages put three images together with their captions.
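# Assumed requirements: ffmpeg available on PATH, plus the Python packages
# yt-dlp, scenedetect, fpdf2, transformers, torch, pillow, and tqdm.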
import io
import math
import subprocess
from pathlib import Path
from typing import List
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from fpdf import FPDF
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from yt_dlp import YoutubeDL
from tqdm import tqdm
def download_youtube_video(youtube_url: str, output_path: str) -> None:
"""
Download a YouTube video to the specified path using yt-dlp.
"""
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4",
"outtmpl": output_path,
}
with YoutubeDL(ydl_opts) as ydl:
ydl.download([youtube_url])
def detect_and_split_scenes(
video_path: str, scenes_dir: str, threshold: float = 30.0
) -> List[Path]:
"""
Detect scenes in the video and split it into separate MP4 files.
Returns a list of Paths for each scene file created.
"""
scenes_path = Path(scenes_dir)
scenes_path.mkdir(parents=True, exist_ok=True)
video_manager = VideoManager([video_path])
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=threshold))
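# ContentDetector starts a new scene whenever the frame-to-frame content change exceeds `threshold`.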
video_manager.start()
scene_manager.detect_scenes(frame_source=video_manager)
scene_list = scene_manager.get_scene_list()
video_manager.release()
# Convert detected scenes into (start, end) timestamps in seconds
timestamps = [(start.get_seconds(), end.get_seconds()) for start, end in scene_list]
scene_files: List[Path] = []
for idx, (start_sec, end_sec) in enumerate(tqdm(timestamps, desc="Splitting scenes"), start=1):
scene_filename = f"scene_{idx:04d}.mp4"
scene_filepath = scenes_path / scene_filename
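# Stream-copy the clip (no re-encode); with "-c copy" cut points snap to keyframes,
# so scene boundaries may be slightly approximate.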
cmd = [
"ffmpeg",
"-y", # overwrite if exists
"-i", video_path,
"-ss", f"{start_sec:.3f}",
"-to", f"{end_sec:.3f}",
"-c", "copy",
str(scene_filepath),
]
subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
scene_files.append(scene_filepath)
return scene_files
def extract_first_frames_from_videos(
video_folder: str, output_folder: str, exts: set = None, fmt: str = "jpg", timeout: int = 10
) -> List[Path]:
"""
For each video in `video_folder` with an extension in `exts`, extract its first frame
and save it as an image in `output_folder`. Returns a list of saved image Paths.
"""
if exts is None:
exts = {"mp4", "avi", "mov", "mkv", "webm"}
out_path = Path(output_folder)
out_path.mkdir(parents=True, exist_ok=True)
saved_frames: List[Path] = []
video_paths = [
p for p in Path(video_folder).iterdir()
if p.is_file() and p.suffix.lower().lstrip(".") in exts
]
for vid_path in tqdm(video_paths, desc="Extracting first frames"):
output_img = out_path / f"{vid_path.stem}.{fmt}"
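# Ask ffmpeg for a single video frame (-frames:v 1), i.e. the first frame of the clip.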
cmd = [
"ffmpeg",
"-y",
"-loglevel", "error",
"-i", str(vid_path),
"-frames:v", "1",
str(output_img),
]
try:
subprocess.run(cmd, check=True, timeout=timeout)
saved_frames.append(output_img)
except subprocess.TimeoutExpired:
# skip if ffmpeg hangs
continue
except subprocess.CalledProcessError:
# skip if ffmpeg returns an error
continue
return saved_frames
def caption_images_with_blip(
image_folder: str, valid_extensions: set = None
) -> List[str]:
"""
Caption all images in `image_folder` using the BLIP model. Returns a list of captions
in the same sorted order as the images on disk.
"""
if valid_extensions is None:
valid_extensions = {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
image_paths = sorted([
p for p in Path(image_folder).iterdir()
if p.is_file() and p.suffix.lower().lstrip(".") in valid_extensions
])
if not image_paths:
return []
# Load BLIP processor & model (ensure torch, transformers, pillow are installed)
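# Weights are downloaded from the Hugging Face Hub on first use; inference runs on the CPU
# unless the model is explicitly moved to a GPU (e.g. model.to("cuda")).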
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()
captions: List[str] = []
for img_path in tqdm(image_paths, desc="Captioning images"):
image = Image.open(img_path).convert("RGB")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
output_ids = model.generate(**inputs)
caption = processor.decode(output_ids[0], skip_special_tokens=True)
captions.append(caption)
return captions
def add_title_page_with_grid(
pdf: FPDF, image_paths: List[Path], title: str, valid_exts: set = None
) -> None:
"""
Add an A4 page with a background grid of square-cropped, semi-transparent images
(and black squares for empty cells) plus an overlaid title.
"""
if valid_exts is None:
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
imgs = [p for p in image_paths if p.suffix.lower() in valid_exts]
pdf.add_page()
pdf.set_auto_page_break(False)
pdf.set_margins(0, 0, 0)
page_w, page_h = pdf.w, pdf.h # 210 × 297 mm
n = len(imgs)
ratio = page_w / page_h
cols = math.ceil(math.sqrt(max(n, 1) * ratio))
rows = math.ceil(max(n, 1) / cols)
total_cells = rows * cols
cell_w_nom = page_w / cols
cell_h_nom = page_h / rows
cell_size = max(cell_w_nom, cell_h_nom)
offset_x = (page_w - cols * cell_size) / 2
offset_y = (page_h - rows * cell_size) / 2
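# cell_size is at least as large as both nominal cell dimensions, so the centered grid
# covers the whole page; cells along one axis may extend past the edge and get clipped.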
black_img = Image.new("RGB", (1, 1), (0, 0, 0))
for idx in range(total_cells):
row, col = divmod(idx, cols)
x = offset_x + col * cell_size
y = offset_y + row * cell_size
if idx < len(imgs):
img = Image.open(imgs[idx]).convert("RGB")
w, h = img.size
side = min(w, h)
left = (w - side) / 2
top = (h - side) / 2
cropped = img.crop((left, top, left + side, top + side))
else:
cropped = black_img
buf = io.BytesIO()
cropped.save(buf, format="PNG")
buf.seek(0)
with pdf.local_context(fill_opacity=0.5):
pdf.image(buf, x=x, y=y, w=cell_size, h=cell_size, type="PNG")
buf.close()
# Overlay title
font_pt = 24
pdf.set_font("Arial", "B", font_pt)
pdf.set_text_color(0, 0, 0)
text_w = pdf.get_string_width(title)
text_x = (page_w - text_w) / 2
text_y = page_h / 2
pdf.text(x=text_x, y=text_y, txt=title)
def add_caption_pages(
pdf: FPDF, image_paths: List[Path], captions: List[str], valid_exts: set = None
) -> None:
"""
After the title page, add pages containing up to three images per page with centered captions.
"""
if valid_exts is None:
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
imgs = [p for p in image_paths if p.suffix.lower() in valid_exts]
paired = list(zip(imgs, captions))
page_w, page_h = pdf.w, pdf.h
margin_left = 15
margin_right = 15
margin_top = 15
margin_bottom = 15
available_w = page_w - margin_left - margin_right
available_h = page_h - margin_top - margin_bottom
image_h = 60 # mm
caption_h = 5 # mm
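# Split the leftover height into four equal gaps: above, between, and below the three slots.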
spacing = (available_h - 3 * (image_h + caption_h)) / 4
if spacing < 0:
raise ValueError("Not enough vertical space for three images at requested size.")
footer_y = page_h - 10
for i in range(0, len(paired), 3):
group = paired[i : i + 3]
pdf.add_page()
y = margin_top + spacing
for img_path, caption in group:
img = Image.open(img_path)
w_px, h_px = img.size
ar = h_px / w_px
img_w_mm = image_h / ar
if img_w_mm > available_w:
img_w_mm = available_w
image_h_mm = available_w * ar
else:
image_h_mm = image_h
x = margin_left + (available_w - img_w_mm) / 2
pdf.image(str(img_path), x=x, y=y, h=image_h_mm)
pdf.set_font("Arial", "", 12)
pdf.set_text_color(0, 0, 0)
text_w = pdf.get_string_width(caption)
caption_x = (page_w - text_w) / 2
caption_y = y + image_h_mm + 1
pdf.set_xy(caption_x, caption_y)
pdf.cell(text_w, caption_h, caption)
y += image_h_mm + caption_h + spacing + 1
pdf.set_font("Arial", "", 10)
pdf.set_text_color(0, 0, 0)
pdf.set_xy(0, footer_y)
pdf.cell(page_w, 10, f"{pdf.page_no()}", align="C")
def create_pdf_from_frames(
frames_folder: str, captions: List[str], output_pdf_path: str, title: str
) -> None:
"""
(Unused in `main()` but provided for convenience)
Create a PDF that has a title page (grid of frames) and subsequent pages of frames with captions.
"""
frame_paths = sorted([p for p in Path(frames_folder).iterdir() if p.is_file()])
if not frame_paths or not captions:
raise ValueError("No frames or captions provided for PDF creation.")
pdf = FPDF("P", "mm", "A4")
add_title_page_with_grid(pdf, frame_paths, title)
add_caption_pages(pdf, frame_paths, captions)
pdf.output(output_pdf_path)
def main():
youtube_url = "https://www.youtube.com/watch?v=FR7wOGyAzpw"
video_output = "my_video.mp4"
scenes_dir = "scenes"
frames_dir = "frames"
output_pdf = "tube2book.pdf"
# 1. Download the YouTube video
print("Downloading video...")
download_youtube_video(youtube_url, video_output)
# 2. Detect scenes and split the video
print("Detecting and splitting scenes...")
detect_and_split_scenes(video_output, scenes_dir, threshold=27.0)
# 3. Extract first frames from each scene
print("Extracting first frames from scenes...")
extract_first_frames_from_videos(scenes_dir, frames_dir)
# 4. Caption each extracted frame
print("Generating captions for frames...")
captions = caption_images_with_blip(frames_dir)
# 5. Build the PDF (title page + caption pages) directly in main()
print("Building PDF...")
frame_paths = sorted([p for p in Path(frames_dir).iterdir() if p.is_file()])
if not frame_paths or not captions:
raise ValueError("No frames or captions available for PDF creation.")
pdf = FPDF("P", "mm", "A4")
add_title_page_with_grid(pdf, frame_paths, youtube_url)
add_caption_pages(pdf, frame_paths, captions)
pdf.output(output_pdf)
print(f"PDF saved to {output_pdf}")
if __name__ == "__main__":
main()