Tube 2 Book
You have a YouTube link. Download the YouTube video using code (HINT: you can prompt for yt_dlp) and save the video as an .mp4 file. Read the video and split it into scenes (HINT: you can prompt for scene-detect). Extract the first frame of each scene and caption it. Generate a title page with all of the images, and on the subsequent pages put three images together with their captions.
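# Assumed requirements: ffmpeg available on PATH, plus the Python packages
# yt-dlp, scenedetect, fpdf2, transformers, torch, pillow, and tqdm.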
import io
import math
import subprocess
from pathlib import Path
from typing import List
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from fpdf import FPDF
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from yt_dlp import YoutubeDL
from tqdm import tqdm
def download_youtube_video(youtube_url: str, output_path: str) -> None:
"""
Download a YouTube video to the specified path using yt-dlp.
"""
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4",
"outtmpl": output_path,
}
with YoutubeDL(ydl_opts) as ydl:
ydl.download([youtube_url])
def detect_and_split_scenes(
video_path: str, scenes_dir: str, threshold: float = 30.0
) -> List[Path]:
"""
Detect scenes in the video and split it into separate MP4 files.
Returns a list of Paths for each scene file created.
"""
scenes_path = Path(scenes_dir)
scenes_path.mkdir(parents=True, exist_ok=True)
video_manager = VideoManager([video_path])
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=threshold))
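# ContentDetector starts a new scene whenever the frame-to-frame content change exceeds `threshold`.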
video_manager.start()
scene_manager.detect_scenes(frame_source=video_manager)
scene_list = scene_manager.get_scene_list()
video_manager.release()
# Convert detected scenes into (start, end) timestamps in seconds
timestamps = [(start.get_seconds(), end.get_seconds()) for start, end in scene_list]
scene_files: List[Path] = []
for idx, (start_sec, end_sec) in enumerate(tqdm(timestamps, desc="Splitting scenes"), start=1):
scene_filename = f"scene_{idx:04d}.mp4"
scene_filepath = scenes_path / scene_filename
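# Stream-copy the clip (no re-encode); with "-c copy" cut points snap to keyframes,
# so scene boundaries may be slightly approximate.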
cmd = [
"ffmpeg",
"-y", # overwrite if exists
"-i", video_path,
"-ss", f"{start_sec:.3f}",
"-to", f"{end_sec:.3f}",
"-c", "copy",
str(scene_filepath),
]
subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
scene_files.append(scene_filepath)
return scene_files
def extract_first_frames_from_videos(
video_folder: str, output_folder: str, exts: set = None, fmt: str = "jpg", timeout: int = 10
) -> List[Path]:
"""
For each video in `video_folder` with an extension in `exts`, extract its first frame
and save it as an image in `output_folder`. Returns a list of saved image Paths.
"""
if exts is None:
exts = {"mp4", "avi", "mov", "mkv", "webm"}
out_path = Path(output_folder)
out_path.mkdir(parents=True, exist_ok=True)
saved_frames: List[Path] = []
video_paths = [
p for p in Path(video_folder).iterdir()
if p.is_file() and p.suffix.lower().lstrip(".") in exts
]
for vid_path in tqdm(video_paths, desc="Extracting first frames"):
output_img = out_path / f"{vid_path.stem}.{fmt}"
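# Ask ffmpeg for a single video frame (-frames:v 1), i.e. the first frame of the clip.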
cmd = [
"ffmpeg",
"-y",
"-loglevel", "error",
"-i", str(vid_path),
"-frames:v", "1",
str(output_img),
]
try:
subprocess.run(cmd, check=True, timeout=timeout)
saved_frames.append(output_img)
except subprocess.TimeoutExpired:
# skip if ffmpeg hangs
continue
except subprocess.CalledProcessError:
# skip if ffmpeg returns an error
continue
return saved_frames
def caption_images_with_blip(
image_folder: str, valid_extensions: set = None
) -> List[str]:
"""
Caption all images in `image_folder` using the BLIP model. Returns a list of captions
in the same sorted order as the images on disk.
"""
if valid_extensions is None:
valid_extensions = {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
image_paths = sorted([
p for p in Path(image_folder).iterdir()
if p.is_file() and p.suffix.lower().lstrip(".") in valid_extensions
])
if not image_paths:
return []
# Load BLIP processor & model (ensure torch, transformers, pillow are installed)
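# Weights are downloaded from the Hugging Face Hub on first use; inference runs on the CPU
# unless the model is explicitly moved to a GPU (e.g. model.to("cuda")).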
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()
captions: List[str] = []
for img_path in tqdm(image_paths, desc="Captioning images"):
image = Image.open(img_path).convert("RGB")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
output_ids = model.generate(**inputs)
caption = processor.decode(output_ids[0], skip_special_tokens=True)
captions.append(caption)
return captions
def add_title_page_with_grid(
pdf: FPDF, image_paths: List[Path], title: str, valid_exts: set = None
) -> None:
"""
Add an A4 page with a background grid of square-cropped, semi-transparent images
(and black squares for empty cells) plus an overlaid title.
"""
if valid_exts is None:
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
imgs = [p for p in image_paths if p.suffix.lower() in valid_exts]
pdf.add_page()
pdf.set_auto_page_break(False)
pdf.set_margins(0, 0, 0)
page_w, page_h = pdf.w, pdf.h # 210 × 297 mm
n = len(imgs)
ratio = page_w / page_h
cols = math.ceil(math.sqrt(max(n, 1) * ratio))
rows = math.ceil(max(n, 1) / cols)
total_cells = rows * cols
cell_w_nom = page_w / cols
cell_h_nom = page_h / rows
cell_size = max(cell_w_nom, cell_h_nom)
offset_x = (page_w - cols * cell_size) / 2
offset_y = (page_h - rows * cell_size) / 2
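# cell_size is at least as large as both nominal cell dimensions, so the centered grid
# covers the whole page; cells along one axis may extend past the edge and get clipped.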
black_img = Image.new("RGB", (1, 1), (0, 0, 0))
for idx in range(total_cells):
row, col = divmod(idx, cols)
x = offset_x + col * cell_size
y = offset_y + row * cell_size
if idx < len(imgs):
img = Image.open(imgs[idx]).convert("RGB")
w, h = img.size
side = min(w, h)
left = (w - side) / 2
top = (h - side) / 2
cropped = img.crop((left, top, left + side, top + side))
else:
cropped = black_img
buf = io.BytesIO()
cropped.save(buf, format="PNG")
buf.seek(0)
with pdf.local_context(fill_opacity=0.5):
pdf.image(buf, x=x, y=y, w=cell_size, h=cell_size, type="PNG")
buf.close()
# Overlay title
font_pt = 24
pdf.set_font("Arial", "B", font_pt)
pdf.set_text_color(0, 0, 0)
text_w = pdf.get_string_width(title)
text_x = (page_w - text_w) / 2
text_y = page_h / 2
pdf.text(x=text_x, y=text_y, txt=title)
def add_caption_pages(
pdf: FPDF, image_paths: List[Path], captions: List[str], valid_exts: set = None
) -> None:
"""
After the title page, add pages containing up to three images per page with centered captions.
"""
if valid_exts is None:
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
imgs = [p for p in image_paths if p.suffix.lower() in valid_exts]
paired = list(zip(imgs, captions))
page_w, page_h = pdf.w, pdf.h
margin_left = 15
margin_right = 15
margin_top = 15
margin_bottom = 15
available_w = page_w - margin_left - margin_right
available_h = page_h - margin_top - margin_bottom
image_h = 60 # mm
caption_h = 5 # mm
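# Split the leftover height into four equal gaps: above, between, and below the three slots.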
spacing = (available_h - 3 * (image_h + caption_h)) / 4
if spacing < 0:
raise ValueError("Not enough vertical space for three images at requested size.")
footer_y = page_h - 10
for i in range(0, len(paired), 3):
group = paired[i : i + 3]
pdf.add_page()
y = margin_top + spacing
for img_path, caption in group:
img = Image.open(img_path)
w_px, h_px = img.size
ar = h_px / w_px
img_w_mm = image_h / ar
if img_w_mm > available_w:
img_w_mm = available_w
image_h_mm = available_w * ar
else:
image_h_mm = image_h
x = margin_left + (available_w - img_w_mm) / 2
pdf.image(str(img_path), x=x, y=y, h=image_h_mm)
pdf.set_font("Arial", "", 12)
pdf.set_text_color(0, 0, 0)
text_w = pdf.get_string_width(caption)
caption_x = (page_w - text_w) / 2
caption_y = y + image_h_mm + 1
pdf.set_xy(caption_x, caption_y)
pdf.cell(text_w, caption_h, caption)
y += image_h_mm + caption_h + spacing + 1
pdf.set_font("Arial", "", 10)
pdf.set_text_color(0, 0, 0)
pdf.set_xy(0, footer_y)
pdf.cell(page_w, 10, f"{pdf.page_no()}", align="C")
def create_pdf_from_frames(
frames_folder: str, captions: List[str], output_pdf_path: str, title: str
) -> None:
"""
(Unused in `main()` but provided for convenience)
Create a PDF that has a title page (grid of frames) and subsequent pages of frames with captions.
"""
frame_paths = sorted([p for p in Path(frames_folder).iterdir() if p.is_file()])
if not frame_paths or not captions:
raise ValueError("No frames or captions provided for PDF creation.")
pdf = FPDF("P", "mm", "A4")
add_title_page_with_grid(pdf, frame_paths, title)
add_caption_pages(pdf, frame_paths, captions)
pdf.output(output_pdf_path)
def main():
youtube_url = "https://www.youtube.com/watch?v=FR7wOGyAzpw"
video_output = "my_video.mp4"
scenes_dir = "scenes"
frames_dir = "frames"
output_pdf = "tube2book.pdf"
# 1. Download the YouTube video
print("Downloading video...")
download_youtube_video(youtube_url, video_output)
# 2. Detect scenes and split the video
print("Detecting and splitting scenes...")
detect_and_split_scenes(video_output, scenes_dir, threshold=27.0)
# 3. Extract first frames from each scene
print("Extracting first frames from scenes...")
extract_first_frames_from_videos(scenes_dir, frames_dir)
# 4. Caption each extracted frame
print("Generating captions for frames...")
captions = caption_images_with_blip(frames_dir)
# 5. Build the PDF (title page + caption pages) directly in main()
print("Building PDF...")
frame_paths = sorted([p for p in Path(frames_dir).iterdir() if p.is_file()])
if not frame_paths or not captions:
raise ValueError("No frames or captions available for PDF creation.")
pdf = FPDF("P", "mm", "A4")
add_title_page_with_grid(pdf, frame_paths, youtube_url)
add_caption_pages(pdf, frame_paths, captions)
pdf.output(output_pdf)
print(f"PDF saved to {output_pdf}")
if __name__ == "__main__":
main()