Picture Book

Picture Book

You have a folder containing a number of images. Read all the images and generate a caption for each one (HINT: you can use BLIP in your prompt). Make a title page using a random image from the list (or the one in the middle), then place three images, each with its caption, on every subsequent page.
Show code cell source Hide code cell source
import io
from pathlib import Path
from typing import List

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from fpdf import FPDF
from tqdm import tqdm


def caption_images_with_blip(
    image_folder: str, valid_extensions: set = None
) -> List[str]:
    """
    Caption all images in `image_folder` using the BLIP model.

    Args:
        image_folder: Directory whose direct children are scanned for images.
        valid_extensions: File extensions to treat as images. Entries are
            matched case-insensitively and may be given with or without a
            leading dot. Defaults to jpg, jpeg, png, bmp, gif and webp.

    Returns:
        One caption per matching image, in the sorted order of the image
        paths. An empty list if the folder holds no matching image (the model
        is not loaded in that case).
    """
    if valid_extensions is None:
        valid_extensions = {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
    # Normalise caller-supplied extensions so ".JPG" and "jpg" are equivalent;
    # file suffixes are normalised the same way below.
    allowed = {ext.lower().lstrip(".") for ext in valid_extensions}

    image_paths = sorted(
        p
        for p in Path(image_folder).iterdir()
        if p.is_file() and p.suffix.lower().lstrip(".") in allowed
    )
    if not image_paths:
        return []

    model_name = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    model.eval()

    captions: List[str] = []
    # Inference only: one no_grad scope covers the whole loop.
    with torch.no_grad():
        for img_path in tqdm(image_paths, desc="Captioning images"):
            # Close the file handle as soon as the pixels are copied;
            # convert() returns an independent in-memory image.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            output_ids = model.generate(**inputs)
            captions.append(
                processor.decode(output_ids[0], skip_special_tokens=True)
            )

    return captions


def _image_size_px(img_path) -> tuple:
    """Return the (width, height) in pixels of the image at `img_path`."""
    # The context manager releases the file handle immediately; only the
    # size is needed here, the PDF library reads the file itself later.
    with Image.open(img_path) as img:
        return img.size


def _fit_to_box(w_px: int, h_px: int, max_w: float, max_h: float) -> tuple:
    """Scale to the full `max_w`, shrinking to `max_h` if too tall; keep aspect ratio."""
    ar = h_px / w_px  # height / width
    fitted_w, fitted_h = max_w, max_w * ar
    if fitted_h > max_h:
        fitted_h = max_h
        fitted_w = max_h / ar
    return fitted_w, fitted_h


def _draw_centered_image(pdf, img_path, top, margin, available_w, max_h) -> float:
    """Draw the image horizontally centered with its top at `top`; return its height in mm."""
    w_px, h_px = _image_size_px(img_path)
    img_w_mm, img_h_mm = _fit_to_box(w_px, h_px, available_w, max_h)
    x = margin + (available_w - img_w_mm) / 2
    pdf.image(str(img_path), x=x, y=top, w=img_w_mm, h=img_h_mm)
    return img_h_mm


def _draw_centered_text(pdf, page_w, baseline_y, content) -> None:
    """Print `content` horizontally centered on the page at `baseline_y`."""
    text_w = pdf.get_string_width(content)
    # The string is passed positionally: this file previously used both
    # `txt=` and `text=` for the same argument, and only one spelling can be
    # valid for a given fpdf release.
    pdf.text((page_w - text_w) / 2, baseline_y, content)


def create_pdf_from_images(
    image_folder: str,
    captions: List[str],
    output_pdf_path: str,
    title: str,
    valid_extensions: set = None,
) -> None:
    """
    Create a PDF with:
    - Title page: middle image (full width) and centered title text below.
    - Subsequent pages: three images per page (stacked vertically), each with its caption below.

    Args:
        image_folder: Directory whose direct children are scanned for images.
        captions: One caption per image, in the sorted order of the image
            paths.
        output_pdf_path: Destination path of the generated PDF.
        title: Text printed below the image on the title page.
        valid_extensions: File extensions to treat as images (matched
            case-insensitively, leading dot optional). Defaults to jpg, jpeg,
            png, bmp, gif and webp, so that non-image files in the folder are
            not paired with captions.

    Raises:
        ValueError: If no images or no captions are available, or if the
            number of captions differs from the number of images.
    """
    if valid_extensions is None:
        valid_extensions = {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
    allowed = {ext.lower().lstrip(".") for ext in valid_extensions}

    # Filter by extension so stray files (e.g. ".DS_Store", text files) cannot
    # shift the index-based pairing between images and captions.
    image_paths = sorted(
        p
        for p in Path(image_folder).iterdir()
        if p.is_file() and p.suffix.lower().lstrip(".") in allowed
    )
    if not image_paths or not captions:
        raise ValueError("No images or captions provided for PDF creation.")
    if len(captions) != len(image_paths):
        raise ValueError(
            f"Expected one caption per image: found {len(image_paths)} "
            f"image(s) but {len(captions)} caption(s)."
        )

    pdf = FPDF("P", "mm", "A4")
    page_w, page_h = pdf.w, pdf.h
    margin = 10  # mm on each side
    available_w = page_w - 2 * margin
    available_h = page_h - 2 * margin

    # Everything is placed with absolute coordinates, so automatic page
    # breaks are switched off.
    pdf.set_auto_page_break(False)
    pdf.set_margins(margin, margin, margin)

    # --- Title page using the middle image ---
    mid_img_path = image_paths[len(image_paths) // 2]
    pdf.add_page()

    # The title image may use at most half of the available height.
    title_img_h = _draw_centered_image(
        pdf, mid_img_path, margin, margin, available_w, available_h / 2
    )

    pdf.set_font("Arial", "B", 24)
    pdf.set_text_color(0, 0, 0)
    _draw_centered_text(pdf, page_w, margin + title_img_h + 24, title)

    # --- Caption pages: three images per page ---
    # Each image gets a fixed-height block (image + gap + caption line) so
    # that blocks never overlap.
    images_per_page = 3
    gap_between_image_and_caption = 10  # mm
    caption_height = 5  # mm reserved for the caption line
    spacing_between_blocks = 5  # mm vertical space between each block

    # block_total_h * 3 + spacing_between_blocks * 2 == available_h
    block_total_h = (
        available_h - (images_per_page - 1) * spacing_between_blocks
    ) / images_per_page
    max_image_h = block_total_h - gap_between_image_and_caption - caption_height

    for start in range(0, len(image_paths), images_per_page):
        pdf.add_page()
        pdf.set_font("Arial", "", 12)
        pdf.set_text_color(0, 0, 0)

        y_block = margin
        page_items = zip(
            image_paths[start : start + images_per_page],
            captions[start : start + images_per_page],
        )
        for img_path, caption in page_items:
            img_h_mm = _draw_centered_image(
                pdf, img_path, y_block, margin, available_w, max_image_h
            )
            caption_y = y_block + img_h_mm + gap_between_image_and_caption
            _draw_centered_text(pdf, page_w, caption_y, caption)

            # Advance by one block plus the spacing between blocks.
            y_block += block_total_h + spacing_between_blocks

    pdf.output(output_pdf_path)


def main():
    """Caption the images in the ``pictures`` folder and write the picture-book PDF."""
    # Fixed inputs/outputs of the script.
    source_dir = "pictures"
    output_pdf = "picture_book.pdf"
    book_title = "The Sprite Fright Picture Book"

    # Step 1: one BLIP caption per image.
    print("Generating captions...")
    image_captions = caption_images_with_blip(source_dir)

    # Step 2: lay out the title page and the caption pages.
    print("Creating PDF...")
    create_pdf_from_images(source_dir, image_captions, output_pdf, book_title)

    print(f"PDF saved to {output_pdf}")


# Run the captioning + PDF pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()