Picture Book
You have a folder containing a number of images. Read all the images and generate a caption for each image (HINT: You can use BLIP in your prompt). Make a title page with a random image from the list (or the one from the middle), then place three images with their captions on each subsequent page.
Show code cell source
import io
from pathlib import Path
from typing import List
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from fpdf import FPDF
from tqdm import tqdm
def caption_images_with_blip(
    image_folder: str, valid_extensions: set = None
) -> List[str]:
    """
    Caption all images in `image_folder` using the BLIP model.

    Args:
        image_folder: Directory whose direct children are scanned for images
            (sub-directories are not descended into).
        valid_extensions: File extensions to treat as images. Entries are
            matched case-insensitively and may be given with or without a
            leading dot. Defaults to jpg, jpeg, png, bmp, gif and webp.

    Returns:
        One caption per matching image, in the sorted order of the image
        paths. An empty list when the folder holds no matching image (the
        model is not loaded in that case).
    """
    if valid_extensions is None:
        valid_extensions = {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
    # Normalise so that ".JPG", "JPG" and "jpg" all select the same files.
    extensions = {ext.lower().lstrip(".") for ext in valid_extensions}

    image_paths = sorted(
        p
        for p in Path(image_folder).iterdir()
        if p.is_file() and p.suffix.lower().lstrip(".") in extensions
    )
    if not image_paths:
        return []

    model_name = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    model.eval()

    captions: List[str] = []
    # Inference only: disable gradient tracking once for the whole loop.
    with torch.no_grad():
        for img_path in tqdm(image_paths, desc="Captioning images"):
            # convert() returns an independent in-memory copy, so the file
            # handle can be released as soon as the conversion is done.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            output_ids = model.generate(**inputs)
            captions.append(
                processor.decode(output_ids[0], skip_special_tokens=True)
            )
    return captions
# Extensions treated as images when the caller does not pass its own set.
# Mirrors the default of caption_images_with_blip so that the image list
# built here lines up index-for-index with the captions it produced.
_DEFAULT_IMAGE_EXTENSIONS = frozenset(
    {"jpg", "jpeg", "png", "bmp", "gif", "webp"}
)


def _image_size_px(path):
    """Return (width, height) in pixels, closing the file afterwards."""
    with Image.open(path) as img:
        return img.size


def _fit_to_box(w_px, h_px, max_w, max_h):
    """Scale to the full box width; if that is too tall, scale to box height.

    Returns (width, height) in the units of `max_w`/`max_h`, preserving the
    pixel aspect ratio.
    """
    aspect = h_px / w_px  # height / width
    width, height = max_w, max_w * aspect
    if height > max_h:
        width, height = max_h / aspect, max_h
    return width, height


def create_pdf_from_images(
    image_folder: str,
    captions: List[str],
    output_pdf_path: str,
    title: str,
    valid_extensions: set = None,
) -> None:
    """
    Create a PDF with:
    - Title page: middle image (full width, at most half the usable page
      height) and centered title text below.
    - Subsequent pages: three images per page (stacked vertically), each
      with its caption centered below.

    Args:
        image_folder: Directory whose direct children are scanned for images.
        captions: Captions, index-aligned with the sorted image paths.
            Captions beyond the number of images are ignored.
        output_pdf_path: Destination path of the generated PDF.
        title: Text printed on the title page.
        valid_extensions: File extensions to treat as images (matched
            case-insensitively, leading dot optional). Defaults to jpg,
            jpeg, png, bmp, gif and webp. Pass the same set that was used
            for captioning so images and captions stay aligned.

    Raises:
        ValueError: If no image is found, no caption is given, or there are
            fewer captions than images.
    """
    if valid_extensions is None:
        extensions = _DEFAULT_IMAGE_EXTENSIONS
    else:
        extensions = {ext.lower().lstrip(".") for ext in valid_extensions}

    # Only image files: any other file in the folder would shift the
    # image/caption pairing and cannot be embedded in the PDF anyway.
    image_paths = sorted(
        p
        for p in Path(image_folder).iterdir()
        if p.is_file() and p.suffix.lower().lstrip(".") in extensions
    )
    if not image_paths or not captions:
        raise ValueError("No images or captions provided for PDF creation.")
    if len(captions) < len(image_paths):
        # Fail before rendering instead of part-way through the document.
        raise ValueError(
            f"Got {len(captions)} captions for {len(image_paths)} images; "
            "every image needs a caption."
        )

    margin = 10  # mm on each side
    pdf = FPDF("P", "mm", "A4")
    # Everything is placed at explicit coordinates, so automatic page breaks
    # must not interfere.
    pdf.set_auto_page_break(False)
    pdf.set_margins(margin, margin, margin)
    page_w, page_h = pdf.w, pdf.h
    available_w = page_w - 2 * margin
    available_h = page_h - 2 * margin

    # --- Title page using the middle image ---
    mid_img_path = image_paths[len(image_paths) // 2]
    pdf.add_page()
    w_px, h_px = _image_size_px(mid_img_path)
    # The image may use at most half the usable height, leaving room for
    # the title underneath.
    img_w_mm, img_h_mm = _fit_to_box(w_px, h_px, available_w, available_h / 2)
    x = margin + (available_w - img_w_mm) / 2
    y = margin
    pdf.image(str(mid_img_path), x=x, y=y, w=img_w_mm, h=img_h_mm)

    # Title text centered below the image.
    pdf.set_font("Arial", "B", 24)
    pdf.set_text_color(0, 0, 0)
    text_x = (page_w - pdf.get_string_width(title)) / 2
    text_y = y + img_h_mm + 24
    # The string is passed positionally because the keyword for it differs
    # between FPDF releases (`txt` vs `text`).
    pdf.text(text_x, text_y, title)

    # --- Caption pages: three images per page ---
    # Each image+caption pair gets a fixed-height block so none overlap.
    images_per_page = 3
    gap_between_image_and_caption = 10  # mm
    caption_height = 5  # mm reserved for the caption line
    spacing_between_blocks = 5  # mm vertical space between blocks
    # block_total_h * 3 + spacing_between_blocks * 2 == available_h
    block_total_h = (
        available_h - (images_per_page - 1) * spacing_between_blocks
    ) / images_per_page
    max_image_h = block_total_h - gap_between_image_and_caption - caption_height

    # zip() stops at the shorter sequence, i.e. surplus captions are dropped.
    pairs = list(zip(image_paths, captions))
    for start in range(0, len(pairs), images_per_page):
        pdf.add_page()
        y_block = margin
        for img_path, caption in pairs[start : start + images_per_page]:
            w_px, h_px = _image_size_px(img_path)
            img_w_mm, img_h_mm = _fit_to_box(
                w_px, h_px, available_w, max_image_h
            )
            x = margin + (available_w - img_w_mm) / 2
            pdf.image(str(img_path), x=x, y=y_block, w=img_w_mm, h=img_h_mm)

            # Caption centered below the image, separated by the fixed gap.
            caption_y = y_block + img_h_mm + gap_between_image_and_caption
            pdf.set_font("Arial", "", 12)
            pdf.set_text_color(0, 0, 0)
            caption_x = (page_w - pdf.get_string_width(caption)) / 2
            pdf.text(caption_x, caption_y, caption)

            # Advance by one full block plus the inter-block spacing.
            y_block += block_total_h + spacing_between_blocks

    pdf.output(output_pdf_path)
def main():
    """Caption every image in the ``pictures`` folder and build the book.

    Prints a progress line before each stage and the output path at the end.
    """
    source_dir = "pictures"  # folder containing images to caption
    pdf_target = "picture_book.pdf"
    book_title = "The Sprite Fright Picture Book"

    print("Generating captions...")
    generated_captions = caption_images_with_blip(source_dir)

    print("Creating PDF...")
    create_pdf_from_images(
        source_dir,
        generated_captions,
        pdf_target,
        book_title,
    )
    print(f"PDF saved to {pdf_target}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()