diff --git a/03_ppt_replace_images.py b/03_ppt_replace_images.py deleted file mode 100644 index 1ff248b..0000000 --- a/03_ppt_replace_images.py +++ /dev/null @@ -1,63 +0,0 @@ -import marimo - -__generated_with = "0.19.2" -app = marimo.App(width="medium") - -with app.setup: - import marimo as mo - from pathlib import Path - import utils - - -@app.cell -def _(): - PPT_FILE = Path('data/Presentation.pptx') - UPDATED_PPT_FILE = Path('data/Updated_Presentation.pptx') - return PPT_FILE, UPDATED_PPT_FILE - - -@app.cell -def _(): - IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png') - return (IMAGE_FILE,) - - -@app.function -def image_alt_text_converter(fpath): - """convert image file path to alt text - """ - - if not isinstance(fpath, Path): - fpath = Path(fpath) - - fparts = fpath.parts - assert fparts[0] == 'figures', "Image file path must start with 'figures'" - - return Path('/'.join(fparts[2:])).as_posix() - - -@app.cell -def _(IMAGE_FILE): - img_alt_txt = image_alt_text_converter(IMAGE_FILE) - img_alt_txt - return (img_alt_txt,) - - -@app.cell -def _(IMAGE_FILE, PPT_FILE, UPDATED_PPT_FILE, img_alt_txt): - utils.pptx_replace_named_image( - presentation_path=PPT_FILE, - target_tag=img_alt_txt, - new_image_path=IMAGE_FILE, - save_path=UPDATED_PPT_FILE) - return - - -@app.cell -def _(P): - print(P.slides[10]) - return - - -if __name__ == "__main__": - app.run() diff --git a/01_ingest_qualtrics_export.py b/99_example_ingest_qualtrics_export.py similarity index 100% rename from 01_ingest_qualtrics_export.py rename to 99_example_ingest_qualtrics_export.py diff --git a/99_example_ppt_replace_images.py b/99_example_ppt_replace_images.py new file mode 100644 index 0000000..9fd75a7 --- /dev/null +++ b/99_example_ppt_replace_images.py @@ -0,0 +1,73 @@ +import marimo + +__generated_with = "0.19.2" +app = marimo.App(width="medium") + +with app.setup: + import marimo as mo + from pathlib import Path + import utils + 
def image_alt_text_generator(fpath):
    """Convert an image file path into the alt-text tag used to identify it.

    The first two path components (the ``figures`` root and the directory
    directly beneath it) are dropped; the remainder is returned as a
    forward-slash separated string.

    Args:
        fpath (str/Path): Image path whose first component is ``figures``,
            e.g. ``figures/<export_dir>/<group>/<name>.png``.

    Returns:
        str: POSIX-style path of everything after the first two components,
            e.g. ``<group>/<name>.png``. When nothing follows the first two
            components the result is ``'.'``.

    Raises:
        AssertionError: If the path is empty or does not start with
            ``figures``.
    """
    fparts = Path(fpath).parts

    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would silently disable this
    # check. AssertionError is kept as the exception type so callers that
    # already catch it keep working. The emptiness test avoids an IndexError
    # on an empty path.
    if not fparts or fparts[0] != 'figures':
        raise AssertionError("Image file path must start with 'figures'")

    return Path('/'.join(fparts[2:])).as_posix()
def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
    """Return the hexadecimal SHA1 digest of the file at ``file_path``.

    The file is read in 64 KiB chunks so it is never loaded into memory whole.
    """
    sha1 = hashlib.sha1()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes read() returns at EOF.
        for chunk in iter(lambda: f.read(65536), b''):
            sha1.update(chunk)
    return sha1.hexdigest()


def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
    """Index the image files below ``root_dir`` by the SHA1 of their content.

    The directory is walked recursively in sorted order. Only files with a
    common image extension (case-insensitive) are hashed. When several files
    have identical content, the first one encountered is kept and the others
    are reported, so the result does not depend on directory listing order.

    Args:
        root_dir (str/Path): Directory to scan.

    Returns:
        dict: ``{sha1_hex: Path}``. Each path is ``root_dir`` joined with the
            file's location beneath it, so it is relative whenever
            ``root_dir`` is relative.
    """
    hash_map = {}
    valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}

    root = Path(root_dir)
    print(f"Building image hash map from {root}...")

    for root_path, dirs, files in os.walk(root):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            file_path = Path(root_path) / file
            if file_path.suffix.lower() not in valid_extensions:
                continue

            try:
                file_sha1 = _calculate_file_sha1(file_path)
            except OSError as e:
                # An unreadable file should not abort indexing of the rest.
                print(f"Error hashing {file_path}: {e}")
                continue

            if file_sha1 in hash_map:
                print(f"Duplicate image content: {file_path} matches {hash_map[file_sha1]}; keeping the latter.")
                continue
            hash_map[file_sha1] = file_path

    print(f"Indexed {len(hash_map)} images.")
    return hash_map


def _iter_picture_shapes(shapes):
    """Yield every shape in ``shapes`` that exposes a readable ``image``.

    Group shapes are not yielded themselves; their members are searched
    recursively.
    """
    for shape in shapes:
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            yield from _iter_picture_shapes(shape.shapes)
            continue

        # ``hasattr`` is not enough here: it evaluates the property and only
        # swallows AttributeError, while python-pptx raises ValueError for a
        # picture that has no embedded image (e.g. a linked picture). Such
        # shapes cannot be matched by content, so they are skipped.
        try:
            shape.image
        except (AttributeError, ValueError):
            continue
        yield shape


def _find_shape_cNvPr(shape):
    """Return the shape's ``cNvPr`` element (it holds the ``descr`` alt text), or None.

    The non-visual properties live under a differently named child depending
    on the shape type:
        Picture: nvPicPr, Shape/Placeholder: nvSpPr, Group: nvGrpSpPr,
        GraphicFrame: nvGraphicFramePr, Connector: nvCxnSpPr
    """
    element = shape._element
    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
        nvPr = getattr(element, attr, None)
        # Compare against None explicitly: truth-testing an lxml element
        # checks whether it has children and is deprecated.
        if nvPr is not None:
            return getattr(nvPr, 'cNvPr', None)
    return None


def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path, None] = None):
    """
    Updates the alt text of images in a PowerPoint presentation by matching
    their content (SHA1 hash) with images in a source directory.

    For every matched picture the alt text is set to
    ``image_alt_text_generator(<matched file path>)``. Progress and problems
    are reported with ``print``; a failure on one shape does not stop the
    others from being processed.

    Args:
        ppt_path (str/Path): Path to the PowerPoint file.
        image_source_dir (str/Path): Directory containing source images to match against.
        output_path (str/Path, optional): Path to save the updated presentation.
            If None, overwrites the input file.

    Returns:
        None. The presentation is saved only when at least one alt text was
        changed; otherwise no file is written, even if ``output_path`` differs
        from ``ppt_path``.
    """
    if output_path is None:
        output_path = ppt_path

    # 1. Build lookup map of {sha1: file_path} from the source directory
    image_hash_map = _build_image_hash_map(image_source_dir)

    # 2. Open Presentation
    try:
        prs = Presentation(ppt_path)
    except Exception as e:
        print(f"Error opening presentation {ppt_path}: {e}")
        return

    updates_count = 0
    slides = list(prs.slides)

    print(f"Processing {len(slides)} slides...")

    # 3. Match each picture by content hash and rewrite its alt text
    for slide_number, slide in enumerate(slides, start=1):
        for shape in _iter_picture_shapes(slide.shapes):
            try:
                # shape.image.sha1 is the SHA1 hash of the image blob
                current_sha1 = shape.image.sha1
            except AttributeError:
                continue
            except Exception as e:
                print(f"Error processing shape on slide {slide_number}: {e}")
                continue

            original_path = image_hash_map.get(current_sha1)
            if original_path is None:
                # No source image has this content.
                continue

            try:
                # The generator expects a path starting with 'figures', so
                # try to make the path relative to the current directory.
                pass_path = original_path
                try:
                    pass_path = original_path.relative_to(Path.cwd())
                except ValueError:
                    # Already relative, or outside the CWD: use it unchanged.
                    pass

                new_alt_text = image_alt_text_generator(pass_path)

                cNvPr = _find_shape_cNvPr(shape)
                if cNvPr is None:
                    print(f"Could not find cNvPr for shape on slide {slide_number}")
                    continue

                existing_alt_text = cNvPr.get("descr", "")
                # Only touch (and count) shapes whose alt text actually changes.
                if existing_alt_text != new_alt_text:
                    print(f"Slide {slide_number}: Updating alt text for image matches '{pass_path}'")
                    print(f"  Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
                    cNvPr.set("descr", new_alt_text)
                    updates_count += 1

            except AssertionError as e:
                print(f"Skipping match for {original_path} due to generator error: {e}")
            except Exception as e:
                print(f"Error updating alt text for {original_path}: {e}")

    if updates_count > 0:
        prs.save(output_path)
        print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
    else:
        print("No images matched or required updates.")