From f0eab32c34fadce79fc195e4b51351cd18651dbf Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Wed, 4 Feb 2026 17:48:48 +0100 Subject: [PATCH] update alt-text with full filepaths --- 04_PPTX_Update_Images.py | 4 +- utils.py | 213 +++++++++++++++++++++++---------------- 2 files changed, 128 insertions(+), 89 deletions(-) diff --git a/04_PPTX_Update_Images.py b/04_PPTX_Update_Images.py index dc290cd..f3cec37 100644 --- a/04_PPTX_Update_Images.py +++ b/04_PPTX_Update_Images.py @@ -26,7 +26,7 @@ def _(): @app.cell def _(): - TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx') + TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx') # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx') TAG_IMAGE_DIR = Path('figures/debug') return TAG_IMAGE_DIR, TAG_SOURCE @@ -52,7 +52,7 @@ def _(): @app.cell def _(): - REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx') + REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx') # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx') NEW_IMAGES_DIR = Path('figures/debug') diff --git a/utils.py b/utils.py index f3d7548..759f1dd 100644 --- a/utils.py +++ b/utils.py @@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes): yield shape +def _set_shape_alt_text(shape, alt_text: str): + """ + Set alt text (descr attribute) for a PowerPoint shape. + """ + nvPr = None + # Check for common property names used by python-pptx elements + for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']: + if hasattr(shape._element, attr): + nvPr = getattr(shape._element, attr) + break + + if nvPr and hasattr(nvPr, 'cNvPr'): + nvPr.cNvPr.set("descr", alt_text) + + def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True): """ - Updates the alt text of images in a PowerPoint presentation by matching - their content with images in a source directory. + Updates the alt text of images in a PowerPoint presentation. + + 1. First pass: Validates existing alt-text format (/). + - Fixes full paths by keeping only the last two parts. + - Clears invalid alt-text. + 2. Second pass: If images are missing alt-text, matches them against source directory + using perceptual hash or SHA1. Args: ppt_path (str/Path): Path to the PowerPoint file. @@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, if output_path is None: output_path = ppt_path - # 1. Build lookup map of {hash: file_path} from the source directory - image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash) - - # 2. Open Presentation + # Open Presentation try: prs = Presentation(ppt_path) except Exception as e: @@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, return updates_count = 0 - unmatched_images = [] # Collect unmatched images to report at the end + images_needing_match = [] + slides = list(prs.slides) total_slides = len(slides) - print(f"Processing {total_slides} slides...") + print(f"Scanning {total_slides} slides for existing alt-text...") + # Pass 1: Scan and clean existing alt-text for i, slide in enumerate(slides): - # Use recursive iterator to find all pictures including those in groups/placeholders picture_shapes = list(_iter_picture_shapes(slide.shapes)) for shape in picture_shapes: - try: - # Get image hash based on selected method - if use_perceptual_hash: - # Use perceptual hash of the image blob for visual content matching - current_hash = _calculate_perceptual_hash(shape.image.blob) - else: - # Use SHA1 hash from python-pptx (exact byte match) - current_hash = shape.image.sha1 + alt_text = _get_shape_alt_text(shape) + has_valid_alt = False + + if alt_text: + # Handle potential path separators and whitespace + clean_alt = alt_text.strip().replace('\\', '/') + parts = clean_alt.split('/') - if current_hash in image_hash_map: - original_path = image_hash_map[current_hash] + # Check if it looks like a path/file reference (at least 2 parts like dir/file) + if len(parts) >= 2: + # Enforce format: keep last 2 parts (e.g. filter/image.png) + new_alt = '/'.join(parts[-2:]) - # Generate Alt Text - try: - # Prepare path for generator. - # Try to relativize to CWD if capable - pass_path = original_path - try: - pass_path = original_path.relative_to(Path.cwd()) - except ValueError: - pass + if new_alt != alt_text: + print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'") + _set_shape_alt_text(shape, new_alt) + updates_count += 1 - new_alt_text = image_alt_text_generator(pass_path) - - # Check existing alt text to avoid redundant updates/log them - # Accessing alt text via cNvPr - # Note: Different shape types might store non-visual props differently - # Picture: nvPicPr.cNvPr - # GraphicFrame: nvGraphicFramePr.cNvPr - # Group: nvGrpSpPr.cNvPr - # Shape/Placeholder: nvSpPr.cNvPr - - nvPr = None - for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']: - if hasattr(shape._element, attr): - nvPr = getattr(shape._element, attr) - break - - if nvPr and hasattr(nvPr, 'cNvPr'): - cNvPr = nvPr.cNvPr - existing_alt_text = cNvPr.get("descr", "") - - if existing_alt_text != new_alt_text: - print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'") - print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'") - cNvPr.set("descr", new_alt_text) - updates_count += 1 - else: - print(f"Could not find cNvPr for shape on slide {i+1}") - - except AssertionError as e: - print(f"Skipping match for {original_path} due to generator error: {e}") - except Exception as e: - print(f"Error updating alt text for {original_path}: {e}") - + has_valid_alt = True else: - # Check if image already has alt text set - if so, skip reporting as unmatched - existing_alt = _get_shape_alt_text(shape) - if existing_alt: - # Image already has alt text, no need to report as unmatched - continue - - shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID')) - shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})" - hash_type = "pHash" if use_perceptual_hash else "SHA1" - - unmatched_images.append({ - 'slide': i+1, - 'shape_name': shape_name, - 'hash_type': hash_type, - 'hash': current_hash - }) - - except AttributeError: - continue - except Exception as e: - print(f"Error processing shape on slide {i+1}: {e}") + # User requested deleting other cases that do not meet format + # If it's single word or doesn't look like our path format + pass # logic below handles this + + if not has_valid_alt: + if alt_text: + print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.") + _set_shape_alt_text(shape, "") + updates_count += 1 + + # Queue for hash matching + shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID')) + shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})" + images_needing_match.append({ + 'slide_idx': i, # 0-based + 'slide_num': i+1, + 'shape': shape, + 'shape_name': shape_name + }) - # Print summary + if not images_needing_match: + print("\nAll images have valid alt-text format. No hash matching needed.") + if updates_count > 0: + prs.save(output_path) + print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.") + else: + print("Presentation is up to date.") + return + + # Pass 2: Hash Matching + print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...") + + # Build lookup map of {hash: file_path} only if needed + image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash) + + unmatched_images = [] + + for item in images_needing_match: + shape = item['shape'] + slide_num = item['slide_num'] + + try: + # Get image hash + if use_perceptual_hash: + current_hash = _calculate_perceptual_hash(shape.image.blob) + else: + current_hash = shape.image.sha1 + + if current_hash in image_hash_map: + original_path = image_hash_map[current_hash] + + # Generate Alt Text + try: + # Try to relativize to CWD if capable + pass_path = original_path + try: + pass_path = original_path.relative_to(Path.cwd()) + except ValueError: + pass + + new_alt_text = image_alt_text_generator(pass_path) + + print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'") + _set_shape_alt_text(shape, new_alt_text) + updates_count += 1 + + except Exception as e: + print(f"Error generating alt text for {original_path}: {e}") + else: + hash_type = "pHash" if use_perceptual_hash else "SHA1" + unmatched_images.append({ + 'slide': slide_num, + 'shape_name': item['shape_name'], + 'hash_type': hash_type, + 'hash': current_hash + }) + + except Exception as e: + print(f"Error processing shape on slide {slide_num}: {e}") + + # Save and Print Summary print("\n" + "="*80) if updates_count > 0: prs.save(output_path) print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.") else: - print("No images matched or required updates.") + print("No matches found for missing images.") - # List unmatched images at the end if unmatched_images: - print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:") + print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:") for img in unmatched_images: print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})") else: - print("\n✓ All images matched successfully!") + print("\n✓ All images processed successfully!") print("="*80)