From f0eab32c34fadce79fc195e4b51351cd18651dbf Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Wed, 4 Feb 2026 17:48:48 +0100
Subject: [PATCH] normalize full-filepath alt-text to filter/filename format

---
 04_PPTX_Update_Images.py |   4 +-
 utils.py                 | 213 +++++++++++++++++++++++----------------
 2 files changed, 128 insertions(+), 89 deletions(-)
diff --git a/04_PPTX_Update_Images.py b/04_PPTX_Update_Images.py
index dc290cd..f3cec37 100644
--- a/04_PPTX_Update_Images.py
+++ b/04_PPTX_Update_Images.py
@@ -26,7 +26,7 @@ def _():
 
 @app.cell
 def _():
-    TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
+    TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
     # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
     TAG_IMAGE_DIR = Path('figures/debug')
     return TAG_IMAGE_DIR, TAG_SOURCE
@@ -52,7 +52,7 @@ def _():
 
 @app.cell
 def _():
-    REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
+    REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
     # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
 
     NEW_IMAGES_DIR = Path('figures/debug')
diff --git a/utils.py b/utils.py
index f3d7548..759f1dd 100644
--- a/utils.py
+++ b/utils.py
@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
             yield shape
 
 
+def _set_shape_alt_text(shape, alt_text: str):
+    """
+    Set alt text (``descr`` attribute of ``cNvPr``) for a PowerPoint shape.
+
+    Silently does nothing when the shape exposes no ``cNvPr`` element.
+    """
+    # The non-visual properties container is named differently per shape kind
+    element = shape._element
+    names = ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr')
+    nvPr = next((getattr(element, n) for n in names if hasattr(element, n)), None)
+    # Explicit None test: truth-testing an lxml element is deprecated
+    if nvPr is not None and hasattr(nvPr, 'cNvPr'):
+        nvPr.cNvPr.set("descr", alt_text)
+
+
 def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
     """
-    Updates the alt text of images in a PowerPoint presentation by matching
-    their content with images in a source directory.
+    Updates the alt text of images in a PowerPoint presentation.
+    
+    1. First pass: Validates existing alt-text format (<filter>/<filename>). 
+       - Fixes full paths by keeping only the last two parts.
+       - Clears invalid alt-text.
+    2. Second pass: If images are missing alt-text, matches them against source directory
+       using perceptual hash or SHA1.
 
     Args:
         ppt_path (str/Path): Path to the PowerPoint file.
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
     if output_path is None:
         output_path = ppt_path
 
-    # 1. Build lookup map of {hash: file_path} from the source directory
-    image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
-
-    # 2. Open Presentation
+    # Open Presentation
     try:
         prs = Presentation(ppt_path)
     except Exception as e:
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
         return
 
     updates_count = 0
-    unmatched_images = []  # Collect unmatched images to report at the end
+    images_needing_match = []
+    
     slides = list(prs.slides)
     total_slides = len(slides)
 
-    print(f"Processing {total_slides} slides...")
+    print(f"Scanning {total_slides} slides for existing alt-text...")
 
+    # Pass 1: Scan and clean existing alt-text
     for i, slide in enumerate(slides):
-        # Use recursive iterator to find all pictures including those in groups/placeholders
         picture_shapes = list(_iter_picture_shapes(slide.shapes))
         
         for shape in picture_shapes:
-            try:
-                # Get image hash based on selected method
-                if use_perceptual_hash:
-                    # Use perceptual hash of the image blob for visual content matching
-                    current_hash = _calculate_perceptual_hash(shape.image.blob)
-                else:
-                    # Use SHA1 hash from python-pptx (exact byte match)
-                    current_hash = shape.image.sha1
+            alt_text = _get_shape_alt_text(shape)
+            has_valid_alt = False
+            
+            if alt_text:
+                # Handle potential path separators and whitespace
+                clean_alt = alt_text.strip().replace('\\', '/')
+                parts = clean_alt.split('/')
                 
-                if current_hash in image_hash_map:
-                    original_path = image_hash_map[current_hash]
+                # Check if it looks like a path/file reference (at least 2 parts like dir/file)
+                if len(parts) >= 2:
+                    # Enforce format: keep last 2 parts (e.g. filter/image.png)
+                    new_alt = '/'.join(parts[-2:])
                     
-                    # Generate Alt Text
-                    try:
-                        # Prepare path for generator. 
-                        # Try to relativize to CWD if capable
-                        pass_path = original_path
-                        try:
-                            pass_path = original_path.relative_to(Path.cwd())
-                        except ValueError:
-                            pass
+                    if new_alt != alt_text:
+                        print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
+                        _set_shape_alt_text(shape, new_alt)
+                        updates_count += 1
                         
-                        new_alt_text = image_alt_text_generator(pass_path)
-                        
-                        # Check existing alt text to avoid redundant updates/log them
-                        # Accessing alt text via cNvPr
-                        # Note: Different shape types might store non-visual props differently
-                        # Picture: nvPicPr.cNvPr
-                        # GraphicFrame: nvGraphicFramePr.cNvPr
-                        # Group: nvGrpSpPr.cNvPr
-                        # Shape/Placeholder: nvSpPr.cNvPr
-                        
-                        nvPr = None
-                        for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
-                            if hasattr(shape._element, attr):
-                                nvPr = getattr(shape._element, attr)
-                                break
-                        
-                        if nvPr and hasattr(nvPr, 'cNvPr'):
-                            cNvPr = nvPr.cNvPr
-                            existing_alt_text = cNvPr.get("descr", "")
-                            
-                            if existing_alt_text != new_alt_text:
-                                print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
-                                print(f"  Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
-                                cNvPr.set("descr", new_alt_text)
-                                updates_count += 1
-                        else:
-                            print(f"Could not find cNvPr for shape on slide {i+1}")
-                            
-                    except AssertionError as e:
-                        print(f"Skipping match for {original_path} due to generator error: {e}")
-                    except Exception as e:
-                        print(f"Error updating alt text for {original_path}: {e}")
-                
+                    has_valid_alt = True
                 else:
-                    # Check if image already has alt text set - if so, skip reporting as unmatched
-                    existing_alt = _get_shape_alt_text(shape)
-                    if existing_alt:
-                        # Image already has alt text, no need to report as unmatched
-                        continue
-                    
-                    shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
-                    shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
-                    hash_type = "pHash" if use_perceptual_hash else "SHA1"
-                    
-                    unmatched_images.append({
-                        'slide': i+1,
-                        'shape_name': shape_name,
-                        'hash_type': hash_type,
-                        'hash': current_hash
-                    })
-                            
-            except AttributeError:
-                continue
-            except Exception as e:
-                print(f"Error processing shape on slide {i+1}: {e}")
+                    # User requested deleting other cases that do not meet format
+                    # If it's single word or doesn't look like our path format
+                    pass # logic below handles this
+            
+            if not has_valid_alt:
+                if alt_text:
+                    print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
+                    _set_shape_alt_text(shape, "")
+                    updates_count += 1
+                
+                # Queue for hash matching
+                shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
+                shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
+                images_needing_match.append({
+                    'slide_idx': i, # 0-based
+                    'slide_num': i+1,
+                    'shape': shape,
+                    'shape_name': shape_name
+                })
 
-    # Print summary
+    if not images_needing_match:
+        print("\nAll images have valid alt-text format. No hash matching needed.")
+        if updates_count > 0:
+            prs.save(output_path)
+            print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
+        else:
+             print("Presentation is up to date.")
+        return
+
+    # Pass 2: Hash Matching
+    print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
+    
+    # Build lookup map of {hash: file_path} only if needed
+    image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
+    
+    unmatched_images = []
+
+    for item in images_needing_match:
+        shape = item['shape']
+        slide_num = item['slide_num']
+        
+        try:
+            # Get image hash
+            if use_perceptual_hash:
+                current_hash = _calculate_perceptual_hash(shape.image.blob)
+            else:
+                current_hash = shape.image.sha1
+            
+            if current_hash in image_hash_map:
+                original_path = image_hash_map[current_hash]
+                
+                # Generate Alt Text
+                try:
+                    # Try to relativize to CWD if capable
+                    pass_path = original_path
+                    try:
+                        pass_path = original_path.relative_to(Path.cwd())
+                    except ValueError:
+                        pass
+                    
+                    new_alt_text = image_alt_text_generator(pass_path)
+                    
+                    print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
+                    _set_shape_alt_text(shape, new_alt_text)
+                    updates_count += 1
+                        
+                except Exception as e:
+                    print(f"Error generating alt text for {original_path}: {e}")
+            else:
+                hash_type = "pHash" if use_perceptual_hash else "SHA1"
+                unmatched_images.append({
+                    'slide': slide_num,
+                    'shape_name': item['shape_name'],
+                    'hash_type': hash_type,
+                    'hash': current_hash
+                })
+                        
+        except Exception as e:
+            print(f"Error processing shape on slide {slide_num}: {e}")
+
+    # Save and Print Summary
     print("\n" + "="*80)
     if updates_count > 0:
         prs.save(output_path)
         print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
     else:
-        print("No images matched or required updates.")
+        print("No matches found for missing images.")
     
-    # List unmatched images at the end
     if unmatched_images:
-        print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
+        print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:")
         for img in unmatched_images:
             print(f"  • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
     else:
-        print("\n✓ All images matched successfully!")
+        print("\n✓ All images processed successfully!")
     print("="*80)