fix ppt update images

2026-02-02 17:36:32 +01:00
parent 6ba30ff041
commit b7cf6adfb8
2 changed files with 230 additions and 49 deletions
--- a/04_PPTX_Update_Images.py
+++ b/04_PPTX_Update_Images.py
@@ -33,19 +33,6 @@ def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    return


-@app.cell
-def _():
-    utils._calculate_file_sha1('figures/OneDrive_2026-01-28/All_Respondents/most_prominent_personality_traits.png')
-    return
-
-
-@app.cell
-def _():
-    utils._calculate_perceptual_hash('figures/Picture.png')
-
-    return
-
-
@app.cell(hide_code=True)
 def _():
    mo.md(r"""
@@ -56,26 +43,21 @@ def _():

@app.cell
 def _():
-    REPLACE_SOURCE = Path('data/test_replace_source.pptx')
-    REPLACE_TARGET = Path('data/test_replace_target.pptx')
-    return REPLACE_SOURCE, REPLACE_TARGET
+    # Input presentation, output presentation, and the directory of replacement
+    # images that the following cell passes to the batch image replacement.
+    REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_tagged.pptx')
+    REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2.pptx')

-
-app._unparsable_cell(
-    r"""
-    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png'
-    """,
-    name="_"
-)
+    NEW_IMAGES_DIR = Path('figures/2-2-26')
+    return NEW_IMAGES_DIR, REPLACE_SOURCE, REPLACE_TARGET


@app.cell
-def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
-    utils.pptx_replace_named_image(
-        presentation_path=REPLACE_SOURCE,
-        target_tag=utils.image_alt_text_generator(IMAGE_FILE),
-        new_image_path=IMAGE_FILE,
-        save_path=REPLACE_TARGET)
+def _(NEW_IMAGES_DIR, REPLACE_SOURCE, REPLACE_TARGET):
+    # Batch-replace every picture whose alt text matches a file path under NEW_IMAGES_DIR
+    results = utils.pptx_replace_images_from_directory(
+        REPLACE_SOURCE,            # Source presentation path
+        NEW_IMAGES_DIR,          # Root directory holding the replacement images
+        REPLACE_TARGET  # Output path (when omitted, the source presentation is overwritten)
+    )
    return


--- a/utils.py
+++ b/utils.py
@@ -37,10 +37,223 @@ def image_alt_text_generator(fpath, include_dataset_dirname=False) -> str:
    else:
        return Path('/'.join(fparts[2:])).as_posix()

+def _get_shape_alt_text(shape) -> str:
+    """
+    Return the alt text stored on a PowerPoint shape.
+
+    The alt text lives in the ``descr`` attribute of the ``cNvPr`` child of the
+    shape's non-visual properties element. The name of that element differs
+    per shape kind, so each known name is probed in turn and the first one
+    present on the shape's XML element is used.
+
+    Args:
+        shape: A python-pptx shape object (anything exposing ``_element``).
+
+    Returns:
+        str: The ``descr`` value, or an empty string when the shape has no
+        alt text or the lookup fails for any reason.
+    """
+    # nvPicPr (Picture), nvSpPr (Shape/Placeholder), nvGrpSpPr (Group),
+    # nvGraphicFramePr (GraphicFrame), nvCxnSpPr (Connector)
+    non_visual_names = ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr')
+    absent = object()
+
+    try:
+        element = shape._element
+        container = None
+        for attr_name in non_visual_names:
+            candidate = getattr(element, attr_name, absent)
+            if candidate is not absent:
+                container = candidate
+                break
+
+        if container is None or not hasattr(container, 'cNvPr'):
+            return ""
+        return container.cNvPr.get("descr", "")
+    except Exception:
+        # Best-effort lookup: any failure while probing the element means "no alt text".
+        return ""
+
+
+def pptx_replace_images_from_directory(
+    presentation_path: Union[str, Path],
+    image_source_dir: Union[str, Path],
+    save_path: Union[str, Path, None] = None
+) -> dict:
+    """
+    Replace all images in a PowerPoint presentation using images from a directory
+    where subdirectory/filename paths match the alt_text of each image.
+    
+    This function scans all images in the presentation, extracts their alt_text,
+    and looks for a matching image file in the source directory. The alt_text
+    should be a relative path (e.g., "All_Respondents/chart_name.png") that
+    corresponds to the directory structure under image_source_dir.
+    
+    Args:
+        presentation_path (str/Path): Path to the source .pptx file.
+        image_source_dir (str/Path): Root directory containing replacement images.
+            The directory structure should mirror the alt_text paths.
+            Example: if alt_text is "All_Respondents/voice_scale.png", the
+            replacement image should be at image_source_dir/All_Respondents/voice_scale.png
+        save_path (str/Path, optional): Path to save the modified presentation.
+            If None, overwrites the input file.
+    
+    Returns:
+        dict: Summary with keys:
+            - 'replaced': List of dicts with slide number, shape name, and matched path
+            - 'not_found': List of dicts with slide number, shape name, and alt_text.
+              Replacements that raised an error are also recorded here, with an
+              additional 'error' key holding the exception message.
+            - 'no_alt_text': List of dicts with slide number and shape name
+            - 'total_images': Total number of picture shapes processed
+    
+    Raises:
+        FileNotFoundError: If presentation_path or image_source_dir does not exist.
+    
+    Example:
+        >>> pptx_replace_images_from_directory(
+        ...     "presentation.pptx",
+        ...     "figures/2-2-26/",
+        ...     "presentation_updated.pptx"
+        ... )
+        
+    Notes:
+        - Alt text should be set using update_ppt_alt_text() or image_alt_text_generator()
+        - Images without alt_text are skipped
+        - Original image position and size are preserved
+        - The new picture is inserted at the old picture's place in the XML, so
+          stacking order and group membership are kept
+        - The old picture is only removed after the new one was added, so a
+          failed replacement leaves the original picture in place
+    """
+    presentation_path = Path(presentation_path)
+    image_source_dir = Path(image_source_dir)
+    save_path = presentation_path if save_path is None else Path(save_path)
+    
+    if not presentation_path.exists():
+        raise FileNotFoundError(f"Presentation not found: {presentation_path}")
+    if not image_source_dir.exists():
+        raise FileNotFoundError(f"Image source directory not found: {image_source_dir}")
+    
+    # Build a lookup of all available images, keyed by POSIX-style path
+    # relative to image_source_dir (the same form the alt text is expected in)
+    image_suffixes = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp'}
+    available_images = {
+        img_path.relative_to(image_source_dir).as_posix(): img_path
+        for img_path in image_source_dir.rglob("*")
+        if img_path.is_file() and img_path.suffix.lower() in image_suffixes
+    }
+    
+    print(f"Found {len(available_images)} images in source directory")
+    
+    # Open presentation
+    prs = Presentation(presentation_path)
+    
+    # Track results
+    results = {
+        'replaced': [],
+        'not_found': [],
+        'no_alt_text': [],
+        'total_images': 0
+    }
+    
+    total_slides = len(prs.slides)
+    print(f"Processing {total_slides} slides...")
+    
+    for slide_num, slide in enumerate(prs.slides, start=1):
+        # Materialize the picture list up front: shapes are added/removed while
+        # looping, so the slide must not be iterated lazily.
+        # NOTE(review): _iter_picture_shapes is defined elsewhere in this module;
+        # presumably it also yields pictures nested in groups - confirm.
+        picture_shapes = list(_iter_picture_shapes(slide.shapes))
+        
+        for shape in picture_shapes:
+            results['total_images'] += 1
+            shape_name = shape.name or f"Unnamed (ID: {getattr(shape, 'shape_id', 'unknown')})"
+            
+            # Get alt text
+            alt_text = _get_shape_alt_text(shape)
+            
+            if not alt_text:
+                results['no_alt_text'].append({
+                    'slide': slide_num,
+                    'shape_name': shape_name
+                })
+                continue
+            
+            # Look for matching image in source directory:
+            # try the alt_text as-is, then with common extensions if it has none
+            matched_path = available_images.get(alt_text)
+            if matched_path is None and not Path(alt_text).suffix:
+                for ext in ['.png', '.jpg', '.jpeg', '.gif']:
+                    candidate = available_images.get(f"{alt_text}{ext}")
+                    if candidate is not None:
+                        matched_path = candidate
+                        break
+            
+            if matched_path is None:
+                results['not_found'].append({
+                    'slide': slide_num,
+                    'shape_name': shape_name,
+                    'alt_text': alt_text
+                })
+                continue
+            
+            # Replace the image
+            new_element = None
+            try:
+                # Record coordinates
+                left, top, width, height = shape.left, shape.top, shape.width, shape.height
+                
+                # Add the new image FIRST; if this fails (e.g. unsupported image
+                # format) the old picture is still untouched on the slide
+                new_shape = slide.shapes.add_picture(str(matched_path), left, top, width, height)
+                new_element = new_shape._element
+                
+                # Preserve the alt text on the new shape
+                new_nvPr = None
+                for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
+                    if hasattr(new_element, attr):
+                        new_nvPr = getattr(new_element, attr)
+                        break
+                # Explicit None check: truthiness of an XML element depends on
+                # its child count and is deprecated in lxml
+                if new_nvPr is not None and hasattr(new_nvPr, 'cNvPr'):
+                    new_nvPr.cNvPr.set("descr", alt_text)
+                
+                # Move the new picture to where the old one sits in the XML
+                # (keeps z-order and any enclosing group), then drop the old one
+                old_element = shape._element
+                old_element.addprevious(new_element)
+                old_element.getparent().remove(old_element)
+                    
+            except Exception as e:
+                # Roll back a half-finished replacement so no stray duplicate
+                # picture is left on the slide
+                if new_element is not None and new_element.getparent() is not None:
+                    new_element.getparent().remove(new_element)
+                results['not_found'].append({
+                    'slide': slide_num,
+                    'shape_name': shape_name,
+                    'alt_text': alt_text,
+                    'error': str(e)
+                })
+            else:
+                results['replaced'].append({
+                    'slide': slide_num,
+                    'shape_name': shape_name,
+                    'matched_path': str(matched_path)
+                })
+                print(f"Slide {slide_num}: Replaced '{alt_text}'")
+    
+    # Save presentation
+    prs.save(save_path)
+    
+    # Print summary
+    print("\n" + "=" * 80)
+    if results['replaced']:
+        print(f"✓ Saved updated presentation to {save_path} with {len(results['replaced'])} replacements.")
+    else:
+        print("No images matched or required updates.")
+    
+    if results['not_found']:
+        print(f"\n⚠ {len(results['not_found'])} image(s) not found in source directory:")
+        for item in results['not_found']:
+            print(f"  • Slide {item['slide']}: '{item.get('alt_text', 'N/A')}'")
+    
+    if results['no_alt_text']:
+        print(f"\n⚠ {len(results['no_alt_text'])} image(s) without alt text (skipped):")
+        for item in results['no_alt_text']:
+            print(f"  • Slide {item['slide']}: '{item['shape_name']}'")
+    
+    # Only claim full success when something was actually replaced
+    if results['replaced'] and not results['not_found'] and not results['no_alt_text']:
+        print("\n✓ All images replaced successfully!")
+    print("=" * 80)
+    
+    return results
+
+
 def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
    """
-    Finds and replaces specific images in a PowerPoint presentation while 
-    preserving their original position, size, and aspect ratio.
+    Finds and replaces a specific image in a PowerPoint presentation while 
+    preserving its original position, size, and aspect ratio.

    This function performs a 'surgical' replacement: it records the coordinates 
    of the existing image, removes it from the slide's XML, and inserts a 
@@ -48,6 +261,9 @@ def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save
    image by searching for a specific string within the Shape Name 
    (Selection Pane) or Alt Text.
    
+    Note: For batch replacement of all images using a directory structure,
+    use pptx_replace_images_from_directory() instead.
+
    Args:
        presentation_path (str): The file path to the source .pptx file.
        target_tag (str): The unique identifier to look for (e.g., 'HERO_IMAGE').
@@ -73,24 +289,7 @@ def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save
            print(f"Checking shape: {shape.name} of type {shape.shape_type}...")
            
            shape_name = shape.name or ""
-            alt_text = ""
-            
-            # More robust strategy: Check for alt text in ANY valid element property
-            # This allows replacing Pictures, Placeholders, GraphicFrames, etc.
-            try:
-                # Check for common property names used by python-pptx elements to store non-visual props
-                # nvPicPr (Picture), nvSpPr (Shape/Placeholder), nvGrpSpPr (Group), 
-                # nvGraphicFramePr (GraphicFrame), nvCxnSpPr (Connector)
-                nvPr = None
-                for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
-                    if hasattr(shape._element, attr):
-                        nvPr = getattr(shape._element, attr)
-                        break
-                
-                if nvPr is not None and hasattr(nvPr, 'cNvPr'):
-                    alt_text = nvPr.cNvPr.get("descr", "")
-            except Exception:
-                pass
+            alt_text = _get_shape_alt_text(shape)
            
            print(f"Alt Text for shape '{shape_name}': {alt_text}")