rename example notebooks and finish ppt pipeline functions

2026-01-29 16:07:55 +01:00
parent 3ee25f9e33
commit 5f9e67a312
4 changed files with 241 additions and 64 deletions
--- a/utils.py
+++ b/utils.py
@@ -4,13 +4,27 @@ import pandas as pd
 from typing import Union
 import json
 import re
+import hashlib
+import os
 from plots import JPMCPlotsMixin

-import marimo as mo

 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE

+
+def image_alt_text_generator(fpath):
+    """Convert an image file path into the alt text used for that image.
+
+    The path must start with a ``figures`` component. That component and the
+    one directly after it are dropped; the remaining components are joined
+    with forward slashes, e.g. ``figures/run1/voices/v1.png`` becomes
+    ``voices/v1.png``.
+
+    Args:
+        fpath: Image path as a ``str`` or ``Path``.
+
+    Returns:
+        str: POSIX-style path built from every component after the second
+        one. A path with two or fewer components yields ``'.'``.
+
+    Raises:
+        AssertionError: If the first path component is not ``'figures'``
+            (this includes an empty path).
+    """
+    fparts = Path(fpath).parts
+
+    # Raised explicitly rather than via ``assert`` so the check still runs
+    # under ``python -O``; the exception type itself is unchanged.
+    if not fparts or fparts[0] != 'figures':
+        raise AssertionError("Image file path must start with 'figures'")
+
+    return Path('/'.join(fparts[2:])).as_posix()
+
 def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
    """
    Finds and replaces specific images in a PowerPoint presentation while 
@@ -91,6 +105,159 @@ def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save
    print(f"Successfully saved to {save_path}")


+def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
+    """Return the hexadecimal SHA1 digest of the file at ``file_path``.
+
+    The file is opened in binary mode and consumed in 64 KiB chunks, so the
+    whole content is never held in memory at once.
+    """
+    digest = hashlib.sha1()
+    with open(file_path, 'rb') as stream:
+        # Two-argument iter() keeps calling read() until it returns b'' (EOF).
+        for chunk in iter(lambda: stream.read(65536), b''):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
+    """
+    Recursively walk ``root_dir`` and map SHA1 content hashes to image paths.
+
+    Only files with a common image extension (compared case-insensitively)
+    are hashed. Directories and files are visited in sorted order so the
+    result is reproducible; when several files have identical content, the
+    one visited last is the one kept in the map.
+
+    Args:
+        root_dir (str/Path): Directory to scan.
+
+    Returns:
+        dict: ``{sha1_hex: Path}``. Each path is built from the directory
+        yielded by ``os.walk(root_dir)``, so it is relative whenever
+        ``root_dir`` is relative (it is not converted to an absolute path).
+    """
+    hash_map = {}
+    valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
+
+    root = Path(root_dir)
+    print(f"Building image hash map from {root}...")
+
+    count = 0
+    for root_path, dirs, files in os.walk(root):
+        # Sorting in place also fixes the order in which os.walk descends.
+        dirs.sort()
+        for file in sorted(files):
+            file_path = Path(root_path) / file
+            if file_path.suffix.lower() not in valid_extensions:
+                continue
+            try:
+                file_sha1 = _calculate_file_sha1(file_path)
+            except OSError as e:
+                # An unreadable file is reported and skipped, not fatal.
+                print(f"Error hashing {file_path}: {e}")
+                continue
+            hash_map[file_sha1] = file_path
+            # Counts hashed files, so duplicates of the same content count too.
+            count += 1
+
+    print(f"Indexed {count} images.")
+    return hash_map
+
+
+def _iter_picture_shapes(shapes):
+    """
+    Recursively iterate over shapes and yield those that carry an image,
+    diving into groups.
+
+    Args:
+        shapes: Iterable of shape objects (e.g. ``slide.shapes`` or the
+            ``shapes`` collection of a group shape).
+
+    Yields:
+        Every non-group shape whose ``image`` attribute can be read.
+    """
+    for shape in shapes:
+        # Groups are never yielded themselves; only their members are examined.
+        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+            yield from _iter_picture_shapes(shape.shapes)
+            continue
+
+        # hasattr() only swallows AttributeError. python-pptx raises
+        # ValueError from Picture.image when the picture has no embedded
+        # image (e.g. a linked picture); such shapes cannot be matched by
+        # content, so they are skipped instead of aborting the iteration.
+        try:
+            _ = shape.image
+        except (AttributeError, ValueError):
+            continue
+        yield shape
+
+
+def _relative_to_cwd(path: Path) -> Path:
+    """Return ``path`` relative to the current working directory, or unchanged if it is not below it."""
+    try:
+        return path.relative_to(Path.cwd())
+    except ValueError:
+        return path
+
+
+def _find_non_visual_props(element):
+    """Return the first non-visual properties attribute exposed by ``element``, or None."""
+    # Different shape types store non-visual props under different names:
+    # Picture: nvPicPr, Shape/Placeholder: nvSpPr, Group: nvGrpSpPr,
+    # GraphicFrame: nvGraphicFramePr, Connector: nvCxnSpPr
+    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
+        if hasattr(element, attr):
+            return getattr(element, attr)
+    return None
+
+
+def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path, None] = None):
+    """
+    Updates the alt text of images in a PowerPoint presentation by matching
+    their content (SHA1 hash) with images in a source directory.
+
+    For every picture whose hash is found in the source directory, the alt
+    text (the ``descr`` attribute of the shape's ``cNvPr`` element) is set to
+    the value produced by ``image_alt_text_generator`` for the matching file.
+    Matches whose path is rejected by the generator are reported and skipped.
+
+    Args:
+        ppt_path (str/Path): Path to the PowerPoint file.
+        image_source_dir (str/Path): Directory containing source images to match against.
+        output_path (str/Path, optional): Path to save the updated presentation.
+                                          If None, overwrites the input file.
+
+    Returns:
+        None. Progress and errors are reported with ``print``. The
+        presentation is saved only when at least one alt text was changed,
+        so a separate ``output_path`` is not created when nothing changes.
+    """
+    if output_path is None:
+        output_path = ppt_path
+
+    # 1. Build lookup map of {sha1: file_path} from the source directory
+    image_hash_map = _build_image_hash_map(image_source_dir)
+
+    # 2. Open Presentation
+    try:
+        prs = Presentation(ppt_path)
+    except Exception as e:
+        print(f"Error opening presentation {ppt_path}: {e}")
+        return
+
+    updates_count = 0
+    slides = list(prs.slides)
+
+    print(f"Processing {len(slides)} slides...")
+
+    for slide_number, slide in enumerate(slides, start=1):
+        # Recursive iterator finds pictures inside groups/placeholders too.
+        for shape in _iter_picture_shapes(slide.shapes):
+            try:
+                # shape.image.sha1 returns the SHA1 hash of the image blob
+                current_sha1 = shape.image.sha1
+            except AttributeError:
+                # No readable image on this shape; nothing to match.
+                continue
+            except Exception as e:
+                print(f"Error processing shape on slide {slide_number}: {e}")
+                continue
+
+            original_path = image_hash_map.get(current_sha1)
+            if original_path is None:
+                continue
+
+            try:
+                # The generator expects a path starting with 'figures', so
+                # try to express the match relative to the working directory.
+                pass_path = _relative_to_cwd(original_path)
+                new_alt_text = image_alt_text_generator(pass_path)
+
+                nv_props = _find_non_visual_props(shape._element)
+                # Compare with None explicitly: truth-testing an lxml element
+                # reports whether it has children, not whether it exists.
+                if nv_props is None or not hasattr(nv_props, 'cNvPr'):
+                    print(f"Could not find cNvPr for shape on slide {slide_number}")
+                    continue
+
+                c_nv_pr = nv_props.cNvPr
+                existing_alt_text = c_nv_pr.get("descr", "")
+
+                # Only touch (and log) shapes whose alt text actually changes.
+                if existing_alt_text != new_alt_text:
+                    print(f"Slide {slide_number}: Updating alt text for image matches '{pass_path}'")
+                    print(f"  Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
+                    c_nv_pr.set("descr", new_alt_text)
+                    updates_count += 1
+            except AssertionError as e:
+                print(f"Skipping match for {original_path} due to generator error: {e}")
+            except Exception as e:
+                print(f"Error updating alt text for {original_path}: {e}")
+
+    if updates_count > 0:
+        prs.save(output_path)
+        print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
+    else:
+        print("No images matched or required updates.")
+
+
 def extract_voice_label(html_str: str) -> str:
    """
    Extract voice label from HTML string and convert to short format.