Fix plot alt-text tagging: match images by perceptual hash and report unmatched images

2026-02-02 17:07:44 +01:00
parent 45dd121d90
commit 02a0214539
5 changed files with 244 additions and 15 deletions
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,11 @@ import json
 import re
 import hashlib
 import os
+from io import BytesIO
+
+import imagehash
+from PIL import Image
+
 from plots import JPMCPlotsMixin


@@ -124,16 +129,54 @@ def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
    return sha1.hexdigest()


-def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
+def _calculate_perceptual_hash(image_source: Union[str, Path, bytes]) -> str:
    """
-    Recursively walk the directory and build a map of SHA1 hashes to file paths.
+    Calculate perceptual hash of an image based on visual content.
+    
+    Uses pHash (perceptual hash) which is robust against:
+    - Metadata differences
+    - Minor compression differences
+    - Small color/contrast variations
+    
+    Args:
+        image_source: File path to image or raw image bytes.
+        
+    Returns:
+        str: Hexadecimal string representation of the perceptual hash.
+    """
+    if isinstance(image_source, bytes):
+        img = Image.open(BytesIO(image_source))
+    else:
+        img = Image.open(image_source)
+    
+    # Convert to RGB if necessary (handles RGBA, P mode, etc.)
+    if img.mode not in ('RGB', 'L'):
+        img = img.convert('RGB')
+    
+    # Use pHash (perceptual hash) - robust against minor differences
+    phash = imagehash.phash(img)
+    return str(phash)
+
+
+def _build_image_hash_map(root_dir: Union[str, Path], use_perceptual_hash: bool = True) -> dict:
+    """
+    Recursively walk the directory and build a map of image hashes to file paths.
    Only includes common image extensions.
+    
+    Args:
+        root_dir: Root directory to scan for images.
+        use_perceptual_hash: If True, uses perceptual hashing (robust against metadata
+            differences). If False, uses SHA1 byte hashing (exact match only).
+    
+    Returns:
+        dict: Mapping of hash strings to file paths (if several files share a hash, the last one walked wins).
    """
    hash_map = {}
    valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
    
    root = Path(root_dir)
-    print(f"Building image hash map from {root}...")
+    hash_type = "perceptual" if use_perceptual_hash else "SHA1"
+    print(f"Building image hash map from {root} using {hash_type} hashing...")
    
    count = 0
    for root_path, dirs, files in os.walk(root):
@@ -141,9 +184,12 @@ def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
            file_path = Path(root_path) / file
            if file_path.suffix.lower() in valid_extensions:
                try:
-                    file_sha1 = _calculate_file_sha1(file_path)
+                    if use_perceptual_hash:
+                        file_hash = _calculate_perceptual_hash(file_path)
+                    else:
+                        file_hash = _calculate_file_sha1(file_path)
                    # We store the absolute path for reference, but we might just need the path relative to project for alt text
-                    hash_map[file_sha1] = file_path
+                    hash_map[file_hash] = file_path
                    count += 1
                except Exception as e:
                    print(f"Error hashing {file_path}: {e}")
@@ -168,22 +214,25 @@ def _iter_picture_shapes(shapes):
            yield shape


-def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None):
+def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
    """
    Updates the alt text of images in a PowerPoint presentation by matching
-    their content (SHA1 hash) with images in a source directory.
+    their content with images in a source directory.

    Args:
        ppt_path (str/Path): Path to the PowerPoint file.
        image_source_dir (str/Path): Directory containing source images to match against.
        output_path (str/Path, optional): Path to save the updated presentation. 
                                          If None, overwrites the input file.
+        use_perceptual_hash (bool): If True (default), matches images by the perceptual hash
+            (pHash) of their decoded content, so metadata differences are ignored; lookup is still
+            exact hash equality. If False, uses SHA1 byte hashing (exact file match only).
    """
    if output_path is None:
        output_path = ppt_path

-    # 1. Build lookup map of {sha1: file_path} from the source directory
-    image_hash_map = _build_image_hash_map(image_source_dir)
+    # 1. Build lookup map of {hash: file_path} from the source directory
+    image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)

    # 2. Open Presentation
    try:
@@ -193,6 +242,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
        return

    updates_count = 0
+    unmatched_images = []  # Collect unmatched images to report at the end
    slides = list(prs.slides)
    total_slides = len(slides)

@@ -204,11 +254,16 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
        
        for shape in picture_shapes:
            try:
-                # shape.image.sha1 returns the SHA1 hash of the image blob
-                current_sha1 = shape.image.sha1
+                # Get image hash based on selected method
+                if use_perceptual_hash:
+                    # Use perceptual hash of the image blob for visual content matching
+                    current_hash = _calculate_perceptual_hash(shape.image.blob)
+                else:
+                    # Use SHA1 hash from python-pptx (exact byte match)
+                    current_hash = shape.image.sha1
                
-                if current_sha1 in image_hash_map:
-                    original_path = image_hash_map[current_sha1]
+                if current_hash in image_hash_map:
+                    original_path = image_hash_map[current_hash]
                    
                    # Generate Alt Text
                    try:
@@ -252,17 +307,39 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
                        print(f"Skipping match for {original_path} due to generator error: {e}")
                    except Exception as e:
                        print(f"Error updating alt text for {original_path}: {e}")
+                
+                else:
+                    shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
+                    shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
+                    hash_type = "pHash" if use_perceptual_hash else "SHA1"
+                    unmatched_images.append({
+                        'slide': i+1,
+                        'shape_name': shape_name,
+                        'hash_type': hash_type,
+                        'hash': current_hash
+                    })
                            
            except AttributeError:
                continue
            except Exception as e:
                print(f"Error processing shape on slide {i+1}: {e}")

+    # Print summary
+    print("\n" + "="*80)
    if updates_count > 0:
        prs.save(output_path)
-        print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
+        print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
    else:
        print("No images matched or required updates.")
+    
+    # List unmatched images at the end
+    if unmatched_images:
+        print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
+        for img in unmatched_images:
+            print(f"  • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
+    else:
+        print("\n✓ All images matched successfully!")
+    print("="*80)


 def extract_voice_label(html_str: str) -> str: