fixed plot alt-text-tag function

This commit is contained in:
2026-02-02 17:07:44 +01:00
parent 45dd121d90
commit 02a0214539
5 changed files with 244 additions and 15 deletions

105
utils.py
View File

@@ -6,6 +6,11 @@ import json
import re
import hashlib
import os
from io import BytesIO
import imagehash
from PIL import Image
from plots import JPMCPlotsMixin
@@ -124,16 +129,54 @@ def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
return sha1.hexdigest()
def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
def _calculate_perceptual_hash(image_source: Union[str, Path, bytes]) -> str:
    """
    Calculate the perceptual hash (pHash) of an image based on its visual content.

    Unlike a byte-level digest, pHash is intended to be robust against:
    - Metadata differences
    - Minor compression differences
    - Small color/contrast variations

    Args:
        image_source: File path to an image, or the raw encoded image data
            (``bytes``, ``bytearray`` or ``memoryview``).

    Returns:
        str: Hexadecimal string representation of the perceptual hash.

    Raises:
        Nothing is caught here: any exception raised while opening, converting
        or hashing the image propagates to the caller.
    """
    if isinstance(image_source, (bytes, bytearray, memoryview)):
        # Raw blob (e.g. an image embedded in a presentation): expose it to
        # Pillow through an in-memory file object.
        source = BytesIO(image_source)
    else:
        source = image_source

    # Image.open() keeps the underlying file open until the image is closed.
    # The context manager releases the handle deterministically, even when
    # conversion or hashing raises, so scanning large directories does not
    # accumulate open file descriptors.
    with Image.open(source) as img:
        # Convert to RGB if necessary (handles RGBA, P mode, etc.)
        if img.mode in ('RGB', 'L'):
            hash_input = img
        else:
            hash_input = img.convert('RGB')

        # pHash must be computed while the source image is still open, because
        # the pixel data is loaded lazily.
        return str(imagehash.phash(hash_input))
def _build_image_hash_map(root_dir: Union[str, Path], use_perceptual_hash: bool = True) -> dict:
"""
Recursively walk the directory and build a map of image hashes to file paths.
Only includes common image extensions.
Args:
root_dir: Root directory to scan for images.
use_perceptual_hash: If True, uses perceptual hashing (robust against metadata
differences). If False, uses SHA1 byte hashing (exact match only).
Returns:
dict: Mapping of hash strings to file paths.
"""
hash_map = {}
valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
root = Path(root_dir)
print(f"Building image hash map from {root}...")
hash_type = "perceptual" if use_perceptual_hash else "SHA1"
print(f"Building image hash map from {root} using {hash_type} hashing...")
count = 0
for root_path, dirs, files in os.walk(root):
@@ -141,9 +184,12 @@ def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
file_path = Path(root_path) / file
if file_path.suffix.lower() in valid_extensions:
try:
file_sha1 = _calculate_file_sha1(file_path)
if use_perceptual_hash:
file_hash = _calculate_perceptual_hash(file_path)
else:
file_hash = _calculate_file_sha1(file_path)
# We store the absolute path for reference, but we might just need the path relative to project for alt text
hash_map[file_sha1] = file_path
hash_map[file_hash] = file_path
count += 1
except Exception as e:
print(f"Error hashing {file_path}: {e}")
@@ -168,22 +214,25 @@ def _iter_picture_shapes(shapes):
yield shape
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None):
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
"""
Updates the alt text of images in a PowerPoint presentation by matching
their content (SHA1 hash) with images in a source directory.
their content with images in a source directory.
Args:
ppt_path (str/Path): Path to the PowerPoint file.
image_source_dir (str/Path): Directory containing source images to match against.
output_path (str/Path, optional): Path to save the updated presentation.
If None, overwrites the input file.
use_perceptual_hash (bool): If True (default), uses perceptual hashing which
matches images based on visual content (robust against metadata differences,
re-compression, etc.). If False, uses SHA1 byte hashing (exact file match only).
"""
if output_path is None:
output_path = ppt_path
# 1. Build lookup map of {sha1: file_path} from the source directory
image_hash_map = _build_image_hash_map(image_source_dir)
# 1. Build lookup map of {hash: file_path} from the source directory
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
# 2. Open Presentation
try:
@@ -193,6 +242,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
return
updates_count = 0
unmatched_images = [] # Collect unmatched images to report at the end
slides = list(prs.slides)
total_slides = len(slides)
@@ -204,11 +254,16 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
for shape in picture_shapes:
try:
# shape.image.sha1 returns the SHA1 hash of the image blob
current_sha1 = shape.image.sha1
# Get image hash based on selected method
if use_perceptual_hash:
# Use perceptual hash of the image blob for visual content matching
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
# Use SHA1 hash from python-pptx (exact byte match)
current_hash = shape.image.sha1
if current_sha1 in image_hash_map:
original_path = image_hash_map[current_sha1]
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Generate Alt Text
try:
@@ -252,17 +307,39 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
print(f"Skipping match for {original_path} due to generator error: {e}")
except Exception as e:
print(f"Error updating alt text for {original_path}: {e}")
else:
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': i+1,
'shape_name': shape_name,
'hash_type': hash_type,
'hash': current_hash
})
except AttributeError:
continue
except Exception as e:
print(f"Error processing shape on slide {i+1}: {e}")
# Print summary
print("\n" + "="*80)
if updates_count > 0:
prs.save(output_path)
print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("No images matched or required updates.")
# List unmatched images at the end
if unmatched_images:
print(f"\n{len(unmatched_images)} image(s) not found in source directory:")
for img in unmatched_images:
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
else:
print("\n✓ All images matched successfully!")
print("="*80)
def extract_voice_label(html_str: str) -> str: