fixed plot alt-text-tag function
This commit is contained in:
105
utils.py
105
utils.py
@@ -6,6 +6,11 @@ import json
|
||||
import re
|
||||
import hashlib
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import imagehash
|
||||
from PIL import Image
|
||||
|
||||
from plots import JPMCPlotsMixin
|
||||
|
||||
|
||||
@@ -124,16 +129,54 @@ def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
|
||||
return sha1.hexdigest()
|
||||
|
||||
|
||||
def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
|
||||
def _calculate_perceptual_hash(image_source: Union[str, Path, bytes]) -> str:
    """
    Calculate perceptual hash of an image based on visual content.

    Uses pHash (perceptual hash) which is robust against:
    - Metadata differences
    - Minor compression differences
    - Small color/contrast variations

    Args:
        image_source: File path to image or raw image bytes.

    Returns:
        str: Hexadecimal string representation of the perceptual hash.

    Raises:
        Any exception raised by Pillow while opening or decoding the image
        propagates to the caller unchanged (nothing is caught here).
    """
    # Raw image data is wrapped in an in-memory buffer so that Pillow can read
    # it like a file; anything else is handed to Pillow as a path.
    if isinstance(image_source, (bytes, bytearray)):
        image_source = BytesIO(image_source)

    # The context manager guarantees the underlying file handle is released
    # even when hashing fails; this matters because callers hash every image
    # found while walking a directory tree.
    with Image.open(image_source) as opened:
        # Convert to RGB if necessary (handles RGBA, P mode, etc.)
        if opened.mode in ('RGB', 'L'):
            img = opened
        else:
            img = opened.convert('RGB')

        # Use pHash (perceptual hash) - robust against minor differences.
        # Hashing happens inside the `with` block so the pixel data is read
        # while the source is still open.
        return str(imagehash.phash(img))
|
||||
|
||||
|
||||
def _build_image_hash_map(root_dir: Union[str, Path], use_perceptual_hash: bool = True) -> dict:
|
||||
"""
|
||||
Recursively walk the directory and build a map of image hashes to file paths.
|
||||
Only includes common image extensions.
|
||||
|
||||
Args:
|
||||
root_dir: Root directory to scan for images.
|
||||
use_perceptual_hash: If True, uses perceptual hashing (robust against metadata
|
||||
differences). If False, uses SHA1 byte hashing (exact match only).
|
||||
|
||||
Returns:
|
||||
dict: Mapping of hash strings to file paths.
|
||||
"""
|
||||
hash_map = {}
|
||||
valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
|
||||
|
||||
root = Path(root_dir)
|
||||
print(f"Building image hash map from {root}...")
|
||||
hash_type = "perceptual" if use_perceptual_hash else "SHA1"
|
||||
print(f"Building image hash map from {root} using {hash_type} hashing...")
|
||||
|
||||
count = 0
|
||||
for root_path, dirs, files in os.walk(root):
|
||||
@@ -141,9 +184,12 @@ def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
|
||||
file_path = Path(root_path) / file
|
||||
if file_path.suffix.lower() in valid_extensions:
|
||||
try:
|
||||
file_sha1 = _calculate_file_sha1(file_path)
|
||||
if use_perceptual_hash:
|
||||
file_hash = _calculate_perceptual_hash(file_path)
|
||||
else:
|
||||
file_hash = _calculate_file_sha1(file_path)
|
||||
# We store the absolute path for reference, but we might just need the path relative to project for alt text
|
||||
hash_map[file_sha1] = file_path
|
||||
hash_map[file_hash] = file_path
|
||||
count += 1
|
||||
except Exception as e:
|
||||
print(f"Error hashing {file_path}: {e}")
|
||||
@@ -168,22 +214,25 @@ def _iter_picture_shapes(shapes):
|
||||
yield shape
|
||||
|
||||
|
||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None):
|
||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
||||
"""
|
||||
Updates the alt text of images in a PowerPoint presentation by matching
|
||||
their content (SHA1 hash) with images in a source directory.
|
||||
their content with images in a source directory.
|
||||
|
||||
Args:
|
||||
ppt_path (str/Path): Path to the PowerPoint file.
|
||||
image_source_dir (str/Path): Directory containing source images to match against.
|
||||
output_path (str/Path, optional): Path to save the updated presentation.
|
||||
If None, overwrites the input file.
|
||||
use_perceptual_hash (bool): If True (default), uses perceptual hashing which
|
||||
matches images based on visual content (robust against metadata differences,
|
||||
re-compression, etc.). If False, uses SHA1 byte hashing (exact file match only).
|
||||
"""
|
||||
if output_path is None:
|
||||
output_path = ppt_path
|
||||
|
||||
# 1. Build lookup map of {sha1: file_path} from the source directory
|
||||
image_hash_map = _build_image_hash_map(image_source_dir)
|
||||
# 1. Build lookup map of {hash: file_path} from the source directory
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
# 2. Open Presentation
|
||||
try:
|
||||
@@ -193,6 +242,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
return
|
||||
|
||||
updates_count = 0
|
||||
unmatched_images = [] # Collect unmatched images to report at the end
|
||||
slides = list(prs.slides)
|
||||
total_slides = len(slides)
|
||||
|
||||
@@ -204,11 +254,16 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
|
||||
for shape in picture_shapes:
|
||||
try:
|
||||
# shape.image.sha1 returns the SHA1 hash of the image blob
|
||||
current_sha1 = shape.image.sha1
|
||||
# Get image hash based on selected method
|
||||
if use_perceptual_hash:
|
||||
# Use perceptual hash of the image blob for visual content matching
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
# Use SHA1 hash from python-pptx (exact byte match)
|
||||
current_hash = shape.image.sha1
|
||||
|
||||
if current_sha1 in image_hash_map:
|
||||
original_path = image_hash_map[current_sha1]
|
||||
if current_hash in image_hash_map:
|
||||
original_path = image_hash_map[current_hash]
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
@@ -252,17 +307,39 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
print(f"Skipping match for {original_path} due to generator error: {e}")
|
||||
except Exception as e:
|
||||
print(f"Error updating alt text for {original_path}: {e}")
|
||||
|
||||
else:
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
unmatched_images.append({
|
||||
'slide': i+1,
|
||||
'shape_name': shape_name,
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except AttributeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {i+1}: {e}")
|
||||
|
||||
# Print summary
|
||||
print("\n" + "="*80)
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("No images matched or required updates.")
|
||||
|
||||
# List unmatched images at the end
|
||||
if unmatched_images:
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
|
||||
for img in unmatched_images:
|
||||
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
||||
else:
|
||||
print("\n✓ All images matched successfully!")
|
||||
print("="*80)
|
||||
|
||||
|
||||
def extract_voice_label(html_str: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user