update alt-text with full filepaths

This commit is contained in:
2026-02-04 17:48:48 +01:00
parent d231fc02db
commit f0eab32c34
2 changed files with 128 additions and 89 deletions

View File

@@ -26,7 +26,7 @@ def _():
@app.cell
def _():
TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE
@@ -52,7 +52,7 @@ def _():
@app.cell
def _():
REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/debug')

159
utils.py
View File

@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
yield shape
def _set_shape_alt_text(shape, alt_text: str):
    """
    Set alt text (descr attribute) for a PowerPoint shape.

    Looks on ``shape._element`` for the first of the known non-visual
    property containers and writes ``alt_text`` to the ``descr`` attribute
    of its ``cNvPr`` child. Does nothing when no container is found or the
    container exposes no ``cNvPr``.

    Args:
        shape: Object exposing its underlying XML element as ``_element``.
        alt_text (str): Text to store; an empty string is written as-is.
    """
    nvPr = None
    # Check for common property names used by python-pptx elements
    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
        if hasattr(shape._element, attr):
            nvPr = getattr(shape._element, attr)
            break

    # Compare against None explicitly: the truth value of an XML element
    # reflects whether it has children (and truth-testing elements is
    # deprecated), so a plain `if nvPr` could skip a container that exists.
    if nvPr is not None and hasattr(nvPr, 'cNvPr'):
        nvPr.cNvPr.set("descr", alt_text)
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
"""
Updates the alt text of images in a PowerPoint presentation by matching
their content with images in a source directory.
Updates the alt text of images in a PowerPoint presentation.
1. First pass: Validates existing alt-text format (<filter>/<filename>).
- Fixes full paths by keeping only the last two parts.
- Clears invalid alt-text.
2. Second pass: If images are missing alt-text, matches them against source directory
using perceptual hash or SHA1.
Args:
ppt_path (str/Path): Path to the PowerPoint file.
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
if output_path is None:
output_path = ppt_path
# 1. Build lookup map of {hash: file_path} from the source directory
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
# 2. Open Presentation
# Open Presentation
try:
prs = Presentation(ppt_path)
except Exception as e:
@@ -441,24 +458,84 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
return
updates_count = 0
unmatched_images = [] # Collect unmatched images to report at the end
images_needing_match = []
slides = list(prs.slides)
total_slides = len(slides)
print(f"Processing {total_slides} slides...")
print(f"Scanning {total_slides} slides for existing alt-text...")
# Pass 1: Scan and clean existing alt-text
for i, slide in enumerate(slides):
# Use recursive iterator to find all pictures including those in groups/placeholders
picture_shapes = list(_iter_picture_shapes(slide.shapes))
for shape in picture_shapes:
alt_text = _get_shape_alt_text(shape)
has_valid_alt = False
if alt_text:
# Handle potential path separators and whitespace
clean_alt = alt_text.strip().replace('\\', '/')
parts = clean_alt.split('/')
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
if len(parts) >= 2:
# Enforce format: keep last 2 parts (e.g. filter/image.png)
new_alt = '/'.join(parts[-2:])
if new_alt != alt_text:
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
_set_shape_alt_text(shape, new_alt)
updates_count += 1
has_valid_alt = True
else:
# User requested deleting other cases that do not meet format
# If it's single word or doesn't look like our path format
pass # logic below handles this
if not has_valid_alt:
if alt_text:
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
_set_shape_alt_text(shape, "")
updates_count += 1
# Queue for hash matching
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
images_needing_match.append({
'slide_idx': i, # 0-based
'slide_num': i+1,
'shape': shape,
'shape_name': shape_name
})
if not images_needing_match:
print("\nAll images have valid alt-text format. No hash matching needed.")
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("Presentation is up to date.")
return
# Pass 2: Hash Matching
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
# Build lookup map of {hash: file_path} only if needed
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
unmatched_images = []
for item in images_needing_match:
shape = item['shape']
slide_num = item['slide_num']
try:
# Get image hash based on selected method
# Get image hash
if use_perceptual_hash:
# Use perceptual hash of the image blob for visual content matching
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
# Use SHA1 hash from python-pptx (exact byte match)
current_hash = shape.image.sha1
if current_hash in image_hash_map:
@@ -466,7 +543,6 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
# Generate Alt Text
try:
# Prepare path for generator.
# Try to relativize to CWD if capable
pass_path = original_path
try:
@@ -476,75 +552,38 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
new_alt_text = image_alt_text_generator(pass_path)
# Check existing alt text to avoid redundant updates/log them
# Accessing alt text via cNvPr
# Note: Different shape types might store non-visual props differently
# Picture: nvPicPr.cNvPr
# GraphicFrame: nvGraphicFramePr.cNvPr
# Group: nvGrpSpPr.cNvPr
# Shape/Placeholder: nvSpPr.cNvPr
nvPr = None
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
if hasattr(shape._element, attr):
nvPr = getattr(shape._element, attr)
break
if nvPr and hasattr(nvPr, 'cNvPr'):
cNvPr = nvPr.cNvPr
existing_alt_text = cNvPr.get("descr", "")
if existing_alt_text != new_alt_text:
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
cNvPr.set("descr", new_alt_text)
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
_set_shape_alt_text(shape, new_alt_text)
updates_count += 1
else:
print(f"Could not find cNvPr for shape on slide {i+1}")
except AssertionError as e:
print(f"Skipping match for {original_path} due to generator error: {e}")
except Exception as e:
print(f"Error updating alt text for {original_path}: {e}")
print(f"Error generating alt text for {original_path}: {e}")
else:
# Check if image already has alt text set - if so, skip reporting as unmatched
existing_alt = _get_shape_alt_text(shape)
if existing_alt:
# Image already has alt text, no need to report as unmatched
continue
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': i+1,
'shape_name': shape_name,
'slide': slide_num,
'shape_name': item['shape_name'],
'hash_type': hash_type,
'hash': current_hash
})
except AttributeError:
continue
except Exception as e:
print(f"Error processing shape on slide {i+1}: {e}")
print(f"Error processing shape on slide {slide_num}: {e}")
# Print summary
# Save and Print Summary
print("\n" + "="*80)
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("No images matched or required updates.")
print("No matches found for missing images.")
# List unmatched images at the end
if unmatched_images:
print(f"\n{len(unmatched_images)} image(s) not found in source directory:")
print(f"\n{len(unmatched_images)} image(s) could not be matched:")
for img in unmatched_images:
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
else:
print("\n✓ All images matched successfully!")
print("\n✓ All images processed successfully!")
print("="*80)