update alt-text with full filepaths

This commit is contained in:
2026-02-04 17:48:48 +01:00
parent d231fc02db
commit f0eab32c34
2 changed files with 128 additions and 89 deletions

View File

@@ -26,7 +26,7 @@ def _():
@app.cell @app.cell
def _(): def _():
TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx') TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx') # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/debug') TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE return TAG_IMAGE_DIR, TAG_SOURCE
@@ -52,7 +52,7 @@ def _():
@app.cell @app.cell
def _(): def _():
REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx') REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx') # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/debug') NEW_IMAGES_DIR = Path('figures/debug')

205
utils.py
View File

@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
yield shape yield shape
def _set_shape_alt_text(shape, alt_text: str):
    """
    Set alt text (descr attribute) for a PowerPoint shape.

    The first non-visual-properties container found on ``shape._element``
    (among the attribute names listed below) is used, and ``alt_text`` is
    written to the ``descr`` attribute of its ``cNvPr`` child. If no container
    is found, or the container exposes no ``cNvPr``, the call is a no-op.

    Args:
        shape: Object exposing its underlying XML element as ``_element``.
        alt_text (str): Text to store in ``descr``; an empty string is written
            as-is.
    """
    nv_props = None
    # Check for common property names used by python-pptx elements
    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
        if hasattr(shape._element, attr):
            nv_props = getattr(shape._element, attr)
            break

    # Compare with None explicitly instead of truth-testing the container:
    # lxml-style elements derive truthiness from their child count (and lxml
    # deprecates that test), so `if nv_props and ...` could skip a valid element.
    if nv_props is not None and hasattr(nv_props, 'cNvPr'):
        nv_props.cNvPr.set("descr", alt_text)
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True): def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
""" """
Updates the alt text of images in a PowerPoint presentation by matching Updates the alt text of images in a PowerPoint presentation.
their content with images in a source directory.
1. First pass: Validates existing alt-text format (<filter>/<filename>).
- Fixes full paths by keeping only the last two parts.
- Clears invalid alt-text.
2. Second pass: If images are missing alt-text, matches them against source directory
using perceptual hash or SHA1.
Args: Args:
ppt_path (str/Path): Path to the PowerPoint file. ppt_path (str/Path): Path to the PowerPoint file.
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
if output_path is None: if output_path is None:
output_path = ppt_path output_path = ppt_path
# 1. Build lookup map of {hash: file_path} from the source directory # Open Presentation
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
# 2. Open Presentation
try: try:
prs = Presentation(ppt_path) prs = Presentation(ppt_path)
except Exception as e: except Exception as e:
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
return return
updates_count = 0 updates_count = 0
unmatched_images = [] # Collect unmatched images to report at the end images_needing_match = []
slides = list(prs.slides) slides = list(prs.slides)
total_slides = len(slides) total_slides = len(slides)
print(f"Processing {total_slides} slides...") print(f"Scanning {total_slides} slides for existing alt-text...")
# Pass 1: Scan and clean existing alt-text
for i, slide in enumerate(slides): for i, slide in enumerate(slides):
# Use recursive iterator to find all pictures including those in groups/placeholders
picture_shapes = list(_iter_picture_shapes(slide.shapes)) picture_shapes = list(_iter_picture_shapes(slide.shapes))
for shape in picture_shapes: for shape in picture_shapes:
try: alt_text = _get_shape_alt_text(shape)
# Get image hash based on selected method has_valid_alt = False
if use_perceptual_hash:
# Use perceptual hash of the image blob for visual content matching if alt_text:
current_hash = _calculate_perceptual_hash(shape.image.blob) # Handle potential path separators and whitespace
clean_alt = alt_text.strip().replace('\\', '/')
parts = clean_alt.split('/')
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
if len(parts) >= 2:
# Enforce format: keep last 2 parts (e.g. filter/image.png)
new_alt = '/'.join(parts[-2:])
if new_alt != alt_text:
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
_set_shape_alt_text(shape, new_alt)
updates_count += 1
has_valid_alt = True
else: else:
# Use SHA1 hash from python-pptx (exact byte match) # User requested deleting other cases that do not meet format
current_hash = shape.image.sha1 # If it's single word or doesn't look like our path format
pass # logic below handles this
if current_hash in image_hash_map: if not has_valid_alt:
original_path = image_hash_map[current_hash] if alt_text:
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
_set_shape_alt_text(shape, "")
updates_count += 1
# Generate Alt Text # Queue for hash matching
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
images_needing_match.append({
'slide_idx': i, # 0-based
'slide_num': i+1,
'shape': shape,
'shape_name': shape_name
})
if not images_needing_match:
print("\nAll images have valid alt-text format. No hash matching needed.")
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("Presentation is up to date.")
return
# Pass 2: Hash Matching
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
# Build lookup map of {hash: file_path} only if needed
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
unmatched_images = []
for item in images_needing_match:
shape = item['shape']
slide_num = item['slide_num']
try:
# Get image hash
if use_perceptual_hash:
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
current_hash = shape.image.sha1
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Generate Alt Text
try:
# Try to relativize to CWD if capable
pass_path = original_path
try: try:
# Prepare path for generator. pass_path = original_path.relative_to(Path.cwd())
# Try to relativize to CWD if capable except ValueError:
pass_path = original_path pass
try:
pass_path = original_path.relative_to(Path.cwd())
except ValueError:
pass
new_alt_text = image_alt_text_generator(pass_path) new_alt_text = image_alt_text_generator(pass_path)
# Check existing alt text to avoid redundant updates/log them print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
# Accessing alt text via cNvPr _set_shape_alt_text(shape, new_alt_text)
# Note: Different shape types might store non-visual props differently updates_count += 1
# Picture: nvPicPr.cNvPr
# GraphicFrame: nvGraphicFramePr.cNvPr
# Group: nvGrpSpPr.cNvPr
# Shape/Placeholder: nvSpPr.cNvPr
nvPr = None except Exception as e:
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']: print(f"Error generating alt text for {original_path}: {e}")
if hasattr(shape._element, attr): else:
nvPr = getattr(shape._element, attr) hash_type = "pHash" if use_perceptual_hash else "SHA1"
break unmatched_images.append({
'slide': slide_num,
'shape_name': item['shape_name'],
'hash_type': hash_type,
'hash': current_hash
})
if nvPr and hasattr(nvPr, 'cNvPr'): except Exception as e:
cNvPr = nvPr.cNvPr print(f"Error processing shape on slide {slide_num}: {e}")
existing_alt_text = cNvPr.get("descr", "")
if existing_alt_text != new_alt_text: # Save and Print Summary
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
cNvPr.set("descr", new_alt_text)
updates_count += 1
else:
print(f"Could not find cNvPr for shape on slide {i+1}")
except AssertionError as e:
print(f"Skipping match for {original_path} due to generator error: {e}")
except Exception as e:
print(f"Error updating alt text for {original_path}: {e}")
else:
# Check if image already has alt text set - if so, skip reporting as unmatched
existing_alt = _get_shape_alt_text(shape)
if existing_alt:
# Image already has alt text, no need to report as unmatched
continue
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': i+1,
'shape_name': shape_name,
'hash_type': hash_type,
'hash': current_hash
})
except AttributeError:
continue
except Exception as e:
print(f"Error processing shape on slide {i+1}: {e}")
# Print summary
print("\n" + "="*80) print("\n" + "="*80)
if updates_count > 0: if updates_count > 0:
prs.save(output_path) prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.") print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else: else:
print("No images matched or required updates.") print("No matches found for missing images.")
# List unmatched images at the end
if unmatched_images: if unmatched_images:
print(f"\n{len(unmatched_images)} image(s) not found in source directory:") print(f"\n{len(unmatched_images)} image(s) could not be matched:")
for img in unmatched_images: for img in unmatched_images:
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})") print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
else: else:
print("\n✓ All images matched successfully!") print("\n✓ All images processed successfully!")
print("="*80) print("="*80)