update alt-text with full filepaths
This commit is contained in:
@@ -26,7 +26,7 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
|
||||
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
|
||||
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
|
||||
TAG_IMAGE_DIR = Path('figures/debug')
|
||||
return TAG_IMAGE_DIR, TAG_SOURCE
|
||||
@@ -52,7 +52,7 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
|
||||
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
|
||||
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||
|
||||
NEW_IMAGES_DIR = Path('figures/debug')
|
||||
|
||||
159
utils.py
159
utils.py
@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
|
||||
yield shape
|
||||
|
||||
|
||||
def _set_shape_alt_text(shape, alt_text: str):
|
||||
"""
|
||||
Set alt text (descr attribute) for a PowerPoint shape.
|
||||
"""
|
||||
nvPr = None
|
||||
# Check for common property names used by python-pptx elements
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
nvPr.cNvPr.set("descr", alt_text)
|
||||
|
||||
|
||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
||||
"""
|
||||
Updates the alt text of images in a PowerPoint presentation by matching
|
||||
their content with images in a source directory.
|
||||
Updates the alt text of images in a PowerPoint presentation.
|
||||
|
||||
1. First pass: Validates existing alt-text format (<filter>/<filename>).
|
||||
- Fixes full paths by keeping only the last two parts.
|
||||
- Clears invalid alt-text.
|
||||
2. Second pass: If images are missing alt-text, matches them against source directory
|
||||
using perceptual hash or SHA1.
|
||||
|
||||
Args:
|
||||
ppt_path (str/Path): Path to the PowerPoint file.
|
||||
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
if output_path is None:
|
||||
output_path = ppt_path
|
||||
|
||||
# 1. Build lookup map of {hash: file_path} from the source directory
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
# 2. Open Presentation
|
||||
# Open Presentation
|
||||
try:
|
||||
prs = Presentation(ppt_path)
|
||||
except Exception as e:
|
||||
@@ -441,24 +458,84 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
return
|
||||
|
||||
updates_count = 0
|
||||
unmatched_images = [] # Collect unmatched images to report at the end
|
||||
images_needing_match = []
|
||||
|
||||
slides = list(prs.slides)
|
||||
total_slides = len(slides)
|
||||
|
||||
print(f"Processing {total_slides} slides...")
|
||||
print(f"Scanning {total_slides} slides for existing alt-text...")
|
||||
|
||||
# Pass 1: Scan and clean existing alt-text
|
||||
for i, slide in enumerate(slides):
|
||||
# Use recursive iterator to find all pictures including those in groups/placeholders
|
||||
picture_shapes = list(_iter_picture_shapes(slide.shapes))
|
||||
|
||||
for shape in picture_shapes:
|
||||
alt_text = _get_shape_alt_text(shape)
|
||||
has_valid_alt = False
|
||||
|
||||
if alt_text:
|
||||
# Handle potential path separators and whitespace
|
||||
clean_alt = alt_text.strip().replace('\\', '/')
|
||||
parts = clean_alt.split('/')
|
||||
|
||||
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
|
||||
if len(parts) >= 2:
|
||||
# Enforce format: keep last 2 parts (e.g. filter/image.png)
|
||||
new_alt = '/'.join(parts[-2:])
|
||||
|
||||
if new_alt != alt_text:
|
||||
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
|
||||
_set_shape_alt_text(shape, new_alt)
|
||||
updates_count += 1
|
||||
|
||||
has_valid_alt = True
|
||||
else:
|
||||
# User requested deleting other cases that do not meet format
|
||||
# If it's single word or doesn't look like our path format
|
||||
pass # logic below handles this
|
||||
|
||||
if not has_valid_alt:
|
||||
if alt_text:
|
||||
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
|
||||
_set_shape_alt_text(shape, "")
|
||||
updates_count += 1
|
||||
|
||||
# Queue for hash matching
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
images_needing_match.append({
|
||||
'slide_idx': i, # 0-based
|
||||
'slide_num': i+1,
|
||||
'shape': shape,
|
||||
'shape_name': shape_name
|
||||
})
|
||||
|
||||
if not images_needing_match:
|
||||
print("\nAll images have valid alt-text format. No hash matching needed.")
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("Presentation is up to date.")
|
||||
return
|
||||
|
||||
# Pass 2: Hash Matching
|
||||
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
|
||||
|
||||
# Build lookup map of {hash: file_path} only if needed
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
unmatched_images = []
|
||||
|
||||
for item in images_needing_match:
|
||||
shape = item['shape']
|
||||
slide_num = item['slide_num']
|
||||
|
||||
try:
|
||||
# Get image hash based on selected method
|
||||
# Get image hash
|
||||
if use_perceptual_hash:
|
||||
# Use perceptual hash of the image blob for visual content matching
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
# Use SHA1 hash from python-pptx (exact byte match)
|
||||
current_hash = shape.image.sha1
|
||||
|
||||
if current_hash in image_hash_map:
|
||||
@@ -466,7 +543,6 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
# Prepare path for generator.
|
||||
# Try to relativize to CWD if capable
|
||||
pass_path = original_path
|
||||
try:
|
||||
@@ -476,75 +552,38 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
|
||||
new_alt_text = image_alt_text_generator(pass_path)
|
||||
|
||||
# Check existing alt text to avoid redundant updates/log them
|
||||
# Accessing alt text via cNvPr
|
||||
# Note: Different shape types might store non-visual props differently
|
||||
# Picture: nvPicPr.cNvPr
|
||||
# GraphicFrame: nvGraphicFramePr.cNvPr
|
||||
# Group: nvGrpSpPr.cNvPr
|
||||
# Shape/Placeholder: nvSpPr.cNvPr
|
||||
|
||||
nvPr = None
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
cNvPr = nvPr.cNvPr
|
||||
existing_alt_text = cNvPr.get("descr", "")
|
||||
|
||||
if existing_alt_text != new_alt_text:
|
||||
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
|
||||
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
|
||||
cNvPr.set("descr", new_alt_text)
|
||||
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
|
||||
_set_shape_alt_text(shape, new_alt_text)
|
||||
updates_count += 1
|
||||
else:
|
||||
print(f"Could not find cNvPr for shape on slide {i+1}")
|
||||
|
||||
except AssertionError as e:
|
||||
print(f"Skipping match for {original_path} due to generator error: {e}")
|
||||
except Exception as e:
|
||||
print(f"Error updating alt text for {original_path}: {e}")
|
||||
|
||||
print(f"Error generating alt text for {original_path}: {e}")
|
||||
else:
|
||||
# Check if image already has alt text set - if so, skip reporting as unmatched
|
||||
existing_alt = _get_shape_alt_text(shape)
|
||||
if existing_alt:
|
||||
# Image already has alt text, no need to report as unmatched
|
||||
continue
|
||||
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
|
||||
unmatched_images.append({
|
||||
'slide': i+1,
|
||||
'shape_name': shape_name,
|
||||
'slide': slide_num,
|
||||
'shape_name': item['shape_name'],
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except AttributeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {i+1}: {e}")
|
||||
print(f"Error processing shape on slide {slide_num}: {e}")
|
||||
|
||||
# Print summary
|
||||
# Save and Print Summary
|
||||
print("\n" + "="*80)
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("No images matched or required updates.")
|
||||
print("No matches found for missing images.")
|
||||
|
||||
# List unmatched images at the end
|
||||
if unmatched_images:
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:")
|
||||
for img in unmatched_images:
|
||||
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
||||
else:
|
||||
print("\n✓ All images matched successfully!")
|
||||
print("\n✓ All images processed successfully!")
|
||||
print("="*80)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user