update alt-text with full filepaths
This commit is contained in:
@@ -26,7 +26,7 @@ def _():
|
|||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _():
|
||||||
TAG_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
|
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
|
||||||
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
|
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
|
||||||
TAG_IMAGE_DIR = Path('figures/debug')
|
TAG_IMAGE_DIR = Path('figures/debug')
|
||||||
return TAG_IMAGE_DIR, TAG_SOURCE
|
return TAG_IMAGE_DIR, TAG_SOURCE
|
||||||
@@ -52,7 +52,7 @@ def _():
|
|||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _():
|
||||||
REPLACE_SOURCE = Path('/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/data/reports/VOICE_Perception-Research-Report_3-2-26.pptx')
|
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_15-30.pptx')
|
||||||
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||||
|
|
||||||
NEW_IMAGES_DIR = Path('figures/debug')
|
NEW_IMAGES_DIR = Path('figures/debug')
|
||||||
|
|||||||
159
utils.py
159
utils.py
@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
|
|||||||
yield shape
|
yield shape
|
||||||
|
|
||||||
|
|
||||||
|
def _set_shape_alt_text(shape, alt_text: str):
|
||||||
|
"""
|
||||||
|
Set alt text (descr attribute) for a PowerPoint shape.
|
||||||
|
"""
|
||||||
|
nvPr = None
|
||||||
|
# Check for common property names used by python-pptx elements
|
||||||
|
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||||
|
if hasattr(shape._element, attr):
|
||||||
|
nvPr = getattr(shape._element, attr)
|
||||||
|
break
|
||||||
|
|
||||||
|
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||||
|
nvPr.cNvPr.set("descr", alt_text)
|
||||||
|
|
||||||
|
|
||||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
||||||
"""
|
"""
|
||||||
Updates the alt text of images in a PowerPoint presentation by matching
|
Updates the alt text of images in a PowerPoint presentation.
|
||||||
their content with images in a source directory.
|
|
||||||
|
1. First pass: Validates existing alt-text format (<filter>/<filename>).
|
||||||
|
- Fixes full paths by keeping only the last two parts.
|
||||||
|
- Clears invalid alt-text.
|
||||||
|
2. Second pass: If images are missing alt-text, matches them against source directory
|
||||||
|
using perceptual hash or SHA1.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
ppt_path (str/Path): Path to the PowerPoint file.
|
ppt_path (str/Path): Path to the PowerPoint file.
|
||||||
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
|||||||
if output_path is None:
|
if output_path is None:
|
||||||
output_path = ppt_path
|
output_path = ppt_path
|
||||||
|
|
||||||
# 1. Build lookup map of {hash: file_path} from the source directory
|
# Open Presentation
|
||||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
|
||||||
|
|
||||||
# 2. Open Presentation
|
|
||||||
try:
|
try:
|
||||||
prs = Presentation(ppt_path)
|
prs = Presentation(ppt_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -441,24 +458,84 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
|||||||
return
|
return
|
||||||
|
|
||||||
updates_count = 0
|
updates_count = 0
|
||||||
unmatched_images = [] # Collect unmatched images to report at the end
|
images_needing_match = []
|
||||||
|
|
||||||
slides = list(prs.slides)
|
slides = list(prs.slides)
|
||||||
total_slides = len(slides)
|
total_slides = len(slides)
|
||||||
|
|
||||||
print(f"Processing {total_slides} slides...")
|
print(f"Scanning {total_slides} slides for existing alt-text...")
|
||||||
|
|
||||||
|
# Pass 1: Scan and clean existing alt-text
|
||||||
for i, slide in enumerate(slides):
|
for i, slide in enumerate(slides):
|
||||||
# Use recursive iterator to find all pictures including those in groups/placeholders
|
|
||||||
picture_shapes = list(_iter_picture_shapes(slide.shapes))
|
picture_shapes = list(_iter_picture_shapes(slide.shapes))
|
||||||
|
|
||||||
for shape in picture_shapes:
|
for shape in picture_shapes:
|
||||||
|
alt_text = _get_shape_alt_text(shape)
|
||||||
|
has_valid_alt = False
|
||||||
|
|
||||||
|
if alt_text:
|
||||||
|
# Handle potential path separators and whitespace
|
||||||
|
clean_alt = alt_text.strip().replace('\\', '/')
|
||||||
|
parts = clean_alt.split('/')
|
||||||
|
|
||||||
|
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
|
||||||
|
if len(parts) >= 2:
|
||||||
|
# Enforce format: keep last 2 parts (e.g. filter/image.png)
|
||||||
|
new_alt = '/'.join(parts[-2:])
|
||||||
|
|
||||||
|
if new_alt != alt_text:
|
||||||
|
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
|
||||||
|
_set_shape_alt_text(shape, new_alt)
|
||||||
|
updates_count += 1
|
||||||
|
|
||||||
|
has_valid_alt = True
|
||||||
|
else:
|
||||||
|
# User requested deleting other cases that do not meet format
|
||||||
|
# If it's single word or doesn't look like our path format
|
||||||
|
pass # logic below handles this
|
||||||
|
|
||||||
|
if not has_valid_alt:
|
||||||
|
if alt_text:
|
||||||
|
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
|
||||||
|
_set_shape_alt_text(shape, "")
|
||||||
|
updates_count += 1
|
||||||
|
|
||||||
|
# Queue for hash matching
|
||||||
|
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||||
|
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||||
|
images_needing_match.append({
|
||||||
|
'slide_idx': i, # 0-based
|
||||||
|
'slide_num': i+1,
|
||||||
|
'shape': shape,
|
||||||
|
'shape_name': shape_name
|
||||||
|
})
|
||||||
|
|
||||||
|
if not images_needing_match:
|
||||||
|
print("\nAll images have valid alt-text format. No hash matching needed.")
|
||||||
|
if updates_count > 0:
|
||||||
|
prs.save(output_path)
|
||||||
|
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||||
|
else:
|
||||||
|
print("Presentation is up to date.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Pass 2: Hash Matching
|
||||||
|
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
|
||||||
|
|
||||||
|
# Build lookup map of {hash: file_path} only if needed
|
||||||
|
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||||
|
|
||||||
|
unmatched_images = []
|
||||||
|
|
||||||
|
for item in images_needing_match:
|
||||||
|
shape = item['shape']
|
||||||
|
slide_num = item['slide_num']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get image hash based on selected method
|
# Get image hash
|
||||||
if use_perceptual_hash:
|
if use_perceptual_hash:
|
||||||
# Use perceptual hash of the image blob for visual content matching
|
|
||||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||||
else:
|
else:
|
||||||
# Use SHA1 hash from python-pptx (exact byte match)
|
|
||||||
current_hash = shape.image.sha1
|
current_hash = shape.image.sha1
|
||||||
|
|
||||||
if current_hash in image_hash_map:
|
if current_hash in image_hash_map:
|
||||||
@@ -466,7 +543,6 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
|||||||
|
|
||||||
# Generate Alt Text
|
# Generate Alt Text
|
||||||
try:
|
try:
|
||||||
# Prepare path for generator.
|
|
||||||
# Try to relativize to CWD if capable
|
# Try to relativize to CWD if capable
|
||||||
pass_path = original_path
|
pass_path = original_path
|
||||||
try:
|
try:
|
||||||
@@ -476,75 +552,38 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
|||||||
|
|
||||||
new_alt_text = image_alt_text_generator(pass_path)
|
new_alt_text = image_alt_text_generator(pass_path)
|
||||||
|
|
||||||
# Check existing alt text to avoid redundant updates/log them
|
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
|
||||||
# Accessing alt text via cNvPr
|
_set_shape_alt_text(shape, new_alt_text)
|
||||||
# Note: Different shape types might store non-visual props differently
|
|
||||||
# Picture: nvPicPr.cNvPr
|
|
||||||
# GraphicFrame: nvGraphicFramePr.cNvPr
|
|
||||||
# Group: nvGrpSpPr.cNvPr
|
|
||||||
# Shape/Placeholder: nvSpPr.cNvPr
|
|
||||||
|
|
||||||
nvPr = None
|
|
||||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
|
||||||
if hasattr(shape._element, attr):
|
|
||||||
nvPr = getattr(shape._element, attr)
|
|
||||||
break
|
|
||||||
|
|
||||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
|
||||||
cNvPr = nvPr.cNvPr
|
|
||||||
existing_alt_text = cNvPr.get("descr", "")
|
|
||||||
|
|
||||||
if existing_alt_text != new_alt_text:
|
|
||||||
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
|
|
||||||
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
|
|
||||||
cNvPr.set("descr", new_alt_text)
|
|
||||||
updates_count += 1
|
updates_count += 1
|
||||||
else:
|
|
||||||
print(f"Could not find cNvPr for shape on slide {i+1}")
|
|
||||||
|
|
||||||
except AssertionError as e:
|
|
||||||
print(f"Skipping match for {original_path} due to generator error: {e}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error updating alt text for {original_path}: {e}")
|
print(f"Error generating alt text for {original_path}: {e}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Check if image already has alt text set - if so, skip reporting as unmatched
|
|
||||||
existing_alt = _get_shape_alt_text(shape)
|
|
||||||
if existing_alt:
|
|
||||||
# Image already has alt text, no need to report as unmatched
|
|
||||||
continue
|
|
||||||
|
|
||||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
|
||||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
|
||||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||||
|
|
||||||
unmatched_images.append({
|
unmatched_images.append({
|
||||||
'slide': i+1,
|
'slide': slide_num,
|
||||||
'shape_name': shape_name,
|
'shape_name': item['shape_name'],
|
||||||
'hash_type': hash_type,
|
'hash_type': hash_type,
|
||||||
'hash': current_hash
|
'hash': current_hash
|
||||||
})
|
})
|
||||||
|
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing shape on slide {i+1}: {e}")
|
print(f"Error processing shape on slide {slide_num}: {e}")
|
||||||
|
|
||||||
# Print summary
|
# Save and Print Summary
|
||||||
print("\n" + "="*80)
|
print("\n" + "="*80)
|
||||||
if updates_count > 0:
|
if updates_count > 0:
|
||||||
prs.save(output_path)
|
prs.save(output_path)
|
||||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||||
else:
|
else:
|
||||||
print("No images matched or required updates.")
|
print("No matches found for missing images.")
|
||||||
|
|
||||||
# List unmatched images at the end
|
|
||||||
if unmatched_images:
|
if unmatched_images:
|
||||||
print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
|
print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:")
|
||||||
for img in unmatched_images:
|
for img in unmatched_images:
|
||||||
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
||||||
else:
|
else:
|
||||||
print("\n✓ All images matched successfully!")
|
print("\n✓ All images processed successfully!")
|
||||||
print("="*80)
|
print("="*80)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user