rename example notebooks and finish ppt pipeline functions

This commit is contained in:
2026-01-29 16:07:55 +01:00
parent 3ee25f9e33
commit 5f9e67a312
4 changed files with 241 additions and 64 deletions

View File

@@ -1,63 +0,0 @@
import marimo
__generated_with = "0.19.2"
app = marimo.App(width="medium")
with app.setup:
import marimo as mo
from pathlib import Path
import utils
@app.cell
def _():
    # Presentation read by the replacement cell and the path its result is saved to.
    PPT_FILE = Path('data/Presentation.pptx')
    UPDATED_PPT_FILE = Path('data/Updated_Presentation.pptx')
    return PPT_FILE, UPDATED_PPT_FILE
@app.cell
def _():
    # Image used both to derive the alt-text tag and as the replacement picture.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)
@app.function
def image_alt_text_converter(fpath):
    """Convert an image file path to its alt-text tag.

    The path must begin with ``figures``; that component and the one after
    it are dropped and the remainder is returned as a POSIX-style string,
    e.g. ``figures/run/Group/plot.png`` -> ``Group/plot.png``.

    Args:
        fpath: Image path as ``str`` or ``Path``.

    Returns:
        The path below the second component, using forward slashes.
        A path with at most two components yields ``'.'``.

    Raises:
        AssertionError: If the path is empty or does not start with
            ``figures``.
    """
    fparts = Path(fpath).parts
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed under `python -O`, and an empty path would otherwise fail with
    # an unrelated IndexError.
    if not fparts or fparts[0] != 'figures':
        raise AssertionError("Image file path must start with 'figures'")
    return Path('/'.join(fparts[2:])).as_posix()
@app.cell
def _(IMAGE_FILE):
    # Derive the alt-text tag for the image; the bare expression below is
    # presumably there so the notebook renders the value as the cell output.
    img_alt_txt = image_alt_text_converter(IMAGE_FILE)
    img_alt_txt
    return (img_alt_txt,)
@app.cell
def _(IMAGE_FILE, PPT_FILE, UPDATED_PPT_FILE, img_alt_txt):
    # Replace the picture tagged `img_alt_txt` in PPT_FILE with IMAGE_FILE,
    # saving the result to UPDATED_PPT_FILE.
    utils.pptx_replace_named_image(
        presentation_path=PPT_FILE,
        target_tag=img_alt_txt,
        new_image_path=IMAGE_FILE,
        save_path=UPDATED_PPT_FILE)
    return
@app.cell
def _(P):
    # NOTE(review): no visible cell defines or returns `P`, so this cell
    # cannot resolve its input as written; presumably `P` was meant to be a
    # loaded presentation object -- confirm or remove.
    print(P.slides[10])
    return
# Run the notebook app when this file is executed as a script.
if __name__ == "__main__":
    app.run()

View File

@@ -0,0 +1,73 @@
import marimo
__generated_with = "0.19.2"
app = marimo.App(width="medium")
with app.setup:
import marimo as mo
from pathlib import Path
import utils
@app.cell
def _():
    # Markdown heading introducing the alt-text tagging section.
    mo.md(r"""
# Tag existing images with Alt-Text
Based on image content
""")
    return
@app.cell
def _():
    # Inputs for the tagging step: presentation to read, presentation to
    # write, and the directory of source images to match against.
    TAG_SOURCE = Path('data/test_tag_source.pptx')
    TAG_TARGET = Path('data/test_tag_target.pptx')
    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
    return TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET
@app.cell
def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    # Set alt text on images in TAG_SOURCE whose content matches a file under
    # TAG_IMAGE_DIR, writing the result to TAG_TARGET.
    utils.update_ppt_alt_text(ppt_path=TAG_SOURCE, image_source_dir=TAG_IMAGE_DIR, output_path=TAG_TARGET)
    return
@app.cell
def _():
    # Intentionally empty cell (no inputs, no outputs).
    return
@app.cell
def _():
    # Markdown heading introducing the image-replacement section.
    mo.md(r"""
# Replace Images using Alt-Text
""")
    return
@app.cell
def _():
    # Presentation read by the replacement step and the path its result is saved to.
    REPLACE_SOURCE = Path('data/test_replace_source.pptx')
    REPLACE_TARGET = Path('data/test_replace_target.pptx')
    return REPLACE_SOURCE, REPLACE_TARGET
@app.cell
def _():
    # Image used both to derive the alt-text tag and as the replacement picture.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)
@app.cell
def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
    # Replace the picture whose tag equals the alt text generated from
    # IMAGE_FILE, reading REPLACE_SOURCE and saving to REPLACE_TARGET.
    utils.pptx_replace_named_image(
        presentation_path=REPLACE_SOURCE,
        target_tag=utils.image_alt_text_generator(IMAGE_FILE),
        new_image_path=IMAGE_FILE,
        save_path=REPLACE_TARGET)
    return
# Run the notebook app when this file is executed as a script.
if __name__ == "__main__":
    app.run()

169
utils.py
View File

@@ -4,13 +4,27 @@ import pandas as pd
from typing import Union
import json
import re
import hashlib
import os
from plots import JPMCPlotsMixin
import marimo as mo
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
def image_alt_text_generator(fpath):
    """Convert an image file path to the alt-text tag used in presentations.

    The path must begin with ``figures``; that component and the one after
    it are dropped and the remainder is returned as a POSIX-style string,
    e.g. ``figures/run/Group/plot.png`` -> ``Group/plot.png``.

    Args:
        fpath: Image path as ``str`` or ``Path``.

    Returns:
        The path below the second component, using forward slashes.
        A path with at most two components yields ``'.'``.

    Raises:
        AssertionError: If the path is empty or does not start with
            ``figures``.
    """
    fparts = Path(fpath).parts
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed under `python -O`, and an empty path would otherwise fail with
    # an unrelated IndexError. AssertionError is kept so existing
    # `except AssertionError` handlers continue to work.
    if not fparts or fparts[0] != 'figures':
        raise AssertionError("Image file path must start with 'figures'")
    return Path('/'.join(fparts[2:])).as_posix()
def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
"""
Finds and replaces specific images in a PowerPoint presentation while
@@ -91,6 +105,159 @@ def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save
print(f"Successfully saved to {save_path}")
def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
"""Calculate SHA1 hash of a file."""
sha1 = hashlib.sha1()
with open(file_path, 'rb') as f:
while True:
data = f.read(65536)
if not data:
break
sha1.update(data)
return sha1.hexdigest()
def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
    """
    Recursively walk *root_dir* and map SHA1 content hashes to image paths.

    Only files with common image extensions are indexed. Directories and
    files are visited in sorted order, so when several files share the same
    content the first one in that order is kept and the rest are reported;
    the result is therefore reproducible across runs and filesystems.

    Args:
        root_dir (str/Path): Directory to scan.

    Returns:
        dict: ``{sha1_hex: Path}``. Paths are built from *root_dir* as given
        (they are absolute only if *root_dir* is absolute).
    """
    hash_map = {}
    valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}

    root = Path(root_dir)
    print(f"Building image hash map from {root}...")

    for root_path, dirs, files in os.walk(root):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            file_path = Path(root_path) / file
            if file_path.suffix.lower() not in valid_extensions:
                continue
            try:
                file_sha1 = _calculate_file_sha1(file_path)
            except OSError as e:
                # Unreadable file: report it and keep indexing the rest.
                print(f"Error hashing {file_path}: {e}")
                continue
            kept = hash_map.setdefault(file_sha1, file_path)
            if kept is not file_path:
                print(f"Duplicate image content: {file_path} matches {kept}; keeping {kept}")

    # Report unique entries, not files visited, so the number matches the map.
    print(f"Indexed {len(hash_map)} images.")
    return hash_map
def _iter_picture_shapes(shapes):
    """
    Recursively iterate over *shapes* and yield those that carry an image.

    Group shapes are not yielded themselves; their members are searched
    instead. A shape is yielded only if reading its ``image`` attribute
    succeeds, so shapes without that attribute and pictures whose image
    cannot be loaded are skipped rather than aborting the iteration.

    Args:
        shapes: An iterable of python-pptx shape objects (e.g. ``slide.shapes``).

    Yields:
        Shapes whose ``image`` attribute is readable.
    """
    for shape in shapes:
        try:
            is_group = shape.shape_type == MSO_SHAPE_TYPE.GROUP
        except NotImplementedError:
            # python-pptx can raise this for shapes it cannot classify;
            # such a shape is not a group but may still carry an image.
            is_group = False

        if is_group:
            yield from _iter_picture_shapes(shape.shapes)
            continue

        # hasattr() only swallows AttributeError, but reading `image` on a
        # picture without an embedded image raises ValueError, so probe the
        # attribute explicitly.
        try:
            _ = shape.image
        except (AttributeError, ValueError):
            continue
        yield shape
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path, None] = None):
    """
    Updates the alt text of images in a PowerPoint presentation by matching
    their content (SHA1 hash) with images in a source directory.

    For every picture whose image hash is found in the source directory, the
    alt text (the ``descr`` attribute of the shape's ``cNvPr`` element) is set
    to ``image_alt_text_generator(<matched file path>)``. Progress and errors
    are printed; per-shape failures do not stop processing. The presentation
    is saved only when at least one alt text actually changed.

    Args:
        ppt_path (str/Path): Path to the PowerPoint file.
        image_source_dir (str/Path): Directory containing source images to match against.
        output_path (str/Path, optional): Path to save the updated presentation.
                                          If None, overwrites the input file.

    Returns:
        None. If the presentation cannot be opened, an error is printed and
        the function returns without saving.
    """
    if output_path is None:
        output_path = ppt_path

    # 1. Build lookup map of {sha1: file_path} from the source directory
    image_hash_map = _build_image_hash_map(image_source_dir)

    # 2. Open Presentation
    try:
        prs = Presentation(ppt_path)
    except Exception as e:
        print(f"Error opening presentation {ppt_path}: {e}")
        return

    updates_count = 0
    slides = list(prs.slides)
    print(f"Processing {len(slides)} slides...")

    for slide_no, slide in enumerate(slides, start=1):
        # Recursive iterator finds pictures inside groups/placeholders too
        for shape in _iter_picture_shapes(slide.shapes):
            try:
                # shape.image.sha1 is the SHA1 hash of the image blob
                current_sha1 = shape.image.sha1
            except AttributeError:
                continue
            except Exception as e:
                print(f"Error processing shape on slide {slide_no}: {e}")
                continue

            original_path = image_hash_map.get(current_sha1)
            if original_path is None:
                continue

            try:
                if _apply_alt_text(shape, original_path, slide_no):
                    updates_count += 1
            except AssertionError as e:
                print(f"Skipping match for {original_path} due to generator error: {e}")
            except Exception as e:
                print(f"Error updating alt text for {original_path}: {e}")

    if updates_count > 0:
        prs.save(output_path)
        print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
    else:
        print("No images matched or required updates.")


def _apply_alt_text(shape, image_path, slide_no):
    """Set *shape*'s alt text from *image_path*; return True if it changed."""
    # Use a path relative to the current directory when possible, falling
    # back to the path as stored when it is not located under it.
    try:
        tag_path = image_path.relative_to(Path.cwd())
    except ValueError:
        tag_path = image_path

    new_alt_text = image_alt_text_generator(tag_path)

    cNvPr = _find_cnvpr(shape)
    if cNvPr is None:
        print(f"Could not find cNvPr for shape on slide {slide_no}")
        return False

    existing_alt_text = cNvPr.get("descr", "")
    if existing_alt_text == new_alt_text:
        # Already tagged correctly; avoid a redundant update.
        return False

    print(f"Slide {slide_no}: Updating alt text for image matches '{tag_path}'")
    print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
    cNvPr.set("descr", new_alt_text)
    return True


def _find_cnvpr(shape):
    """Return the non-visual ``cNvPr`` element of *shape*, or None if absent."""
    element = shape._element
    # Different shape types keep their non-visual properties under different
    # children: Picture -> nvPicPr, Shape/Placeholder -> nvSpPr,
    # Group -> nvGrpSpPr, GraphicFrame -> nvGraphicFramePr,
    # Connector -> nvCxnSpPr.
    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
        nv_props = getattr(element, attr, None)
        # Compare with None explicitly: truth-testing an lxml element is
        # deprecated and evaluates to False for an element without children.
        if nv_props is not None:
            return getattr(nv_props, 'cNvPr', None)
    return None
def extract_voice_label(html_str: str) -> str:
"""
Extract voice label from HTML string and convert to short format.