rename example notebooks and finish ppt pipeline functions
This commit is contained in:
@@ -1,63 +0,0 @@
|
|||||||
import marimo

__generated_with = "0.19.2"
app = marimo.App(width="medium")

with app.setup:
    import marimo as mo
    from pathlib import Path
    import utils


@app.cell
def _():
    # Source deck and destination file; passed further down as
    # presentation_path and save_path to utils.pptx_replace_named_image.
    PPT_FILE = Path('data/Presentation.pptx')
    UPDATED_PPT_FILE = Path('data/Updated_Presentation.pptx')
    return PPT_FILE, UPDATED_PPT_FILE


@app.cell
def _():
    # Replacement image; the same path is also turned into the alt-text tag
    # by image_alt_text_converter below.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)


@app.function
def image_alt_text_converter(fpath):
    """Convert an image file path to its alt-text string.

    Drops the first two path components (the leading ``figures`` directory
    and the directory directly below it) and returns the remainder as a
    POSIX-style string.

    Args:
        fpath: Image path as ``str`` or ``Path``; must start with ``figures``.

    Returns:
        str: The remaining components joined with ``/``.

    Raises:
        AssertionError: If the first path component is not ``figures``.
    """

    if not isinstance(fpath, Path):
        fpath = Path(fpath)

    fparts = fpath.parts
    # NOTE(review): ``assert`` is removed under ``python -O``, so this check
    # is not enforced in optimized runs.
    assert fparts[0] == 'figures', "Image file path must start with 'figures'"

    return Path('/'.join(fparts[2:])).as_posix()


@app.cell
def _(IMAGE_FILE):
    img_alt_txt = image_alt_text_converter(IMAGE_FILE)
    # Bare expression so the notebook displays the computed tag.
    img_alt_txt
    return (img_alt_txt,)


@app.cell
def _(IMAGE_FILE, PPT_FILE, UPDATED_PPT_FILE, img_alt_txt):
    # Replace the image tagged img_alt_txt in PPT_FILE with IMAGE_FILE and
    # write the result to UPDATED_PPT_FILE.
    utils.pptx_replace_named_image(
        presentation_path=PPT_FILE,
        target_tag=img_alt_txt,
        new_image_path=IMAGE_FILE,
        save_path=UPDATED_PPT_FILE)
    return


@app.cell
def _(P):
    # NOTE(review): no cell visible in this notebook defines ``P``; this cell
    # presumably is a leftover debugging aid -- confirm before reuse.
    print(P.slides[10])
    return


if __name__ == "__main__":
    app.run()
|
|
||||||
73
99_example_ppt_replace_images.py
Normal file
73
99_example_ppt_replace_images.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import marimo

__generated_with = "0.19.2"
app = marimo.App(width="medium")

with app.setup:
    import marimo as mo
    from pathlib import Path
    import utils


@app.cell
def _():
    mo.md(r"""
    # Tag existing images with Alt-Text

    Based on image content
    """)
    return


@app.cell
def _():
    # Inputs for the tagging step: deck to read, deck to write, and the
    # directory of source images that utils.update_ppt_alt_text matches against.
    TAG_SOURCE = Path('data/test_tag_source.pptx')
    TAG_TARGET = Path('data/test_tag_target.pptx')
    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
    return TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET


@app.cell
def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    # Write alt text onto the images of TAG_SOURCE and save to TAG_TARGET.
    utils.update_ppt_alt_text(ppt_path=TAG_SOURCE, image_source_dir=TAG_IMAGE_DIR, output_path=TAG_TARGET)
    return


@app.cell
def _():
    return


@app.cell
def _():
    mo.md(r"""
    # Replace Images using Alt-Text
    """)
    return


@app.cell
def _():
    # Inputs for the replacement step: deck to read and deck to write.
    REPLACE_SOURCE = Path('data/test_replace_source.pptx')
    REPLACE_TARGET = Path('data/test_replace_target.pptx')
    return REPLACE_SOURCE, REPLACE_TARGET


@app.cell
def _():
    # Replacement image; its path is also converted into the target tag below.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)


@app.cell
def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
    # The tag to search for is derived from the image path itself via
    # utils.image_alt_text_generator.
    utils.pptx_replace_named_image(
        presentation_path=REPLACE_SOURCE,
        target_tag=utils.image_alt_text_generator(IMAGE_FILE),
        new_image_path=IMAGE_FILE,
        save_path=REPLACE_TARGET)
    return


if __name__ == "__main__":
    app.run()
|
||||||
169
utils.py
169
utils.py
@@ -4,13 +4,27 @@ import pandas as pd
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
from plots import JPMCPlotsMixin
|
from plots import JPMCPlotsMixin
|
||||||
|
|
||||||
import marimo as mo
|
|
||||||
|
|
||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
def image_alt_text_generator(fpath):
    """Convert an image file path into its alt-text string.

    The first two path components (the leading ``figures`` directory and the
    directory directly below it) are dropped; the remainder is returned as a
    POSIX-style string, e.g.
    ``figures/OneDrive_2026-01-28/Cons/x.png`` -> ``Cons/x.png``.

    Args:
        fpath: Image path as ``str`` or ``Path``; must start with ``figures``.

    Returns:
        str: The remaining components joined with ``/``. A path with two or
        fewer components yields ``'.'``.

    Raises:
        AssertionError: If the path is empty or its first component is not
            ``figures``.
    """
    if not isinstance(fpath, Path):
        fpath = Path(fpath)

    fparts = fpath.parts
    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers that
    # catch AssertionError. An empty path is rejected the same way rather
    # than failing with IndexError.
    if not fparts or fparts[0] != 'figures':
        raise AssertionError("Image file path must start with 'figures'")

    return Path('/'.join(fparts[2:])).as_posix()
|
||||||
|
|
||||||
def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
|
def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
|
||||||
"""
|
"""
|
||||||
Finds and replaces specific images in a PowerPoint presentation while
|
Finds and replaces specific images in a PowerPoint presentation while
|
||||||
@@ -91,6 +105,159 @@ def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save
|
|||||||
print(f"Successfully saved to {save_path}")
|
print(f"Successfully saved to {save_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def _calculate_file_sha1(file_path: Union[str, Path]) -> str:
    """Return the hexadecimal SHA1 digest of the file at *file_path*.

    The file is opened in binary mode and consumed in 64 KiB chunks, so the
    whole content is never held in memory at once.
    """
    chunk_size = 65536
    digest = hashlib.sha1()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_image_hash_map(root_dir: Union[str, Path]) -> dict:
    """Index the image files below *root_dir* by the SHA1 of their content.

    Walks the directory tree recursively and hashes every file whose suffix
    (case-insensitive) is one of ``.png``, ``.jpg``, ``.jpeg``, ``.tiff``,
    ``.bmp`` or ``.gif``. Files that cannot be read are reported on stdout
    and skipped.

    Args:
        root_dir: Directory to scan.

    Returns:
        dict: ``{sha1_hex: Path}``. Each path is ``root_dir`` joined with the
        file's location below it, so it is only absolute when ``root_dir``
        is. When several files share the same content, the one visited last
        in sorted traversal order is kept.
    """
    valid_extensions = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
    hash_map = {}

    root = Path(root_dir)
    print(f"Building image hash map from {root}...")

    count = 0
    for root_path, dirs, files in os.walk(root):
        # os.walk yields entries in arbitrary order; sorting ``dirs`` in
        # place (top-down walk) and the file names makes the winner among
        # duplicate-content files reproducible between runs.
        dirs.sort()
        for file in sorted(files):
            file_path = Path(root_path) / file
            if file_path.suffix.lower() not in valid_extensions:
                continue

            try:
                file_sha1 = _calculate_file_sha1(file_path)
            except OSError as e:
                # Unreadable file: report and keep indexing the rest.
                print(f"Error hashing {file_path}: {e}")
                continue

            hash_map[file_sha1] = file_path
            count += 1

    print(f"Indexed {count} images.")
    return hash_map
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_picture_shapes(shapes):
    """Yield every shape in *shapes* that carries an embedded image.

    Group shapes are not yielded themselves; their members are searched
    recursively. Any other shape is yielded when reading its ``image``
    attribute succeeds, which covers pictures and picture placeholders that
    hold an image.

    Args:
        shapes: An iterable of python-pptx shapes (e.g. ``slide.shapes``).

    Yields:
        Shapes whose ``image`` attribute can be read.
    """
    for shape in shapes:
        # Descend into groups instead of yielding the group itself.
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            yield from _iter_picture_shapes(shape.shapes)
            continue

        # ``hasattr`` only swallows AttributeError, but python-pptx raises
        # ValueError from ``Picture.image`` when the picture has no embedded
        # image (e.g. a linked picture). Such shapes have no blob to hash,
        # so they are skipped rather than allowed to abort the iteration.
        try:
            shape.image
        except (AttributeError, ValueError):
            continue
        yield shape
|
||||||
|
|
||||||
|
|
||||||
|
def _find_cnvpr(shape):
    """Return the ``cNvPr`` XML element of *shape*, or None if it has none.

    The non-visual properties container is named differently per shape kind
    (Picture: nvPicPr, Shape/Placeholder: nvSpPr, Group: nvGrpSpPr,
    GraphicFrame: nvGraphicFramePr, Connector: nvCxnSpPr); the first one
    present on the shape's element is used.
    """
    element = shape._element
    for attr in ('nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr'):
        if hasattr(element, attr):
            nv_props = getattr(element, attr)
            break
    else:
        return None

    # Compare against None explicitly: truth-testing an lxml element reflects
    # whether it has children, not whether it exists.
    if nv_props is None:
        return None
    return getattr(nv_props, 'cNvPr', None)


def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path, None] = None):
    """
    Updates the alt text of images in a PowerPoint presentation by matching
    their content (SHA1 hash) with images in a source directory.

    For every picture whose hash matches a source image, the alt text
    (the ``descr`` attribute of the shape's ``cNvPr`` element) is set to
    ``image_alt_text_generator(<matched path>)``. Progress and problems are
    reported on stdout; per-shape failures do not stop the run.

    Args:
        ppt_path (str/Path): Path to the PowerPoint file.
        image_source_dir (str/Path): Directory containing source images to match against.
        output_path (str/Path, optional): Path to save the updated presentation.
            If None, overwrites the input file.

    Returns:
        None. The presentation is saved only when at least one alt text was
        changed; otherwise no file is written. If the presentation cannot be
        opened, the error is printed and the function returns early.
    """
    if output_path is None:
        output_path = ppt_path

    # 1. Build lookup map of {sha1: file_path} from the source directory
    image_hash_map = _build_image_hash_map(image_source_dir)

    # 2. Open Presentation
    try:
        prs = Presentation(ppt_path)
    except Exception as e:
        print(f"Error opening presentation {ppt_path}: {e}")
        return

    updates_count = 0
    slides = list(prs.slides)
    print(f"Processing {len(slides)} slides...")

    for slide_number, slide in enumerate(slides, start=1):
        # Recursive iterator also finds pictures inside groups/placeholders.
        for shape in list(_iter_picture_shapes(slide.shapes)):
            # 3. Hash of the embedded image blob, as exposed by python-pptx.
            try:
                current_sha1 = shape.image.sha1
            except AttributeError:
                continue
            except Exception as e:
                print(f"Error processing shape on slide {slide_number}: {e}")
                continue

            original_path = image_hash_map.get(current_sha1)
            if original_path is None:
                # Image does not originate from the source directory.
                continue

            # 4. Derive the alt text and write it if it differs.
            try:
                # The generator expects a path starting with 'figures', so
                # relativize to the CWD when the indexed path lies below it.
                try:
                    pass_path = original_path.relative_to(Path.cwd())
                except ValueError:
                    pass_path = original_path

                new_alt_text = image_alt_text_generator(pass_path)

                cNvPr = _find_cnvpr(shape)
                if cNvPr is None:
                    print(f"Could not find cNvPr for shape on slide {slide_number}")
                    continue

                existing_alt_text = cNvPr.get("descr", "")
                if existing_alt_text != new_alt_text:
                    print(f"Slide {slide_number}: Updating alt text for image matches '{pass_path}'")
                    print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
                    cNvPr.set("descr", new_alt_text)
                    updates_count += 1
            except AssertionError as e:
                print(f"Skipping match for {original_path} due to generator error: {e}")
            except Exception as e:
                print(f"Error updating alt text for {original_path}: {e}")

    if updates_count > 0:
        prs.save(output_path)
        print(f"Saved updated presentation to {output_path} with {updates_count} updates.")
    else:
        print("No images matched or required updates.")
|
||||||
|
|
||||||
|
|
||||||
def extract_voice_label(html_str: str) -> str:
|
def extract_voice_label(html_str: str) -> str:
|
||||||
"""
|
"""
|
||||||
Extract voice label from HTML string and convert to short format.
|
Extract voice label from HTML string and convert to short format.
|
||||||
|
|||||||
Reference in New Issue
Block a user