Added functionality to load keywords from an Excel file

This commit is contained in:
2025-12-16 22:25:12 -08:00
parent e81961b819
commit e90b41f648
4 changed files with 181 additions and 91 deletions

View File

@@ -104,6 +104,22 @@ def _(mo):
@app.cell(hide_code=True)
def _(all_tags_df, mo):
    # Dropdown listing every distinct tag found in all_tags_df; the chosen
    # value drives all downstream per-tag processing cells.
    tag_select = mo.ui.dropdown(
        options=all_tags_df['tag'].unique().tolist(),
        label="Select Tag to Process",
        # value="Chase as a brand",
        full_width=True,
    )
    # Last expression in a marimo cell is rendered as the cell's output.
    tag_select
    return (tag_select,)
@app.cell
def _(WORKING_DIR, all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
start_processing_btn = None
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
@@ -111,26 +127,27 @@ def _(all_tags_df, mo):
on_click=lambda val: True
)
tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process",
value="Chase as a brand",
full_width=True,
)
tag_select
return start_processing_btn, tag_select
@app.cell
def _(all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
# filter all_tags_df to only the document = file_dropdown.value
df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
df
return df, tag_fname
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
tags_df
return (
KEYWORDS_FPATH,
KEYWORD_FREQ_FPATH,
SAVE_DIR,
start_processing_btn,
tag_fname,
tags_df,
)
@app.cell(hide_code=True)
@@ -141,22 +158,24 @@ def _(mo):
return
@app.cell(hide_code=True)
@app.cell
def _(mo, start_processing_btn, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
# mdf = mpd.from_pandas(df)
start_processing_btn
return
@app.cell(hide_code=True)
def _(client, df, mo, model_select, pd, start_processing_btn):
@app.cell
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing")
df = tags_df
# Run keyword extraction
df['keywords'] = df.progress_apply(
lambda row: pd.Series(ollama_keyword_extraction(
content=row['content'],
@@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
)),
axis=1
)
return (df,)
return
@app.cell(hide_code=True)
def _(WORKING_DIR, df, mo, pd, tag_fname):
# Save results to csv
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
@app.cell
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
mo.stop(not start_processing_btn.value, "Click button above to process first")
df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
index=False
)
all_keywords_list = df['keywords'].tolist()
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
# Calculate frequencies per keyword
@@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname):
freq_df.reset_index(inplace=True)
freq_df.sort_values(by='frequency', ascending=False, inplace=True)
_freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
# Save to Excel files
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
KEYWORDS_FPATH,
index=False
)
freq_df.to_excel(
_freq_fpath,
KEYWORD_FREQ_FPATH,
index=False
)
mo.vstack([
mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
])
return SAVE_DIR, keyword_freq
return (freq_df,)
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: section heading for the optional workflow that
    # loads previously saved keyword frequencies instead of re-extracting.
    mo.md(r"""
# 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
""")
    return
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo):
    # Offer a "load from file" button only when a previously saved
    # keyword-frequency workbook exists on disk; otherwise the button
    # stays None and downstream cells must handle that case.
    load_existing_btn = None
    if KEYWORD_FREQ_FPATH.exists():
        load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
    load_existing_btn
    return (load_existing_btn,)
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
    # Resolve the keyword-frequency table used by the WordCloud cells:
    # either the freshly computed `freq_df`, or — when the user clicked the
    # load button — a previously saved Excel export.
    #
    # BUG FIX: `load_existing_btn` is None when no saved workbook exists
    # (see the cell that creates it), so the original
    # `if load_existing_btn.value:` raised AttributeError in that case.
    # Guard against None before reading `.value`.
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
        # Drop rows with a missing keyword or frequency (blank Excel rows).
        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
        # Keep the same ordering the extraction step produces.
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
        frequency_df = _fdf
    else:
        # Fall back to the in-memory result of the extraction pipeline.
        frequency_df = freq_df
    return (frequency_df,)
@app.cell(hide_code=True)
@@ -228,7 +277,7 @@ def _(mo):
@app.cell(hide_code=True)
def _():
# Start with loading all necessary libraries
# Import all necessary libraries
import numpy as np
from os import path
from PIL import Image, ImageDraw
@@ -257,18 +306,26 @@ def _(mo):
@app.cell(hide_code=True)
def _(df, keyword_freq, min_freq_select, mo):
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
def _(freq_df, frequency_df, min_freq_select, mo):
mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
freq_df_filtered.reset_index(drop=True, inplace=True)
# create list of keywords sorted by their frequencies. only store the keyword
sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
sorted_keywords_list
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
table_selection = mo.ui.table(freq_df_filtered, page_size=50)
table_selection
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
# # create list of keywords sorted by their frequencies. only store the keyword
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
# sorted_keywords_list
return (keyword_freq_filtered,)
@@ -278,8 +335,10 @@ def _(mo, tag_select):
## 5.2) Inspect Keyword Dataset
1. Check the threshold is set correctly. If not, adjust accordingly
2. Check the keywords are good. If not, run extraction again (step 4)
3. Add explicit exclusions if necessary
2. Read all the keywords and verify they are good. If not
- Add explicit exclusions if necessary below
- OR Rerun the keyword extraction above
Add words to this dict that should be ignored in the WordCloud for specific tags.
@@ -299,7 +358,10 @@ def _():
"banking",
"chase",
"jpmorgan",
"youthful"
"youthful",
"customer service",
"customer service focused",
"great brand",
],
'why customer chase': [
"customer service",
@@ -322,17 +384,20 @@ def _(mo):
canvas_size = (1200, 800)
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
def _(logo_switch, mo):
run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
mo.vstack([
mo.md("## 5.4) Generate WordCloud with/without Logo"),
mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
Placement and color of words is randomized, size is proportional to frequency.
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
mo.md('---'),
mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
)
@@ -370,7 +435,7 @@ def _(
# Make sure this path points to your uploaded file
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png"
logo = Image.open(logo_path).convert("RGBA")
# Optional: Resize logo if it's too large or small for the canvas
# target_width = 600
# ratio = target_width / logo.width
@@ -382,26 +447,26 @@ def _(
# Use Image.Resampling.LANCZOS for high-quality downsampling
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
# 3. Create the mask (0 = draw here, 255 = don't draw here)
# Initialize with 0 (black/draw everywhere)
mask_image = Image.new("L", canvas_size, 0)
draw = ImageDraw.Draw(mask_image)
# 4. Draw a protected circular area in the center
center = (canvas_size[0] // 2, canvas_size[1] // 2)
# Calculate radius: half of logo max dimension + buffer
radius = (max(logo.size) // 2) + buffer
# Draw the white circle (255) which the WordCloud will avoid
draw.ellipse(
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
fill=255
)
chase_mask = np.array(mask_image)
# Generate the WordCloud
wordcloud = WordCloud(
background_color='white',