Added functionality to load keywords from an Excel file
This commit is contained in:
@@ -104,6 +104,22 @@ def _(mo):
|
||||
@app.cell(hide_code=True)
|
||||
def _(all_tags_df, mo):
|
||||
|
||||
|
||||
|
||||
tag_select = mo.ui.dropdown(
|
||||
options=all_tags_df['tag'].unique().tolist(),
|
||||
label="Select Tag to Process",
|
||||
# value="Chase as a brand",
|
||||
full_width=True,
|
||||
)
|
||||
tag_select
|
||||
return (tag_select,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn = None
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Keyword Extraction",
|
||||
@@ -111,26 +127,27 @@ def _(all_tags_df, mo):
|
||||
on_click=lambda val: True
|
||||
)
|
||||
|
||||
tag_select = mo.ui.dropdown(
|
||||
options=all_tags_df['tag'].unique().tolist(),
|
||||
label="Select Tag to Process",
|
||||
value="Chase as a brand",
|
||||
full_width=True,
|
||||
)
|
||||
tag_select
|
||||
return start_processing_btn, tag_select
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df, mo, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
|
||||
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
|
||||
|
||||
SAVE_DIR = WORKING_DIR / tag_fname
|
||||
|
||||
if not SAVE_DIR.exists():
|
||||
SAVE_DIR.mkdir(parents=True)
|
||||
|
||||
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
|
||||
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
|
||||
|
||||
# filter all_tags_df to only the document = file_dropdown.value
|
||||
df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
||||
df
|
||||
return df, tag_fname
|
||||
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
||||
tags_df
|
||||
return (
|
||||
KEYWORDS_FPATH,
|
||||
KEYWORD_FREQ_FPATH,
|
||||
SAVE_DIR,
|
||||
start_processing_btn,
|
||||
tag_fname,
|
||||
tags_df,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -141,22 +158,24 @@ def _(mo):
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@app.cell
|
||||
def _(mo, start_processing_btn, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
|
||||
# mdf = mpd.from_pandas(df)
|
||||
start_processing_btn
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(client, df, mo, model_select, pd, start_processing_btn):
|
||||
@app.cell
|
||||
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
|
||||
from utils import ollama_keyword_extraction, worker_extraction
|
||||
# Wait for start processing button
|
||||
mo.stop(not start_processing_btn.value, "Click button above to start processing")
|
||||
|
||||
|
||||
df = tags_df
|
||||
# Run keyword extraction
|
||||
|
||||
df['keywords'] = df.progress_apply(
|
||||
lambda row: pd.Series(ollama_keyword_extraction(
|
||||
content=row['content'],
|
||||
@@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
|
||||
)),
|
||||
axis=1
|
||||
)
|
||||
return (df,)
|
||||
|
||||
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(WORKING_DIR, df, mo, pd, tag_fname):
|
||||
# Save results to csv
|
||||
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
SAVE_DIR = WORKING_DIR / tag_fname
|
||||
|
||||
if not SAVE_DIR.exists():
|
||||
SAVE_DIR.mkdir(parents=True)
|
||||
|
||||
@app.cell
|
||||
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
|
||||
mo.stop(not start_processing_btn.value, "Click button above to process first")
|
||||
|
||||
df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
|
||||
|
||||
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
|
||||
SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
|
||||
index=False
|
||||
)
|
||||
|
||||
|
||||
all_keywords_list = df['keywords'].tolist()
|
||||
|
||||
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
|
||||
|
||||
# Calculate frequencies per keyword
|
||||
@@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname):
|
||||
freq_df.reset_index(inplace=True)
|
||||
freq_df.sort_values(by='frequency', ascending=False, inplace=True)
|
||||
|
||||
_freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
|
||||
|
||||
|
||||
# Save to Excel files
|
||||
|
||||
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
|
||||
KEYWORDS_FPATH,
|
||||
index=False
|
||||
)
|
||||
|
||||
freq_df.to_excel(
|
||||
_freq_fpath,
|
||||
KEYWORD_FREQ_FPATH,
|
||||
index=False
|
||||
)
|
||||
mo.vstack([
|
||||
mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
|
||||
mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
|
||||
mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
|
||||
mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||
])
|
||||
return SAVE_DIR, keyword_freq
|
||||
return (freq_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: renders the section heading for the optional step
    # that loads keyword frequencies from a `keyword_frequencies_*.xlsx` file.
    mo.md(r"""
# 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
""")
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo):
    # Offer a "load" run-button only when the keyword-frequency workbook is
    # already on disk; otherwise the variable stays None and nothing is shown.
    load_existing_btn = (
        mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
        if KEYWORD_FREQ_FPATH.exists()
        else None
    )

    # Last expression of the cell: this is what the notebook displays.
    load_existing_btn
    return (load_existing_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
    # Pick the keyword-frequency table that is exported as `frequency_df`:
    #   * the contents of KEYWORD_FREQ_FPATH when the load button was clicked,
    #   * otherwise the `freq_df` passed in from the extraction cell.
    #
    # `load_existing_btn` is None when the Excel file did not exist at the time
    # the button cell ran, so it must be checked before reading `.value`;
    # without the guard this cell raises AttributeError in that case.
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')

        # Drop rows with a missing keyword or frequency, if any
        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
        # Most frequent keywords first, with a clean 0..n-1 index
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")

        frequency_df = _fdf
    else:
        frequency_df = freq_df
    return (frequency_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -228,7 +277,7 @@ def _(mo):
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
# Start with loading all necessary libraries
|
||||
# Import all necessary libraries
|
||||
import numpy as np
|
||||
from os import path
|
||||
from PIL import Image, ImageDraw
|
||||
@@ -257,18 +306,26 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(df, keyword_freq, min_freq_select, mo):
|
||||
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
|
||||
def _(freq_df, frequency_df, min_freq_select, mo):
|
||||
mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
MIN_FREQ = min_freq_select.value
|
||||
|
||||
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
|
||||
|
||||
keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
|
||||
freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||
|
||||
# create list of keywords sorted by their frequencies. only store the keyword
|
||||
sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
|
||||
sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
|
||||
sorted_keywords_list
|
||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||
|
||||
table_selection = mo.ui.table(freq_df_filtered, page_size=50)
|
||||
table_selection
|
||||
|
||||
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
|
||||
|
||||
# # create list of keywords sorted by their frequencies. only store the keyword
|
||||
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
|
||||
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
|
||||
# sorted_keywords_list
|
||||
return (keyword_freq_filtered,)
|
||||
|
||||
|
||||
@@ -278,8 +335,10 @@ def _(mo, tag_select):
|
||||
## 5.2) Inspect Keyword Dataset
|
||||
|
||||
1. Check the threshold is set correctly. If not, adjust accordingly
|
||||
2. Check the keywords are good. If not, run extraction again (step 4)
|
||||
3. Add explicit exclusions if necessary
|
||||
2. Read all the keywords and verify they are good. If not
|
||||
- Add explicit exclusions if necessary below
|
||||
- OR Rerun the keyword extraction above
|
||||
|
||||
|
||||
|
||||
Add words to this dict that should be ignored in the WordCloud for specific tags.
|
||||
@@ -299,7 +358,10 @@ def _():
|
||||
"banking",
|
||||
"chase",
|
||||
"jpmorgan",
|
||||
"youthful"
|
||||
"youthful",
|
||||
"customer service",
|
||||
"customer service focused",
|
||||
"great brand",
|
||||
],
|
||||
'why customer chase': [
|
||||
"customer service",
|
||||
@@ -322,17 +384,20 @@ def _(mo):
|
||||
canvas_size = (1200, 800)
|
||||
|
||||
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
||||
|
||||
return buffer, canvas_size, logo_switch
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(logo_switch, mo):
|
||||
run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")
|
||||
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
|
||||
|
||||
mo.vstack([
|
||||
mo.md("## 5.4) Generate WordCloud with/without Logo"),
|
||||
mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
|
||||
mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
|
||||
|
||||
Placement and color of words is randomized, size is proportional to frequency.
|
||||
|
||||
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
|
||||
mo.md('---'),
|
||||
mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
|
||||
)
|
||||
@@ -370,7 +435,7 @@ def _(
|
||||
# Make sure this path points to your uploaded file
|
||||
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png"
|
||||
logo = Image.open(logo_path).convert("RGBA")
|
||||
|
||||
|
||||
# Optional: Resize logo if it's too large or small for the canvas
|
||||
# target_width = 600
|
||||
# ratio = target_width / logo.width
|
||||
@@ -382,26 +447,26 @@ def _(
|
||||
# Use Image.Resampling.LANCZOS for high-quality downsampling
|
||||
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS
|
||||
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
|
||||
# 3. Create the mask (0 = draw here, 255 = don't draw here)
|
||||
# Initialize with 0 (black/draw everywhere)
|
||||
mask_image = Image.new("L", canvas_size, 0)
|
||||
draw = ImageDraw.Draw(mask_image)
|
||||
|
||||
|
||||
# 4. Draw a protected circular area in the center
|
||||
center = (canvas_size[0] // 2, canvas_size[1] // 2)
|
||||
|
||||
|
||||
# Calculate radius: half of logo max dimension + buffer
|
||||
radius = (max(logo.size) // 2) + buffer
|
||||
|
||||
|
||||
# Draw the white circle (255) which the WordCloud will avoid
|
||||
draw.ellipse(
|
||||
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
|
||||
fill=255
|
||||
)
|
||||
|
||||
|
||||
chase_mask = np.array(mask_image)
|
||||
|
||||
|
||||
# Generate the WordCloud
|
||||
wordcloud = WordCloud(
|
||||
background_color='white',
|
||||
|
||||
Reference in New Issue
Block a user