cleanup notebook and make usable

This commit is contained in:
2025-12-16 20:15:44 -08:00
parent 4ba8af03d2
commit e81961b819
3 changed files with 177 additions and 101 deletions

View File

@@ -25,7 +25,7 @@ def _():
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
@@ -73,7 +73,7 @@ def _(mo):
def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df))
all_tags_df
# all_tags_df
return (all_tags_df,)
@@ -81,7 +81,7 @@ def _(TAGUETTE_EXPORT_DIR, pd):
def _(all_tags_df):
# get count of rows per tag
tag_counts = all_tags_df['tag'].value_counts().reset_index()
tag_counts
# tag_counts
return
@@ -89,7 +89,7 @@ def _(all_tags_df):
def _(TAGUETTE_EXPORT_DIR, pd):
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
codebook_df
# codebook_df
return
@@ -101,25 +101,36 @@ def _(mo):
return
@app.cell
@app.cell(hide_code=True)
def _(all_tags_df, mo):
start_processing_btn = None
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
kind="warn",
on_click=lambda val: True
)
tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process",
value="Chase as a brand",
full_width=True
full_width=True,
)
tag_select
return (tag_select,)
return start_processing_btn, tag_select
@app.cell
def _(all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
# filter all_tags_df to only the document = file_dropdown.value
df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
df
return (df,)
return df, tag_fname
@app.cell(hide_code=True)
@@ -130,37 +141,21 @@ def _(mo):
return
@app.cell
def _(mo, tag_select):
@app.cell(hide_code=True)
def _(mo, start_processing_btn, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
# mdf = mpd.from_pandas(df)
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
kind="warn",
on_click=lambda val: True
)
start_processing_btn
return (start_processing_btn,)
return
@app.cell
def _(
WORKING_DIR,
client,
df,
mo,
model_select,
pd,
start_processing_btn,
tag_select,
):
@app.cell(hide_code=True)
def _(client, df, mo, model_select, pd, start_processing_btn):
from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing")
# Run keyword extraction
df['keywords'] = df.progress_apply(
lambda row: pd.Series(ollama_keyword_extraction(
@@ -172,13 +167,55 @@ def _(
axis=1
)
df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws))
df[['id', 'tag', 'content', 'keywords_txt']].to_csv(
WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv',
return
@app.cell(hide_code=True)
def _(WORKING_DIR, df, mo, pd, tag_fname):
    # Save results to xlsx: one workbook with the keywords per highlight and
    # one with the aggregated keyword frequencies for the selected tag.
    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")

    # One output folder per tag. exist_ok makes the call idempotent, so no
    # separate exists() check is needed.
    SAVE_DIR = WORKING_DIR / tag_fname
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Build each output path once so the file that is written and the path
    # reported to the user cannot drift apart.
    _per_highlight_fpath = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
    _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'

    # Flatten each keyword list into a comma-separated string for the export.
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        _per_highlight_fpath,
        index=False
    )

    all_keywords_list = df['keywords'].tolist()
    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]

    # Calculate frequencies per keyword (plain dict, insertion-ordered by
    # first occurrence; returned for the threshold/word-cloud cells).
    keyword_freq = {}
    for kw in all_keywords_flat:
        keyword_freq[kw] = keyword_freq.get(kw, 0) + 1

    freq_df = pd.DataFrame(list(keyword_freq.items()), columns=['keyword', 'frequency'])
    # Stable sort keeps first-seen order among keywords with equal frequency.
    freq_df.sort_values(by='frequency', ascending=False, inplace=True, kind='mergesort')
    freq_df.to_excel(
        _freq_fpath,
        index=False
    )

    # Last expression of the cell: shown as the cell output.
    mo.vstack([
        mo.md(f"Keywords per-highlight saved to: `{_per_highlight_fpath}`"),
        mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
    ])
    return SAVE_DIR, keyword_freq
@app.cell(hide_code=True)
@@ -189,7 +226,7 @@ def _(mo):
return
@app.cell
@app.cell(hide_code=True)
def _():
# Start with loading all necessary libraries
import numpy as np
@@ -197,26 +234,34 @@ def _():
from PIL import Image, ImageDraw
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from utils import blue_color_func
import warnings
warnings.filterwarnings("ignore")
return Image, ImageDraw, WordCloud, np, plt
return Image, ImageDraw, WordCloud, blue_color_func, np, plt
@app.cell
def _(df):
MIN_FREQ = 2
all_keywords_list = df['keywords'].tolist()
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## 5.1) Select threshold frequency
""")
return
keyword_freq = {}
for kw in all_keywords_flat:
if kw in keyword_freq:
keyword_freq[kw] += 1
else:
keyword_freq[kw] = 1
@app.cell(hide_code=True)
def _(mo):
    # Numeric input for the minimum keyword frequency (range 1-20, default 2).
    # Its value is read as MIN_FREQ by the cell that filters keyword_freq.
    min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
    # Bare expression: rendered as the cell output.
    min_freq_select
    return (min_freq_select,)
@app.cell(hide_code=True)
def _(df, keyword_freq, min_freq_select, mo):
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value
keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
@@ -227,65 +272,73 @@ def _(df):
return (keyword_freq_filtered,)
@app.cell
def _():
IGNORE_WORDS = {
'chase as a brand': [
"brand"
]
}
@app.cell(hide_code=True)
def _(mo, tag_select):
mo.md(rf"""
## 5.2) Inspect Keyword Dataset
1. Check the threshold is set correctly. If not, adjust accordingly
2. Check the keywords are good. If not, run extraction again (step 4)
3. Add explicit exclusions if necessary
return (IGNORE_WORDS,)
Add words to this dict that should be ignored in the WordCloud for specific tags.
Make sure to create the correct key that matches the active selected tag:
@app.cell
def _(plt):
import random
import matplotlib.colors as mcolors
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
# Use the provided random_state for reproducibility if available, else use random module
r = random_state if random_state else random
# Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0)
# 0.0 is white/light, 1.0 is dark blue
min_val, max_val = 0.4, 1.0
color_val = r.uniform(min_val, max_val)
# Get color from matplotlib colormap
rgba = plt.cm.Blues(color_val)
return mcolors.to_hex(rgba)
return (blue_color_func,)
@app.cell
def _():
# chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png"))
# def transform_format(val):
# if val == 0:
# return 255
# else:
# return 1
# transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32)
# for i in range(len(chase_mask)):
# transformed_chase_mask[i] = list(map(transform_format, chase_mask[i]))
Active selected tag = '`{tag_select.value.lower()}`'
""")
return
@app.cell
def _():
    # Per-tag keyword exclusions for the WordCloud.
    # Keys are looked up with `tag_select.value.lower()`, so each key must be
    # the lower-cased tag name; values are the keywords to drop for that tag.
    IGNORE_WORDS = {
        'chase as a brand': [
            "brand",
            "banking experience",
            "banking",
            "chase",
            "jpmorgan",
            "youthful"
        ],
        'why customer chase': [
            "customer service",
            "customer loyalty",
            "chase",
            "chase customer",
            "banking experience",
        ],
        'chase as a person (personification)': [
            # NOTE(review): only upper-case entry in this dict; whether keyword
            # matching is case-sensitive is not visible here - confirm.
            "CPC1"
        ]
        # <active-selected-tag>: [list, of, words, to, ignore]
    }
    return (IGNORE_WORDS,)
@app.cell(hide_code=True)
def _(mo):
    # Layout settings and the logo toggle consumed by the WordCloud cell.
    buffer = -100 # Adjust this to increase/decrease space between logo and words
    # Used as (width, height) when the WordCloud is constructed.
    canvas_size = (1200, 800)
    # Switch is off by default, i.e. the cloud is rendered without the logo
    # unless the user enables it.
    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
    # Bare expression: rendered as the cell output.
    logo_switch
    return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
def _(logo_switch, mo):
    # Button whose `.value` is read by the WordCloud cell.
    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")

    # Cell output: section heading, instructions, a divider, and the logo
    # switch side by side with the (re-)generate button.
    mo.vstack([
        mo.md("## 5.4) Generate WordCloud with/without Logo"),
        mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
        mo.md('---'),
        mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
    )
    return (run_wordcloud_btn,)
@app.cell(hide_code=True)
def _(
IGNORE_WORDS,
@@ -300,8 +353,12 @@ def _(
mo,
np,
plt,
run_wordcloud_btn,
tag_select,
):
if run_wordcloud_btn.value:
pass
# remove specific keywords depending on selected tag
if IGNORE_WORDS.get(tag_select.value.lower()):
for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -364,7 +421,7 @@ def _(
background_color='white',
width=canvas_size[0],
height=canvas_size[1],
max_font_size=100, # Increased font size for larger canvas
max_font_size=150, # Increased font size for larger canvas
max_words=20, # Increased word count to fill space
color_func=blue_color_func,
# mask=chase_mask, # Apply the circular mask
@@ -386,8 +443,8 @@ def _(
# Paste logo (using alpha channel as mask to keep transparency)
wc_image.paste(logo, logo_pos, logo)
# Display the generated image
fig = plt.figure(figsize=(7,7))
# Display the generated image
fig = plt.figure(figsize=(7,7))
# Display the generated image:
plt.imshow(wc_image, interpolation='bilinear')
@@ -396,7 +453,7 @@ def _(
save_wordcloud_btn = None
save_wordcloud_btn = mo.ui.button(
label="Save_wordcloud_button",
label="Save WordCloud to File",
kind="warn",
on_click=lambda val: True
)
@@ -404,17 +461,19 @@ def _(
return save_wordcloud_btn, wc_image
@app.cell
def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
@app.cell(hide_code=True)
def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
# Wait for the save button to be clicked
mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")
filename = f'wordcloud_{tag_select.value.replace(" ", "-")}.png'
fpath = WORKING_DIR / filename
filename = f'wordcloud_{tag_fname}.png'
fpath = SAVE_DIR / filename
# add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
existing_files = list(WORKING_DIR.glob(f'wordcloud_{tag_select.value.replace(" ", "-")}*.png'))
existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
if existing_files:
existing_numbers = []
for ef in existing_files:
@@ -425,7 +484,7 @@ def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
next_number = max(existing_numbers) + 1
else:
next_number = 1
fpath = WORKING_DIR / f'wordcloud_{tag_select.value.replace(" ", "-")}_{next_number}.png'
fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'
wc_image.save(fpath)
mo.md(f"Wordcloud saved to: {fpath}")

View File

@@ -2,4 +2,4 @@ from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
from .keyword_analysis import ollama_keyword_extraction, worker_extraction
from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func

View File

@@ -2,6 +2,23 @@ import pandas as pd
from ollama import Client
import json
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs) -> str:
    """Return a random hex colour from the darker part of the 'Blues' colormap.

    Intended to be passed as ``color_func`` to ``WordCloud``. ``word``,
    ``font_size``, ``position``, ``orientation`` and any extra keyword
    arguments are accepted for signature compatibility and are not used.

    Args:
        random_state: Object exposing ``uniform(a, b)``, used so colours are
            reproducible. When ``None`` the ``random`` module is used.

    Returns:
        The sampled colour as a hex string.
    """
    # Explicit None check: only a missing random_state should fall back to
    # the global random module, not any object that happens to be falsy.
    rng = random_state if random_state is not None else random

    # Sample from the darker end of the 'Blues' colormap (0.4 to 1.0);
    # 0.0 is white/light, 1.0 is dark blue.
    min_val, max_val = 0.4, 1.0
    color_val = rng.uniform(min_val, max_val)

    # Get color from matplotlib colormap
    rgba = plt.cm.Blues(color_val)
    return mcolors.to_hex(rgba)
def worker_extraction(row, host, model):