Clean up notebook and make it usable

This commit is contained in:
2025-12-16 20:15:44 -08:00
parent 4ba8af03d2
commit e81961b819
3 changed files with 177 additions and 101 deletions

View File

@@ -25,7 +25,7 @@ def _():
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess') WORKING_DIR = Path('./data/processing/02-b_WordClouds')
if not WORKING_DIR.exists(): if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True) WORKING_DIR.mkdir(parents=True)
@@ -73,7 +73,7 @@ def _(mo):
def _(TAGUETTE_EXPORT_DIR, pd): def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv') all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df)) all_tags_df['_seq_id'] = range(len(all_tags_df))
all_tags_df # all_tags_df
return (all_tags_df,) return (all_tags_df,)
@@ -81,7 +81,7 @@ def _(TAGUETTE_EXPORT_DIR, pd):
def _(all_tags_df): def _(all_tags_df):
# get count of rows per tag # get count of rows per tag
tag_counts = all_tags_df['tag'].value_counts().reset_index() tag_counts = all_tags_df['tag'].value_counts().reset_index()
tag_counts # tag_counts
return return
@@ -89,7 +89,7 @@ def _(all_tags_df):
def _(TAGUETTE_EXPORT_DIR, pd): def _(TAGUETTE_EXPORT_DIR, pd):
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv') codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True) codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
codebook_df # codebook_df
return return
@@ -101,25 +101,36 @@ def _(mo):
return return
@app.cell @app.cell(hide_code=True)
def _(all_tags_df, mo): def _(all_tags_df, mo):
start_processing_btn = None
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
kind="warn",
on_click=lambda val: True
)
tag_select = mo.ui.dropdown( tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(), options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process", label="Select Tag to Process",
value="Chase as a brand", value="Chase as a brand",
full_width=True full_width=True,
) )
tag_select tag_select
return (tag_select,) return start_processing_btn, tag_select
@app.cell @app.cell
def _(all_tags_df, mo, tag_select): def _(all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) mo.stop(not tag_select.value, mo.md("Select tag to continue"))
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
# filter all_tags_df to only the document = file_dropdown.value # filter all_tags_df to only the document = file_dropdown.value
df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
df df
return (df,) return df, tag_fname
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -130,37 +141,21 @@ def _(mo):
return return
@app.cell @app.cell(hide_code=True)
def _(mo, tag_select): def _(mo, start_processing_btn, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) mo.stop(not tag_select.value, mo.md("Select tag to continue"))
# mdf = mpd.from_pandas(df) # mdf = mpd.from_pandas(df)
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
kind="warn",
on_click=lambda val: True
)
start_processing_btn start_processing_btn
return (start_processing_btn,) return
@app.cell @app.cell(hide_code=True)
def _( def _(client, df, mo, model_select, pd, start_processing_btn):
WORKING_DIR,
client,
df,
mo,
model_select,
pd,
start_processing_btn,
tag_select,
):
from utils import ollama_keyword_extraction, worker_extraction from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button # Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing") mo.stop(not start_processing_btn.value, "Click button above to start processing")
# Run keyword extraction # Run keyword extraction
df['keywords'] = df.progress_apply( df['keywords'] = df.progress_apply(
lambda row: pd.Series(ollama_keyword_extraction( lambda row: pd.Series(ollama_keyword_extraction(
@@ -172,13 +167,55 @@ def _(
axis=1 axis=1
) )
df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws))
df[['id', 'tag', 'content', 'keywords_txt']].to_csv( return
WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv',
@app.cell(hide_code=True)
def _(WORKING_DIR, df, mo, pd, tag_fname):
# Save results to csv
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
index=False index=False
) )
return
all_keywords_list = df['keywords'].tolist()
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
# Calculate frequencies per keyword
keyword_freq = {}
for kw in all_keywords_flat:
if kw in keyword_freq:
keyword_freq[kw] += 1
else:
keyword_freq[kw] = 1
freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
freq_df.index.name = 'keyword'
freq_df.reset_index(inplace=True)
freq_df.sort_values(by='frequency', ascending=False, inplace=True)
_freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
freq_df.to_excel(
_freq_fpath,
index=False
)
mo.vstack([
mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
])
return SAVE_DIR, keyword_freq
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -189,7 +226,7 @@ def _(mo):
return return
@app.cell @app.cell(hide_code=True)
def _(): def _():
# Start with loading all necessary libraries # Start with loading all necessary libraries
import numpy as np import numpy as np
@@ -197,26 +234,34 @@ def _():
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from utils import blue_color_func
import warnings import warnings
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
return Image, ImageDraw, WordCloud, np, plt return Image, ImageDraw, WordCloud, blue_color_func, np, plt
@app.cell @app.cell(hide_code=True)
def _(df): def _(mo):
MIN_FREQ = 2 mo.md(r"""
## 5.1) Select threshold frequency
all_keywords_list = df['keywords'].tolist() """)
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist] return
keyword_freq = {} @app.cell(hide_code=True)
for kw in all_keywords_flat: def _(mo):
if kw in keyword_freq: min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
keyword_freq[kw] += 1 min_freq_select
else: return (min_freq_select,)
keyword_freq[kw] = 1
@app.cell(hide_code=True)
def _(df, keyword_freq, min_freq_select, mo):
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value
keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
@@ -227,65 +272,73 @@ def _(df):
return (keyword_freq_filtered,) return (keyword_freq_filtered,)
@app.cell @app.cell(hide_code=True)
def _(): def _(mo, tag_select):
IGNORE_WORDS = { mo.md(rf"""
'chase as a brand': [ ## 5.2) Inspect Keyword Dataset
"brand"
] 1. Check the threshold is set correctly. If not, adjust accordingly
} 2. Check the keywords are good. If not, run extraction again (step 4)
3. Add explicit exclusions if necessary
return (IGNORE_WORDS,) Add words to this dict that should be ignored in the WordCloud for specific tags.
Make sure to create the correct key that matches the active selected tag:
Active selected tag = '`{tag_select.value.lower()}`'
@app.cell """)
def _(plt):
import random
import matplotlib.colors as mcolors
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
# Use the provided random_state for reproducibility if available, else use random module
r = random_state if random_state else random
# Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0)
# 0.0 is white/light, 1.0 is dark blue
min_val, max_val = 0.4, 1.0
color_val = r.uniform(min_val, max_val)
# Get color from matplotlib colormap
rgba = plt.cm.Blues(color_val)
return mcolors.to_hex(rgba)
return (blue_color_func,)
@app.cell
def _():
# chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png"))
# def transform_format(val):
# if val == 0:
# return 255
# else:
# return 1
# transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32)
# for i in range(len(chase_mask)):
# transformed_chase_mask[i] = list(map(transform_format, chase_mask[i]))
return return
@app.cell @app.cell
def _():
    # Per-tag keyword exclusions for the WordCloud.
    # Keys are the lower-cased tag names (the consuming cell looks them up with
    # `tag_select.value.lower()`); values are the keywords to ignore for that tag.
    IGNORE_WORDS = {}

    IGNORE_WORDS['chase as a brand'] = [
        "brand",
        "banking experience",
        "banking",
        "chase",
        "jpmorgan",
        "youthful",
    ]

    IGNORE_WORDS['why customer chase'] = [
        "customer service",
        "customer loyalty",
        "chase",
        "chase customer",
        "banking experience",
    ]

    IGNORE_WORDS['chase as a person (personification)'] = ["CPC1"]

    # Template for further entries:
    # <active-selected-tag>: [list, of, words, to, ignore]
    return (IGNORE_WORDS,)
@app.cell(hide_code=True)
def _(mo): def _(mo):
buffer = -100 # Adjust this to increase/decrease space between logo and words buffer = -100 # Adjust this to increase/decrease space between logo and words
canvas_size = (1200, 800) canvas_size = (1200, 800)
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
logo_switch
return buffer, canvas_size, logo_switch return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
def _(logo_switch, mo):
    # Button used to (re-)trigger WordCloud generation; returned so other cells can read it.
    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")

    # Underscore-prefixed names keep these helpers local to this cell.
    _heading = mo.md("## 5.4) Generate WordCloud with/without Logo")
    _instructions = mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image.")
    _divider = mo.md('---')
    # Logo switch and run button side by side.
    _controls = mo.hstack(
        [logo_switch, run_wordcloud_btn],
        align='center',
        justify='space-around',
    )

    # Bare expression kept as the last statement before `return`, as in the original
    # cell (presumably what marimo renders as the cell output -- confirm).
    mo.vstack([_heading, _instructions, _divider, _controls])
    return (run_wordcloud_btn,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _( def _(
IGNORE_WORDS, IGNORE_WORDS,
@@ -300,8 +353,12 @@ def _(
mo, mo,
np, np,
plt, plt,
run_wordcloud_btn,
tag_select, tag_select,
): ):
if run_wordcloud_btn.value:
pass
# remove specific keywords depending on selected tag # remove specific keywords depending on selected tag
if IGNORE_WORDS.get(tag_select.value.lower()): if IGNORE_WORDS.get(tag_select.value.lower()):
for word in IGNORE_WORDS[tag_select.value.lower()]: for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -364,7 +421,7 @@ def _(
background_color='white', background_color='white',
width=canvas_size[0], width=canvas_size[0],
height=canvas_size[1], height=canvas_size[1],
max_font_size=100, # Increased font size for larger canvas max_font_size=150, # Increased font size for larger canvas
max_words=20, # Increased word count to fill space max_words=20, # Increased word count to fill space
color_func=blue_color_func, color_func=blue_color_func,
# mask=chase_mask, # Apply the circular mask # mask=chase_mask, # Apply the circular mask
@@ -396,7 +453,7 @@ def _(
save_wordcloud_btn = None save_wordcloud_btn = None
save_wordcloud_btn = mo.ui.button( save_wordcloud_btn = mo.ui.button(
label="Save_wordcloud_button", label="Save WordCloud to File",
kind="warn", kind="warn",
on_click=lambda val: True on_click=lambda val: True
) )
@@ -404,17 +461,19 @@ def _(
return save_wordcloud_btn, wc_image return save_wordcloud_btn, wc_image
@app.cell @app.cell(hide_code=True)
def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image): def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
# Wait for start processing button # Wait for start processing button
mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image") mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")
filename = f'wordcloud_{tag_select.value.replace(" ", "-")}.png' filename = f'wordcloud_{tag_fname}.png'
fpath = WORKING_DIR / filename
fpath = SAVE_DIR / filename
# add a (increasing) number to the filename so we can save multiple. find the latest in the directory first # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
existing_files = list(WORKING_DIR.glob(f'wordcloud_{tag_select.value.replace(" ", "-")}*.png')) existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
if existing_files: if existing_files:
existing_numbers = [] existing_numbers = []
for ef in existing_files: for ef in existing_files:
@@ -425,7 +484,7 @@ def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
next_number = max(existing_numbers) + 1 next_number = max(existing_numbers) + 1
else: else:
next_number = 1 next_number = 1
fpath = WORKING_DIR / f'wordcloud_{tag_select.value.replace(" ", "-")}_{next_number}.png' fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'
wc_image.save(fpath) wc_image.save(fpath)
mo.md(f"Wordcloud saved to: {fpath}") mo.md(f"Wordcloud saved to: {fpath}")

View File

@@ -2,4 +2,4 @@ from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
from .keyword_analysis import ollama_keyword_extraction, worker_extraction from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func

View File

@@ -2,6 +2,23 @@ import pandas as pd
from ollama import Client from ollama import Client
import json import json
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random hex color from the darker part of the 'Blues' colormap.

    The notebook passes this function as ``color_func`` to ``WordCloud``.
    ``word``, ``font_size``, ``position``, ``orientation`` and any extra
    keyword arguments are accepted but not used.

    Args:
        random_state: Object exposing ``uniform(a, b)``. When it is falsy
            (e.g. ``None``), the standard-library ``random`` module is used.

    Returns:
        The sampled colormap entry converted with ``matplotlib.colors.to_hex``.
    """
    # Prefer the caller-supplied generator for reproducibility; otherwise use `random`.
    rng = random_state or random

    # On the 'Blues' colormap 0.0 is white/light and 1.0 is dark blue, so
    # sampling is restricted to the darker 0.4-1.0 band.
    shade = rng.uniform(0.4, 1.0)

    return mcolors.to_hex(plt.cm.Blues(shade))
def worker_extraction(row, host, model): def worker_extraction(row, host, model):