diff --git a/02-B_Thematic-Processing.py b/02-B_Thematic-Processing.py index d71cf19..61d5328 100644 --- a/02-B_Thematic-Processing.py +++ b/02-B_Thematic-Processing.py @@ -25,7 +25,7 @@ def _(): client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') - WORKING_DIR = Path('./data/processing/02_taguette_postprocess') + WORKING_DIR = Path('./data/processing/02-b_WordClouds') if not WORKING_DIR.exists(): WORKING_DIR.mkdir(parents=True) @@ -73,7 +73,7 @@ def _(mo): def _(TAGUETTE_EXPORT_DIR, pd): all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv') all_tags_df['_seq_id'] = range(len(all_tags_df)) - all_tags_df + # all_tags_df return (all_tags_df,) @@ -81,7 +81,7 @@ def _(TAGUETTE_EXPORT_DIR, pd): def _(all_tags_df): # get count of rows per tag tag_counts = all_tags_df['tag'].value_counts().reset_index() - tag_counts + # tag_counts return @@ -89,7 +89,7 @@ def _(all_tags_df): def _(TAGUETTE_EXPORT_DIR, pd): codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv') codebook_df.rename(columns={'description': 'theme_description'}, inplace=True) - codebook_df + # codebook_df return @@ -101,25 +101,36 @@ def _(mo): return -@app.cell +@app.cell(hide_code=True) def _(all_tags_df, mo): + + start_processing_btn = None + start_processing_btn = mo.ui.button( + label="Start Keyword Extraction", + kind="warn", + on_click=lambda val: True + ) + tag_select = mo.ui.dropdown( options=all_tags_df['tag'].unique().tolist(), label="Select Tag to Process", value="Chase as a brand", - full_width=True + full_width=True, ) tag_select - return (tag_select,) + return start_processing_btn, tag_select @app.cell def _(all_tags_df, mo, tag_select): mo.stop(not tag_select.value, mo.md("Select tag to continue")) + + tag_fname = tag_select.value.replace(" ", "-").replace('/','-') + # filter all_tags_df to only the document = file_dropdown.value df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() df - return (df,) + return df, tag_fname @app.cell(hide_code=True) @@ -130,37 +141,21 @@ def _(mo): return -@app.cell -def _(mo, tag_select): +@app.cell(hide_code=True) +def _(mo, start_processing_btn, tag_select): mo.stop(not tag_select.value, mo.md("Select tag to continue")) # mdf = mpd.from_pandas(df) - - start_processing_btn = mo.ui.button( - label="Start Keyword Extraction", - kind="warn", - on_click=lambda val: True - ) start_processing_btn - return (start_processing_btn,) + return -@app.cell -def _( - WORKING_DIR, - client, - df, - mo, - model_select, - pd, - start_processing_btn, - tag_select, -): +@app.cell(hide_code=True) +def _(client, df, mo, model_select, pd, start_processing_btn): from utils import ollama_keyword_extraction, worker_extraction # Wait for start processing button mo.stop(not start_processing_btn.value, "Click button above to start processing") - # Run keyword extraction df['keywords'] = df.progress_apply( lambda row: pd.Series(ollama_keyword_extraction( @@ -172,13 +167,55 @@ def _( axis=1 ) - df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws)) - df[['id', 'tag', 'content', 'keywords_txt']].to_csv( - WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv', + return + + +@app.cell(hide_code=True) +def _(WORKING_DIR, df, mo, pd, tag_fname): + # Save results to csv + mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish") + + SAVE_DIR = WORKING_DIR / tag_fname + + if not SAVE_DIR.exists(): + SAVE_DIR.mkdir(parents=True) + + + df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws)) + + df[['id', 'tag', 'content', 'keywords_txt']].to_excel( + SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx', index=False ) - return + + + all_keywords_list = df['keywords'].tolist() + all_keywords_flat = [item for sublist in all_keywords_list for item in sublist] + + # Calculate frequencies per keyword + keyword_freq = {} + for kw in all_keywords_flat: + if kw in keyword_freq: + keyword_freq[kw] += 1 + else: + keyword_freq[kw] = 1 + + freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency']) + freq_df.index.name = 'keyword' + freq_df.reset_index(inplace=True) + freq_df.sort_values(by='frequency', ascending=False, inplace=True) + + _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx' + freq_df.to_excel( + _freq_fpath, + index=False + ) + mo.vstack([ + mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"), + mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`") + ]) + return SAVE_DIR, keyword_freq @app.cell(hide_code=True) @@ -189,7 +226,7 @@ def _(mo): return -@app.cell +@app.cell(hide_code=True) def _(): # Start with loading all necessary libraries import numpy as np @@ -197,26 +234,34 @@ def _(): from PIL import Image, ImageDraw from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator import matplotlib.pyplot as plt + from utils import blue_color_func import warnings warnings.filterwarnings("ignore") - return Image, ImageDraw, WordCloud, np, plt + return Image, ImageDraw, WordCloud, blue_color_func, np, plt -@app.cell -def _(df): - MIN_FREQ = 2 - - all_keywords_list = df['keywords'].tolist() - all_keywords_flat = [item for sublist in all_keywords_list for item in sublist] +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## 5.1) Select threshold frequency + """) + return - keyword_freq = {} - for kw in all_keywords_flat: - if kw in keyword_freq: - keyword_freq[kw] += 1 - else: - keyword_freq[kw] = 1 +@app.cell(hide_code=True) +def _(mo): + min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2) + min_freq_select + return (min_freq_select,) + + +@app.cell(hide_code=True) +def _(df, keyword_freq, min_freq_select, mo): + mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish") + + MIN_FREQ = min_freq_select.value + keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} @@ -227,65 +272,73 @@ def _(df): return (keyword_freq_filtered,) -@app.cell -def _(): - IGNORE_WORDS = { - 'chase as a brand': [ - "brand" - ] - } +@app.cell(hide_code=True) +def _(mo, tag_select): + mo.md(rf""" + ## 5.2) Inspect Keyword Dataset + + 1. Check the threshold is set correctly. If not, adjust accordingly + 2. Check the keywords are good. If not, run extraction again (step 4) + 3. Add explicit exclusions if necessary - return (IGNORE_WORDS,) + Add words to this dict that should be ignored in the WordCloud for specific tags. + Make sure to create the correct key that matches the active selected tag: - -@app.cell -def _(plt): - import random - import matplotlib.colors as mcolors - - def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs): - # Use the provided random_state for reproducibility if available, else use random module - r = random_state if random_state else random - - # Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0) - # 0.0 is white/light, 1.0 is dark blue - min_val, max_val = 0.4, 1.0 - color_val = r.uniform(min_val, max_val) - - # Get color from matplotlib colormap - rgba = plt.cm.Blues(color_val) - return mcolors.to_hex(rgba) - return (blue_color_func,) - - -@app.cell -def _(): - # chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png")) - - # def transform_format(val): - # if val == 0: - # return 255 - # else: - # return 1 - - # transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32) - # for i in range(len(chase_mask)): - # transformed_chase_mask[i] = list(map(transform_format, chase_mask[i])) + Active selected tag = '`{tag_select.value.lower()}`' + """) return @app.cell +def _(): + IGNORE_WORDS = { + 'chase as a brand': [ + "brand", + "banking experience", + "banking", + "chase", + "jpmorgan", + "youthful" + ], + 'why customer chase': [ + "customer service", + "customer loyalty", + "chase", + "chase customer", + "banking experience", + ], + 'chase as a person (personification)': [ + "CPC1" + ] + # : [list, of, words, to, ignore] + } + return (IGNORE_WORDS,) + + +@app.cell(hide_code=True) def _(mo): buffer = -100 # Adjust this to increase/decrease space between logo and words canvas_size = (1200, 800) logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) - logo_switch return buffer, canvas_size, logo_switch +@app.cell(hide_code=True) +def _(logo_switch, mo): + run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud") + + mo.vstack([ + mo.md("## 5.4) Generate WordCloud with/without Logo"), + mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."), + mo.md('---'), + mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')] + ) + return (run_wordcloud_btn,) + + @app.cell(hide_code=True) def _( IGNORE_WORDS, @@ -300,8 +353,12 @@ def _( mo, np, plt, + run_wordcloud_btn, tag_select, ): + if run_wordcloud_btn.value: + pass + # remove specific keywords depending on selected tag if IGNORE_WORDS.get(tag_select.value.lower()): for word in IGNORE_WORDS[tag_select.value.lower()]: @@ -364,7 +421,7 @@ def _( background_color='white', width=canvas_size[0], height=canvas_size[1], - max_font_size=100, # Increased font size for larger canvas + max_font_size=150, # Increased font size for larger canvas max_words=20, # Increased word count to fill space color_func=blue_color_func, # mask=chase_mask, # Apply the circular mask @@ -386,8 +443,8 @@ def _( # Paste logo (using alpha channel as mask to keep transparency) wc_image.paste(logo, logo_pos, logo) - # Display the generated image - fig = plt.figure(figsize=(7,7)) + # Display the generated image + fig = plt.figure(figsize=(7,7)) # Display the generated image: plt.imshow(wc_image, interpolation='bilinear') @@ -396,7 +453,7 @@ def _( save_wordcloud_btn = None save_wordcloud_btn = mo.ui.button( - label="Save_wordcloud_button", + label="Save WordCloud to File", kind="warn", on_click=lambda val: True ) @@ -404,17 +461,19 @@ def _( return save_wordcloud_btn, wc_image -@app.cell -def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image): +@app.cell(hide_code=True) +def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image): # Wait for start processing button mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image") - filename = f'wordcloud_{tag_select.value.replace(" ", "-")}.png' - fpath = WORKING_DIR / filename + filename = f'wordcloud_{tag_fname}.png' + + + fpath = SAVE_DIR / filename # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first - existing_files = list(WORKING_DIR.glob(f'wordcloud_{tag_select.value.replace(" ", "-")}*.png')) + existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png')) if existing_files: existing_numbers = [] for ef in existing_files: @@ -425,7 +484,7 @@ def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image): next_number = max(existing_numbers) + 1 else: next_number = 1 - fpath = WORKING_DIR / f'wordcloud_{tag_select.value.replace(" ", "-")}_{next_number}.png' + fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png' wc_image.save(fpath) mo.md(f"Wordcloud saved to: {fpath}") diff --git a/utils/__init__.py b/utils/__init__.py index 40eea45..13d228e 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -2,4 +2,4 @@ from .ollama_utils import connect_qumo_ollama from .data_utils import create_sentiment_matrix, extract_theme from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis -from .keyword_analysis import ollama_keyword_extraction, worker_extraction +from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func diff --git a/utils/keyword_analysis.py b/utils/keyword_analysis.py index 9df75b4..c66666a 100644 --- a/utils/keyword_analysis.py +++ b/utils/keyword_analysis.py @@ -2,6 +2,23 @@ import pandas as pd from ollama import Client import json +import matplotlib.pyplot as plt + +import random +import matplotlib.colors as mcolors + +def blue_color_func( word, font_size, position, orientation, random_state=None, **kwargs): + # Use the provided random_state for reproducibility if available, else use random module + r = random_state if random_state else random + + # Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0) + # 0.0 is white/light, 1.0 is dark blue + min_val, max_val = 0.4, 1.0 + color_val = r.uniform(min_val, max_val) + + # Get color from matplotlib colormap + rgba = plt.cm.Blues(color_val) + return mcolors.to_hex(rgba) def worker_extraction(row, host, model):