cleanup notebook and make usable

2025-12-16 20:15:44 -08:00
parent 4ba8af03d2
commit e81961b819
3 changed files with 177 additions and 101 deletions
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -25,7 +25,7 @@ def _():
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
-    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
+    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
@@ -73,7 +73,7 @@ def _(mo):
 def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the all-tags CSV from the Taguette export directory.
    all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
    # Add a 0-based running number per row (in file order) as '_seq_id'.
    all_tags_df['_seq_id'] = range(len(all_tags_df))
-    all_tags_df
+    # all_tags_df
    return (all_tags_df,)
@@ -81,7 +81,7 @@ def _(TAGUETTE_EXPORT_DIR, pd):
 def _(all_tags_df):
    # get count of rows per tag
    # NOTE: tag_counts is not returned, so it is only available inside this cell.
    tag_counts = all_tags_df['tag'].value_counts().reset_index()
-    tag_counts
+    # tag_counts
    return
@@ -89,7 +89,7 @@ def _(all_tags_df):
 def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the codebook CSV from the Taguette export directory.
    codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
    # Rename the 'description' column to 'theme_description' (in place).
    # NOTE: codebook_df is not returned, so it is only available inside this cell.
    codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
-    codebook_df
+    # codebook_df
    return
@@ -101,25 +101,36 @@ def _(mo):
    return
-@app.cell
+@app.cell(hide_code=True)
 def _(all_tags_df, mo):
    # Build the UI controls for step selection: a "start" button and a tag dropdown.
    # NOTE(review): this None assignment is overwritten by the very next statement.
    start_processing_btn = None
    # on_click ignores the previous value and always returns True.
    start_processing_btn = mo.ui.button(
        label="Start Keyword Extraction",
        kind="warn",
        on_click=lambda val: True
    )
    # Dropdown options are the distinct values of the 'tag' column.
    # NOTE(review): the default value is hard-coded; presumably it must be one of
    # the options -- confirm 'Chase as a brand' is always present in all_tags.csv.
    tag_select = mo.ui.dropdown(
        options=all_tags_df['tag'].unique().tolist(),
        label="Select Tag to Process",
        value="Chase as a brand",
-        full_width=True
+        full_width=True,
    )
    tag_select
-    return (tag_select,)
+    return start_processing_btn, tag_select
@app.cell
 def _(all_tags_df, mo, tag_select):
    # Halt this cell (showing a hint) until a tag has been selected.
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    # File-name-friendly form of the tag: spaces and '/' are replaced by '-'.
    tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
    # keep only the rows of all_tags_df whose 'tag' equals the selected tag;
    # .copy() so later column assignments do not touch all_tags_df
    df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
    df
-    return (df,)
+    return df, tag_fname
@app.cell(hide_code=True)
@@ -130,37 +141,21 @@ def _(mo):
    return
-@app.cell
+@app.cell(hide_code=True)
-def _(mo, tag_select):
+def _(mo, start_processing_btn, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    # mdf = mpd.from_pandas(df)
    start_processing_btn = mo.ui.button(
        label="Start Keyword Extraction",
        kind="warn",
        on_click=lambda val: True
    )
    start_processing_btn
-    return (start_processing_btn,)
+    return
-@app.cell
+@app.cell(hide_code=True)
-def _(
+def _(client, df, mo, model_select, pd, start_processing_btn):
    WORKING_DIR,
    client,
    df,
    mo,
    model_select,
    pd,
    start_processing_btn,
    tag_select,
 ):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")
    # Run keyword extraction
    df['keywords'] = df.progress_apply(
        lambda row: pd.Series(ollama_keyword_extraction(
@@ -172,13 +167,55 @@ def _(
        axis=1
    )
    df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws))
-    df[['id', 'tag', 'content', 'keywords_txt']].to_csv(
+    return
-        WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv',
+
@app.cell(hide_code=True)
def _(WORKING_DIR, df, mo, pd, tag_fname):
    # Persist the keyword-extraction results for the selected tag as Excel files:
    # one sheet with the keywords per highlight, one with the keyword frequencies.
    # Exports SAVE_DIR (per-tag output folder) and keyword_freq ({keyword: count}).
    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")

    # One output folder per tag; exist_ok keeps re-runs for the same tag from failing.
    SAVE_DIR = WORKING_DIR / tag_fname
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Per-highlight export: each row's keywords are joined into one comma-separated string.
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
    _per_highlight_fpath = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        _per_highlight_fpath,
        index=False
    )

    # Count how often each keyword occurs across all rows of df.
    keyword_freq = {}
    for _kws in df['keywords'].tolist():
        for _kw in _kws:
            keyword_freq[_kw] = keyword_freq.get(_kw, 0) + 1

    # Frequency table, most frequent keyword first.
    freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
    freq_df.index.name = 'keyword'
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)

    _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
    freq_df.to_excel(
        _freq_fpath,
        index=False
    )

    # Last expression of the cell: shown as the cell output.
    mo.vstack([
        mo.md(f"Keywords per-highlight saved to: `{_per_highlight_fpath}`"),
        mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
    ])
    return SAVE_DIR, keyword_freq
@app.cell(hide_code=True)
@@ -189,7 +226,7 @@ def _(mo):
    return
-@app.cell
+@app.cell(hide_code=True)
 def _():
    # Start with loading all necessary libraries
    import numpy as np
@@ -197,26 +234,34 @@ def _():
    from PIL import Image, ImageDraw
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt
    from utils import blue_color_func
    import warnings
    warnings.filterwarnings("ignore")
-    return Image, ImageDraw, WordCloud, np, plt
+    return Image, ImageDraw, WordCloud, blue_color_func, np, plt
-@app.cell
+@app.cell(hide_code=True)
-def _(df):
+def _(mo):
-    MIN_FREQ = 2
+    mo.md(r"""
-
+    ## 5.1) Select threshold frequency
-    all_keywords_list = df['keywords'].tolist()
+    """)
-    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
+    return
-    keyword_freq = {}
+@app.cell(hide_code=True)
-    for kw in all_keywords_flat:
+def _(mo):
-        if kw in keyword_freq:
+    min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
-            keyword_freq[kw] += 1
+    min_freq_select
-        else:
+    return (min_freq_select,)
-            keyword_freq[kw] = 1
+
@app.cell(hide_code=True)
 def _(df, keyword_freq, min_freq_select, mo):
    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
    MIN_FREQ = min_freq_select.value
    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
@@ -227,65 +272,73 @@ def _(df):
    return (keyword_freq_filtered,)
-@app.cell
+@app.cell(hide_code=True)
-def _():
+def _(mo, tag_select):
-    IGNORE_WORDS = {
+    mo.md(rf"""
-        'chase as a brand': [
+    ## 5.2) Inspect Keyword Dataset
-            "brand"
+
-        ]
+    1. Check the threshold is set correctly. If not, adjust accordingly
-    }
+    2. Check the keywords are good. If not, run extraction again (step 4)
    3. Add explicit exclusions if necessary
-    return (IGNORE_WORDS,)
+    Add words to this dict that should be ignored in the WordCloud for specific tags. 
    Make sure to create the correct key that matches the active selected tag: 
-
+    Active selected tag = '`{tag_select.value.lower()}`'
-@app.cell
+    """)
 def _(plt):
    import random
    import matplotlib.colors as mcolors

    # NOTE(review): blue_color_func is also imported from utils in the
    # library-loading cell of this notebook; confirm only one definition remains.
    def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        """Return a random hex colour from the darker part of the 'Blues' colormap.

        word, font_size, position, orientation and **kwargs are accepted but
        not used. random_state, when given, must provide uniform(a, b); when it
        is None the random module is used instead.
        """
        # Explicit None check: only a missing random_state falls back to the module.
        rng = random_state if random_state is not None else random

        # 0.0 is white/light, 1.0 is dark blue; sampling 0.4-1.0 keeps to the darker end.
        min_val, max_val = 0.4, 1.0
        rgba = plt.cm.Blues(rng.uniform(min_val, max_val))
        return mcolors.to_hex(rgba)
    return (blue_color_func,)
@app.cell
 def _():
    # Disabled code, kept for reference: this cell currently does nothing but return.
    # When enabled it would load the Chase logo PNG into an array and build
    # transformed_chase_mask by remapping every value (0 -> 255, anything else -> 1).
    # chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png"))
    # def transform_format(val):
    #     if val == 0:
    #         return 255
    #     else:
    #         return 1
    # transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32)
    # for i in range(len(chase_mask)):
    #     transformed_chase_mask[i] = list(map(transform_format, chase_mask[i]))
    return
@app.cell
def _():
    # Words to drop from the WordCloud, grouped by tag.
    # Each key is the tag name in lower case; the WordCloud cell looks entries up
    # with the lower-cased value of the tag dropdown.
    _brand_exclusions = [
        "brand",
        "banking experience",
        "banking",
        "chase",
        "jpmorgan",
        "youthful",
    ]
    _why_customer_exclusions = [
        "customer service",
        "customer loyalty",
        "chase",
        "chase customer",
        "banking experience",
    ]
    _personification_exclusions = ["CPC1"]

    IGNORE_WORDS = {
        'chase as a brand': _brand_exclusions,
        'why customer chase': _why_customer_exclusions,
        'chase as a person (personification)': _personification_exclusions,
        # <active-selected-tag>: [list, of, words, to, ignore]
    }
    return (IGNORE_WORDS,)
@app.cell(hide_code=True)
def _(mo):
    # WordCloud layout settings plus the toggle that decides whether the logo is drawn.
    # canvas_size is (width, height), as consumed by the WordCloud cell.
    canvas_size = 1200, 800
    # Adjust this to increase/decrease space between logo and words
    buffer = -100
    logo_switch = mo.ui.switch(value=False, label="Include Chase Logo")
    # Bare expression: the switch is this cell's displayed output.
    logo_switch
    return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
def _(logo_switch, mo):
    # Section 5.4 panel: heading, usage hint, separator, then the logo switch
    # and the (re-)generate button side by side.
    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")

    _heading = mo.md("## 5.4) Generate WordCloud with/without Logo")
    _hint = mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image.")
    _separator = mo.md('---')
    _controls = mo.hstack(
        [logo_switch, run_wordcloud_btn],
        align='center',
        justify='space-around',
    )

    # Last expression of the cell: rendered as the cell output.
    mo.vstack([_heading, _hint, _separator, _controls])
    return (run_wordcloud_btn,)
@app.cell(hide_code=True)
 def _(
    IGNORE_WORDS,
@@ -300,8 +353,12 @@ def _(
    mo,
    np,
    plt,
    run_wordcloud_btn,
    tag_select,
 ):
    if run_wordcloud_btn.value:
        pass
    # remove specific keywords depending on selected tag
    if IGNORE_WORDS.get(tag_select.value.lower()):
        for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -364,7 +421,7 @@ def _(
            background_color='white',
            width=canvas_size[0],
            height=canvas_size[1],
-            max_font_size=100,  # Increased font size for larger canvas
+            max_font_size=150,  # Increased font size for larger canvas
            max_words=20,      # Increased word count to fill space
            color_func=blue_color_func,
            # mask=chase_mask,    # Apply the circular mask
@@ -386,8 +443,8 @@ def _(
        # Paste logo (using alpha channel as mask to keep transparency)
        wc_image.paste(logo, logo_pos, logo)
-    # Display the generated image
+        # Display the generated image
-    fig = plt.figure(figsize=(7,7))
+        fig = plt.figure(figsize=(7,7))
    # Display the generated image:
    plt.imshow(wc_image, interpolation='bilinear')
@@ -396,7 +453,7 @@ def _(
    save_wordcloud_btn = None
    save_wordcloud_btn = mo.ui.button(
-        label="Save_wordcloud_button",
+        label="Save WordCloud to File",
        kind="warn",
        on_click=lambda val: True
    )
@@ -404,17 +461,19 @@ def _(
    return save_wordcloud_btn, wc_image
-@app.cell
+@app.cell(hide_code=True)
-def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
+def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
    # Wait for start processing button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")
-    filename = f'wordcloud_{tag_select.value.replace(" ", "-")}.png'
+    filename = f'wordcloud_{tag_fname}.png'
-    fpath = WORKING_DIR / filename
+
    fpath = SAVE_DIR / filename
    # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
-    existing_files = list(WORKING_DIR.glob(f'wordcloud_{tag_select.value.replace(" ", "-")}*.png'))
+    existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
    if existing_files:
        existing_numbers = []
        for ef in existing_files:
@@ -425,7 +484,7 @@ def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
            next_number = max(existing_numbers) + 1
        else:
            next_number = 1
-        fpath = WORKING_DIR / f'wordcloud_{tag_select.value.replace(" ", "-")}_{next_number}.png'
+        fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'
    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
--- a/utils/init.py
+++ b/utils/init.py
@@ -2,4 +2,4 @@ from .ollama_utils import connect_qumo_ollama
 from .data_utils import create_sentiment_matrix, extract_theme
 from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
 from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
-from .keyword_analysis import ollama_keyword_extraction, worker_extraction
+from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func
--- a/utils/keyword_analysis.py
+++ b/utils/keyword_analysis.py
@@ -2,6 +2,23 @@ import pandas as pd
 from ollama import Client
 import json
 import matplotlib.pyplot as plt
 import random
 import matplotlib.colors as mcolors
 def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs) -> str:
    """Return a random hex colour from the darker part of the 'Blues' colormap.

    The notebook passes this function as ``color_func`` when building its
    WordCloud.

    Args:
        word, font_size, position, orientation, **kwargs: accepted but not
            used by this function.
        random_state: optional object providing ``uniform(a, b)``. Use it for
            reproducible colours; when ``None`` the ``random`` module is used.

    Returns:
        The sampled colour as a hex string (result of ``mcolors.to_hex``).
    """
    # Explicit None check: only a missing random_state falls back to the module.
    rng = random_state if random_state is not None else random

    # 0.0 is white/light, 1.0 is dark blue; sampling 0.4-1.0 keeps to the darker end.
    min_val, max_val = 0.4, 1.0
    rgba = plt.cm.Blues(rng.uniform(min_val, max_val))
    return mcolors.to_hex(rgba)
 def worker_extraction(row, host, model):