Clean up notebook and make it usable

2025-12-16 20:15:44 -08:00
parent 4ba8af03d2
commit e81961b819
3 changed files with 177 additions and 101 deletions
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -25,7 +25,7 @@ def _():
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
-    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
+    WORKING_DIR = Path('./data/processing/02-b_WordClouds')

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
@@ -73,7 +73,7 @@ def _(mo):
 def _(TAGUETTE_EXPORT_DIR, pd):
    all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
    all_tags_df['_seq_id'] = range(len(all_tags_df))
-    all_tags_df
+    # all_tags_df
    return (all_tags_df,)


@@ -81,7 +81,7 @@ def _(TAGUETTE_EXPORT_DIR, pd):
 def _(all_tags_df):
    # get count of rows per tag
    tag_counts = all_tags_df['tag'].value_counts().reset_index()
-    tag_counts
+    # tag_counts
    return


@@ -89,7 +89,7 @@ def _(all_tags_df):
 def _(TAGUETTE_EXPORT_DIR, pd):
    codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
    codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
-    codebook_df
+    # codebook_df
    return


@@ -101,25 +101,36 @@ def _(mo):
    return


-@app.cell
+@app.cell(hide_code=True)
 def _(all_tags_df, mo):
+
+    start_processing_btn = None
+    start_processing_btn = mo.ui.button(
+        label="Start Keyword Extraction",
+        kind="warn",
+        on_click=lambda val: True
+    )
+
    tag_select = mo.ui.dropdown(
        options=all_tags_df['tag'].unique().tolist(),
        label="Select Tag to Process",
        value="Chase as a brand",
-        full_width=True
+        full_width=True,
    )
    tag_select
-    return (tag_select,)
+    return start_processing_btn, tag_select


@app.cell
 def _(all_tags_df, mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
+
+    tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
+
    # filter all_tags_df to only the rows whose tag matches tag_select.value
    df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
    df
-    return (df,)
+    return df, tag_fname


@app.cell(hide_code=True)
@@ -130,37 +141,21 @@ def _(mo):
    return


-@app.cell
-def _(mo, tag_select):
+@app.cell(hide_code=True)
+def _(mo, start_processing_btn, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))

    # mdf = mpd.from_pandas(df)
-
-    start_processing_btn = mo.ui.button(
-        label="Start Keyword Extraction",
-        kind="warn",
-        on_click=lambda val: True
-    )
    start_processing_btn
-    return (start_processing_btn,)
+    return


-@app.cell
-def _(
-    WORKING_DIR,
-    client,
-    df,
-    mo,
-    model_select,
-    pd,
-    start_processing_btn,
-    tag_select,
-):
+@app.cell(hide_code=True)
+def _(client, df, mo, model_select, pd, start_processing_btn):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

-
    # Run keyword extraction
    df['keywords'] = df.progress_apply(
        lambda row: pd.Series(ollama_keyword_extraction(
@@ -172,13 +167,55 @@ def _(
        axis=1
    )

-    df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws))

-    df[['id', 'tag', 'content', 'keywords_txt']].to_csv(
-        WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv',
+    return
+
+
+@app.cell(hide_code=True)
+def _(WORKING_DIR, df, mo, pd, tag_fname):
+    # Save results to csv
+    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
+
+    SAVE_DIR = WORKING_DIR / tag_fname
+
+    if not SAVE_DIR.exists():
+        SAVE_DIR.mkdir(parents=True)
+
+
+    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
+
+    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
+        SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
        index=False
    )
-    return
+
+
+    all_keywords_list = df['keywords'].tolist()
+    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
+
+    # Calculate frequencies per keyword
+    keyword_freq = {}
+    for kw in all_keywords_flat:
+        if kw in keyword_freq:
+            keyword_freq[kw] += 1
+        else:
+            keyword_freq[kw] = 1
+
+    freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
+    freq_df.index.name = 'keyword'
+    freq_df.reset_index(inplace=True)
+    freq_df.sort_values(by='frequency', ascending=False, inplace=True)
+
+    _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
+    freq_df.to_excel(
+        _freq_fpath,
+        index=False
+    )
+    mo.vstack([
+        mo.md(f"Keywords per-highlight saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
+        mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
+    ])
+    return SAVE_DIR, keyword_freq


@app.cell(hide_code=True)
@@ -189,7 +226,7 @@ def _(mo):
    return


-@app.cell
+@app.cell(hide_code=True)
 def _():
    # Start with loading all necessary libraries
    import numpy as np
@@ -197,26 +234,34 @@ def _():
    from PIL import Image, ImageDraw
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt
+    from utils import blue_color_func

    import warnings
    warnings.filterwarnings("ignore")
-    return Image, ImageDraw, WordCloud, np, plt
+    return Image, ImageDraw, WordCloud, blue_color_func, np, plt


-@app.cell
-def _(df):
-    MIN_FREQ = 2
-
-    all_keywords_list = df['keywords'].tolist()
-    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## 5.1) Select threshold frequency
+    """)
+    return


-    keyword_freq = {}
-    for kw in all_keywords_flat:
-        if kw in keyword_freq:
-            keyword_freq[kw] += 1
-        else:
-            keyword_freq[kw] = 1
+@app.cell(hide_code=True)
+def _(mo):
+    min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
+    min_freq_select
+    return (min_freq_select,)
+
+
+@app.cell(hide_code=True)
+def _(df, keyword_freq, min_freq_select, mo):
+    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
+
+    MIN_FREQ = min_freq_select.value
+

    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}

@@ -227,65 +272,73 @@ def _(df):
    return (keyword_freq_filtered,)


-@app.cell
-def _():
-    IGNORE_WORDS = {
-        'chase as a brand': [
-            "brand"
-        ]
-    }
+@app.cell(hide_code=True)
+def _(mo, tag_select):
+    mo.md(rf"""
+    ## 5.2) Inspect Keyword Dataset
+
+    1. Check the threshold is set correctly. If not, adjust accordingly
+    2. Check the keywords are good. If not, run extraction again (step 4)
+    3. Add explicit exclusions if necessary


-    return (IGNORE_WORDS,)
+    Add words to this dict that should be ignored in the WordCloud for specific tags. 
+    Make sure to create the correct key that matches the active selected tag: 

-
-@app.cell
-def _(plt):
-    import random
-    import matplotlib.colors as mcolors
-
-    def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
-        # Use the provided random_state for reproducibility if available, else use random module
-        r = random_state if random_state else random
-
-        # Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0)
-        # 0.0 is white/light, 1.0 is dark blue
-        min_val, max_val = 0.4, 1.0
-        color_val = r.uniform(min_val, max_val)
-
-        # Get color from matplotlib colormap
-        rgba = plt.cm.Blues(color_val)
-        return mcolors.to_hex(rgba)
-    return (blue_color_func,)
-
-
-@app.cell
-def _():
-    # chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png"))
-
-    # def transform_format(val):
-    #     if val == 0:
-    #         return 255
-    #     else:
-    #         return 1
-
-    # transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32)
-    # for i in range(len(chase_mask)):
-    #     transformed_chase_mask[i] = list(map(transform_format, chase_mask[i]))
+    Active selected tag = '`{tag_select.value.lower()}`'
+    """)
    return


@app.cell
+def _():
+    IGNORE_WORDS = {
+        'chase as a brand': [
+            "brand",
+            "banking experience",
+            "banking",
+            "chase",
+            "jpmorgan",
+            "youthful"
+        ],
+        'why customer chase': [
+            "customer service",
+            "customer loyalty",
+            "chase",
+            "chase customer",
+            "banking experience",
+        ],
+        'chase as a person (personification)': [
+            "CPC1"
+        ]
+        # <active-selected-tag>: [list, of, words, to, ignore]
+    }
+    return (IGNORE_WORDS,)
+
+
+@app.cell(hide_code=True)
 def _(mo):
    buffer = -100 # Adjust this to increase/decrease space between logo and words
    canvas_size = (1200, 800)

    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
-    logo_switch

    return buffer, canvas_size, logo_switch


+@app.cell(hide_code=True)
+def _(logo_switch, mo):
+    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")
+
+    mo.vstack([
+        mo.md("## 5.4) Generate WordCloud with/without Logo"),
+        mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
+        mo.md('---'),
+        mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
+    )
+    return (run_wordcloud_btn,)
+
+
@app.cell(hide_code=True)
 def _(
    IGNORE_WORDS,
@@ -300,8 +353,12 @@ def _(
    mo,
    np,
    plt,
+    run_wordcloud_btn,
    tag_select,
 ):
+    if run_wordcloud_btn.value:
+        pass
+
    # remove specific keywords depending on selected tag
    if IGNORE_WORDS.get(tag_select.value.lower()):
        for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -364,7 +421,7 @@ def _(
            background_color='white',
            width=canvas_size[0],
            height=canvas_size[1],
-            max_font_size=100,  # Increased font size for larger canvas
+            max_font_size=150,  # Increased font size for larger canvas
            max_words=20,      # Increased word count to fill space
            color_func=blue_color_func,
            # mask=chase_mask,    # Apply the circular mask
@@ -396,7 +453,7 @@ def _(

    save_wordcloud_btn = None
    save_wordcloud_btn = mo.ui.button(
-        label="Save_wordcloud_button",
+        label="Save WordCloud to File",
        kind="warn",
        on_click=lambda val: True
    )
@@ -404,17 +461,19 @@ def _(
    return save_wordcloud_btn, wc_image


-@app.cell
-def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
+@app.cell(hide_code=True)
+def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
    # Wait for the save wordcloud button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")


-    filename = f'wordcloud_{tag_select.value.replace(" ", "-")}.png'
-    fpath = WORKING_DIR / filename
+    filename = f'wordcloud_{tag_fname}.png'
+
+
+    fpath = SAVE_DIR / filename

    # add an increasing number to the filename so we can save multiple images; find the highest existing number in the directory first
-    existing_files = list(WORKING_DIR.glob(f'wordcloud_{tag_select.value.replace(" ", "-")}*.png'))
+    existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
    if existing_files:
        existing_numbers = []
        for ef in existing_files:
@@ -425,7 +484,7 @@ def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
            next_number = max(existing_numbers) + 1
        else:
            next_number = 1
-        fpath = WORKING_DIR / f'wordcloud_{tag_select.value.replace(" ", "-")}_{next_number}.png'
+        fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'

    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
--- a/utils/init.py
+++ b/utils/init.py
@@ -2,4 +2,4 @@ from .ollama_utils import connect_qumo_ollama
 from .data_utils import create_sentiment_matrix, extract_theme
 from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
 from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
-from .keyword_analysis import ollama_keyword_extraction, worker_extraction
+from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func
--- a/utils/keyword_analysis.py
+++ b/utils/keyword_analysis.py
@@ -2,6 +2,23 @@ import pandas as pd

 from ollama import Client
 import json
+import matplotlib.pyplot as plt
+
+import random
+import matplotlib.colors as mcolors
+
+def blue_color_func( word, font_size, position, orientation, random_state=None, **kwargs):
+    # Use the provided random_state for reproducibility if available, else use random module
+    r = random_state if random_state else random
+
+    # Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0)
+    # 0.0 is white/light, 1.0 is dark blue
+    min_val, max_val = 0.4, 1.0
+    color_val = r.uniform(min_val, max_val)
+
+    # Get color from matplotlib colormap
+    rgba = plt.cm.Blues(color_val)
+    return mcolors.to_hex(rgba)


 def worker_extraction(row, host, model):