final tweaks

voice keyword blacklist
rename
2025-12-17 01:37:42 -08:00 · 2025-12-17 01:19:22 -08:00 · 2025-12-17 00:25:03 -08:00 · 2025-12-16 23:56:13 -08:00 · 2025-12-16 23:42:25 -08:00 · 2025-12-16 23:21:03 -08:00
3 changed files with 178 additions and 71 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,5 @@ __pycache__/
 data/
 docker-volumes/
-logs/
+logs/
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -22,24 +22,27 @@ def _():
    tqdm.pandas()
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
    VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'
    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)
-    model_select = mo.ui.dropdown(
+    if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
-        options=_models,
+        VOICE_EXCLUDE_KEYWORDS_FILE.touch()
-        value=_models[0],
+
-        label="Select Ollama Model to use",
+    return (
-        searchable=True,
+        OLLAMA_LOCATION,
        TAGUETTE_EXPORT_DIR,
        VOICE_EXCLUDE_KEYWORDS_FILE,
        WORKING_DIR,
        connect_qumo_ollama,
        mo,
        pd,
    )
    model_select
    return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
@app.cell(hide_code=True)
@@ -116,7 +119,7 @@ def _(all_tags_df, mo):
    return (tag_select,)
-@app.cell
+@app.cell(hide_code=True)
 def _(WORKING_DIR, all_tags_df, mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
@@ -139,7 +142,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
    # filter all_tags_df to only the document = file_dropdown.value
    tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
-    tags_df
+    tags_df.head()
    return (
        KEYWORDS_FPATH,
        KEYWORD_FREQ_FPATH,
@@ -151,44 +154,65 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
@app.cell(hide_code=True)
-def _(mo):
+def _(KEYWORD_FREQ_FPATH, mo):
-    mo.md(r"""
+    mo.md(rf"""
-    # 4) Keyword extraction
+    # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
    """)
    return
@app.cell(hide_code=True)
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
    # Connect to the Ollama server at OLLAMA_LOCATION and build the dropdown
    # used to pick the model for keyword extraction.
    #
    # Exports:
    #   client       -- the connected client, or None when the connection failed
    #   model_select -- the model dropdown, or None when the connection failed
    #
    # The cell displays the dropdown on success and the error message on
    # failure. marimo only renders a cell's last expression, so the message
    # has to be routed through `_output` instead of being a bare `mo.md(...)`
    # statement inside the `except` branch (which would be silently dropped).
    try:
        client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
        model_select = mo.ui.dropdown(
            options=_models,
            value=_models[0],  # an empty model list raises IndexError -> handled below
            label="Select Ollama Model to use",
            searchable=True,
        )
        _output = model_select
    except Exception as e:
        # Deliberately broad and best-effort: the notebook stays usable
        # without Ollama, and dependent cells check for None.
        client = None
        model_select = None
        _output = mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")

    _output
    return client, model_select
@app.cell
-def _(mo, start_processing_btn, tag_select):
+def _(mo, model_select, start_processing_btn, tag_select):
-    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
+    mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
    start_processing_btn
    return
-@app.cell
+@app.cell(hide_code=True)
 def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")
    if client is not None:
        df = tags_df
        # Run keyword extraction
-    df = tags_df
+        df['keywords'] = df.progress_apply(
-    # Run keyword extraction
+            lambda row: pd.Series(ollama_keyword_extraction(
-
+                content=row['content'], 
-    df['keywords'] = df.progress_apply(
+                tag=row['tag'], 
-        lambda row: pd.Series(ollama_keyword_extraction(
+                client=client, 
-            content=row['content'], 
+                model=model_select.value
-            tag=row['tag'], 
+            )),
-            client=client, 
+            axis=1
-            model=model_select.value
+        )
-        )),
+    else:
-        axis=1
+        mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
    )
    return (df,)
-@app.cell
+@app.cell(hide_code=True)
 def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
    mo.stop(not start_processing_btn.value, "Click button above to process first")
@@ -232,26 +256,37 @@ def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
@app.cell(hide_code=True)
-def _(mo):
+def _(KEYWORD_FREQ_FPATH, mo):
-    mo.md(r"""
+    mo.md(rf"""
-    # 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
+    # 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
    """)
    return
@app.cell(hide_code=True)
-def _(KEYWORD_FREQ_FPATH, mo):
+def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
    if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
        pass
    load_existing_btn = None
    if KEYWORD_FREQ_FPATH.exists():
-        load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
+        load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
    load_existing_btn
    return (load_existing_btn,)
@app.cell(hide_code=True)
-def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
+def _(
-    if load_existing_btn.value:
+    KEYWORD_FREQ_FPATH,
    VOICE_EXCLUDE_KEYWORDS_FILE,
    freq_df,
    load_existing_btn,
    pd,
    tag_select,
 ):
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
        # Drop nan rows if any
@@ -259,11 +294,23 @@ def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
-
+        
        frequency_df = _fdf
    else:
        frequency_df = freq_df
    if tag_select.value.startswith('V'):
        # Read exclusion list
        excl_kw = []
        with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
            for line in _f:
                excl_kw.append(line.strip())
        _drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
        frequency_df.drop(index=_drop_idx, inplace=True, axis=0)
        print(f"Dropped {len(_drop_idx)} keywords automatically")
    return (frequency_df,)
@@ -305,30 +352,6 @@ def _(mo):
    return (min_freq_select,)
@app.cell(hide_code=True)
 def _(freq_df, frequency_df, min_freq_select, mo):
    mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
    MIN_FREQ = min_freq_select.value
    freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
    freq_df_filtered.reset_index(drop=True, inplace=True)
    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
    table_selection
    # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
    # # create list of keywords sorted by their frequencies. only store the keyword
    # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
    # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
    # sorted_keywords_list
    return (keyword_freq_filtered,)
@app.cell(hide_code=True)
 def _(mo, tag_select):
    mo.md(rf"""
@@ -349,7 +372,80 @@ def _(mo, tag_select):
    return
-@app.cell
+@app.cell(hide_code=True)
 def _(frequency_df, min_freq_select, mo):
    mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
    MIN_FREQ = min_freq_select.value
    _freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
    table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
    table_selection
    return MIN_FREQ, table_selection
@app.cell(hide_code=True)
def _(mo, table_selection):
    # Build the "remove selected keywords" button only while at least one row
    # of the keyword table is selected; otherwise export None, which is also
    # the cell's last expression in that case.
    _has_selection = len(table_selection.value) > 0
    remove_rows_btn = (
        mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
        if _has_selection
        else None
    )

    remove_rows_btn
    return (remove_rows_btn,)
@app.cell(hide_code=True)
def _(
    KEYWORD_FREQ_FPATH,
    VOICE_EXCLUDE_KEYWORDS_FILE,
    frequency_df,
    mo,
    remove_rows_btn,
    table_selection,
    tag_select,
):
    # Drop the rows selected in the keyword table from `frequency_df` once the
    # remove button has been clicked, then write the reduced table back to
    # KEYWORD_FREQ_FPATH. For tags whose name starts with 'V' the removed
    # keywords are also recorded in VOICE_EXCLUDE_KEYWORDS_FILE (one keyword
    # per line).
    #
    # Displays a warning callout asking the user to reload the data in step
    # 4b); displays nothing when the button is absent or has not been clicked.
    _s = None

    if remove_rows_btn is not None and remove_rows_btn.value:
        selected_rows = table_selection.value

        if len(selected_rows) > 0:
            rows_to_drop = selected_rows.index.tolist()

            try:
                if tag_select.value.startswith('V'):
                    exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
                    # Append to the exclusion list. Opening with 'w' would
                    # truncate the file and throw away every keyword that was
                    # excluded in an earlier removal.
                    with VOICE_EXCLUDE_KEYWORDS_FILE.open('a') as f:
                        f.writelines(_kw + '\n' for _kw in exclude_keywords)

                frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
            except KeyError:
                # The selected index labels are not in `frequency_df`
                # (presumably the selection is stale after a previous removal).
                # Nothing is saved; ask the user to reload instead.
                _s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
            else:
                # Save updated frequencies back to xlsx
                frequency_df.to_excel(
                    KEYWORD_FREQ_FPATH,
                    index=False
                )
                print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
                _s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')

    _s
    return
@app.cell(hide_code=True)
 def _():
    IGNORE_WORDS = {
        'chase as a brand': [
@@ -384,11 +480,13 @@ def _(mo):
    canvas_size = (1200, 800)
    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
-    return buffer, canvas_size, logo_switch
+
    n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
    return buffer, canvas_size, logo_switch, n_words
@app.cell(hide_code=True)
-def _(logo_switch, mo):
+def _(logo_switch, mo, n_words):
    run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
    mo.vstack([
@@ -399,7 +497,7 @@ def _(logo_switch, mo):
        When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
        mo.md('---'),
-        mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
+        mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
    )
    return (run_wordcloud_btn,)
@@ -409,13 +507,15 @@ def _(
    IGNORE_WORDS,
    Image,
    ImageDraw,
    MIN_FREQ,
    WordCloud,
    blue_color_func,
    buffer,
    canvas_size,
-    keyword_freq_filtered,
+    frequency_df,
    logo_switch,
    mo,
    n_words,
    np,
    plt,
    run_wordcloud_btn,
@@ -424,6 +524,12 @@ def _(
    if run_wordcloud_btn.value:
        pass
    freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
    # freq_df_filtered.reset_index(drop=True, inplace=True)
    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
    # remove specific keywords depending on selected tag
    if IGNORE_WORDS.get(tag_select.value.lower()):
        for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -433,7 +539,7 @@ def _(
    if logo_switch.value:
        # 1. Load the logo
        # Make sure this path points to your uploaded file
-        logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" 
+        logo_path = "./assets/JP-Morgan-Chase-Symbol.png" 
        logo = Image.open(logo_path).convert("RGBA")
        # Optional: Resize logo if it's too large or small for the canvas
@@ -473,7 +579,7 @@ def _(
            width=canvas_size[0],
            height=canvas_size[1],
            max_font_size=100,  # Increased font size for larger canvas
-            max_words=20,      # Increased word count to fill space
+            max_words=n_words.value,      # Increased word count to fill space
            color_func=blue_color_func,
            mask=chase_mask,    # Apply the circular mask
            contour_width=0,
@@ -487,7 +593,7 @@ def _(
            width=canvas_size[0],
            height=canvas_size[1],
            max_font_size=150,  # Increased font size for larger canvas
-            max_words=20,      # Increased word count to fill space
+            max_words=n_words.value,      # Increased word count to fill space
            color_func=blue_color_func,
            # mask=chase_mask,    # Apply the circular mask
            # contour_width=0,
--- a/assets/JP-Morgan-Chase-Symbol.png
+++ b/assets/JP-Morgan-Chase-Symbol.png
Author	SHA1	Message	Date
Luigi Maiorano	069e568d00	final tweaks	2025-12-17 01:37:42 -08:00
Luigi Maiorano	417273c745	voice keyword blacklist	2025-12-17 01:19:22 -08:00
Luigi Maiorano	eee6947f01	rename	2025-12-17 00:25:03 -08:00
Luigi Maiorano	d6b449e8c6	add warning message and increase n words	2025-12-16 23:56:13 -08:00
Luigi Maiorano	8fbc11da7a	Inline removal of keywords	2025-12-16 23:42:25 -08:00
Luigi Maiorano	50f9538dcf	format for consecutive runs	2025-12-16 23:21:03 -08:00