Inline removal of keywords

format for consecutive runs
2025-12-16 23:42:25 -08:00 · 2025-12-16 23:21:03 -08:00
3 changed files with 100 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,5 @@ __pycache__/

 data/
 docker-volumes/
-logs/
+logs/
+
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -22,7 +22,6 @@ def _():
    tqdm.pandas()


-    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
@@ -32,14 +31,14 @@ def _():
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)

-    model_select = mo.ui.dropdown(
-        options=_models,
-        value=_models[0],
-        label="Select Ollama Model to use",
-        searchable=True,
+    return (
+        OLLAMA_LOCATION,
+        TAGUETTE_EXPORT_DIR,
+        WORKING_DIR,
+        connect_qumo_ollama,
+        mo,
+        pd,
    )
-    model_select
-    return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd


@app.cell(hide_code=True)
@@ -159,8 +158,27 @@ def _(mo):


@app.cell
-def _(mo, start_processing_btn, tag_select):
-    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
+def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
+    try:
+        client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
+        model_select = mo.ui.dropdown(
+            options=_models,
+            value=_models[0],
+            label="Select Ollama Model to use",
+            searchable=True,
+        )
+    except Exception as e:
+        mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
+        model_select = None
+        client = None
+
+    model_select
+    return client, model_select
+
+
+@app.cell
+def _(mo, model_select, start_processing_btn, tag_select):
+    mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))

    start_processing_btn
    return
@@ -172,19 +190,21 @@ def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

+    if client is not None:
+        df = tags_df
+        # Run keyword extraction

-    df = tags_df
-    # Run keyword extraction
-
-    df['keywords'] = df.progress_apply(
-        lambda row: pd.Series(ollama_keyword_extraction(
-            content=row['content'], 
-            tag=row['tag'], 
-            client=client, 
-            model=model_select.value
-        )),
-        axis=1
-    )
+        df['keywords'] = df.progress_apply(
+            lambda row: pd.Series(ollama_keyword_extraction(
+                content=row['content'], 
+                tag=row['tag'], 
+                client=client, 
+                model=model_select.value
+            )),
+            axis=1
+        )
+    else:
+        mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
    return (df,)


@@ -251,7 +271,7 @@ def _(KEYWORD_FREQ_FPATH, mo):

@app.cell(hide_code=True)
 def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
-    if load_existing_btn.value:
+    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')

        # Drop nan rows if any
@@ -305,30 +325,6 @@ def _(mo):
    return (min_freq_select,)


-@app.cell(hide_code=True)
-def _(freq_df, frequency_df, min_freq_select, mo):
-    mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
-
-    MIN_FREQ = min_freq_select.value
-
-    freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
-
-    freq_df_filtered.reset_index(drop=True, inplace=True)
-
-    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
-
-    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
-    table_selection
-
-    # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
-
-    # # create list of keywords sorted by their frequencies. only store the keyword
-    # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
-    # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
-    # sorted_keywords_list
-    return (keyword_freq_filtered,)
-
-
@app.cell(hide_code=True)
 def _(mo, tag_select):
    mo.md(rf"""
@@ -350,6 +346,52 @@ def _(mo, tag_select):


@app.cell
+def _(frequency_df, min_freq_select, mo):
+    mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
+
+    MIN_FREQ = min_freq_select.value
+
+    _freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
+
+    table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
+    table_selection
+
+    return MIN_FREQ, table_selection
+
+
+@app.cell(hide_code=True)
+def _(mo, table_selection):
+    remove_rows_btn = None
+    if len(table_selection.value) >0 :
+        remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
+
+    remove_rows_btn
+    return (remove_rows_btn,)
+
+
+@app.cell(hide_code=True)
+def _(KEYWORD_FREQ_FPATH, frequency_df, remove_rows_btn, table_selection):
+    if remove_rows_btn is not None and remove_rows_btn.value:
+        # get selected rows
+        selected_rows = table_selection.value
+        if len(selected_rows) >0 :
+            rows_to_drop = table_selection.value.index.tolist()
+
+            frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
+        
+            # Save updated frequencies back to xlsx
+            frequency_df.to_excel(
+                KEYWORD_FREQ_FPATH,
+                index=False
+            )
+        
+            print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
+
+            print("GO TO STEP 4b) and reload data to continue refining the dataset.")
+    return
+
+
+@app.cell(hide_code=True)
 def _():
    IGNORE_WORDS = {
        'chase as a brand': [
@@ -409,11 +451,12 @@ def _(
    IGNORE_WORDS,
    Image,
    ImageDraw,
+    MIN_FREQ,
    WordCloud,
    blue_color_func,
    buffer,
    canvas_size,
-    keyword_freq_filtered,
+    frequency_df,
    logo_switch,
    mo,
    np,
@@ -424,6 +467,12 @@ def _(
    if run_wordcloud_btn.value:
        pass

+    freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
+
+    # freq_df_filtered.reset_index(drop=True, inplace=True)
+
+    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
+
    # remove specific keywords depending on selected tag
    if IGNORE_WORDS.get(tag_select.value.lower()):
        for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -433,7 +482,7 @@ def _(
    if logo_switch.value:
        # 1. Load the logo
        # Make sure this path points to your uploaded file
-        logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" 
+        logo_path = "./assets/JP-Morgan-Chase-Symbol.png" 
        logo = Image.open(logo_path).convert("RGBA")

        # Optional: Resize logo if it's too large or small for the canvas
--- a/assets/JP-Morgan-Chase-Symbol.png
+++ b/assets/JP-Morgan-Chase-Symbol.png
Author	SHA1	Message	Date
Luigi Maiorano	8fbc11da7a	Inline removal of keywords	2025-12-16 23:42:25 -08:00
Luigi Maiorano	50f9538dcf	format for consecutive runs	2025-12-16 23:21:03 -08:00