voice keyword blacklist

2025-12-17 01:19:22 -08:00
parent eee6947f01
commit 417273c745
1 changed files with 53 additions and 8 deletions
--- a/02-B_WordClouds.py
+++ b/02-B_WordClouds.py
@@ -22,18 +22,22 @@ def _():
    tqdm.pandas()


-
    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
+    VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)

+    if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
+        VOICE_EXCLUDE_KEYWORDS_FILE.touch()
+
    return (
        OLLAMA_LOCATION,
        TAGUETTE_EXPORT_DIR,
+        VOICE_EXCLUDE_KEYWORDS_FILE,
        WORKING_DIR,
        connect_qumo_ollama,
        mo,
@@ -115,7 +119,7 @@ def _(all_tags_df, mo):
    return (tag_select,)


-@app.cell
+@app.cell(hide_code=True)
 def _(WORKING_DIR, all_tags_df, mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))

@@ -152,7 +156,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
@app.cell(hide_code=True)
 def _(KEYWORD_FREQ_FPATH, mo):
    mo.md(rf"""
-    # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else ''}
+    # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
    """)
    return

@@ -267,14 +271,21 @@ def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
    
    load_existing_btn = None
    if KEYWORD_FREQ_FPATH.exists():
-        load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
+        load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')

    load_existing_btn
    return (load_existing_btn,)


@app.cell(hide_code=True)
-def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
+def _(
+    KEYWORD_FREQ_FPATH,
+    VOICE_EXCLUDE_KEYWORDS_FILE,
+    freq_df,
+    load_existing_btn,
+    pd,
+    tag_select,
+):
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')

@@ -284,6 +295,19 @@ def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")

+        if tag_select.value.startswith('V'):
+            # Read the voice blacklist: one keyword per line, blank lines are ignored.
+            # A comprehension keeps the loop variable out of the cell's global definitions.
+            with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
+                excl_kw = [_ln.strip() for _ln in _f if _ln.strip()]
+
+            # Index labels of the rows whose keyword is on the blacklist
+            _drop_idx = _fdf[_fdf['keyword'].isin(excl_kw)].index
+
+            _fdf.drop(index=_drop_idx, inplace=True, axis=0)
+            print(f"Dropped {len(_drop_idx)} keywords automatically")
+        
+
        frequency_df = _fdf

    else:
@@ -374,7 +398,15 @@ def _(mo, table_selection):


@app.cell(hide_code=True)
-def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
+def _(
+    KEYWORD_FREQ_FPATH,
+    VOICE_EXCLUDE_KEYWORDS_FILE,
+    frequency_df,
+    mo,
+    remove_rows_btn,
+    table_selection,
+    tag_select,
+):
    _s = None
    if remove_rows_btn is not None and remove_rows_btn.value:
        # get selected rows
@@ -382,7 +414,20 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
        if len(selected_rows) >0 :
            rows_to_drop = table_selection.value.index.tolist()
            try:
+                if tag_select.value.startswith('V'):
+                    # Append to the voice exclusion file (plain txt, one keyword per line); mode 'a' keeps earlier rounds
+                    exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
+
+                    with VOICE_EXCLUDE_KEYWORDS_FILE.open('a') as f:
+                        for _kw in exclude_keywords:
+                            f.write(_kw + '\n')
+                
+
+                
                frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
+
+                
+            
            except KeyError:
                _s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
            else:
@@ -395,7 +440,7 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
                print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")

            # mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
-            _s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
+            _s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')

    _s
    return
@@ -437,7 +482,7 @@ def _(mo):

    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)

-    n_words = mo.ui.slider(start=10, stop=200, step=1, value=40, debounce=True, show_value=True, label="Max number of words in WordCloud")
+    n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
    return buffer, canvas_size, logo_switch, n_words