diff --git a/02-B_WordClouds.py b/02-B_WordClouds.py index cfc76e9..283eda4 100644 --- a/02-B_WordClouds.py +++ b/02-B_WordClouds.py @@ -22,18 +22,22 @@ def _(): tqdm.pandas() - TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') WORKING_DIR = Path('./data/processing/02-b_WordClouds') + VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt' if not WORKING_DIR.exists(): WORKING_DIR.mkdir(parents=True) if not TAGUETTE_EXPORT_DIR.exists(): TAGUETTE_EXPORT_DIR.mkdir(parents=True) + if not VOICE_EXCLUDE_KEYWORDS_FILE.exists(): + VOICE_EXCLUDE_KEYWORDS_FILE.touch() + return ( OLLAMA_LOCATION, TAGUETTE_EXPORT_DIR, + VOICE_EXCLUDE_KEYWORDS_FILE, WORKING_DIR, connect_qumo_ollama, mo, @@ -115,7 +119,7 @@ def _(all_tags_df, mo): return (tag_select,) -@app.cell +@app.cell(hide_code=True) def _(WORKING_DIR, all_tags_df, mo, tag_select): mo.stop(not tag_select.value, mo.md("Select tag to continue")) @@ -152,7 +156,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select): @app.cell(hide_code=True) def _(KEYWORD_FREQ_FPATH, mo): mo.md(rf""" - # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else ''} + # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'} """) return @@ -267,14 +271,21 @@ def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn): load_existing_btn = None if KEYWORD_FREQ_FPATH.exists(): - load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`", kind='warn') + load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn') load_existing_btn return (load_existing_btn,) @app.cell(hide_code=True) -def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): +def _( + KEYWORD_FREQ_FPATH, + VOICE_EXCLUDE_KEYWORDS_FILE, + freq_df, + load_existing_btn, + pd, + tag_select, +): if load_existing_btn is not None and load_existing_btn.value: _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') @@ -284,6 +295,19 @@ def 
_(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): _fdf.reset_index(drop=True, inplace=True) print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.") + if tag_select.value.startswith('V'): + # Read exclusion list + excl_kw = [] + with VOICE_EXCLUDE_KEYWORDS_FILE.open('r', encoding='utf-8') as _f: + for _line in _f: + excl_kw.append(_line.strip()) + + _drop_idx = _fdf[_fdf['keyword'].isin(excl_kw)].index + + _fdf.drop(index=_drop_idx, inplace=True, axis=0) + print(f"Dropped {len(_drop_idx)} keywords automatically") + + frequency_df = _fdf else: @@ -374,7 +398,15 @@ def _(mo, table_selection): @app.cell(hide_code=True) -def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection): +def _( + KEYWORD_FREQ_FPATH, + VOICE_EXCLUDE_KEYWORDS_FILE, + frequency_df, + mo, + remove_rows_btn, + table_selection, + tag_select, +): _s = None if remove_rows_btn is not None and remove_rows_btn.value: # get selected rows @@ -382,7 +414,20 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection): if len(selected_rows) >0 : rows_to_drop = table_selection.value.index.tolist() try: + if tag_select.value.startswith('V'): + # append values to the VoiceKeywordsExclusion file (txt file, one keyword per line); append mode keeps earlier exclusions instead of truncating them + exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list() + + with VOICE_EXCLUDE_KEYWORDS_FILE.open('a', encoding='utf-8') as _f: + for _kw in exclude_keywords: + _f.write(_kw + '\n') + + + frequency_df.drop(index=rows_to_drop, inplace=True, axis=0) + + + except KeyError: _s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn') else: @@ -395,7 +440,7 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection): print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`") # mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success") - _s = mo.callout("GO BACK TO 
STEP 4b) and reload data before continuing.", kind='warn') _s return @@ -437,7 +482,7 @@ def _(mo): logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) - n_words = mo.ui.slider(start=10, stop=200, step=1, value=40, debounce=True, show_value=True, label="Max number of words in WordCloud") + n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud") return buffer, canvas_size, logo_switch, n_words