voice keyword blacklist
This commit is contained in:
@@ -22,18 +22,22 @@ def _():
|
|||||||
tqdm.pandas()
|
tqdm.pandas()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
||||||
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
|
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'
|
||||||
|
|
||||||
if not WORKING_DIR.exists():
|
if not WORKING_DIR.exists():
|
||||||
WORKING_DIR.mkdir(parents=True)
|
WORKING_DIR.mkdir(parents=True)
|
||||||
if not TAGUETTE_EXPORT_DIR.exists():
|
if not TAGUETTE_EXPORT_DIR.exists():
|
||||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
||||||
|
|
||||||
|
if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE.touch()
|
||||||
|
|
||||||
return (
|
return (
|
||||||
OLLAMA_LOCATION,
|
OLLAMA_LOCATION,
|
||||||
TAGUETTE_EXPORT_DIR,
|
TAGUETTE_EXPORT_DIR,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
WORKING_DIR,
|
WORKING_DIR,
|
||||||
connect_qumo_ollama,
|
connect_qumo_ollama,
|
||||||
mo,
|
mo,
|
||||||
@@ -115,7 +119,7 @@ def _(all_tags_df, mo):
|
|||||||
return (tag_select,)
|
return (tag_select,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell(hide_code=True)
|
||||||
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
||||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||||
|
|
||||||
@@ -152,7 +156,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
|||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORD_FREQ_FPATH, mo):
|
def _(KEYWORD_FREQ_FPATH, mo):
|
||||||
mo.md(rf"""
|
mo.md(rf"""
|
||||||
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else ''}
|
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -267,14 +271,21 @@ def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
|
|||||||
|
|
||||||
load_existing_btn = None
|
load_existing_btn = None
|
||||||
if KEYWORD_FREQ_FPATH.exists():
|
if KEYWORD_FREQ_FPATH.exists():
|
||||||
load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
|
load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
|
||||||
|
|
||||||
load_existing_btn
|
load_existing_btn
|
||||||
return (load_existing_btn,)
|
return (load_existing_btn,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
|
def _(
|
||||||
|
KEYWORD_FREQ_FPATH,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
|
freq_df,
|
||||||
|
load_existing_btn,
|
||||||
|
pd,
|
||||||
|
tag_select,
|
||||||
|
):
|
||||||
if load_existing_btn is not None and load_existing_btn.value:
|
if load_existing_btn is not None and load_existing_btn.value:
|
||||||
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
||||||
|
|
||||||
@@ -284,6 +295,19 @@ def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
|
|||||||
_fdf.reset_index(drop=True, inplace=True)
|
_fdf.reset_index(drop=True, inplace=True)
|
||||||
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
||||||
|
|
||||||
|
if tag_select.value.startswith('V'):
|
||||||
|
# Read exclusion list
|
||||||
|
excl_kw = []
|
||||||
|
with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
|
||||||
|
for line in _f:
|
||||||
|
excl_kw.append(line.strip())
|
||||||
|
|
||||||
|
_drop_idx = _fdf[_fdf['keyword'].isin(excl_kw)].index
|
||||||
|
|
||||||
|
_fdf.drop(index=_drop_idx, inplace=True, axis=0)
|
||||||
|
print(f"Dropped {len(_drop_idx)} keywords automatically")
|
||||||
|
|
||||||
|
|
||||||
frequency_df = _fdf
|
frequency_df = _fdf
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -374,7 +398,15 @@ def _(mo, table_selection):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
|
def _(
|
||||||
|
KEYWORD_FREQ_FPATH,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
|
frequency_df,
|
||||||
|
mo,
|
||||||
|
remove_rows_btn,
|
||||||
|
table_selection,
|
||||||
|
tag_select,
|
||||||
|
):
|
||||||
_s = None
|
_s = None
|
||||||
if remove_rows_btn is not None and remove_rows_btn.value:
|
if remove_rows_btn is not None and remove_rows_btn.value:
|
||||||
# get selected rows
|
# get selected rows
|
||||||
@@ -382,7 +414,20 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
|
|||||||
if len(selected_rows) >0 :
|
if len(selected_rows) >0 :
|
||||||
rows_to_drop = table_selection.value.index.tolist()
|
rows_to_drop = table_selection.value.index.tolist()
|
||||||
try:
|
try:
|
||||||
|
if tag_select.value.startswith('V'):
|
||||||
|
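# append values to the voice keyword exclusion file (a plain-text file with one keyword per line) -- NOTE(review): the open('w') below truncates the file rather than appending; confirm whether mode 'a' was intended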
|
||||||
|
exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
|
||||||
|
|
||||||
|
with VOICE_EXCLUDE_KEYWORDS_FILE.open('w') as f:
|
||||||
|
for _kw in exclude_keywords:
|
||||||
|
f.write(_kw + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
||||||
else:
|
else:
|
||||||
@@ -395,7 +440,7 @@ def _(KEYWORD_FREQ_FPATH, frequency_df, mo, remove_rows_btn, table_selection):
|
|||||||
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||||
|
|
||||||
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
|
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
|
||||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
_s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')
|
||||||
|
|
||||||
_s
|
_s
|
||||||
return
|
return
|
||||||
@@ -437,7 +482,7 @@ def _(mo):
|
|||||||
|
|
||||||
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
||||||
|
|
||||||
n_words = mo.ui.slider(start=10, stop=200, step=1, value=40, debounce=True, show_value=True, label="Max number of words in WordCloud")
|
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
|
||||||
return buffer, canvas_size, logo_switch, n_words
|
return buffer, canvas_size, logo_switch, n_words
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user