Compare commits

...

6 Commits

Author SHA1 Message Date
069e568d00 final tweaks 2025-12-17 01:37:42 -08:00
417273c745 voice keyword blacklist 2025-12-17 01:19:22 -08:00
eee6947f01 rename 2025-12-17 00:25:03 -08:00
d6b449e8c6 add warning message and increase n words 2025-12-16 23:56:13 -08:00
8fbc11da7a Inline removal of keywords 2025-12-16 23:42:25 -08:00
50f9538dcf format for consecutive runs 2025-12-16 23:21:03 -08:00
3 changed files with 178 additions and 71 deletions

3
.gitignore vendored
View File

@@ -13,4 +13,5 @@ __pycache__/
data/ data/
docker-volumes/ docker-volumes/
logs/ logs/

View File

@@ -22,24 +22,27 @@ def _():
tqdm.pandas() tqdm.pandas()
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02-b_WordClouds') WORKING_DIR = Path('./data/processing/02-b_WordClouds')
VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'
if not WORKING_DIR.exists(): if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True) WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists(): if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True) TAGUETTE_EXPORT_DIR.mkdir(parents=True)
model_select = mo.ui.dropdown( if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
options=_models, VOICE_EXCLUDE_KEYWORDS_FILE.touch()
value=_models[0],
label="Select Ollama Model to use", return (
searchable=True, OLLAMA_LOCATION,
TAGUETTE_EXPORT_DIR,
VOICE_EXCLUDE_KEYWORDS_FILE,
WORKING_DIR,
connect_qumo_ollama,
mo,
pd,
) )
model_select
return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -116,7 +119,7 @@ def _(all_tags_df, mo):
return (tag_select,) return (tag_select,)
@app.cell @app.cell(hide_code=True)
def _(WORKING_DIR, all_tags_df, mo, tag_select): def _(WORKING_DIR, all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) mo.stop(not tag_select.value, mo.md("Select tag to continue"))
@@ -139,7 +142,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
# filter all_tags_df to only the document = file_dropdown.value # filter all_tags_df to only the document = file_dropdown.value
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
tags_df tags_df.head()
return ( return (
KEYWORDS_FPATH, KEYWORDS_FPATH,
KEYWORD_FREQ_FPATH, KEYWORD_FREQ_FPATH,
@@ -151,44 +154,65 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(KEYWORD_FREQ_FPATH, mo):
mo.md(r""" mo.md(rf"""
# 4) Keyword extraction # 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
""") """)
return return
@app.cell(hide_code=True)
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
    # Connect to the Ollama server and build the model-selection dropdown.
    #
    # On success the cell displays the dropdown and returns the client together
    # with the dropdown. On any failure both returned values are None, which the
    # downstream cells check for, and the cell displays the error message instead.
    try:
        client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
        model_select = mo.ui.dropdown(
            options=_models,
            value=_models[0],
            label="Select Ollama Model to use",
            searchable=True,
        )
        _output = model_select
    except Exception as e:
        # Deliberately broad: the notebook must stay usable without Ollama
        # (step 4b can load previously saved results), so any failure here,
        # including an empty model list, degrades to "no client".
        client = None
        model_select = None
        # Only a cell's last expression is rendered, so the message has to be
        # carried to the end of the cell; a bare mo.md(...) call here is discarded.
        _output = mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")

    _output
    return client, model_select
@app.cell @app.cell
def _(mo, start_processing_btn, tag_select): def _(mo, model_select, start_processing_btn, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
start_processing_btn start_processing_btn
return return
@app.cell @app.cell(hide_code=True)
def _(client, mo, model_select, pd, start_processing_btn, tags_df): def _(client, mo, model_select, pd, start_processing_btn, tags_df):
from utils import ollama_keyword_extraction, worker_extraction from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button # Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing") mo.stop(not start_processing_btn.value, "Click button above to start processing")
if client is not None:
df = tags_df
# Run keyword extraction
df = tags_df df['keywords'] = df.progress_apply(
# Run keyword extraction lambda row: pd.Series(ollama_keyword_extraction(
content=row['content'],
df['keywords'] = df.progress_apply( tag=row['tag'],
lambda row: pd.Series(ollama_keyword_extraction( client=client,
content=row['content'], model=model_select.value
tag=row['tag'], )),
client=client, axis=1
model=model_select.value )
)), else:
axis=1 mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
)
return (df,) return (df,)
@app.cell @app.cell(hide_code=True)
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn): def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
mo.stop(not start_processing_btn.value, "Click button above to process first") mo.stop(not start_processing_btn.value, "Click button above to process first")
@@ -232,26 +256,37 @@ def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(KEYWORD_FREQ_FPATH, mo):
mo.md(r""" mo.md(rf"""
# 4b) [optional] Load data from `keyword_frequencies_*.xlsx` # 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
""") """)
return return
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo): def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
pass
load_existing_btn = None load_existing_btn = None
if KEYWORD_FREQ_FPATH.exists(): if KEYWORD_FREQ_FPATH.exists():
load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`") load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
load_existing_btn load_existing_btn
return (load_existing_btn,) return (load_existing_btn,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): def _(
if load_existing_btn.value: KEYWORD_FREQ_FPATH,
VOICE_EXCLUDE_KEYWORDS_FILE,
freq_df,
load_existing_btn,
pd,
tag_select,
):
if load_existing_btn is not None and load_existing_btn.value:
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
# Drop nan rows if any # Drop nan rows if any
@@ -259,11 +294,23 @@ def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
_fdf.sort_values(by='frequency', ascending=False, inplace=True) _fdf.sort_values(by='frequency', ascending=False, inplace=True)
_fdf.reset_index(drop=True, inplace=True) _fdf.reset_index(drop=True, inplace=True)
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.") print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
frequency_df = _fdf frequency_df = _fdf
else: else:
frequency_df = freq_df frequency_df = freq_df
if tag_select.value.startswith('V'):
# Read exclusion list
excl_kw = []
with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
for line in _f:
excl_kw.append(line.strip())
_drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
frequency_df.drop(index=_drop_idx, inplace=True, axis=0)
print(f"Dropped {len(_drop_idx)} keywords automatically")
return (frequency_df,) return (frequency_df,)
@@ -305,30 +352,6 @@ def _(mo):
return (min_freq_select,) return (min_freq_select,)
@app.cell(hide_code=True)
def _(freq_df, frequency_df, min_freq_select, mo):
mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
freq_df_filtered.reset_index(drop=True, inplace=True)
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
table_selection = mo.ui.table(freq_df_filtered, page_size=50)
table_selection
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
# # create list of keywords sorted by their frequencies. only store the keyword
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
# sorted_keywords_list
return (keyword_freq_filtered,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, tag_select): def _(mo, tag_select):
mo.md(rf""" mo.md(rf"""
@@ -349,7 +372,80 @@ def _(mo, tag_select):
return return
@app.cell @app.cell(hide_code=True)
def _(frequency_df, min_freq_select, mo):
    # Show the keywords whose frequency meets the selected minimum in a
    # selectable table (50 rows per page). Halts the cell until `frequency_df`
    # has a 'keyword' column.
    _has_keywords = 'keyword' in frequency_df.columns
    mo.stop(not _has_keywords, "Waiting for keyword extraction to finish")

    MIN_FREQ = min_freq_select.value

    # Work on a copy so the table is detached from `frequency_df`; the copy
    # keeps the original index labels of the rows it shows.
    _meets_threshold = frequency_df['frequency'] >= MIN_FREQ
    _visible_rows = frequency_df.loc[_meets_threshold].copy()

    table_selection = mo.ui.table(_visible_rows, page_size=50)
    table_selection
    return MIN_FREQ, table_selection
@app.cell(hide_code=True)
def _(mo, table_selection):
    # Offer the "remove" button only while at least one table row is selected;
    # otherwise the cell yields None.
    _has_selection = len(table_selection.value) > 0
    remove_rows_btn = (
        mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
        if _has_selection
        else None
    )
    remove_rows_btn
    return (remove_rows_btn,)
@app.cell(hide_code=True)
def _(
    KEYWORD_FREQ_FPATH,
    VOICE_EXCLUDE_KEYWORDS_FILE,
    frequency_df,
    mo,
    remove_rows_btn,
    table_selection,
    tag_select,
):
    # Remove the rows selected in the keyword table from `frequency_df` and
    # write the updated frequencies back to the xlsx at KEYWORD_FREQ_FPATH.
    # For tags whose name starts with 'V', the removed keywords are also
    # recorded in VOICE_EXCLUDE_KEYWORDS_FILE (plain text, one keyword per line).
    # The cell displays a warning callout telling the user to reload in step 4b.
    _s = None
    if remove_rows_btn is not None and remove_rows_btn.value:
        selected_rows = table_selection.value
        if len(selected_rows) > 0:
            # Index labels of the selection, used to address rows of frequency_df.
            rows_to_drop = selected_rows.index.tolist()
            try:
                if tag_select.value.startswith('V'):
                    exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
                    # Open in append mode: the exclusion list accumulates across
                    # removals. Mode 'w' would truncate the file and silently
                    # discard every keyword excluded earlier.
                    with VOICE_EXCLUDE_KEYWORDS_FILE.open('a') as f:
                        f.writelines(f"{_kw}\n" for _kw in exclude_keywords)

                frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
            except KeyError:
                # The selected labels are no longer in frequency_df (for example
                # they were already removed), so the data must be reloaded first.
                _s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
            else:
                # Save updated frequencies back to xlsx
                frequency_df.to_excel(
                    KEYWORD_FREQ_FPATH,
                    index=False
                )
                print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
                _s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')

    _s
    return
@app.cell(hide_code=True)
def _(): def _():
IGNORE_WORDS = { IGNORE_WORDS = {
'chase as a brand': [ 'chase as a brand': [
@@ -384,11 +480,13 @@ def _(mo):
canvas_size = (1200, 800) canvas_size = (1200, 800)
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
return buffer, canvas_size, logo_switch
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
return buffer, canvas_size, logo_switch, n_words
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(logo_switch, mo): def _(logo_switch, mo, n_words):
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud") run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
mo.vstack([ mo.vstack([
@@ -399,7 +497,7 @@ def _(logo_switch, mo):
When satisfied with the result, click 'Save WordCloud to File' to save the image."""), When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
mo.md('---'), mo.md('---'),
mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')] mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
) )
return (run_wordcloud_btn,) return (run_wordcloud_btn,)
@@ -409,13 +507,15 @@ def _(
IGNORE_WORDS, IGNORE_WORDS,
Image, Image,
ImageDraw, ImageDraw,
MIN_FREQ,
WordCloud, WordCloud,
blue_color_func, blue_color_func,
buffer, buffer,
canvas_size, canvas_size,
keyword_freq_filtered, frequency_df,
logo_switch, logo_switch,
mo, mo,
n_words,
np, np,
plt, plt,
run_wordcloud_btn, run_wordcloud_btn,
@@ -424,6 +524,12 @@ def _(
if run_wordcloud_btn.value: if run_wordcloud_btn.value:
pass pass
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
# freq_df_filtered.reset_index(drop=True, inplace=True)
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
# remove specific keywords depending on selected tag # remove specific keywords depending on selected tag
if IGNORE_WORDS.get(tag_select.value.lower()): if IGNORE_WORDS.get(tag_select.value.lower()):
for word in IGNORE_WORDS[tag_select.value.lower()]: for word in IGNORE_WORDS[tag_select.value.lower()]:
@@ -433,7 +539,7 @@ def _(
if logo_switch.value: if logo_switch.value:
# 1. Load the logo # 1. Load the logo
# Make sure this path points to your uploaded file # Make sure this path points to your uploaded file
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
logo = Image.open(logo_path).convert("RGBA") logo = Image.open(logo_path).convert("RGBA")
# Optional: Resize logo if it's too large or small for the canvas # Optional: Resize logo if it's too large or small for the canvas
@@ -473,7 +579,7 @@ def _(
width=canvas_size[0], width=canvas_size[0],
height=canvas_size[1], height=canvas_size[1],
max_font_size=100, # Increased font size for larger canvas max_font_size=100, # Increased font size for larger canvas
max_words=20, # Increased word count to fill space max_words=n_words.value, # Increased word count to fill space
color_func=blue_color_func, color_func=blue_color_func,
mask=chase_mask, # Apply the circular mask mask=chase_mask, # Apply the circular mask
contour_width=0, contour_width=0,
@@ -487,7 +593,7 @@ def _(
width=canvas_size[0], width=canvas_size[0],
height=canvas_size[1], height=canvas_size[1],
max_font_size=150, # Increased font size for larger canvas max_font_size=150, # Increased font size for larger canvas
max_words=20, # Increased word count to fill space max_words=n_words.value, # Increased word count to fill space
color_func=blue_color_func, color_func=blue_color_func,
# mask=chase_mask, # Apply the circular mask # mask=chase_mask, # Apply the circular mask
# contour_width=0, # contour_width=0,

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB