Compare commits
6 Commits
e90b41f648
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 069e568d00 | |||
| 417273c745 | |||
| eee6947f01 | |||
| d6b449e8c6 | |||
| 8fbc11da7a | |||
| 50f9538dcf |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -13,4 +13,5 @@ __pycache__/
|
|||||||
|
|
||||||
data/
|
data/
|
||||||
docker-volumes/
|
docker-volumes/
|
||||||
logs/
|
logs/
|
||||||
|
|
||||||
|
|||||||
@@ -22,24 +22,27 @@ def _():
|
|||||||
tqdm.pandas()
|
tqdm.pandas()
|
||||||
|
|
||||||
|
|
||||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
|
||||||
|
|
||||||
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
||||||
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
|
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'
|
||||||
|
|
||||||
if not WORKING_DIR.exists():
|
if not WORKING_DIR.exists():
|
||||||
WORKING_DIR.mkdir(parents=True)
|
WORKING_DIR.mkdir(parents=True)
|
||||||
if not TAGUETTE_EXPORT_DIR.exists():
|
if not TAGUETTE_EXPORT_DIR.exists():
|
||||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
||||||
|
|
||||||
model_select = mo.ui.dropdown(
|
if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
|
||||||
options=_models,
|
VOICE_EXCLUDE_KEYWORDS_FILE.touch()
|
||||||
value=_models[0],
|
|
||||||
label="Select Ollama Model to use",
|
return (
|
||||||
searchable=True,
|
OLLAMA_LOCATION,
|
||||||
|
TAGUETTE_EXPORT_DIR,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
|
WORKING_DIR,
|
||||||
|
connect_qumo_ollama,
|
||||||
|
mo,
|
||||||
|
pd,
|
||||||
)
|
)
|
||||||
model_select
|
|
||||||
return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -116,7 +119,7 @@ def _(all_tags_df, mo):
|
|||||||
return (tag_select,)
|
return (tag_select,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell(hide_code=True)
|
||||||
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
||||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||||
|
|
||||||
@@ -139,7 +142,7 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
|||||||
|
|
||||||
# filter all_tags_df to only the document = file_dropdown.value
|
# filter all_tags_df to only the document = file_dropdown.value
|
||||||
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
||||||
tags_df
|
tags_df.head()
|
||||||
return (
|
return (
|
||||||
KEYWORDS_FPATH,
|
KEYWORDS_FPATH,
|
||||||
KEYWORD_FREQ_FPATH,
|
KEYWORD_FREQ_FPATH,
|
||||||
@@ -151,44 +154,65 @@ def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(mo):
|
def _(KEYWORD_FREQ_FPATH, mo):
|
||||||
mo.md(r"""
|
mo.md(rf"""
|
||||||
# 4) Keyword extraction
|
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
|
||||||
|
try:
|
||||||
|
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||||
|
model_select = mo.ui.dropdown(
|
||||||
|
options=_models,
|
||||||
|
value=_models[0],
|
||||||
|
label="Select Ollama Model to use",
|
||||||
|
searchable=True,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
|
||||||
|
model_select = None
|
||||||
|
client = None
|
||||||
|
|
||||||
|
model_select
|
||||||
|
return client, model_select
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo, start_processing_btn, tag_select):
|
def _(mo, model_select, start_processing_btn, tag_select):
|
||||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
|
||||||
|
|
||||||
start_processing_btn
|
start_processing_btn
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell(hide_code=True)
|
||||||
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
|
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
|
||||||
from utils import ollama_keyword_extraction, worker_extraction
|
from utils import ollama_keyword_extraction, worker_extraction
|
||||||
# Wait for start processing button
|
# Wait for start processing button
|
||||||
mo.stop(not start_processing_btn.value, "Click button above to start processing")
|
mo.stop(not start_processing_btn.value, "Click button above to start processing")
|
||||||
|
|
||||||
|
if client is not None:
|
||||||
|
df = tags_df
|
||||||
|
# Run keyword extraction
|
||||||
|
|
||||||
df = tags_df
|
df['keywords'] = df.progress_apply(
|
||||||
# Run keyword extraction
|
lambda row: pd.Series(ollama_keyword_extraction(
|
||||||
|
content=row['content'],
|
||||||
df['keywords'] = df.progress_apply(
|
tag=row['tag'],
|
||||||
lambda row: pd.Series(ollama_keyword_extraction(
|
client=client,
|
||||||
content=row['content'],
|
model=model_select.value
|
||||||
tag=row['tag'],
|
)),
|
||||||
client=client,
|
axis=1
|
||||||
model=model_select.value
|
)
|
||||||
)),
|
else:
|
||||||
axis=1
|
mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
|
||||||
)
|
|
||||||
return (df,)
|
return (df,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
|
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
|
||||||
mo.stop(not start_processing_btn.value, "Click button above to process first")
|
mo.stop(not start_processing_btn.value, "Click button above to process first")
|
||||||
|
|
||||||
@@ -232,26 +256,37 @@ def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(mo):
|
def _(KEYWORD_FREQ_FPATH, mo):
|
||||||
mo.md(r"""
|
mo.md(rf"""
|
||||||
# 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
|
# 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORD_FREQ_FPATH, mo):
|
def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
|
||||||
|
if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
load_existing_btn = None
|
load_existing_btn = None
|
||||||
if KEYWORD_FREQ_FPATH.exists():
|
if KEYWORD_FREQ_FPATH.exists():
|
||||||
load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
|
load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
|
||||||
|
|
||||||
load_existing_btn
|
load_existing_btn
|
||||||
return (load_existing_btn,)
|
return (load_existing_btn,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
|
def _(
|
||||||
if load_existing_btn.value:
|
KEYWORD_FREQ_FPATH,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
|
freq_df,
|
||||||
|
load_existing_btn,
|
||||||
|
pd,
|
||||||
|
tag_select,
|
||||||
|
):
|
||||||
|
if load_existing_btn is not None and load_existing_btn.value:
|
||||||
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
||||||
|
|
||||||
# Drop nan rows if any
|
# Drop nan rows if any
|
||||||
@@ -259,11 +294,23 @@ def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
|
|||||||
_fdf.sort_values(by='frequency', ascending=False, inplace=True)
|
_fdf.sort_values(by='frequency', ascending=False, inplace=True)
|
||||||
_fdf.reset_index(drop=True, inplace=True)
|
_fdf.reset_index(drop=True, inplace=True)
|
||||||
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
||||||
|
|
||||||
frequency_df = _fdf
|
frequency_df = _fdf
|
||||||
|
|
||||||
else:
|
else:
|
||||||
frequency_df = freq_df
|
frequency_df = freq_df
|
||||||
|
|
||||||
|
if tag_select.value.startswith('V'):
|
||||||
|
# Read exclusion list
|
||||||
|
excl_kw = []
|
||||||
|
with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
|
||||||
|
for line in _f:
|
||||||
|
excl_kw.append(line.strip())
|
||||||
|
|
||||||
|
_drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
|
||||||
|
|
||||||
|
frequency_df.drop(index=_drop_idx, inplace=True, axis=0)
|
||||||
|
print(f"Dropped {len(_drop_idx)} keywords automatically")
|
||||||
return (frequency_df,)
|
return (frequency_df,)
|
||||||
|
|
||||||
|
|
||||||
@@ -305,30 +352,6 @@ def _(mo):
|
|||||||
return (min_freq_select,)
|
return (min_freq_select,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
|
||||||
def _(freq_df, frequency_df, min_freq_select, mo):
|
|
||||||
mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
|
|
||||||
|
|
||||||
MIN_FREQ = min_freq_select.value
|
|
||||||
|
|
||||||
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
|
|
||||||
|
|
||||||
freq_df_filtered.reset_index(drop=True, inplace=True)
|
|
||||||
|
|
||||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
|
||||||
|
|
||||||
table_selection = mo.ui.table(freq_df_filtered, page_size=50)
|
|
||||||
table_selection
|
|
||||||
|
|
||||||
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
|
|
||||||
|
|
||||||
# # create list of keywords sorted by their frequencies. only store the keyword
|
|
||||||
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
|
|
||||||
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
|
|
||||||
# sorted_keywords_list
|
|
||||||
return (keyword_freq_filtered,)
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(mo, tag_select):
|
def _(mo, tag_select):
|
||||||
mo.md(rf"""
|
mo.md(rf"""
|
||||||
@@ -349,7 +372,80 @@ def _(mo, tag_select):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell(hide_code=True)
|
||||||
|
def _(frequency_df, min_freq_select, mo):
|
||||||
|
mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
|
||||||
|
|
||||||
|
MIN_FREQ = min_freq_select.value
|
||||||
|
|
||||||
|
_freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||||
|
|
||||||
|
table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
|
||||||
|
table_selection
|
||||||
|
|
||||||
|
return MIN_FREQ, table_selection
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo, table_selection):
|
||||||
|
remove_rows_btn = None
|
||||||
|
if len(table_selection.value) >0 :
|
||||||
|
remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
|
||||||
|
|
||||||
|
remove_rows_btn
|
||||||
|
return (remove_rows_btn,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(
|
||||||
|
KEYWORD_FREQ_FPATH,
|
||||||
|
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||||
|
frequency_df,
|
||||||
|
mo,
|
||||||
|
remove_rows_btn,
|
||||||
|
table_selection,
|
||||||
|
tag_select,
|
||||||
|
):
|
||||||
|
_s = None
|
||||||
|
if remove_rows_btn is not None and remove_rows_btn.value:
|
||||||
|
# get selected rows
|
||||||
|
selected_rows = table_selection.value
|
||||||
|
if len(selected_rows) >0 :
|
||||||
|
rows_to_drop = table_selection.value.index.tolist()
|
||||||
|
try:
|
||||||
|
if tag_select.value.startswith('V'):
|
||||||
|
# append values to an VoiceKeywordsExclusion file (txt file just a list of keywords)
|
||||||
|
exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
|
||||||
|
|
||||||
|
with VOICE_EXCLUDE_KEYWORDS_FILE.open('w') as f:
|
||||||
|
for _kw in exclude_keywords:
|
||||||
|
f.write(_kw + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
except KeyError:
|
||||||
|
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
||||||
|
else:
|
||||||
|
# Save updated frequencies back to xlsx
|
||||||
|
frequency_df.to_excel(
|
||||||
|
KEYWORD_FREQ_FPATH,
|
||||||
|
index=False
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||||
|
|
||||||
|
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
|
||||||
|
_s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')
|
||||||
|
|
||||||
|
_s
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
def _():
|
def _():
|
||||||
IGNORE_WORDS = {
|
IGNORE_WORDS = {
|
||||||
'chase as a brand': [
|
'chase as a brand': [
|
||||||
@@ -384,11 +480,13 @@ def _(mo):
|
|||||||
canvas_size = (1200, 800)
|
canvas_size = (1200, 800)
|
||||||
|
|
||||||
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
||||||
return buffer, canvas_size, logo_switch
|
|
||||||
|
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
|
||||||
|
return buffer, canvas_size, logo_switch, n_words
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(logo_switch, mo):
|
def _(logo_switch, mo, n_words):
|
||||||
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
|
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
|
||||||
|
|
||||||
mo.vstack([
|
mo.vstack([
|
||||||
@@ -399,7 +497,7 @@ def _(logo_switch, mo):
|
|||||||
|
|
||||||
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
|
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
|
||||||
mo.md('---'),
|
mo.md('---'),
|
||||||
mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
|
mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
|
||||||
)
|
)
|
||||||
return (run_wordcloud_btn,)
|
return (run_wordcloud_btn,)
|
||||||
|
|
||||||
@@ -409,13 +507,15 @@ def _(
|
|||||||
IGNORE_WORDS,
|
IGNORE_WORDS,
|
||||||
Image,
|
Image,
|
||||||
ImageDraw,
|
ImageDraw,
|
||||||
|
MIN_FREQ,
|
||||||
WordCloud,
|
WordCloud,
|
||||||
blue_color_func,
|
blue_color_func,
|
||||||
buffer,
|
buffer,
|
||||||
canvas_size,
|
canvas_size,
|
||||||
keyword_freq_filtered,
|
frequency_df,
|
||||||
logo_switch,
|
logo_switch,
|
||||||
mo,
|
mo,
|
||||||
|
n_words,
|
||||||
np,
|
np,
|
||||||
plt,
|
plt,
|
||||||
run_wordcloud_btn,
|
run_wordcloud_btn,
|
||||||
@@ -424,6 +524,12 @@ def _(
|
|||||||
if run_wordcloud_btn.value:
|
if run_wordcloud_btn.value:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||||
|
|
||||||
|
# freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
|
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||||
|
|
||||||
# remove specific keywords depending on selected tag
|
# remove specific keywords depending on selected tag
|
||||||
if IGNORE_WORDS.get(tag_select.value.lower()):
|
if IGNORE_WORDS.get(tag_select.value.lower()):
|
||||||
for word in IGNORE_WORDS[tag_select.value.lower()]:
|
for word in IGNORE_WORDS[tag_select.value.lower()]:
|
||||||
@@ -433,7 +539,7 @@ def _(
|
|||||||
if logo_switch.value:
|
if logo_switch.value:
|
||||||
# 1. Load the logo
|
# 1. Load the logo
|
||||||
# Make sure this path points to your uploaded file
|
# Make sure this path points to your uploaded file
|
||||||
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png"
|
logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
|
||||||
logo = Image.open(logo_path).convert("RGBA")
|
logo = Image.open(logo_path).convert("RGBA")
|
||||||
|
|
||||||
# Optional: Resize logo if it's too large or small for the canvas
|
# Optional: Resize logo if it's too large or small for the canvas
|
||||||
@@ -473,7 +579,7 @@ def _(
|
|||||||
width=canvas_size[0],
|
width=canvas_size[0],
|
||||||
height=canvas_size[1],
|
height=canvas_size[1],
|
||||||
max_font_size=100, # Increased font size for larger canvas
|
max_font_size=100, # Increased font size for larger canvas
|
||||||
max_words=20, # Increased word count to fill space
|
max_words=n_words.value, # Increased word count to fill space
|
||||||
color_func=blue_color_func,
|
color_func=blue_color_func,
|
||||||
mask=chase_mask, # Apply the circular mask
|
mask=chase_mask, # Apply the circular mask
|
||||||
contour_width=0,
|
contour_width=0,
|
||||||
@@ -487,7 +593,7 @@ def _(
|
|||||||
width=canvas_size[0],
|
width=canvas_size[0],
|
||||||
height=canvas_size[1],
|
height=canvas_size[1],
|
||||||
max_font_size=150, # Increased font size for larger canvas
|
max_font_size=150, # Increased font size for larger canvas
|
||||||
max_words=20, # Increased word count to fill space
|
max_words=n_words.value, # Increased word count to fill space
|
||||||
color_func=blue_color_func,
|
color_func=blue_color_func,
|
||||||
# mask=chase_mask, # Apply the circular mask
|
# mask=chase_mask, # Apply the circular mask
|
||||||
# contour_width=0,
|
# contour_width=0,
|
||||||
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
Reference in New Issue
Block a user