Compare commits
2 Commits
e90b41f648
...
8fbc11da7a
| Author | SHA1 | Date | |
|---|---|---|---|
| 8fbc11da7a | |||
| 50f9538dcf |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -13,4 +13,5 @@ __pycache__/
|
||||
|
||||
data/
|
||||
docker-volumes/
|
||||
logs/
|
||||
logs/
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ def _():
|
||||
tqdm.pandas()
|
||||
|
||||
|
||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||
|
||||
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
||||
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
|
||||
@@ -32,14 +31,14 @@ def _():
|
||||
if not TAGUETTE_EXPORT_DIR.exists():
|
||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
||||
|
||||
model_select = mo.ui.dropdown(
|
||||
options=_models,
|
||||
value=_models[0],
|
||||
label="Select Ollama Model to use",
|
||||
searchable=True,
|
||||
return (
|
||||
OLLAMA_LOCATION,
|
||||
TAGUETTE_EXPORT_DIR,
|
||||
WORKING_DIR,
|
||||
connect_qumo_ollama,
|
||||
mo,
|
||||
pd,
|
||||
)
|
||||
model_select
|
||||
return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -159,8 +158,27 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, start_processing_btn, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
|
||||
try:
|
||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||
model_select = mo.ui.dropdown(
|
||||
options=_models,
|
||||
value=_models[0],
|
||||
label="Select Ollama Model to use",
|
||||
searchable=True,
|
||||
)
|
||||
except Exception as e:
|
||||
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
|
||||
model_select = None
|
||||
client = None
|
||||
|
||||
model_select
|
||||
return client, model_select
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, model_select, start_processing_btn, tag_select):
|
||||
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn
|
||||
return
|
||||
@@ -172,19 +190,21 @@ def _(client, mo, model_select, pd, start_processing_btn, tags_df):
|
||||
# Wait for start processing button
|
||||
mo.stop(not start_processing_btn.value, "Click button above to start processing")
|
||||
|
||||
if client is not None:
|
||||
df = tags_df
|
||||
# Run keyword extraction
|
||||
|
||||
df = tags_df
|
||||
# Run keyword extraction
|
||||
|
||||
df['keywords'] = df.progress_apply(
|
||||
lambda row: pd.Series(ollama_keyword_extraction(
|
||||
content=row['content'],
|
||||
tag=row['tag'],
|
||||
client=client,
|
||||
model=model_select.value
|
||||
)),
|
||||
axis=1
|
||||
)
|
||||
df['keywords'] = df.progress_apply(
|
||||
lambda row: pd.Series(ollama_keyword_extraction(
|
||||
content=row['content'],
|
||||
tag=row['tag'],
|
||||
client=client,
|
||||
model=model_select.value
|
||||
)),
|
||||
axis=1
|
||||
)
|
||||
else:
|
||||
mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
|
||||
return (df,)
|
||||
|
||||
|
||||
@@ -251,7 +271,7 @@ def _(KEYWORD_FREQ_FPATH, mo):
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
|
||||
if load_existing_btn.value:
|
||||
if load_existing_btn is not None and load_existing_btn.value:
|
||||
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
||||
|
||||
# Drop nan rows if any
|
||||
@@ -305,30 +325,6 @@ def _(mo):
|
||||
return (min_freq_select,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(freq_df, frequency_df, min_freq_select, mo):
|
||||
mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
MIN_FREQ = min_freq_select.value
|
||||
|
||||
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
|
||||
|
||||
freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||
|
||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||
|
||||
table_selection = mo.ui.table(freq_df_filtered, page_size=50)
|
||||
table_selection
|
||||
|
||||
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
|
||||
|
||||
# # create list of keywords sorted by their frequencies. only store the keyword
|
||||
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
|
||||
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
|
||||
# sorted_keywords_list
|
||||
return (keyword_freq_filtered,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, tag_select):
|
||||
mo.md(rf"""
|
||||
@@ -350,6 +346,52 @@ def _(mo, tag_select):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(frequency_df, min_freq_select, mo):
|
||||
mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
MIN_FREQ = min_freq_select.value
|
||||
|
||||
_freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
|
||||
table_selection
|
||||
|
||||
return MIN_FREQ, table_selection
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, table_selection):
|
||||
remove_rows_btn = None
|
||||
if len(table_selection.value) >0 :
|
||||
remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
|
||||
|
||||
remove_rows_btn
|
||||
return (remove_rows_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, frequency_df, remove_rows_btn, table_selection):
|
||||
if remove_rows_btn is not None and remove_rows_btn.value:
|
||||
# get selected rows
|
||||
selected_rows = table_selection.value
|
||||
if len(selected_rows) >0 :
|
||||
rows_to_drop = table_selection.value.index.tolist()
|
||||
|
||||
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
||||
|
||||
# Save updated frequencies back to xlsx
|
||||
frequency_df.to_excel(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
index=False
|
||||
)
|
||||
|
||||
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||
|
||||
print("GO TO STEP 4b) and reload data to continue refining the dataset.")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
IGNORE_WORDS = {
|
||||
'chase as a brand': [
|
||||
@@ -409,11 +451,12 @@ def _(
|
||||
IGNORE_WORDS,
|
||||
Image,
|
||||
ImageDraw,
|
||||
MIN_FREQ,
|
||||
WordCloud,
|
||||
blue_color_func,
|
||||
buffer,
|
||||
canvas_size,
|
||||
keyword_freq_filtered,
|
||||
frequency_df,
|
||||
logo_switch,
|
||||
mo,
|
||||
np,
|
||||
@@ -424,6 +467,12 @@ def _(
|
||||
if run_wordcloud_btn.value:
|
||||
pass
|
||||
|
||||
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
# freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||
|
||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||
|
||||
# remove specific keywords depending on selected tag
|
||||
if IGNORE_WORDS.get(tag_select.value.lower()):
|
||||
for word in IGNORE_WORDS[tag_select.value.lower()]:
|
||||
@@ -433,7 +482,7 @@ def _(
|
||||
if logo_switch.value:
|
||||
# 1. Load the logo
|
||||
# Make sure this path points to your uploaded file
|
||||
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png"
|
||||
logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
|
||||
logo = Image.open(logo_path).convert("RGBA")
|
||||
|
||||
# Optional: Resize logo if it's too large or small for the canvas
|
||||
|
||||
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
Reference in New Issue
Block a user