format for consecutive runs

This commit is contained in:
2025-12-16 23:21:03 -08:00
parent e90b41f648
commit 50f9538dcf
3 changed files with 64 additions and 49 deletions

3
.gitignore vendored
View File

@@ -13,4 +13,5 @@ __pycache__/
data/ data/
docker-volumes/ docker-volumes/
logs/ logs/

View File

@@ -22,7 +22,6 @@ def _():
tqdm.pandas() tqdm.pandas()
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02-b_WordClouds') WORKING_DIR = Path('./data/processing/02-b_WordClouds')
@@ -32,14 +31,14 @@ def _():
if not TAGUETTE_EXPORT_DIR.exists(): if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True) TAGUETTE_EXPORT_DIR.mkdir(parents=True)
model_select = mo.ui.dropdown( return (
options=_models, OLLAMA_LOCATION,
value=_models[0], TAGUETTE_EXPORT_DIR,
label="Select Ollama Model to use", WORKING_DIR,
searchable=True, connect_qumo_ollama,
mo,
pd,
) )
model_select
return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -159,8 +158,27 @@ def _(mo):
@app.cell @app.cell
def _(mo, start_processing_btn, tag_select): def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) try:
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
except Exception as e:
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
model_select = None
client = None
model_select
return client, model_select
@app.cell
def _(mo, model_select, start_processing_btn, tag_select):
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
start_processing_btn start_processing_btn
return return
@@ -172,19 +190,21 @@ def _(client, mo, model_select, pd, start_processing_btn, tags_df):
# Wait for start processing button # Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing") mo.stop(not start_processing_btn.value, "Click button above to start processing")
if client is not None:
df = tags_df
# Run keyword extraction
df = tags_df df['keywords'] = df.progress_apply(
# Run keyword extraction lambda row: pd.Series(ollama_keyword_extraction(
content=row['content'],
df['keywords'] = df.progress_apply( tag=row['tag'],
lambda row: pd.Series(ollama_keyword_extraction( client=client,
content=row['content'], model=model_select.value
tag=row['tag'], )),
client=client, axis=1
model=model_select.value )
)), else:
axis=1 mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
)
return (df,) return (df,)
@@ -251,7 +271,7 @@ def _(KEYWORD_FREQ_FPATH, mo):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
if load_existing_btn.value: if load_existing_btn is not None and load_existing_btn.value:
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
# Drop nan rows if any # Drop nan rows if any
@@ -305,30 +325,6 @@ def _(mo):
return (min_freq_select,) return (min_freq_select,)
@app.cell(hide_code=True)
def _(freq_df, frequency_df, min_freq_select, mo):
    # Filter the keyword-frequency table by the selected minimum frequency and
    # expose both a table widget and a {keyword: frequency} mapping.
    #
    # `freq_df` is kept in the signature so the cell's interface is unchanged,
    # but the guard, the mask and the rows now all come from `frequency_df`.
    # Previously the mask was built from `freq_df` and applied to
    # `frequency_df`, which only works if both frames share the same index.

    # Stop here (showing the message) until a 'keyword' column is available.
    mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")

    MIN_FREQ = min_freq_select.value

    # Take an explicit copy so the in-place index reset below operates on an
    # independent frame rather than on a slice of `frequency_df`.
    freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
    freq_df_filtered.reset_index(drop=True, inplace=True)

    # Mapping returned for use by other cells.
    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()

    # Table of the filtered rows, 50 per page. The bare expression is kept as
    # the last statement before the return (presumably so marimo renders it as
    # the cell output -- confirm against marimo's cell-output rules).
    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
    table_selection
    return (keyword_freq_filtered,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, tag_select): def _(mo, tag_select):
mo.md(rf""" mo.md(rf"""
@@ -349,6 +345,24 @@ def _(mo, tag_select):
return return
@app.cell(hide_code=True)
def _(frequency_df, min_freq_select, mo):
    # Keep only keywords whose frequency reaches the selected minimum, then
    # publish the result as a table widget and as a {keyword: frequency} dict.

    # Nothing to filter until the frame carries a 'keyword' column.
    mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")

    MIN_FREQ = min_freq_select.value

    # Underscore-prefixed so the temporary mask stays private to this cell.
    _meets_threshold = frequency_df['frequency'] >= MIN_FREQ

    # Copy first, then renumber the rows of the independent frame in place.
    freq_df_filtered = frequency_df.loc[_meets_threshold].copy()
    freq_df_filtered.reset_index(drop=True, inplace=True)

    _freq_by_keyword = freq_df_filtered.set_index('keyword')['frequency']
    keyword_freq_filtered = _freq_by_keyword.to_dict()

    # 50 rows per page; the bare expression stays as the final statement
    # before the return, exactly as in the original cell.
    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
    table_selection
    return (keyword_freq_filtered,)
@app.cell @app.cell
def _(): def _():
IGNORE_WORDS = { IGNORE_WORDS = {
@@ -433,7 +447,7 @@ def _(
if logo_switch.value: if logo_switch.value:
# 1. Load the logo # 1. Load the logo
# Make sure this path points to your uploaded file # Make sure this path points to your uploaded file
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
logo = Image.open(logo_path).convert("RGBA") logo = Image.open(logo_path).convert("RGBA")
# Optional: Resize logo if it's too large or small for the canvas # Optional: Resize logo if it's too large or small for the canvas

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB