diff --git a/.gitignore b/.gitignore index 3c1ca91..42d70d7 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ __pycache__/ data/ docker-volumes/ -logs/ \ No newline at end of file +logs/ + diff --git a/02-B_Thematic-Processing.py b/02-B_Thematic-Processing.py index a43cd59..f3411c8 100644 --- a/02-B_Thematic-Processing.py +++ b/02-B_Thematic-Processing.py @@ -22,7 +22,6 @@ def _(): tqdm.pandas() - client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') WORKING_DIR = Path('./data/processing/02-b_WordClouds') @@ -32,14 +31,14 @@ def _(): if not TAGUETTE_EXPORT_DIR.exists(): TAGUETTE_EXPORT_DIR.mkdir(parents=True) - model_select = mo.ui.dropdown( - options=_models, - value=_models[0], - label="Select Ollama Model to use", - searchable=True, + return ( + OLLAMA_LOCATION, + TAGUETTE_EXPORT_DIR, + WORKING_DIR, + connect_qumo_ollama, + mo, + pd, ) - model_select - return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd @app.cell(hide_code=True) @@ -159,8 +158,27 @@ def _(mo): @app.cell -def _(mo, start_processing_btn, tag_select): - mo.stop(not tag_select.value, mo.md("Select tag to continue")) +def _(OLLAMA_LOCATION, connect_qumo_ollama, mo): + try: + client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) + model_select = mo.ui.dropdown( + options=_models, + value=_models[0], + label="Select Ollama Model to use", + searchable=True, + ) + except Exception as e: + mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}") + model_select = None + client = None + + model_select + return client, model_select + + +@app.cell +def _(mo, model_select, start_processing_btn, tag_select): + mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue")) start_processing_btn return @@ -172,19 +190,21 @@ def _(client, mo, model_select, pd, start_processing_btn, tags_df): # Wait for start processing button mo.stop(not start_processing_btn.value, "Click button above to start processing") + if client is not None: + df = tags_df + # Run keyword extraction - df = tags_df - # Run keyword extraction - - df['keywords'] = df.progress_apply( - lambda row: pd.Series(ollama_keyword_extraction( - content=row['content'], - tag=row['tag'], - client=client, - model=model_select.value - )), - axis=1 - ) + df['keywords'] = df.progress_apply( + lambda row: pd.Series(ollama_keyword_extraction( + content=row['content'], + tag=row['tag'], + client=client, + model=model_select.value + )), + axis=1 + ) + else: + mo.md("Ollama client not available, See 4b) for loading data from xlsx.") return (df,) @@ -251,7 +271,7 @@ def _(KEYWORD_FREQ_FPATH, mo): @app.cell(hide_code=True) def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): - if load_existing_btn.value: + if load_existing_btn is not None and load_existing_btn.value: _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') # Drop nan rows if any @@ -305,30 +325,6 @@ def _(mo): return (min_freq_select,) -@app.cell(hide_code=True) -def _(freq_df, frequency_df, min_freq_select, mo): - mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish") - - MIN_FREQ = min_freq_select.value - - freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ] - - freq_df_filtered.reset_index(drop=True, inplace=True) - - keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict() - - table_selection = mo.ui.table(freq_df_filtered, page_size=50) - table_selection - - # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} - - # # create list of keywords sorted by their frequencies. only store the keyword - # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True) - # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords] - # sorted_keywords_list - return (keyword_freq_filtered,) - - @app.cell(hide_code=True) def _(mo, tag_select): mo.md(rf""" @@ -349,6 +345,24 @@ def _(mo, tag_select): return +@app.cell(hide_code=True) +def _(frequency_df, min_freq_select, mo): + mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish") + + MIN_FREQ = min_freq_select.value + + freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy() + + freq_df_filtered.reset_index(drop=True, inplace=True) + + keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict() + + table_selection = mo.ui.table(freq_df_filtered, page_size=50) + table_selection + + return (keyword_freq_filtered,) + + @app.cell def _(): IGNORE_WORDS = { @@ -433,7 +447,7 @@ def _( if logo_switch.value: # 1. Load the logo # Make sure this path points to your uploaded file - logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" + logo_path = "./assets/JP-Morgan-Chase-Symbol.png" logo = Image.open(logo_path).convert("RGBA") # Optional: Resize logo if it's too large or small for the canvas diff --git a/assets/JP-Morgan-Chase-Symbol.png b/assets/JP-Morgan-Chase-Symbol.png new file mode 100644 index 0000000..6189df0 Binary files /dev/null and b/assets/JP-Morgan-Chase-Symbol.png differ