progress apply

2025-12-16 16:28:07 -08:00
parent 12e14e3c9b
commit 228a6daa59
5 changed files with 212 additions and 45 deletions
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -7,8 +7,8 @@ app = marimo.App(width="medium")
@app.cell
 def _():
    import marimo as mo
-    # import pandas as pd
-    import modin.pandas as pd
+    import pandas as pd
+    import modin.pandas as mpd
    from tqdm import tqdm
    from pathlib import Path
    from datetime import datetime
@@ -20,8 +20,7 @@ def _():

    # initialize tqdm for pandas
    tqdm.pandas()
-    from modin.config import ProgressBar
-    ProgressBar.enable()
+

    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

@@ -134,6 +133,9 @@ def _(mo):
@app.cell
 def _(mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
+
+    # mdf = mpd.from_pandas(df)
+
    start_processing_btn = mo.ui.button(
        label="Start Keyword Extraction",
        kind="warn",
@@ -144,13 +146,23 @@ def _(mo, tag_select):


@app.cell
-def _(client, df, mo, model_select, pd, start_processing_btn):
-    from utils import ollama_keyword_extraction
+def _(
+    WORKING_DIR,
+    client,
+    df,
+    mo,
+    model_select,
+    pd,
+    start_processing_btn,
+    tag_select,
+):
+    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

+
    # Run keyword extraction
-    df['keywords'] = df.apply(
+    df['keywords'] = df.progress_apply(
        lambda row: pd.Series(ollama_keyword_extraction(
            content=row['content'], 
            tag=row['tag'], 
@@ -159,17 +171,9 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
        )),
        axis=1
    )
-    return

-
-@app.cell
-def _(df):
    df['keywords_txt'] = df['keywords'].progress_apply(lambda kws: ', '.join(kws))
-    return

-
-@app.cell
-def _(WORKING_DIR, df, tag_select):
    df[['id', 'tag', 'content', 'keywords_txt']].to_csv(
        WORKING_DIR / f'keywords_{tag_select.value.replace(" ", "-")}.csv',
        index=False
@@ -214,7 +218,7 @@ def _(df):
        else:
            keyword_freq[kw] = 1

-    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq > MIN_FREQ}
+    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}

    # create list of keywords sorted by their frequencies. only store the keyword
    sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
@@ -231,12 +235,12 @@ def _(plt):
    def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        # Use the provided random_state for reproducibility if available, else use random module
        r = random_state if random_state else random
-    
+
        # Sample from the darker end of the 'Blues' colormap (e.g., 0.4 to 1.0)
        # 0.0 is white/light, 1.0 is dark blue
        min_val, max_val = 0.4, 1.0
        color_val = r.uniform(min_val, max_val)
-    
+
        # Get color from matplotlib colormap
        rgba = plt.cm.Blues(color_val)
        return mcolors.to_hex(rgba)