Added functionality to load keywords from an Excel file

2025-12-16 22:25:12 -08:00
parent e81961b819
commit e90b41f648
4 changed files with 181 additions and 91 deletions
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -104,6 +104,22 @@ def _(mo):
@app.cell(hide_code=True)
 def _(all_tags_df, mo):
    tag_select = mo.ui.dropdown(
        options=all_tags_df['tag'].unique().tolist(),
        label="Select Tag to Process",
        # value="Chase as a brand",
        full_width=True,
    )
    tag_select
    return (tag_select,)
@app.cell
 def _(WORKING_DIR, all_tags_df, mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    start_processing_btn = None
    start_processing_btn = mo.ui.button(
        label="Start Keyword Extraction",
@@ -111,26 +127,27 @@ def _(all_tags_df, mo):
        on_click=lambda val: True
    )
    tag_select = mo.ui.dropdown(
        options=all_tags_df['tag'].unique().tolist(),
        label="Select Tag to Process",
        value="Chase as a brand",
        full_width=True,
    )
    tag_select
    return start_processing_btn, tag_select
@app.cell
 def _(all_tags_df, mo, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
    SAVE_DIR = WORKING_DIR / tag_fname
    if not SAVE_DIR.exists():
        SAVE_DIR.mkdir(parents=True)
    KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
    KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
    # Filter all_tags_df to only the rows whose tag matches tag_select.value
-    df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
+    tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
-    df
+    tags_df
-    return df, tag_fname
+    return (
        KEYWORDS_FPATH,
        KEYWORD_FREQ_FPATH,
        SAVE_DIR,
        start_processing_btn,
        tag_fname,
        tags_df,
    )
@app.cell(hide_code=True)
@@ -141,22 +158,24 @@ def _(mo):
    return
-@app.cell(hide_code=True)
+@app.cell
 def _(mo, start_processing_btn, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    # mdf = mpd.from_pandas(df)
    start_processing_btn
    return
-@app.cell(hide_code=True)
+@app.cell
-def _(client, df, mo, model_select, pd, start_processing_btn):
+def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")
    df = tags_df
    # Run keyword extraction
    df['keywords'] = df.progress_apply(
        lambda row: pd.Series(ollama_keyword_extraction(
            content=row['content'], 
@@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
        )),
        axis=1
    )
    return (df,)
-    return
+@app.cell
-
+def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
-
+    mo.stop(not start_processing_btn.value, "Click button above to process first")
@app.cell(hide_code=True)
 def _(WORKING_DIR, df, mo, pd, tag_fname):
    # Save results to Excel
    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
    SAVE_DIR = WORKING_DIR / tag_fname
    if not SAVE_DIR.exists():
        SAVE_DIR.mkdir(parents=True)
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
        index=False
    )
    all_keywords_list = df['keywords'].tolist()
    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
    # Calculate frequencies per keyword
@@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname):
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)
-    _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
+
    # Save to Excel files
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        KEYWORDS_FPATH,
        index=False
    )
    freq_df.to_excel(
-        _freq_fpath,
+        KEYWORD_FREQ_FPATH,
        index=False
    )
    mo.vstack([
-        mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
+        mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
-        mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
+        mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
    ])
-    return SAVE_DIR, keyword_freq
+    return (freq_df,)
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    # 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
    """)
    return
@app.cell(hide_code=True)
 def _(KEYWORD_FREQ_FPATH, mo):
    load_existing_btn = None
    if KEYWORD_FREQ_FPATH.exists():
        load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
    load_existing_btn
    return (load_existing_btn,)
@app.cell(hide_code=True)
 def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
    if load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
        # Drop nan rows if any
        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
        frequency_df = _fdf
    else:
        frequency_df = freq_df
    return (frequency_df,)
@app.cell(hide_code=True)
@@ -228,7 +277,7 @@ def _(mo):
@app.cell(hide_code=True)
 def _():
-    # Start with loading all necessary libraries
+    # Import all necessary libraries
    import numpy as np
    from os import path
    from PIL import Image, ImageDraw
@@ -257,18 +306,26 @@ def _(mo):
@app.cell(hide_code=True)
-def _(df, keyword_freq, min_freq_select, mo):
+def _(freq_df, frequency_df, min_freq_select, mo):
-    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
+    mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
    MIN_FREQ = min_freq_select.value
    freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
-    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
+    freq_df_filtered.reset_index(drop=True, inplace=True)
-    # create list of keywords sorted by their frequencies. only store the keyword
+    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
-    sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
+
-    sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
+    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
-    sorted_keywords_list
+    table_selection
    # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
    # # create list of keywords sorted by their frequencies. only store the keyword
    # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
    # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
    # sorted_keywords_list
    return (keyword_freq_filtered,)
@@ -278,8 +335,10 @@ def _(mo, tag_select):
    ## 5.2) Inspect Keyword Dataset
    1. Check the threshold is set correctly. If not, adjust accordingly
-    2. Check the keywords are good. If not, run extraction again (step 4)
+    2. Read all the keywords and verify they are good. If not
-    3. Add explicit exclusions if necessary
+       - Add explicit exclusions if necessary below
       - OR Rerun the keyword extraction above
    Add words to this dict that should be ignored in the WordCloud for specific tags. 
@@ -299,7 +358,10 @@ def _():
            "banking",
            "chase",
            "jpmorgan",
-            "youthful"
+            "youthful",
            "customer service",
            "customer service focused",
            "great brand",
        ],
        'why customer chase': [
            "customer service",
@@ -322,17 +384,20 @@ def _(mo):
    canvas_size = (1200, 800)
    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
    return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
 def _(logo_switch, mo):
-    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")
+    run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
    mo.vstack([
        mo.md("## 5.4) Generate WordCloud with/without Logo"),
-        mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
+        mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice. 
        Placement and color of words is randomized, size is proportional to frequency.
        When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
        mo.md('---'),
        mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
    )
@@ -370,7 +435,7 @@ def _(
        # Make sure this path points to your uploaded file
        logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" 
        logo = Image.open(logo_path).convert("RGBA")
-    
+
        # Optional: Resize logo if it's too large or small for the canvas
        # target_width = 600
        # ratio = target_width / logo.width
@@ -382,26 +447,26 @@ def _(
            # Use Image.Resampling.LANCZOS for high-quality downsampling
            # If you get an error, try Image.LANCZOS or Image.ANTIALIAS
            logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
-    
+
        # 3. Create the mask (0 = draw here, 255 = don't draw here)
        # Initialize with 0 (black/draw everywhere)
        mask_image = Image.new("L", canvas_size, 0)
        draw = ImageDraw.Draw(mask_image)
-    
+
        # 4. Draw a protected circular area in the center
        center = (canvas_size[0] // 2, canvas_size[1] // 2)
-    
+
        # Calculate radius: half of logo max dimension + buffer
        radius = (max(logo.size) // 2) + buffer
-    
+
        # Draw the white circle (255) which the WordCloud will avoid
        draw.ellipse(
            (center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
            fill=255
        )
-    
+
        chase_mask = np.array(mask_image)
-    
+
        # Generate the WordCloud
        wordcloud = WordCloud(
            background_color='white',
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
    "numpy>=2.3.5",
    "ollama>=0.6.1",
    "openai>=2.9.0",
    "openpyxl>=3.1.5",
    "pandas>=2.3.3",
    "pyzmq>=27.1.0",
    "requests>=2.32.5",
--- a/utils/keyword_analysis.py
+++ b/utils/keyword_analysis.py
@@ -48,38 +48,39 @@ def ollama_keyword_extraction(content, tag, client: Client, model) -> list:
    """
    # Construct prompt for Ollama model
-    prompt = f"""
+    # Prompt optimized for small models (Llama 3.2):
-### Role
+    # - Fewer rules, prioritized by importance
-You are a qualitative data analyst. Your task is to extract keywords from a user quote to build a semantic word cluster.
+    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.
-### Guidelines
+RULES (in priority order):
-1. **Quantity:** Extract **1-5** high-value keywords. If the quote only contains 1 valid insight, return only 1 keyword. Do not force extra words.
+1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
-2. **Specificity:** Avoid vague, single nouns (e.g., "tech", "choice", "system"). Instead, capture the descriptor (e.g., "tech-forward", "payment choice", "legacy system").
+2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
-3. **Adjectives:** Standalone adjectives are acceptable if they are strong descriptors (e.g., "reliable", "trustworthy", "professional").
+3. Extract 1-5 keywords or short phrases that capture key themes.
-4. **Normalize:** Convert verbs to present tense and nouns to singular.
+4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").
 5. **Output Format:** Return a single JSON object with the key "keywords" containing a list of strings.
-### Examples
+EXAMPLES:
-**Input Context:** Chase as a Brand
+Context: Chase as a Brand
-**Input Quote:** "I would describe it as, you know, like the next big thing, like, you know, tech forward, you know, customer service forward, and just hating that availability."
+Quote: "It's definitely not, like, youthful or trendy."
-**Output:** {{ "keywords": ["tech forward", "customer service focused", "availability"] }}
+Output: {{"keywords": ["traditional", "established"]}}
-**Input Context:** App Usability
+Context: App Usability  
-**Input Quote:** "There are so many options when I try to pay, it's confusing."
+Quote: "There are so many options when I try to pay, it's confusing."
-**Output:** {{ "keywords": ["confusing", "payment options"] }}
+Output: {{"keywords": ["confusing", "overwhelming options"]}}
-**Input Context:** Investment Tools
+Context: Brand Perception
-**Input Quote:** "It is just really reliable."
+Quote: "I would say reliable, trustworthy, kind of old-school."
-**Output:** {{ "keywords": ["reliable"] }}
+Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}
-### Input Data
+NOW EXTRACT KEYWORDS:
 **Context/Theme:** {tag}
 **Quote:** "{content}"
-### Output
+Context: {tag}
-```json
+Quote: "{content}"
-"""
+Output:"""
    max_retries = 3
    for attempt in range(max_retries):
--- a/uv.lock
+++ b/uv.lock
@@ -379,6 +379,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl", hash = "sha256:bd772e4aca73aff037958d44f2be5229ded4c09927fcf8690c577b66234d6ceb", size = 633032, upload-time = "2025-11-06T02:35:52.391Z" },
 ]
 [[package]]
 name = "et-xmlfile"
 version = "2.0.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
 ]
 [[package]]
 name = "fonttools"
 version = "4.61.1"
@@ -546,6 +555,7 @@ dependencies = [
    { name = "numpy" },
    { name = "ollama" },
    { name = "openai" },
    { name = "openpyxl" },
    { name = "pandas" },
    { name = "pyzmq" },
    { name = "requests" },
@@ -560,6 +570,7 @@ requires-dist = [
    { name = "numpy", specifier = ">=2.3.5" },
    { name = "ollama", specifier = ">=0.6.1" },
    { name = "openai", specifier = ">=2.9.0" },
    { name = "openpyxl", specifier = ">=3.1.5" },
    { name = "pandas", specifier = ">=2.3.3" },
    { name = "pyzmq", specifier = ">=27.1.0" },
    { name = "requests", specifier = ">=2.32.5" },
@@ -1176,6 +1187,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" },
 ]
 [[package]]
 name = "openpyxl"
 version = "3.1.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "et-xmlfile" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
 ]
 [[package]]
 name = "opentelemetry-api"
 version = "1.10.0"