Add functionality to load keywords from an Excel file

2025-12-16 22:25:12 -08:00
parent e81961b819
commit e90b41f648
4 changed files with 181 additions and 91 deletions
--- a/02-B_Thematic-Processing.py
+++ b/02-B_Thematic-Processing.py
@@ -104,6 +104,22 @@ def _(mo):
@app.cell(hide_code=True)
 def _(all_tags_df, mo):

+
+
+    tag_select = mo.ui.dropdown(
+        options=all_tags_df['tag'].unique().tolist(),
+        label="Select Tag to Process",
+        # value="Chase as a brand",
+        full_width=True,
+    )
+    tag_select
+    return (tag_select,)
+
+
+@app.cell
+def _(WORKING_DIR, all_tags_df, mo, tag_select):
+    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
+
    start_processing_btn = None
    start_processing_btn = mo.ui.button(
        label="Start Keyword Extraction",
@@ -111,26 +127,27 @@ def _(all_tags_df, mo):
        on_click=lambda val: True
    )

-    tag_select = mo.ui.dropdown(
-        options=all_tags_df['tag'].unique().tolist(),
-        label="Select Tag to Process",
-        value="Chase as a brand",
-        full_width=True,
-    )
-    tag_select
-    return start_processing_btn, tag_select
-
-
-@app.cell
-def _(all_tags_df, mo, tag_select):
-    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
-
    tag_fname = tag_select.value.replace(" ", "-").replace('/','-')

+    SAVE_DIR = WORKING_DIR / tag_fname
+
+    if not SAVE_DIR.exists():
+        SAVE_DIR.mkdir(parents=True)
+
+    KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
+    KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
+
    # filter all_tags_df to only the rows where tag == tag_select.value
-    df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
-    df
-    return df, tag_fname
+    tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
+    tags_df
+    return (
+        KEYWORDS_FPATH,
+        KEYWORD_FREQ_FPATH,
+        SAVE_DIR,
+        start_processing_btn,
+        tag_fname,
+        tags_df,
+    )


@app.cell(hide_code=True)
@@ -141,22 +158,24 @@ def _(mo):
    return


-@app.cell(hide_code=True)
+@app.cell
 def _(mo, start_processing_btn, tag_select):
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))

-    # mdf = mpd.from_pandas(df)
    start_processing_btn
    return


-@app.cell(hide_code=True)
-def _(client, df, mo, model_select, pd, start_processing_btn):
+@app.cell
+def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

+
+    df = tags_df
    # Run keyword extraction
+
    df['keywords'] = df.progress_apply(
        lambda row: pd.Series(ollama_keyword_extraction(
            content=row['content'], 
@@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
        )),
        axis=1
    )
+    return (df,)


-    return
-
-
-@app.cell(hide_code=True)
-def _(WORKING_DIR, df, mo, pd, tag_fname):
-    # Save results to csv
-    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
-
-    SAVE_DIR = WORKING_DIR / tag_fname
-
-    if not SAVE_DIR.exists():
-        SAVE_DIR.mkdir(parents=True)
-
+@app.cell
+def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
+    mo.stop(not start_processing_btn.value, "Click button above to process first")

    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))

-    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
-        SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
-        index=False
-    )
-
-
    all_keywords_list = df['keywords'].tolist()
+
    all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]

    # Calculate frequencies per keyword
@@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname):
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)

-    _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
+
+
+    # Save to Excel files
+
+    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
+        KEYWORDS_FPATH,
+        index=False
+    )
+
    freq_df.to_excel(
-        _freq_fpath,
+        KEYWORD_FREQ_FPATH,
        index=False
    )
    mo.vstack([
-        mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"),
-        mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`")
+        mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
+        mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
    ])
-    return SAVE_DIR, keyword_freq
+    return (freq_df,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(KEYWORD_FREQ_FPATH, mo):
+    # Offer a "load" button only when a saved frequencies file already exists;
+    # otherwise the name stays bound to None.
+    load_existing_btn = (
+        mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
+        if KEYWORD_FREQ_FPATH.exists()
+        else None
+    )
+
+    load_existing_btn
+    return (load_existing_btn,)
+
+
+@app.cell(hide_code=True)
+def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
+    # Choose the keyword-frequency table that is returned as `frequency_df`:
+    # the saved Excel file when the load button was clicked, otherwise the
+    # freshly computed `freq_df`.
+    # `load_existing_btn` is None when the frequencies file does not exist,
+    # so check for that before reading `.value`.
+    if load_existing_btn is not None and load_existing_btn.value:
+        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
+
+        # Drop nan rows if any
+        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
+        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
+        _fdf.reset_index(drop=True, inplace=True)
+        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
+
+        frequency_df = _fdf
+
+    else:
+        frequency_df = freq_df
+    return (frequency_df,)


@app.cell(hide_code=True)
@@ -228,7 +277,7 @@ def _(mo):

@app.cell(hide_code=True)
 def _():
-    # Start with loading all necessary libraries
+    # Import all necessary libraries
    import numpy as np
    from os import path
    from PIL import Image, ImageDraw
@@ -257,18 +306,26 @@ def _(mo):


@app.cell(hide_code=True)
-def _(df, keyword_freq, min_freq_select, mo):
-    mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
+def _(freq_df, frequency_df, min_freq_select, mo):
+    # Keep only keywords at or above the selected minimum frequency and show
+    # them in a table. The guard and the filter both use `frequency_df` (the
+    # table actually in use, possibly loaded from Excel) so the boolean mask
+    # always shares its index with the frame being filtered.
+    mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")

    MIN_FREQ = min_freq_select.value

+    freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ]

-    keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
+    # Reassign instead of mutating in place: the `.loc` result is a derived frame.
+    freq_df_filtered = freq_df_filtered.reset_index(drop=True)

-    # create list of keywords sorted by their frequencies. only store the keyword
-    sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
-    sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
-    sorted_keywords_list
+    # keyword -> frequency mapping returned by this cell
+    keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
+
+    table_selection = mo.ui.table(freq_df_filtered, page_size=50)
+    table_selection
    return (keyword_freq_filtered,)


@@ -278,8 +335,10 @@ def _(mo, tag_select):
    ## 5.2) Inspect Keyword Dataset

    1. Check the threshold is set correctly. If not, adjust accordingly
-    2. Check the keywords are good. If not, run extraction again (step 4)
-    3. Add explicit exclusions if necessary
+    2. Read all the keywords and verify they are good. If not
+       - Add explicit exclusions if necessary below
+       - OR Rerun the keyword extraction above
+


    Add words to this dict that should be ignored in the WordCloud for specific tags. 
@@ -299,7 +358,10 @@ def _():
            "banking",
            "chase",
            "jpmorgan",
-            "youthful"
+            "youthful",
+            "customer service",
+            "customer service focused",
+            "great brand",
        ],
        'why customer chase': [
            "customer service",
@@ -322,17 +384,20 @@ def _(mo):
    canvas_size = (1200, 800)

    logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
-
    return buffer, canvas_size, logo_switch


@app.cell(hide_code=True)
 def _(logo_switch, mo):
-    run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud")
+    run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")

    mo.vstack([
        mo.md("## 5.4) Generate WordCloud with/without Logo"),
-        mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."),
+        mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice. 
+
+        Placement and color of words is randomized, size is proportional to frequency.
+
+        When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
        mo.md('---'),
        mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
    )
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
    "numpy>=2.3.5",
    "ollama>=0.6.1",
    "openai>=2.9.0",
+    "openpyxl>=3.1.5",
    "pandas>=2.3.3",
    "pyzmq>=27.1.0",
    "requests>=2.32.5",
--- a/utils/keyword_analysis.py
+++ b/utils/keyword_analysis.py
@@ -48,38 +48,39 @@ def ollama_keyword_extraction(content, tag, client: Client, model) -> list:
    """
    
    # Construct prompt for Ollama model
-    prompt = f"""
-### Role
-You are a qualitative data analyst. Your task is to extract keywords from a user quote to build a semantic word cluster.
+    # Prompt optimized for small models (Llama 3.2):
+    # - Fewer rules, prioritized by importance
+    # - Explicit verbatim instruction (prevents truncation errors)
+    # - Examples that reinforce exact copying
+    # - Positive framing (do X) instead of negative (don't do Y)
+    # - Minimal formatting overhead
+    prompt = f"""Extract keywords from interview quotes for thematic analysis.

-### Guidelines
-1. **Quantity:** Extract **1-5** high-value keywords. If the quote only contains 1 valid insight, return only 1 keyword. Do not force extra words.
-2. **Specificity:** Avoid vague, single nouns (e.g., "tech", "choice", "system"). Instead, capture the descriptor (e.g., "tech-forward", "payment choice", "legacy system").
-3. **Adjectives:** Standalone adjectives are acceptable if they are strong descriptors (e.g., "reliable", "trustworthy", "professional").
-4. **Normalize:** Convert verbs to present tense and nouns to singular.
-5. **Output Format:** Return a single JSON object with the key "keywords" containing a list of strings.
+RULES (in priority order):
+1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
+2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
+3. Extract 1-5 keywords or short phrases that capture key themes.
+4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").

-### Examples
+EXAMPLES:

-**Input Context:** Chase as a Brand
-**Input Quote:** "I would describe it as, you know, like the next big thing, like, you know, tech forward, you know, customer service forward, and just hating that availability."
-**Output:** {{ "keywords": ["tech forward", "customer service focused", "availability"] }}
+Context: Chase as a Brand
+Quote: "It's definitely not, like, youthful or trendy."
+Output: {{"keywords": ["traditional", "established"]}}

-**Input Context:** App Usability
-**Input Quote:** "There are so many options when I try to pay, it's confusing."
-**Output:** {{ "keywords": ["confusing", "payment options"] }}
+Context: App Usability  
+Quote: "There are so many options when I try to pay, it's confusing."
+Output: {{"keywords": ["confusing", "overwhelming options"]}}

-**Input Context:** Investment Tools
-**Input Quote:** "It is just really reliable."
-**Output:** {{ "keywords": ["reliable"] }}
+Context: Brand Perception
+Quote: "I would say reliable, trustworthy, kind of old-school."
+Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}

-### Input Data
-**Context/Theme:** {tag}
-**Quote:** "{content}"
+NOW EXTRACT KEYWORDS:

-### Output
-```json
-"""
+Context: {tag}
+Quote: "{content}"
+Output:"""

    max_retries = 3
    for attempt in range(max_retries):
--- a/uv.lock
+++ b/uv.lock
@@ -379,6 +379,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl", hash = "sha256:bd772e4aca73aff037958d44f2be5229ded4c09927fcf8690c577b66234d6ceb", size = 633032, upload-time = "2025-11-06T02:35:52.391Z" },
 ]

+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
 [[package]]
 name = "fonttools"
 version = "4.61.1"
@@ -546,6 +555,7 @@ dependencies = [
    { name = "numpy" },
    { name = "ollama" },
    { name = "openai" },
+    { name = "openpyxl" },
    { name = "pandas" },
    { name = "pyzmq" },
    { name = "requests" },
@@ -560,6 +570,7 @@ requires-dist = [
    { name = "numpy", specifier = ">=2.3.5" },
    { name = "ollama", specifier = ">=0.6.1" },
    { name = "openai", specifier = ">=2.9.0" },
+    { name = "openpyxl", specifier = ">=3.1.5" },
    { name = "pandas", specifier = ">=2.3.3" },
    { name = "pyzmq", specifier = ">=27.1.0" },
    { name = "requests", specifier = ">=2.32.5" },
@@ -1176,6 +1187,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" },
 ]

+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "et-xmlfile" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
+]
+
 [[package]]
 name = "opentelemetry-api"
 version = "1.10.0"