From e90b41f648e24ee1fa91af9dcce9c4c300a0b9eb Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 16 Dec 2025 22:25:12 -0800 Subject: [PATCH] added functionality to load keywords from excel file --- 02-B_Thematic-Processing.py | 197 ++++++++++++++++++++++++------------ pyproject.toml | 1 + utils/keyword_analysis.py | 51 +++++----- uv.lock | 23 +++++ 4 files changed, 181 insertions(+), 91 deletions(-) diff --git a/02-B_Thematic-Processing.py b/02-B_Thematic-Processing.py index 61d5328..a43cd59 100644 --- a/02-B_Thematic-Processing.py +++ b/02-B_Thematic-Processing.py @@ -104,6 +104,22 @@ def _(mo): @app.cell(hide_code=True) def _(all_tags_df, mo): + + + tag_select = mo.ui.dropdown( + options=all_tags_df['tag'].unique().tolist(), + label="Select Tag to Process", + # value="Chase as a brand", + full_width=True, + ) + tag_select + return (tag_select,) + + +@app.cell +def _(WORKING_DIR, all_tags_df, mo, tag_select): + mo.stop(not tag_select.value, mo.md("Select tag to continue")) + start_processing_btn = None start_processing_btn = mo.ui.button( label="Start Keyword Extraction", @@ -111,26 +127,27 @@ def _(all_tags_df, mo): on_click=lambda val: True ) - tag_select = mo.ui.dropdown( - options=all_tags_df['tag'].unique().tolist(), - label="Select Tag to Process", - value="Chase as a brand", - full_width=True, - ) - tag_select - return start_processing_btn, tag_select - - -@app.cell -def _(all_tags_df, mo, tag_select): - mo.stop(not tag_select.value, mo.md("Select tag to continue")) - tag_fname = tag_select.value.replace(" ", "-").replace('/','-') + SAVE_DIR = WORKING_DIR / tag_fname + + if not SAVE_DIR.exists(): + SAVE_DIR.mkdir(parents=True) + + KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx' + KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx' + # filter all_tags_df to only the document = file_dropdown.value - df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() - df - return df, tag_fname + tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() + tags_df + return ( + KEYWORDS_FPATH, + KEYWORD_FREQ_FPATH, + SAVE_DIR, + start_processing_btn, + tag_fname, + tags_df, + ) @app.cell(hide_code=True) @@ -141,22 +158,24 @@ def _(mo): return -@app.cell(hide_code=True) +@app.cell def _(mo, start_processing_btn, tag_select): mo.stop(not tag_select.value, mo.md("Select tag to continue")) - # mdf = mpd.from_pandas(df) start_processing_btn return -@app.cell(hide_code=True) -def _(client, df, mo, model_select, pd, start_processing_btn): +@app.cell +def _(client, mo, model_select, pd, start_processing_btn, tags_df): from utils import ollama_keyword_extraction, worker_extraction # Wait for start processing button mo.stop(not start_processing_btn.value, "Click button above to start processing") + + df = tags_df # Run keyword extraction + df['keywords'] = df.progress_apply( lambda row: pd.Series(ollama_keyword_extraction( content=row['content'], @@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn): )), axis=1 ) + return (df,) - return - - -@app.cell(hide_code=True) -def _(WORKING_DIR, df, mo, pd, tag_fname): - # Save results to csv - mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish") - - SAVE_DIR = WORKING_DIR / tag_fname - - if not SAVE_DIR.exists(): - SAVE_DIR.mkdir(parents=True) - +@app.cell +def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn): + mo.stop(not start_processing_btn.value, "Click button above to process first") df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws)) - df[['id', 'tag', 'content', 'keywords_txt']].to_excel( - SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx', - index=False - ) - - all_keywords_list = df['keywords'].tolist() + all_keywords_flat = [item for sublist in all_keywords_list for item in sublist] # Calculate frequencies per keyword @@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname): freq_df.reset_index(inplace=True) freq_df.sort_values(by='frequency', ascending=False, inplace=True) - _freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx' + + + # Save to Excel files + + df[['id', 'tag', 'content', 'keywords_txt']].to_excel( + KEYWORDS_FPATH, + index=False + ) + freq_df.to_excel( - _freq_fpath, + KEYWORD_FREQ_FPATH, index=False ) mo.vstack([ - mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"), - mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`") + mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"), + mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`") ]) - return SAVE_DIR, keyword_freq + return (freq_df,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # 4b) [optional] Load data from `keyword_frequencies_*.xlsx` + """) + return + + +@app.cell(hide_code=True) +def _(KEYWORD_FREQ_FPATH, mo): + load_existing_btn = None + if KEYWORD_FREQ_FPATH.exists(): + load_existing_btn = mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`") + + load_existing_btn + return (load_existing_btn,) + + +@app.cell(hide_code=True) +def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): + if load_existing_btn.value: + _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') + + # Drop nan rows if any + _fdf.dropna(subset=['keyword', 'frequency'], inplace=True) + _fdf.sort_values(by='frequency', ascending=False, inplace=True) + _fdf.reset_index(drop=True, inplace=True) + print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.") + + frequency_df = _fdf + + else: + frequency_df = freq_df + return (frequency_df,) @app.cell(hide_code=True) @@ -228,7 +277,7 @@ def _(mo): @app.cell(hide_code=True) def _(): - # Start with loading all necessary libraries + # Import all necessary libraries import numpy as np from os import path from PIL import Image, ImageDraw @@ -257,18 +306,26 @@ def _(mo): @app.cell(hide_code=True) -def _(df, keyword_freq, min_freq_select, mo): - mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish") +def _(freq_df, frequency_df, min_freq_select, mo): + mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish") MIN_FREQ = min_freq_select.value + freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ] - keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} + freq_df_filtered.reset_index(drop=True, inplace=True) - # create list of keywords sorted by their frequencies. only store the keyword - sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True) - sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords] - sorted_keywords_list + keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict() + + table_selection = mo.ui.table(freq_df_filtered, page_size=50) + table_selection + + # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} + + # # create list of keywords sorted by their frequencies. only store the keyword + # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True) + # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords] + # sorted_keywords_list return (keyword_freq_filtered,) @@ -278,8 +335,10 @@ def _(mo, tag_select): ## 5.2) Inspect Keyword Dataset 1. Check the threshold is set correctly. If not, adjust accordingly - 2. Check the keywords are good. If not, run extraction again (step 4) - 3. Add explicit exclusions if necessary + 2. Read all the keywords and verify they are good. If not + - Add explicit exclusions if necessary below + - OR Rerun the keyword extraction above + Add words to this dict that should be ignored in the WordCloud for specific tags. @@ -299,7 +358,10 @@ def _(): "banking", "chase", "jpmorgan", - "youthful" + "youthful", + "customer service", + "customer service focused", + "great brand", ], 'why customer chase': [ "customer service", @@ -322,17 +384,20 @@ def _(mo): canvas_size = (1200, 800) logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) - return buffer, canvas_size, logo_switch @app.cell(hide_code=True) def _(logo_switch, mo): - run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud") + run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud") mo.vstack([ mo.md("## 5.4) Generate WordCloud with/without Logo"), - mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."), + mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice. + + Placement and color of words is randomized, size is proportional to frequency. + + When satisfied with the result, click 'Save WordCloud to File' to save the image."""), mo.md('---'), mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')] ) @@ -370,7 +435,7 @@ def _( # Make sure this path points to your uploaded file logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" logo = Image.open(logo_path).convert("RGBA") - + # Optional: Resize logo if it's too large or small for the canvas # target_width = 600 # ratio = target_width / logo.width @@ -382,26 +447,26 @@ def _( # Use Image.Resampling.LANCZOS for high-quality downsampling # If you get an error, try Image.LANCZOS or Image.ANTIALIAS logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS) - + # 3. Create the mask (0 = draw here, 255 = don't draw here) # Initialize with 0 (black/draw everywhere) mask_image = Image.new("L", canvas_size, 0) draw = ImageDraw.Draw(mask_image) - + # 4. Draw a protected circular area in the center center = (canvas_size[0] // 2, canvas_size[1] // 2) - + # Calculate radius: half of logo max dimension + buffer radius = (max(logo.size) // 2) + buffer - + # Draw the white circle (255) which the WordCloud will avoid draw.ellipse( (center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius), fill=255 ) - + chase_mask = np.array(mask_image) - + # Generate the WordCloud wordcloud = WordCloud( background_color='white', diff --git a/pyproject.toml b/pyproject.toml index 5286dbd..51cbf54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "numpy>=2.3.5", "ollama>=0.6.1", "openai>=2.9.0", + "openpyxl>=3.1.5", "pandas>=2.3.3", "pyzmq>=27.1.0", "requests>=2.32.5", diff --git a/utils/keyword_analysis.py b/utils/keyword_analysis.py index c66666a..5182012 100644 --- a/utils/keyword_analysis.py +++ b/utils/keyword_analysis.py @@ -48,38 +48,39 @@ def ollama_keyword_extraction(content, tag, client: Client, model) -> list: """ # Construct prompt for Ollama model - prompt = f""" -### Role -You are a qualitative data analyst. Your task is to extract keywords from a user quote to build a semantic word cluster. + # Prompt optimized for small models (Llama 3.2): + # - Fewer rules, prioritized by importance + # - Explicit verbatim instruction (prevents truncation errors) + # - Examples that reinforce exact copying + # - Positive framing (do X) instead of negative (don't do Y) + # - Minimal formatting overhead + prompt = f"""Extract keywords from interview quotes for thematic analysis. -### Guidelines -1. **Quantity:** Extract **1-5** high-value keywords. If the quote only contains 1 valid insight, return only 1 keyword. Do not force extra words. -2. **Specificity:** Avoid vague, single nouns (e.g., "tech", "choice", "system"). Instead, capture the descriptor (e.g., "tech-forward", "payment choice", "legacy system"). -3. **Adjectives:** Standalone adjectives are acceptable if they are strong descriptors (e.g., "reliable", "trustworthy", "professional"). -4. **Normalize:** Convert verbs to present tense and nouns to singular. -5. **Output Format:** Return a single JSON object with the key "keywords" containing a list of strings. +RULES (in priority order): +1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords. +2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional"). +3. Extract 1-5 keywords or short phrases that capture key themes. +4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech"). -### Examples +EXAMPLES: -**Input Context:** Chase as a Brand -**Input Quote:** "I would describe it as, you know, like the next big thing, like, you know, tech forward, you know, customer service forward, and just hating that availability." -**Output:** {{ "keywords": ["tech forward", "customer service focused", "availability"] }} +Context: Chase as a Brand +Quote: "It's definitely not, like, youthful or trendy." +Output: {{"keywords": ["traditional", "established"]}} -**Input Context:** App Usability -**Input Quote:** "There are so many options when I try to pay, it's confusing." -**Output:** {{ "keywords": ["confusing", "payment options"] }} +Context: App Usability +Quote: "There are so many options when I try to pay, it's confusing." +Output: {{"keywords": ["confusing", "overwhelming options"]}} -**Input Context:** Investment Tools -**Input Quote:** "It is just really reliable." -**Output:** {{ "keywords": ["reliable"] }} +Context: Brand Perception +Quote: "I would say reliable, trustworthy, kind of old-school." +Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}} -### Input Data -**Context/Theme:** {tag} -**Quote:** "{content}" +NOW EXTRACT KEYWORDS: -### Output -```json -""" +Context: {tag} +Quote: "{content}" +Output:""" max_retries = 3 for attempt in range(max_retries): diff --git a/uv.lock b/uv.lock index 5a28410..d8cdcd5 100644 --- a/uv.lock +++ b/uv.lock @@ -379,6 +379,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl", hash = "sha256:bd772e4aca73aff037958d44f2be5229ded4c09927fcf8690c577b66234d6ceb", size = 633032, upload-time = "2025-11-06T02:35:52.391Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "fonttools" version = "4.61.1" @@ -546,6 +555,7 @@ dependencies = [ { name = "numpy" }, { name = "ollama" }, { name = "openai" }, + { name = "openpyxl" }, { name = "pandas" }, { name = "pyzmq" }, { name = "requests" }, @@ -560,6 +570,7 @@ requires-dist = [ { name = "numpy", specifier = ">=2.3.5" }, { name = "ollama", specifier = ">=0.6.1" }, { name = "openai", specifier = ">=2.9.0" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.3" }, { name = "pyzmq", specifier = ">=27.1.0" }, { name = "requests", specifier = ">=2.32.5" }, @@ -1176,6 +1187,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.10.0"