Added functionality to load keywords from an Excel file

This commit is contained in:
2025-12-16 22:25:12 -08:00
parent e81961b819
commit e90b41f648
4 changed files with 181 additions and 91 deletions

View File

@@ -104,6 +104,22 @@ def _(mo):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(all_tags_df, mo): def _(all_tags_df, mo):
tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process",
# value="Chase as a brand",
full_width=True,
)
tag_select
return (tag_select,)
@app.cell
def _(WORKING_DIR, all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
start_processing_btn = None start_processing_btn = None
start_processing_btn = mo.ui.button( start_processing_btn = mo.ui.button(
label="Start Keyword Extraction", label="Start Keyword Extraction",
@@ -111,26 +127,27 @@ def _(all_tags_df, mo):
on_click=lambda val: True on_click=lambda val: True
) )
tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process",
value="Chase as a brand",
full_width=True,
)
tag_select
return start_processing_btn, tag_select
@app.cell
def _(all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
tag_fname = tag_select.value.replace(" ", "-").replace('/','-') tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
# filter all_tags_df to only the document = file_dropdown.value # filter all_tags_df to only the document = file_dropdown.value
df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy() tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
df tags_df
return df, tag_fname return (
KEYWORDS_FPATH,
KEYWORD_FREQ_FPATH,
SAVE_DIR,
start_processing_btn,
tag_fname,
tags_df,
)
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -141,22 +158,24 @@ def _(mo):
return return
@app.cell(hide_code=True) @app.cell
def _(mo, start_processing_btn, tag_select): def _(mo, start_processing_btn, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue")) mo.stop(not tag_select.value, mo.md("Select tag to continue"))
# mdf = mpd.from_pandas(df)
start_processing_btn start_processing_btn
return return
@app.cell(hide_code=True) @app.cell
def _(client, df, mo, model_select, pd, start_processing_btn): def _(client, mo, model_select, pd, start_processing_btn, tags_df):
from utils import ollama_keyword_extraction, worker_extraction from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button # Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing") mo.stop(not start_processing_btn.value, "Click button above to start processing")
df = tags_df
# Run keyword extraction # Run keyword extraction
df['keywords'] = df.progress_apply( df['keywords'] = df.progress_apply(
lambda row: pd.Series(ollama_keyword_extraction( lambda row: pd.Series(ollama_keyword_extraction(
content=row['content'], content=row['content'],
@@ -166,31 +185,17 @@ def _(client, df, mo, model_select, pd, start_processing_btn):
)), )),
axis=1 axis=1
) )
return (df,)
return @app.cell
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
mo.stop(not start_processing_btn.value, "Click button above to process first")
@app.cell(hide_code=True)
def _(WORKING_DIR, df, mo, pd, tag_fname):
# Save results to csv
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish")
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws)) df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx',
index=False
)
all_keywords_list = df['keywords'].tolist() all_keywords_list = df['keywords'].tolist()
all_keywords_flat = [item for sublist in all_keywords_list for item in sublist] all_keywords_flat = [item for sublist in all_keywords_list for item in sublist]
# Calculate frequencies per keyword # Calculate frequencies per keyword
@@ -206,16 +211,60 @@ def _(WORKING_DIR, df, mo, pd, tag_fname):
freq_df.reset_index(inplace=True) freq_df.reset_index(inplace=True)
freq_df.sort_values(by='frequency', ascending=False, inplace=True) freq_df.sort_values(by='frequency', ascending=False, inplace=True)
_freq_fpath = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
# Save to Excel files
df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
KEYWORDS_FPATH,
index=False
)
freq_df.to_excel( freq_df.to_excel(
_freq_fpath, KEYWORD_FREQ_FPATH,
index=False index=False
) )
mo.vstack([ mo.vstack([
mo.md(f"Keywords per-highligh saved to: `{SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'}`"), mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
mo.md(f"Keyword frequencies saved to: `{_freq_fpath}`") mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
]) ])
return SAVE_DIR, keyword_freq return (freq_df,)
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: renders the heading for the optional step 4b, which
    # covers loading data from a `keyword_frequencies_*.xlsx` file.
    mo.md(r"""
# 4b) [optional] Load data from `keyword_frequencies_*.xlsx`
""")
    return
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo):
    # Build the "load from Excel" run button only when the keyword-frequency
    # workbook is already on disk; otherwise the name stays bound to None.
    load_existing_btn = (
        mo.ui.run_button(label=f"Load keywords from `{KEYWORD_FREQ_FPATH.name}`")
        if KEYWORD_FREQ_FPATH.exists()
        else None
    )
    # Bare expression so the button (or nothing) is the cell's output.
    load_existing_btn
    return (load_existing_btn,)
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd):
    # Pick the keyword-frequency table exposed as `frequency_df`:
    #   - the workbook at KEYWORD_FREQ_FPATH when the load button was clicked,
    #   - otherwise the `freq_df` computed by the extraction step.
    #
    # `load_existing_btn` is None when the workbook did not exist at the time
    # the button cell ran, so check for None before reading `.value`; without
    # the guard this cell raises AttributeError.
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')

        # Drop rows with a missing keyword or frequency, then order by
        # descending frequency with a fresh 0..n-1 index.
        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)

        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
        frequency_df = _fdf
    else:
        frequency_df = freq_df
    return (frequency_df,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -228,7 +277,7 @@ def _(mo):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(): def _():
# Start with loading all necessary libraries # Import all necessary libraries
import numpy as np import numpy as np
from os import path from os import path
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
@@ -257,18 +306,26 @@ def _(mo):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(df, keyword_freq, min_freq_select, mo): def _(freq_df, frequency_df, min_freq_select, mo):
mo.stop('keywords' not in df.columns, "Waiting for keyword extraction to finish") mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value MIN_FREQ = min_freq_select.value
freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ]
keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} freq_df_filtered.reset_index(drop=True, inplace=True)
# create list of keywords sorted by their frequencies. only store the keyword keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords] table_selection = mo.ui.table(freq_df_filtered, page_size=50)
sorted_keywords_list table_selection
# keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ}
# # create list of keywords sorted by their frequencies. only store the keyword
# sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True)
# sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
# sorted_keywords_list
return (keyword_freq_filtered,) return (keyword_freq_filtered,)
@@ -278,8 +335,10 @@ def _(mo, tag_select):
## 5.2) Inspect Keyword Dataset ## 5.2) Inspect Keyword Dataset
1. Check the threshold is set correctly. If not, adjust accordingly 1. Check the threshold is set correctly. If not, adjust accordingly
2. Check the keywords are good. If not, run extraction again (step 4) 2. Read all the keywords and verify they are good. If not
3. Add explicit exclusions if necessary - Add explicit exclusions if necessary below
- OR Rerun the keyword extraction above
Add words to this dict that should be ignored in the WordCloud for specific tags. Add words to this dict that should be ignored in the WordCloud for specific tags.
@@ -299,7 +358,10 @@ def _():
"banking", "banking",
"chase", "chase",
"jpmorgan", "jpmorgan",
"youthful" "youthful",
"customer service",
"customer service focused",
"great brand",
], ],
'why customer chase': [ 'why customer chase': [
"customer service", "customer service",
@@ -322,17 +384,20 @@ def _(mo):
canvas_size = (1200, 800) canvas_size = (1200, 800)
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False) logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
return buffer, canvas_size, logo_switch return buffer, canvas_size, logo_switch
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(logo_switch, mo): def _(logo_switch, mo):
run_wordcloud_btn = mo.ui.run_button(label="(Re-) Generate WordCloud") run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
mo.vstack([ mo.vstack([
mo.md("## 5.4) Generate WordCloud with/without Logo"), mo.md("## 5.4) Generate WordCloud with/without Logo"),
mo.md("Adjust the settings and click the button below to (re-)generate the WordCloud. \n\nWhen satisfied with the result, click 'Save WordCloud to File' to save the image."), mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
Placement and color of words is randomized, size is proportional to frequency.
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
mo.md('---'), mo.md('---'),
mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')] mo.hstack([logo_switch, run_wordcloud_btn], align='center', justify='space-around')]
) )
@@ -370,7 +435,7 @@ def _(
# Make sure this path points to your uploaded file # Make sure this path points to your uploaded file
logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png"
logo = Image.open(logo_path).convert("RGBA") logo = Image.open(logo_path).convert("RGBA")
# Optional: Resize logo if it's too large or small for the canvas # Optional: Resize logo if it's too large or small for the canvas
# target_width = 600 # target_width = 600
# ratio = target_width / logo.width # ratio = target_width / logo.width
@@ -382,26 +447,26 @@ def _(
# Use Image.Resampling.LANCZOS for high-quality downsampling # Use Image.Resampling.LANCZOS for high-quality downsampling
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS # If you get an error, try Image.LANCZOS or Image.ANTIALIAS
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS) logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
# 3. Create the mask (0 = draw here, 255 = don't draw here) # 3. Create the mask (0 = draw here, 255 = don't draw here)
# Initialize with 0 (black/draw everywhere) # Initialize with 0 (black/draw everywhere)
mask_image = Image.new("L", canvas_size, 0) mask_image = Image.new("L", canvas_size, 0)
draw = ImageDraw.Draw(mask_image) draw = ImageDraw.Draw(mask_image)
# 4. Draw a protected circular area in the center # 4. Draw a protected circular area in the center
center = (canvas_size[0] // 2, canvas_size[1] // 2) center = (canvas_size[0] // 2, canvas_size[1] // 2)
# Calculate radius: half of logo max dimension + buffer # Calculate radius: half of logo max dimension + buffer
radius = (max(logo.size) // 2) + buffer radius = (max(logo.size) // 2) + buffer
# Draw the white circle (255) which the WordCloud will avoid # Draw the white circle (255) which the WordCloud will avoid
draw.ellipse( draw.ellipse(
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius), (center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
fill=255 fill=255
) )
chase_mask = np.array(mask_image) chase_mask = np.array(mask_image)
# Generate the WordCloud # Generate the WordCloud
wordcloud = WordCloud( wordcloud = WordCloud(
background_color='white', background_color='white',

View File

@@ -10,6 +10,7 @@ dependencies = [
"numpy>=2.3.5", "numpy>=2.3.5",
"ollama>=0.6.1", "ollama>=0.6.1",
"openai>=2.9.0", "openai>=2.9.0",
"openpyxl>=3.1.5",
"pandas>=2.3.3", "pandas>=2.3.3",
"pyzmq>=27.1.0", "pyzmq>=27.1.0",
"requests>=2.32.5", "requests>=2.32.5",

View File

@@ -48,38 +48,39 @@ def ollama_keyword_extraction(content, tag, client: Client, model) -> list:
""" """
# Construct prompt for Ollama model # Construct prompt for Ollama model
prompt = f""" # Prompt optimized for small models (Llama 3.2):
### Role # - Fewer rules, prioritized by importance
You are a qualitative data analyst. Your task is to extract keywords from a user quote to build a semantic word cluster. # - Explicit verbatim instruction (prevents truncation errors)
# - Examples that reinforce exact copying
# - Positive framing (do X) instead of negative (don't do Y)
# - Minimal formatting overhead
prompt = f"""Extract keywords from interview quotes for thematic analysis.
### Guidelines RULES (in priority order):
1. **Quantity:** Extract **1-5** high-value keywords. If the quote only contains 1 valid insight, return only 1 keyword. Do not force extra words. 1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. **Specificity:** Avoid vague, single nouns (e.g., "tech", "choice", "system"). Instead, capture the descriptor (e.g., "tech-forward", "payment choice", "legacy system"). 2. Use words from the quote, but generalize for clustering (e.g., "not youthful" "traditional").
3. **Adjectives:** Standalone adjectives are acceptable if they are strong descriptors (e.g., "reliable", "trustworthy", "professional"). 3. Extract 1-5 keywords or short phrases that capture key themes.
4. **Normalize:** Convert verbs to present tense and nouns to singular. 4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").
5. **Output Format:** Return a single JSON object with the key "keywords" containing a list of strings.
### Examples EXAMPLES:
**Input Context:** Chase as a Brand Context: Chase as a Brand
**Input Quote:** "I would describe it as, you know, like the next big thing, like, you know, tech forward, you know, customer service forward, and just hating that availability." Quote: "It's definitely not, like, youthful or trendy."
**Output:** {{ "keywords": ["tech forward", "customer service focused", "availability"] }} Output: {{"keywords": ["traditional", "established"]}}
**Input Context:** App Usability Context: App Usability
**Input Quote:** "There are so many options when I try to pay, it's confusing." Quote: "There are so many options when I try to pay, it's confusing."
**Output:** {{ "keywords": ["confusing", "payment options"] }} Output: {{"keywords": ["confusing", "overwhelming options"]}}
**Input Context:** Investment Tools Context: Brand Perception
**Input Quote:** "It is just really reliable." Quote: "I would say reliable, trustworthy, kind of old-school."
**Output:** {{ "keywords": ["reliable"] }} Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}
### Input Data NOW EXTRACT KEYWORDS:
**Context/Theme:** {tag}
**Quote:** "{content}"
### Output Context: {tag}
```json Quote: "{content}"
""" Output:"""
max_retries = 3 max_retries = 3
for attempt in range(max_retries): for attempt in range(max_retries):

23
uv.lock generated
View File

@@ -379,6 +379,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl", hash = "sha256:bd772e4aca73aff037958d44f2be5229ded4c09927fcf8690c577b66234d6ceb", size = 633032, upload-time = "2025-11-06T02:35:52.391Z" }, { url = "https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl", hash = "sha256:bd772e4aca73aff037958d44f2be5229ded4c09927fcf8690c577b66234d6ceb", size = 633032, upload-time = "2025-11-06T02:35:52.391Z" },
] ]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
]
[[package]] [[package]]
name = "fonttools" name = "fonttools"
version = "4.61.1" version = "4.61.1"
@@ -546,6 +555,7 @@ dependencies = [
{ name = "numpy" }, { name = "numpy" },
{ name = "ollama" }, { name = "ollama" },
{ name = "openai" }, { name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pyzmq" }, { name = "pyzmq" },
{ name = "requests" }, { name = "requests" },
@@ -560,6 +570,7 @@ requires-dist = [
{ name = "numpy", specifier = ">=2.3.5" }, { name = "numpy", specifier = ">=2.3.5" },
{ name = "ollama", specifier = ">=0.6.1" }, { name = "ollama", specifier = ">=0.6.1" },
{ name = "openai", specifier = ">=2.9.0" }, { name = "openai", specifier = ">=2.9.0" },
{ name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pandas", specifier = ">=2.3.3" }, { name = "pandas", specifier = ">=2.3.3" },
{ name = "pyzmq", specifier = ">=27.1.0" }, { name = "pyzmq", specifier = ">=27.1.0" },
{ name = "requests", specifier = ">=2.32.5" }, { name = "requests", specifier = ">=2.32.5" },
@@ -1176,6 +1187,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" }, { url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" },
] ]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]
[[package]] [[package]]
name = "opentelemetry-api" name = "opentelemetry-api"
version = "1.10.0" version = "1.10.0"