Files
Interview-Analysis/02-B_Thematic-Processing.py
2025-12-16 17:44:50 -08:00

437 lines
12 KiB
Python

import marimo
# marimo version string stored with the notebook (per the name, the version that generated this file).
__generated_with = "0.18.3"
# Notebook application object; each cell below is registered on it through the @app.cell decorator.
app = marimo.App(width="medium")
@app.cell
def _():
    # Notebook setup: imports, the Ollama client, the input/output directories,
    # and the model picker that is displayed as this cell's output.
    import marimo as mo
    import pandas as pd
    import modin.pandas as mpd
    from tqdm import tqdm
    from pathlib import Path
    from datetime import datetime
    from utils import connect_qumo_ollama

    OLLAMA_LOCATION= 'localhost'
    # VM_NAME = 'ollama-lite'

    # initialize tqdm for pandas (needed for the progress_apply calls further down)
    tqdm.pandas()

    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok=True replaces the separate exists() check: one call, no window
    # between checking and creating, and an error if the path is a regular file.
    for _directory in (WORKING_DIR, TAGUETTE_EXPORT_DIR):
        _directory.mkdir(parents=True, exist_ok=True)

    # Pre-selects the first model reported by the Ollama connection.
    model_select = mo.ui.dropdown(
        options=_models,
        value=_models[0],
        label="Select Ollama Model to use",
        searchable=True,
    )
    model_select
    return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd
# Export instructions for the user. The text is an f-string so that the
# {TAGUETTE_EXPORT_DIR} placeholders render as the configured export directory
# instead of appearing literally in the rendered markdown.
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    mo.md(rf"""
# 1) Export Data out of Taguette
**Highlights**
1. Go to: https://taguette.qumo.io/project/1
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Tags Codebook**
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
""")
    return
# Section heading cell: renders the "2) Import Data" title as markdown.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
# 2) Import Data
""")
    return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Read the exported highlights and give every row a sequential id
    # (0..n-1) in the order the rows appear in the CSV.
    _highlights_csv = f'{TAGUETTE_EXPORT_DIR}/all_tags.csv'
    all_tags_df = pd.read_csv(_highlights_csv)
    all_tags_df['_seq_id'] = range(all_tags_df.shape[0])
    all_tags_df
    return (all_tags_df,)
@app.cell
def _(all_tags_df):
    # Number of highlight rows per tag, shown as a table.
    _tag_column = all_tags_df['tag']
    tag_counts = _tag_column.value_counts().reset_index()
    tag_counts
    return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Read the exported codebook; its 'description' column is exposed as
    # 'theme_description'.
    _codebook_csv = f'{TAGUETTE_EXPORT_DIR}/codebook.csv'
    codebook_df = pd.read_csv(_codebook_csv).rename(
        columns={'description': 'theme_description'}
    )
    codebook_df
    return
# Section heading cell: renders the "3) Select Tag for processing" title as markdown.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
# 3) Select Tag for processing
""")
    return
@app.cell
def _(all_tags_df, mo):
    # Tag picker. The preferred default is only used when that tag actually
    # occurs in the export; otherwise the dropdown starts empty and the cells
    # below stop with "Select tag to continue" instead of this cell failing on
    # a default that is not among the options.
    _DEFAULT_TAG = "Chase as a brand"

    # dropna(): rows without a tag would otherwise add a NaN entry to the options.
    _tag_options = all_tags_df['tag'].dropna().unique().tolist()

    tag_select = mo.ui.dropdown(
        options=_tag_options,
        label="Select Tag to Process",
        value=_DEFAULT_TAG if _DEFAULT_TAG in _tag_options else None,
        full_width=True
    )
    tag_select
    return (tag_select,)
@app.cell
def _(all_tags_df, mo, tag_select):
    # Nothing to show until a tag has been picked.
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))

    # Independent copy of the highlights that carry the selected tag.
    _has_selected_tag = all_tags_df['tag'] == tag_select.value
    df = all_tags_df.loc[_has_selected_tag].copy()
    df
    return (df,)
# Section heading cell: renders the "4) Keyword extraction" title as markdown.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
# 4) Keyword extraction
""")
    return
@app.cell
def _(mo, tag_select):
    # The start button is only offered once a tag has been picked.
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))
    # mdf = mpd.from_pandas(df)

    def _flag_clicked(val):
        # The button's value becomes True after a click.
        return True

    start_processing_btn = mo.ui.button(
        on_click=_flag_clicked,
        kind="warn",
        label="Start Keyword Extraction",
    )
    start_processing_btn
    return (start_processing_btn,)
@app.cell
def _(
    WORKING_DIR,
    client,
    df,
    mo,
    model_select,
    pd,
    start_processing_btn,
    tag_select,
):
    # Runs keyword extraction for every highlight of the selected tag, stores the
    # result on `df` (columns 'keywords' and 'keywords_txt') and writes a CSV
    # with the comma-joined keywords into WORKING_DIR.
    from utils import ollama_keyword_extraction, worker_extraction

    # Do nothing until the start button has been clicked.
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    def _extract_row_keywords(row):
        # NOTE(review): the helper's return shape is not visible here. The code
        # below joins and iterates each 'keywords' entry, so it presumably ends
        # up as a sequence of strings per row; confirm against utils.
        _result = ollama_keyword_extraction(
            content=row['content'],
            tag=row['tag'],
            client=client,
            model=model_select.value
        )
        return pd.Series(_result)

    def _join_keywords(kws):
        return ', '.join(kws)

    df['keywords'] = df.progress_apply(_extract_row_keywords, axis=1)
    df['keywords_txt'] = df['keywords'].progress_apply(_join_keywords)

    # One CSV per tag; spaces in the tag name become dashes in the file name.
    _csv_name = f'keywords_{tag_select.value.replace(" ", "-")}.csv'
    _export_columns = ['id', 'tag', 'content', 'keywords_txt']
    df[_export_columns].to_csv(WORKING_DIR / _csv_name, index=False)
    return
# Section heading cell: renders the "5) Wordcloud generation" title as markdown.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
# 5) Wordcloud generation
""")
    return
@app.cell
def _():
    # Libraries used by the word-cloud section.
    import warnings
    from os import path

    import matplotlib.pyplot as plt
    import numpy as np
    from PIL import Image, ImageDraw
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

    # Silences every warning category from here on.
    warnings.filterwarnings("ignore")
    return Image, ImageDraw, WordCloud, np, plt
@app.cell
def _(df):
    # Counts how often each keyword occurs across the selected tag's highlights
    # and keeps those seen at least MIN_FREQ times. The cell output lists the
    # kept keywords as "keyword:count", most frequent first.
    from collections import Counter as _Counter
    from itertools import chain as _chain

    MIN_FREQ = 2

    # Each entry of df['keywords'] is iterated as a collection of keywords.
    all_keywords_flat = list(_chain.from_iterable(df['keywords'].tolist()))
    keyword_freq = _Counter(all_keywords_flat)

    keyword_freq_filtered = {
        kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ
    }

    # (keyword, count) pairs ordered by descending count; ties keep first-seen order
    sorted_keywords = sorted(
        keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True
    )
    sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords]
    sorted_keywords_list
    return (keyword_freq_filtered,)
@app.cell
def _():
    # Keywords to leave out of the word cloud, keyed by tag name in lower case
    # (the rendering cell looks entries up with tag_select.value.lower()).
    IGNORE_WORDS = {'chase as a brand': ["brand"]}
    return (IGNORE_WORDS,)
@app.cell
def _(plt):
    import random
    import matplotlib.colors as mcolors

    def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        """Return a random hex colour from the darker part of the 'Blues' colormap.

        Only ``random_state`` is used; the other parameters are accepted so the
        function can be passed as WordCloud's ``color_func``.

        Args:
            random_state: Object with a ``uniform(a, b)`` method used to draw
                the shade. When ``None``, the ``random`` module is used.

        Returns:
            Hex colour string for a colormap position drawn between 0.4 and 1.0.
        """
        # Explicit None check: a supplied generator is used regardless of its truthiness.
        rng = random if random_state is None else random_state
        # 0.0 is white/light and 1.0 is dark blue; stay in the darker range.
        min_val, max_val = 0.4, 1.0
        shade = rng.uniform(min_val, max_val)
        return mcolors.to_hex(plt.cm.Blues(shade))
    return (blue_color_func,)
@app.cell
def _():
    # Disabled code kept for reference; nothing in this cell executes.
    # It loaded a Chase logo image as an array and built a mask from it by
    # mapping pixel value 0 to 255 and every other value to 1.
    # chase_mask = np.array(Image.open("./data/assets/Chase-National-Bank-Logo.png"))
    # def transform_format(val):
    # if val == 0:
    # return 255
    # else:
    # return 1
    # transformed_chase_mask = np.ndarray((chase_mask.shape[0], chase_mask.shape[1]), np.int32)
    # for i in range(len(chase_mask)):
    # transformed_chase_mask[i] = list(map(transform_format, chase_mask[i]))
    return
@app.cell
def _(mo):
    # Layout settings for the word cloud plus the logo on/off switch.
    canvas_size = (1200, 800)  # (width, height) of the generated image
    # Added to half of the logo's larger dimension to get the radius of the
    # word-free circle; change it to widen or narrow the gap around the logo.
    buffer = -100
    logo_switch = mo.ui.switch(value=False, label="Include Chase Logo")
    logo_switch
    return buffer, canvas_size, logo_switch
@app.cell(hide_code=True)
def _(
    IGNORE_WORDS,
    Image,
    ImageDraw,
    WordCloud,
    blue_color_func,
    buffer,
    canvas_size,
    keyword_freq_filtered,
    logo_switch,
    mo,
    np,
    plt,
    tag_select,
):
    # Renders the word cloud for the selected tag, optionally with the logo
    # pasted in the centre, displays it, and offers a save button.
    # Exposes `wc_image` (the rendered PIL image) and `save_wordcloud_btn`.

    def _load_logo(path, max_width):
        # Load the logo as RGBA, scaled down (never up) to at most max_width pixels wide.
        # The context manager closes the file once the converted copy exists.
        with Image.open(path) as src:
            img = src.convert("RGBA")
        if img.width > max_width:
            scale = max_width / img.width
            img = img.resize(
                (max_width, int(img.height * scale)), Image.Resampling.LANCZOS
            )
        return img

    def _center_exclusion_mask(size, radius):
        # Mask array with a filled circle in the middle of the canvas:
        # 0 = words may be drawn, 255 = area the word cloud leaves empty.
        mask = Image.new("L", size, 0)
        cx, cy = size[0] // 2, size[1] // 2
        ImageDraw.Draw(mask).ellipse(
            (cx - radius, cy - radius, cx + radius, cy + radius),
            fill=255
        )
        return np.array(mask)

    # Same guard as the other tag-dependent cells; without a tag, .lower() below would fail.
    mo.stop(not tag_select.value, mo.md("Select tag to continue"))

    # Drop tag-specific ignore words on a copy, so the dict owned by the
    # frequency cell is not modified from here.
    _ignored = set(IGNORE_WORDS.get(tag_select.value.lower()) or ())
    _frequencies = {
        _kw: _freq
        for _kw, _freq in keyword_freq_filtered.items()
        if _kw not in _ignored
    }
    # WordCloud cannot be generated from an empty frequency table.
    mo.stop(not _frequencies, mo.md("No keywords left to plot for the selected tag"))

    # Options shared by both variants (with and without logo).
    _wc_options = dict(
        background_color='white',
        width=canvas_size[0],
        height=canvas_size[1],
        max_font_size=100,
        max_words=20,
        color_func=blue_color_func,
    )

    _logo = None
    if logo_switch.value:
        _logo = _load_logo("./data/assets/JP-Morgan-Chase-Symbol.png", 600)
        # Radius: half of the logo's larger dimension plus buffer, never negative.
        _radius = max((max(_logo.size) // 2) + buffer, 0)
        _wc_options.update(
            mask=_center_exclusion_mask(canvas_size, _radius),
            contour_width=0,
            contour_color='steelblue',
        )

    wordcloud = WordCloud(**_wc_options).generate_from_frequencies(_frequencies)

    # Convert to a PIL image so the logo can be composited on top.
    wc_image = wordcloud.to_image()
    if _logo is not None:
        _logo_pos = (
            (canvas_size[0] - _logo.width) // 2,
            (canvas_size[1] - _logo.height) // 2
        )
        # The logo's own alpha channel is the paste mask, keeping its transparency.
        wc_image.paste(_logo, _logo_pos, _logo)

    # Display the generated image
    fig = plt.figure(figsize=(7,7))
    plt.imshow(wc_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    save_wordcloud_btn = mo.ui.button(
        label="Save_wordcloud_button",
        kind="warn",
        on_click=lambda val: True
    )
    save_wordcloud_btn
    return save_wordcloud_btn, wc_image
@app.cell
def _(WORKING_DIR, mo, save_wordcloud_btn, tag_select, wc_image):
    # Saves the rendered word cloud into WORKING_DIR without overwriting earlier
    # saves: the first file is 'wordcloud_<tag>.png', later ones get '_1', '_2', ...
    import re as _re

    # Wait for the save button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")

    _stem = f'wordcloud_{tag_select.value.replace(" ", "-")}'
    filename = f'{_stem}.png'

    # Match this tag's files exactly ('<stem>' or '<stem>_<number>'). A prefix
    # glob would also pick up files of other tags that merely start with the
    # same text, and would misread glob metacharacters inside a tag name.
    _own_file = _re.compile(rf'{_re.escape(_stem)}(?:_(\d+))?')
    _taken_numbers = []
    for _candidate in WORKING_DIR.glob('wordcloud_*.png'):
        _match = _own_file.fullmatch(_candidate.stem)
        if _match:
            # The unnumbered file counts as number 0, so the next save becomes '_1'.
            _taken_numbers.append(int(_match.group(1) or 0))

    if _taken_numbers:
        fpath = WORKING_DIR / f'{_stem}_{max(_taken_numbers) + 1}.png'
    else:
        fpath = WORKING_DIR / filename

    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
    return
# Calls app.run() when this file is executed directly rather than imported.
if __name__ == "__main__":
    app.run()