Compare commits

..

27 Commits

Author SHA1 Message Date
069e568d00 final tweaks 2025-12-17 01:37:42 -08:00
417273c745 voice keyword blacklist 2025-12-17 01:19:22 -08:00
eee6947f01 rename 2025-12-17 00:25:03 -08:00
d6b449e8c6 add warning message and increase n words 2025-12-16 23:56:13 -08:00
8fbc11da7a Inline removal of keywords 2025-12-16 23:42:25 -08:00
50f9538dcf format for consecutive runs 2025-12-16 23:21:03 -08:00
e90b41f648 added functionality to load keywords from excel file 2025-12-16 22:25:12 -08:00
e81961b819 cleanup notebook and make usable 2025-12-16 20:15:44 -08:00
4ba8af03d2 logo in word cloud 2025-12-16 17:44:50 -08:00
228a6daa59 progress apply 2025-12-16 16:28:07 -08:00
12e14e3c9b keywords 2025-12-16 14:39:54 -08:00
a5ffd8315e cpc1 afwijking 2025-12-14 20:02:40 +01:00
c2a5c12794 update import to work with CPC and SMB 2025-12-12 21:26:35 +01:00
ccc5154b93 llm processing of sentiment 2025-12-12 14:28:51 +01:00
e576f98cce basic parsing working 2025-12-11 12:56:23 +01:00
b023d44934 minor edits 2025-12-10 08:34:57 +01:00
ad00860fa1 added local ollama support 2025-12-10 08:28:01 +01:00
b214e7ab17 cleanup instructions 2025-12-10 07:41:15 +01:00
7f951d9ee5 Aggregation step 2025-12-09 22:33:51 +01:00
821fa01edb sentiments saving to intermediate csv 2025-12-09 21:40:54 +01:00
514570062c restructure analysis 2025-12-09 21:05:07 +01:00
beddfee087 rename and start post process 2025-12-09 13:58:11 +01:00
60d2876725 preview md 2025-12-08 11:31:03 +01:00
ab4ee4b34a Merge branch 'main' of gitea.tail44fa00.ts.net:Qumo/Interview-Analysis 2025-12-07 21:38:48 +01:00
8cc2bc9087 taguette pre-process 2025-12-07 21:37:42 +01:00
mtorsij
523a59f864 Added taguette to uv 2025-12-06 11:12:38 +01:00
98202ac3f2 architecture clarification 2025-12-03 12:12:23 +01:00
21 changed files with 3741 additions and 124 deletions

3
.gitignore vendored
View File

@@ -12,3 +12,6 @@ __marimo__
__pycache__/
data/
docker-volumes/
logs/

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

114
01_Taguette-Pre-Process.py Normal file
View File

@@ -0,0 +1,114 @@
import marimo
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from utils import csv_to_markdown, cpc_smb_to_markdown
return Path, cpc_smb_to_markdown, csv_to_markdown, mo
@app.cell
def _(Path):
INPUT_DIR = Path("data/transcripts/raw")
OUTPUT_DIR = Path("data/transcripts/clean")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
return INPUT_DIR, OUTPUT_DIR
@app.cell
def _(INPUT_DIR, mo):
csv_files = list(INPUT_DIR.glob("*.csv"))
file_options = {f.stem: str(f) for f in csv_files}
file_dropdown = mo.ui.dropdown(
options=file_options,
label="Select CSV Transcript",
full_width=True
)
file_dropdown
return (file_dropdown,)
@app.cell
def _(Path, cpc_smb_to_markdown, csv_to_markdown):
    def jpmc_transcript_to_md(filepath):
        """Convert a transcript CSV at *filepath* to markdown text.

        Tries the generic ``csv_to_markdown`` parser first and falls back
        to the CPC/SMB-specific ``cpc_smb_to_markdown`` parser.

        Raises:
            ValueError: if neither parser can handle the file; the message
                carries both underlying errors, and the fallback error is
                chained (``from e2``) so its traceback survives.
        """
        try:
            return csv_to_markdown(filepath)
        except Exception as e:
            try:
                return cpc_smb_to_markdown(filepath)
            except Exception as e2:
                # Chain the fallback failure so debugging keeps the real traceback.
                raise ValueError(
                    f"Failed to process file {filepath} with errors: {e}, {e2}"
                ) from e2
    return (jpmc_transcript_to_md,)
@app.cell(hide_code=True)
def _(file_dropdown, jpmc_transcript_to_md, mo):
# Preview
preview = mo.md("")
if file_dropdown.value:
md_content = jpmc_transcript_to_md(file_dropdown.value)
preview = mo.md(md_content[:1000])
preview
return
@app.cell
def _(mo):
convert_btn = mo.ui.run_button(label="Convert to Markdown")
convert_btn
return (convert_btn,)
@app.cell
def _(OUTPUT_DIR, Path, convert_btn, file_dropdown, jpmc_transcript_to_md, mo):
result = mo.md("")
saved_md = None
if convert_btn.value and file_dropdown.value:
saved_md = jpmc_transcript_to_md(file_dropdown.value)
_out_path = OUTPUT_DIR / (Path(file_dropdown.value).stem + ".md")
_out_path.write_text(saved_md)
result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success")
result
return (saved_md,)
@app.cell
def _(mo, saved_md):
saved_preview = mo.md("")
if saved_md:
saved_preview = mo.vstack([
mo.md("### Saved Markdown Preview"),
mo.md(saved_md)
])
saved_preview
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Taguette
Upload and process using taguette: http://taguette.tail44fa00.ts.net/
""")
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()

666
02-B_WordClouds.py Normal file
View File

@@ -0,0 +1,666 @@
import marimo
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
import modin.pandas as mpd
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from utils import connect_qumo_ollama
OLLAMA_LOCATION= 'localhost'
# VM_NAME = 'ollama-lite'
# initialize tqdm for pandas
tqdm.pandas()
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02-b_WordClouds')
VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
VOICE_EXCLUDE_KEYWORDS_FILE.touch()
return (
OLLAMA_LOCATION,
TAGUETTE_EXPORT_DIR,
VOICE_EXCLUDE_KEYWORDS_FILE,
WORKING_DIR,
connect_qumo_ollama,
mo,
pd,
)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# 1) Export Data out of Taguette
**Highlights**
1. Go to: https://taguette.qumo.io/project/1
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Tags Codebook**
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# 2) Import Data
""")
return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df))
# all_tags_df
return (all_tags_df,)
@app.cell
def _(all_tags_df):
# get count of rows per tag
tag_counts = all_tags_df['tag'].value_counts().reset_index()
# tag_counts
return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
# codebook_df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# 3) Select Tag for processing
""")
return
@app.cell(hide_code=True)
def _(all_tags_df, mo):
tag_select = mo.ui.dropdown(
options=all_tags_df['tag'].unique().tolist(),
label="Select Tag to Process",
# value="Chase as a brand",
full_width=True,
)
tag_select
return (tag_select,)
@app.cell(hide_code=True)
def _(WORKING_DIR, all_tags_df, mo, tag_select):
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
start_processing_btn = None
start_processing_btn = mo.ui.button(
label="Start Keyword Extraction",
kind="warn",
on_click=lambda val: True
)
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
SAVE_DIR = WORKING_DIR / tag_fname
if not SAVE_DIR.exists():
SAVE_DIR.mkdir(parents=True)
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
# filter all_tags_df to only the document = file_dropdown.value
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
tags_df.head()
return (
KEYWORDS_FPATH,
KEYWORD_FREQ_FPATH,
SAVE_DIR,
start_processing_btn,
tag_fname,
tags_df,
)
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo):
mo.md(rf"""
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
""")
return
@app.cell(hide_code=True)
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
try:
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
except Exception as e:
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
model_select = None
client = None
model_select
return client, model_select
@app.cell
def _(mo, model_select, start_processing_btn, tag_select):
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
start_processing_btn
return
@app.cell(hide_code=True)
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
from utils import ollama_keyword_extraction, worker_extraction
# Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing")
if client is not None:
df = tags_df
# Run keyword extraction
df['keywords'] = df.progress_apply(
lambda row: pd.Series(ollama_keyword_extraction(
content=row['content'],
tag=row['tag'],
client=client,
model=model_select.value
)),
axis=1
)
else:
mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
return (df,)
@app.cell(hide_code=True)
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
    # Aggregate per-highlight keyword lists into a frequency table and
    # persist both the per-highlight keywords and the frequencies to Excel.
    from collections import Counter

    mo.stop(not start_processing_btn.value, "Click button above to process first")
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))
    # Flatten the list-of-lists of keywords, then count occurrences.
    # Counter preserves first-insertion order, matching the manual dict loop
    # it replaces, so tie-breaking in the sort below is unchanged.
    all_keywords_flat = [kw for kws in df['keywords'].tolist() for kw in kws]
    keyword_freq = dict(Counter(all_keywords_flat))
    freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
    freq_df.index.name = 'keyword'
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)
    # Save to Excel files
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        KEYWORDS_FPATH,
        index=False
    )
    freq_df.to_excel(
        KEYWORD_FREQ_FPATH,
        index=False
    )
    mo.vstack([
        mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
        mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
    ])
    return (freq_df,)
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo):
mo.md(rf"""
# 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
""")
return
@app.cell(hide_code=True)
def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
pass
load_existing_btn = None
if KEYWORD_FREQ_FPATH.exists():
load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
load_existing_btn
return (load_existing_btn,)
@app.cell(hide_code=True)
def _(
    KEYWORD_FREQ_FPATH,
    VOICE_EXCLUDE_KEYWORDS_FILE,
    freq_df,
    load_existing_btn,
    pd,
    tag_select,
):
    # Either reload keyword frequencies from the saved xlsx (when the user
    # clicked the load button) or reuse the in-memory `freq_df`, then drop
    # blacklisted keywords for Voice ('V...') tags.
    if load_existing_btn is not None and load_existing_btn.value:
        _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
        # Drop nan rows if any
        _fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
        _fdf.sort_values(by='frequency', ascending=False, inplace=True)
        _fdf.reset_index(drop=True, inplace=True)
        print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
        frequency_df = _fdf
    else:
        frequency_df = freq_df
    # Guard: tag_select.value is None until the user picks a tag; this cell
    # has no mo.stop of its own, so it can run before a tag is selected.
    if tag_select.value and tag_select.value.startswith('V'):
        # Read exclusion list (one keyword per line).
        with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
            excl_kw = [line.strip() for line in _f]
        _drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
        frequency_df.drop(index=_drop_idx, inplace=True)
        print(f"Dropped {len(_drop_idx)} keywords automatically")
    return (frequency_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# 5) Wordcloud generation
""")
return
@app.cell(hide_code=True)
def _():
# Import all necessary libraries
import numpy as np
from os import path
from PIL import Image, ImageDraw
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from utils import blue_color_func
import warnings
warnings.filterwarnings("ignore")
return Image, ImageDraw, WordCloud, blue_color_func, np, plt
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## 5.1) Select threshold frequency
""")
return
@app.cell(hide_code=True)
def _(mo):
min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
min_freq_select
return (min_freq_select,)
@app.cell(hide_code=True)
def _(mo, tag_select):
mo.md(rf"""
## 5.2) Inspect Keyword Dataset
1. Check the threshold is set correctly. If not, adjust accordingly
2. Read all the keywords and verify they are good. If not
- Add explicit exclusions if necessary below
- OR Rerun the keyword extraction above
Add words to this dict that should be ignored in the WordCloud for specific tags.
Make sure to create the correct key that matches the active selected tag:
Active selected tag = '`{tag_select.value.lower()}`'
""")
return
@app.cell(hide_code=True)
def _(frequency_df, min_freq_select, mo):
mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
MIN_FREQ = min_freq_select.value
_freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
table_selection
return MIN_FREQ, table_selection
@app.cell(hide_code=True)
def _(mo, table_selection):
remove_rows_btn = None
if len(table_selection.value) >0 :
remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
remove_rows_btn
return (remove_rows_btn,)
@app.cell(hide_code=True)
def _(
KEYWORD_FREQ_FPATH,
VOICE_EXCLUDE_KEYWORDS_FILE,
frequency_df,
mo,
remove_rows_btn,
table_selection,
tag_select,
):
_s = None
if remove_rows_btn is not None and remove_rows_btn.value:
# get selected rows
selected_rows = table_selection.value
if len(selected_rows) >0 :
rows_to_drop = table_selection.value.index.tolist()
try:
if tag_select.value.startswith('V'):
# append values to an VoiceKeywordsExclusion file (txt file just a list of keywords)
exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
with VOICE_EXCLUDE_KEYWORDS_FILE.open('w') as f:
for _kw in exclude_keywords:
f.write(_kw + '\n')
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
except KeyError:
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
else:
# Save updated frequencies back to xlsx
frequency_df.to_excel(
KEYWORD_FREQ_FPATH,
index=False
)
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
_s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')
_s
return
@app.cell(hide_code=True)
def _():
IGNORE_WORDS = {
'chase as a brand': [
"brand",
"banking experience",
"banking",
"chase",
"jpmorgan",
"youthful",
"customer service",
"customer service focused",
"great brand",
],
'why customer chase': [
"customer service",
"customer loyalty",
"chase",
"chase customer",
"banking experience",
],
'chase as a person (personification)': [
"CPC1"
]
# <active-selected-tag>: [list, of, words, to, ignore]
}
return (IGNORE_WORDS,)
@app.cell(hide_code=True)
def _(mo):
buffer = -100 # Adjust this to increase/decrease space between logo and words
canvas_size = (1200, 800)
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
return buffer, canvas_size, logo_switch, n_words
@app.cell(hide_code=True)
def _(logo_switch, mo, n_words):
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
mo.vstack([
mo.md("## 5.4) Generate WordCloud with/without Logo"),
mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
Placement and color of words is randomized, size is proportional to frequency.
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
mo.md('---'),
mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
)
return (run_wordcloud_btn,)
@app.cell(hide_code=True)
def _(
IGNORE_WORDS,
Image,
ImageDraw,
MIN_FREQ,
WordCloud,
blue_color_func,
buffer,
canvas_size,
frequency_df,
logo_switch,
mo,
n_words,
np,
plt,
run_wordcloud_btn,
tag_select,
):
if run_wordcloud_btn.value:
pass
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
# freq_df_filtered.reset_index(drop=True, inplace=True)
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
# remove specific keywords depending on selected tag
if IGNORE_WORDS.get(tag_select.value.lower()):
for word in IGNORE_WORDS[tag_select.value.lower()]:
if word in keyword_freq_filtered:
del keyword_freq_filtered[word]
if logo_switch.value:
# 1. Load the logo
# Make sure this path points to your uploaded file
logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
logo = Image.open(logo_path).convert("RGBA")
# Optional: Resize logo if it's too large or small for the canvas
# target_width = 600
# ratio = target_width / logo.width
# logo = logo.resize((target_width, int(logo.height * ratio)), Image.Resampling.LANCZOS)
target_width = 600 # Set a reasonable size for the logo
if logo.width > target_width:
ratio = target_width / logo.width
new_height = int(logo.height * ratio)
# Use Image.Resampling.LANCZOS for high-quality downsampling
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
# 3. Create the mask (0 = draw here, 255 = don't draw here)
# Initialize with 0 (black/draw everywhere)
mask_image = Image.new("L", canvas_size, 0)
draw = ImageDraw.Draw(mask_image)
# 4. Draw a protected circular area in the center
center = (canvas_size[0] // 2, canvas_size[1] // 2)
# Calculate radius: half of logo max dimension + buffer
radius = (max(logo.size) // 2) + buffer
# Draw the white circle (255) which the WordCloud will avoid
draw.ellipse(
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
fill=255
)
chase_mask = np.array(mask_image)
# Generate the WordCloud
wordcloud = WordCloud(
background_color='white',
width=canvas_size[0],
height=canvas_size[1],
max_font_size=100, # Increased font size for larger canvas
max_words=n_words.value, # Increased word count to fill space
color_func=blue_color_func,
mask=chase_mask, # Apply the circular mask
contour_width=0,
contour_color='steelblue'
).generate_from_frequencies(keyword_freq_filtered)
else:
# Generate the WordCloud
wordcloud = WordCloud(
background_color='white',
width=canvas_size[0],
height=canvas_size[1],
max_font_size=150, # Increased font size for larger canvas
max_words=n_words.value, # Increased word count to fill space
color_func=blue_color_func,
# mask=chase_mask, # Apply the circular mask
# contour_width=0,
# contour_color='steelblue'
).generate_from_frequencies(keyword_freq_filtered)
# Convert WordCloud to Image to composite the logo
wc_image = wordcloud.to_image()
if logo_switch.value:
# Calculate position to center the logo
logo_pos = (
(canvas_size[0] - logo.width) // 2,
(canvas_size[1] - logo.height) // 2
)
# Paste logo (using alpha channel as mask to keep transparency)
wc_image.paste(logo, logo_pos, logo)
# Display the generated image
fig = plt.figure(figsize=(7,7))
# Display the generated image:
plt.imshow(wc_image, interpolation='bilinear')
plt.axis("off")
plt.show()
save_wordcloud_btn = None
save_wordcloud_btn = mo.ui.button(
label="Save WordCloud to File",
kind="warn",
on_click=lambda val: True
)
save_wordcloud_btn
return save_wordcloud_btn, wc_image
@app.cell(hide_code=True)
def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
    # Save the generated wordcloud image, appending an incrementing numeric
    # suffix so repeated saves never overwrite earlier ones.
    # Wait for start processing button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")
    filename = f'wordcloud_{tag_fname}.png'
    fpath = SAVE_DIR / filename
    # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
    existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
    if existing_files:
        existing_numbers = []
        for ef in existing_files:
            # A trailing all-digit '_' segment marks an already-numbered save.
            # NOTE(review): assumes tag_fname itself contains no '_' (it is
            # built with '-' separators upstream) — confirm if tags change.
            parts = ef.stem.split('_')
            if len(parts) > 2 and parts[-1].isdigit():
                existing_numbers.append(int(parts[-1]))
        if existing_numbers:
            next_number = max(existing_numbers) + 1
        else:
            # Only the unnumbered base file exists; start numbering at 1.
            next_number = 1
        fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'
    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
    return
if __name__ == "__main__":
app.run()

461
02_Taguette_Post-Process.py Normal file
View File

@@ -0,0 +1,461 @@
import marimo
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from datetime import datetime
from utils import connect_qumo_ollama
OLLAMA_LOCATION= 'localhost'
# VM_NAME = 'ollama-lite'
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
model_select
return (
TAGUETTE_EXPORT_DIR,
WORKING_DIR,
client,
datetime,
mo,
model_select,
pd,
)
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
mo.md(rf"""
# Step 1: Export Data out of Taguette
**Highlights**
1. Go to: https://taguette.qumo.io/project/1
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Tags Codebook**
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Step 2: Import here for processing
""")
return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df))
all_tags_df
return (all_tags_df,)
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
codebook_df
return (codebook_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Step 3: Process each 'Interview'
""")
return
@app.cell
def _(all_tags_df, mo):
interview_select = mo.ui.dropdown(
options=all_tags_df['document'].unique().tolist(),
label="Select Interview to Process",
full_width=True
)
interview_select
return (interview_select,)
@app.cell
def _(all_tags_df, interview_select, mo):
mo.stop(not interview_select.value, mo.md("Select interview to continue"))
# filter all_tags_df to only the document = file_dropdown.value
df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy()
return (df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Add `_context` column to track Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row)
2. Uses a set to track which highlight IDs we've already processed
3. When we encounter a new highlight ID for the first time, we process all its rows
4. Collects all _V- or _C- tags within that highlight
5. Assigns the context to all rows with that ID
6. This preserves document order and handles multi-tag highlights correctly
Example of challenging case:
| tag | content | _seq_id | _context |
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""")
return
@app.cell
def _(df):
    # Assign a '_context' (the governing '_V-'/'_C-' voice/character tag) to
    # every row, walking the dataframe in document order.  A "highlight"
    # (rows sharing an 'id') may span several rows, one per tag; all of its
    # rows receive the same context.  Highlights with no context tag inherit
    # the most recent context seen earlier in the document (forward-fill).
    # NOTE(review): re-filters df[df['id'] == ...] per highlight, O(n^2) —
    # acceptable for interview-sized data, but worth a groupby if it grows.
    # First pass: identify context tags within each highlight group
    df['_context'] = None
    last_context = None
    processed_ids = set()
    # Process in document order
    for idx, row in df.iterrows():
        highlight_id = row['id']
        # If we haven't processed this highlight yet
        if highlight_id not in processed_ids:
            processed_ids.add(highlight_id)
            # Get all rows for this highlight
            highlight_rows = df[df['id'] == highlight_id]
            # Collect all context tags in this highlight
            context_tags = []
            for _, h_row in highlight_rows.iterrows():
                tag = h_row.get('tag', '')
                if '_V-' in tag or '_C-' in tag:
                    context_tags.append(tag)
            # If we found context tags, join them with comma
            if context_tags:
                # Multi-tag highlights keep all contexts, comma-joined; they
                # are split into separate rows by a later cell.
                context_tag = ', '.join(context_tags)
                last_context = context_tag
            else:
                # If no context tag in this highlight, use the last context
                context_tag = last_context
            # Assign the context to all rows in this highlight
            df.loc[df['id'] == highlight_id, '_context'] = context_tag
    df
    return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Split multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context.
- Then mark these for 'manual_analysis'
""")
return
@app.cell
def _(df, pd):
    # Split rows whose '_context' names several voice/character tags into one
    # row per context, flagging those copies for manual sentiment review;
    # single-context rows pass through with manual_analysis=False.
    expanded_rows = []
    for _, _src in df.iterrows():
        _ctx_val = _src['_context']
        if pd.notna(_ctx_val) and ',' in str(_ctx_val):
            # Multi-context highlight: one copy per individual context tag.
            for _one_ctx in (c.strip() for c in str(_ctx_val).split(',')):
                _copy = _src.copy()
                _copy['_context'] = _one_ctx
                _copy['manual_analysis'] = True
                # Theme rows lose any sentiment; it must be re-assessed per context.
                if str(_copy['tag']).startswith(('VT -', 'CT -')):
                    _copy['sentiment'] = None
                expanded_rows.append(_copy)
        else:
            _copy = _src.copy()
            _copy['manual_analysis'] = False
            expanded_rows.append(_copy)
    expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)
    # Keep only theme rows ('VT -' voice / 'CT -' character) for sentiment work.
    sentiment_df = expanded_df_raw.loc[
        expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    ].copy()
    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
    sentiment_df[sentiment_df['manual_analysis']]
    return (sentiment_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Create 'theme' column
""")
return
@app.cell
def _(sentiment_df):
from utils import extract_theme
sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1)
sentiment_df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Extract Sentiment + Reasoning
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
Values to be used:
- Positive: +1
- Neutral: 0
- Negative: -1
""")
return
@app.cell
def _(mo):
start_processing_btn = mo.ui.button(
label="Start Sentiment Extraction",
kind="warn",
on_click=lambda val: True
)
start_processing_btn
return (start_processing_btn,)
@app.cell
def _(
client,
codebook_df,
mo,
model_select,
pd,
sentiment_df,
start_processing_btn,
):
from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
# add theme_description to be used in LLM prompt
_df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))
# Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing")
sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(
content=row['content'],
theme=row['theme'],
theme_description=row['theme_description'],
client=client,
model=model_select.value
)),
axis=1
)
return
@app.cell
def _(mo, sentiment_df):
mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Multi-context tags
""")
return
@app.cell
def _(mo, sentiment_df):
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
split_rows_editor = None
rows_to_edit = []
if not manual_rows.empty:
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, split_rows_editor
@app.cell
def _(split_rows_editor):
split_rows_editor
return
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
if split_rows_editor is not None:
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
    # Validate the manually-reviewed multi-context rows before recombining.
    # Sentinel semantics: '' means "no editor exists" (no multi-context rows),
    # while None means "editor exists but the form was not submitted yet".
    # Capture the edited manual-analysis rows for validation
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
        valid_sentiments = {-1, 0, 1}
        # Rows flagged for manual analysis whose sentiment is still unset/invalid.
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        # NOTE(review): assert is stripped under `python -O`; fine for an
        # interactive notebook, but a raise would be safer if this ever runs
        # as a script.
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
        print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Recombine
""")
return
@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
    # Merge the auto-analyzed rows with the manually reviewed ones, restoring
    # the original ordering via the `_seq_id` sequence column.
    _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
    if isinstance(reviewed_manual_rows, pd.DataFrame):
        recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
    else:
        # The review form was never submitted (or never created) — pass the
        # input through unchanged.
        recombined_df = sentiment_df
    recombined_df
    return (recombined_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Step 3: Process 'Other' tags
These need to be reviewed manually for interesting content
""")
return
@app.cell
def _(mo):
mo.md(r"""
""")
return
@app.cell
def _():
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Save to CSV
""")
return
@app.cell
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    # Save the recombined sentiment table to the working directory, named after
    # the interview id (first whitespace-separated token of the dropdown label).
    # NOTE(review): `timestamp` is computed but unused — confirm whether the
    # filename was meant to include it.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
    recombined_df.to_csv(filename, index=False)
    # Bug fix: previously printed a literal placeholder instead of the path.
    print(f"✓ Saved processed data to '{filename}'")
    return
if __name__ == "__main__":
app.run()

146
03_Sentiment_Analysis.py Normal file
View File

@@ -0,0 +1,146 @@
import marimo
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from utils import create_sentiment_matrix
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Load Sentiment CSV
""")
return
@app.cell
def _(INPUT_DIR, mo):
csv_files = list(INPUT_DIR.glob("*.csv"))
file_options = {f.stem: str(f) for f in csv_files}
sentiment_csv = mo.ui.dropdown(
options=file_options,
label="Select Sentiment CSV File",
full_width=True
)
sentiment_csv
return (sentiment_csv,)
@app.cell
def _(Path, pd, sentiment_csv):
    # Derive the document id and timestamp from the selected file's name.
    # Assumes names of the form "<doc>_..._<timestamp>.csv" as produced by the
    # previous notebook — TODO confirm against 02's save step.
    input_csv_name = Path(sentiment_csv.value).stem
    timestamp = input_csv_name.split('_')[-1]
    doc = input_csv_name.split('_')[0]
    sentiment_df = pd.read_csv(sentiment_csv.value)
    sentiment_df
    return doc, sentiment_df, timestamp
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Phase 1: Individual interview analysis
- Create sentiment matrices for each interview (document)
- Save the intermediate results to file in the `WORKING_DIR`
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 1.1: Voice Sample vs. Theme Sentiment Matrix
For each interview (document), create a matrix where:
- Rows represent the different Voices (based on '_V-' tags)
- Columns represent the different VoiceThemes(based on 'VT -' tags)
- Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
""")
return
@app.cell
def _(create_sentiment_matrix, sentiment_df):
voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')
voice_matrix
return (voice_matrix,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
SAVE TO CSV
""")
return
@app.cell
def _(WORKING_DIR, doc, timestamp, voice_matrix):
# Save to CSV
voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
voice_matrix.to_csv(voice_filename)
print(f"Saved to '{voice_filename}'")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 1.2: Character Sample vs. Theme Sentiment Matrix
For each interview (document), create a matrix where:
- Rows represent the different Characters (based on '_C-' tags)
- Columns represent the different CharacterThemes (based on 'CT -' tags)
- Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
""")
return
@app.cell
def _(create_sentiment_matrix, sentiment_df):
character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-')
character_matrix
return (character_matrix,)
@app.cell
def _(WORKING_DIR, character_matrix, doc, timestamp):
# Save to CSV
character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
character_matrix.to_csv(character_filename)
print(f"Saved to '{character_filename}'")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 1.3: Chase Brand Sentiment
TODO: not sure we have enough supporting data for this yet
""")
return
if __name__ == "__main__":
app.run()

86
04_Results_Aggregation.py Normal file
View File

@@ -0,0 +1,86 @@
import marimo
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
from pathlib import Path
INPUT_DIR = Path("./data/processing/03_sentiment_analysis")
WORKING_DIR = Path('./data/processing/04_sentiment_aggregation')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, mo, pd
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Voices
""")
return
@app.cell
def _(INPUT_DIR, mo):
voice_csv_files = list(INPUT_DIR.glob("*voice*.csv"))
file_options = {f.stem: str(f) for f in voice_csv_files}
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
return (voice_multiselect,)
@app.cell
def _(mo, voice_multiselect):
mo.hstack([voice_multiselect, mo.md(f"Has value: {voice_multiselect.value}")])
return
@app.cell
def _(pd, voice_multiselect):
    # Load all voice CSV files and aggregate them so that each row-column pair is summed
    KEY_COL = "_context"

    def _read_voice_csv(path: str) -> pd.DataFrame:
        # Index on the voice/character context so the matrices align cell-by-cell.
        df = pd.read_csv(path).set_index(KEY_COL)
        # Non-numeric cells become NaN; `add(fill_value=0)` below treats missing
        # row/column pairs as zero contributions.
        df = df.apply(pd.to_numeric, errors="coerce")
        return df

    def aggregate_voice_data(files: list[str]) -> pd.DataFrame:
        # Element-wise sum of every selected matrix (union of rows and columns).
        if not files:
            return pd.DataFrame()
        master = _read_voice_csv(files[0])
        for path in files[1:]:
            master = master.add(_read_voice_csv(path), fill_value=0)
        return master.reset_index()

    master_df = aggregate_voice_data(voice_multiselect.value)
    master_df
    return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Characters
""")
return
@app.cell
def _(INPUT_DIR):
char_csv_files = list(INPUT_DIR.glob("*character*.csv"))
char_csv_files
return
if __name__ == "__main__":
app.run()

View File

@@ -78,7 +78,9 @@ def _(mo):
**Goal:** Convert unstructured text into a structured dataset.
1. **Input:** All 26 Transcripts + `master_codebook.json`.
This will be a dedicated notebook, and be run per transcript.
1. **Input:** Transcript + `master_codebook.json`.
2. **Process:**
* The LLM analyzes each transcript segment-by-segment.
* It extracts specific quotes that match a Theme Definition.
@@ -86,8 +88,9 @@ def _(mo):
* **Granular Sentiment Analysis:** For each quote, the model identifies:
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
* **Sentiment:** Positive / Neutral / Negative.
3. **Output:** `coded_segments.csv`
3. **Output:** `<transcript_name>_coded_segments.csv`
* Columns: `Source_File`, `Speaker`, `Theme`, `Quote`, `Subject`, `Sentiment`, `Context`.
* Each transcript produces its own CSV-file, which can be reviewed and adjusted before moving to the next stage
""")
return

226
Stage1_Theme_Discovery.py Normal file
View File

@@ -0,0 +1,226 @@
import marimo
__generated_with = "0.18.1"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import json
import pandas as pd
import re
from pathlib import Path
from utils import connect_qumo_ollama, load_srt
# Configuration
VM_NAME = 'hiperf-gpu'
MODEL = 'llama3.3:70b'
TRANSCRIPT_DIR = Path("data/transcripts")
OUTPUT_FILE = Path("master_codebook.json")
client = connect_qumo_ollama(VM_NAME)
return (
MODEL,
OUTPUT_FILE,
TRANSCRIPT_DIR,
client,
json,
load_srt,
mo,
pd,
re,
)
@app.cell
def _(mo):
mo.md(r"""
# Stage 1: Theme Discovery
**Goal:** Identify recurring themes across a sample of interviews.
1. **Select Transcripts:** Choose 4-5 representative interviews.
2. **Extract Topics:** The AI will analyze each transcript to find key topics.
3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
""")
return
@app.cell
def _(TRANSCRIPT_DIR, mo):
# File Selection
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
file_options = {f.name: str(f) for f in srt_files}
file_selector = mo.ui.multiselect(
options=file_options,
label="Select Transcripts (Recommended: 4-5)",
full_width=True
)
file_selector
return (file_selector,)
@app.cell
def _(file_selector, mo):
mo.md(f"**Selected:** {len(file_selector.value)} files")
return
@app.cell
def _(mo):
start_discovery_btn = mo.ui.run_button(label="Start Discovery Process")
start_discovery_btn
return (start_discovery_btn,)
@app.cell
def _(
MODEL,
client,
file_selector,
json,
load_srt,
mo,
re,
start_discovery_btn,
):
# Map Phase: Extract Topics per Transcript
extracted_topics = []
status_callout = mo.md("")
if start_discovery_btn.value and file_selector.value:
with mo.status.spinner("Analyzing transcripts...") as _spinner:
for filepath in file_selector.value:
_transcript = load_srt(filepath)
# Truncate for discovery if too long (optional, but good for speed)
# Using first 15k chars usually gives enough context for high-level themes
_context = _transcript[:15000]
_prompt = f"""
Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
Focus on: Brand voice, Customer experience, Design systems, and AI.
Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]
Transcript:
{_context}...
"""
try:
_response = client.generate(model=MODEL, prompt=_prompt)
# Find JSON list in response
_match = re.search(r'\[.*\]', _response.response, re.DOTALL)
if _match:
_topics = json.loads(_match.group(0))
extracted_topics.extend(_topics)
except Exception as e:
print(f"Error processing {filepath}: {e}")
status_callout = mo.callout(
f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
kind="success"
)
elif start_discovery_btn.value:
status_callout = mo.callout("Please select at least one file.", kind="warn")
status_callout
return (extracted_topics,)
@app.cell
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    # Reduce Phase: Synthesize Themes
    # Collapse the raw per-transcript topics into a small master codebook.
    suggested_themes = []
    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook...") as _spinner:
            _topics_str = ", ".join(extracted_topics)
            _synthesis_prompt = f"""
            You are a qualitative data architect.
            I have a list of raw topics extracted from multiple interviews:
            [{_topics_str}]
            Task:
            1. Group these into 5-8 distinct, high-level Themes.
            2. Create a definition for each theme.
            3. Assign a hex color code to each.
            4. ALWAYS include a theme named "Other" for miscellaneous insights.
            Return a JSON object with this structure:
            [
                {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
                ...
            ]
            """
            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)
            # The model is asked for a bare JSON list; grab the first [...] span.
            _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
            if _match:
                try:
                    suggested_themes = json.loads(_match.group(0))
                except json.JSONDecodeError:
                    # Bug fix: was a bare `except:`, which would also swallow
                    # KeyboardInterrupt/SystemExit. Only JSON errors are expected.
                    suggested_themes = [{"Theme": "Error parsing JSON", "Definition": _response.response, "Color": "#000000"}]
    return (suggested_themes,)
@app.cell
def _(mo, pd, suggested_themes):
# Interactive Editor
# Default empty structure if nothing generated yet
_initial_data = suggested_themes if suggested_themes else [
{"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
]
df_themes = pd.DataFrame(_initial_data)
theme_editor = mo.ui.data_editor(
df_themes,
label="Master Codebook Editor",
column_config={
"Color": mo.ui.column.color_picker(label="Color")
},
num_rows="dynamic" # Allow adding/removing rows
)
mo.vstack([
mo.md("### Review & Refine Codebook"),
mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
theme_editor
])
return (theme_editor,)
@app.cell
def _(OUTPUT_FILE, json, mo, theme_editor):
save_btn = mo.ui.run_button(label="Save Master Codebook")
save_message = mo.md("")
if save_btn.value:
_final_df = theme_editor.value
# Convert to list of dicts
_codebook = _final_df.to_dict(orient="records")
with open(OUTPUT_FILE, "w") as f:
json.dump(_codebook, f, indent=2)
save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")
mo.vstack([
save_btn,
save_message
])
return
if __name__ == "__main__":
app.run()

View File

@@ -0,0 +1,212 @@
import marimo
__generated_with = "0.18.0"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import json
import pandas as pd
from pathlib import Path
from utils import connect_qumo_ollama, load_srt
# Configuration
CODEBOOK_PATH = Path("data/labels/master_codebook.json")
TRANSCRIPT_DIR = Path("data/transcripts")
OUTPUT_DIR = Path("data/labeled_transcripts")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Initialize LLM Client
client = connect_qumo_ollama("hiperf-gpu")
return (
CODEBOOK_PATH,
OUTPUT_DIR,
Path,
TRANSCRIPT_DIR,
client,
connect_qumo_ollama,
json,
load_srt,
mo,
pd,
)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Stage 2: Structured Theme Coding
**Goal:** Extract specific quotes for defined themes from full transcripts.
""")
return
@app.cell
def _(CODEBOOK_PATH, json, mo):
    # Load the Stage-1 master codebook; fall back to an empty list so the rest
    # of the notebook still renders before the codebook has been generated.
    if CODEBOOK_PATH.exists():
        with open(CODEBOOK_PATH, "r") as f:
            codebook = json.load(f)
    else:
        codebook = []
    mo.md(f"**Loaded {len(codebook)} Themes from Codebook**")
    # Bug fix: previously returned `codebook, f`, which raised NameError on the
    # missing-file path (`f` never bound) and leaked a closed file handle.
    return (codebook,)
@app.cell
def _(TRANSCRIPT_DIR, mo):
# File Selector
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
file_dropdown = mo.ui.dropdown(
options={f.name: str(f) for f in srt_files},
label="Select Transcript"
)
file_dropdown
return file_dropdown, srt_files
@app.cell
def _(mo):
run_btn = mo.ui.run_button(label="Start Analysis")
run_btn
return (run_btn,)
@app.cell
def _(
OUTPUT_DIR,
client,
codebook,
file_dropdown,
json,
load_srt,
mo,
pd,
run_btn,
):
# Analysis Logic
results = []
status_message = mo.md("")
if run_btn.value and file_dropdown.value:
transcript_path = file_dropdown.value
transcript_name = file_dropdown.selected_key.replace(".srt", "")
transcript_text = load_srt(transcript_path)
with mo.status.progress_bar(codebook, title="Analyzing Themes") as bar:
for theme in bar:
theme_name = theme["name"]
theme_def = theme["definition"]
prompt = f"""
You are a qualitative data analyst. Analyze the following transcript for the theme: "{theme_name}".
Definition: {theme_def}
Extract ALL relevant quotes that match this definition.
For each quote, identify the specific Subject and the Sentiment (Positive, Neutral, Negative).
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
If no quotes are found, return an empty array [].
Transcript:
{transcript_text}
"""
try:
response = client.generate(model="llama3.3:70b", prompt=prompt, format="json")
content = response.get("response", "[]")
extracted = json.loads(content)
# Add metadata
for item in extracted:
item["theme"] = theme_name
item["source_file"] = transcript_name
results.append(item)
except Exception as e:
print(f"Error processing theme {theme_name}: {e}")
# "Other" Category Analysis (Negative Constraint Strategy)
if results or codebook: # Proceed if we have themes to exclude
status_message = mo.md("🔍 Analyzing for 'Other' emerging themes...")
# Format existing themes for exclusion
existing_themes_text = "\n".join([f"- {t['name']}: {t['definition']}" for t in codebook])
other_prompt = f"""
You are a qualitative data analyst.
Your goal is to identify "Emerging Themes" in the transcript that have NOT been captured by our existing codebook.
### EXISTING THEMES (IGNORE THESE)
We have already analyzed the transcript for the following themes. DO NOT extract quotes that primarily fit these definitions:
{existing_themes_text}
### INSTRUCTIONS
1. Analyze the transcript below.
2. Identify significant quotes, insights, or patterns that are distinct from the "Existing Themes" listed above.
3. Label these findings as "Other".
4. If a quote is borderline, only include it if it offers a novel angle not covered by the existing definition.
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
If no new insights are found, return an empty array [].
### TRANSCRIPT
{transcript_text}
"""
try:
response = client.generate(model="llama3.3:70b", prompt=other_prompt, format="json")
content = response.get("response", "[]")
extracted_other = json.loads(content)
for item in extracted_other:
item["theme"] = "Other"
item["source_file"] = transcript_name
results.append(item)
except Exception as e:
print(f"Error processing 'Other' theme: {e}")
# Save Results
if results:
df = pd.DataFrame(results)
output_path = OUTPUT_DIR / f"{transcript_name}_coded.csv"
df.to_csv(output_path, index=False)
status_message = mo.md(f"✅ Analysis Complete! Saved to `{output_path}`")
else:
status_message = mo.md("⚠️ No quotes found for any theme.")
df = pd.DataFrame()
elif run_btn.value and not file_dropdown.value:
status_message = mo.md("⚠️ Please select a transcript first.")
df = pd.DataFrame()
else:
df = pd.DataFrame()
mo.vstack([status_message, mo.ui.table(df)])
return (
bar,
content,
df,
extracted,
item,
output_path,
prompt,
response,
results,
status_message,
theme,
theme_def,
theme_name,
transcript_name,
transcript_path,
transcript_text,
)
if __name__ == "__main__":
app.run()

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

60
ollama/docker-compose.yml Normal file
View File

@@ -0,0 +1,60 @@
services:
ollama:
image: ollama/ollama:latest
ports:
- 11434:11434
volumes:
- ./docker-volumes/ollama:/root/.ollama
container_name: ollama
tty: true
restart: unless-stopped
# GPU SUPPORT NOTES:
# 1. The "deploy" section is ignored by classic 'docker-compose'; it's honored in Swarm.
# 2. For local 'docker compose up' with NVIDIA GPUs you need the host configured with
# nvidia-container-toolkit. Then either:
# a) Leave the reservation block (Compose V2 now honors it) OR
# b) Start with: docker compose up --build (Compose will request GPUs) OR
# c) Explicitly override: docker compose run --gpus all ollama
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
# 'devices' section further down as a fallback (less portable).
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: all
# capabilities: [gpu]
environment:
# Visible devices / capabilities for the NVIDIA container runtime
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
## ---------- END GPU SUPPORT BLOCK ------------###
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
# devices:
# - /dev/nvidiactl:/dev/nvidiactl
# - /dev/nvidia-uvm:/dev/nvidia-uvm
# - /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools
# - /dev/nvidia0:/dev/nvidia0
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open-webui
volumes:
- ./docker-volumes/open-webui:/app/backend/data
depends_on:
- ollama
ports:
- 3000:8080
environment:
- 'OLLAMA_BASE_URL=http://ollama:11434'
- 'ENABLE_OLLAMA_API=true'
- 'WEBUI_SECRET_KEY='
extra_hosts:
- host.docker.internal:host-gateway
restart: unless-stopped

View File

@@ -6,8 +6,18 @@ readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"marimo>=0.18.0",
"modin[dask]>=0.37.1",
"numpy>=2.3.5",
"ollama>=0.6.1",
"openai>=2.9.0",
"openpyxl>=3.1.5",
"pandas>=2.3.3",
"pyzmq>=27.1.0",
"requests>=2.32.5",
"taguette>=1.5.1",
"wordcloud>=1.9.5",
]
[tool.uv.sources]
wordcloud = { git = "https://github.com/amueller/word_cloud.git" }

View File

@@ -1,86 +0,0 @@
"""
Standard utils for this repository
"""
import re
from pathlib import Path
import requests
from ollama import Client
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
Args:
path: Path to the SRT file
Returns:
Clean transcript string with format "SPEAKER_XX: text" per line,
timestamps stripped, consecutive lines from same speaker merged.
"""
path = Path(path)
content = path.read_text(encoding='utf-8')
# Parse SRT blocks: sequence number, timestamp, speaker|text
# Pattern matches: number, timestamp line, content line(s)
blocks = re.split(r'\n\n+', content.strip())
turns = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 3:
continue
# Skip sequence number (line 0) and timestamp (line 1)
# Content is line 2 onwards
text_lines = lines[2:]
text = ' '.join(text_lines)
# Parse speaker|text format
if '|' in text:
speaker, utterance = text.split('|', 1)
speaker = speaker.strip()
utterance = utterance.strip()
else:
speaker = "UNKNOWN"
utterance = text.strip()
turns.append((speaker, utterance))
# Merge consecutive turns from same speaker
merged = []
for speaker, utterance in turns:
if merged and merged[-1][0] == speaker:
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
else:
merged.append((speaker, utterance))
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
"""Establish connection to Qumo Ollama instance
vm_name: str ('ollama-lite' or 'hiperf-gpu')
Name of the VM running the Ollama instance
Returns:
tuple(Client): Ollama client connected to the specified VM
"""
QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:11434'
try:
requests.get(QUMO_OLLAMA_URL, timeout=5)
client = Client(
host=QUMO_OLLAMA_URL
)
except requests.ConnectionError:
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
for m in client.list().models:
print(f" - '{m.model}' ")
return client

5
utils/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func

65
utils/data_utils.py Normal file
View File

@@ -0,0 +1,65 @@
import pandas as pd
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a Voice/Character-vs-Theme sentiment matrix for one document.

    Parameters:
    - doc_df: DataFrame with at least the columns ['tag', '_context', 'sentiment']
    - column_prefix: regex alternation matching theme tags (matrix columns)
    - row_prefix: regex alternation matching Voice/Character contexts (matrix rows)

    Returns:
    - DataFrame pivoted to rows=_context, columns=tag, values=summed sentiment;
      an empty DataFrame (with a console message) if nothing matches.
    """
    # Keep only rows whose tag looks like a theme tag (VT-/CT-).
    themed = doc_df[doc_df['tag'].str.contains(column_prefix, na=False)].copy()
    if themed.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Of those, keep rows that carry a Voice/Character context.
    has_context = (
        themed['_context'].notna()
        & themed['_context'].str.contains(row_prefix, na=False)
    )
    contexted = themed[has_context].copy()
    if contexted.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, theme) pair, then pivot into matrix form.
    totals = contexted.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    return totals.pivot(index='_context', columns='tag', values='sentiment')
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
"""
Extract the theme from a tag string.
Parameters:
- tag: str, the tag string (e.g., 'VT - Personal Experience')
- theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
Returns:
- str, the extracted theme (e.g., 'Personal Experience')
- None if no theme found
"""
for prefix in theme_prefixes.split('|'):
if tag.startswith(prefix):
return tag.replace(prefix, '').strip()
return None

109
utils/keyword_analysis.py Normal file
View File

@@ -0,0 +1,109 @@
import pandas as pd
from ollama import Client
import json
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Word-cloud color callback returning a random dark-blue hex shade.

    Accepts the standard wordcloud color_func signature; only `random_state`
    is used (falls back to the module-level `random` when not provided).
    """
    rng = random_state or random
    # Sample only the darker 60% of the 'Blues' colormap:
    # 0.0 is near-white, 1.0 is the darkest blue.
    shade = rng.uniform(0.4, 1.0)
    return mcolors.to_hex(plt.cm.Blues(shade))
def worker_extraction(row, host, model):
    """Per-row worker: run keyword extraction for one DataFrame row.

    Parameters:
    - row: mapping with 'content' and 'tag' entries (e.g. a DataFrame row)
    - host: Ollama host URL for this worker's client
    - model: model name to run

    Returns:
    - the result of ollama_keyword_extraction for this row.
    """
    # Instantiate local client for this specific worker/thread
    # NOTE(review): a fresh Client per call is presumably for thread safety of
    # the underlying HTTP session — confirm before sharing one client.
    local_client = Client(host=host)
    return ollama_keyword_extraction(
        content=row['content'],
        tag=row['tag'],
        client=local_client,
        model=model
    )
def ollama_keyword_extraction(content, tag, client: "Client", model) -> list:
    """
    Extract 1-5 thematic keywords from an interview quote via an Ollama model.

    Parameters:
    - content: quote text to analyze
    - tag: theme/context the quote was coded with (steers keyword relevance)
    - client: Ollama client (any object with a compatible ``generate`` method)
    - model: model name to run

    Returns:
    - single-element list wrapping the keyword list, e.g. [["reliable", "old-school"]];
      [[]] after three failed attempts.
    """
    # Construct prompt for Ollama model
    # Prompt optimized for small models (Llama 3.2):
    # - Fewer rules, prioritized by importance
    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.
RULES (in priority order):
1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. Use words from the quote, but generalize for clustering (e.g., "not youthful""traditional").
3. Extract 1-5 keywords or short phrases that capture key themes.
4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").
EXAMPLES:
Context: Chase as a Brand
Quote: "It's definitely not, like, youthful or trendy."
Output: {{"keywords": ["traditional", "established"]}}
Context: App Usability
Quote: "There are so many options when I try to pay, it's confusing."
Output: {{"keywords": ["confusing", "overwhelming options"]}}
Context: Brand Perception
Quote: "I would say reliable, trustworthy, kind of old-school."
Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}
NOW EXTRACT KEYWORDS:
Context: {tag}
Quote: "{content}"
Output:"""
    max_retries = 3
    for attempt in range(max_retries):
        # Bug fix: keep response_text bound even when client.generate() itself
        # raises, otherwise the except block died with UnboundLocalError and
        # masked the real failure.
        response_text = '<no response>'
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
                format='json',
            )
            response_text = resp.response.strip()
            # Extract the first JSON object from the response text.
            start_index = response_text.find('{')
            if start_index == -1:
                raise ValueError("No JSON found")
            # raw_decode tolerates trailing junk after the JSON object.
            response_json, _ = json.JSONDecoder().raw_decode(response_text[start_index:])
            keywords = response_json.get('keywords', [])
            return [keywords]
        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Output was: {response_text}")
            if attempt == max_retries - 1:
                return [[]]

42
utils/ollama_utils.py Normal file
View File

@@ -0,0 +1,42 @@
import requests
from ollama import Client
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client, list[str]] | tuple[None, None]":
    """Establish a connection to a Qumo Ollama instance.

    Parameters:
    - vm_name: name of the VM running Ollama ('ollama-lite' or 'hiperf-gpu'),
      or 'localhost'/'0.0.0.0' for a local instance
    - port: Ollama HTTP port (string)
    - print_models: when True, list the available models on stdout

    Returns:
    - (client, model_names) on success; (None, None) when unreachable.
      (Bug fix: annotation/docstring previously claimed a bare Client.)
    """
    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
    if vm_name in ['localhost', '0.0.0.0']:
        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
    try:
        # Probe first so an unreachable host fails fast with a clear message.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
        client = Client(
            host=QUMO_OLLAMA_URL
        )
        # Bug fix: swap only the trailing port for the WebUI link —
        # str.replace(port, '3000') could corrupt the host part if the port
        # digits happened to occur there.
        webui_url = QUMO_OLLAMA_URL.rsplit(':', 1)[0] + ':3000'
        print(f"Connection succesful. WebUI available at: {webui_url}")
        models = [m.model for m in client.list().models]
        if print_models:
            print("Available models:")
            for m in models:
                print(f" - '{m}' ")
        return client, models
    except requests.ConnectionError:
        pass
    print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
    return None, None

135
utils/sentiment_analysis.py Normal file
View File

@@ -0,0 +1,135 @@
import random
import pandas as pd
from ollama import Client
import json
def dummy_sentiment_analysis(content, tag):
    """Stand-in analyzer for pipeline testing.

    Returns a random (-1/0/1) sentiment with a fixed reason for theme tags
    ('VT -'/'CT -'); a ('test', 'not applicable') sentinel for anything else.
    """
    is_theme_tag = tag.startswith(('VT -', 'CT -'))
    if not is_theme_tag:
        return 'test', 'not applicable'
    return random.choice([-1, 0, 1]), 'random dummy sentiment'
def ollama_sentiment_analysis(content, theme, theme_description, client: "Client", model) -> tuple[list[str], int, str]:
    """
    Score a quote's sentiment toward a specific theme using an Ollama model.

    Parameters:
    - content: quote text to analyze
    - theme: theme name the quote was coded with
    - theme_description: definition of the theme (context for the model)
    - client: Ollama client (any object with a compatible ``generate`` method)
    - model: model name to run
    (Bug fix: the docstring previously documented a nonexistent `tag` parameter.)

    Returns:
    - (keywords, sentiment, reason) on success;
      ([], None, 'parsing error') after three failed attempts.
    """
    prompt = f"""
    # Role
    You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.
    # Input
    Theme: `{theme}`
    Theme Description: `{theme_description}`
    Quote:
    ```
    {content}
    ```
    # Instructions
    1. Analyze the sentiment of the quote specifically regarding the theme.
    2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
    3. Assign a sentiment score:
    - -1: Negative (complaint, dissatisfaction, criticism)
    - 0: Neutral (factual, mixed, or no strong opinion)
    - 1: Positive (praise, satisfaction, agreement)
    4. Provide a concise reason (max 10 words).
    # Constraints
    - Return ONLY a valid JSON object.
    - Do not use Markdown formatting (no ```json blocks).
    - Do not write any Python code or explanations outside the JSON.
    - If the quote is irrelevant to the theme, return sentiment 0.
    # Response Format
    {{
    "keywords": ["<list_of_keywords>"],
    "sentiment": <integer_score>,
    "reason": "<string_reason>"
    }}
    # Examples
    Example 1:
    Theme: `Speed`
    Quote: `It was a little slow for me.`
    Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}
    Example 2:
    Theme: `Price`
    Quote: `It costs $50.`
    Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}
    Example 3:
    Theme: `Friendliness`
    Quote: `Sound very welcoming.`
    Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
            )
            response_text = resp.response.strip()
            # Extract JSON from response: take the span between the first '{'
            # and the last '}' to tolerate chatter around the object.
            start_index = response_text.find('{')
            end_index = response_text.rfind('}') + 1
            if start_index == -1 or end_index == 0:
                raise ValueError("No JSON found")
            json_str = response_text[start_index:end_index]
            response_json = json.loads(json_str)
            keywords = response_json.get('keywords', [])
            sentiment = response_json.get('sentiment', 'test')
            reason = response_json.get('reason', 'no reason provided')
            return keywords, sentiment, reason
        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt == max_retries - 1:
                return [], None, 'parsing error'
if __name__ == "__main__":
    # Smoke test: run the analyzer against a local Ollama instance.
    # Requires a running server on localhost:11434 with llama3.2 pulled.
    client = Client(
        host="http://localhost:11434"
    )
    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })
    # Bug fix: the previous call passed `client` in the theme_description slot
    # (dropping the client argument entirely) and unpacked the 3-tuple return
    # into only two columns. The tag doubles as the theme; no description here.
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], '', client, model='llama3.2:latest')),
        axis=1
    )
    print(sentiment_df.head())

148
utils/transcript_utils.py Normal file
View File

@@ -0,0 +1,148 @@
from pathlib import Path
import re
import pandas as pd
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
Args:
path: Path to the SRT file
Returns:
Clean transcript string with format "SPEAKER_XX: text" per line,
timestamps stripped, consecutive lines from same speaker merged.
"""
path = Path(path)
content = path.read_text(encoding='utf-8')
# Parse SRT blocks: sequence number, timestamp, speaker|text
# Pattern matches: number, timestamp line, content line(s)
blocks = re.split(r'\n\n+', content.strip())
turns = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 3:
continue
# Skip sequence number (line 0) and timestamp (line 1)
# Content is line 2 onwards
text_lines = lines[2:]
text = ' '.join(text_lines)
# Parse speaker|text format
if '|' in text:
speaker, utterance = text.split('|', 1)
speaker = speaker.strip()
utterance = utterance.strip()
else:
speaker = "UNKNOWN"
utterance = text.strip()
turns.append((speaker, utterance))
# Merge consecutive turns from same speaker
merged = []
for speaker, utterance in turns:
if merged and merged[-1][0] == speaker:
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
else:
merged.append((speaker, utterance))
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def csv_to_markdown(csv_path:Path):
    """Render a transcript CSV as markdown, merging consecutive same-speaker turns."""
    para_sep = "\n\n"
    table = pd.read_csv(str(csv_path))
    blocks = ["# Interview Transcript"]
    current_speaker = None   # speaker whose turn is currently being accumulated
    pending = []             # text fragments belonging to the current turn
    for _, record in table.iterrows():
        who = record["Speaker"]
        said = str(record["Transcript"]).strip()
        if who == current_speaker:
            # Continuation of the same speaker — extend the open turn.
            pending.append(said)
            continue
        # Speaker changed: emit the finished turn, then start a new one.
        # Fragments are joined with blank lines to keep distinct paragraphs.
        if current_speaker is not None:
            blocks.append(f"**{current_speaker}**: {para_sep.join(pending)}")
        current_speaker = who
        pending = [said]
    # Emit the last open turn, if any rows were read.
    if current_speaker is not None:
        blocks.append(f"**{current_speaker}**: {para_sep.join(pending)}")
    # Blank line between blocks for clear separation.
    return para_sep.join(blocks)
def cpc_smb_to_markdown(cpc_path: Path) -> str:
    """Convert CPC text transcript to markdown, merging consecutive same-speaker turns."""
    para_sep = "\n\n"
    raw = Path(cpc_path).read_text(encoding='utf-8')
    # Speaker labels look like "NAME: " at line start or after whitespace.
    label_re = re.compile(r'(?:^|\s)([A-Za-z0-9]+):\s')

    output = ["# Interview Transcript"]
    current = None    # speaker of the turn being accumulated
    fragments = []    # text pieces belonging to the current turn

    for raw_line in raw.splitlines():
        line = raw_line.strip().replace('\n', ' ')

        # Edge case: '"CPC1, (She/ Her,) LOCATION: Hello."' -> "CPC1: Hello."
        header = re.match(r'^"?([A-Za-z0-9]+),\s*\(.*?\)\s*LOCATION:\s*(.*?)"?$', line)
        if header:
            line = f"{header.group(1)}: {header.group(2)}"

        # Strip a pair of surrounding double quotes, if present.
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1].strip()
        if not line:
            continue

        pieces = label_re.split(line)
        # No speaker label at all: treated as noise (headers, "Like", etc.).
        if len(pieces) < 2:
            continue

        # pieces[0] is any text before the first label on this line; attach it
        # to the speaker already in progress, if any.
        lead = pieces[0].strip()
        if lead and current:
            fragments.append(lead)

        # After splitting, pieces alternate: speaker, text, speaker, text, ...
        for speaker, text in zip(pieces[1::2], pieces[2::2]):
            text = text.strip()
            if speaker == current:
                fragments.append(text)
            else:
                if current is not None:
                    output.append(f"**{current}**: {para_sep.join(fragments)}")
                current = speaker
                fragments = [text]

    # Flush the final speaker's turn.
    if current is not None:
        output.append(f"**{current}**: {para_sep.join(fragments)}")
    return para_sep.join(output)

1266
uv.lock generated

File diff suppressed because it is too large Load Diff