Compare commits
27 Commits
b21f402e1e
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 069e568d00 | |||
| 417273c745 | |||
| eee6947f01 | |||
| d6b449e8c6 | |||
| 8fbc11da7a | |||
| 50f9538dcf | |||
| e90b41f648 | |||
| e81961b819 | |||
| 4ba8af03d2 | |||
| 228a6daa59 | |||
| 12e14e3c9b | |||
| a5ffd8315e | |||
| c2a5c12794 | |||
| ccc5154b93 | |||
| e576f98cce | |||
| b023d44934 | |||
| ad00860fa1 | |||
| b214e7ab17 | |||
| 7f951d9ee5 | |||
| 821fa01edb | |||
| 514570062c | |||
| beddfee087 | |||
| 60d2876725 | |||
| ab4ee4b34a | |||
| 8cc2bc9087 | |||
|
|
523a59f864 | ||
| 98202ac3f2 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -11,4 +11,7 @@
|
||||
__marimo__
|
||||
__pycache__/
|
||||
|
||||
data/
|
||||
data/
|
||||
docker-volumes/
|
||||
logs/
|
||||
|
||||
|
||||
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python Debugger: Current File",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal"
|
||||
}
|
||||
]
|
||||
}
|
||||
114
01_Taguette-Pre-Process.py
Normal file
114
01_Taguette-Pre-Process.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# Marimo notebook: convert raw Taguette CSV transcripts to Markdown.
import marimo

# Version of marimo that generated this notebook (managed by marimo on save).
__generated_with = "0.18.3"
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports for the notebook. `pandas` was imported here before but
    # never used by any cell (it was not in the return tuple), so it is gone.
    import marimo as mo
    from pathlib import Path

    from utils import csv_to_markdown, cpc_smb_to_markdown
    return Path, cpc_smb_to_markdown, csv_to_markdown, mo
|
||||
|
||||
|
||||
@app.cell
def _(Path):
    # Source CSVs exported from the transcription step.
    INPUT_DIR = Path("data/transcripts/raw")
    # Cleaned Markdown output; created on first run if missing.
    OUTPUT_DIR = Path("data/transcripts/clean")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, OUTPUT_DIR
|
||||
|
||||
|
||||
@app.cell
def _(INPUT_DIR, mo):
    # Offer every raw CSV transcript as a dropdown choice, keyed by file stem
    # (friendly label) mapping to the full path string (the selected value).
    csv_paths = list(INPUT_DIR.glob("*.csv"))
    dropdown_options = {path.stem: str(path) for path in csv_paths}

    file_dropdown = mo.ui.dropdown(
        options=dropdown_options,
        label="Select CSV Transcript",
        full_width=True
    )
    file_dropdown
    return (file_dropdown,)
|
||||
|
||||
|
||||
@app.cell
def _(Path, cpc_smb_to_markdown, csv_to_markdown):
    # NOTE: `Path` stays in the signature for marimo's dependency graph; the
    # previous unused local `fp = Path(filepath)` has been removed.
    def jpmc_transcript_to_md(filepath):
        """Convert a transcript CSV to Markdown, trying both known formats.

        Tries the generic CSV converter first and falls back to the CPC/SMB
        converter; raises ValueError carrying both errors if neither works.
        """
        try:
            return csv_to_markdown(filepath)
        except Exception as e:
            try:
                return cpc_smb_to_markdown(filepath)
            except Exception as e2:
                # Chain the fallback failure so both tracebacks survive.
                raise ValueError(f"Failed to process file {filepath} with errors: {e}, {e2}") from e2
    return (jpmc_transcript_to_md,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(file_dropdown, jpmc_transcript_to_md, mo):
    # Preview: show the first 1000 characters of the converted Markdown for
    # the currently selected file; renders nothing until a file is chosen.
    preview = mo.md("")
    if file_dropdown.value:
        md_content = jpmc_transcript_to_md(file_dropdown.value)
        preview = mo.md(md_content[:1000])

    preview
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # run_button only reports True on an explicit click, so the conversion
    # cell below does not re-run on unrelated UI updates.
    convert_btn = mo.ui.run_button(label="Convert to Markdown")
    convert_btn
    return (convert_btn,)
|
||||
|
||||
|
||||
@app.cell
def _(OUTPUT_DIR, Path, convert_btn, file_dropdown, jpmc_transcript_to_md, mo):
    # Convert the selected transcript and persist it under the source stem.
    result = mo.md("")
    saved_md = None

    if convert_btn.value and file_dropdown.value:
        saved_md = jpmc_transcript_to_md(file_dropdown.value)
        _out_path = OUTPUT_DIR / (Path(file_dropdown.value).stem + ".md")
        # Write UTF-8 explicitly: transcripts may contain characters outside
        # the platform's default locale encoding.
        _out_path.write_text(saved_md, encoding="utf-8")
        result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success")

    result
    return (saved_md,)
|
||||
|
||||
|
||||
@app.cell
def _(mo, saved_md):
    # Render the freshly saved Markdown, or nothing when no save happened yet.
    if saved_md:
        saved_preview = mo.vstack([
            mo.md("### Saved Markdown Preview"),
            mo.md(saved_md),
        ])
    else:
        saved_preview = mo.md("")
    saved_preview
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Static instructions pointing at the hosted Taguette instance.
    mo.md(r"""
    # Taguette

    Upload and process using taguette: http://taguette.tail44fa00.ts.net/
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Intentionally empty scratch cell (kept by marimo).
    return


if __name__ == "__main__":
    app.run()
|
||||
666
02-B_WordClouds.py
Normal file
666
02-B_WordClouds.py
Normal file
@@ -0,0 +1,666 @@
|
||||
# Marimo notebook: keyword extraction and word-cloud generation from
# Taguette highlight exports.
import marimo

# Version of marimo that generated this notebook (managed by marimo on save).
__generated_with = "0.18.3"
app = marimo.App(width="medium")
||||
|
||||
@app.cell
def _():
    # Notebook-wide imports and working-directory setup.
    # Dropped unused imports: `modin.pandas` and `datetime` were never
    # referenced in this cell nor exported through the return tuple.
    import marimo as mo
    import pandas as pd
    from tqdm import tqdm
    from pathlib import Path

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION = 'localhost'
    # VM_NAME = 'ollama-lite'

    # initialize tqdm for pandas (enables DataFrame.progress_apply)
    tqdm.pandas()

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
    VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'

    # exist_ok avoids the check-then-create race of the previous
    # `if not ...exists(): ...mkdir(...)` pattern.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
        VOICE_EXCLUDE_KEYWORDS_FILE.touch()

    return (
        OLLAMA_LOCATION,
        TAGUETTE_EXPORT_DIR,
        VOICE_EXCLUDE_KEYWORDS_FILE,
        WORKING_DIR,
        connect_qumo_ollama,
        mo,
        pd,
    )
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 1) Export Data out of Taguette
|
||||
|
||||
**Highlights**
|
||||
1. Go to: https://taguette.qumo.io/project/1
|
||||
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
|
||||
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
|
||||
**Tags Codebook**
|
||||
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
|
||||
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
|
||||
|
||||
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 2) Import Data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
|
||||
all_tags_df['_seq_id'] = range(len(all_tags_df))
|
||||
# all_tags_df
|
||||
return (all_tags_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df):
|
||||
# get count of rows per tag
|
||||
tag_counts = all_tags_df['tag'].value_counts().reset_index()
|
||||
# tag_counts
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
|
||||
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
|
||||
# codebook_df
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 3) Select Tag for processing
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(all_tags_df, mo):
|
||||
|
||||
|
||||
|
||||
tag_select = mo.ui.dropdown(
|
||||
options=all_tags_df['tag'].unique().tolist(),
|
||||
label="Select Tag to Process",
|
||||
# value="Chase as a brand",
|
||||
full_width=True,
|
||||
)
|
||||
tag_select
|
||||
return (tag_select,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn = None
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Keyword Extraction",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
|
||||
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
|
||||
|
||||
SAVE_DIR = WORKING_DIR / tag_fname
|
||||
|
||||
if not SAVE_DIR.exists():
|
||||
SAVE_DIR.mkdir(parents=True)
|
||||
|
||||
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
|
||||
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
|
||||
|
||||
# filter all_tags_df to only the document = file_dropdown.value
|
||||
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
||||
tags_df.head()
|
||||
return (
|
||||
KEYWORDS_FPATH,
|
||||
KEYWORD_FREQ_FPATH,
|
||||
SAVE_DIR,
|
||||
start_processing_btn,
|
||||
tag_fname,
|
||||
tags_df,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo):
|
||||
mo.md(rf"""
|
||||
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
|
||||
try:
|
||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||
model_select = mo.ui.dropdown(
|
||||
options=_models,
|
||||
value=_models[0],
|
||||
label="Select Ollama Model to use",
|
||||
searchable=True,
|
||||
)
|
||||
except Exception as e:
|
||||
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
|
||||
model_select = None
|
||||
client = None
|
||||
|
||||
model_select
|
||||
return client, model_select
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, model_select, start_processing_btn, tag_select):
|
||||
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # Bind `df` unconditionally: previously it was only assigned inside the
    # `client is not None` branch, so `return (df,)` raised NameError
    # whenever the Ollama client was unavailable.
    df = tags_df
    if client is not None:
        # Run keyword extraction per highlight row (tqdm progress bar).
        # NOTE(review): wrapping the result in pd.Series before assigning to a
        # single column looks suspicious — confirm ollama_keyword_extraction's
        # return type; downstream code treats each cell as an iterable of kws.
        df['keywords'] = df.progress_apply(
            lambda row: pd.Series(ollama_keyword_extraction(
                content=row['content'],
                tag=row['tag'],
                client=client,
                model=model_select.value
            )),
            axis=1
        )
    else:
        mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
    return (df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
    from collections import Counter

    mo.stop(not start_processing_btn.value, "Click button above to process first")

    # Human-readable keyword column for the per-highlight export.
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))

    # Flatten all per-row keyword lists into one list of keywords.
    all_keywords_flat = [kw for kws in df['keywords'].tolist() for kw in kws]

    # Counter replaces the hand-rolled `if kw in dict: += 1` loop; it keeps
    # first-seen insertion order just like the original dict did.
    keyword_freq = Counter(all_keywords_flat)

    freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
    freq_df.index.name = 'keyword'
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)

    # Save to Excel files
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        KEYWORDS_FPATH,
        index=False
    )

    freq_df.to_excel(
        KEYWORD_FREQ_FPATH,
        index=False
    )
    mo.vstack([
        mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
        mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
    ])
    return (freq_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo):
|
||||
mo.md(rf"""
|
||||
# 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
|
||||
if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
|
||||
pass
|
||||
|
||||
|
||||
load_existing_btn = None
|
||||
if KEYWORD_FREQ_FPATH.exists():
|
||||
load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
|
||||
|
||||
load_existing_btn
|
||||
return (load_existing_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||
freq_df,
|
||||
load_existing_btn,
|
||||
pd,
|
||||
tag_select,
|
||||
):
|
||||
if load_existing_btn is not None and load_existing_btn.value:
|
||||
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
||||
|
||||
# Drop nan rows if any
|
||||
_fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
|
||||
_fdf.sort_values(by='frequency', ascending=False, inplace=True)
|
||||
_fdf.reset_index(drop=True, inplace=True)
|
||||
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
||||
|
||||
frequency_df = _fdf
|
||||
|
||||
else:
|
||||
frequency_df = freq_df
|
||||
|
||||
if tag_select.value.startswith('V'):
|
||||
# Read exclusion list
|
||||
excl_kw = []
|
||||
with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
|
||||
for line in _f:
|
||||
excl_kw.append(line.strip())
|
||||
|
||||
_drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
|
||||
|
||||
frequency_df.drop(index=_drop_idx, inplace=True, axis=0)
|
||||
print(f"Dropped {len(_drop_idx)} keywords automatically")
|
||||
return (frequency_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 5) Wordcloud generation
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
# Import all necessary libraries
|
||||
import numpy as np
|
||||
from os import path
|
||||
from PIL import Image, ImageDraw
|
||||
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
||||
import matplotlib.pyplot as plt
|
||||
from utils import blue_color_func
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
return Image, ImageDraw, WordCloud, blue_color_func, np, plt
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## 5.1) Select threshold frequency
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
|
||||
min_freq_select
|
||||
return (min_freq_select,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, tag_select):
|
||||
mo.md(rf"""
|
||||
## 5.2) Inspect Keyword Dataset
|
||||
|
||||
1. Check the threshold is set correctly. If not, adjust accordingly
|
||||
2. Read all the keywords and verify they are good. If not
|
||||
- Add explicit exclusions if necessary below
|
||||
- OR Rerun the keyword extraction above
|
||||
|
||||
|
||||
|
||||
Add words to this dict that should be ignored in the WordCloud for specific tags.
|
||||
Make sure to create the correct key that matches the active selected tag:
|
||||
|
||||
Active selected tag = '`{tag_select.value.lower()}`'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(frequency_df, min_freq_select, mo):
|
||||
mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
MIN_FREQ = min_freq_select.value
|
||||
|
||||
_freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
|
||||
table_selection
|
||||
|
||||
return MIN_FREQ, table_selection
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, table_selection):
|
||||
remove_rows_btn = None
|
||||
if len(table_selection.value) >0 :
|
||||
remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
|
||||
|
||||
remove_rows_btn
|
||||
return (remove_rows_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||
frequency_df,
|
||||
mo,
|
||||
remove_rows_btn,
|
||||
table_selection,
|
||||
tag_select,
|
||||
):
|
||||
_s = None
|
||||
if remove_rows_btn is not None and remove_rows_btn.value:
|
||||
# get selected rows
|
||||
selected_rows = table_selection.value
|
||||
if len(selected_rows) >0 :
|
||||
rows_to_drop = table_selection.value.index.tolist()
|
||||
try:
|
||||
if tag_select.value.startswith('V'):
|
||||
# append values to an VoiceKeywordsExclusion file (txt file just a list of keywords)
|
||||
exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
|
||||
|
||||
with VOICE_EXCLUDE_KEYWORDS_FILE.open('w') as f:
|
||||
for _kw in exclude_keywords:
|
||||
f.write(_kw + '\n')
|
||||
|
||||
|
||||
|
||||
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
||||
|
||||
|
||||
|
||||
except KeyError:
|
||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
||||
else:
|
||||
# Save updated frequencies back to xlsx
|
||||
frequency_df.to_excel(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
index=False
|
||||
)
|
||||
|
||||
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||
|
||||
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
|
||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')
|
||||
|
||||
_s
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
IGNORE_WORDS = {
|
||||
'chase as a brand': [
|
||||
"brand",
|
||||
"banking experience",
|
||||
"banking",
|
||||
"chase",
|
||||
"jpmorgan",
|
||||
"youthful",
|
||||
"customer service",
|
||||
"customer service focused",
|
||||
"great brand",
|
||||
],
|
||||
'why customer chase': [
|
||||
"customer service",
|
||||
"customer loyalty",
|
||||
"chase",
|
||||
"chase customer",
|
||||
"banking experience",
|
||||
],
|
||||
'chase as a person (personification)': [
|
||||
"CPC1"
|
||||
]
|
||||
# <active-selected-tag>: [list, of, words, to, ignore]
|
||||
}
|
||||
return (IGNORE_WORDS,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
buffer = -100 # Adjust this to increase/decrease space between logo and words
|
||||
canvas_size = (1200, 800)
|
||||
|
||||
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
||||
|
||||
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
|
||||
return buffer, canvas_size, logo_switch, n_words
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(logo_switch, mo, n_words):
|
||||
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
|
||||
|
||||
mo.vstack([
|
||||
mo.md("## 5.4) Generate WordCloud with/without Logo"),
|
||||
mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
|
||||
|
||||
Placement and color of words is randomized, size is proportional to frequency.
|
||||
|
||||
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
|
||||
mo.md('---'),
|
||||
mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
|
||||
)
|
||||
return (run_wordcloud_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
IGNORE_WORDS,
|
||||
Image,
|
||||
ImageDraw,
|
||||
MIN_FREQ,
|
||||
WordCloud,
|
||||
blue_color_func,
|
||||
buffer,
|
||||
canvas_size,
|
||||
frequency_df,
|
||||
logo_switch,
|
||||
mo,
|
||||
n_words,
|
||||
np,
|
||||
plt,
|
||||
run_wordcloud_btn,
|
||||
tag_select,
|
||||
):
|
||||
if run_wordcloud_btn.value:
|
||||
pass
|
||||
|
||||
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
# freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||
|
||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||
|
||||
# remove specific keywords depending on selected tag
|
||||
if IGNORE_WORDS.get(tag_select.value.lower()):
|
||||
for word in IGNORE_WORDS[tag_select.value.lower()]:
|
||||
if word in keyword_freq_filtered:
|
||||
del keyword_freq_filtered[word]
|
||||
|
||||
if logo_switch.value:
|
||||
# 1. Load the logo
|
||||
# Make sure this path points to your uploaded file
|
||||
logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
|
||||
logo = Image.open(logo_path).convert("RGBA")
|
||||
|
||||
# Optional: Resize logo if it's too large or small for the canvas
|
||||
# target_width = 600
|
||||
# ratio = target_width / logo.width
|
||||
# logo = logo.resize((target_width, int(logo.height * ratio)), Image.Resampling.LANCZOS)
|
||||
target_width = 600 # Set a reasonable size for the logo
|
||||
if logo.width > target_width:
|
||||
ratio = target_width / logo.width
|
||||
new_height = int(logo.height * ratio)
|
||||
# Use Image.Resampling.LANCZOS for high-quality downsampling
|
||||
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS
|
||||
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
# 3. Create the mask (0 = draw here, 255 = don't draw here)
|
||||
# Initialize with 0 (black/draw everywhere)
|
||||
mask_image = Image.new("L", canvas_size, 0)
|
||||
draw = ImageDraw.Draw(mask_image)
|
||||
|
||||
# 4. Draw a protected circular area in the center
|
||||
center = (canvas_size[0] // 2, canvas_size[1] // 2)
|
||||
|
||||
# Calculate radius: half of logo max dimension + buffer
|
||||
radius = (max(logo.size) // 2) + buffer
|
||||
|
||||
# Draw the white circle (255) which the WordCloud will avoid
|
||||
draw.ellipse(
|
||||
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
|
||||
fill=255
|
||||
)
|
||||
|
||||
chase_mask = np.array(mask_image)
|
||||
|
||||
# Generate the WordCloud
|
||||
wordcloud = WordCloud(
|
||||
background_color='white',
|
||||
width=canvas_size[0],
|
||||
height=canvas_size[1],
|
||||
max_font_size=100, # Increased font size for larger canvas
|
||||
max_words=n_words.value, # Increased word count to fill space
|
||||
color_func=blue_color_func,
|
||||
mask=chase_mask, # Apply the circular mask
|
||||
contour_width=0,
|
||||
contour_color='steelblue'
|
||||
).generate_from_frequencies(keyword_freq_filtered)
|
||||
|
||||
else:
|
||||
# Generate the WordCloud
|
||||
wordcloud = WordCloud(
|
||||
background_color='white',
|
||||
width=canvas_size[0],
|
||||
height=canvas_size[1],
|
||||
max_font_size=150, # Increased font size for larger canvas
|
||||
max_words=n_words.value, # Increased word count to fill space
|
||||
color_func=blue_color_func,
|
||||
# mask=chase_mask, # Apply the circular mask
|
||||
# contour_width=0,
|
||||
# contour_color='steelblue'
|
||||
).generate_from_frequencies(keyword_freq_filtered)
|
||||
|
||||
# Convert WordCloud to Image to composite the logo
|
||||
wc_image = wordcloud.to_image()
|
||||
|
||||
if logo_switch.value:
|
||||
|
||||
# Calculate position to center the logo
|
||||
logo_pos = (
|
||||
(canvas_size[0] - logo.width) // 2,
|
||||
(canvas_size[1] - logo.height) // 2
|
||||
)
|
||||
|
||||
# Paste logo (using alpha channel as mask to keep transparency)
|
||||
wc_image.paste(logo, logo_pos, logo)
|
||||
|
||||
# Display the generated image
|
||||
fig = plt.figure(figsize=(7,7))
|
||||
|
||||
# Display the generated image:
|
||||
plt.imshow(wc_image, interpolation='bilinear')
|
||||
plt.axis("off")
|
||||
plt.show()
|
||||
|
||||
save_wordcloud_btn = None
|
||||
save_wordcloud_btn = mo.ui.button(
|
||||
label="Save WordCloud to File",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
save_wordcloud_btn
|
||||
return save_wordcloud_btn, wc_image
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
    # Persist the current wordcloud image; each save gets a fresh filename.
    # Wait for start processing button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")

    filename = f'wordcloud_{tag_fname}.png'

    # Default (first-save) path has no numeric suffix.
    fpath = SAVE_DIR / filename

    # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
    existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
    if existing_files:
        existing_numbers = []
        for ef in existing_files:
            parts = ef.stem.split('_')
            # Numbered variants look like wordcloud_<tag>_<n>; the bare first
            # save has no trailing number and is skipped by the isdigit check.
            # NOTE(review): if tag_fname itself contains '_' the len(parts)>2
            # guard is looser than intended — confirm tag names.
            if len(parts) > 2 and parts[-1].isdigit():
                existing_numbers.append(int(parts[-1]))
        if existing_numbers:
            next_number = max(existing_numbers) + 1
        else:
            next_number = 1
        fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'

    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
461
02_Taguette_Post-Process.py
Normal file
461
02_Taguette_Post-Process.py
Normal file
@@ -0,0 +1,461 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Notebook-wide imports, Ollama connection, and working directories.
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION= 'localhost'
    # VM_NAME = 'ollama-lite'

    # NOTE(review): unlike 02-B_WordClouds, this connect call has no
    # try/except — an unreachable Ollama server makes this whole cell (and
    # thus the notebook) fail. Consider mirroring the guarded connect there.
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)

    # Dropdown of the models reported by the server; first model preselected.
    model_select = mo.ui.dropdown(
        options=_models,
        value=_models[0],
        label="Select Ollama Model to use",
        searchable=True,
    )
    model_select
    return (
        TAGUETTE_EXPORT_DIR,
        WORKING_DIR,
        client,
        datetime,
        mo,
        model_select,
        pd,
    )
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(TAGUETTE_EXPORT_DIR, mo):
|
||||
mo.md(rf"""
|
||||
# Step 1: Export Data out of Taguette
|
||||
|
||||
**Highlights**
|
||||
1. Go to: https://taguette.qumo.io/project/1
|
||||
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
|
||||
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
|
||||
**Tags Codebook**
|
||||
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
|
||||
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
|
||||
|
||||
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 2: Import here for processing
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
|
||||
all_tags_df['_seq_id'] = range(len(all_tags_df))
|
||||
all_tags_df
|
||||
return (all_tags_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
|
||||
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
|
||||
codebook_df
|
||||
return (codebook_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 3: Process each 'Interview'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df, mo):
|
||||
|
||||
interview_select = mo.ui.dropdown(
|
||||
options=all_tags_df['document'].unique().tolist(),
|
||||
label="Select Interview to Process",
|
||||
full_width=True
|
||||
)
|
||||
interview_select
|
||||
return (interview_select,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df, interview_select, mo):
|
||||
mo.stop(not interview_select.value, mo.md("Select interview to continue"))
|
||||
# filter all_tags_df to only the document = file_dropdown.value
|
||||
df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy()
|
||||
return (df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Add `_context` column to track Voice / Character is being referred to per highlight
|
||||
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
|
||||
|
||||
1. Iterates through the dataframe in document order (row by row)
|
||||
2. Uses a set to track which highlight IDs we've already processed
|
||||
3. When we encounter a new highlight ID for the first time, we process all its rows
|
||||
4. Collects all _V- or _C- tags within that highlight
|
||||
5. Assigns the context to all rows with that ID
|
||||
6. This preserves document order and handles multi-tag highlights correctly
|
||||
|
||||
|
||||
Example of challenging case:
|
||||
|
||||
| tag | content | _seq_id | _context |
|
||||
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
|
||||
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
|
||||
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
|
||||
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
|
||||
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(df):
    # Derive `_context` (which Voice/Character a highlight refers to) by
    # scanning the document in row order: each highlight id inherits the
    # _V-/_C- tags found within its own rows, or carries the last seen
    # context forward when it has none. Order-dependent — do not re-sort df
    # before this cell.
    # First pass: identify context tags within each highlight group
    df['_context'] = None
    last_context = None
    processed_ids = set()

    # Process in document order
    for idx, row in df.iterrows():
        highlight_id = row['id']

        # If we haven't processed this highlight yet
        if highlight_id not in processed_ids:
            processed_ids.add(highlight_id)

            # Get all rows for this highlight
            highlight_rows = df[df['id'] == highlight_id]

            # Collect all context tags in this highlight
            context_tags = []
            for _, h_row in highlight_rows.iterrows():
                tag = h_row.get('tag', '')
                if '_V-' in tag or '_C-' in tag:
                    context_tags.append(tag)

            # If we found context tags, join them with comma
            if context_tags:
                context_tag = ', '.join(context_tags)
                last_context = context_tag
            else:
                # If no context tag in this highlight, use the last context
                # (None for highlights before the first context tag appears).
                context_tag = last_context

            # Assign the context to all rows in this highlight
            df.loc[df['id'] == highlight_id, '_context'] = context_tag

    df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Split multi-context rows (only VT- and CT- theme tags)
|
||||
|
||||
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
|
||||
- split these into separate rows for each context.
|
||||
- Then mark these for 'manual_analysis'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(df, pd):
    """Split rows whose '_context' holds several comma-separated contexts.

    Each multi-context row is cloned once per context and flagged
    ``manual_analysis=True`` (sentiment cleared on VT-/CT- theme tags so it
    can be re-scored per context). Single-context rows pass through with
    ``manual_analysis=False``. Only VT-/CT- theme rows are kept in the
    returned ``sentiment_df``.
    """
    split_rows = []

    for _, source_row in df.iterrows():
        raw_context = source_row['_context']

        if pd.notna(raw_context) and ',' in str(raw_context):
            # One clone per context, each marked for manual review.
            for single_context in (part.strip() for part in str(raw_context).split(',')):
                clone = source_row.copy()
                clone['_context'] = single_context
                clone['manual_analysis'] = True
                if str(clone['tag']).startswith(('VT -', 'CT -')):
                    clone['sentiment'] = None
                split_rows.append(clone)
        else:
            clone = source_row.copy()
            clone['_context'] = raw_context
            clone['manual_analysis'] = False
            split_rows.append(clone)

    expanded_df_raw = pd.DataFrame(split_rows).reset_index(drop=True)

    theme_mask = expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    sentiment_df = expanded_df_raw.loc[theme_mask].copy()

    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")

    sentiment_df[sentiment_df['manual_analysis']]
    return (sentiment_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Create 'theme' column
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(sentiment_df):
    """Add a 'theme' column derived from each row's raw tag."""
    from utils import extract_theme

    # extract_theme only needs the tag value, so apply over the single
    # column instead of the original axis=1 row-wise apply, which built a
    # Series per row for no benefit.
    sentiment_df['theme'] = sentiment_df['tag'].apply(extract_theme)
    sentiment_df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Extract Sentiment + Reasoning
|
||||
|
||||
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
|
||||
|
||||
Values to be used:
|
||||
- Positive: +1
|
||||
- Neutral: 0
|
||||
- Negative: -1
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Sentiment Extraction",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
start_processing_btn
|
||||
return (start_processing_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
):
    """Run LLM sentiment extraction over the auto-analyzable theme rows.

    Writes 'keywords', 'sentiment' and 'reason' columns into sentiment_df.
    Rows flagged manual_analysis are skipped here and handled by the
    manual-review cells below.
    """
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

    # add theme_description to be used in LLM prompt.
    # NOTE(review): pd.merge returns a frame with a fresh RangeIndex, while
    # sentiment_df may carry a filtered (non-contiguous) index from the
    # upstream split cell — the index-aligned assignment below depends on
    # the two matching row-for-row. Also assumes 'tag' is unique in
    # codebook_df (a one-to-many merge would duplicate rows). Confirm both.
    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))

    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # Only non-manual rows are scored; manual_analysis rows end up NaN in
    # the three new columns and get values during manual review.
    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['theme'],
            theme_description=row['theme_description'],
            client=client,
            model=model_select.value
        )),
        axis=1
    )
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
|
||||
mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
|
||||
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Multi-context tags
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
    """Surface multi-context split rows and build a data editor for them."""
    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
    split_rows_editor = None
    rows_to_edit = []

    if not manual_rows.empty:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )

        # Rows needing review: those flagged manual_analysis. (No extra tag
        # filter is required: sentiment_df was already restricted to
        # 'VT -'/'CT -' theme tags by the upstream split cell.)
        rows_to_edit = sentiment_df[
            (sentiment_df['manual_analysis'])
        ]

        # Editable grid wrapped in a form — edits only land on Submit.
        split_rows_editor = mo.ui.data_editor(
            rows_to_edit
        ).form(label="Update Sentiment / Manual Flag")

    else:
        print("✓ No multi-context rows found")
    return rows_to_edit, split_rows_editor
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(split_rows_editor):
|
||||
split_rows_editor
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, rows_to_edit, split_rows_editor):
    """Show the review instructions plus the split-rows editor.

    marimo renders only a cell's final top-level expression; the original
    code left ``mo.vstack(...)`` as a bare expression *inside* the ``if``
    block, so the UI was never actually displayed. Build the output first,
    then end the cell with it.
    """
    _review_ui = None
    if split_rows_editor is not None:
        _review_ui = mo.vstack([
            mo.md(f"""
    ### ⚠️ Manual Review Required

    **{len(rows_to_edit)} rows** were split from multi-context entries.
    Please review them below:
    1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
    2. Click **Submit** to apply changes.
    """),
            split_rows_editor
        ])
    _review_ui
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, split_rows_editor):
    """Validate the submitted manual-review edits.

    Sentinel semantics: when no multi-context rows existed,
    split_rows_editor is None and getattr() yields '' (empty string),
    which deliberately bypasses both the mo.stop gate and the validation
    block below.
    """
    # Capture the edited manual-analysis rows for validation
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
    # .form() exposes value=None until the user presses Submit.
    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))

    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1

    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
        valid_sentiments = {-1, 0, 1}
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        # NOTE(review): assert statements are stripped under `python -O`;
        # raise ValueError instead if this check must always run.
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"

        print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Recombine
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, reviewed_manual_rows, sentiment_df):
    """Re-merge auto-scored rows with the manually reviewed split rows.

    Falls back to the untouched ``sentiment_df`` when no edited DataFrame
    was submitted; otherwise rows are restored to document order via
    '_seq_id'.
    """
    if not isinstance(reviewed_manual_rows, pd.DataFrame):
        recombined_df = sentiment_df
    else:
        _auto_rows = sentiment_df[~sentiment_df['manual_analysis']]
        recombined_df = (
            pd.concat([_auto_rows, reviewed_manual_rows])
            .sort_values(by='_seq_id')
            .reset_index(drop=True)
        )

    recombined_df
    return (recombined_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 3: Process 'Other' tags
|
||||
|
||||
These need to be reviewed manually for interesting content
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Save to CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    """Persist the recombined sentiment rows as a timestamped CSV.

    Fixes two defects: ``timestamp`` was computed but never used, even
    though the downstream notebook (03_Sentiment_Analysis) parses the
    timestamp back out of the filename via ``split('_')[-1]``; and the
    confirmation print emitted a literal placeholder instead of the path.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    # Keep the timestamp as the LAST underscore-separated component so the
    # downstream split('_')[-1] / split('_')[0] parsing works.
    doc_name = interview_select.value.split(' ')[0]
    filename = WORKING_DIR / f"{doc_name}_sentiments_{timestamp}.csv"
    recombined_df.to_csv(filename, index=False)

    print(f"✓ Saved processed data to '{filename}'")
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
146
03_Sentiment_Analysis.py
Normal file
146
03_Sentiment_Analysis.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
    """Notebook setup: imports plus the input and working directories."""
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from utils import create_sentiment_matrix

    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # Idempotent: creates the directory tree only when it is missing.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Load Sentiment CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR, mo):
|
||||
csv_files = list(INPUT_DIR.glob("*.csv"))
|
||||
file_options = {f.stem: str(f) for f in csv_files}
|
||||
|
||||
sentiment_csv = mo.ui.dropdown(
|
||||
options=file_options,
|
||||
label="Select Sentiment CSV File",
|
||||
full_width=True
|
||||
)
|
||||
sentiment_csv
|
||||
return (sentiment_csv,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Path, pd, sentiment_csv):
|
||||
input_csv_name = Path(sentiment_csv.value).stem
|
||||
timestamp = input_csv_name.split('_')[-1]
|
||||
doc = input_csv_name.split('_')[0]
|
||||
|
||||
sentiment_df = pd.read_csv(sentiment_csv.value)
|
||||
sentiment_df
|
||||
return doc, sentiment_df, timestamp
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Phase 1: Individual interview analysis
|
||||
- Create sentiment matrices for each interview (document)
|
||||
- Save the intermediate results to file in the `WORKING_DIR`
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.1: Voice Sample vs. Theme Sentiment Matrix
|
||||
|
||||
For each interview (document), create a matrix where:
|
||||
- Rows represent the different Voices (based on '_V-' tags)
|
||||
- Columns represent the different VoiceThemes(based on 'VT -' tags)
|
||||
- Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(create_sentiment_matrix, sentiment_df):
|
||||
voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')
|
||||
voice_matrix
|
||||
return (voice_matrix,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
SAVE TO CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, doc, timestamp, voice_matrix):
|
||||
# Save to CSV
|
||||
voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
|
||||
|
||||
voice_matrix.to_csv(voice_filename)
|
||||
|
||||
print(f"Saved to '{voice_filename}'")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.2: Character Sample vs. Theme Sentiment Matrix
|
||||
|
||||
For each interview (document), create a matrix where:
|
||||
- Rows represent the different Characters (based on '_C-' tags)
|
||||
- Columns represent the different CharacterThemes (based on 'CT -' tags)
|
||||
- Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(create_sentiment_matrix, sentiment_df):
|
||||
character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-')
|
||||
character_matrix
|
||||
return (character_matrix,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, character_matrix, doc, timestamp):
|
||||
# Save to CSV
|
||||
character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
|
||||
|
||||
character_matrix.to_csv(character_filename)
|
||||
|
||||
print(f"Saved to '{character_filename}'")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.3: Chase Brand Sentiment
|
||||
|
||||
TODO: not sure we have enough supporting data for this yet
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
86
04_Results_Aggregation.py
Normal file
86
04_Results_Aggregation.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
INPUT_DIR = Path("./data/processing/03_sentiment_analysis")
|
||||
WORKING_DIR = Path('./data/processing/04_sentiment_aggregation')
|
||||
|
||||
if not WORKING_DIR.exists():
|
||||
WORKING_DIR.mkdir(parents=True)
|
||||
return INPUT_DIR, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Voices
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR, mo):
|
||||
voice_csv_files = list(INPUT_DIR.glob("*voice*.csv"))
|
||||
file_options = {f.stem: str(f) for f in voice_csv_files}
|
||||
|
||||
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
|
||||
|
||||
return (voice_multiselect,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, voice_multiselect):
|
||||
mo.hstack([voice_multiselect, mo.md(f"Has value: {voice_multiselect.value}")])
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, voice_multiselect):
    """Aggregate the selected per-interview voice matrices cell-by-cell."""
    KEY_COL = "_context"

    def _load_matrix(csv_path: str) -> pd.DataFrame:
        # Index by context so .add() aligns rows across files.
        matrix = pd.read_csv(csv_path).set_index(KEY_COL)
        return matrix.apply(pd.to_numeric, errors="coerce")

    def aggregate_voice_data(files: list[str]) -> pd.DataFrame:
        if not files:
            return pd.DataFrame()

        remaining = iter(files)
        total = _load_matrix(next(remaining))
        for extra_path in remaining:
            # fill_value=0 keeps rows/columns present in only one file.
            total = total.add(_load_matrix(extra_path), fill_value=0)

        return total.reset_index()

    master_df = aggregate_voice_data(voice_multiselect.value)
    master_df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Characters
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR):
|
||||
char_csv_files = list(INPUT_DIR.glob("*character*.csv"))
|
||||
char_csv_files
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -78,7 +78,9 @@ def _(mo):
|
||||
|
||||
**Goal:** Convert unstructured text into a structured dataset.
|
||||
|
||||
1. **Input:** All 26 Transcripts + `master_codebook.json`.
|
||||
This will be a dedicated notebook, and be run per transcript.
|
||||
|
||||
1. **Input:** Transcript + `master_codebook.json`.
|
||||
2. **Process:**
|
||||
* The LLM analyzes each transcript segment-by-segment.
|
||||
* It extracts specific quotes that match a Theme Definition.
|
||||
@@ -86,8 +88,9 @@ def _(mo):
|
||||
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
||||
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
||||
* **Sentiment:** Positive / Neutral / Negative.
|
||||
3. **Output:** `coded_segments.csv`
|
||||
3. **Output:** `<transcript_name>_coded_segments.csv`
|
||||
* Columns: `Source_File`, `Speaker`, `Theme`, `Quote`, `Subject`, `Sentiment`, `Context`.
|
||||
* Each transcript produces its own CSV-file, which can be reviewed and adjusted before moving to the next stage
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
226
Stage1_Theme_Discovery.py
Normal file
226
Stage1_Theme_Discovery.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.1"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import json
|
||||
import pandas as pd
|
||||
import re
|
||||
from pathlib import Path
|
||||
from utils import connect_qumo_ollama, load_srt
|
||||
|
||||
# Configuration
|
||||
VM_NAME = 'hiperf-gpu'
|
||||
MODEL = 'llama3.3:70b'
|
||||
TRANSCRIPT_DIR = Path("data/transcripts")
|
||||
OUTPUT_FILE = Path("master_codebook.json")
|
||||
|
||||
client = connect_qumo_ollama(VM_NAME)
|
||||
return (
|
||||
MODEL,
|
||||
OUTPUT_FILE,
|
||||
TRANSCRIPT_DIR,
|
||||
client,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
re,
|
||||
)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Stage 1: Theme Discovery
|
||||
|
||||
**Goal:** Identify recurring themes across a sample of interviews.
|
||||
|
||||
1. **Select Transcripts:** Choose 4-5 representative interviews.
|
||||
2. **Extract Topics:** The AI will analyze each transcript to find key topics.
|
||||
3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
|
||||
4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TRANSCRIPT_DIR, mo):
|
||||
# File Selection
|
||||
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
|
||||
file_options = {f.name: str(f) for f in srt_files}
|
||||
|
||||
file_selector = mo.ui.multiselect(
|
||||
options=file_options,
|
||||
label="Select Transcripts (Recommended: 4-5)",
|
||||
full_width=True
|
||||
)
|
||||
file_selector
|
||||
return (file_selector,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(file_selector, mo):
|
||||
mo.md(f"**Selected:** {len(file_selector.value)} files")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
start_discovery_btn = mo.ui.run_button(label="Start Discovery Process")
|
||||
start_discovery_btn
|
||||
return (start_discovery_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
MODEL,
|
||||
client,
|
||||
file_selector,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
re,
|
||||
start_discovery_btn,
|
||||
):
|
||||
# Map Phase: Extract Topics per Transcript
|
||||
extracted_topics = []
|
||||
status_callout = mo.md("")
|
||||
|
||||
if start_discovery_btn.value and file_selector.value:
|
||||
with mo.status.spinner("Analyzing transcripts...") as _spinner:
|
||||
for filepath in file_selector.value:
|
||||
_transcript = load_srt(filepath)
|
||||
|
||||
# Truncate for discovery if too long (optional, but good for speed)
|
||||
# Using first 15k chars usually gives enough context for high-level themes
|
||||
_context = _transcript[:15000]
|
||||
|
||||
_prompt = f"""
|
||||
Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
|
||||
Focus on: Brand voice, Customer experience, Design systems, and AI.
|
||||
|
||||
Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]
|
||||
|
||||
Transcript:
|
||||
{_context}...
|
||||
"""
|
||||
|
||||
try:
|
||||
_response = client.generate(model=MODEL, prompt=_prompt)
|
||||
# Find JSON list in response
|
||||
_match = re.search(r'\[.*\]', _response.response, re.DOTALL)
|
||||
if _match:
|
||||
_topics = json.loads(_match.group(0))
|
||||
extracted_topics.extend(_topics)
|
||||
except Exception as e:
|
||||
print(f"Error processing {filepath}: {e}")
|
||||
|
||||
status_callout = mo.callout(
|
||||
f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
|
||||
kind="success"
|
||||
)
|
||||
elif start_discovery_btn.value:
|
||||
status_callout = mo.callout("Please select at least one file.", kind="warn")
|
||||
|
||||
status_callout
|
||||
return (extracted_topics,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    """Reduce phase: synthesize the raw per-transcript topics into themes.

    Produces ``suggested_themes``: a list of dicts with keys
    "Theme" / "Definition" / "Color" (including a mandatory "Other"
    theme), or an empty list until discovery has run.
    """
    suggested_themes = []

    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook...") as _spinner:
            _topics_str = ", ".join(extracted_topics)

            _synthesis_prompt = f"""
    You are a qualitative data architect.

    I have a list of raw topics extracted from multiple interviews:
    [{_topics_str}]

    Task:
    1. Group these into 5-8 distinct, high-level Themes.
    2. Create a definition for each theme.
    3. Assign a hex color code to each.
    4. ALWAYS include a theme named "Other" for miscellaneous insights.

    Return a JSON object with this structure:
    [
        {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
        ...
    ]
    """

            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)

            # Grab the first JSON-array-looking span in the model reply.
            _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
            if _match:
                try:
                    suggested_themes = json.loads(_match.group(0))
                except json.JSONDecodeError:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit). Surface the raw reply
                    # so the failure is debuggable in the editor below.
                    suggested_themes = [{"Theme": "Error parsing JSON", "Definition": _response.response, "Color": "#000000"}]

    return (suggested_themes,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, pd, suggested_themes):
|
||||
# Interactive Editor
|
||||
|
||||
# Default empty structure if nothing generated yet
|
||||
_initial_data = suggested_themes if suggested_themes else [
|
||||
{"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
|
||||
]
|
||||
|
||||
df_themes = pd.DataFrame(_initial_data)
|
||||
|
||||
theme_editor = mo.ui.data_editor(
|
||||
df_themes,
|
||||
label="Master Codebook Editor",
|
||||
column_config={
|
||||
"Color": mo.ui.column.color_picker(label="Color")
|
||||
},
|
||||
num_rows="dynamic" # Allow adding/removing rows
|
||||
)
|
||||
|
||||
mo.vstack([
|
||||
mo.md("### Review & Refine Codebook"),
|
||||
mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
|
||||
theme_editor
|
||||
])
|
||||
return (theme_editor,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(OUTPUT_FILE, json, mo, theme_editor):
|
||||
save_btn = mo.ui.run_button(label="Save Master Codebook")
|
||||
|
||||
save_message = mo.md("")
|
||||
|
||||
if save_btn.value:
|
||||
_final_df = theme_editor.value
|
||||
# Convert to list of dicts
|
||||
_codebook = _final_df.to_dict(orient="records")
|
||||
|
||||
with open(OUTPUT_FILE, "w") as f:
|
||||
json.dump(_codebook, f, indent=2)
|
||||
|
||||
save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")
|
||||
|
||||
mo.vstack([
|
||||
save_btn,
|
||||
save_message
|
||||
])
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
212
Stage2_Structured_Theme_Coding.py
Normal file
212
Stage2_Structured_Theme_Coding.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.0"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import json
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from utils import connect_qumo_ollama, load_srt
|
||||
|
||||
# Configuration
|
||||
CODEBOOK_PATH = Path("data/labels/master_codebook.json")
|
||||
TRANSCRIPT_DIR = Path("data/transcripts")
|
||||
OUTPUT_DIR = Path("data/labeled_transcripts")
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Initialize LLM Client
|
||||
client = connect_qumo_ollama("hiperf-gpu")
|
||||
return (
|
||||
CODEBOOK_PATH,
|
||||
OUTPUT_DIR,
|
||||
Path,
|
||||
TRANSCRIPT_DIR,
|
||||
client,
|
||||
connect_qumo_ollama,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Stage 2: Structured Theme Coding
|
||||
|
||||
**Goal:** Extract specific quotes for defined themes from full transcripts.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(CODEBOOK_PATH, json, mo):
    """Load the master codebook (list of theme dicts); empty if absent.

    Fixes a NameError: the original only bound ``f`` inside the
    ``if CODEBOOK_PATH.exists()`` branch, so the missing-file path crashed
    on ``return codebook, f``. ``f`` stays in the return tuple (marimo
    exposes returned names to other cells) and is None when no file exists;
    when present it is the already-closed file handle, as before.
    """
    codebook = []
    f = None
    if CODEBOOK_PATH.exists():
        with open(CODEBOOK_PATH, "r") as f:
            codebook = json.load(f)

    mo.md(f"**Loaded {len(codebook)} Themes from Codebook**")
    return codebook, f
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TRANSCRIPT_DIR, mo):
|
||||
# File Selector
|
||||
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
|
||||
file_dropdown = mo.ui.dropdown(
|
||||
options={f.name: str(f) for f in srt_files},
|
||||
label="Select Transcript"
|
||||
)
|
||||
file_dropdown
|
||||
return file_dropdown, srt_files
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
run_btn = mo.ui.run_button(label="Start Analysis")
|
||||
run_btn
|
||||
return (run_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
OUTPUT_DIR,
|
||||
client,
|
||||
codebook,
|
||||
file_dropdown,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
run_btn,
|
||||
):
|
||||
# Analysis Logic
|
||||
results = []
|
||||
status_message = mo.md("")
|
||||
|
||||
if run_btn.value and file_dropdown.value:
|
||||
transcript_path = file_dropdown.value
|
||||
transcript_name = file_dropdown.selected_key.replace(".srt", "")
|
||||
transcript_text = load_srt(transcript_path)
|
||||
|
||||
with mo.status.progress_bar(codebook, title="Analyzing Themes") as bar:
|
||||
for theme in bar:
|
||||
theme_name = theme["name"]
|
||||
theme_def = theme["definition"]
|
||||
|
||||
prompt = f"""
|
||||
You are a qualitative data analyst. Analyze the following transcript for the theme: "{theme_name}".
|
||||
Definition: {theme_def}
|
||||
|
||||
Extract ALL relevant quotes that match this definition.
|
||||
For each quote, identify the specific Subject and the Sentiment (Positive, Neutral, Negative).
|
||||
|
||||
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
|
||||
If no quotes are found, return an empty array [].
|
||||
|
||||
Transcript:
|
||||
{transcript_text}
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.generate(model="llama3.3:70b", prompt=prompt, format="json")
|
||||
content = response.get("response", "[]")
|
||||
extracted = json.loads(content)
|
||||
|
||||
# Add metadata
|
||||
for item in extracted:
|
||||
item["theme"] = theme_name
|
||||
item["source_file"] = transcript_name
|
||||
results.append(item)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing theme {theme_name}: {e}")
|
||||
|
||||
# "Other" Category Analysis (Negative Constraint Strategy)
|
||||
if results or codebook: # Proceed if we have themes to exclude
|
||||
status_message = mo.md("🔍 Analyzing for 'Other' emerging themes...")
|
||||
|
||||
# Format existing themes for exclusion
|
||||
existing_themes_text = "\n".join([f"- {t['name']}: {t['definition']}" for t in codebook])
|
||||
|
||||
other_prompt = f"""
|
||||
You are a qualitative data analyst.
|
||||
Your goal is to identify "Emerging Themes" in the transcript that have NOT been captured by our existing codebook.
|
||||
|
||||
### EXISTING THEMES (IGNORE THESE)
|
||||
We have already analyzed the transcript for the following themes. DO NOT extract quotes that primarily fit these definitions:
|
||||
{existing_themes_text}
|
||||
|
||||
### INSTRUCTIONS
|
||||
1. Analyze the transcript below.
|
||||
2. Identify significant quotes, insights, or patterns that are distinct from the "Existing Themes" listed above.
|
||||
3. Label these findings as "Other".
|
||||
4. If a quote is borderline, only include it if it offers a novel angle not covered by the existing definition.
|
||||
|
||||
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
|
||||
If no new insights are found, return an empty array [].
|
||||
|
||||
### TRANSCRIPT
|
||||
{transcript_text}
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.generate(model="llama3.3:70b", prompt=other_prompt, format="json")
|
||||
content = response.get("response", "[]")
|
||||
extracted_other = json.loads(content)
|
||||
|
||||
for item in extracted_other:
|
||||
item["theme"] = "Other"
|
||||
item["source_file"] = transcript_name
|
||||
results.append(item)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing 'Other' theme: {e}")
|
||||
|
||||
# Save Results
|
||||
if results:
|
||||
df = pd.DataFrame(results)
|
||||
output_path = OUTPUT_DIR / f"{transcript_name}_coded.csv"
|
||||
df.to_csv(output_path, index=False)
|
||||
status_message = mo.md(f"✅ Analysis Complete! Saved to `{output_path}`")
|
||||
else:
|
||||
status_message = mo.md("⚠️ No quotes found for any theme.")
|
||||
df = pd.DataFrame()
|
||||
|
||||
elif run_btn.value and not file_dropdown.value:
|
||||
status_message = mo.md("⚠️ Please select a transcript first.")
|
||||
df = pd.DataFrame()
|
||||
else:
|
||||
df = pd.DataFrame()
|
||||
|
||||
mo.vstack([status_message, mo.ui.table(df)])
|
||||
return (
|
||||
bar,
|
||||
content,
|
||||
df,
|
||||
extracted,
|
||||
item,
|
||||
output_path,
|
||||
prompt,
|
||||
response,
|
||||
results,
|
||||
status_message,
|
||||
theme,
|
||||
theme_def,
|
||||
theme_name,
|
||||
transcript_name,
|
||||
transcript_path,
|
||||
transcript_text,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
60
ollama/docker-compose.yml
Normal file
60
ollama/docker-compose.yml
Normal file
@@ -0,0 +1,60 @@
|
||||
services:
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- 11434:11434
|
||||
volumes:
|
||||
- ./docker-volumes/ollama:/root/.ollama
|
||||
container_name: ollama
|
||||
tty: true
|
||||
restart: unless-stopped
|
||||
# GPU SUPPORT NOTES:
|
||||
# 1. The "deploy" section is ignored by classic 'docker-compose'; it's honored in Swarm.
|
||||
# 2. For local 'docker compose up' with NVIDIA GPUs you need the host configured with
|
||||
# nvidia-container-toolkit. Then either:
|
||||
# a) Leave the reservation block (Compose V2 now honors it) OR
|
||||
# b) Start with: docker compose up --build (Compose will request GPUs) OR
|
||||
# c) Explicitly override: docker compose run --gpus all ollama
|
||||
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
|
||||
# 'devices' section further down as a fallback (less portable).
|
||||
|
||||
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
|
||||
environment:
|
||||
# Visible devices / capabilities for the NVIDIA container runtime
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
## ---------- END GPU SUPPORT BLOCK ------------###
|
||||
|
||||
|
||||
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
|
||||
# devices:
|
||||
# - /dev/nvidiactl:/dev/nvidiactl
|
||||
# - /dev/nvidia-uvm:/dev/nvidia-uvm
|
||||
# - /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools
|
||||
# - /dev/nvidia0:/dev/nvidia0
|
||||
|
||||
open-webui:
|
||||
image: ghcr.io/open-webui/open-webui:main
|
||||
container_name: open-webui
|
||||
volumes:
|
||||
- ./docker-volumes/open-webui:/app/backend/data
|
||||
depends_on:
|
||||
- ollama
|
||||
ports:
|
||||
- 3000:8080
|
||||
environment:
|
||||
- 'OLLAMA_BASE_URL=http://ollama:11434'
|
||||
- 'ENABLE_OLLAMA_API=true'
|
||||
- 'WEBUI_SECRET_KEY='
|
||||
|
||||
extra_hosts:
|
||||
- host.docker.internal:host-gateway
|
||||
restart: unless-stopped
|
||||
@@ -6,8 +6,18 @@ readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"marimo>=0.18.0",
|
||||
"modin[dask]>=0.37.1",
|
||||
"numpy>=2.3.5",
|
||||
"ollama>=0.6.1",
|
||||
"openai>=2.9.0",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=2.3.3",
|
||||
"pyzmq>=27.1.0",
|
||||
"requests>=2.32.5",
|
||||
"taguette>=1.5.1",
|
||||
"wordcloud>=1.9.5",
|
||||
]
|
||||
|
||||
|
||||
[tool.uv.sources]
|
||||
wordcloud = { git = "https://github.com/amueller/word_cloud.git" }
|
||||
|
||||
86
utils.py
86
utils.py
@@ -1,86 +0,0 @@
|
||||
"""
|
||||
Standard utils for this repository
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite') -> "Client | None":
    """Establish a connection to a Qumo Ollama instance over Tailscale.

    Args:
        vm_name: Name of the VM running the Ollama instance
            ('ollama-lite' or 'hiperf-gpu').

    Returns:
        An Ollama ``Client`` connected to the VM, or ``None`` when the
        instance cannot be reached.
    """
    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:11434'
    try:
        # Probe first so we fail fast with a clear message instead of an
        # opaque error on the first model call.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except requests.RequestException:
        # BUG FIX: the original printed this message but then fell through
        # and dereferenced the unbound `client`, raising NameError. It also
        # only caught ConnectionError, letting timeouts escape.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None

    client = Client(host=QUMO_OLLAMA_URL)
    print(f"Connection successful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
    for m in client.list().models:
        print(f" - '{m.model}' ")
    return client
|
||||
|
||||
5
utils/__init__.py
Normal file
5
utils/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .ollama_utils import connect_qumo_ollama
|
||||
from .data_utils import create_sentiment_matrix, extract_theme
|
||||
from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
|
||||
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
|
||||
from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func
|
||||
65
utils/data_utils.py
Normal file
65
utils/data_utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a Voice/Character x Theme sentiment matrix for one document.

    Parameters:
    - doc_df: DataFrame with (at least) columns ['tag', '_context', 'sentiment']
    - column_prefix: regex alternation of tag prefixes that mark sentiment
      themes (matrix columns), e.g. 'VT - |CT - '
    - row_prefix: regex alternation of context prefixes that mark
      Voice/Character rows, e.g. '_V-|_C-'

    Returns:
    - DataFrame indexed by '_context' with one column per matching tag and
      cells holding summed sentiment; empty DataFrame when nothing matches.
    """
    # Keep only rows whose tag marks a sentiment theme (VT-/CT-).
    # NOTE: str.contains treats the '|'-joined prefixes as a regex alternation.
    sentiment_rows = doc_df[
        doc_df['tag'].str.contains(column_prefix, na=False)
    ].copy()

    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows carrying a Voice/Character context marker.
    valid_rows = sentiment_rows[
        sentiment_rows['_context'].notna()
        & sentiment_rows['_context'].str.contains(row_prefix, na=False)
    ].copy()

    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (Voice/Character, Theme) pair, then pivot so that
    # contexts become rows and themes become columns.
    matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')

    return matrix
|
||||
|
||||
|
||||
|
||||
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
|
||||
"""
|
||||
Extract the theme from a tag string.
|
||||
|
||||
Parameters:
|
||||
- tag: str, the tag string (e.g., 'VT - Personal Experience')
|
||||
- theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
|
||||
|
||||
Returns:
|
||||
- str, the extracted theme (e.g., 'Personal Experience')
|
||||
- None if no theme found
|
||||
"""
|
||||
for prefix in theme_prefixes.split('|'):
|
||||
if tag.startswith(prefix):
|
||||
return tag.replace(prefix, '').strip()
|
||||
return None
|
||||
|
||||
109
utils/keyword_analysis.py
Normal file
109
utils/keyword_analysis.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import random
|
||||
import matplotlib.colors as mcolors
|
||||
|
||||
def blue_color_func( word, font_size, position, orientation, random_state=None, **kwargs):
    """Word-cloud color callback: pick a random dark-ish shade of blue.

    Samples the matplotlib 'Blues' colormap in [0.4, 1.0] (0.0 is
    near-white, 1.0 is dark blue) and returns the color as a hex string.
    Uses *random_state* for reproducibility when provided, otherwise the
    global random module.
    """
    rng = random_state if random_state else random

    # Restrict sampling to the darker end of the colormap so words stay legible.
    low, high = 0.4, 1.0
    shade = rng.uniform(low, high)

    # Map the sampled value through the colormap and convert to '#rrggbb'.
    return mcolors.to_hex(plt.cm.Blues(shade))
|
||||
|
||||
|
||||
def worker_extraction(row, host, model):
    """Per-row worker: extract keywords for one DataFrame row.

    Creates its own Ollama client so each worker/thread holds an
    independent connection, then delegates to ollama_keyword_extraction.
    """
    worker_client = Client(host=host)
    return ollama_keyword_extraction(
        content=row['content'],
        tag=row['tag'],
        client=worker_client,
        model=model,
    )
|
||||
|
||||
|
||||
def ollama_keyword_extraction(content, tag, client: "Client", model) -> list:
    """
    Extract thematic keywords from a quote using an Ollama model.

    Parameters:
    - content: Text content (quote) to analyze
    - tag: Context/theme the keywords should be relevant to
    - client: Ollama client the request is sent through
    - model: Name of the model to use

    Returns:
    - A single-element list wrapping the extracted keyword list,
      e.g. [['reliable', 'trustworthy']]; [[]] after all retries fail.
    """
    # Prompt optimized for small models (Llama 3.2):
    # - Fewer rules, prioritized by importance
    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.

RULES (in priority order):
1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
3. Extract 1-5 keywords or short phrases that capture key themes.
4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").

EXAMPLES:

Context: Chase as a Brand
Quote: "It's definitely not, like, youthful or trendy."
Output: {{"keywords": ["traditional", "established"]}}

Context: App Usability
Quote: "There are so many options when I try to pay, it's confusing."
Output: {{"keywords": ["confusing", "overwhelming options"]}}

Context: Brand Perception
Quote: "I would say reliable, trustworthy, kind of old-school."
Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}

NOW EXTRACT KEYWORDS:

Context: {tag}
Quote: "{content}"
Output:"""

    max_retries = 3
    # BUG FIX: defined before the try so the except handler can always log
    # it; previously a failure inside client.generate left response_text
    # unbound and the print raised NameError, aborting the retry loop.
    response_text = ''
    for attempt in range(max_retries):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
                format='json',
            )
            response_text = resp.response.strip()

            # The model may emit leading junk before the JSON object.
            start_index = response_text.find('{')
            if start_index == -1:
                raise ValueError("No JSON found")

            # raw_decode tolerates trailing text after the JSON object.
            response_json, _ = json.JSONDecoder().raw_decode(response_text[start_index:])
            keywords = response_json.get('keywords', [])
            return [keywords]

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Output was: {response_text}")
            if attempt == max_retries - 1:
                return [[]]
|
||||
42
utils/ollama_utils.py
Normal file
42
utils/ollama_utils.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client | None, list[str] | None]":
    """Establish a connection to a Qumo Ollama instance.

    Args:
        vm_name: Name of the VM running the Ollama instance
            ('ollama-lite' or 'hiperf-gpu'), or 'localhost'/'0.0.0.0'
            for a local instance.
        port: Port the Ollama API listens on.
        print_models: When True, also print the models available on the server.

    Returns:
        (client, models): the connected Ollama ``Client`` and the list of
        model names, or (None, None) when the instance cannot be reached.
    """
    # Local addresses skip the Tailscale MagicDNS hostname.
    if vm_name in ['localhost', '0.0.0.0']:
        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
    else:
        QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'

    try:
        # Probe first so we fail fast with a clear message.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except requests.RequestException:
        # Catches ConnectionError AND timeouts; the original only caught
        # ConnectionError, so a hung host raised an unhandled Timeout.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(host=QUMO_OLLAMA_URL)

    # BUG FIX: replace only the trailing port when deriving the WebUI URL;
    # str.replace(port, '3000') substituted the port digits anywhere in the
    # URL, corrupting hostnames that happen to contain them.
    webui_url = f"{QUMO_OLLAMA_URL.rsplit(':', 1)[0]}:3000"
    print(f"Connection successful. WebUI available at: {webui_url}")

    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models
|
||||
135
utils/sentiment_analysis.py
Normal file
135
utils/sentiment_analysis.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
|
||||
def dummy_sentiment_analysis(content, tag):
    """Return a placeholder sentiment for testing pipelines without a model.

    Sentiment-bearing tags ('VT -' / 'CT -' prefixes) get a random score in
    {-1, 0, 1}; any other tag gets a sentinel ('test', 'not applicable').
    """
    if tag.startswith(('VT -', 'CT -')):
        # Random score stands in for a real model during dry runs.
        return random.choice([-1, 0, 1]), 'random dummy sentiment'

    return 'test', 'not applicable'
|
||||
|
||||
|
||||
|
||||
def ollama_sentiment_analysis(content, theme, theme_description, client: "Client", model) -> tuple[list[str], int, str]:
    """
    Analyze the sentiment of a quote with respect to a theme via Ollama.

    Parameters:
    - content: Text content (quote) to analyze
    - theme: Theme the sentiment is judged against (e.g., 'Speed')
    - theme_description: Short description of the theme for the model
    - client: Ollama client the request is sent through
    - model: Name of the model to use

    Returns:
    - (keywords, sentiment, reason): extracted keywords, a score in
      {-1, 0, 1}, and a short justification; ([], None, 'parsing error')
      after all retries fail.
    """
    prompt = f"""
# Role
You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.

# Input
Theme: `{theme}`
Theme Description: `{theme_description}`
Quote:
```
{content}
```

# Instructions
1. Analyze the sentiment of the quote specifically regarding the theme.
2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
3. Assign a sentiment score:
- -1: Negative (complaint, dissatisfaction, criticism)
- 0: Neutral (factual, mixed, or no strong opinion)
- 1: Positive (praise, satisfaction, agreement)
4. Provide a concise reason (max 10 words).

# Constraints
- Return ONLY a valid JSON object.
- Do not use Markdown formatting (no ```json blocks).
- Do not write any Python code or explanations outside the JSON.
- If the quote is irrelevant to the theme, return sentiment 0.

# Response Format
{{
"keywords": ["<list_of_keywords>"],
"sentiment": <integer_score>,
"reason": "<string_reason>"
}}

# Examples

Example 1:
Theme: `Speed`
Quote: `It was a little slow for me.`
Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}

Example 2:
Theme: `Price`
Quote: `It costs $50.`
Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}

Example 3:
Theme: `Friendliness`
Quote: `Sound very welcoming.`
Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
"""

    max_retries = 3
    for attempt in range(max_retries):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
            )

            response_text = resp.response.strip()

            # Extract the JSON object even when the model wraps it in prose.
            start_index = response_text.find('{')
            end_index = response_text.rfind('}') + 1

            if start_index == -1 or end_index == 0:
                raise ValueError("No JSON found")

            json_str = response_text[start_index:end_index]

            response_json = json.loads(json_str)
            keywords = response_json.get('keywords', [])
            # BUG FIX: the fallback was the string 'test', which contradicts
            # the declared int slot of the return tuple and breaks any
            # downstream summation; neutral 0 is the safe default.
            sentiment = response_json.get('sentiment', 0)
            reason = response_json.get('reason', 'no reason provided')
            return keywords, sentiment, reason

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt == max_retries - 1:
                return [], None, 'parsing error'
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test against a local Ollama server (requires `ollama serve`).
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # BUG FIX: the original passed (content, tag, client) positionally, so
    # `client` landed in `theme_description` and the required `client`
    # parameter was missing (TypeError); it also unpacked the 3-tuple result
    # into only two columns. Pass arguments by keyword and capture all three
    # outputs (keywords, sentiment, reason).
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['tag'],
            theme_description='',
            client=client,
            model='llama3.2:latest',
        )),
        axis=1
    )

    print(sentiment_df.head())
|
||||
|
||||
148
utils/transcript_utils.py
Normal file
148
utils/transcript_utils.py
Normal file
@@ -0,0 +1,148 @@
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
|
||||
|
||||
|
||||
def csv_to_markdown(csv_path:Path):
    """Convert transcript CSV to markdown, merging consecutive same-speaker turns."""
    df = pd.read_csv(str(csv_path))

    blocks = ["# Interview Transcript"]
    # Speaker currently "holding the floor" and their accumulated paragraphs.
    current_speaker = None
    current_parts = []

    def flush():
        # Emit the open block (if any) as "**Speaker**: para1\n\npara2".
        if current_speaker is not None:
            merged = '\n\n'.join(current_parts)
            blocks.append(f"**{current_speaker}**: {merged}")

    for _, row in df.iterrows():
        speaker = row["Speaker"]
        text = str(row["Transcript"]).strip()

        if speaker == current_speaker:
            # Same speaker keeps talking — extend the open block.
            current_parts.append(text)
        else:
            # Speaker changed — close the previous block, open a new one.
            flush()
            current_speaker = speaker
            current_parts = [text]

    # Close the trailing block.
    flush()

    # Separate blocks with blank lines for readable markdown paragraphs.
    return "\n\n".join(blocks)
|
||||
|
||||
|
||||
def cpc_smb_to_markdown(cpc_path: Path) -> str:
    """Convert CPC text transcript to markdown, merging consecutive same-speaker turns."""
    raw = Path(cpc_path).read_text(encoding='utf-8')

    out_blocks = ["# Interview Transcript"]
    active_speaker = None
    active_parts = []

    # Speaker labels look like "NAME: " at line start or after whitespace.
    label_re = re.compile(r'(?:^|\s)([A-Za-z0-9]+):\s')

    def close_block():
        # Emit the open block (if any) as "**Speaker**: para1\n\npara2".
        if active_speaker is not None:
            merged = '\n\n'.join(active_parts)
            out_blocks.append(f"**{active_speaker}**: {merged}")

    for raw_line in raw.splitlines():
        line = raw_line.strip().replace('\n', ' ')

        # Normalize the intro form: 'CPC1, (She/ Her,) LOCATION: Hello.' -> 'CPC1: Hello.'
        intro = re.match(r'^"?([A-Za-z0-9]+),\s*\(.*?\)\s*LOCATION:\s*(.*?)"?$', line)
        if intro:
            line = f"{intro.group(1)}: {intro.group(2)}"

        # Drop a matched pair of surrounding double quotes.
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1].strip()

        if not line:
            continue

        pieces = label_re.split(line)

        # No speaker label on this line — treat as noise (stray words, headers).
        if len(pieces) < 2:
            continue

        # Text before the first label continues the previous speaker's turn.
        leading = pieces[0].strip()
        if leading and active_speaker:
            active_parts.append(leading)

        # pieces alternates [<pre-text>, speaker, text, speaker, text, ...].
        for idx in range(1, len(pieces), 2):
            speaker = pieces[idx]
            text = pieces[idx + 1].strip()

            if speaker == active_speaker:
                active_parts.append(text)
            else:
                close_block()
                active_speaker = speaker
                active_parts = [text]

    close_block()

    return "\n\n".join(out_blocks)
|
||||
Reference in New Issue
Block a user