Compare commits
27 Commits
b21f402e1e
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 069e568d00 | |||
| 417273c745 | |||
| eee6947f01 | |||
| d6b449e8c6 | |||
| 8fbc11da7a | |||
| 50f9538dcf | |||
| e90b41f648 | |||
| e81961b819 | |||
| 4ba8af03d2 | |||
| 228a6daa59 | |||
| 12e14e3c9b | |||
| a5ffd8315e | |||
| c2a5c12794 | |||
| ccc5154b93 | |||
| e576f98cce | |||
| b023d44934 | |||
| ad00860fa1 | |||
| b214e7ab17 | |||
| 7f951d9ee5 | |||
| 821fa01edb | |||
| 514570062c | |||
| beddfee087 | |||
| 60d2876725 | |||
| ab4ee4b34a | |||
| 8cc2bc9087 | |||
|
|
523a59f864 | ||
| 98202ac3f2 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -11,4 +11,7 @@
|
||||
__marimo__
|
||||
__pycache__/
|
||||
|
||||
data/
|
||||
data/
|
||||
docker-volumes/
|
||||
logs/
|
||||
|
||||
|
||||
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python Debugger: Current File",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal"
|
||||
}
|
||||
]
|
||||
}
|
||||
114
01_Taguette-Pre-Process.py
Normal file
114
01_Taguette-Pre-Process.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# Marimo notebook: convert raw Taguette CSV transcripts to Markdown.
import marimo

# Version of marimo that generated this notebook (managed by marimo on save).
__generated_with = "0.18.3"
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports for the notebook. `pandas` was imported here before but
    # never used by any cell (it was not in the return tuple), so it is gone.
    import marimo as mo
    from pathlib import Path

    from utils import csv_to_markdown, cpc_smb_to_markdown
    return Path, cpc_smb_to_markdown, csv_to_markdown, mo
|
||||
|
||||
|
||||
@app.cell
def _(Path):
    # Source CSVs exported from the transcription step.
    INPUT_DIR = Path("data/transcripts/raw")
    # Cleaned Markdown output; created on first run if missing.
    OUTPUT_DIR = Path("data/transcripts/clean")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, OUTPUT_DIR
|
||||
|
||||
|
||||
@app.cell
def _(INPUT_DIR, mo):
    # Offer every raw CSV transcript as a dropdown choice, keyed by file stem
    # (friendly label) mapping to the full path string (the selected value).
    csv_paths = list(INPUT_DIR.glob("*.csv"))
    dropdown_options = {path.stem: str(path) for path in csv_paths}

    file_dropdown = mo.ui.dropdown(
        options=dropdown_options,
        label="Select CSV Transcript",
        full_width=True
    )
    file_dropdown
    return (file_dropdown,)
|
||||
|
||||
|
||||
@app.cell
def _(Path, cpc_smb_to_markdown, csv_to_markdown):
    # NOTE: `Path` stays in the signature for marimo's dependency graph; the
    # previous unused local `fp = Path(filepath)` has been removed.
    def jpmc_transcript_to_md(filepath):
        """Convert a transcript CSV to Markdown, trying both known formats.

        Tries the generic CSV converter first and falls back to the CPC/SMB
        converter; raises ValueError carrying both errors if neither works.
        """
        try:
            return csv_to_markdown(filepath)
        except Exception as e:
            try:
                return cpc_smb_to_markdown(filepath)
            except Exception as e2:
                # Chain the fallback failure so both tracebacks survive.
                raise ValueError(f"Failed to process file {filepath} with errors: {e}, {e2}") from e2
    return (jpmc_transcript_to_md,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(file_dropdown, jpmc_transcript_to_md, mo):
    # Preview: show the first 1000 characters of the converted Markdown for
    # the currently selected file; renders nothing until a file is chosen.
    preview = mo.md("")
    if file_dropdown.value:
        md_content = jpmc_transcript_to_md(file_dropdown.value)
        preview = mo.md(md_content[:1000])

    preview
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # run_button only reports True on an explicit click, so the conversion
    # cell below does not re-run on unrelated UI updates.
    convert_btn = mo.ui.run_button(label="Convert to Markdown")
    convert_btn
    return (convert_btn,)
|
||||
|
||||
|
||||
@app.cell
def _(OUTPUT_DIR, Path, convert_btn, file_dropdown, jpmc_transcript_to_md, mo):
    # Convert the selected transcript and persist it under the source stem.
    result = mo.md("")
    saved_md = None

    if convert_btn.value and file_dropdown.value:
        saved_md = jpmc_transcript_to_md(file_dropdown.value)
        _out_path = OUTPUT_DIR / (Path(file_dropdown.value).stem + ".md")
        # Write UTF-8 explicitly: transcripts may contain characters outside
        # the platform's default locale encoding.
        _out_path.write_text(saved_md, encoding="utf-8")
        result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success")

    result
    return (saved_md,)
|
||||
|
||||
|
||||
@app.cell
def _(mo, saved_md):
    # Render the freshly saved Markdown, or nothing when no save happened yet.
    if saved_md:
        saved_preview = mo.vstack([
            mo.md("### Saved Markdown Preview"),
            mo.md(saved_md),
        ])
    else:
        saved_preview = mo.md("")
    saved_preview
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Static instructions pointing at the hosted Taguette instance.
    mo.md(r"""
    # Taguette

    Upload and process using taguette: http://taguette.tail44fa00.ts.net/
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Intentionally empty scratch cell (kept by marimo).
    return


if __name__ == "__main__":
    app.run()
|
||||
666
02-B_WordClouds.py
Normal file
666
02-B_WordClouds.py
Normal file
@@ -0,0 +1,666 @@
|
||||
# Marimo notebook: keyword extraction and word-cloud generation from
# Taguette highlight exports.
import marimo

# Version of marimo that generated this notebook (managed by marimo on save).
__generated_with = "0.18.3"
app = marimo.App(width="medium")
||||
|
||||
@app.cell
def _():
    # Notebook-wide imports and working-directory setup.
    # Dropped unused imports: `modin.pandas` and `datetime` were never
    # referenced in this cell nor exported through the return tuple.
    import marimo as mo
    import pandas as pd
    from tqdm import tqdm
    from pathlib import Path

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION = 'localhost'
    # VM_NAME = 'ollama-lite'

    # initialize tqdm for pandas (enables DataFrame.progress_apply)
    tqdm.pandas()

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02-b_WordClouds')
    VOICE_EXCLUDE_KEYWORDS_FILE = WORKING_DIR / 'voice_excl_keywords.txt'

    # exist_ok avoids the check-then-create race of the previous
    # `if not ...exists(): ...mkdir(...)` pattern.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    if not VOICE_EXCLUDE_KEYWORDS_FILE.exists():
        VOICE_EXCLUDE_KEYWORDS_FILE.touch()

    return (
        OLLAMA_LOCATION,
        TAGUETTE_EXPORT_DIR,
        VOICE_EXCLUDE_KEYWORDS_FILE,
        WORKING_DIR,
        connect_qumo_ollama,
        mo,
        pd,
    )
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 1) Export Data out of Taguette
|
||||
|
||||
**Highlights**
|
||||
1. Go to: https://taguette.qumo.io/project/1
|
||||
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
|
||||
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
|
||||
**Tags Codebook**
|
||||
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
|
||||
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
|
||||
|
||||
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 2) Import Data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
|
||||
all_tags_df['_seq_id'] = range(len(all_tags_df))
|
||||
# all_tags_df
|
||||
return (all_tags_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df):
|
||||
# get count of rows per tag
|
||||
tag_counts = all_tags_df['tag'].value_counts().reset_index()
|
||||
# tag_counts
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
|
||||
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
|
||||
# codebook_df
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 3) Select Tag for processing
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(all_tags_df, mo):
|
||||
|
||||
|
||||
|
||||
tag_select = mo.ui.dropdown(
|
||||
options=all_tags_df['tag'].unique().tolist(),
|
||||
label="Select Tag to Process",
|
||||
# value="Chase as a brand",
|
||||
full_width=True,
|
||||
)
|
||||
tag_select
|
||||
return (tag_select,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(WORKING_DIR, all_tags_df, mo, tag_select):
|
||||
mo.stop(not tag_select.value, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn = None
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Keyword Extraction",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
|
||||
tag_fname = tag_select.value.replace(" ", "-").replace('/','-')
|
||||
|
||||
SAVE_DIR = WORKING_DIR / tag_fname
|
||||
|
||||
if not SAVE_DIR.exists():
|
||||
SAVE_DIR.mkdir(parents=True)
|
||||
|
||||
KEYWORDS_FPATH = SAVE_DIR / f'keywords_per-highlight_{tag_fname}.xlsx'
|
||||
KEYWORD_FREQ_FPATH = SAVE_DIR / f'keyword_frequencies_{tag_fname}.xlsx'
|
||||
|
||||
# filter all_tags_df to only the document = file_dropdown.value
|
||||
tags_df = all_tags_df.loc[all_tags_df['tag'] == tag_select.value].copy()
|
||||
tags_df.head()
|
||||
return (
|
||||
KEYWORDS_FPATH,
|
||||
KEYWORD_FREQ_FPATH,
|
||||
SAVE_DIR,
|
||||
start_processing_btn,
|
||||
tag_fname,
|
||||
tags_df,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo):
|
||||
mo.md(rf"""
|
||||
# 4) Keyword extraction {'(skippable, see 4b)' if KEYWORD_FREQ_FPATH.exists() else '(Required)'}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(OLLAMA_LOCATION, connect_qumo_ollama, mo):
|
||||
try:
|
||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||
model_select = mo.ui.dropdown(
|
||||
options=_models,
|
||||
value=_models[0],
|
||||
label="Select Ollama Model to use",
|
||||
searchable=True,
|
||||
)
|
||||
except Exception as e:
|
||||
mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}")
|
||||
model_select = None
|
||||
client = None
|
||||
|
||||
model_select
|
||||
return client, model_select
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, model_select, start_processing_btn, tag_select):
|
||||
mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue"))
|
||||
|
||||
start_processing_btn
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(client, mo, model_select, pd, start_processing_btn, tags_df):
    from utils import ollama_keyword_extraction, worker_extraction
    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # Bind `df` unconditionally: previously it was only assigned inside the
    # `client is not None` branch, so `return (df,)` raised NameError
    # whenever the Ollama client was unavailable.
    df = tags_df
    if client is not None:
        # Run keyword extraction per highlight row (tqdm progress bar).
        # NOTE(review): wrapping the result in pd.Series before assigning to a
        # single column looks suspicious — confirm ollama_keyword_extraction's
        # return type; downstream code treats each cell as an iterable of kws.
        df['keywords'] = df.progress_apply(
            lambda row: pd.Series(ollama_keyword_extraction(
                content=row['content'],
                tag=row['tag'],
                client=client,
                model=model_select.value
            )),
            axis=1
        )
    else:
        mo.md("Ollama client not available, See 4b) for loading data from xlsx.")
    return (df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(KEYWORDS_FPATH, KEYWORD_FREQ_FPATH, df, mo, pd, start_processing_btn):
    from collections import Counter

    mo.stop(not start_processing_btn.value, "Click button above to process first")

    # Human-readable keyword column for the per-highlight export.
    df['keywords_txt'] = df['keywords'].apply(lambda kws: ', '.join(kws))

    # Flatten all per-row keyword lists into one list of keywords.
    all_keywords_flat = [kw for kws in df['keywords'].tolist() for kw in kws]

    # Counter replaces the hand-rolled `if kw in dict: += 1` loop; it keeps
    # first-seen insertion order just like the original dict did.
    keyword_freq = Counter(all_keywords_flat)

    freq_df = pd.DataFrame.from_dict(keyword_freq, orient='index', columns=['frequency'])
    freq_df.index.name = 'keyword'
    freq_df.reset_index(inplace=True)
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)

    # Save to Excel files
    df[['id', 'tag', 'content', 'keywords_txt']].to_excel(
        KEYWORDS_FPATH,
        index=False
    )

    freq_df.to_excel(
        KEYWORD_FREQ_FPATH,
        index=False
    )
    mo.vstack([
        mo.md(f"Keywords per-highlight saved to: `{KEYWORDS_FPATH}`"),
        mo.md(f"Keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
    ])
    return (freq_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo):
|
||||
mo.md(rf"""
|
||||
# 4b) [optional] Load data from `keyword_frequencies_{KEYWORD_FREQ_FPATH.name}`
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(KEYWORD_FREQ_FPATH, mo, start_processing_btn):
|
||||
if start_processing_btn is not None: # Triggers re-execution of this cell when keyword extraction completes
|
||||
pass
|
||||
|
||||
|
||||
load_existing_btn = None
|
||||
if KEYWORD_FREQ_FPATH.exists():
|
||||
load_existing_btn = mo.ui.run_button(label=f"Load `{KEYWORD_FREQ_FPATH.name}`", kind='warn')
|
||||
|
||||
load_existing_btn
|
||||
return (load_existing_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||
freq_df,
|
||||
load_existing_btn,
|
||||
pd,
|
||||
tag_select,
|
||||
):
|
||||
if load_existing_btn is not None and load_existing_btn.value:
|
||||
_fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl')
|
||||
|
||||
# Drop nan rows if any
|
||||
_fdf.dropna(subset=['keyword', 'frequency'], inplace=True)
|
||||
_fdf.sort_values(by='frequency', ascending=False, inplace=True)
|
||||
_fdf.reset_index(drop=True, inplace=True)
|
||||
print(f"Loaded `{KEYWORD_FREQ_FPATH}` successfully.")
|
||||
|
||||
frequency_df = _fdf
|
||||
|
||||
else:
|
||||
frequency_df = freq_df
|
||||
|
||||
if tag_select.value.startswith('V'):
|
||||
# Read exclusion list
|
||||
excl_kw = []
|
||||
with VOICE_EXCLUDE_KEYWORDS_FILE.open('r') as _f:
|
||||
for line in _f:
|
||||
excl_kw.append(line.strip())
|
||||
|
||||
_drop_idx = frequency_df[frequency_df['keyword'].isin(excl_kw)].index
|
||||
|
||||
frequency_df.drop(index=_drop_idx, inplace=True, axis=0)
|
||||
print(f"Dropped {len(_drop_idx)} keywords automatically")
|
||||
return (frequency_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# 5) Wordcloud generation
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
# Import all necessary libraries
|
||||
import numpy as np
|
||||
from os import path
|
||||
from PIL import Image, ImageDraw
|
||||
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
||||
import matplotlib.pyplot as plt
|
||||
from utils import blue_color_func
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
return Image, ImageDraw, WordCloud, blue_color_func, np, plt
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## 5.1) Select threshold frequency
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
min_freq_select = mo.ui.number(start=1, stop=20, label="Threshold Minimum Keyword Frequency: ", value=2)
|
||||
min_freq_select
|
||||
return (min_freq_select,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, tag_select):
|
||||
mo.md(rf"""
|
||||
## 5.2) Inspect Keyword Dataset
|
||||
|
||||
1. Check the threshold is set correctly. If not, adjust accordingly
|
||||
2. Read all the keywords and verify they are good. If not
|
||||
- Add explicit exclusions if necessary below
|
||||
- OR Rerun the keyword extraction above
|
||||
|
||||
|
||||
|
||||
Add words to this dict that should be ignored in the WordCloud for specific tags.
|
||||
Make sure to create the correct key that matches the active selected tag:
|
||||
|
||||
Active selected tag = '`{tag_select.value.lower()}`'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(frequency_df, min_freq_select, mo):
|
||||
mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish")
|
||||
|
||||
MIN_FREQ = min_freq_select.value
|
||||
|
||||
_freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
table_selection = mo.ui.table(_freq_df_filtered, page_size=50)
|
||||
table_selection
|
||||
|
||||
return MIN_FREQ, table_selection
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, table_selection):
|
||||
remove_rows_btn = None
|
||||
if len(table_selection.value) >0 :
|
||||
remove_rows_btn = mo.ui.run_button(label="Click to remove selected keywords and update xlsx")
|
||||
|
||||
remove_rows_btn
|
||||
return (remove_rows_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
VOICE_EXCLUDE_KEYWORDS_FILE,
|
||||
frequency_df,
|
||||
mo,
|
||||
remove_rows_btn,
|
||||
table_selection,
|
||||
tag_select,
|
||||
):
|
||||
_s = None
|
||||
if remove_rows_btn is not None and remove_rows_btn.value:
|
||||
# get selected rows
|
||||
selected_rows = table_selection.value
|
||||
if len(selected_rows) >0 :
|
||||
rows_to_drop = table_selection.value.index.tolist()
|
||||
try:
|
||||
if tag_select.value.startswith('V'):
|
||||
# append values to an VoiceKeywordsExclusion file (txt file just a list of keywords)
|
||||
exclude_keywords = frequency_df.loc[rows_to_drop, 'keyword'].to_list()
|
||||
|
||||
with VOICE_EXCLUDE_KEYWORDS_FILE.open('w') as f:
|
||||
for _kw in exclude_keywords:
|
||||
f.write(_kw + '\n')
|
||||
|
||||
|
||||
|
||||
frequency_df.drop(index=rows_to_drop, inplace=True, axis=0)
|
||||
|
||||
|
||||
|
||||
except KeyError:
|
||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data to continue refining the dataset.", kind='warn')
|
||||
else:
|
||||
# Save updated frequencies back to xlsx
|
||||
frequency_df.to_excel(
|
||||
KEYWORD_FREQ_FPATH,
|
||||
index=False
|
||||
)
|
||||
|
||||
print(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`")
|
||||
|
||||
# mo.callout(f"Updated keyword frequencies saved to: `{KEYWORD_FREQ_FPATH}`", kind="success")
|
||||
_s = mo.callout("GO BACK TO STEP 4b) and reload data before continuing.", kind='warn')
|
||||
|
||||
_s
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
IGNORE_WORDS = {
|
||||
'chase as a brand': [
|
||||
"brand",
|
||||
"banking experience",
|
||||
"banking",
|
||||
"chase",
|
||||
"jpmorgan",
|
||||
"youthful",
|
||||
"customer service",
|
||||
"customer service focused",
|
||||
"great brand",
|
||||
],
|
||||
'why customer chase': [
|
||||
"customer service",
|
||||
"customer loyalty",
|
||||
"chase",
|
||||
"chase customer",
|
||||
"banking experience",
|
||||
],
|
||||
'chase as a person (personification)': [
|
||||
"CPC1"
|
||||
]
|
||||
# <active-selected-tag>: [list, of, words, to, ignore]
|
||||
}
|
||||
return (IGNORE_WORDS,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
buffer = -100 # Adjust this to increase/decrease space between logo and words
|
||||
canvas_size = (1200, 800)
|
||||
|
||||
logo_switch = mo.ui.switch(label="Include Chase Logo", value=False)
|
||||
|
||||
n_words = mo.ui.slider(start=10, stop=200, step=1, value=100, debounce=True, show_value=True, label="Max number of words in WordCloud")
|
||||
return buffer, canvas_size, logo_switch, n_words
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(logo_switch, mo, n_words):
|
||||
run_wordcloud_btn = mo.ui.run_button(label="Generate WordCloud")
|
||||
|
||||
mo.vstack([
|
||||
mo.md("## 5.4) Generate WordCloud with/without Logo"),
|
||||
mo.md("""Use these buttons to iteratively (re)generate the WordCloud until it looks nice.
|
||||
|
||||
Placement and color of words is randomized, size is proportional to frequency.
|
||||
|
||||
When satisfied with the result, click 'Save WordCloud to File' to save the image."""),
|
||||
mo.md('---'),
|
||||
mo.hstack([logo_switch, n_words, run_wordcloud_btn], align='center', justify='space-around')]
|
||||
)
|
||||
return (run_wordcloud_btn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(
|
||||
IGNORE_WORDS,
|
||||
Image,
|
||||
ImageDraw,
|
||||
MIN_FREQ,
|
||||
WordCloud,
|
||||
blue_color_func,
|
||||
buffer,
|
||||
canvas_size,
|
||||
frequency_df,
|
||||
logo_switch,
|
||||
mo,
|
||||
n_words,
|
||||
np,
|
||||
plt,
|
||||
run_wordcloud_btn,
|
||||
tag_select,
|
||||
):
|
||||
if run_wordcloud_btn.value:
|
||||
pass
|
||||
|
||||
freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy()
|
||||
|
||||
# freq_df_filtered.reset_index(drop=True, inplace=True)
|
||||
|
||||
keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict()
|
||||
|
||||
# remove specific keywords depending on selected tag
|
||||
if IGNORE_WORDS.get(tag_select.value.lower()):
|
||||
for word in IGNORE_WORDS[tag_select.value.lower()]:
|
||||
if word in keyword_freq_filtered:
|
||||
del keyword_freq_filtered[word]
|
||||
|
||||
if logo_switch.value:
|
||||
# 1. Load the logo
|
||||
# Make sure this path points to your uploaded file
|
||||
logo_path = "./assets/JP-Morgan-Chase-Symbol.png"
|
||||
logo = Image.open(logo_path).convert("RGBA")
|
||||
|
||||
# Optional: Resize logo if it's too large or small for the canvas
|
||||
# target_width = 600
|
||||
# ratio = target_width / logo.width
|
||||
# logo = logo.resize((target_width, int(logo.height * ratio)), Image.Resampling.LANCZOS)
|
||||
target_width = 600 # Set a reasonable size for the logo
|
||||
if logo.width > target_width:
|
||||
ratio = target_width / logo.width
|
||||
new_height = int(logo.height * ratio)
|
||||
# Use Image.Resampling.LANCZOS for high-quality downsampling
|
||||
# If you get an error, try Image.LANCZOS or Image.ANTIALIAS
|
||||
logo = logo.resize((target_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
# 3. Create the mask (0 = draw here, 255 = don't draw here)
|
||||
# Initialize with 0 (black/draw everywhere)
|
||||
mask_image = Image.new("L", canvas_size, 0)
|
||||
draw = ImageDraw.Draw(mask_image)
|
||||
|
||||
# 4. Draw a protected circular area in the center
|
||||
center = (canvas_size[0] // 2, canvas_size[1] // 2)
|
||||
|
||||
# Calculate radius: half of logo max dimension + buffer
|
||||
radius = (max(logo.size) // 2) + buffer
|
||||
|
||||
# Draw the white circle (255) which the WordCloud will avoid
|
||||
draw.ellipse(
|
||||
(center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
|
||||
fill=255
|
||||
)
|
||||
|
||||
chase_mask = np.array(mask_image)
|
||||
|
||||
# Generate the WordCloud
|
||||
wordcloud = WordCloud(
|
||||
background_color='white',
|
||||
width=canvas_size[0],
|
||||
height=canvas_size[1],
|
||||
max_font_size=100, # Increased font size for larger canvas
|
||||
max_words=n_words.value, # Increased word count to fill space
|
||||
color_func=blue_color_func,
|
||||
mask=chase_mask, # Apply the circular mask
|
||||
contour_width=0,
|
||||
contour_color='steelblue'
|
||||
).generate_from_frequencies(keyword_freq_filtered)
|
||||
|
||||
else:
|
||||
# Generate the WordCloud
|
||||
wordcloud = WordCloud(
|
||||
background_color='white',
|
||||
width=canvas_size[0],
|
||||
height=canvas_size[1],
|
||||
max_font_size=150, # Increased font size for larger canvas
|
||||
max_words=n_words.value, # Increased word count to fill space
|
||||
color_func=blue_color_func,
|
||||
# mask=chase_mask, # Apply the circular mask
|
||||
# contour_width=0,
|
||||
# contour_color='steelblue'
|
||||
).generate_from_frequencies(keyword_freq_filtered)
|
||||
|
||||
# Convert WordCloud to Image to composite the logo
|
||||
wc_image = wordcloud.to_image()
|
||||
|
||||
if logo_switch.value:
|
||||
|
||||
# Calculate position to center the logo
|
||||
logo_pos = (
|
||||
(canvas_size[0] - logo.width) // 2,
|
||||
(canvas_size[1] - logo.height) // 2
|
||||
)
|
||||
|
||||
# Paste logo (using alpha channel as mask to keep transparency)
|
||||
wc_image.paste(logo, logo_pos, logo)
|
||||
|
||||
# Display the generated image
|
||||
fig = plt.figure(figsize=(7,7))
|
||||
|
||||
# Display the generated image:
|
||||
plt.imshow(wc_image, interpolation='bilinear')
|
||||
plt.axis("off")
|
||||
plt.show()
|
||||
|
||||
save_wordcloud_btn = None
|
||||
save_wordcloud_btn = mo.ui.button(
|
||||
label="Save WordCloud to File",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
save_wordcloud_btn
|
||||
return save_wordcloud_btn, wc_image
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(SAVE_DIR, mo, save_wordcloud_btn, tag_fname, wc_image):
    # Persist the current wordcloud image; each save gets a fresh filename.
    # Wait for start processing button
    mo.stop(not save_wordcloud_btn.value, "Click button above to save wordcloud image")

    filename = f'wordcloud_{tag_fname}.png'

    # Default (first-save) path has no numeric suffix.
    fpath = SAVE_DIR / filename

    # add a (increasing) number to the filename so we can save multiple. find the latest in the directory first
    existing_files = list(SAVE_DIR.glob(f'wordcloud_{tag_fname}*.png'))
    if existing_files:
        existing_numbers = []
        for ef in existing_files:
            parts = ef.stem.split('_')
            # Numbered variants look like wordcloud_<tag>_<n>; the bare first
            # save has no trailing number and is skipped by the isdigit check.
            # NOTE(review): if tag_fname itself contains '_' the len(parts)>2
            # guard is looser than intended — confirm tag names.
            if len(parts) > 2 and parts[-1].isdigit():
                existing_numbers.append(int(parts[-1]))
        if existing_numbers:
            next_number = max(existing_numbers) + 1
        else:
            next_number = 1
        fpath = SAVE_DIR / f'wordcloud_{tag_fname}_{next_number}.png'

    wc_image.save(fpath)
    mo.md(f"Wordcloud saved to: {fpath}")
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
461
02_Taguette_Post-Process.py
Normal file
461
02_Taguette_Post-Process.py
Normal file
@@ -0,0 +1,461 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Notebook-wide imports, Ollama connection, and working directories.
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION= 'localhost'
    # VM_NAME = 'ollama-lite'

    # NOTE(review): unlike 02-B_WordClouds, this connect call has no
    # try/except — an unreachable Ollama server makes this whole cell (and
    # thus the notebook) fail. Consider mirroring the guarded connect there.
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)

    # Dropdown of the models reported by the server; first model preselected.
    model_select = mo.ui.dropdown(
        options=_models,
        value=_models[0],
        label="Select Ollama Model to use",
        searchable=True,
    )
    model_select
    return (
        TAGUETTE_EXPORT_DIR,
        WORKING_DIR,
        client,
        datetime,
        mo,
        model_select,
        pd,
    )
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(TAGUETTE_EXPORT_DIR, mo):
|
||||
mo.md(rf"""
|
||||
# Step 1: Export Data out of Taguette
|
||||
|
||||
**Highlights**
|
||||
1. Go to: https://taguette.qumo.io/project/1
|
||||
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
|
||||
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
|
||||
**Tags Codebook**
|
||||
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
|
||||
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
|
||||
|
||||
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 2: Import here for processing
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
|
||||
all_tags_df['_seq_id'] = range(len(all_tags_df))
|
||||
all_tags_df
|
||||
return (all_tags_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
|
||||
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
|
||||
codebook_df
|
||||
return (codebook_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 3: Process each 'Interview'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df, mo):
|
||||
|
||||
interview_select = mo.ui.dropdown(
|
||||
options=all_tags_df['document'].unique().tolist(),
|
||||
label="Select Interview to Process",
|
||||
full_width=True
|
||||
)
|
||||
interview_select
|
||||
return (interview_select,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_tags_df, interview_select, mo):
|
||||
mo.stop(not interview_select.value, mo.md("Select interview to continue"))
|
||||
# filter all_tags_df to only the document = file_dropdown.value
|
||||
df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy()
|
||||
return (df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Add `_context` column to track Voice / Character is being referred to per highlight
|
||||
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
|
||||
|
||||
1. Iterates through the dataframe in document order (row by row)
|
||||
2. Uses a set to track which highlight IDs we've already processed
|
||||
3. When we encounter a new highlight ID for the first time, we process all its rows
|
||||
4. Collects all _V- or _C- tags within that highlight
|
||||
5. Assigns the context to all rows with that ID
|
||||
6. This preserves document order and handles multi-tag highlights correctly
|
||||
|
||||
|
||||
Example of challenging case:
|
||||
|
||||
| tag | content | _seq_id | _context |
|
||||
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
|
||||
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
|
||||
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
|
||||
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
|
||||
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(df):
    # Derive `_context` (which Voice/Character a highlight refers to) by
    # scanning the document in row order: each highlight id inherits the
    # _V-/_C- tags found within its own rows, or carries the last seen
    # context forward when it has none. Order-dependent — do not re-sort df
    # before this cell.
    # First pass: identify context tags within each highlight group
    df['_context'] = None
    last_context = None
    processed_ids = set()

    # Process in document order
    for idx, row in df.iterrows():
        highlight_id = row['id']

        # If we haven't processed this highlight yet
        if highlight_id not in processed_ids:
            processed_ids.add(highlight_id)

            # Get all rows for this highlight
            highlight_rows = df[df['id'] == highlight_id]

            # Collect all context tags in this highlight
            context_tags = []
            for _, h_row in highlight_rows.iterrows():
                tag = h_row.get('tag', '')
                if '_V-' in tag or '_C-' in tag:
                    context_tags.append(tag)

            # If we found context tags, join them with comma
            if context_tags:
                context_tag = ', '.join(context_tags)
                last_context = context_tag
            else:
                # If no context tag in this highlight, use the last context
                # (None for highlights before the first context tag appears).
                context_tag = last_context

            # Assign the context to all rows in this highlight
            df.loc[df['id'] == highlight_id, '_context'] = context_tag

    df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Split multi-context rows (only VT- and CT- theme tags)
|
||||
|
||||
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
|
||||
- split these into separate rows for each context.
|
||||
- Then mark these for 'manual_analysis'
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(df, pd):
    """Split rows whose '_context' holds several comma-separated contexts.

    Each multi-context row is cloned once per context and flagged
    ``manual_analysis=True`` (sentiment cleared on VT-/CT- theme tags so it
    can be re-scored per context). Single-context rows pass through with
    ``manual_analysis=False``. Only VT-/CT- theme rows are kept in the
    returned ``sentiment_df``.
    """
    split_rows = []

    for _, source_row in df.iterrows():
        raw_context = source_row['_context']

        if pd.notna(raw_context) and ',' in str(raw_context):
            # One clone per context, each marked for manual review.
            for single_context in (part.strip() for part in str(raw_context).split(',')):
                clone = source_row.copy()
                clone['_context'] = single_context
                clone['manual_analysis'] = True
                if str(clone['tag']).startswith(('VT -', 'CT -')):
                    clone['sentiment'] = None
                split_rows.append(clone)
        else:
            clone = source_row.copy()
            clone['_context'] = raw_context
            clone['manual_analysis'] = False
            split_rows.append(clone)

    expanded_df_raw = pd.DataFrame(split_rows).reset_index(drop=True)

    theme_mask = expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    sentiment_df = expanded_df_raw.loc[theme_mask].copy()

    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")

    sentiment_df[sentiment_df['manual_analysis']]
    return (sentiment_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Create 'theme' column
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(sentiment_df):
    """Add a 'theme' column derived from each row's raw tag."""
    from utils import extract_theme

    # extract_theme only needs the tag value, so apply over the single
    # column instead of the original axis=1 row-wise apply, which built a
    # Series per row for no benefit.
    sentiment_df['theme'] = sentiment_df['tag'].apply(extract_theme)
    sentiment_df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Extract Sentiment + Reasoning
|
||||
|
||||
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
|
||||
|
||||
Values to be used:
|
||||
- Positive: +1
|
||||
- Neutral: 0
|
||||
- Negative: -1
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Sentiment Extraction",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
start_processing_btn
|
||||
return (start_processing_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
):
    """Run LLM sentiment extraction over the auto-analyzable theme rows.

    Writes 'keywords', 'sentiment' and 'reason' columns into sentiment_df.
    Rows flagged manual_analysis are skipped here and handled by the
    manual-review cells below.
    """
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

    # add theme_description to be used in LLM prompt.
    # NOTE(review): pd.merge returns a frame with a fresh RangeIndex, while
    # sentiment_df may carry a filtered (non-contiguous) index from the
    # upstream split cell — the index-aligned assignment below depends on
    # the two matching row-for-row. Also assumes 'tag' is unique in
    # codebook_df (a one-to-many merge would duplicate rows). Confirm both.
    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))

    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # Only non-manual rows are scored; manual_analysis rows end up NaN in
    # the three new columns and get values during manual review.
    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['theme'],
            theme_description=row['theme_description'],
            client=client,
            model=model_select.value
        )),
        axis=1
    )
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
|
||||
mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
|
||||
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Multi-context tags
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
    """Surface multi-context split rows and build a data editor for them."""
    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
    split_rows_editor = None
    rows_to_edit = []

    if not manual_rows.empty:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )

        # Rows needing review: those flagged manual_analysis. (No extra tag
        # filter is required: sentiment_df was already restricted to
        # 'VT -'/'CT -' theme tags by the upstream split cell.)
        rows_to_edit = sentiment_df[
            (sentiment_df['manual_analysis'])
        ]

        # Editable grid wrapped in a form — edits only land on Submit.
        split_rows_editor = mo.ui.data_editor(
            rows_to_edit
        ).form(label="Update Sentiment / Manual Flag")

    else:
        print("✓ No multi-context rows found")
    return rows_to_edit, split_rows_editor
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(split_rows_editor):
|
||||
split_rows_editor
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, rows_to_edit, split_rows_editor):
    """Show the review instructions plus the split-rows editor.

    marimo renders only a cell's final top-level expression; the original
    code left ``mo.vstack(...)`` as a bare expression *inside* the ``if``
    block, so the UI was never actually displayed. Build the output first,
    then end the cell with it.
    """
    _review_ui = None
    if split_rows_editor is not None:
        _review_ui = mo.vstack([
            mo.md(f"""
    ### ⚠️ Manual Review Required

    **{len(rows_to_edit)} rows** were split from multi-context entries.
    Please review them below:
    1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
    2. Click **Submit** to apply changes.
    """),
            split_rows_editor
        ])
    _review_ui
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, split_rows_editor):
    """Validate the submitted manual-review edits.

    Sentinel semantics: when no multi-context rows existed,
    split_rows_editor is None and getattr() yields '' (empty string),
    which deliberately bypasses both the mo.stop gate and the validation
    block below.
    """
    # Capture the edited manual-analysis rows for validation
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
    # .form() exposes value=None until the user presses Submit.
    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))

    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1

    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
        valid_sentiments = {-1, 0, 1}
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        # NOTE(review): assert statements are stripped under `python -O`;
        # raise ValueError instead if this check must always run.
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"

        print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Recombine
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, reviewed_manual_rows, sentiment_df):
    """Re-merge auto-scored rows with the manually reviewed split rows.

    Falls back to the untouched ``sentiment_df`` when no edited DataFrame
    was submitted; otherwise rows are restored to document order via
    '_seq_id'.
    """
    if not isinstance(reviewed_manual_rows, pd.DataFrame):
        recombined_df = sentiment_df
    else:
        _auto_rows = sentiment_df[~sentiment_df['manual_analysis']]
        recombined_df = (
            pd.concat([_auto_rows, reviewed_manual_rows])
            .sort_values(by='_seq_id')
            .reset_index(drop=True)
        )

    recombined_df
    return (recombined_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Step 3: Process 'Other' tags
|
||||
|
||||
These need to be reviewed manually for interesting content
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Save to CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    """Persist the recombined sentiment rows as a timestamped CSV.

    Fixes two defects: ``timestamp`` was computed but never used, even
    though the downstream notebook (03_Sentiment_Analysis) parses the
    timestamp back out of the filename via ``split('_')[-1]``; and the
    confirmation print emitted a literal placeholder instead of the path.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    # Keep the timestamp as the LAST underscore-separated component so the
    # downstream split('_')[-1] / split('_')[0] parsing works.
    doc_name = interview_select.value.split(' ')[0]
    filename = WORKING_DIR / f"{doc_name}_sentiments_{timestamp}.csv"
    recombined_df.to_csv(filename, index=False)

    print(f"✓ Saved processed data to '{filename}'")
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
146
03_Sentiment_Analysis.py
Normal file
146
03_Sentiment_Analysis.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
    """Notebook setup: imports plus the input and working directories."""
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from utils import create_sentiment_matrix

    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # Idempotent: creates the directory tree only when it is missing.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Load Sentiment CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR, mo):
|
||||
csv_files = list(INPUT_DIR.glob("*.csv"))
|
||||
file_options = {f.stem: str(f) for f in csv_files}
|
||||
|
||||
sentiment_csv = mo.ui.dropdown(
|
||||
options=file_options,
|
||||
label="Select Sentiment CSV File",
|
||||
full_width=True
|
||||
)
|
||||
sentiment_csv
|
||||
return (sentiment_csv,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Path, pd, sentiment_csv):
|
||||
input_csv_name = Path(sentiment_csv.value).stem
|
||||
timestamp = input_csv_name.split('_')[-1]
|
||||
doc = input_csv_name.split('_')[0]
|
||||
|
||||
sentiment_df = pd.read_csv(sentiment_csv.value)
|
||||
sentiment_df
|
||||
return doc, sentiment_df, timestamp
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Phase 1: Individual interview analysis
|
||||
- Create sentiment matrices for each interview (document)
|
||||
- Save the intermediate results to file in the `WORKING_DIR`
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.1: Voice Sample vs. Theme Sentiment Matrix
|
||||
|
||||
For each interview (document), create a matrix where:
|
||||
- Rows represent the different Voices (based on '_V-' tags)
|
||||
- Columns represent the different VoiceThemes(based on 'VT -' tags)
|
||||
- Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(create_sentiment_matrix, sentiment_df):
|
||||
voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')
|
||||
voice_matrix
|
||||
return (voice_matrix,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
SAVE TO CSV
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, doc, timestamp, voice_matrix):
|
||||
# Save to CSV
|
||||
voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
|
||||
|
||||
voice_matrix.to_csv(voice_filename)
|
||||
|
||||
print(f"Saved to '{voice_filename}'")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.2: Character Sample vs. Theme Sentiment Matrix
|
||||
|
||||
For each interview (document), create a matrix where:
|
||||
- Rows represent the different Characters (based on '_C-' tags)
|
||||
- Columns represent the different CharacterThemes (based on 'CT -' tags)
|
||||
- Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(create_sentiment_matrix, sentiment_df):
|
||||
character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-')
|
||||
character_matrix
|
||||
return (character_matrix,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(WORKING_DIR, character_matrix, doc, timestamp):
|
||||
# Save to CSV
|
||||
character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
|
||||
|
||||
character_matrix.to_csv(character_filename)
|
||||
|
||||
print(f"Saved to '{character_filename}'")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1.3: Chase Brand Sentiment
|
||||
|
||||
TODO: not sure we have enough supporting data for this yet
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
86
04_Results_Aggregation.py
Normal file
86
04_Results_Aggregation.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.3"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
INPUT_DIR = Path("./data/processing/03_sentiment_analysis")
|
||||
WORKING_DIR = Path('./data/processing/04_sentiment_aggregation')
|
||||
|
||||
if not WORKING_DIR.exists():
|
||||
WORKING_DIR.mkdir(parents=True)
|
||||
return INPUT_DIR, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Voices
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR, mo):
|
||||
voice_csv_files = list(INPUT_DIR.glob("*voice*.csv"))
|
||||
file_options = {f.stem: str(f) for f in voice_csv_files}
|
||||
|
||||
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
|
||||
|
||||
return (voice_multiselect,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, voice_multiselect):
|
||||
mo.hstack([voice_multiselect, mo.md(f"Has value: {voice_multiselect.value}")])
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, voice_multiselect):
    """Aggregate the selected per-interview voice matrices cell-by-cell."""
    KEY_COL = "_context"

    def _load_matrix(csv_path: str) -> pd.DataFrame:
        # Index by context so .add() aligns rows across files.
        matrix = pd.read_csv(csv_path).set_index(KEY_COL)
        return matrix.apply(pd.to_numeric, errors="coerce")

    def aggregate_voice_data(files: list[str]) -> pd.DataFrame:
        if not files:
            return pd.DataFrame()

        remaining = iter(files)
        total = _load_matrix(next(remaining))
        for extra_path in remaining:
            # fill_value=0 keeps rows/columns present in only one file.
            total = total.add(_load_matrix(extra_path), fill_value=0)

        return total.reset_index()

    master_df = aggregate_voice_data(voice_multiselect.value)
    master_df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Characters
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(INPUT_DIR):
|
||||
char_csv_files = list(INPUT_DIR.glob("*character*.csv"))
|
||||
char_csv_files
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -78,7 +78,9 @@ def _(mo):
|
||||
|
||||
**Goal:** Convert unstructured text into a structured dataset.
|
||||
|
||||
1. **Input:** All 26 Transcripts + `master_codebook.json`.
|
||||
This will be a dedicated notebook, and be run per transcript.
|
||||
|
||||
1. **Input:** Transcript + `master_codebook.json`.
|
||||
2. **Process:**
|
||||
* The LLM analyzes each transcript segment-by-segment.
|
||||
* It extracts specific quotes that match a Theme Definition.
|
||||
@@ -86,8 +88,9 @@ def _(mo):
|
||||
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
||||
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
||||
* **Sentiment:** Positive / Neutral / Negative.
|
||||
3. **Output:** `coded_segments.csv`
|
||||
3. **Output:** `<transcript_name>_coded_segments.csv`
|
||||
* Columns: `Source_File`, `Speaker`, `Theme`, `Quote`, `Subject`, `Sentiment`, `Context`.
|
||||
* Each transcript produces its own CSV-file, which can be reviewed and adjusted before moving to the next stage
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
226
Stage1_Theme_Discovery.py
Normal file
226
Stage1_Theme_Discovery.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.1"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import json
|
||||
import pandas as pd
|
||||
import re
|
||||
from pathlib import Path
|
||||
from utils import connect_qumo_ollama, load_srt
|
||||
|
||||
# Configuration
|
||||
VM_NAME = 'hiperf-gpu'
|
||||
MODEL = 'llama3.3:70b'
|
||||
TRANSCRIPT_DIR = Path("data/transcripts")
|
||||
OUTPUT_FILE = Path("master_codebook.json")
|
||||
|
||||
client = connect_qumo_ollama(VM_NAME)
|
||||
return (
|
||||
MODEL,
|
||||
OUTPUT_FILE,
|
||||
TRANSCRIPT_DIR,
|
||||
client,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
re,
|
||||
)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Stage 1: Theme Discovery
|
||||
|
||||
**Goal:** Identify recurring themes across a sample of interviews.
|
||||
|
||||
1. **Select Transcripts:** Choose 4-5 representative interviews.
|
||||
2. **Extract Topics:** The AI will analyze each transcript to find key topics.
|
||||
3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
|
||||
4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TRANSCRIPT_DIR, mo):
|
||||
# File Selection
|
||||
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
|
||||
file_options = {f.name: str(f) for f in srt_files}
|
||||
|
||||
file_selector = mo.ui.multiselect(
|
||||
options=file_options,
|
||||
label="Select Transcripts (Recommended: 4-5)",
|
||||
full_width=True
|
||||
)
|
||||
file_selector
|
||||
return (file_selector,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(file_selector, mo):
|
||||
mo.md(f"**Selected:** {len(file_selector.value)} files")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
start_discovery_btn = mo.ui.run_button(label="Start Discovery Process")
|
||||
start_discovery_btn
|
||||
return (start_discovery_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
MODEL,
|
||||
client,
|
||||
file_selector,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
re,
|
||||
start_discovery_btn,
|
||||
):
|
||||
# Map Phase: Extract Topics per Transcript
|
||||
extracted_topics = []
|
||||
status_callout = mo.md("")
|
||||
|
||||
if start_discovery_btn.value and file_selector.value:
|
||||
with mo.status.spinner("Analyzing transcripts...") as _spinner:
|
||||
for filepath in file_selector.value:
|
||||
_transcript = load_srt(filepath)
|
||||
|
||||
# Truncate for discovery if too long (optional, but good for speed)
|
||||
# Using first 15k chars usually gives enough context for high-level themes
|
||||
_context = _transcript[:15000]
|
||||
|
||||
_prompt = f"""
|
||||
Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
|
||||
Focus on: Brand voice, Customer experience, Design systems, and AI.
|
||||
|
||||
Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]
|
||||
|
||||
Transcript:
|
||||
{_context}...
|
||||
"""
|
||||
|
||||
try:
|
||||
_response = client.generate(model=MODEL, prompt=_prompt)
|
||||
# Find JSON list in response
|
||||
_match = re.search(r'\[.*\]', _response.response, re.DOTALL)
|
||||
if _match:
|
||||
_topics = json.loads(_match.group(0))
|
||||
extracted_topics.extend(_topics)
|
||||
except Exception as e:
|
||||
print(f"Error processing {filepath}: {e}")
|
||||
|
||||
status_callout = mo.callout(
|
||||
f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
|
||||
kind="success"
|
||||
)
|
||||
elif start_discovery_btn.value:
|
||||
status_callout = mo.callout("Please select at least one file.", kind="warn")
|
||||
|
||||
status_callout
|
||||
return (extracted_topics,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    """Reduce phase: synthesize the raw per-transcript topics into themes.

    Produces ``suggested_themes``: a list of dicts with keys
    "Theme" / "Definition" / "Color" (including a mandatory "Other"
    theme), or an empty list until discovery has run.
    """
    suggested_themes = []

    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook...") as _spinner:
            _topics_str = ", ".join(extracted_topics)

            _synthesis_prompt = f"""
    You are a qualitative data architect.

    I have a list of raw topics extracted from multiple interviews:
    [{_topics_str}]

    Task:
    1. Group these into 5-8 distinct, high-level Themes.
    2. Create a definition for each theme.
    3. Assign a hex color code to each.
    4. ALWAYS include a theme named "Other" for miscellaneous insights.

    Return a JSON object with this structure:
    [
        {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
        ...
    ]
    """

            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)

            # Grab the first JSON-array-looking span in the model reply.
            _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
            if _match:
                try:
                    suggested_themes = json.loads(_match.group(0))
                except json.JSONDecodeError:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit). Surface the raw reply
                    # so the failure is debuggable in the editor below.
                    suggested_themes = [{"Theme": "Error parsing JSON", "Definition": _response.response, "Color": "#000000"}]

    return (suggested_themes,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, pd, suggested_themes):
|
||||
# Interactive Editor
|
||||
|
||||
# Default empty structure if nothing generated yet
|
||||
_initial_data = suggested_themes if suggested_themes else [
|
||||
{"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
|
||||
]
|
||||
|
||||
df_themes = pd.DataFrame(_initial_data)
|
||||
|
||||
theme_editor = mo.ui.data_editor(
|
||||
df_themes,
|
||||
label="Master Codebook Editor",
|
||||
column_config={
|
||||
"Color": mo.ui.column.color_picker(label="Color")
|
||||
},
|
||||
num_rows="dynamic" # Allow adding/removing rows
|
||||
)
|
||||
|
||||
mo.vstack([
|
||||
mo.md("### Review & Refine Codebook"),
|
||||
mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
|
||||
theme_editor
|
||||
])
|
||||
return (theme_editor,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(OUTPUT_FILE, json, mo, theme_editor):
|
||||
save_btn = mo.ui.run_button(label="Save Master Codebook")
|
||||
|
||||
save_message = mo.md("")
|
||||
|
||||
if save_btn.value:
|
||||
_final_df = theme_editor.value
|
||||
# Convert to list of dicts
|
||||
_codebook = _final_df.to_dict(orient="records")
|
||||
|
||||
with open(OUTPUT_FILE, "w") as f:
|
||||
json.dump(_codebook, f, indent=2)
|
||||
|
||||
save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")
|
||||
|
||||
mo.vstack([
|
||||
save_btn,
|
||||
save_message
|
||||
])
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
212
Stage2_Structured_Theme_Coding.py
Normal file
212
Stage2_Structured_Theme_Coding.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.18.0"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import json
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from utils import connect_qumo_ollama, load_srt
|
||||
|
||||
# Configuration
|
||||
CODEBOOK_PATH = Path("data/labels/master_codebook.json")
|
||||
TRANSCRIPT_DIR = Path("data/transcripts")
|
||||
OUTPUT_DIR = Path("data/labeled_transcripts")
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Initialize LLM Client
|
||||
client = connect_qumo_ollama("hiperf-gpu")
|
||||
return (
|
||||
CODEBOOK_PATH,
|
||||
OUTPUT_DIR,
|
||||
Path,
|
||||
TRANSCRIPT_DIR,
|
||||
client,
|
||||
connect_qumo_ollama,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Stage 2: Structured Theme Coding
|
||||
|
||||
**Goal:** Extract specific quotes for defined themes from full transcripts.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(CODEBOOK_PATH, json, mo):
    """Load the master codebook (list of theme dicts); empty if absent.

    Fixes a NameError: the original only bound ``f`` inside the
    ``if CODEBOOK_PATH.exists()`` branch, so the missing-file path crashed
    on ``return codebook, f``. ``f`` stays in the return tuple (marimo
    exposes returned names to other cells) and is None when no file exists;
    when present it is the already-closed file handle, as before.
    """
    codebook = []
    f = None
    if CODEBOOK_PATH.exists():
        with open(CODEBOOK_PATH, "r") as f:
            codebook = json.load(f)

    mo.md(f"**Loaded {len(codebook)} Themes from Codebook**")
    return codebook, f
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TRANSCRIPT_DIR, mo):
|
||||
# File Selector
|
||||
srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
|
||||
file_dropdown = mo.ui.dropdown(
|
||||
options={f.name: str(f) for f in srt_files},
|
||||
label="Select Transcript"
|
||||
)
|
||||
file_dropdown
|
||||
return file_dropdown, srt_files
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
run_btn = mo.ui.run_button(label="Start Analysis")
|
||||
run_btn
|
||||
return (run_btn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
OUTPUT_DIR,
|
||||
client,
|
||||
codebook,
|
||||
file_dropdown,
|
||||
json,
|
||||
load_srt,
|
||||
mo,
|
||||
pd,
|
||||
run_btn,
|
||||
):
|
||||
# Analysis Logic
|
||||
results = []
|
||||
status_message = mo.md("")
|
||||
|
||||
if run_btn.value and file_dropdown.value:
|
||||
transcript_path = file_dropdown.value
|
||||
transcript_name = file_dropdown.selected_key.replace(".srt", "")
|
||||
transcript_text = load_srt(transcript_path)
|
||||
|
||||
with mo.status.progress_bar(codebook, title="Analyzing Themes") as bar:
|
||||
for theme in bar:
|
||||
theme_name = theme["name"]
|
||||
theme_def = theme["definition"]
|
||||
|
||||
prompt = f"""
|
||||
You are a qualitative data analyst. Analyze the following transcript for the theme: "{theme_name}".
|
||||
Definition: {theme_def}
|
||||
|
||||
Extract ALL relevant quotes that match this definition.
|
||||
For each quote, identify the specific Subject and the Sentiment (Positive, Neutral, Negative).
|
||||
|
||||
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
|
||||
If no quotes are found, return an empty array [].
|
||||
|
||||
Transcript:
|
||||
{transcript_text}
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.generate(model="llama3.3:70b", prompt=prompt, format="json")
|
||||
content = response.get("response", "[]")
|
||||
extracted = json.loads(content)
|
||||
|
||||
# Add metadata
|
||||
for item in extracted:
|
||||
item["theme"] = theme_name
|
||||
item["source_file"] = transcript_name
|
||||
results.append(item)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing theme {theme_name}: {e}")
|
||||
|
||||
# "Other" Category Analysis (Negative Constraint Strategy)
|
||||
if results or codebook: # Proceed if we have themes to exclude
|
||||
status_message = mo.md("🔍 Analyzing for 'Other' emerging themes...")
|
||||
|
||||
# Format existing themes for exclusion
|
||||
existing_themes_text = "\n".join([f"- {t['name']}: {t['definition']}" for t in codebook])
|
||||
|
||||
other_prompt = f"""
|
||||
You are a qualitative data analyst.
|
||||
Your goal is to identify "Emerging Themes" in the transcript that have NOT been captured by our existing codebook.
|
||||
|
||||
### EXISTING THEMES (IGNORE THESE)
|
||||
We have already analyzed the transcript for the following themes. DO NOT extract quotes that primarily fit these definitions:
|
||||
{existing_themes_text}
|
||||
|
||||
### INSTRUCTIONS
|
||||
1. Analyze the transcript below.
|
||||
2. Identify significant quotes, insights, or patterns that are distinct from the "Existing Themes" listed above.
|
||||
3. Label these findings as "Other".
|
||||
4. If a quote is borderline, only include it if it offers a novel angle not covered by the existing definition.
|
||||
|
||||
Return ONLY a JSON array of objects with these keys: "quote", "subject", "sentiment".
|
||||
If no new insights are found, return an empty array [].
|
||||
|
||||
### TRANSCRIPT
|
||||
{transcript_text}
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.generate(model="llama3.3:70b", prompt=other_prompt, format="json")
|
||||
content = response.get("response", "[]")
|
||||
extracted_other = json.loads(content)
|
||||
|
||||
for item in extracted_other:
|
||||
item["theme"] = "Other"
|
||||
item["source_file"] = transcript_name
|
||||
results.append(item)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing 'Other' theme: {e}")
|
||||
|
||||
# Save Results
|
||||
if results:
|
||||
df = pd.DataFrame(results)
|
||||
output_path = OUTPUT_DIR / f"{transcript_name}_coded.csv"
|
||||
df.to_csv(output_path, index=False)
|
||||
status_message = mo.md(f"✅ Analysis Complete! Saved to `{output_path}`")
|
||||
else:
|
||||
status_message = mo.md("⚠️ No quotes found for any theme.")
|
||||
df = pd.DataFrame()
|
||||
|
||||
elif run_btn.value and not file_dropdown.value:
|
||||
status_message = mo.md("⚠️ Please select a transcript first.")
|
||||
df = pd.DataFrame()
|
||||
else:
|
||||
df = pd.DataFrame()
|
||||
|
||||
mo.vstack([status_message, mo.ui.table(df)])
|
||||
return (
|
||||
bar,
|
||||
content,
|
||||
df,
|
||||
extracted,
|
||||
item,
|
||||
output_path,
|
||||
prompt,
|
||||
response,
|
||||
results,
|
||||
status_message,
|
||||
theme,
|
||||
theme_def,
|
||||
theme_name,
|
||||
transcript_name,
|
||||
transcript_path,
|
||||
transcript_text,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
BIN
assets/JP-Morgan-Chase-Symbol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
60
ollama/docker-compose.yml
Normal file
60
ollama/docker-compose.yml
Normal file
@@ -0,0 +1,60 @@
|
||||
services:
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- 11434:11434
|
||||
volumes:
|
||||
- ./docker-volumes/ollama:/root/.ollama
|
||||
container_name: ollama
|
||||
tty: true
|
||||
restart: unless-stopped
|
||||
# GPU SUPPORT NOTES:
|
||||
# 1. The "deploy" section is ignored by classic 'docker-compose'; it's honored in Swarm.
|
||||
# 2. For local 'docker compose up' with NVIDIA GPUs you need the host configured with
|
||||
# nvidia-container-toolkit. Then either:
|
||||
# a) Leave the reservation block (Compose V2 now honors it) OR
|
||||
# b) Start with: docker compose up --build (Compose will request GPUs) OR
|
||||
# c) Explicitly override: docker compose run --gpus all ollama
|
||||
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
|
||||
# 'devices' section further down as a fallback (less portable).
|
||||
|
||||
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
|
||||
environment:
|
||||
# Visible devices / capabilities for the NVIDIA container runtime
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
## ---------- END GPU SUPPORT BLOCK ------------###
|
||||
|
||||
|
||||
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
|
||||
# devices:
|
||||
# - /dev/nvidiactl:/dev/nvidiactl
|
||||
# - /dev/nvidia-uvm:/dev/nvidia-uvm
|
||||
# - /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools
|
||||
# - /dev/nvidia0:/dev/nvidia0
|
||||
|
||||
open-webui:
|
||||
image: ghcr.io/open-webui/open-webui:main
|
||||
container_name: open-webui
|
||||
volumes:
|
||||
- ./docker-volumes/open-webui:/app/backend/data
|
||||
depends_on:
|
||||
- ollama
|
||||
ports:
|
||||
- 3000:8080
|
||||
environment:
|
||||
- 'OLLAMA_BASE_URL=http://ollama:11434'
|
||||
- 'ENABLE_OLLAMA_API=true'
|
||||
- 'WEBUI_SECRET_KEY='
|
||||
|
||||
extra_hosts:
|
||||
- host.docker.internal:host-gateway
|
||||
restart: unless-stopped
|
||||
@@ -6,8 +6,18 @@ readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"marimo>=0.18.0",
|
||||
"modin[dask]>=0.37.1",
|
||||
"numpy>=2.3.5",
|
||||
"ollama>=0.6.1",
|
||||
"openai>=2.9.0",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=2.3.3",
|
||||
"pyzmq>=27.1.0",
|
||||
"requests>=2.32.5",
|
||||
"taguette>=1.5.1",
|
||||
"wordcloud>=1.9.5",
|
||||
]
|
||||
|
||||
|
||||
[tool.uv.sources]
|
||||
wordcloud = { git = "https://github.com/amueller/word_cloud.git" }
|
||||
|
||||
86
utils.py
86
utils.py
@@ -1,86 +0,0 @@
|
||||
"""
|
||||
Standard utils for this repository
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite') -> "Client | None":
    """Establish a connection to a Qumo Ollama instance over Tailscale.

    Args:
        vm_name: Name of the VM running the Ollama instance
            ('ollama-lite' or 'hiperf-gpu').

    Returns:
        An Ollama ``Client`` connected to the VM, or ``None`` when the
        instance cannot be reached.
    """
    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:11434'
    try:
        # Probe first so we fail fast with a clear message instead of an
        # opaque error on the first model call.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except requests.RequestException:
        # BUG FIX: the original printed this message but then fell through
        # and dereferenced the unbound `client`, raising NameError. It also
        # only caught ConnectionError, letting timeouts escape.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None

    client = Client(host=QUMO_OLLAMA_URL)
    print(f"Connection successful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
    for m in client.list().models:
        print(f" - '{m.model}' ")
    return client
|
||||
|
||||
5
utils/__init__.py
Normal file
5
utils/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .ollama_utils import connect_qumo_ollama
|
||||
from .data_utils import create_sentiment_matrix, extract_theme
|
||||
from .transcript_utils import load_srt, csv_to_markdown, cpc_smb_to_markdown
|
||||
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
|
||||
from .keyword_analysis import ollama_keyword_extraction, worker_extraction, blue_color_func
|
||||
65
utils/data_utils.py
Normal file
65
utils/data_utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a Voice/Character x Theme sentiment matrix for one document.

    Parameters:
    - doc_df: DataFrame with (at least) columns ['tag', '_context', 'sentiment']
    - column_prefix: regex alternation of tag prefixes that mark sentiment
      themes (matrix columns), e.g. 'VT - |CT - '
    - row_prefix: regex alternation of context prefixes that mark
      Voice/Character rows, e.g. '_V-|_C-'

    Returns:
    - DataFrame indexed by '_context' with one column per matching tag and
      cells holding summed sentiment; empty DataFrame when nothing matches.
    """
    # Keep only rows whose tag marks a sentiment theme (VT-/CT-).
    # NOTE: str.contains treats the '|'-joined prefixes as a regex alternation.
    sentiment_rows = doc_df[
        doc_df['tag'].str.contains(column_prefix, na=False)
    ].copy()

    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows carrying a Voice/Character context marker.
    valid_rows = sentiment_rows[
        sentiment_rows['_context'].notna()
        & sentiment_rows['_context'].str.contains(row_prefix, na=False)
    ].copy()

    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (Voice/Character, Theme) pair, then pivot so that
    # contexts become rows and themes become columns.
    matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')

    return matrix
|
||||
|
||||
|
||||
|
||||
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
|
||||
"""
|
||||
Extract the theme from a tag string.
|
||||
|
||||
Parameters:
|
||||
- tag: str, the tag string (e.g., 'VT - Personal Experience')
|
||||
- theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
|
||||
|
||||
Returns:
|
||||
- str, the extracted theme (e.g., 'Personal Experience')
|
||||
- None if no theme found
|
||||
"""
|
||||
for prefix in theme_prefixes.split('|'):
|
||||
if tag.startswith(prefix):
|
||||
return tag.replace(prefix, '').strip()
|
||||
return None
|
||||
|
||||
109
utils/keyword_analysis.py
Normal file
109
utils/keyword_analysis.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import random
|
||||
import matplotlib.colors as mcolors
|
||||
|
||||
def blue_color_func( word, font_size, position, orientation, random_state=None, **kwargs):
    """Word-cloud color callback: pick a random dark-ish shade of blue.

    Samples the matplotlib 'Blues' colormap in [0.4, 1.0] (0.0 is
    near-white, 1.0 is dark blue) and returns the color as a hex string.
    Uses *random_state* for reproducibility when provided, otherwise the
    global random module.
    """
    rng = random_state if random_state else random

    # Restrict sampling to the darker end of the colormap so words stay legible.
    low, high = 0.4, 1.0
    shade = rng.uniform(low, high)

    # Map the sampled value through the colormap and convert to '#rrggbb'.
    return mcolors.to_hex(plt.cm.Blues(shade))
|
||||
|
||||
|
||||
def worker_extraction(row, host, model):
    """Per-row worker: extract keywords for one DataFrame row.

    Creates its own Ollama client so each worker/thread holds an
    independent connection, then delegates to ollama_keyword_extraction.
    """
    worker_client = Client(host=host)
    return ollama_keyword_extraction(
        content=row['content'],
        tag=row['tag'],
        client=worker_client,
        model=model,
    )
|
||||
|
||||
|
||||
def ollama_keyword_extraction(content, tag, client: "Client", model) -> list:
    """
    Extract thematic keywords from a quote using an Ollama model.

    Parameters:
    - content: Text content (quote) to analyze
    - tag: Context/theme the keywords should be relevant to
    - client: Ollama client the request is sent through
    - model: Name of the model to use

    Returns:
    - A single-element list wrapping the extracted keyword list,
      e.g. [['reliable', 'trustworthy']]; [[]] after all retries fail.
    """
    # Prompt optimized for small models (Llama 3.2):
    # - Fewer rules, prioritized by importance
    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.

RULES (in priority order):
1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
3. Extract 1-5 keywords or short phrases that capture key themes.
4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").

EXAMPLES:

Context: Chase as a Brand
Quote: "It's definitely not, like, youthful or trendy."
Output: {{"keywords": ["traditional", "established"]}}

Context: App Usability
Quote: "There are so many options when I try to pay, it's confusing."
Output: {{"keywords": ["confusing", "overwhelming options"]}}

Context: Brand Perception
Quote: "I would say reliable, trustworthy, kind of old-school."
Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}

NOW EXTRACT KEYWORDS:

Context: {tag}
Quote: "{content}"
Output:"""

    max_retries = 3
    # BUG FIX: defined before the try so the except handler can always log
    # it; previously a failure inside client.generate left response_text
    # unbound and the print raised NameError, aborting the retry loop.
    response_text = ''
    for attempt in range(max_retries):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
                format='json',
            )
            response_text = resp.response.strip()

            # The model may emit leading junk before the JSON object.
            start_index = response_text.find('{')
            if start_index == -1:
                raise ValueError("No JSON found")

            # raw_decode tolerates trailing text after the JSON object.
            response_json, _ = json.JSONDecoder().raw_decode(response_text[start_index:])
            keywords = response_json.get('keywords', [])
            return [keywords]

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Output was: {response_text}")
            if attempt == max_retries - 1:
                return [[]]
|
||||
42
utils/ollama_utils.py
Normal file
42
utils/ollama_utils.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client | None, list[str] | None]":
    """Establish a connection to a Qumo Ollama instance.

    Args:
        vm_name: Name of the VM running the Ollama instance
            ('ollama-lite' or 'hiperf-gpu'), or 'localhost'/'0.0.0.0'
            for a local instance.
        port: Port the Ollama API listens on.
        print_models: When True, also print the models available on the server.

    Returns:
        (client, models): the connected Ollama ``Client`` and the list of
        model names, or (None, None) when the instance cannot be reached.
    """
    # Local addresses skip the Tailscale MagicDNS hostname.
    if vm_name in ['localhost', '0.0.0.0']:
        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
    else:
        QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'

    try:
        # Probe first so we fail fast with a clear message.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except requests.RequestException:
        # Catches ConnectionError AND timeouts; the original only caught
        # ConnectionError, so a hung host raised an unhandled Timeout.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(host=QUMO_OLLAMA_URL)

    # BUG FIX: replace only the trailing port when deriving the WebUI URL;
    # str.replace(port, '3000') substituted the port digits anywhere in the
    # URL, corrupting hostnames that happen to contain them.
    webui_url = f"{QUMO_OLLAMA_URL.rsplit(':', 1)[0]}:3000"
    print(f"Connection successful. WebUI available at: {webui_url}")

    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models
|
||||
135
utils/sentiment_analysis.py
Normal file
135
utils/sentiment_analysis.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
|
||||
def dummy_sentiment_analysis(content, tag):
    """Return a placeholder sentiment for testing pipelines without a model.

    Sentiment-bearing tags ('VT -' / 'CT -' prefixes) get a random score in
    {-1, 0, 1}; any other tag gets a sentinel ('test', 'not applicable').
    """
    if tag.startswith(('VT -', 'CT -')):
        # Random score stands in for a real model during dry runs.
        return random.choice([-1, 0, 1]), 'random dummy sentiment'

    return 'test', 'not applicable'
|
||||
|
||||
|
||||
|
||||
def ollama_sentiment_analysis(content, theme, theme_description, client: "Client", model) -> tuple[list[str], int, str]:
    """
    Analyze the sentiment of a quote with respect to a theme via Ollama.

    Parameters:
    - content: Text content (quote) to analyze
    - theme: Theme the sentiment is judged against (e.g., 'Speed')
    - theme_description: Short description of the theme for the model
    - client: Ollama client the request is sent through
    - model: Name of the model to use

    Returns:
    - (keywords, sentiment, reason): extracted keywords, a score in
      {-1, 0, 1}, and a short justification; ([], None, 'parsing error')
      after all retries fail.
    """
    prompt = f"""
# Role
You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.

# Input
Theme: `{theme}`
Theme Description: `{theme_description}`
Quote:
```
{content}
```

# Instructions
1. Analyze the sentiment of the quote specifically regarding the theme.
2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
3. Assign a sentiment score:
- -1: Negative (complaint, dissatisfaction, criticism)
- 0: Neutral (factual, mixed, or no strong opinion)
- 1: Positive (praise, satisfaction, agreement)
4. Provide a concise reason (max 10 words).

# Constraints
- Return ONLY a valid JSON object.
- Do not use Markdown formatting (no ```json blocks).
- Do not write any Python code or explanations outside the JSON.
- If the quote is irrelevant to the theme, return sentiment 0.

# Response Format
{{
"keywords": ["<list_of_keywords>"],
"sentiment": <integer_score>,
"reason": "<string_reason>"
}}

# Examples

Example 1:
Theme: `Speed`
Quote: `It was a little slow for me.`
Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}

Example 2:
Theme: `Price`
Quote: `It costs $50.`
Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}

Example 3:
Theme: `Friendliness`
Quote: `Sound very welcoming.`
Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
"""

    max_retries = 3
    for attempt in range(max_retries):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
            )

            response_text = resp.response.strip()

            # Extract the JSON object even when the model wraps it in prose.
            start_index = response_text.find('{')
            end_index = response_text.rfind('}') + 1

            if start_index == -1 or end_index == 0:
                raise ValueError("No JSON found")

            json_str = response_text[start_index:end_index]

            response_json = json.loads(json_str)
            keywords = response_json.get('keywords', [])
            # BUG FIX: the fallback was the string 'test', which contradicts
            # the declared int slot of the return tuple and breaks any
            # downstream summation; neutral 0 is the safe default.
            sentiment = response_json.get('sentiment', 0)
            reason = response_json.get('reason', 'no reason provided')
            return keywords, sentiment, reason

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt == max_retries - 1:
                return [], None, 'parsing error'
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test against a local Ollama server (requires `ollama serve`).
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # BUG FIX: the original passed (content, tag, client) positionally, so
    # `client` landed in `theme_description` and the required `client`
    # parameter was missing (TypeError); it also unpacked the 3-tuple result
    # into only two columns. Pass arguments by keyword and capture all three
    # outputs (keywords, sentiment, reason).
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['tag'],
            theme_description='',
            client=client,
            model='llama3.2:latest',
        )),
        axis=1
    )

    print(sentiment_df.head())
|
||||
|
||||
148
utils/transcript_utils.py
Normal file
148
utils/transcript_utils.py
Normal file
@@ -0,0 +1,148 @@
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
|
||||
|
||||
|
||||
def csv_to_markdown(csv_path:Path):
    """Convert transcript CSV to markdown, merging consecutive same-speaker turns."""
    df = pd.read_csv(str(csv_path))

    blocks = ["# Interview Transcript"]
    # Speaker currently "holding the floor" and their accumulated paragraphs.
    current_speaker = None
    current_parts = []

    def flush():
        # Emit the open block (if any) as "**Speaker**: para1\n\npara2".
        if current_speaker is not None:
            merged = '\n\n'.join(current_parts)
            blocks.append(f"**{current_speaker}**: {merged}")

    for _, row in df.iterrows():
        speaker = row["Speaker"]
        text = str(row["Transcript"]).strip()

        if speaker == current_speaker:
            # Same speaker keeps talking — extend the open block.
            current_parts.append(text)
        else:
            # Speaker changed — close the previous block, open a new one.
            flush()
            current_speaker = speaker
            current_parts = [text]

    # Close the trailing block.
    flush()

    # Separate blocks with blank lines for readable markdown paragraphs.
    return "\n\n".join(blocks)
|
||||
|
||||
|
||||
def cpc_smb_to_markdown(cpc_path: Path) -> str:
    """Convert CPC text transcript to markdown, merging consecutive same-speaker turns."""
    raw = Path(cpc_path).read_text(encoding='utf-8')

    out_blocks = ["# Interview Transcript"]
    active_speaker = None
    active_parts = []

    # Speaker labels look like "NAME: " at line start or after whitespace.
    label_re = re.compile(r'(?:^|\s)([A-Za-z0-9]+):\s')

    def close_block():
        # Emit the open block (if any) as "**Speaker**: para1\n\npara2".
        if active_speaker is not None:
            merged = '\n\n'.join(active_parts)
            out_blocks.append(f"**{active_speaker}**: {merged}")

    for raw_line in raw.splitlines():
        line = raw_line.strip().replace('\n', ' ')

        # Normalize the intro form: 'CPC1, (She/ Her,) LOCATION: Hello.' -> 'CPC1: Hello.'
        intro = re.match(r'^"?([A-Za-z0-9]+),\s*\(.*?\)\s*LOCATION:\s*(.*?)"?$', line)
        if intro:
            line = f"{intro.group(1)}: {intro.group(2)}"

        # Drop a matched pair of surrounding double quotes.
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1].strip()

        if not line:
            continue

        pieces = label_re.split(line)

        # No speaker label on this line — treat as noise (stray words, headers).
        if len(pieces) < 2:
            continue

        # Text before the first label continues the previous speaker's turn.
        leading = pieces[0].strip()
        if leading and active_speaker:
            active_parts.append(leading)

        # pieces alternates [<pre-text>, speaker, text, speaker, text, ...].
        for idx in range(1, len(pieces), 2):
            speaker = pieces[idx]
            text = pieces[idx + 1].strip()

            if speaker == active_speaker:
                active_parts.append(text)
            else:
                close_block()
                active_speaker = speaker
                active_parts = [text]

    close_block()

    return "\n\n".join(out_blocks)
|
||||
Reference in New Issue
Block a user