@@ -13,8 +13,8 @@ def _():
VM_NAME = ' hiperf-gpu '
MODEL = ' llama3.3:70b '
client = connect_qumo_ollama( VM_NAME )
return MODEL , Path , client , load_srt , mo
# client = connect_qumo_ollama( VM_NAME)
return MODEL , Path , load_srt , mo
@app.cell ( hide_code = True )
@@ -186,6 +186,316 @@ def _(mo):
return
@app.cell
def _(mo):
    # Step 3a: Define themes for labelling.
    # The text area is pre-filled with example themes, one per line; its
    # `.value` is what downstream cells read.
    themes_input = mo.ui.text_area(
        value=(
            "brand voice and tone\n"
            "customer experience priorities\n"
            "design system and consistency\n"
            "AI and conversational interfaces"
        ),
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )
    # Only the last expression of a cell is rendered, so the heading and the
    # widget are stacked into one output; as two separate statements the
    # markdown was silently discarded.
    mo.vstack(
        [
            mo.md("""### Step 3a: Define Themes
            Enter one theme per line. These will be used to
            label each interview transcript. Themes may overlap; the
            same section can relate to multiple themes.
            """),
            themes_input,
        ]
    )
    return (themes_input,)
@app.cell
def _(themes_input):
    # Parse the text-area content into a clean list of themes: one entry per
    # non-blank line, with surrounding whitespace removed. A missing or empty
    # value yields an empty list.
    theme_list = [
        line.strip()
        for line in (themes_input.value or "").splitlines()
        if line.strip()
    ]
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Step 3b (setup): create the JSON output directory and the button that
    # triggers labelling.
    #
    # Both objects are exported under the names the labelling cell takes as
    # inputs (`OUTPUT_THEME_DIR`, `theme_label_button`); previously they were
    # called `OUTPUT_DIR` / `label_button` and not returned at all.
    # NOTE(review): confirm no other cell outside this section already defines
    # these two names.
    OUTPUT_THEME_DIR = Path("data/labels")
    OUTPUT_THEME_DIR.mkdir(parents=True, exist_ok=True)
    theme_label_button = mo.ui.run_button(
        label="Run Theme Labelling for This Interview"
    )
    # Only the last expression of a cell is rendered, so the explanation and
    # the button are stacked into a single output.
    mo.vstack(
        [
            mo.md(f"""### Step 3b: LLM-based Theme Labelling
            This step runs an LLM over the current interview transcript
            for each defined theme and saves one JSON file per theme
            for this interview in `{OUTPUT_THEME_DIR}`.
            For each theme, the model will return full sections of the
            conversation (multi-sentence chunks, not just short quotes)
            that are about that theme.
            """),
            theme_label_button,
        ]
    )
    return OUTPUT_THEME_DIR, theme_label_button
@app.cell
def _(
    MODEL,
    OUTPUT_THEME_DIR,
    Path,
    client,
    file_dropdown,
    labeled_transcript,
    mo,
    theme_label_button,
    theme_list,
):
    # Step 3b (run): when the button is pressed, ask the LLM to label the
    # selected interview once per theme and write one JSON file per
    # (interview, theme) pair into OUTPUT_THEME_DIR. The cell output is a
    # short status message.
    #
    # Imports, helpers and loop variables carry a leading underscore so they
    # stay local to this cell: other cells in this notebook also use `json`,
    # `interview_id` and `theme`, and marimo rejects a name that is defined by
    # more than one cell.
    # NOTE(review): `client` has to be exported by another cell - confirm the
    # connection cell still returns it.
    import json as _json
    import textwrap as _textwrap
    from datetime import datetime as _datetime
    from datetime import timezone as _timezone

    # The transcript is fenced with triple double-quotes, so the template
    # itself uses ''' delimiters: a """ inside a """-delimited literal ends
    # the literal early, which previously dropped the transcript from the
    # prompt altogether. Doubled braces are literal braces for str.format.
    _PROMPT_TEMPLATE = _textwrap.dedent(
        '''\
        You are an expert qualitative researcher.
        You will analyse a single interview transcript for ONE specific theme.
        Theme: "{theme}"
        Tasks:
        1. Decide if the theme is present in this interview.
        2. If present, estimate how relevant it is on a 0–1 scale
        where 0 = not mentioned, 0.5 = moderately important,
        1 = central theme of the interview.
        3. Identify all sections of the conversation that are
        primarily about this theme. A section can span multiple
        consecutive utterances and should form a coherent piece
        of the dialogue about the theme, not just a single
        sentence.
        Each section should include:
        - the dominant speaker label (or "mixed" if multiple)
        - the full section text (one or more sentences)
        Return your answer ONLY as a JSON object with this schema:
        {{
        "theme": string, // the theme name
        "present": bool, // whether the theme appears
        "relevance": float, // 0.0–1.0
        "sections": [
        {{
        "speaker": string, // main speaker label for the section
        "section_text": string // full section text about the theme
        }}
        ]
        }}
        Transcript:
        """
        {transcript}
        """
        '''
    )

    def _fallback_label(theme, raw_text):
        """Return a 'theme absent' record that keeps the unparseable reply."""
        return {
            "theme": theme,
            "present": False,
            "relevance": 0.0,
            "sections": [],
            "_parse_error": True,
            "_raw": raw_text,
        }

    def _parse_reply(theme, raw_text):
        """Decode the model reply into a dict, tolerating text around the JSON."""
        try:
            parsed = _json.loads(raw_text)
        except _json.JSONDecodeError:
            # Fallback: try the outermost {...} span. str.index/rindex and
            # json.loads all signal failure with ValueError subclasses.
            try:
                start = raw_text.index("{")
                end = raw_text.rindex("}") + 1
                parsed = _json.loads(raw_text[start:end])
            except ValueError:
                return _fallback_label(theme, raw_text)
        # Valid JSON that is not an object (list, string, number) cannot be
        # normalised field by field, so treat it as a parse failure.
        if not isinstance(parsed, dict):
            return _fallback_label(theme, raw_text)
        return parsed

    def _normalise(parsed, theme):
        """Coerce the expected fields to their types, filling in defaults."""
        parsed["theme"] = parsed.get("theme", theme)
        parsed["present"] = bool(parsed.get("present", False))
        try:
            parsed["relevance"] = float(parsed.get("relevance", 0.0))
        except (TypeError, ValueError):
            parsed["relevance"] = 0.0
        if not isinstance(parsed.get("sections"), list):
            parsed["sections"] = []
        return parsed

    def _theme_slug(theme):
        """Make a theme usable as part of a single file name."""
        slug = theme.replace(" ", "_")
        # A path separator would otherwise point the file into a
        # sub-directory that does not exist.
        for sep in ("/", "\\"):
            slug = slug.replace(sep, "_")
        return slug

    def _label_theme(theme, interview_id):
        """Query the model for one theme, persist the result and return it."""
        prompt = _PROMPT_TEMPLATE.format(theme=theme, transcript=labeled_transcript)
        reply = client.generate(model=MODEL, prompt=prompt)
        label = _normalise(_parse_reply(theme, reply.response.strip()), theme)

        # Write per-interview-per-theme JSON file
        out_path = OUTPUT_THEME_DIR / f"{interview_id}__{_theme_slug(theme)}.json"
        out_data = {
            "interview_id": interview_id,
            "theme": label["theme"],
            "present": label["present"],
            "relevance": label["relevance"],
            "sections": label["sections"],
            # Aware UTC timestamp, rendered with the same trailing "Z" the
            # deprecated utcnow()-based code produced.
            "generated_at": _datetime.now(_timezone.utc)
            .isoformat()
            .replace("+00:00", "Z"),
        }
        out_path.write_text(
            _json.dumps(out_data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        return label

    theme_label_results = {}
    if theme_label_button.value and file_dropdown.value and theme_list:
        _interview_id = Path(file_dropdown.value).stem
        for _theme in theme_list:
            theme_label_results[_theme] = _label_theme(_theme, _interview_id)

    if not theme_label_button.value:
        _status = "Click 'Run Theme Labelling for This Interview' to start."
    elif not file_dropdown.value:
        _status = "No transcript selected."
    elif not theme_list:
        _status = "No themes defined. Please add at least one theme."
    else:
        _status = (
            f"Labelled {len(theme_label_results)} themes for current interview. "
            f"JSON files written to '{OUTPUT_THEME_DIR}'."
        )

    mo.md(f"""### Theme Labelling Status
    {_status}
    """)
    return
@app.cell
def _(Path, mo):
    # Step 3c: Load all labeled transcripts (assumed precomputed).
    # Collects every *.json file in LABELED_DIR, sorted by path, creating the
    # directory first so the glob never fails on a fresh checkout.
    # NOTE(review): the labelling cell in this notebook writes its JSON files
    # to OUTPUT_THEME_DIR - confirm this separate directory is the intended
    # source for aggregation.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)
    labeled_files = sorted(LABELED_DIR.glob("*.json"))
    # Only the last expression of a cell is rendered, so the summary and the
    # file listing are stacked into one output; previously the markdown was
    # discarded.
    mo.vstack(
        [
            mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts
            Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
            These will be used to aggregate themes across all interviews.
            """),
            labeled_files,
        ]
    )
    return (labeled_files,)
@app.cell
def _(labeled_files):
    # Read every label file into a flat list of normalised records with the
    # keys interview_id / theme / present / relevance / sections.
    #
    # The import and the helper are underscore-prefixed so they stay local to
    # this cell: other cells in this notebook also use `json`, `interview_id`
    # and `theme`, and marimo rejects a name defined by more than one cell.
    import json as _json

    def _load_record(path):
        """Return the normalised record stored in *path*, or None if unusable."""
        try:
            data = _json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Skip unreadable files (I/O errors, bad encoding, invalid JSON).
            return None
        # A JSON document that is not an object has none of the expected keys.
        if not isinstance(data, dict):
            return None

        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            relevance = 0.0

        sections = data.get("sections")
        if not isinstance(sections, list):
            sections = []

        return {
            # Fall back to the file-name prefix before the "__" separator.
            "interview_id": data.get("interview_id") or path.stem.split("__", 1)[0],
            "theme": data.get("theme", ""),
            "present": bool(data.get("present", False)),
            "relevance": relevance,
            "sections": sections,
        }

    all_labeled_records = [
        record
        for record in map(_load_record, labeled_files)
        if record is not None
    ]
    return (all_labeled_records,)
@app.cell
def _(all_labeled_records, mo):
    # Step 3d: derive the sets of themes and interviews found in the loaded
    # records and offer a dropdown to pick one theme. Records with an empty
    # theme are left out of the theme list but still count as interviews.
    all_themes = sorted(
        {record["theme"] for record in all_labeled_records if record["theme"]}
    )
    all_interviews = sorted(
        {record["interview_id"] for record in all_labeled_records}
    )
    theme_selector = mo.ui.dropdown(
        options={name: name for name in all_themes},
        label="Select theme to explore across all interviews",
    )
    # Only the last expression of a cell is rendered, so the heading and the
    # dropdown are stacked into one output; previously the heading was
    # discarded.
    mo.vstack(
        [
            mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
            theme_selector,
        ]
    )
    return all_interviews, theme_selector
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarise the selected theme across all interviews and list every
    # section labelled with it.
    #
    # Intermediate names are underscore-prefixed or confined to
    # comprehensions so they stay local to this cell; the previous loop
    # variables (`interview_id`, `r`, `s`) became notebook-wide names, and
    # `interview_id` is also defined by other cells, which marimo rejects.
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []

    if selected_theme:
        _records = [
            record
            for record in all_labeled_records
            if record["theme"] == selected_theme
        ]
        _with_theme = sum(record["present"] for record in _records)
        _relevances = [
            record["relevance"] for record in _records if record["present"]
        ]
        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": _with_theme,
            "share_of_interviews_with_theme": (
                _with_theme / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(_relevances) if _relevances else 0.0
            ),
        }
        theme_sections = [
            {
                "interview_id": record["interview_id"],
                "speaker": section.get("speaker", ""),
                "section_text": section.get("section_text", ""),
                "relevance": record["relevance"],
            }
            for record in _records
            for section in record["sections"]
            # Sections come from model output; ignore entries that are not
            # objects instead of failing on `.get`.
            if isinstance(section, dict)
        ]

    _title = selected_theme or "None selected"
    _count = theme_summary.get("num_interviews_with_theme", 0)
    _share = theme_summary.get("share_of_interviews_with_theme", 0.0)
    _avg = theme_summary.get("avg_relevance_if_present", 0.0)
    _overview = mo.md(
        f"""#### Theme Overview: `{_title}`
        - Total interviews: **{len(all_interviews)}**
        - Interviews where theme is present: **{_count}**
        - Share of interviews with theme: **{_share:.2f}**
        - Avg. relevance (when present): **{_avg:.2f}**
        """
    )

    if theme_sections:
        _details = mo.ui.table(
            [
                {
                    "Interview": entry["interview_id"],
                    "Speaker": entry["speaker"],
                    "Relevance": f"{entry['relevance']:.2f}",
                    "Section": entry["section_text"],
                }
                for entry in theme_sections
            ]
        )
    else:
        _details = mo.md("_No sections for this theme yet._")

    # Only the last top-level expression of a cell is rendered. The overview
    # and the table used to be a mid-cell statement and expressions inside an
    # if/else, so nothing was displayed; stack them into one output instead.
    mo.vstack([_overview, _details])
    return
@app.cell
def _ ( mo ) :
# Editable analysis task prompt