import marimo

__generated_with = "0.18.3"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    return Path, mo, pd


@app.cell
def _(Path):
    # Raw CSV transcripts are read from INPUT_DIR; converted markdown files
    # are written to OUTPUT_DIR, which is created here if it does not exist.
    INPUT_DIR = Path("data/transcripts/raw")
    OUTPUT_DIR = Path("data/transcripts/clean")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, OUTPUT_DIR


@app.cell
def _(INPUT_DIR, mo):
    # Sorted so the dropdown order is stable instead of filesystem-dependent.
    csv_files = sorted(INPUT_DIR.glob("*.csv"))
    # Dropdown label (file stem) -> path string; later cells pass
    # `file_dropdown.value` straight to `pd.read_csv`.
    file_options = {f.stem: str(f) for f in csv_files}

    file_dropdown = mo.ui.dropdown(
        options=file_options,
        label="Select CSV Transcript",
        full_width=True,
    )
    file_dropdown
    return (file_dropdown,)


@app.function(hide_code=True)
def csv_to_markdown(df) -> str:
    """Convert transcript DataFrame to markdown, merging consecutive same-speaker turns.

    Args:
        df: DataFrame with a "Speaker" column and a "Transcript" column,
            one row per turn, in the order the turns should appear.

    Returns:
        Markdown text starting with a "# Interview Transcript" heading,
        followed by one "**Speaker**: text" block per run of consecutive
        rows that share a speaker. Turns inside a run, and the blocks
        themselves, are separated by blank lines.

    Rows whose transcript cell is missing (None / NaN) or contains only
    whitespace are skipped, so the literal string "nan" never reaches the
    output and empty turns do not split a speaker's block.
    """
    # Kept in a variable because a backslash inside an f-string replacement
    # field is a SyntaxError before Python 3.12.
    paragraph_break = "\n\n"

    lines = ["# Interview Transcript"]

    # Track previous speaker to detect when speaker changes
    prev_speaker = None
    # Accumulate text from consecutive turns by same speaker
    merged_text = []

    for _, row in df.iterrows():
        speaker = row["Speaker"]
        raw_text = row["Transcript"]

        # Empty CSV cells are read by pandas as float NaN; NaN is the only
        # float that is not equal to itself.
        if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
            continue
        text = str(raw_text).strip()
        if not text:
            continue

        if speaker == prev_speaker:
            # Same speaker continues — append text to current block
            merged_text.append(text)
        else:
            # New speaker detected — flush previous speaker's block
            if prev_speaker is not None:
                # Format: **Speaker**: text-part-1\n\ntext-part-2
                # Use \n\n to ensure distinct paragraphs for readability
                lines.append(
                    f"**{prev_speaker}**: {paragraph_break.join(merged_text)}"
                )

            # Start new block for current speaker
            prev_speaker = speaker
            merged_text = [text]

    # Flush final speaker's block
    if prev_speaker is not None:
        lines.append(f"**{prev_speaker}**: {paragraph_break.join(merged_text)}")

    # Join all blocks with double newlines for clear separation
    return paragraph_break.join(lines)


@app.cell(hide_code=True)
def _(file_dropdown, mo, pd):
    # Preview: render only the first 10 rows of the selected CSV.
    # Underscore-prefixed names stay local to this cell, matching the
    # convention used by the save cell.
    preview = mo.md("")
    if file_dropdown.value:
        _df = pd.read_csv(file_dropdown.value)
        _md_content = csv_to_markdown(_df.head(10))
        preview = mo.md(_md_content)
    preview
    return


@app.cell
def _(mo):
    convert_btn = mo.ui.run_button(label="Convert to Markdown")
    convert_btn
    return (convert_btn,)


@app.cell
def _(OUTPUT_DIR, Path, convert_btn, file_dropdown, mo, pd):
    # Convert the full selected CSV and write it to OUTPUT_DIR/<stem>.md,
    # but only when the button was pressed and a file is selected.
    result = mo.md("")
    saved_md = None
    if convert_btn.value and file_dropdown.value:
        _df = pd.read_csv(file_dropdown.value)
        saved_md = csv_to_markdown(_df)
        _out_path = OUTPUT_DIR / (Path(file_dropdown.value).stem + ".md")
        # Explicit UTF-8 so non-ASCII transcript text is written the same
        # way on every platform instead of using the locale default.
        _out_path.write_text(saved_md, encoding="utf-8")
        result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success")
    result
    return (saved_md,)


@app.cell
def _(mo, saved_md):
    # Show the markdown that was just saved; stays empty until a save happens.
    saved_preview = mo.md("")
    if saved_md:
        saved_preview = mo.vstack([
            mo.md("### Saved Markdown Preview"),
            mo.md(saved_md)
        ])
    saved_preview
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Taguette

    Upload and process using taguette: http://taguette.tail44fa00.ts.net/
    """)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()